From 3b3c4268ed71abeddd8d4a268776399fa7b165dc Mon Sep 17 00:00:00 2001
From: AlongWY <AlongWY@users.noreply.github.com>
Date: Fri, 7 Jun 2024 05:24:38 +0000
Subject: [PATCH] deploy: 72066be21ad467c8ffc76b74c152b38decf3f0ac

---
 .nojekyll   |      0
 cache.json  |      1 +
 favicon.ico |    Bin 0 -> 15086 bytes
 index.css   |    355 +
 index.html  | 105183 +++++++++++++++++++++++++++++++++++++++++++++++++
 index.js    |     39 +
 6 files changed, 105578 insertions(+)
 create mode 100644 .nojekyll
 create mode 100644 cache.json
 create mode 100644 favicon.ico
 create mode 100644 index.css
 create mode 100644 index.html
 create mode 100644 index.js

diff --git a/.nojekyll b/.nojekyll
new file mode 100644
index 00000000..e69de29b
diff --git a/cache.json b/cache.json
new file mode 100644
index 00000000..010c8ed0
--- /dev/null
+++ b/cache.json
@@ -0,0 +1 @@
+{"2024-05-30T00:00:00Z":{"Computation and Language":[{"id":"http://arxiv.org/abs/2405.20341v1","updated":"2024-05-30T17:59:51Z","published":"2024-05-30T17:59:51Z","title":"From Zero to Hero: Cold-Start Anomaly Detection","summary":"  When first deploying an anomaly detection system, e.g., to detect\nout-of-scope queries in chatbots, there are no observed data, making\ndata-driven approaches ineffective. Zero-shot anomaly detection methods offer a\nsolution to such \"cold-start\" cases, but unfortunately they are often not\naccurate enough. This paper studies the realistic but underexplored cold-start\nsetting where an anomaly detection model is initialized using zero-shot\nguidance, but subsequently receives a small number of contaminated observations\n(namely, that may include anomalies). The goal is to make efficient use of both\nthe zero-shot guidance and the observations. We propose ColdFusion, a method\nthat effectively adapts the zero-shot anomaly detector to contaminated\nobservations. To support future development of this new setting, we propose an\nevaluation suite consisting of evaluation protocols and metrics.\n","authors":["Tal Reiss","George Kour","Naama Zwerdling","Ateret Anaby-Tavor","Yedid Hoshen"],"pdf_url":"https://arxiv.org/pdf/2405.20341v1.pdf","comment":"ACL 2024. Our code is available at\n  https://github.com/talreiss/ColdFusion"},{"id":"http://arxiv.org/abs/2405.20335v1","updated":"2024-05-30T17:59:31Z","published":"2024-05-30T17:59:31Z","title":"Xwin-LM: Strong and Scalable Alignment Practice for LLMs","summary":"  In this work, we present Xwin-LM, a comprehensive suite of alignment\nmethodologies for large language models (LLMs). This suite encompasses several\nkey techniques, including supervised finetuning (SFT), reward modeling (RM),\nrejection sampling finetuning (RS), and direct preference optimization (DPO).\nThe key components are as follows: (1) Xwin-LM-SFT, models initially finetuned\nwith high-quality instruction data; (2) Xwin-Pair, a large-scale, multi-turn\npreference dataset meticulously annotated using GPT-4; (3) Xwin-RM, reward\nmodels trained on Xwin-Pair, developed at scales of 7B, 13B, and 70B\nparameters; (4) Xwin-Set, a multiwise preference dataset in which each prompt\nis linked to 64 unique responses generated by Xwin-LM-SFT and scored by\nXwin-RM; (5) Xwin-LM-RS, models finetuned with the highest-scoring responses\nfrom Xwin-Set; (6) Xwin-LM-DPO, models further optimized on Xwin-Set using the\nDPO algorithm. Our evaluations on AlpacaEval and MT-bench demonstrate\nconsistent and significant improvements across the pipeline, demonstrating the\nstrength and scalability of Xwin-LM. The repository\nhttps://github.com/Xwin-LM/Xwin-LM will be continually updated to foster\ncommunity research.\n","authors":["Bolin Ni","JingCheng Hu","Yixuan Wei","Houwen Peng","Zheng Zhang","Gaofeng Meng","Han Hu"],"pdf_url":"https://arxiv.org/pdf/2405.20335v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.20318v1","updated":"2024-05-30T17:55:28Z","published":"2024-05-30T17:55:28Z","title":"CausalQuest: Collecting Natural Causal Questions for AI Agents","summary":"  Humans have an innate drive to seek out causality. Whether fuelled by\ncuriosity or specific goals, we constantly question why things happen, how they\nare interconnected, and many other related phenomena. To develop AI agents\ncapable of addressing this natural human quest for causality, we urgently need\na comprehensive dataset of natural causal questions. Unfortunately, existing\ndatasets either contain only artificially-crafted questions that do not reflect\nreal AI usage scenarios or have limited coverage of questions from specific\nsources. To address this gap, we present CausalQuest, a dataset of 13,500\nnaturally occurring questions sourced from social networks, search engines, and\nAI assistants. We formalize the definition of causal questions and establish a\ntaxonomy for finer-grained classification. Through a combined effort of human\nannotators and large language models (LLMs), we carefully label the dataset. We\nfind that 42% of the questions humans ask are indeed causal, with the majority\nseeking to understand the causes behind given effects. Using this dataset, we\ntrain efficient classifiers (up to 2.85B parameters) for the binary task of\nidentifying causal questions, achieving high performance with F1 scores of up\nto 0.877. We conclude with a rich set of future research directions that can\nbuild upon our data and models.\n","authors":["Roberto Ceraolo","Dmitrii Kharlapenko","Amélie Reymond","Rada Mihalcea","Mrinmaya Sachan","Bernhard Schölkopf","Zhijing Jin"],"pdf_url":"https://arxiv.org/pdf/2405.20318v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.09919v3","updated":"2024-05-30T17:55:19Z","published":"2024-03-14T23:40:56Z","title":"Recurrent Drafter for Fast Speculative Decoding in Large Language Models","summary":"  In this paper, we introduce an improved approach of speculative decoding\naimed at enhancing the efficiency of serving large language models. Our method\ncapitalizes on the strengths of two established techniques: the classic\ntwo-model speculative decoding approach, and the more recent single-model\napproach, Medusa. Drawing inspiration from Medusa, our approach adopts a\nsingle-model strategy for speculative decoding. However, our method\ndistinguishes itself by employing a single, lightweight draft head with a\nrecurrent dependency design, akin in essence to the small, draft model uses in\nclassic speculative decoding, but without the complexities of the full\ntransformer architecture. And because of the recurrent dependency, we can use\nbeam search to swiftly filter out undesired candidates with the draft head. The\noutcome is a method that combines the simplicity of single-model design and\navoids the need to create a data-dependent tree attention structure only for\ninference in Medusa. We empirically demonstrate the effectiveness of the\nproposed method on several popular open source language models, along with a\ncomprehensive analysis of the trade-offs involved in adopting this approach.\n","authors":["Aonan Zhang","Chong Wang","Yi Wang","Xuanyu Zhang","Yunfei Cheng"],"pdf_url":"https://arxiv.org/pdf/2403.09919v3.pdf","comment":"11 pages, 6 figures"},{"id":"http://arxiv.org/abs/2405.20315v1","updated":"2024-05-30T17:54:40Z","published":"2024-05-30T17:54:40Z","title":"ANAH: Analytical Annotation of Hallucinations in Large Language Models","summary":"  Reducing the `$\\textit{hallucination}$' problem of Large Language Models\n(LLMs) is crucial for their wide applications. A comprehensive and fine-grained\nmeasurement of the hallucination is the first key step for the governance of\nthis issue but is under-explored in the community. Thus, we present\n$\\textbf{ANAH}$, a bilingual dataset that offers $\\textbf{AN}$alytical\n$\\textbf{A}$nnotation of $\\textbf{H}$allucinations in LLMs within Generative\nQuestion Answering. Each answer sentence in our dataset undergoes rigorous\nannotation, involving the retrieval of a reference fragment, the judgment of\nthe hallucination type, and the correction of hallucinated content. ANAH\nconsists of ~12k sentence-level annotations for ~4.3k LLM responses covering\nover 700 topics, constructed by a human-in-the-loop pipeline. Thanks to the\nfine granularity of the hallucination annotations, we can quantitatively\nconfirm that the hallucinations of LLMs progressively accumulate in the answer\nand use ANAH to train and evaluate hallucination annotators. We conduct\nextensive experiments on studying generative and discriminative annotators and\nshow that, although current open-source LLMs have difficulties in fine-grained\nhallucination annotation, the generative annotator trained with ANAH can\nsurpass all open-source LLMs and GPT-3.5, obtain performance competitive with\nGPT-4, and exhibits better generalization ability on unseen questions.\n","authors":["Ziwei Ji","Yuzhe Gu","Wenwei Zhang","Chengqi Lyu","Dahua Lin","Kai Chen"],"pdf_url":"https://arxiv.org/pdf/2405.20315v1.pdf","comment":"Accepted by ACL 2024"},{"id":"http://arxiv.org/abs/2405.20314v1","updated":"2024-05-30T17:54:35Z","published":"2024-05-30T17:54:35Z","title":"S3D: A Simple and Cost-Effective Self-Speculative Decoding Scheme for\n  Low-Memory GPUs","summary":"  Speculative decoding (SD) has attracted a significant amount of research\nattention due to the substantial speedup it can achieve for LLM inference.\nHowever, despite the high speedups they offer, speculative decoding methods\noften achieve optimal performance on high-end devices or with a substantial GPU\nmemory overhead. Given limited memory and the necessity of quantization, a\nhigh-performing model on a high-end GPU can slow down by up to 7 times. To this\nend, we propose Skippy Simultaneous Speculative Decoding (or S3D), a\ncost-effective self-speculative SD method based on simultaneous multi-token\ndecoding and mid-layer skipping. When compared against recent effective\nopen-source SD systems, our method has achieved one of the top\nperformance-memory ratios while requiring minimal architecture changes and\ntraining data. Leveraging our memory efficiency, we created a smaller yet more\neffective SD model based on Phi-3. It is 1.4 to 2 times faster than the\nquantized EAGLE model and operates in half-precision while using less VRAM.\n","authors":["Wei Zhong","Manasa Bharadwaj"],"pdf_url":"https://arxiv.org/pdf/2405.20314v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.20309v1","updated":"2024-05-30T17:52:36Z","published":"2024-05-30T17:52:36Z","title":"Large Language Models Can Self-Improve At Web Agent Tasks","summary":"  Training models to act as agents that can effectively navigate and perform\nactions in a complex environment, such as a web browser, has typically been\nchallenging due to lack of training data. Large language models (LLMs) have\nrecently demonstrated some capability to navigate novel environments as agents\nin a zero-shot or few-shot fashion, purely guided by natural language\ninstructions as prompts. Recent research has also demonstrated LLMs have the\ncapability to exceed their base performance through self-improvement, i.e.\nfine-tuning on data generated by the model itself. In this work, we explore the\nextent to which LLMs can self-improve their performance as agents in\nlong-horizon tasks in a complex environment using the WebArena benchmark. In\nWebArena, an agent must autonomously navigate and perform actions on web pages\nto achieve a specified objective. We explore fine-tuning on three distinct\nsynthetic training data mixtures and achieve a 31\\% improvement in task\ncompletion rate over the base model on the WebArena benchmark through a\nself-improvement procedure. We additionally contribute novel evaluation metrics\nfor assessing the performance, robustness, capabilities, and quality of\ntrajectories of our fine-tuned agent models to a greater degree than simple,\naggregate-level benchmark scores currently used to measure self-improvement.\n","authors":["Ajay Patel","Markus Hofmarcher","Claudiu Leoveanu-Condrei","Marius-Constantin Dinu","Chris Callison-Burch","Sepp Hochreiter"],"pdf_url":"https://arxiv.org/pdf/2405.20309v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.18719v2","updated":"2024-05-30T17:51:53Z","published":"2024-05-29T02:57:15Z","title":"Contextual Position Encoding: Learning to Count What's Important","summary":"  The attention mechanism is a critical component of Large Language Models\n(LLMs) that allows tokens in a sequence to interact with each other, but is\norder-invariant. Incorporating position encoding (PE) makes it possible to\naddress by position, such as attending to the i-th token. However, current PE\nmethods use token counts to derive position, and thus cannot generalize to\nhigher levels of abstraction, such as attending to the i-th sentence. In this\npaper, we propose a new position encoding method, Contextual Position Encoding\n(CoPE), that allows positions to be conditioned on context by incrementing\nposition only on certain tokens determined by the model. This allows more\ngeneral position addressing such as attending to the $i$-th particular word,\nnoun, or sentence. We show that CoPE can solve the selective copy, counting and\nFlip-Flop tasks where popular position embeddings fail, and improves perplexity\non language modeling and coding tasks.\n","authors":["Olga Golovneva","Tianlu Wang","Jason Weston","Sainbayar Sukhbaatar"],"pdf_url":"https://arxiv.org/pdf/2405.18719v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.20304v1","updated":"2024-05-30T17:50:04Z","published":"2024-05-30T17:50:04Z","title":"Group Robust Preference Optimization in Reward-free RLHF","summary":"  Adapting large language models (LLMs) for specific tasks usually involves\nfine-tuning through reinforcement learning with human feedback (RLHF) on\npreference data. While these data often come from diverse labelers' groups\n(e.g., different demographics, ethnicities, company teams, etc.), traditional\nRLHF approaches adopt a \"one-size-fits-all\" approach, i.e., they\nindiscriminately assume and optimize a single preference model, thus not being\nrobust to unique characteristics and needs of the various groups. To address\nthis limitation, we propose a novel Group Robust Preference Optimization (GRPO)\nmethod to align LLMs to individual groups' preferences robustly. Our approach\nbuilds upon reward-free direct preference optimization methods, but unlike\nprevious approaches, it seeks a robust policy which maximizes the worst-case\ngroup performance. To achieve this, GRPO adaptively and sequentially weights\nthe importance of different groups, prioritizing groups with worse cumulative\nloss. We theoretically study the feasibility of GRPO and analyze its\nconvergence for the log-linear policy class. By fine-tuning LLMs with GRPO\nusing diverse group-based global opinion data, we significantly improved\nperformance for the worst-performing groups, reduced loss imbalances across\ngroups, and improved probability accuracies compared to non-robust baselines.\n","authors":["Shyam Sundhar Ramesh","Yifan Hu","Iason Chaimalas","Viraj Mehta","Pier Giuseppe Sessa","Haitham Bou Ammar","Ilija Bogunovic"],"pdf_url":"https://arxiv.org/pdf/2405.20304v1.pdf","comment":"Preprint"},{"id":"http://arxiv.org/abs/2403.01643v2","updated":"2024-05-30T17:46:22Z","published":"2024-03-03T23:40:35Z","title":"You Need to Pay Better Attention: Rethinking the Mathematics of\n  Attention Mechanism","summary":"  Scaled Dot Product Attention (SDPA) is the backbone of many modern\ndeep-learning models. It is so versatile that it has been used in natural\nlanguage, vision, and multi-modal domains with very little change compared to\nits original formulation. This paper discusses why the current formulation is\ninefficient by delving into the mathematical details of the attention\nmechanism. We propose three improvements to mitigate these inefficiencies,\nthereby, introducing three enhanced attention mechanisms: Optimised, Efficient,\nand Super Attention. Optimised and Efficient Attention have one and two matrix\nmultiplications fewer per head, respectively, and 25% and 50% fewer parameters,\nrespectively, than standard SDPA, but perform similarly to standard SDPA in\nboth vision and natural language tasks. They can be used in all applications\nwhere SDPA is used while offering smaller model sizes and faster training and\ninference without noticeable loss in performance. Super Attention introduces a\nnew linear transformation on the values, transforming them from the left. It\noutperforms standard SPDA on vision and natural language tasks by up to 17%\nwhile having one fewer matrix multiplication per head and 25% fewer parameters\nthan standard SDPA. Consequently, it is also faster than standard SDPA. Super\nAttention is ideal in applications where the attention layer's context length\nis fixed, such as Vision Transformers. In addition to providing mathematical\nreasoning, we evaluate the presented attention mechanisms on several datasets\nincluding MNIST, CIFAR100, ImageNet, IMDB Movie Reviews, and Amazon Reviews\ndatasets, as well as combined Europarl and Anki English-Spanish datasets for\nneural machine translation.\n","authors":["Mehran Hosseini","Peyman Hosseini"],"pdf_url":"https://arxiv.org/pdf/2403.01643v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.20285v1","updated":"2024-05-30T17:38:44Z","published":"2024-05-30T17:38:44Z","title":"Who Writes the Review, Human or AI?","summary":"  With the increasing use of Artificial Intelligence in Natural Language\nProcessing, concerns have been raised regarding the detection of AI-generated\ntext in various domains. This study aims to investigate this issue by proposing\na methodology to accurately distinguish AI-generated and human-written book\nreviews. Our approach utilizes transfer learning, enabling the model to\nidentify generated text across different topics while improving its ability to\ndetect variations in writing style and vocabulary. To evaluate the\neffectiveness of the proposed methodology, we developed a dataset consisting of\nreal book reviews and AI-generated reviews using the recently proposed Vicuna\nopen-source language model. The experimental results demonstrate that it is\nfeasible to detect the original source of text, achieving an accuracy rate of\n96.86%. Our efforts are oriented toward the exploration of the capabilities and\nlimitations of Large Language Models in the context of text identification.\nExpanding our knowledge in these aspects will be valuable for effectively\nnavigating similar models in the future and ensuring the integrity and\nauthenticity of human-generated content.\n","authors":["Panagiotis C. Theocharopoulos","Spiros V. Georgakopoulos","Sotiris K. Tasoulis","Vassilis P. Plagianakos"],"pdf_url":"https://arxiv.org/pdf/2405.20285v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.03893v3","updated":"2024-05-30T17:37:11Z","published":"2024-03-06T17:51:43Z","title":"From One to Many: Expanding the Scope of Toxicity Mitigation in Language\n  Models","summary":"  To date, toxicity mitigation in language models has almost entirely been\nfocused on single-language settings. As language models embrace multilingual\ncapabilities, it's crucial our safety measures keep pace. Recognizing this\nresearch gap, our approach expands the scope of conventional toxicity\nmitigation to address the complexities presented by multiple languages. In the\nabsence of sufficient annotated datasets across languages, we employ translated\ndata to evaluate and enhance our mitigation techniques. We also compare\nfinetuning mitigation approaches against retrieval-augmented techniques under\nboth static and continual toxicity mitigation scenarios. This allows us to\nexamine the effects of translation quality and the cross-lingual transfer on\ntoxicity mitigation. We also explore how model size and data quantity affect\nthe success of these mitigation efforts. Covering nine languages, our study\nrepresents a broad array of linguistic families and levels of resource\navailability, ranging from high to mid-resource languages. Through\ncomprehensive experiments, we provide insights into the complexities of\nmultilingual toxicity mitigation, offering valuable insights and paving the way\nfor future research in this increasingly important field. Code and data are\navailable at https://github.com/for-ai/goodtriever.\n","authors":["Luiza Pozzobon","Patrick Lewis","Sara Hooker","Beyza Ermis"],"pdf_url":"https://arxiv.org/pdf/2403.03893v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.05140v2","updated":"2024-05-30T17:37:06Z","published":"2024-02-06T20:11:54Z","title":"Tag-LLM: Repurposing General-Purpose LLMs for Specialized Domains","summary":"  Large Language Models (LLMs) have demonstrated remarkable proficiency in\nunderstanding and generating natural language. However, their capabilities wane\nin highly specialized domains underrepresented in the pretraining corpus, such\nas physical and biomedical sciences. This work explores how to repurpose\ngeneral LLMs into effective task solvers for specialized domains. We introduce\na novel, model-agnostic framework for learning custom input tags, which are\nparameterized as continuous vectors appended to the LLM's embedding layer, to\ncondition the LLM. We design two types of input tags: domain tags are used to\ndelimit specialized representations (e.g., chemical formulas) and provide\ndomain-relevant context; function tags are used to represent specific functions\n(e.g., predicting molecular properties) and compress function-solving\ninstructions. We develop a three-stage protocol to learn these tags using\nauxiliary data and domain knowledge. By explicitly disentangling task domains\nfrom task functions, our method enables zero-shot generalization to unseen\nproblems through diverse combinations of the input tags. It also boosts LLM's\nperformance in various specialized domains, such as predicting protein or\nchemical properties and modeling drug-target interactions, outperforming expert\nmodels tailored to these tasks.\n","authors":["Junhong Shen","Neil Tenenholtz","James Brian Hall","David Alvarez-Melis","Nicolo Fusi"],"pdf_url":"https://arxiv.org/pdf/2402.05140v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.14700v3","updated":"2024-05-30T17:31:46Z","published":"2024-02-22T16:56:13Z","title":"Unveiling Linguistic Regions in Large Language Models","summary":"  Large Language Models (LLMs) have demonstrated considerable cross-lingual\nalignment and generalization ability. Current research primarily focuses on\nimproving LLMs' cross-lingual generalization capabilities. However, there is\nstill a lack of research on the intrinsic mechanisms of how LLMs achieve\ncross-lingual alignment. From the perspective of region partitioning, this\npaper conducts several investigations on the linguistic competence of LLMs. We\ndiscover a core region in LLMs that corresponds to linguistic competence,\naccounting for approximately 1% of the total model parameters. Removing this\ncore region by setting parameters to zero results in a significant performance\ndecrease across 30 different languages. Furthermore, this core region exhibits\nsignificant dimensional dependence, perturbations to even a single parameter on\nspecific dimensions leading to a loss of linguistic competence. Moreover, we\ndiscover that distinct monolingual regions exist for different languages, and\ndisruption to these specific regions substantially reduces the LLMs'\nproficiency in those corresponding languages. Our research also indicates that\nfreezing the core linguistic region during further pre-training can mitigate\nthe issue of catastrophic forgetting (CF), a common phenomenon observed during\nfurther pre-training of LLMs. Overall, exploring the LLMs' functional regions\nprovides insights into the foundation of their intelligence.\n","authors":["Zhihao Zhang","Jun Zhao","Qi Zhang","Tao Gui","Xuanjing Huang"],"pdf_url":"https://arxiv.org/pdf/2402.14700v3.pdf","comment":"Accepted by ACL 2024. Camera-Ready Version"},{"id":"http://arxiv.org/abs/2405.20274v1","updated":"2024-05-30T17:29:15Z","published":"2024-05-30T17:29:15Z","title":"ROAST: Review-level Opinion Aspect Sentiment Target Joint Detection","summary":"  Aspect-Based Sentiment Analysis (ABSA) has experienced tremendous expansion\nand diversity due to various shared tasks spanning several languages and fields\nand organized via SemEval workshops and Germeval. Nonetheless, a few\nshortcomings still need to be addressed, such as the lack of low-resource\nlanguage evaluations and the emphasis on sentence-level analysis. To thoroughly\nassess ABSA techniques in the context of complete reviews, this research\npresents a novel task, Review-Level Opinion Aspect Sentiment Target (ROAST).\nROAST seeks to close the gap between sentence-level and text-level ABSA by\nidentifying every ABSA constituent at the review level. We extend the available\ndatasets to enable ROAST, addressing the drawbacks noted in previous research\nby incorporating low-resource languages, numerous languages, and a variety of\ntopics. Through this effort, ABSA research will be able to cover more ground\nand get a deeper comprehension of the task and its practical application in a\nvariety of languages and domains (https://github.com/RiTUAL-UH/ROAST-ABSA).\n","authors":["Siva Uday Sampreeth Chebolu","Franck Dernoncourt","Nedim Lipka","Thamar Solorio"],"pdf_url":"https://arxiv.org/pdf/2405.20274v1.pdf","comment":"arXiv admin note: text overlap with arXiv:2309.13297"},{"id":"http://arxiv.org/abs/2405.20271v1","updated":"2024-05-30T17:26:02Z","published":"2024-05-30T17:26:02Z","title":"ETHER: Efficient Finetuning of Large-Scale Models with Hyperplane\n  Reflections","summary":"  Parameter-efficient finetuning (PEFT) has become ubiquitous to adapt\nfoundation models to downstream task requirements while retaining their\ngeneralization ability. However, the amount of additionally introduced\nparameters and compute for successful adaptation and hyperparameter searches\ncan explode quickly, especially when deployed at scale to serve numerous\nindividual requests. To ensure effective, parameter-efficient, and\nhyperparameter-robust adaptation, we propose the ETHER transformation family,\nwhich performs Efficient fineTuning via HypErplane Reflections. By design,\nETHER transformations require a minimal number of parameters, are less likely\nto deteriorate model performance, and exhibit robustness to hyperparameter and\nlearning rate choices. In particular, we introduce ETHER and its relaxation\nETHER+, which match or outperform existing PEFT methods with significantly\nfewer parameters ($\\sim$$10$-$100$ times lower than LoRA or OFT) across\nmultiple image synthesis and natural language tasks without exhaustive\nhyperparameter tuning. Finally, we investigate the recent emphasis on\nHyperspherical Energy retention for adaptation and raise questions on its\npractical utility. The code is available at https://github.com/mwbini/ether.\n","authors":["Massimo Bini","Karsten Roth","Zeynep Akata","Anna Khoreva"],"pdf_url":"https://arxiv.org/pdf/2405.20271v1.pdf","comment":"Accepted to ICML 2024. Code available at\n  https://github.com/mwbini/ether"},{"id":"http://arxiv.org/abs/2405.20269v1","updated":"2024-05-30T17:21:15Z","published":"2024-05-30T17:21:15Z","title":"IsraParlTweet: The Israeli Parliamentary and Twitter Resource","summary":"  We introduce IsraParlTweet, a new linked corpus of Hebrew-language\nparliamentary discussions from the Knesset (Israeli Parliament) between the\nyears 1992-2023 and Twitter posts made by Members of the Knesset between the\nyears 2008-2023, containing a total of 294.5 million Hebrew tokens. In addition\nto raw text, the corpus contains comprehensive metadata on speakers and Knesset\nsessions as well as several linguistic annotations. As a result, IsraParlTweet\ncan be used to conduct a wide variety of quantitative and qualitative analyses\nand provide valuable insights into political discourse in Israel.\n","authors":["Guy Mor-Lan","Effi Levi","Tamir Sheafer","Shaul R. Shenhav"],"pdf_url":"https://arxiv.org/pdf/2405.20269v1.pdf","comment":"Presented at LREC-COLING 2024"},{"id":"http://arxiv.org/abs/2405.20267v1","updated":"2024-05-30T17:19:19Z","published":"2024-05-30T17:19:19Z","title":"Auto Arena of LLMs: Automating LLM Evaluations with Agent Peer-battles\n  and Committee Discussions","summary":"  As LLMs evolve on a daily basis, there is an urgent need for a trustworthy\nevaluation method that can provide robust evaluation results in a timely\nfashion. Currently, as static benchmarks are prone to contamination concerns,\nusers tend to trust human voting platforms, such as Chatbot Arena. However,\nhuman annotations require extensive manual efforts. To provide an automatic,\nrobust, and trustworthy evaluation framework, we innovatively propose the\nAuto-Arena of LLMs, which automates the entire evaluation process with LLM\nagents. Firstly, an examiner LLM devises queries. Then, a pair of candidate\nLLMs engage in a multi-round peer-battle around the query, during which the\nLLM's true performance gaps become visible. Finally, a committee of LLM judges\ncollectively discuss and determine the winner, which alleviates bias and\npromotes fairness. In our extensive experiment on the 17 newest LLMs,\nAuto-Arena shows the highest correlation with human preferences, providing a\npromising alternative to human evaluation platforms.\n","authors":["Ruochen Zhao","Wenxuan Zhang","Yew Ken Chia","Deli Zhao","Lidong Bing"],"pdf_url":"https://arxiv.org/pdf/2405.20267v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.19327v2","updated":"2024-05-30T17:17:21Z","published":"2024-05-29T17:57:16Z","title":"MAP-Neo: Highly Capable and Transparent Bilingual Large Language Model\n  Series","summary":"  Large Language Models (LLMs) have made great strides in recent years to\nachieve unprecedented performance across different tasks. However, due to\ncommercial interest, the most competitive models like GPT, Gemini, and Claude\nhave been gated behind proprietary interfaces without disclosing the training\ndetails. Recently, many institutions have open-sourced several strong LLMs like\nLLaMA-3, comparable to existing closed-source LLMs. However, only the model's\nweights are provided with most details (e.g., intermediate checkpoints,\npre-training corpus, and training code, etc.) being undisclosed. To improve the\ntransparency of LLMs, the research community has formed to open-source truly\nopen LLMs (e.g., Pythia, Amber, OLMo), where more details (e.g., pre-training\ncorpus and training code) are being provided. These models have greatly\nadvanced the scientific study of these large models including their strengths,\nweaknesses, biases and risks. However, we observe that the existing truly open\nLLMs on reasoning, knowledge, and coding tasks are still inferior to existing\nstate-of-the-art LLMs with similar model sizes. To this end, we open-source\nMAP-Neo, a highly capable and transparent bilingual language model with 7B\nparameters trained from scratch on 4.5T high-quality tokens. Our MAP-Neo is the\nfirst fully open-sourced bilingual LLM with comparable performance compared to\nexisting state-of-the-art LLMs. Moreover, we open-source all details to\nreproduce our MAP-Neo, where the cleaned pre-training corpus, data cleaning\npipeline, checkpoints, and well-optimized training/evaluation framework are\nprovided. Finally, we hope our MAP-Neo will enhance and strengthen the open\nresearch community and inspire more innovations and creativities to facilitate\nthe further improvements of LLMs.\n","authors":["Ge Zhang","Scott Qu","Jiaheng Liu","Chenchen Zhang","Chenghua Lin","Chou Leuang Yu","Danny Pan","Esther Cheng","Jie Liu","Qunshu Lin","Raven Yuan","Tuney Zheng","Wei Pang","Xinrun Du","Yiming Liang","Yinghao Ma","Yizhi Li","Ziyang Ma","Bill Lin","Emmanouil Benetos","Huan Yang","Junting Zhou","Kaijing Ma","Minghao Liu","Morry Niu","Noah Wang","Quehry Que","Ruibo Liu","Sine Liu","Shawn Guo","Soren Gao","Wangchunshu Zhou","Xinyue Zhang","Yizhi Zhou","Yubo Wang","Yuelin Bai","Yuhan Zhang","Yuxiang Zhang","Zenith Wang","Zhenzhu Yang","Zijian Zhao","Jiajun Zhang","Wanli Ouyang","Wenhao Huang","Wenhu Chen"],"pdf_url":"https://arxiv.org/pdf/2405.19327v2.pdf","comment":"https://map-neo.github.io/"},{"id":"http://arxiv.org/abs/2310.12815v2","updated":"2024-05-30T17:09:56Z","published":"2023-10-19T15:12:09Z","title":"Formalizing and Benchmarking Prompt Injection Attacks and Defenses","summary":"  A prompt injection attack aims to inject malicious instruction/data into the\ninput of an LLM-Integrated Application such that it produces results as an\nattacker desires. Existing works are limited to case studies. As a result, the\nliterature lacks a systematic understanding of prompt injection attacks and\ntheir defenses. We aim to bridge the gap in this work. In particular, we\npropose a framework to formalize prompt injection attacks. Existing attacks are\nspecial cases in our framework. Moreover, based on our framework, we design a\nnew attack by combining existing ones. Using our framework, we conduct a\nsystematic evaluation on 5 prompt injection attacks and 10 defenses with 10\nLLMs and 7 tasks. Our work provides a common benchmark for quantitatively\nevaluating future prompt injection attacks and defenses. To facilitate research\non this topic, we make our platform public at\nhttps://github.com/liu00222/Open-Prompt-Injection.\n","authors":["Yupei Liu","Yuqi Jia","Runpeng Geng","Jinyuan Jia","Neil Zhenqiang Gong"],"pdf_url":"https://arxiv.org/pdf/2310.12815v2.pdf","comment":"To appear in USENIX Security Symposium 2024"},{"id":"http://arxiv.org/abs/2405.20253v1","updated":"2024-05-30T17:06:03Z","published":"2024-05-30T17:06:03Z","title":"Evaluating Large Language Model Biases in Persona-Steered Generation","summary":"  The task of persona-steered text generation requires large language models\n(LLMs) to generate text that reflects the distribution of views that an\nindividual fitting a persona could have. People have multifaceted personas, but\nprior work on bias in LLM-generated opinions has only explored multiple-choice\nsettings or one-dimensional personas. We define an incongruous persona as a\npersona with multiple traits where one trait makes its other traits less likely\nin human survey data, e.g. political liberals who support increased military\nspending. We find that LLMs are 9.7% less steerable towards incongruous\npersonas than congruous ones, sometimes generating the stereotypical stance\nassociated with its demographic rather than the target stance. Models that we\nevaluate that are fine-tuned with Reinforcement Learning from Human Feedback\n(RLHF) are more steerable, especially towards stances associated with political\nliberals and women, but present significantly less diverse views of personas.\nWe also find variance in LLM steerability that cannot be predicted from\nmultiple-choice opinion evaluation. Our results show the importance of\nevaluating models in open-ended text generation, as it can surface new LLM\nopinion biases. Moreover, such a setup can shed light on our ability to steer\nmodels toward a richer and more diverse range of viewpoints.\n","authors":["Andy Liu","Mona Diab","Daniel Fried"],"pdf_url":"https://arxiv.org/pdf/2405.20253v1.pdf","comment":"Accepted to Findings of ACL 2024. Code and data available at\n  https://github.com/andyjliu/persona-steered-generation-bias"},{"id":"http://arxiv.org/abs/2405.20252v1","updated":"2024-05-30T17:05:45Z","published":"2024-05-30T17:05:45Z","title":"Towards Hierarchical Multi-Agent Workflows for Zero-Shot Prompt\n  Optimization","summary":"  Large language models (LLMs) have shown great progress in responding to user\nquestions, allowing for a multitude of diverse applications. Yet, the quality\nof LLM outputs heavily depends on the prompt design, where a good prompt might\nenable the LLM to answer a very challenging question correctly. Therefore,\nrecent works have developed many strategies for improving the prompt, including\nboth manual crafting and in-domain optimization. However, their efficacy in\nunrestricted scenarios remains questionable, as the former depends on human\ndesign for specific questions and the latter usually generalizes poorly to\nunseen scenarios. To address these problems, we give LLMs the freedom to design\nthe best prompts according to themselves. Specifically, we include a hierarchy\nof LLMs, first constructing a prompt with precise instructions and accurate\nwording in a hierarchical manner, and then using this prompt to generate the\nfinal answer to the user query. We term this pipeline Hierarchical Multi-Agent\nWorkflow, or HMAW. In contrast with prior works, HMAW imposes no human\nrestriction and requires no training, and is completely task-agnostic while\ncapable of adjusting to the nuances of the underlying task. Through both\nquantitative and qualitative experiments across multiple benchmarks, we verify\nthat despite its simplicity, the proposed approach can create detailed and\nsuitable prompts, further boosting the performance of current LLMs.\n","authors":["Yuchi Liu","Jaskirat Singh","Gaowen Liu","Ali Payani","Liang Zheng"],"pdf_url":"https://arxiv.org/pdf/2405.20252v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.12715v2","updated":"2024-05-30T16:59:56Z","published":"2024-04-19T08:52:22Z","title":"Ensemble Learning for Heterogeneous Large Language Models with Deep\n  Parallel Collaboration","summary":"  Large language models (LLMs) exhibit complementary strengths in various\ntasks, motivating the research of LLM ensembling. However, existing work\nfocuses on training an extra reward model or fusion model to select or combine\nall candidate answers, posing a great challenge to the generalization on unseen\ndata distributions. Besides, prior methods use textual responses as\ncommunication media, ignoring the valuable information in the internal\nrepresentations. In this work, we propose a training-free ensemble framework\nDeePEn, fusing the informative probability distributions yielded by different\nLLMs at each decoding step. Unfortunately, the vocabulary discrepancy between\nheterogeneous LLMs directly makes averaging the distributions unfeasible due to\nthe token misalignment. To address this challenge, DeePEn maps the probability\ndistribution of each model from its own probability space to a universal\nrelative space based on the relative representation theory, and performs\naggregation. Next, we devise a search-based inverse transformation to transform\nthe aggregated result back to the probability space of one of the ensembling\nLLMs (main model), in order to determine the next token. We conduct extensive\nexperiments on ensembles of different number of LLMs, ensembles of LLMs with\ndifferent architectures, and ensembles between the LLM and the specialist\nmodel. Experimental results show that (i) DeePEn achieves consistent\nimprovements across six benchmarks covering subject examination, reasoning, and\nknowledge, (ii) a well-performing specialist model can benefit from a less\neffective LLM through distribution fusion, and (iii) DeePEn has complementary\nstrengths with other ensemble methods such as voting.\n","authors":["Yichong Huang","Xiaocheng Feng","Baohang Li","Yang Xiang","Hui Wang","Bing Qin","Ting Liu"],"pdf_url":"https://arxiv.org/pdf/2404.12715v2.pdf","comment":"16 pages, 9 figures, 9 tables"},{"id":"http://arxiv.org/abs/2405.20245v1","updated":"2024-05-30T16:54:42Z","published":"2024-05-30T16:54:42Z","title":"Retrieval Augmented Structured Generation: Business Document Information\n  Extraction As Tool Use","summary":"  Business Document Information Extraction (BDIE) is the problem of\ntransforming a blob of unstructured information (raw text, scanned documents,\netc.) into a structured format that downstream systems can parse and use. It\nhas two main tasks: Key-Information Extraction (KIE) and Line Items Recognition\n(LIR). In this paper, we argue that BDIE is best modeled as a Tool Use problem,\nwhere the tools are these downstream systems. We then present Retrieval\nAugmented Structured Generation (RASG), a novel general framework for BDIE that\nachieves state of the art (SOTA) results on both KIE and LIR tasks on BDIE\nbenchmarks.\n  The contributions of this paper are threefold: (1) We show, with ablation\nbenchmarks, that Large Language Models (LLMs) with RASG are already competitive\nwith or surpasses current SOTA Large Multimodal Models (LMMs) without RASG on\nBDIE benchmarks. (2) We propose a new metric class for Line Items Recognition,\nGeneral Line Items Recognition Metric (GLIRM), that is more aligned with\npractical BDIE use cases compared to existing metrics, such as ANLS*, DocILE,\nand GriTS. (3) We provide a heuristic algorithm for backcalculating bounding\nboxes of predicted line items and tables without the need for vision encoders.\nFinally, we claim that, while LMMs might sometimes offer marginal performance\nbenefits, LLMs + RASG is oftentimes superior given real-world applications and\nconstraints of BDIE.\n","authors":["Franz Louis Cesista","Rui Aguiar","Jason Kim","Paolo Acilo"],"pdf_url":"https://arxiv.org/pdf/2405.20245v1.pdf","comment":"Accepted by IEEE 7th International Conference on Multimedia\n  Information Processing and Retrieval (MIPR), 2024"},{"id":"http://arxiv.org/abs/2205.15744v2","updated":"2024-05-30T16:40:52Z","published":"2022-05-31T12:29:25Z","title":"EMS: Efficient and Effective Massively Multilingual Sentence Embedding\n  Learning","summary":"  Massively multilingual sentence representation models, e.g., LASER,\nSBERT-distill, and LaBSE, help significantly improve cross-lingual downstream\ntasks. However, the use of a large amount of data or inefficient model\narchitectures results in heavy computation to train a new model according to\nour preferred languages and domains. To resolve this issue, we introduce\nefficient and effective massively multilingual sentence embedding (EMS), using\ncross-lingual token-level reconstruction (XTR) and sentence-level contrastive\nlearning as training objectives. Compared with related studies, the proposed\nmodel can be efficiently trained using significantly fewer parallel sentences\nand GPU computation resources. Empirical results showed that the proposed model\nsignificantly yields better or comparable results with regard to cross-lingual\nsentence retrieval, zero-shot cross-lingual genre classification, and sentiment\nclassification. Ablative analyses demonstrated the efficiency and effectiveness\nof each component of the proposed model. We release the codes for model\ntraining and the EMS pre-trained sentence embedding model, which supports 62\nlanguages ( https://github.com/Mao-KU/EMS ).\n","authors":["Zhuoyuan Mao","Chenhui Chu","Sadao Kurohashi"],"pdf_url":"https://arxiv.org/pdf/2205.15744v2.pdf","comment":"This work is a multilingual extension of arXiv:2105.13856. This work\n  has been accepted by IEEE/ACM Transactions on Audio, Speech, and Language\n  Processing (DOI: 10.1109/TASLP.2024.3402064). Copyright has been transferred"},{"id":"http://arxiv.org/abs/2402.14800v2","updated":"2024-05-30T16:24:16Z","published":"2024-02-22T18:56:07Z","title":"Not All Experts are Equal: Efficient Expert Pruning and Skipping for\n  Mixture-of-Experts Large Language Models","summary":"  A pivotal advancement in the progress of large language models (LLMs) is the\nemergence of the Mixture-of-Experts (MoE) LLMs. Compared to traditional LLMs,\nMoE LLMs can achieve higher performance with fewer parameters, but it is still\nhard to deploy them due to their immense parameter sizes. Different from\nprevious weight pruning methods that rely on specifically designed hardware,\nthis paper mainly aims to enhance the deployment efficiency of MoE LLMs by\nintroducing plug-and-play expert-level sparsification techniques. Specifically,\nwe propose, for the first time to our best knowledge, post-training approaches\nfor task-agnostic and task-specific expert pruning and skipping of MoE LLMs,\ntailored to improve deployment efficiency while maintaining model performance\nacross a wide range of tasks. Extensive experiments show that our proposed\nmethods can simultaneously reduce model sizes and increase the inference speed,\nwhile maintaining satisfactory performance. Data and code will be available at\nhttps://github.com/Lucky-Lance/Expert_Sparsity.\n","authors":["Xudong Lu","Qi Liu","Yuhui Xu","Aojun Zhou","Siyuan Huang","Bo Zhang","Junchi Yan","Hongsheng Li"],"pdf_url":"https://arxiv.org/pdf/2402.14800v2.pdf","comment":"Mixture-of-Experts Large Language Models, ACL2024"},{"id":"http://arxiv.org/abs/2405.20215v1","updated":"2024-05-30T16:17:40Z","published":"2024-05-30T16:17:40Z","title":"TS-Align: A Teacher-Student Collaborative Framework for Scalable\n  Iterative Finetuning of Large Language Models","summary":"  Mainstream approaches to aligning large language models (LLMs) heavily rely\non human preference data, particularly when models require periodic updates.\nThe standard process for iterative alignment of LLMs involves collecting new\nhuman feedback for each update. However, the data collection process is costly\nand challenging to scale. To address this issue, we introduce the \"TS-Align\"\nframework, which fine-tunes a policy model using pairwise feedback data\nautomatically mined from its outputs. This automatic mining process is\nefficiently accomplished through the collaboration between a large-scale\nteacher model and a small-scale student model. The policy fine-tuning process\ncan be iteratively repeated using on-policy generations within our proposed\nteacher-student collaborative framework. Through extensive experiments, we\ndemonstrate that our final aligned policy outperforms the base policy model\nwith an average win rate of 69.7% across seven conversational or\ninstruction-following datasets. Furthermore, we show that the ranking\ncapability of the teacher is effectively distilled into the student through our\npipeline, resulting in a small-scale yet effective reward model for policy\nmodel alignment.\n","authors":["Chen Zhang","Chengguang Tang","Dading Chong","Ke Shi","Guohua Tang","Feng Jiang","Haizhou Li"],"pdf_url":"https://arxiv.org/pdf/2405.20215v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.20213v1","updated":"2024-05-30T16:16:25Z","published":"2024-05-30T16:16:25Z","title":"PostDoc: Generating Poster from a Long Multimodal Document Using Deep\n  Submodular Optimization","summary":"  A poster from a long input document can be considered as a one-page\neasy-to-read multimodal (text and images) summary presented on a nice template\nwith good design elements. Automatic transformation of a long document into a\nposter is a very less studied but challenging task. It involves content\nsummarization of the input document followed by template generation and\nharmonization. In this work, we propose a novel deep submodular function which\ncan be trained on ground truth summaries to extract multimodal content from the\ndocument and explicitly ensures good coverage, diversity and alignment of text\nand images. Then, we use an LLM based paraphraser and propose to generate a\ntemplate with various design aspects conditioned on the input content. We show\nthe merits of our approach through extensive automated and human evaluations.\n","authors":["Vijay Jaisankar","Sambaran Bandyopadhyay","Kalp Vyas","Varre Chaitanya","Shwetha Somasundaram"],"pdf_url":"https://arxiv.org/pdf/2405.20213v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.20204v1","updated":"2024-05-30T16:07:54Z","published":"2024-05-30T16:07:54Z","title":"Jina CLIP: Your CLIP Model Is Also Your Text Retriever","summary":"  Contrastive Language-Image Pretraining (CLIP) is widely used to train models\nto align images and texts in a common embedding space by mapping them to\nfixed-sized vectors. These models are key to multimodal information retrieval\nand related tasks. However, CLIP models generally underperform in text-only\ntasks compared to specialized text models. This creates inefficiencies for\ninformation retrieval systems that keep separate embeddings and models for\ntext-only and multimodal tasks. We propose a novel, multi-task contrastive\ntraining method to address this issue, which we use to train the jina-clip-v1\nmodel to achieve the state-of-the-art performance on both text-image and\ntext-text retrieval tasks.\n","authors":["Andreas Koukounas","Georgios Mastrapas","Michael Günther","Bo Wang","Scott Martens","Isabelle Mohr","Saba Sturua","Mohammad Kalim Akram","Joan Fontanals Martínez","Saahil Ognawala","Susana Guzman","Maximilian Werk","Nan Wang","Han Xiao"],"pdf_url":"https://arxiv.org/pdf/2405.20204v1.pdf","comment":"4 pages, ICML2024 workshop submission"},{"id":"http://arxiv.org/abs/2405.20192v1","updated":"2024-05-30T15:57:19Z","published":"2024-05-30T15:57:19Z","title":"TAIA: Large Language Models are Out-of-Distribution Data Learners","summary":"  Fine-tuning on task-specific question-answer pairs is a predominant method\nfor enhancing the performance of instruction-tuned large language models (LLMs)\non downstream tasks. However, in certain specialized domains, such as\nhealthcare or harmless content generation, it is nearly impossible to obtain a\nlarge volume of high-quality data that matches the downstream distribution. To\nimprove the performance of LLMs in data-scarce domains with domain-mismatched\ndata, we re-evaluated the Transformer architecture and discovered that not all\nparameter updates during fine-tuning contribute positively to downstream\nperformance. Our analysis reveals that within the self-attention and\nfeed-forward networks, only the fine-tuned attention parameters are\nparticularly beneficial when the training set's distribution does not fully\nalign with the test set. Based on this insight, we propose an effective\ninference-time intervention method: \\uline{T}raining \\uline{A}ll parameters but\n\\uline{I}nferring with only \\uline{A}ttention (\\trainallInfAttn). We\nempirically validate \\trainallInfAttn using two general instruction-tuning\ndatasets and evaluate it on seven downstream tasks involving math, reasoning,\nand knowledge understanding across LLMs of different parameter sizes and\nfine-tuning techniques. Our comprehensive experiments demonstrate that\n\\trainallInfAttn achieves superior improvements compared to both the fully\nfine-tuned model and the base model in most scenarios, with significant\nperformance gains. The high tolerance of \\trainallInfAttn to data mismatches\nmakes it resistant to jailbreaking tuning and enhances specialized tasks using\ngeneral data.\n","authors":["Shuyang Jiang","Yusheng Liao","Ya Zhang","Yu Wang","Yanfeng Wang"],"pdf_url":"https://arxiv.org/pdf/2405.20192v1.pdf","comment":"25 pages"},{"id":"http://arxiv.org/abs/2405.20179v1","updated":"2024-05-30T15:47:54Z","published":"2024-05-30T15:47:54Z","title":"Robo-Instruct: Simulator-Augmented Instruction Alignment For Finetuning\n  CodeLLMs","summary":"  Large language models (LLMs) have shown great promise at generating robot\nprograms from natural language given domain-specific robot application\nprogramming interfaces (APIs). However, the performance gap between proprietary\nLLMs and smaller open-weight LLMs remains wide. This raises a question: Can we\nfine-tune smaller open-weight LLMs for generating domain-specific robot\nprograms to close the performance gap with proprietary LLMs? While\nSelf-Instruct is a promising solution by generating a diverse set of training\ndata, it cannot verify the correctness of these programs. In contrast, a robot\nsimulator with a well-defined world can identify execution errors but limits\nthe diversity of programs that it can verify. In this work, we introduce\nRobo-Instruct, which brings the best of both worlds -- it promotes the\ndiversity of Self-Instruct while providing the correctness of simulator-based\nchecking. Robo-Instruct introduces RoboSim to synthesize a consistent world\nstate on the fly by inferring properties relevant to the program being checked,\nand simulating actions accordingly. Furthermore, the instructions and programs\ngenerated by Self-Instruct may be subtly inconsistent -- such as the program\nmissing a step implied by the instruction. Robo-Instruct further addresses this\nwith InstAlign, an instruction-program alignment procedure that revises the\ntask instruction to reflect the actual results of the generated program. Given\na few seed task descriptions and the robot APIs, Robo-Instruct is capable of\ngenerating a training dataset using only a small open-weight model. This\ndataset can then be used to fine-tune small open-weight language models,\nenabling them to match or even exceed the performance of several proprietary\nLLMs, such as GPT-3.5-Turbo and Gemini-Pro.\n","authors":["Zichao Hu","Junyi Jessy Li","Arjun Guha","Joydeep Biswas"],"pdf_url":"https://arxiv.org/pdf/2405.20179v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.11505v2","updated":"2024-05-30T15:46:10Z","published":"2024-02-18T08:32:59Z","title":"Federated Fine-tuning of Large Language Models under Heterogeneous Tasks\n  and Client Resources","summary":"  Federated Learning (FL) has recently been applied to the parameter-efficient\nfine-tuning of Large Language Models (LLMs). While promising, it raises\nsignificant challenges due to the heterogeneous resources and data\ndistributions of clients. This study introduces FlexLoRA, a simple yet\neffective aggregation scheme for LLM fine-tuning, which mitigates the ``bucket\neffect'' in traditional FL that restricts the potential of clients with ample\nresources by tying them to the capabilities of the least-resourced\nparticipants. FlexLoRA allows for dynamic adjustment of local LoRA ranks,\nfostering the development of a global model imbued with broader, less\ntask-specific knowledge. By synthesizing a full-size LoRA weight from\nindividual client contributions and employing Singular Value Decomposition\n(SVD) for weight redistribution, FlexLoRA fully leverages heterogeneous client\nresources. Involving thousands of clients performing heterogeneous NLP tasks\nand client resources, our experiments validate the efficacy of FlexLoRA, with\nthe federated global model achieving consistently better improvement over SOTA\nFL methods in downstream NLP task performance across various heterogeneous\ndistributions. FlexLoRA's practicality is further underscored by our\ntheoretical analysis and its seamless integration with existing LoRA-based FL\nmethods, offering a path toward cross-device, privacy-preserving federated\ntuning for LLMs.\n","authors":["Jiamu Bai","Daoyuan Chen","Bingchen Qian","Liuyi Yao","Yaliang Li"],"pdf_url":"https://arxiv.org/pdf/2402.11505v2.pdf","comment":"19 pages, 13 tables, 9 figures"},{"id":"http://arxiv.org/abs/2405.20175v1","updated":"2024-05-30T15:45:13Z","published":"2024-05-30T15:45:13Z","title":"InstructionCP: A fast approach to transfer Large Language Models into\n  target language","summary":"  The rapid development of large language models (LLMs) in recent years has\nlargely focused on English, resulting in models that respond exclusively in\nEnglish. To adapt these models to other languages, continual pre-training (CP)\nis often employed, followed by supervised fine-tuning (SFT) to maintain\nconversational abilities. However, CP and SFT can reduce a model's ability to\nfilter harmful content. We propose Instruction Continual Pre-training (InsCP),\nwhich integrates instruction tags into the CP process to prevent loss of\nconversational proficiency while acquiring new languages. Our experiments\ndemonstrate that InsCP retains conversational and Reinforcement Learning from\nHuman Feedback (RLHF) abilities. Empirical evaluations on language alignment,\nreliability, and knowledge benchmarks confirm the efficacy of InsCP. Notably,\nthis approach requires only 0.1 billion tokens of high-quality\ninstruction-following data, thereby reducing resource consumption.\n","authors":["Kuang-Ming Chen","Hung-yi Lee"],"pdf_url":"https://arxiv.org/pdf/2405.20175v1.pdf","comment":"10 pages, 1 figure"},{"id":"http://arxiv.org/abs/2402.15159v3","updated":"2024-05-30T15:44:51Z","published":"2024-02-23T07:43:26Z","title":"Machine Unlearning of Pre-trained Large Language Models","summary":"  This study investigates the concept of the `right to be forgotten' within the\ncontext of large language models (LLMs). We explore machine unlearning as a\npivotal solution, with a focus on pre-trained models--a notably\nunder-researched area. Our research delineates a comprehensive framework for\nmachine unlearning in pre-trained LLMs, encompassing a critical analysis of\nseven diverse unlearning methods. Through rigorous evaluation using curated\ndatasets from arXiv, books, and GitHub, we establish a robust benchmark for\nunlearning performance, demonstrating that these methods are over $10^5$ times\nmore computationally efficient than retraining. Our results show that\nintegrating gradient ascent with gradient descent on in-distribution data\nimproves hyperparameter robustness. We also provide detailed guidelines for\nefficient hyperparameter tuning in the unlearning process. Our findings advance\nthe discourse on ethical AI practices, offering substantive insights into the\nmechanics of machine unlearning for pre-trained LLMs and underscoring the\npotential for responsible AI development.\n","authors":["Jin Yao","Eli Chien","Minxin Du","Xinyao Niu","Tianhao Wang","Zezhou Cheng","Xiang Yue"],"pdf_url":"https://arxiv.org/pdf/2402.15159v3.pdf","comment":"ACL 2024 main. Code and data at\n  https://github.com/yaojin17/Unlearning_LLM"},{"id":"http://arxiv.org/abs/2405.20172v1","updated":"2024-05-30T15:44:27Z","published":"2024-05-30T15:44:27Z","title":"Iterative Feature Boosting for Explainable Speech Emotion Recognition","summary":"  In speech emotion recognition (SER), using predefined features without\nconsidering their practical importance may lead to high dimensional datasets,\nincluding redundant and irrelevant information. Consequently, high-dimensional\nlearning often results in decreasing model accuracy while increasing\ncomputational complexity. Our work underlines the importance of carefully\nconsidering and analyzing features in order to build efficient SER systems. We\npresent a new supervised SER method based on an efficient feature engineering\napproach. We pay particular attention to the explainability of results to\nevaluate feature relevance and refine feature sets. This is performed\niteratively through feature evaluation loop, using Shapley values to boost\nfeature selection and improve overall framework performance. Our approach\nallows thus to balance the benefits between model performance and transparency.\nThe proposed method outperforms human-level performance (HLP) and\nstate-of-the-art machine learning methods in emotion recognition on the TESS\ndataset.\n","authors":["Alaa Nfissi","Wassim Bouachir","Nizar Bouguila","Brian Mishara"],"pdf_url":"https://arxiv.org/pdf/2405.20172v1.pdf","comment":"Published in: 2023 International Conference on Machine Learning and\n  Applications (ICMLA)"},{"id":"http://arxiv.org/abs/2405.20163v1","updated":"2024-05-30T15:38:54Z","published":"2024-05-30T15:38:54Z","title":"Reasoning about concepts with LLMs: Inconsistencies abound","summary":"  The ability to summarize and organize knowledge into abstract concepts is key\nto learning and reasoning. Many industrial applications rely on the consistent\nand systematic use of concepts, especially when dealing with decision-critical\nknowledge. However, we demonstrate that, when methodically questioned, large\nlanguage models (LLMs) often display and demonstrate significant\ninconsistencies in their knowledge. Computationally, the basic aspects of the\nconceptualization of a given domain can be represented as Is-A hierarchies in a\nknowledge graph (KG) or ontology, together with a few properties or axioms that\nenable straightforward reasoning. We show that even simple ontologies can be\nused to reveal conceptual inconsistencies across several LLMs. We also propose\nstrategies that domain experts can use to evaluate and improve the coverage of\nkey domain concepts in LLMs of various sizes. In particular, we have been able\nto significantly enhance the performance of LLMs of various sizes with openly\navailable weights using simple knowledge-graph (KG) based prompting strategies.\n","authors":["Rosario Uceda-Sosa","Karthikeyan Natesan Ramamurthy","Maria Chang","Moninder Singh"],"pdf_url":"https://arxiv.org/pdf/2405.20163v1.pdf","comment":"15 pages, 5 figures, 3 tables"},{"id":"http://arxiv.org/abs/2405.09983v2","updated":"2024-05-30T15:34:10Z","published":"2024-05-16T11:01:09Z","title":"Zero-Shot Hierarchical Classification on the Common Procurement\n  Vocabulary Taxonomy","summary":"  Classifying public tenders is a useful task for both companies that are\ninvited to participate and for inspecting fraudulent activities. To facilitate\nthe task for both participants and public administrations, the European Union\npresented a common taxonomy (Common Procurement Vocabulary, CPV) which is\nmandatory for tenders of certain importance; however, the contracts in which a\nCPV label is mandatory are the minority compared to all the Public\nAdministrations activities. Classifying over a real-world taxonomy introduces\nsome difficulties that can not be ignored. First of all, some fine-grained\nclasses have an insufficient (if any) number of observations in the training\nset, while other classes are far more frequent (even thousands of times) than\nthe average. To overcome those difficulties, we present a zero-shot approach,\nbased on a pre-trained language model that relies only on label description and\nrespects the label taxonomy. To train our proposed model, we used industrial\ndata, which comes from contrattipubblici.org, a service by SpazioDati s.r.l.\nthat collects public contracts stipulated in Italy in the last 25 years.\nResults show that the proposed model achieves better performance in classifying\nlow-frequent classes compared to three different baselines, and is also able to\npredict never-seen classes.\n","authors":["Federico Moiraghi","Matteo Palmonari","Davide Allavena","Federico Morando"],"pdf_url":"https://arxiv.org/pdf/2405.09983v2.pdf","comment":"Full-length version of the short paper accepted at COMPSAC 2024"},{"id":"http://arxiv.org/abs/2405.20145v1","updated":"2024-05-30T15:23:34Z","published":"2024-05-30T15:23:34Z","title":"Heidelberg-Boston @ SIGTYP 2024 Shared Task: Enhancing Low-Resource\n  Language Analysis With Character-Aware Hierarchical Transformers","summary":"  Historical languages present unique challenges to the NLP community, with one\nprominent hurdle being the limited resources available in their closed corpora.\nThis work describes our submission to the constrained subtask of the SIGTYP\n2024 shared task, focusing on PoS tagging, morphological tagging, and\nlemmatization for 13 historical languages. For PoS and morphological tagging we\nadapt a hierarchical tokenization method from Sun et al. (2023) and combine it\nwith the advantages of the DeBERTa-V3 architecture, enabling our models to\nefficiently learn from every character in the training data. We also\ndemonstrate the effectiveness of character-level T5 models on the lemmatization\ntask. Pre-trained from scratch with limited data, our models achieved first\nplace in the constrained subtask, nearly reaching the performance levels of the\nunconstrained task's winner. Our code is available at\nhttps://github.com/bowphs/SIGTYP-2024-hierarchical-transformers\n","authors":["Frederick Riemenschneider","Kevin Krahn"],"pdf_url":"https://arxiv.org/pdf/2405.20145v1.pdf","comment":"Accepted for publication at the 6th Workshop on Research in\n  Computational Linguistic Typology and Multilingual NLP (SIGTYP-WS) 2024; 11\n  pages, 1 figure, 9 tables"},{"id":"http://arxiv.org/abs/2405.17503v2","updated":"2024-05-30T15:20:19Z","published":"2024-05-26T04:00:30Z","title":"Code Repair with LLMs gives an Exploration-Exploitation Tradeoff","summary":"  Iteratively improving and repairing source code with large language models\n(LLMs), known as refinement, has emerged as a popular way of generating\nprograms that would be too complex to construct in one shot. Given a bank of\ntest cases, together with a candidate program, an LLM can improve that program\nby being prompted with failed test cases. But it remains an open question how\nto best iteratively refine code, with prior work employing simple greedy or\nbreadth-first strategies. We show here that refinement exposes an\nexplore-exploit tradeoff: exploit by refining the program that passes the most\ntest cases, or explore by refining a lesser considered program. We frame this\nas an arm-acquiring bandit problem, which we solve with Thompson Sampling. The\nresulting LLM-based program synthesis algorithm is broadly applicable: Across\nloop invariant synthesis, visual reasoning puzzles, and competition programming\nproblems, we find that our new method can solve more problems using fewer\nlanguage model calls.\n","authors":["Hao Tang","Keya Hu","Jin Peng Zhou","Sicheng Zhong","Wei-Long Zheng","Xujie Si","Kevin Ellis"],"pdf_url":"https://arxiv.org/pdf/2405.17503v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.15112v3","updated":"2024-05-30T15:17:55Z","published":"2024-03-22T11:08:48Z","title":"Text clustering with LLM embeddings","summary":"  Text clustering is an important approach for organising the growing amount of\ndigital content, helping to structure and find hidden patterns in uncategorised\ndata. However, the effectiveness of text clustering heavily relies on the\nchoice of textual embeddings and clustering algorithms. We argue that recent\nadvances in large language models (LLMs) can potentially improve this task. In\nthis research, we investigated how different textual embeddings -- particularly\nthose used in LLMs -- and clustering algorithms affect how text datasets are\nclustered. A series of experiments were conducted to assess how embeddings\ninfluence clustering results, the role played by dimensionality reduction\nthrough summarisation, and model size adjustment. Findings reveal that LLM\nembeddings excel at capturing subtleties in structured language, while BERT\nleads the lightweight options in performance. In addition, we observe that\nincreasing model dimensionality and employing summarization techniques do not\nconsistently lead to improvements in clustering efficiency, suggesting that\nthese strategies require careful analysis to use in real-life models. These\nresults highlight a complex balance between the need for refined text\nrepresentation and computational feasibility in text clustering applications.\nThis study extends traditional text clustering frameworks by incorporating\nembeddings from LLMs, providing a path for improved methodologies, while\ninforming new avenues for future research in various types of textual analysis.\n","authors":["Alina Petukhova","João P. Matos-Carvalho","Nuno Fachada"],"pdf_url":"https://arxiv.org/pdf/2403.15112v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.20139v1","updated":"2024-05-30T15:14:24Z","published":"2024-05-30T15:14:24Z","title":"GNN-RAG: Graph Neural Retrieval for Large Language Model Reasoning","summary":"  Knowledge Graphs (KGs) represent human-crafted factual knowledge in the form\nof triplets (head, relation, tail), which collectively form a graph. Question\nAnswering over KGs (KGQA) is the task of answering natural questions grounding\nthe reasoning to the information provided by the KG. Large Language Models\n(LLMs) are the state-of-the-art models for QA tasks due to their remarkable\nability to understand natural language. On the other hand, Graph Neural\nNetworks (GNNs) have been widely used for KGQA as they can handle the complex\ngraph information stored in the KG. In this work, we introduce GNN-RAG, a novel\nmethod for combining language understanding abilities of LLMs with the\nreasoning abilities of GNNs in a retrieval-augmented generation (RAG) style.\nFirst, a GNN reasons over a dense KG subgraph to retrieve answer candidates for\na given question. Second, the shortest paths in the KG that connect question\nentities and answer candidates are extracted to represent KG reasoning paths.\nThe extracted paths are verbalized and given as input for LLM reasoning with\nRAG. In our GNN-RAG framework, the GNN acts as a dense subgraph reasoner to\nextract useful graph information, while the LLM leverages its natural language\nprocessing ability for ultimate KGQA. Furthermore, we develop a retrieval\naugmentation (RA) technique to further boost KGQA performance with GNN-RAG.\nExperimental results show that GNN-RAG achieves state-of-the-art performance in\ntwo widely used KGQA benchmarks (WebQSP and CWQ), outperforming or matching\nGPT-4 performance with a 7B tuned LLM. In addition, GNN-RAG excels on multi-hop\nand multi-entity questions outperforming competing approaches by 8.9--15.5%\npoints at answer F1.\n","authors":["Costas Mavromatis","George Karypis"],"pdf_url":"https://arxiv.org/pdf/2405.20139v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.20131v1","updated":"2024-05-30T15:10:37Z","published":"2024-05-30T15:10:37Z","title":"Language Models Need Inductive Biases to Count Inductively","summary":"  Counting is a fundamental example of generalization, whether viewed through\nthe mathematical lens of Peano's axioms defining the natural numbers or the\ncognitive science literature for children learning to count. The argument holds\nfor both cases that learning to count means learning to count infinitely. While\nfew papers have tried to distill transformer \"reasoning\" to the simplest case\nof counting, investigating length generalization does occur throughout the\nliterature. In the \"train short, test long\" paradigm of NLP, length refers to\nthe training sentence length. In formal language recognition, length refers to\nthe input sequence length, or the maximum stack size induced by a pushdown\nautomata. In general problem solving, length refers to the number of hops in a\ndeductive reasoning chain or the recursion depth. For all cases, counting is\ncentral to task success. And crucially, generalizing counting inductively is\ncentral to success on OOD instances. This work provides extensive empirical\nresults on training language models to count. We experiment with architectures\nranging from RNNs, Transformers, State-Space Models and RWKV. We present\ncarefully-designed task formats, auxiliary tasks and positional embeddings to\navoid limitations in generalization with OOD-position and OOD-vocabulary. We\nfind that while traditional RNNs trivially achieve inductive counting,\nTransformers have to rely on positional embeddings to count out-of-domain. As\ncounting is the basis for many arguments concerning the expressivity of\nTransformers, our finding calls for the community to reexamine the application\nscope of primitive functions defined in formal characterizations. Finally,\nmodern RNNs also largely underperform traditional RNNs in generalizing counting\ninductively. We discuss how design choices that enable parallelized training of\nmodern RNNs cause them to lose merits of a recurrent nature.\n","authors":["Yingshan Chang","Yonatan Bisk"],"pdf_url":"https://arxiv.org/pdf/2405.20131v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.12942v5","updated":"2024-05-30T14:49:25Z","published":"2023-10-19T17:39:47Z","title":"On the Representational Capacity of Recurrent Neural Language Models","summary":"  This work investigates the computational expressivity of language models\n(LMs) based on recurrent neural networks (RNNs). Siegelmann and Sontag (1992)\nfamously showed that RNNs with rational weights and hidden states and unbounded\ncomputation time are Turing complete. However, LMs define weightings over\nstrings in addition to just (unweighted) language membership and the analysis\nof the computational power of RNN LMs (RLMs) should reflect this. We extend the\nTuring completeness result to the probabilistic case, showing how a rationally\nweighted RLM with unbounded computation time can simulate any deterministic\nprobabilistic Turing machine (PTM) with rationally weighted transitions. Since,\nin practice, RLMs work in real-time, processing a symbol at every time step, we\ntreat the above result as an upper bound on the expressivity of RLMs. We also\nprovide a lower bound by showing that under the restriction to real-time\ncomputation, such models can simulate deterministic real-time rational PTMs.\n","authors":["Franz Nowak","Anej Svete","Li Du","Ryan Cotterell"],"pdf_url":"https://arxiv.org/pdf/2310.12942v5.pdf","comment":"Added requirement for non-negative probabilities to definitions 2.3\n  and 3.1, fixed typos"},{"id":"http://arxiv.org/abs/2404.04530v2","updated":"2024-05-30T14:44:10Z","published":"2024-04-06T07:10:47Z","title":"A Morphology-Based Investigation of Positional Encodings","summary":"  Contemporary deep learning models effectively handle languages with diverse\nmorphology despite not being directly integrated into them. Morphology and word\norder are closely linked, with the latter incorporated into transformer-based\nmodels through positional encodings. This prompts a fundamental inquiry: Is\nthere a correlation between the morphological complexity of a language and the\nutilization of positional encoding in pre-trained language models? In pursuit\nof an answer, we present the first study addressing this question, encompassing\n22 languages and 5 downstream tasks. Our findings reveal that the importance of\npositional encoding diminishes with increasing morphological complexity in\nlanguages. Our study motivates the need for a deeper understanding of\npositional encoding, augmenting them to better reflect the different languages\nunder consideration.\n","authors":["Poulami Ghosh","Shikhar Vashishth","Raj Dabre","Pushpak Bhattacharyya"],"pdf_url":"https://arxiv.org/pdf/2404.04530v2.pdf","comment":"Work in Progress"},{"id":"http://arxiv.org/abs/2405.20101v1","updated":"2024-05-30T14:41:39Z","published":"2024-05-30T14:41:39Z","title":"Fill in the Gap! Combining Self-supervised Representation Learning with\n  Neural Audio Synthesis for Speech Inpainting","summary":"  Most speech self-supervised learning (SSL) models are trained with a pretext\ntask which consists in predicting missing parts of the input signal, either\nfuture segments (causal prediction) or segments masked anywhere within the\ninput (non-causal prediction). Learned speech representations can then be\nefficiently transferred to downstream tasks (e.g., automatic speech or speaker\nrecognition). In the present study, we investigate the use of a speech SSL\nmodel for speech inpainting, that is reconstructing a missing portion of a\nspeech signal from its surrounding context, i.e., fulfilling a downstream task\nthat is very similar to the pretext task. To that purpose, we combine an SSL\nencoder, namely HuBERT, with a neural vocoder, namely HiFiGAN, playing the role\nof a decoder. In particular, we propose two solutions to match the HuBERT\noutput with the HiFiGAN input, by freezing one and fine-tuning the other, and\nvice versa. Performance of both approaches was assessed in single- and\nmulti-speaker settings, for both informed and blind inpainting configurations\n(i.e., the position of the mask is known or unknown, respectively), with\ndifferent objective metrics and a perceptual evaluation. Performances show that\nif both solutions allow to correctly reconstruct signal portions up to the size\nof 200ms (and even 400ms in some cases), fine-tuning the SSL encoder provides a\nmore accurate signal reconstruction in the single-speaker setting case, while\nfreezing it (and training the neural vocoder instead) is a better strategy when\ndealing with multi-speaker data.\n","authors":["Ihab Asaad","Maxime Jacquelin","Olivier Perrotin","Laurent Girin","Thomas Hueber"],"pdf_url":"https://arxiv.org/pdf/2405.20101v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.20092v1","updated":"2024-05-30T14:31:33Z","published":"2024-05-30T14:31:33Z","title":"Divide-and-Conquer Meets Consensus: Unleashing the Power of Functions in\n  Code Generation","summary":"  Despite recent progress made by large language models in code generation,\nthey still struggle with programs that meet complex requirements. Recent work\nutilizes plan-and-solve decomposition to decrease the complexity and leverage\nself-tests to refine the generated program. Yet, planning deep-inside\nrequirements in advance can be challenging, and the tests need to be accurate\nto accomplish self-improvement. To this end, we propose FunCoder, a code\ngeneration framework incorporating the divide-and-conquer strategy with\nfunctional consensus. Specifically, FunCoder recursively branches off\nsub-functions as smaller goals during code generation, represented by a tree\nhierarchy. These sub-functions are then composited to attain more complex\nobjectives. Additionally, we designate functions via a consensus formed by\nidentifying similarities in program behavior, mitigating error propagation.\nFunCoder outperforms state-of-the-art methods by +9.8% on average in HumanEval,\nMBPP, xCodeEval and MATH with GPT-3.5 and GPT-4. Moreover, our method\ndemonstrates superiority on smaller models: With FunCoder, StableCode-3b\nsurpasses GPT-3.5 by +18.6% and achieves 97.7% of GPT-4's performance on\nHumanEval. Further analysis reveals that our proposed dynamic function\ndecomposition is capable of handling complex requirements, and the functional\nconsensus prevails over self-testing in correctness evaluation.\n","authors":["Jingchang Chen","Hongxuan Tang","Zheng Chu","Qianglong Chen","Zekun Wang","Ming Liu","Bing Qin"],"pdf_url":"https://arxiv.org/pdf/2405.20092v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2212.01032v2","updated":"2024-05-30T14:27:21Z","published":"2022-12-02T08:56:53Z","title":"Systematic Analysis for Pretrained Language Model Priming for\n  Parameter-Efficient Fine-tuning","summary":"  Parameter-efficient (PE) methods (like Prompts or Adapters) for adapting\npre-trained language models (PLM) to downstream tasks have been popular\nrecently. However, hindrances still prevent these methods from reaching their\nfull potential. For example, two significant challenges are few-shot adaptation\nand cross-task generalization. To tackle these issues, we propose a general PE\npriming framework to enhance and explore the few-shot adaptation and\ngeneralization ability of PE methods. In this framework, PLMs are primed with\nPE methods for rapidly adapting to various target tasks. To evaluate the\ngeneralization ability of these PE methods, we conduct experiments on a\nfew-shot cross-domain benchmark containing 160 diverse NLP tasks. Our\nexperiment not only reveals the best priming strategy but also verifies that\npriming facilitates the adaptation to target tasks.\n","authors":["Shih-Cheng Huang","Shih-Heng Wang","Min-Han Shih","Saurav Sahay","Hung-yi Lee"],"pdf_url":"https://arxiv.org/pdf/2212.01032v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.20089v1","updated":"2024-05-30T14:25:56Z","published":"2024-05-30T14:25:56Z","title":"The Fine-Tuning Paradox: Boosting Translation Quality Without\n  Sacrificing LLM Abilities","summary":"  Fine-tuning large language models (LLMs) for machine translation has shown\nimprovements in overall translation quality. However, it is unclear what is the\nimpact of fine-tuning on desirable LLM behaviors that are not present in neural\nmachine translation models, such as steerability, inherent document-level\ntranslation abilities, and the ability to produce less literal translations. We\nperform an extensive translation evaluation on the LLaMA and Falcon family of\nmodels with model size ranging from 7 billion up to 65 billion parameters. Our\nresults show that while fine-tuning improves the general translation quality of\nLLMs, several abilities degrade. In particular, we observe a decline in the\nability to perform formality steering, to produce technical translations\nthrough few-shot examples, and to perform document-level translation. On the\nother hand, we observe that the model produces less literal translations after\nfine-tuning on parallel data. We show that by including monolingual data as\npart of the fine-tuning data we can maintain the abilities while simultaneously\nenhancing overall translation quality. Our findings emphasize the need for\nfine-tuning strategies that preserve the benefits of LLMs for machine\ntranslation.\n","authors":["David Stap","Eva Hasler","Bill Byrne","Christof Monz","Ke Tran"],"pdf_url":"https://arxiv.org/pdf/2405.20089v1.pdf","comment":"Accepted to ACL 2024 (long, main)"},{"id":"http://arxiv.org/abs/2405.20079v1","updated":"2024-05-30T14:09:43Z","published":"2024-05-30T14:09:43Z","title":"Student Answer Forecasting: Transformer-Driven Answer Choice Prediction\n  for Language Learning","summary":"  Intelligent Tutoring Systems (ITS) enhance personalized learning by\npredicting student answers to provide immediate and customized instruction.\nHowever, recent research has primarily focused on the correctness of the answer\nrather than the student's performance on specific answer choices, limiting\ninsights into students' thought processes and potential misconceptions. To\naddress this gap, we present MCQStudentBert, an answer forecasting model that\nleverages the capabilities of Large Language Models (LLMs) to integrate\ncontextual understanding of students' answering history along with the text of\nthe questions and answers. By predicting the specific answer choices students\nare likely to make, practitioners can easily extend the model to new answer\nchoices or remove answer choices for the same multiple-choice question (MCQ)\nwithout retraining the model. In particular, we compare MLP, LSTM, BERT, and\nMistral 7B architectures to generate embeddings from students' past\ninteractions, which are then incorporated into a finetuned BERT's\nanswer-forecasting mechanism. We apply our pipeline to a dataset of language\nlearning MCQ, gathered from an ITS with over 10,000 students to explore the\npredictive accuracy of MCQStudentBert, which incorporates student interaction\npatterns, in comparison to correct answer prediction and traditional\nmastery-learning feature-based approaches. This work opens the door to more\npersonalized content, modularization, and granular support.\n","authors":["Elena Grazia Gado","Tommaso Martorella","Luca Zunino","Paola Mejia-Domenzain","Vinitra Swamy","Jibril Frej","Tanja Käser"],"pdf_url":"https://arxiv.org/pdf/2405.20079v1.pdf","comment":"Accepted as a poster paper at EDM 2024: 17th International Conference\n  on Educational Data Mining in Atlanta, USA"},{"id":"http://arxiv.org/abs/2402.03271v2","updated":"2024-05-30T14:03:35Z","published":"2024-02-05T18:28:44Z","title":"Uncertainty of Thoughts: Uncertainty-Aware Planning Enhances Information\n  Seeking in Large Language Models","summary":"  In the face of uncertainty, the ability to *seek information* is of\nfundamental importance. In many practical applications, such as medical\ndiagnosis and troubleshooting, the information needed to solve the task is not\ninitially given and has to be actively sought by asking follow-up questions\n(for example, a doctor asking a patient for more details about their symptoms).\nIn this work, we introduce Uncertainty of Thoughts (UoT), an algorithm to\naugment large language models with the ability to actively seek information by\nasking effective questions. UoT combines 1) an *uncertainty-aware simulation\napproach* which enables the model to simulate possible future scenarios and how\nlikely they are to occur, 2) *uncertainty-based rewards* motivated by\ninformation gain which incentivizes the model to seek information, and 3) a\n*reward propagation scheme* to select the optimal question to ask in a way that\nmaximizes the expected reward. In experiments on medical diagnosis,\ntroubleshooting, and the `20 Questions` game, UoT achieves an average\nperformance improvement of 38.1% in the rate of successful task completion\nacross multiple LLMs compared with direct prompting and also improves\nefficiency (i.e., the number of questions needed to complete the task). Our\ncode has been released [here](https://github.com/zhiyuanhubj/UoT)\n","authors":["Zhiyuan Hu","Chumin Liu","Xidong Feng","Yilun Zhao","See-Kiong Ng","Anh Tuan Luu","Junxian He","Pang Wei Koh","Bryan Hooi"],"pdf_url":"https://arxiv.org/pdf/2402.03271v2.pdf","comment":"Update Results"},{"id":"http://arxiv.org/abs/2402.06782v3","updated":"2024-05-30T13:59:34Z","published":"2024-02-09T21:05:01Z","title":"Debating with More Persuasive LLMs Leads to More Truthful Answers","summary":"  Common methods for aligning large language models (LLMs) with desired\nbehaviour heavily rely on human-labelled data. However, as models grow\nincreasingly sophisticated, they will surpass human expertise, and the role of\nhuman evaluation will evolve into non-experts overseeing experts. In\nanticipation of this, we ask: can weaker models assess the correctness of\nstronger models? We investigate this question in an analogous setting, where\nstronger models (experts) possess the necessary information to answer questions\nand weaker models (non-experts) lack this information. The method we evaluate\nis debate, where two LLM experts each argue for a different answer, and a\nnon-expert selects the answer. We find that debate consistently helps both\nnon-expert models and humans answer questions, achieving 76% and 88% accuracy\nrespectively (naive baselines obtain 48% and 60%). Furthermore, optimising\nexpert debaters for persuasiveness in an unsupervised manner improves\nnon-expert ability to identify the truth in debates. Our results provide\nencouraging empirical evidence for the viability of aligning models with debate\nin the absence of ground truth.\n","authors":["Akbir Khan","John Hughes","Dan Valentine","Laura Ruis","Kshitij Sachan","Ansh Radhakrishnan","Edward Grefenstette","Samuel R. Bowman","Tim Rocktäschel","Ethan Perez"],"pdf_url":"https://arxiv.org/pdf/2402.06782v3.pdf","comment":"For code please check: https://github.com/ucl-dark/llm_debate"},{"id":"http://arxiv.org/abs/2309.08952v2","updated":"2024-05-30T13:49:47Z","published":"2023-09-16T11:07:52Z","title":"Cross-Lingual Knowledge Editing in Large Language Models","summary":"  Knowledge editing aims to change language models' performance on several\nspecial cases (i.e., editing scope) by infusing the corresponding expected\nknowledge into them. With the recent advancements in large language models\n(LLMs), knowledge editing has been shown as a promising technique to adapt LLMs\nto new knowledge without retraining from scratch. However, most of the previous\nstudies neglect the multi-lingual nature of some main-stream LLMs (e.g., LLaMA,\nChatGPT and GPT-4), and typically focus on monolingual scenarios, where LLMs\nare edited and evaluated in the same language. As a result, it is still unknown\nthe effect of source language editing on a different target language. In this\npaper, we aim to figure out this cross-lingual effect in knowledge editing.\nSpecifically, we first collect a large-scale cross-lingual synthetic dataset by\ntranslating ZsRE from English to Chinese. Then, we conduct English editing on\nvarious knowledge editing methods covering different paradigms, and evaluate\ntheir performance in Chinese, and vice versa. To give deeper analyses of the\ncross-lingual effect, the evaluation includes four aspects, i.e., reliability,\ngenerality, locality and portability. Furthermore, we analyze the inconsistent\nbehaviors of the edited models and discuss their specific challenges. Data and\ncodes are available at https://github.com/krystalan/Bi_ZsRE\n","authors":["Jiaan Wang","Yunlong Liang","Zengkui Sun","Yuxuan Cao","Jiarong Xu","Fandong Meng"],"pdf_url":"https://arxiv.org/pdf/2309.08952v2.pdf","comment":"Accepted to ACL 2024 main conference"},{"id":"http://arxiv.org/abs/2306.16092v2","updated":"2024-05-30T13:46:00Z","published":"2023-06-28T10:48:34Z","title":"Chatlaw: A Multi-Agent Collaborative Legal Assistant with Knowledge\n  Graph Enhanced Mixture-of-Experts Large Language Model","summary":"  AI legal assistants based on Large Language Models (LLMs) can provide\naccessible legal consulting services, but the hallucination problem poses\npotential legal risks. This paper presents Chatlaw, an innovative legal\nassistant utilizing a Mixture-of-Experts (MoE) model and a multi-agent system\nto enhance the reliability and accuracy of AI-driven legal services. By\nintegrating knowledge graphs with artificial screening, we construct a\nhigh-quality legal dataset to train the MoE model. This model utilizes\ndifferent experts to address various legal issues, optimizing the accuracy of\nlegal responses. Additionally, Standardized Operating Procedures (SOP), modeled\nafter real law firm workflows, significantly reduce errors and hallucinations\nin legal services. Our MoE model outperforms GPT-4 in the Lawbench and Unified\nQualification Exam for Legal Professionals by 7.73% in accuracy and 11 points,\nrespectively, and also surpasses other models in multiple dimensions during\nreal-case consultations, demonstrating our robust capability for legal\nconsultation.\n","authors":["Jiaxi Cui","Munan Ning","Zongjian Li","Bohua Chen","Yang Yan","Hao Li","Bin Ling","Yonghong Tian","Li Yuan"],"pdf_url":"https://arxiv.org/pdf/2306.16092v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.20053v1","updated":"2024-05-30T13:38:52Z","published":"2024-05-30T13:38:52Z","title":"Would I Lie To You? Inference Time Alignment of Language Models using\n  Direct Preference Heads","summary":"  Pre-trained Language Models (LMs) exhibit strong zero-shot and in-context\nlearning capabilities; however, their behaviors are often difficult to control.\nBy utilizing Reinforcement Learning from Human Feedback (RLHF), it is possible\nto fine-tune unsupervised LMs to follow instructions and produce outputs that\nreflect human preferences. Despite its benefits, RLHF has been shown to\npotentially harm a language model's reasoning capabilities and introduce\nartifacts such as hallucinations where the model may fabricate facts. To\naddress this issue we introduce Direct Preference Heads (DPH), a fine-tuning\nframework that enables LMs to learn human preference signals through an\nauxiliary reward head without directly affecting the output distribution of the\nlanguage modeling head. We perform a theoretical analysis of our objective\nfunction and find strong ties to Conservative Direct Preference Optimization\n(cDPO). Finally we evaluate our models on GLUE, RACE, and the GPT4All\nevaluation suite and demonstrate that our method produces models which achieve\nhigher scores than those fine-tuned with Supervised Fine-Tuning (SFT) or Direct\nPreference Optimization (DPO) alone.\n","authors":["Avelina Asada Hadji-Kyriacou","Ognjen Arandjelovic"],"pdf_url":"https://arxiv.org/pdf/2405.20053v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.14016v3","updated":"2024-05-30T13:26:38Z","published":"2024-01-25T08:48:21Z","title":"Towards Uncertainty-Aware Language Agent","summary":"  While Language Agents have achieved promising success by placing Large\nLanguage Models at the core of a more versatile design that dynamically\ninteracts with the external world, the existing approaches neglect the notion\nof uncertainty during these interactions. We present the Uncertainty-Aware\nLanguage Agent (UALA), a framework that orchestrates the interaction between\nthe agent and the external world using uncertainty quantification. Compared\nwith other well-known counterparts like ReAct, our extensive experiments across\n3 representative tasks (HotpotQA, StrategyQA, MMLU) and various LLM sizes\ndemonstrate that UALA brings a significant improvement of performance, while\nhaving a substantially lower reliance on the external world (i.e., reduced\nnumber of tool calls and tokens). Our analyses provide various insights\nincluding the great potential of UALA compared with agent fine-tuning, and\nunderscore the unreliability of verbalised confidence of LLMs as a proxy for\nuncertainty.\n","authors":["Jiuzhou Han","Wray Buntine","Ehsan Shareghi"],"pdf_url":"https://arxiv.org/pdf/2401.14016v3.pdf","comment":"Our code and data are at https://uala-agent.github.io. (accepted to\n  ACL 2024 Findings). arXiv admin note: text overlap with arXiv:2310.05915"},{"id":"http://arxiv.org/abs/2305.12392v3","updated":"2024-05-30T13:23:24Z","published":"2023-05-21T08:11:24Z","title":"PiVe: Prompting with Iterative Verification Improving Graph-based\n  Generative Capability of LLMs","summary":"  Large language models (LLMs) have shown great abilities of solving various\nnatural language tasks in different domains. Due to the training objective of\nLLMs and their pre-training data, LLMs are not very well equipped for tasks\ninvolving structured data generation. We propose a framework, Prompting with\nIterative Verification (PiVe), to improve graph-based generative capability of\nLLMs. We show how a small language model could be trained to act as a verifier\nmodule for the output of an LLM~(i.e., ChatGPT, GPT-4), and to iteratively\nimprove its performance via fine-grained corrective instructions. We also show\nhow the verifier module could apply iterative corrections offline for a more\ncost-effective solution to the text-to-graph generation task. Experiments on\nthree graph-based datasets show consistent improvement gained via PiVe.\nAdditionally, we create GenWiki-HIQ and highlight that the verifier module can\nbe used as a data augmentation tool to help improve the quality of\nautomatically generated parallel text-graph datasets.\n","authors":["Jiuzhou Han","Nigel Collier","Wray Buntine","Ehsan Shareghi"],"pdf_url":"https://arxiv.org/pdf/2305.12392v3.pdf","comment":"Our code and data are at https://github.com/Jiuzhouh/PiVe (accepted\n  to ACL 2024 Findings)"},{"id":"http://arxiv.org/abs/2402.07865v2","updated":"2024-05-30T13:08:48Z","published":"2024-02-12T18:21:14Z","title":"Prismatic VLMs: Investigating the Design Space of Visually-Conditioned\n  Language Models","summary":"  Visually-conditioned language models (VLMs) have seen growing adoption in\napplications such as visual dialogue, scene understanding, and robotic task\nplanning; adoption that has fueled a wealth of new models such as LLaVa,\nInstructBLIP, and PaLI-3. Despite the volume of new releases, key design\ndecisions around image preprocessing, architecture, and optimization are\nunder-explored, making it challenging to understand what factors account for\nmodel performance $-$ a challenge further complicated by the lack of objective,\nconsistent evaluations. To address these gaps, we first compile a suite of\nstandardized evaluations spanning visual question answering, object\nlocalization, and challenge sets that probe properties such as hallucination;\nevaluations that provide fine-grained insight VLM capabilities. Second, we\nrigorously investigate VLMs along key design axes, including pretrained visual\nrepresentations and training from base vs. instruct-tuned language models,\namongst others. We couple our analysis with three resource contributions: (1) a\nunified framework for evaluating VLMs, (2) optimized, flexible training code,\nand (3) checkpoints for all models, including a family of VLMs at the 7-13B\nscale that strictly outperform InstructBLIP and LLaVa v1.5, the\nstate-of-the-art in open VLMs.\n","authors":["Siddharth Karamcheti","Suraj Nair","Ashwin Balakrishna","Percy Liang","Thomas Kollar","Dorsa Sadigh"],"pdf_url":"https://arxiv.org/pdf/2402.07865v2.pdf","comment":"Published at ICML 2024. 22 pages, 11 figures. Training code and\n  models: https://github.com/TRI-ML/prismatic-vlms. Evaluation code:\n  https://github.com/TRI-ML/vlm-evaluation"},{"id":"http://arxiv.org/abs/2402.18496v3","updated":"2024-05-30T12:43:01Z","published":"2024-02-28T17:25:59Z","title":"Language Models Represent Beliefs of Self and Others","summary":"  Understanding and attributing mental states, known as Theory of Mind (ToM),\nemerges as a fundamental capability for human social reasoning. While Large\nLanguage Models (LLMs) appear to possess certain ToM abilities, the mechanisms\nunderlying these capabilities remain elusive. In this study, we discover that\nit is possible to linearly decode the belief status from the perspectives of\nvarious agents through neural activations of language models, indicating the\nexistence of internal representations of self and others' beliefs. By\nmanipulating these representations, we observe dramatic changes in the models'\nToM performance, underscoring their pivotal role in the social reasoning\nprocess. Additionally, our findings extend to diverse social reasoning tasks\nthat involve different causal inference patterns, suggesting the potential\ngeneralizability of these representations.\n","authors":["Wentao Zhu","Zhining Zhang","Yizhou Wang"],"pdf_url":"https://arxiv.org/pdf/2402.18496v3.pdf","comment":"project page: https://walter0807.github.io/RepBelief/"},{"id":"http://arxiv.org/abs/2405.20003v1","updated":"2024-05-30T12:42:05Z","published":"2024-05-30T12:42:05Z","title":"Kernel Language Entropy: Fine-grained Uncertainty Quantification for\n  LLMs from Semantic Similarities","summary":"  Uncertainty quantification in Large Language Models (LLMs) is crucial for\napplications where safety and reliability are important. In particular,\nuncertainty can be used to improve the trustworthiness of LLMs by detecting\nfactually incorrect model responses, commonly called hallucinations.\nCritically, one should seek to capture the model's semantic uncertainty, i.e.,\nthe uncertainty over the meanings of LLM outputs, rather than uncertainty over\nlexical or syntactic variations that do not affect answer correctness. To\naddress this problem, we propose Kernel Language Entropy (KLE), a novel method\nfor uncertainty estimation in white- and black-box LLMs. KLE defines positive\nsemidefinite unit trace kernels to encode the semantic similarities of LLM\noutputs and quantifies uncertainty using the von Neumann entropy. It considers\npairwise semantic dependencies between answers (or semantic clusters),\nproviding more fine-grained uncertainty estimates than previous methods based\non hard clustering of answers. We theoretically prove that KLE generalizes the\nprevious state-of-the-art method called semantic entropy and empirically\ndemonstrate that it improves uncertainty quantification performance across\nmultiple natural language generation datasets and LLM architectures.\n","authors":["Alexander Nikitin","Jannik Kossen","Yarin Gal","Pekka Marttinen"],"pdf_url":"https://arxiv.org/pdf/2405.20003v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.16510v3","updated":"2024-05-30T12:40:06Z","published":"2024-05-26T10:33:17Z","title":"Meta-Task Planning for Language Agents","summary":"  The rapid advancement of neural language models has sparked a new surge of\nintelligent agent research. Unlike traditional agents, large language\nmodel-based agents (LLM agents) have emerged as a promising paradigm for\nachieving artificial general intelligence (AGI) due to their superior reasoning\nand generalization capabilities. Effective planning is crucial for the success\nof LLM agents in real-world tasks, making it a highly pursued topic in the\ncommunity. Current planning methods typically translate tasks into executable\naction sequences. However, determining a feasible or optimal sequence for\ncomplex tasks at fine granularity, which often requires compositing long chains\nof heterogeneous actions, remains challenging. This paper introduces Meta-Task\nPlanning (MTP), a zero-shot methodology for collaborative LLM-based multi-agent\nsystems that simplifies complex task planning by decomposing it into a\nhierarchy of subordinate tasks, or meta-tasks. Each meta-task is then mapped\ninto executable actions. MTP was assessed on two rigorous benchmarks,\nTravelPlanner and API-Bank. Notably, MTP achieved an average $\\sim40\\%$ success\nrate on TravelPlanner, significantly higher than the state-of-the-art (SOTA)\nbaseline ($2.92\\%$), and outperforming $LLM_{api}$-4 with ReAct on API-Bank by\n$\\sim14\\%$, showing the immense potential of integrating LLM with multi-agent\nsystems.\n","authors":["Cong Zhang","Derrick Goh Xin Deik","Dexun Li","Hao Zhang","Yong Liu"],"pdf_url":"https://arxiv.org/pdf/2405.16510v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.08975v2","updated":"2024-05-30T12:39:51Z","published":"2023-10-13T09:45:14Z","title":"ChatKBQA: A Generate-then-Retrieve Framework for Knowledge Base Question\n  Answering with Fine-tuned Large Language Models","summary":"  Knowledge Base Question Answering (KBQA) aims to answer natural language\nquestions over large-scale knowledge bases (KBs), which can be summarized into\ntwo crucial steps: knowledge retrieval and semantic parsing. However, three\ncore challenges remain: inefficient knowledge retrieval, mistakes of retrieval\nadversely impacting semantic parsing, and the complexity of previous KBQA\nmethods. To tackle these challenges, we introduce ChatKBQA, a novel and simple\ngenerate-then-retrieve KBQA framework, which proposes first generating the\nlogical form with fine-tuned LLMs, then retrieving and replacing entities and\nrelations with an unsupervised retrieval method, to improve both generation and\nretrieval more directly. Experimental results show that ChatKBQA achieves new\nstate-of-the-art performance on standard KBQA datasets, WebQSP, and CWQ. This\nwork can also be regarded as a new paradigm for combining LLMs with knowledge\ngraphs (KGs) for interpretable and knowledge-required question answering. Our\ncode is publicly available.\n","authors":["Haoran Luo","Haihong E","Zichen Tang","Shiyao Peng","Yikai Guo","Wentai Zhang","Chenghao Ma","Guanting Dong","Meina Song","Wei Lin","Yifan Zhu","Luu Anh Tuan"],"pdf_url":"https://arxiv.org/pdf/2310.08975v2.pdf","comment":"Accepted by Findings of ACL 2024"},{"id":"http://arxiv.org/abs/2403.04280v2","updated":"2024-05-30T12:17:51Z","published":"2024-03-07T07:24:32Z","title":"A New Benchmark for Evaluating Automatic Speech Recognition in the\n  Arabic Call Domain","summary":"  This work is an attempt to introduce a comprehensive benchmark for Arabic\nspeech recognition, specifically tailored to address the challenges of\ntelephone conversations in Arabic language. Arabic, characterized by its rich\ndialectal diversity and phonetic complexity, presents a number of unique\nchallenges for automatic speech recognition (ASR) systems. These challenges are\nfurther amplified in the domain of telephone calls, where audio quality,\nbackground noise, and conversational speech styles negatively affect\nrecognition accuracy. Our work aims to establish a robust benchmark that not\nonly encompasses the broad spectrum of Arabic dialects but also emulates the\nreal-world conditions of call-based communications. By incorporating diverse\ndialectical expressions and accounting for the variable quality of call\nrecordings, this benchmark seeks to provide a rigorous testing ground for the\ndevelopment and evaluation of ASR systems capable of navigating the\ncomplexities of Arabic speech in telephonic contexts. This work also attempts\nto establish a baseline performance evaluation using state-of-the-art ASR\ntechnologies.\n","authors":["Qusai Abo Obaidah","Muhy Eddin Za'ter","Adnan Jaljuli","Ali Mahboub","Asma Hakouz","Bashar Al-Rfooh","Yazan Estaitia"],"pdf_url":"https://arxiv.org/pdf/2403.04280v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.18350v2","updated":"2024-05-30T12:16:39Z","published":"2024-03-27T08:42:31Z","title":"Evaluation of Semantic Search and its Role in\n  Retrieved-Augmented-Generation (RAG) for Arabic Language","summary":"  The latest advancements in machine learning and deep learning have brought\nforth the concept of semantic similarity, which has proven immensely beneficial\nin multiple applications and has largely replaced keyword search. However,\nevaluating semantic similarity and conducting searches for a specific query\nacross various documents continue to be a complicated task. This complexity is\ndue to the multifaceted nature of the task, the lack of standard benchmarks,\nwhereas these challenges are further amplified for Arabic language. This paper\nendeavors to establish a straightforward yet potent benchmark for semantic\nsearch in Arabic. Moreover, to precisely evaluate the effectiveness of these\nmetrics and the dataset, we conduct our assessment of semantic search within\nthe framework of retrieval augmented generation (RAG).\n","authors":["Ali Mahboub","Muhy Eddin Za'ter","Bashar Al-Rfooh","Yazan Estaitia","Adnan Jaljuli","Asma Hakouz"],"pdf_url":"https://arxiv.org/pdf/2403.18350v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.12052v3","updated":"2024-05-30T12:03:51Z","published":"2024-02-19T11:11:08Z","title":"Small Models, Big Insights: Leveraging Slim Proxy Models To Decide When\n  and What to Retrieve for LLMs","summary":"  The integration of large language models (LLMs) and search engines represents\na significant evolution in knowledge acquisition methodologies. However,\ndetermining the knowledge that an LLM already possesses and the knowledge that\nrequires the help of a search engine remains an unresolved issue. Most existing\nmethods solve this problem through the results of preliminary answers or\nreasoning done by the LLM itself, but this incurs excessively high\ncomputational costs. This paper introduces a novel collaborative approach,\nnamely SlimPLM, that detects missing knowledge in LLMs with a slim proxy model,\nto enhance the LLM's knowledge acquisition process. We employ a proxy model\nwhich has far fewer parameters, and take its answers as heuristic answers.\nHeuristic answers are then utilized to predict the knowledge required to answer\nthe user question, as well as the known and unknown knowledge within the LLM.\nWe only conduct retrieval for the missing knowledge in questions that the LLM\ndoes not know. Extensive experimental results on five datasets with two LLMs\ndemonstrate a notable improvement in the end-to-end performance of LLMs in\nquestion-answering tasks, achieving or surpassing current state-of-the-art\nmodels with lower LLM inference costs.\n","authors":["Jiejun Tan","Zhicheng Dou","Yutao Zhu","Peidong Guo","Kun Fang","Ji-Rong Wen"],"pdf_url":"https://arxiv.org/pdf/2402.12052v3.pdf","comment":"Accepted by ACL 2024 main conference. Repo:\n  https://github.com/plageon/SlimPLM"},{"id":"http://arxiv.org/abs/2405.19967v1","updated":"2024-05-30T11:46:42Z","published":"2024-05-30T11:46:42Z","title":"Improved Out-of-Scope Intent Classification with Dual Encoding and\n  Threshold-based Re-Classification","summary":"  Detecting out-of-scope user utterances is essential for task-oriented\ndialogues and intent classification. Current methodologies face difficulties\nwith the unpredictable distribution of outliers and often rely on assumptions\nabout data distributions. We present the Dual Encoder for Threshold-Based\nRe-Classification (DETER) to address these challenges. This end-to-end\nframework efficiently detects out-of-scope intents without requiring\nassumptions on data distributions or additional post-processing steps. The core\nof DETER utilizes dual text encoders, the Universal Sentence Encoder (USE) and\nthe Transformer-based Denoising AutoEncoder (TSDAE), to generate user utterance\nembeddings, which are classified through a branched neural architecture.\nFurther, DETER generates synthetic outliers using self-supervision and\nincorporates out-of-scope phrases from open-domain datasets. This approach\nensures a comprehensive training set for out-of-scope detection. Additionally,\na threshold-based re-classification mechanism refines the model's initial\npredictions. Evaluations on the CLINC-150, Stackoverflow, and Banking77\ndatasets demonstrate DETER's efficacy. Our model outperforms previous\nbenchmarks, increasing up to 13% and 5% in F1 score for known and unknown\nintents on CLINC-150 and Stackoverflow, and 16% for known and 24% % for unknown\nintents on Banking77. The source code has been released at\nhttps://github.com/Hossam-Mohammed-tech/Intent\\_Classification\\_OOS.\n","authors":["Hossam M. Zawbaa","Wael Rashwan","Sourav Dutta","Haytham Assem"],"pdf_url":"https://arxiv.org/pdf/2405.19967v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.17732v2","updated":"2024-05-30T11:32:05Z","published":"2024-05-28T01:23:58Z","title":"C$^{3}$Bench: A Comprehensive Classical Chinese Understanding Benchmark\n  for Large Language Models","summary":"  Classical Chinese Understanding (CCU) holds significant value in preserving\nand exploration of the outstanding traditional Chinese culture. Recently,\nresearchers have attempted to leverage the potential of Large Language Models\n(LLMs) for CCU by capitalizing on their remarkable comprehension and semantic\ncapabilities. However, no comprehensive benchmark is available to assess the\nCCU capabilities of LLMs. To fill this gap, this paper introduces C$^{3}$bench,\na Comprehensive Classical Chinese understanding benchmark, which comprises\n50,000 text pairs for five primary CCU tasks, including classification,\nretrieval, named entity recognition, punctuation, and translation. Furthermore,\nthe data in C$^{3}$bench originates from ten different domains, covering most\nof the categories in classical Chinese. Leveraging the proposed C$^{3}$bench,\nwe extensively evaluate the quantitative performance of 15 representative LLMs\non all five CCU tasks. Our results not only establish a public leaderboard of\nLLMs' CCU capabilities but also gain some findings. Specifically, existing LLMs\nare struggle with CCU tasks and still inferior to supervised models.\nAdditionally, the results indicate that CCU is a task that requires special\nattention. We believe this study could provide a standard benchmark,\ncomprehensive baselines, and valuable insights for the future advancement of\nLLM-based CCU research. The evaluation pipeline and dataset are available at\n\\url{https://github.com/SCUT-DLVCLab/C3bench}.\n","authors":["Jiahuan Cao","Yongxin Shi","Dezhi Peng","Yang Liu","Lianwen Jin"],"pdf_url":"https://arxiv.org/pdf/2405.17732v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.12174v2","updated":"2024-05-30T11:26:58Z","published":"2024-02-19T14:28:31Z","title":"BIDER: Bridging Knowledge Inconsistency for Efficient\n  Retrieval-Augmented LLMs via Key Supporting Evidence","summary":"  Retrieval-augmented large language models (LLMs) have demonstrated efficacy\nin knowledge-intensive tasks such as open-domain QA, addressing inherent\nchallenges in knowledge update and factual inadequacy. However, inconsistencies\nbetween retrieval knowledge and the necessary knowledge for LLMs, leading to a\ndecline in LLM's answer quality. This paper introduces BIDER, an approach that\nrefines retrieval documents into Key Supporting Evidence (KSE) through\nknowledge synthesis, supervised fine-tuning (SFT), and preference alignment. We\ntrain BIDER by learning from crafting KSE, while maximizing its output to align\nwith LLM's information acquisition preferences through reinforcement learning.\nEvaluations across five datasets show BIDER boosts LLMs' answer quality by 7%\nwhile reducing input content length in retrieval documents by 80%,\noutperforming existing methods. The proposed KSE simulation effectively equips\nLLMs with essential information for accurate question answering.\n","authors":["Jiajie Jin","Yutao Zhu","Yujia Zhou","Zhicheng Dou"],"pdf_url":"https://arxiv.org/pdf/2402.12174v2.pdf","comment":"Accepted by ACL 2024 Findings"},{"id":"http://arxiv.org/abs/2405.19958v1","updated":"2024-05-30T11:25:42Z","published":"2024-05-30T11:25:42Z","title":"Multi-Aspect Controllable Text Generation with Disentangled\n  Counterfactual Augmentation","summary":"  Multi-aspect controllable text generation aims to control the generated texts\nin attributes from multiple aspects (e.g., \"positive\" from sentiment and\n\"sport\" from topic). For ease of obtaining training samples, existing works\nneglect attribute correlations formed by the intertwining of different\nattributes. Particularly, the stereotype formed by imbalanced attribute\ncorrelations significantly affects multi-aspect control. In this paper, we\npropose MAGIC, a new multi-aspect controllable text generation method with\ndisentangled counterfactual augmentation. We alleviate the issue of imbalanced\nattribute correlations during training using counterfactual feature vectors in\nthe attribute latent space by disentanglement. During inference, we enhance\nattribute correlations by target-guided counterfactual augmentation to further\nimprove multi-aspect control. Experiments show that MAGIC outperforms\nstate-of-the-art baselines in both imbalanced and balanced attribute\ncorrelation scenarios. Our source code and data are available at\nhttps://github.com/nju-websoft/MAGIC.\n","authors":["Yi Liu","Xiangyu Liu","Xiangrong Zhu","Wei Hu"],"pdf_url":"https://arxiv.org/pdf/2405.19958v1.pdf","comment":"Accepted in the 62nd Annual Meeting of the Association for\n  Computational Linguistics (ACL 2024)"},{"id":"http://arxiv.org/abs/2405.19954v1","updated":"2024-05-30T11:18:52Z","published":"2024-05-30T11:18:52Z","title":"GenKubeSec: LLM-Based Kubernetes Misconfiguration Detection,\n  Localization, Reasoning, and Remediation","summary":"  A key challenge associated with Kubernetes configuration files (KCFs) is that\nthey are often highly complex and error-prone, leading to security\nvulnerabilities and operational setbacks. Rule-based (RB) tools for KCF\nmisconfiguration detection rely on static rule sets, making them inherently\nlimited and unable to detect newly-discovered misconfigurations. RB tools also\nsuffer from misdetection, since mistakes are likely when coding the detection\nrules. Recent methods for detecting and remediating KCF misconfigurations are\nlimited in terms of their scalability and detection coverage, or due to the\nfact that they have high expertise requirements and do not offer automated\nremediation along with misconfiguration detection. Novel approaches that employ\nLLMs in their pipeline rely on API-based, general-purpose, and mainly\ncommercial models. Thus, they pose security challenges, have inconsistent\nclassification performance, and can be costly. In this paper, we propose\nGenKubeSec, a comprehensive and adaptive, LLM-based method, which, in addition\nto detecting a wide variety of KCF misconfigurations, also identifies the exact\nlocation of the misconfigurations and provides detailed reasoning about them,\nalong with suggested remediation. When empirically compared with three\nindustry-standard RB tools, GenKubeSec achieved equivalent precision (0.990)\nand superior recall (0.999). When a random sample of KCFs was examined by a\nKubernetes security expert, GenKubeSec's explanations as to misconfiguration\nlocalization, reasoning and remediation were 100% correct, informative and\nuseful. To facilitate further advancements in this domain, we share the unique\ndataset we collected, a unified misconfiguration index we developed for label\nstandardization, our experimentation code, and GenKubeSec itself as an\nopen-source tool.\n","authors":["Ehud Malul","Yair Meidan","Dudu Mimran","Yuval Elovici","Asaf Shabtai"],"pdf_url":"https://arxiv.org/pdf/2405.19954v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.17935v2","updated":"2024-05-30T11:01:10Z","published":"2024-05-28T08:01:26Z","title":"Tool Learning with Large Language Models: A Survey","summary":"  Recently, tool learning with large language models (LLMs) has emerged as a\npromising paradigm for augmenting the capabilities of LLMs to tackle highly\ncomplex problems. Despite growing attention and rapid advancements in this\nfield, the existing literature remains fragmented and lacks systematic\norganization, posing barriers to entry for newcomers. This gap motivates us to\nconduct a comprehensive survey of existing works on tool learning with LLMs. In\nthis survey, we focus on reviewing existing literature from the two primary\naspects (1) why tool learning is beneficial and (2) how tool learning is\nimplemented, enabling a comprehensive understanding of tool learning with LLMs.\nWe first explore the \"why\" by reviewing both the benefits of tool integration\nand the inherent benefits of the tool learning paradigm from six specific\naspects. In terms of \"how\", we systematically review the literature according\nto a taxonomy of four key stages in the tool learning workflow: task planning,\ntool selection, tool calling, and response generation. Additionally, we provide\na detailed summary of existing benchmarks and evaluation methods, categorizing\nthem according to their relevance to different stages. Finally, we discuss\ncurrent challenges and outline potential future directions, aiming to inspire\nboth researchers and industrial developers to further explore this emerging and\npromising area. We also maintain a GitHub repository to continually keep track\nof the relevant papers and resources in this rising area at\n\\url{https://github.com/quchangle1/LLM-Tool-Survey}.\n","authors":["Changle Qu","Sunhao Dai","Xiaochi Wei","Hengyi Cai","Shuaiqiang Wang","Dawei Yin","Jun Xu","Ji-Rong Wen"],"pdf_url":"https://arxiv.org/pdf/2405.17935v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.13067v2","updated":"2024-05-30T10:00:14Z","published":"2023-05-22T14:37:05Z","title":"Distilling Robustness into Natural Language Inference Models with\n  Domain-Targeted Augmentation","summary":"  Knowledge distillation optimises a smaller student model to behave similarly\nto a larger teacher model, retaining some of the performance benefits. While\nthis method can improve results on in-distribution examples, it does not\nnecessarily generalise to out-of-distribution (OOD) settings. We investigate\ntwo complementary methods for improving the robustness of the resulting student\nmodels on OOD domains. The first approach augments the distillation with\ngenerated unlabelled examples that match the target distribution. The second\nmethod upsamples data points among the training set that are similar to the\ntarget distribution. When applied on the task of natural language inference\n(NLI), our experiments on MNLI show that distillation with these modifications\noutperforms previous robustness solutions. We also find that these methods\nimprove performance on OOD domains even beyond the target domain.\n","authors":["Joe Stacey","Marek Rei"],"pdf_url":"https://arxiv.org/pdf/2305.13067v2.pdf","comment":"Accepted at ACL Findings 2024"},{"id":"http://arxiv.org/abs/2402.02446v3","updated":"2024-05-30T09:49:47Z","published":"2024-02-04T10:59:52Z","title":"LQER: Low-Rank Quantization Error Reconstruction for LLMs","summary":"  Post-training quantization of Large Language Models (LLMs) is challenging. In\nthis work, we introduce Low-rank Quantization Error Reduction (LQER), which\ncombines quantization and low-rank approximation to recover the model\ncapability. LQER leverages an activation-induced scale matrix to drive the\nsingular value distribution of quantization error towards a desirable\ndistribution, which enables nearly-lossless W4A8 quantization on various LLMs\nand downstream tasks without the need for knowledge distillation, grid search,\nor gradient-base iterative optimization. Unlike existing methods, the\ncomputation pattern of LQER eliminates the need for specialized Scatter and\nGather processes to collect high-precision weights from irregular memory\nlocations. Our W4A8 LLMs achieve near-lossless performance on six popular\ndownstream tasks, while using 1.36$\\times$ fewer hardware resources than the\nleading state-of-the-art method. We open-source our framework at\nhttps://github.com/ChengZhang-98/lqer\n","authors":["Cheng Zhang","Jianyi Cheng","George A. Constantinides","Yiren Zhao"],"pdf_url":"https://arxiv.org/pdf/2402.02446v3.pdf","comment":"Accepted at ICML2024"},{"id":"http://arxiv.org/abs/2405.19877v1","updated":"2024-05-30T09:32:14Z","published":"2024-05-30T09:32:14Z","title":"KNOW: A Real-World Ontology for Knowledge Capture with Large Language\n  Models","summary":"  We present KNOW--the Knowledge Navigator Ontology for the World--the first\nontology designed to capture everyday knowledge to augment large language\nmodels (LLMs) in real-world generative AI use cases such as personal AI\nassistants. Our domain is human life, both its everyday concerns and its major\nmilestones. We have limited the initial scope of the modeled concepts to only\nestablished human universals: spacetime (places, events) plus social (people,\ngroups, organizations). The inclusion criteria for modeled concepts are\npragmatic, beginning with universality and utility. We compare and contrast\nprevious work such as Schema.org and Cyc--as well as attempts at a synthesis of\nknowledge graphs and language models--noting how LLMs already encode internally\nmuch of the commonsense tacit knowledge that took decades to capture in the Cyc\nproject. We also make available code-generated software libraries for the 12\nmost popular programming languages, enabling the direct use of ontology\nconcepts in software engineering. We emphasize simplicity and developer\nexperience in promoting AI interoperability.\n","authors":["Arto Bendiken"],"pdf_url":"https://arxiv.org/pdf/2405.19877v1.pdf","comment":"5 pages, 1 figure"},{"id":"http://arxiv.org/abs/2405.19874v1","updated":"2024-05-30T09:28:56Z","published":"2024-05-30T09:28:56Z","title":"Is In-Context Learning Sufficient for Instruction Following in LLMs?","summary":"  In-context learning (ICL) allows LLMs to learn from examples without changing\ntheir weights, which is a particularly promising capability for long-context\nLLMs that can potentially learn from many examples. Recently, Lin et al. (2024)\nproposed URIAL, a method using only three in-context examples to align base\nLLMs, achieving non-trivial instruction following performance. In this work, we\nshow that, while effective, ICL alignment with URIAL still underperforms\ncompared to instruction fine-tuning on established benchmarks such as MT-Bench\nand AlpacaEval 2.0 (LC), especially with more capable base LMs. Unlike for\ntasks such as classification, translation, or summarization, adding more ICL\ndemonstrations for long-context LLMs does not systematically improve\ninstruction following performance. To address this limitation, we derive a\ngreedy selection approach for ICL examples that noticeably improves\nperformance, yet without bridging the gap to instruction fine-tuning. Finally,\nwe provide a series of ablation studies to better understand the reasons behind\nthe remaining gap, and we show how some aspects of ICL depart from the existing\nknowledge and are specific to the instruction tuning setting. Overall, our work\nadvances the understanding of ICL as an alignment technique. We provide our\ncode at https://github.com/tml-epfl/icl-alignment.\n","authors":["Hao Zhao","Maksym Andriushchenko","Francesco Croce","Nicolas Flammarion"],"pdf_url":"https://arxiv.org/pdf/2405.19874v1.pdf","comment":"Preprint. Code at https://github.com/tml-epfl/icl-alignment"},{"id":"http://arxiv.org/abs/2402.12786v2","updated":"2024-05-30T09:06:34Z","published":"2024-02-20T07:51:43Z","title":"Advancing Large Language Models to Capture Varied Speaking Styles and\n  Respond Properly in Spoken Conversations","summary":"  In spoken dialogue, even if two current turns are the same sentence, their\nresponses might still differ when they are spoken in different styles. The\nspoken styles, containing paralinguistic and prosodic information, mark the\nmost significant difference between text and speech modality. When using\ntext-only LLMs to model spoken dialogue, text-only LLMs cannot give different\nresponses based on the speaking style of the current turn. In this paper, we\nfocus on enabling LLMs to listen to the speaking styles and respond properly.\nOur goal is to teach the LLM that \"even if the sentences are identical if they\nare spoken in different styles, their corresponding responses might be\ndifferent\". Since there is no suitable dataset for achieving this goal, we\ncollect a speech-to-speech dataset, StyleTalk, with the following desired\ncharacteristics: when two current speeches have the same content but are spoken\nin different styles, their responses will be different. To teach LLMs to\nunderstand and respond properly to the speaking styles, we propose the\nSpoken-LLM framework that can model the linguistic content and the speaking\nstyles. We train Spoken-LLM using the StyleTalk dataset and devise a two-stage\ntraining pipeline to help the Spoken-LLM better learn the speaking styles.\nBased on extensive experiments, we show that Spoken-LLM outperforms text-only\nbaselines and prior speech LLMs methods.\n","authors":["Guan-Ting Lin","Cheng-Han Chiang","Hung-yi Lee"],"pdf_url":"https://arxiv.org/pdf/2402.12786v2.pdf","comment":"Accepted by ACL 2024"},{"id":"http://arxiv.org/abs/2405.19856v1","updated":"2024-05-30T09:03:42Z","published":"2024-05-30T09:03:42Z","title":"DevEval: A Manually-Annotated Code Generation Benchmark Aligned with\n  Real-World Code Repositories","summary":"  How to evaluate the coding abilities of Large Language Models (LLMs) remains\nan open question. We find that existing benchmarks are poorly aligned with\nreal-world code repositories and are insufficient to evaluate the coding\nabilities of LLMs.\n  To address the knowledge gap, we propose a new benchmark named DevEval, which\nhas three advances. (1) DevEval aligns with real-world repositories in multiple\ndimensions, e.g., code distributions and dependency distributions. (2) DevEval\nis annotated by 13 developers and contains comprehensive annotations (e.g.,\nrequirements, original repositories, reference code, and reference\ndependencies). (3) DevEval comprises 1,874 testing samples from 117\nrepositories, covering 10 popular domains (e.g., Internet, Database). Based on\nDevEval, we propose repository-level code generation and evaluate 8 popular\nLLMs on DevEval (e.g., gpt-4, gpt-3.5, StarCoder 2, DeepSeek Coder, CodeLLaMa).\nOur experiments reveal these LLMs' coding abilities in real-world code\nrepositories. For example, in our experiments, the highest Pass@1 of\ngpt-4-turbo is only 53.04%. We also analyze LLMs' failed cases and summarize\ntheir shortcomings. We hope DevEval can facilitate the development of LLMs in\nreal code repositories. DevEval, prompts, and LLMs' predictions have been\nreleased.\n","authors":["Jia Li","Ge Li","Yunfei Zhao","Yongmin Li","Huanyu Liu","Hao Zhu","Lecheng Wang","Kaibo Liu","Zheng Fang","Lanshen Wang","Jiazheng Ding","Xuanming Zhang","Yuqi Zhu","Yihong Dong","Zhi Jin","Binhua Li","Fei Huang","Yongbin Li"],"pdf_url":"https://arxiv.org/pdf/2405.19856v1.pdf","comment":"Accepted by the 62nd Annual Meeting of the Association for\n  Computational Linguistics (ACL 2024). arXiv admin note: substantial text\n  overlap with arXiv:2404.00599, arXiv:2401.06401"},{"id":"http://arxiv.org/abs/2404.07972v2","updated":"2024-05-30T08:55:12Z","published":"2024-04-11T17:56:05Z","title":"OSWorld: Benchmarking Multimodal Agents for Open-Ended Tasks in Real\n  Computer Environments","summary":"  Autonomous agents that accomplish complex computer tasks with minimal human\ninterventions have the potential to transform human-computer interaction,\nsignificantly enhancing accessibility and productivity. However, existing\nbenchmarks either lack an interactive environment or are limited to\nenvironments specific to certain applications or domains, failing to reflect\nthe diverse and complex nature of real-world computer use, thereby limiting the\nscope of tasks and agent scalability. To address this issue, we introduce\nOSWorld, the first-of-its-kind scalable, real computer environment for\nmultimodal agents, supporting task setup, execution-based evaluation, and\ninteractive learning across various operating systems such as Ubuntu, Windows,\nand macOS. OSWorld can serve as a unified, integrated computer environment for\nassessing open-ended computer tasks that involve arbitrary applications.\nBuilding upon OSWorld, we create a benchmark of 369 computer tasks involving\nreal web and desktop apps in open domains, OS file I/O, and workflows spanning\nmultiple applications. Each task example is derived from real-world computer\nuse cases and includes a detailed initial state setup configuration and a\ncustom execution-based evaluation script for reliable, reproducible evaluation.\nExtensive evaluation of state-of-the-art LLM/VLM-based agents on OSWorld\nreveals significant deficiencies in their ability to serve as computer\nassistants. While humans can accomplish over 72.36% of the tasks, the best\nmodel achieves only 12.24% success, primarily struggling with GUI grounding and\noperational knowledge. Comprehensive analysis using OSWorld provides valuable\ninsights for developing multimodal generalist agents that were not possible\nwith previous benchmarks. Our code, environment, baseline models, and data are\npublicly available at https://os-world.github.io.\n","authors":["Tianbao Xie","Danyang Zhang","Jixuan Chen","Xiaochuan Li","Siheng Zhao","Ruisheng Cao","Toh Jing Hua","Zhoujun Cheng","Dongchan Shin","Fangyu Lei","Yitao Liu","Yiheng Xu","Shuyan Zhou","Silvio Savarese","Caiming Xiong","Victor Zhong","Tao Yu"],"pdf_url":"https://arxiv.org/pdf/2404.07972v2.pdf","comment":"51 pages, 21 figures"},{"id":"http://arxiv.org/abs/2405.19846v1","updated":"2024-05-30T08:50:55Z","published":"2024-05-30T08:50:55Z","title":"Quest: Query-centric Data Synthesis Approach for Long-context Scaling of\n  Large Language Model","summary":"  Large language models, initially pre-trained with a limited context length,\ncan better handle longer texts by continuing training on a corpus with extended\ncontexts. However, obtaining effective long-context data is challenging due to\nthe scarcity and uneven distribution of long documents across different\ndomains. To address this issue, we propose a Query-centric data synthesis\nmethod, abbreviated as Quest. Quest is an interpretable method based on the\nobservation that documents retrieved by similar queries are relevant but\nlow-redundant, thus well-suited for synthesizing long-context data. The method\nis also scalable and capable of constructing large amounts of long-context\ndata. Using Quest, we synthesize a long-context dataset up to 128k context\nlength, significantly outperforming other data synthesis methods on multiple\nlong-context benchmark datasets. In addition, we further verify that the Quest\nmethod is predictable through scaling law experiments, making it a reliable\nsolution for advancing long-context models.\n","authors":["Chaochen Gao","Xing Wu","Qi Fu","Songlin Hu"],"pdf_url":"https://arxiv.org/pdf/2405.19846v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.19842v1","updated":"2024-05-30T08:49:34Z","published":"2024-05-30T08:49:34Z","title":"Improve Student's Reasoning Generalizability through Cascading\n  Decomposed CoTs Distillation","summary":"  Large language models (LLMs) exhibit enhanced reasoning at larger scales,\ndriving efforts to distill these capabilities into smaller models via\nteacher-student learning. Previous works simply fine-tune student models on\nteachers' generated Chain-of-Thoughts (CoTs) data. Although these methods\nenhance in-domain (IND) reasoning performance, they struggle to generalize to\nout-of-domain (OOD) tasks. We believe that the widespread spurious correlations\nbetween questions and answers may lead the model to preset a specific answer\nwhich restricts the diversity and generalizability of its reasoning process. In\nthis paper, we propose Cascading Decomposed CoTs Distillation (CasCoD) to\naddress these issues by decomposing the traditional single-step learning\nprocess into two cascaded learning steps. Specifically, by restructuring the\ntraining objectives -- removing the answer from outputs and concatenating the\nquestion with the rationale as input -- CasCoD's two-step learning process\nensures that students focus on learning rationales without interference from\nthe preset answers, thus improving reasoning generalizability. Extensive\nexperiments demonstrate the effectiveness of CasCoD on both IND and OOD\nbenchmark reasoning datasets. Code can be found at\nhttps://github.com/C-W-D/CasCoD.\n","authors":["Chengwei Dai","Kun Li","Wei Zhou","Songlin Hu"],"pdf_url":"https://arxiv.org/pdf/2405.19842v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.19831v1","updated":"2024-05-30T08:41:33Z","published":"2024-05-30T08:41:33Z","title":"Just Rewrite It Again: A Post-Processing Method for Enhanced Semantic\n  Similarity and Privacy Preservation of Differentially Private Rewritten Text","summary":"  The study of Differential Privacy (DP) in Natural Language Processing often\nviews the task of text privatization as a $\\textit{rewriting}$ task, in which\nsensitive input texts are rewritten to hide explicit or implicit private\ninformation. In order to evaluate the privacy-preserving capabilities of a DP\ntext rewriting mechanism, $\\textit{empirical privacy}$ tests are frequently\nemployed. In these tests, an adversary is modeled, who aims to infer sensitive\ninformation (e.g., gender) about the author behind a (privatized) text. Looking\nto improve the empirical protections provided by DP rewriting methods, we\npropose a simple post-processing method based on the goal of aligning rewritten\ntexts with their original counterparts, where DP rewritten texts are rewritten\n$\\textit{again}$. Our results shown that such an approach not only produces\noutputs that are more semantically reminiscent of the original inputs, but also\ntexts which score on average better in empirical privacy evaluations.\nTherefore, our approach raises the bar for DP rewriting methods in their\nempirical privacy evaluations, providing an extra layer of protection against\nmalicious adversaries.\n","authors":["Stephen Meisenbacher","Florian Matthes"],"pdf_url":"https://arxiv.org/pdf/2405.19831v1.pdf","comment":"10 pages, 2 figures, 2 tables. Accepted to ARES 2024 (IWAPS)"},{"id":"http://arxiv.org/abs/2403.11904v3","updated":"2024-05-30T08:37:45Z","published":"2024-03-18T16:04:55Z","title":"CICLe: Conformal In-Context Learning for Largescale Multi-Class Food\n  Risk Classification","summary":"  Contaminated or adulterated food poses a substantial risk to human health.\nGiven sets of labeled web texts for training, Machine Learning and Natural\nLanguage Processing can be applied to automatically detect such risks. We\npublish a dataset of 7,546 short texts describing public food recall\nannouncements. Each text is manually labeled, on two granularity levels (coarse\nand fine), for food products and hazards that the recall corresponds to. We\ndescribe the dataset and benchmark naive, traditional, and Transformer models.\nBased on our analysis, Logistic Regression based on a tf-idf representation\noutperforms RoBERTa and XLM-R on classes with low support. Finally, we discuss\ndifferent prompting strategies and present an LLM-in-the-loop framework, based\non Conformal Prediction, which boosts the performance of the base classifier\nwhile reducing energy consumption compared to normal prompting.\n","authors":["Korbinian Randl","John Pavlopoulos","Aron Henriksson","Tony Lindgren"],"pdf_url":"https://arxiv.org/pdf/2403.11904v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.19799v1","updated":"2024-05-30T08:10:50Z","published":"2024-05-30T08:10:50Z","title":"Unsupervised Mutual Learning of Dialogue Discourse Parsing and Topic\n  Segmentation","summary":"  The advancement of large language models (LLMs) has propelled the development\nof dialogue systems. Unlike the popular ChatGPT-like assistant model, which\nonly satisfies the user's preferences, task-oriented dialogue systems have also\nfaced new requirements and challenges in the broader business field. They are\nexpected to provide correct responses at each dialogue turn, at the same time,\nachieve the overall goal defined by the task. By understanding rhetorical\nstructures and topic structures via topic segmentation and discourse parsing, a\ndialogue system may do a better planning to achieve both objectives. However,\nwhile both structures belong to discourse structure in linguistics, rhetorical\nstructure and topic structure are mostly modeled separately or with one\nassisting the other in the prior work. The interaction between these two\nstructures has not been considered for joint modeling and mutual learning.\nFurthermore, unsupervised learning techniques to achieve the above are not well\nexplored. To fill this gap, we propose an unsupervised mutual learning\nframework of two structures leveraging the global and local connections between\nthem. We extend the topic modeling between non-adjacent discourse units to\nensure global structural relevance with rhetorical structures. We also\nincorporate rhetorical structures into the topic structure through a graph\nneural network model to ensure local coherence consistency. Finally, we utilize\nthe similarity between the two fused structures for mutual learning. The\nexperimental results demonstrate that our methods outperform all strong\nbaselines on two dialogue rhetorical datasets (STAC and Molweni), as well as\ndialogue topic datasets (Doc2Dial and TIAGE).\n","authors":["Jiahui Xu","Feng Jiang","Anningzhe Gao","Haizhou Li"],"pdf_url":"https://arxiv.org/pdf/2405.19799v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.19795v1","updated":"2024-05-30T08:03:15Z","published":"2024-05-30T08:03:15Z","title":"SLM as Guardian: Pioneering AI Safety with Small Language Models","summary":"  Most prior safety research of large language models (LLMs) has focused on\nenhancing the alignment of LLMs to better suit the safety requirements of\nhumans. However, internalizing such safeguard features into larger models\nbrought challenges of higher training cost and unintended degradation of\nhelpfulness. To overcome such challenges, a modular approach employing a\nsmaller LLM to detect harmful user queries is regarded as a convenient solution\nin designing LLM-based system with safety requirements.\n  In this paper, we leverage a smaller LLM for both harmful query detection and\nsafeguard response generation. We introduce our safety requirements and the\ntaxonomy of harmfulness categories, and then propose a multi-task learning\nmechanism fusing the two tasks into a single model. We demonstrate the\neffectiveness of our approach, providing on par or surpassing harmful query\ndetection and safeguard response performance compared to the publicly available\nLLMs.\n","authors":["Ohjoon Kwon","Donghyeon Jeon","Nayoung Choi","Gyu-Hwung Cho","Changbong Kim","Hyunwoo Lee","Inho Kang","Sun Kim","Taiwoo Park"],"pdf_url":"https://arxiv.org/pdf/2405.19795v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.19793v1","updated":"2024-05-30T08:01:20Z","published":"2024-05-30T08:01:20Z","title":"PDDLEGO: Iterative Planning in Textual Environments","summary":"  Planning in textual environments have been shown to be a long-standing\nchallenge even for current models. A recent, promising line of work uses LLMs\nto generate a formal representation of the environment that can be solved by a\nsymbolic planner. However, existing methods rely on a fully-observed\nenvironment where all entity states are initially known, so a one-off\nrepresentation can be constructed, leading to a complete plan. In contrast, we\ntackle partially-observed environments where there is initially no sufficient\ninformation to plan for the end-goal. We propose PDDLEGO that iteratively\nconstruct a planning representation that can lead to a partial plan for a given\nsub-goal. By accomplishing the sub-goal, more information is acquired to\naugment the representation, eventually achieving the end-goal. We show that\nplans produced by few-shot PDDLEGO are 43% more efficient than generating plans\nend-to-end on the Coin Collector simulation, with strong performance (98%) on\nthe more complex Cooking World simulation where end-to-end LLMs fail to\ngenerate coherent plans (4%).\n","authors":["Li Zhang","Peter Jansen","Tianyi Zhang","Peter Clark","Chris Callison-Burch","Niket Tandon"],"pdf_url":"https://arxiv.org/pdf/2405.19793v1.pdf","comment":"In *SEM 2024"},{"id":"http://arxiv.org/abs/2405.19787v1","updated":"2024-05-30T07:54:07Z","published":"2024-05-30T07:54:07Z","title":"From Symbolic Tasks to Code Generation: Diversification Yields Better\n  Task Performers","summary":"  Instruction tuning -- tuning large language models on instruction-output\npairs -- is a promising technique for making models better adapted to the real\nworld. Yet, the key factors driving the model's capability to understand and\nfollow instructions not seen during training remain under-explored. Our\ninvestigation begins with a series of synthetic experiments within the\ntheoretical framework of a Turing-complete algorithm called Markov algorithm,\nwhich allows fine-grained control over the instruction-tuning data.\nGeneralization and robustness with respect to the training distribution emerge\nonce a diverse enough set of tasks is provided, even though very few examples\nare provided for each task. We extend these initial results to a real-world\napplication scenario of code generation and find that a more diverse\ninstruction set, extending beyond code-related tasks, improves the performance\nof code generation. Our observations suggest that a more diverse semantic space\nfor instruction-tuning sets greatly improves the model's ability to follow\ninstructions and perform tasks.\n","authors":["Dylan Zhang","Justin Wang","Francois Charton"],"pdf_url":"https://arxiv.org/pdf/2405.19787v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.19782v1","updated":"2024-05-30T07:48:00Z","published":"2024-05-30T07:48:00Z","title":"Dataflow-Guided Retrieval Augmentation for Repository-Level Code\n  Completion","summary":"  Recent years have witnessed the deployment of code language models (LMs) in\nvarious code intelligence tasks such as code completion. Yet, it is challenging\nfor pre-trained LMs to generate correct completions in private repositories.\nPrevious studies retrieve cross-file context based on import relations or text\nsimilarity, which is insufficiently relevant to completion targets. In this\npaper, we propose a dataflow-guided retrieval augmentation approach, called\nDraCo, for repository-level code completion. DraCo parses a private repository\ninto code entities and establishes their relations through an extended dataflow\nanalysis, forming a repo-specific context graph. Whenever triggering code\ncompletion, DraCo precisely retrieves relevant background knowledge from the\nrepo-specific context graph and generates well-formed prompts to query code\nLMs. Furthermore, we construct a large Python dataset, ReccEval, with more\ndiverse completion targets. Our experiments demonstrate the superior accuracy\nand applicable efficiency of DraCo, improving code exact match by 3.43% and\nidentifier F1-score by 3.27% on average compared to the state-of-the-art\napproach.\n","authors":["Wei Cheng","Yuhan Wu","Wei Hu"],"pdf_url":"https://arxiv.org/pdf/2405.19782v1.pdf","comment":"Accepted in the 62nd Annual Meeting of the Association for\n  Computational Linguistics (ACL 2024)"},{"id":"http://arxiv.org/abs/2405.19778v1","updated":"2024-05-30T07:44:16Z","published":"2024-05-30T07:44:16Z","title":"Enhancing Consistency and Role-Specific Knowledge Capturing by\n  Rebuilding Fictional Character's Persona","summary":"  With the recent introduction of Assistants API, it is expected that\ndocument-based language models will be actively used in various domains,\nespecially Role-playing. However, a key challenge lies in utilizing\nprotagonist's persona: Assistants API often fails to achieve with its search\nbecause the information extraction part is different each time and it often\nomits important information such as protagonist's backstory or relationships.\nIt is hard to maintain a consistent persona simply by using the persona\ndocument as input to the Assistants API. To address the challenge of achieving\nstable persona consistency, we propose CharacterGPT, a novel persona\nreconstruction framework to alleviate the shortcomings of the Assistants API.\nOur method involves Character Persona Training (CPT), an effective persona\nrebuilding process that updates the character persona by extracting the\ncharacter's traits from given summary of the novel for each character as if the\nstory in a novel progresses. In our experiments, we ask each character to take\nthe Big Five Inventory personality test in various settings and analyze the\nresults. To assess whether it can think outside the box, we let each character\ngenerate short novels. Extensive experiments and human evaluation demonstrate\nthat CharacterGPT presents new possibilities for role-playing agent research.\n","authors":["Jeiyoon Park","Chanjun Park","Heuiseok Lim"],"pdf_url":"https://arxiv.org/pdf/2405.19778v1.pdf","comment":"preprint"},{"id":"http://arxiv.org/abs/2405.19763v1","updated":"2024-05-30T07:19:31Z","published":"2024-05-30T07:19:31Z","title":"Enhancing Reinforcement Learning with Label-Sensitive Reward for Natural\n  Language Understanding","summary":"  Recent strides in large language models (LLMs) have yielded remarkable\nperformance, leveraging reinforcement learning from human feedback (RLHF) to\nsignificantly enhance generation and alignment capabilities. However, RLHF\nencounters numerous challenges, including the objective mismatch issue, leading\nto suboptimal performance in Natural Language Understanding (NLU) tasks. To\naddress this limitation, we propose a novel Reinforcement Learning framework\nenhanced with Label-sensitive Reward (RLLR) to amplify the performance of LLMs\nin NLU tasks. By incorporating label-sensitive pairs into reinforcement\nlearning, our method aims to adeptly capture nuanced label-sensitive semantic\nfeatures during RL, thereby enhancing natural language understanding.\nExperiments conducted on five diverse foundation models across eight tasks\nshowcase promising results. In comparison to Supervised Fine-tuning models\n(SFT), RLLR demonstrates an average performance improvement of 1.54%. Compared\nwith RLHF models, the improvement averages at 0.69%. These results reveal the\neffectiveness of our method for LLMs in NLU tasks. Code and data available at:\nhttps://github.com/MagiaSN/ACL2024_RLLR.\n","authors":["Kuo Liao","Shuang Li","Meng Zhao","Liqun Liu","Mengge Xue","Zhenyu Hu","Honglin Han","Chengguo Yin"],"pdf_url":"https://arxiv.org/pdf/2405.19763v1.pdf","comment":"Accept at ACL2024 Main"},{"id":"http://arxiv.org/abs/2311.04044v2","updated":"2024-05-30T06:56:56Z","published":"2023-11-07T14:55:52Z","title":"PrivLM-Bench: A Multi-level Privacy Evaluation Benchmark for Language\n  Models","summary":"  The rapid development of language models (LMs) brings unprecedented\naccessibility and usage for both models and users. On the one hand, powerful\nLMs achieve state-of-the-art performance over numerous downstream NLP tasks. On\nthe other hand, more and more attention is paid to unrestricted model accesses\nthat may bring malicious privacy risks of data leakage. To address these\nissues, many recent works propose privacy-preserving language models (PPLMs)\nwith differential privacy (DP). Unfortunately, different DP implementations\nmake it challenging for a fair comparison among existing PPLMs. In this paper,\nwe present PrivLM-Bench, a multi-perspective privacy evaluation benchmark to\nempirically and intuitively quantify the privacy leakage of LMs. Instead of\nonly reporting DP parameters, PrivLM-Bench sheds light on the neglected\ninference data privacy during actual usage. PrivLM-Bench first clearly defines\nmulti-faceted privacy objectives. Then, PrivLM-Bench constructs a unified\npipeline to perform private fine-tuning. Lastly, PrivLM-Bench performs existing\nprivacy attacks on LMs with pre-defined privacy objectives as the empirical\nevaluation results. The empirical attack results are used to fairly and\nintuitively evaluate the privacy leakage of various PPLMs. We conduct extensive\nexperiments on three datasets of GLUE for mainstream LMs.\n","authors":["Haoran Li","Dadi Guo","Donghao Li","Wei Fan","Qi Hu","Xin Liu","Chunkit Chan","Duanyi Yao","Yuan Yao","Yangqiu Song"],"pdf_url":"https://arxiv.org/pdf/2311.04044v2.pdf","comment":"To appear at ACL 2024"},{"id":"http://arxiv.org/abs/2405.15143v2","updated":"2024-05-30T06:48:44Z","published":"2024-05-24T01:45:27Z","title":"Intelligent Go-Explore: Standing on the Shoulders of Giant Foundation\n  Models","summary":"  Go-Explore is a powerful family of algorithms designed to solve\nhard-exploration problems, built on the principle of archiving discovered\nstates, and iteratively returning to and exploring from the most promising\nstates. This approach has led to superhuman performance across a wide variety\nof challenging problems including Atari games and robotic control, but requires\nmanually designing heuristics to guide exploration, which is time-consuming and\ninfeasible in general. To resolve this, we propose Intelligent Go-Explore (IGE)\nwhich greatly extends the scope of the original Go-Explore by replacing these\nheuristics with the intelligence and internalized human notions of\ninterestingness captured by giant foundation models (FMs). This provides IGE\nwith a human-like ability to instinctively identify how interesting or\npromising any new state is (e.g. discovering new objects, locations, or\nbehaviors), even in complex environments where heuristics are hard to define.\nMoreover, IGE offers the exciting and previously impossible opportunity to\nrecognize and capitalize on serendipitous discoveries that cannot be predicted\nahead of time. We evaluate IGE on a range of language-based tasks that require\nsearch and exploration. In Game of 24, a multistep mathematical reasoning\nproblem, IGE reaches 100% success rate 70.8% faster than the best classic graph\nsearch baseline. Next, in BabyAI-Text, a challenging partially observable\ngridworld, IGE exceeds the previous SOTA with orders of magnitude fewer online\nsamples. Finally, in TextWorld, we show the unique ability of IGE to succeed in\nsettings requiring long-horizon exploration where prior SOTA FM agents like\nReflexion completely fail. Overall, IGE combines the tremendous strengths of\nFMs and the powerful Go-Explore algorithm, opening up a new frontier of\nresearch into creating more generally capable agents with impressive\nexploration capabilities.\n","authors":["Cong Lu","Shengran Hu","Jeff Clune"],"pdf_url":"https://arxiv.org/pdf/2405.15143v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.19744v1","updated":"2024-05-30T06:45:23Z","published":"2024-05-30T06:45:23Z","title":"X-Instruction: Aligning Language Model in Low-resource Languages with\n  Self-curated Cross-lingual Instructions","summary":"  Large language models respond well in high-resource languages like English\nbut struggle in low-resource languages. It may arise from the lack of\nhigh-quality instruction following data in these languages. Directly\ntranslating English samples into these languages can be a solution but\nunreliable, leading to responses with translation errors and lacking\nlanguage-specific or cultural knowledge. To address this issue, we propose a\nnovel method to construct cross-lingual instruction following samples with\ninstruction in English and response in low-resource languages. Specifically,\nthe language model first learns to generate appropriate English instructions\naccording to the natural web texts in other languages as responses. The\ncandidate cross-lingual instruction tuning samples are further refined and\ndiversified. We have employed this method to build a large-scale cross-lingual\ninstruction tuning dataset on 10 languages, namely X-Instruction. The\ninstruction data built using our method incorporate more language-specific\nknowledge compared with the naive translation method. Experimental results have\nshown that the response quality of the model tuned on X-Instruction greatly\nexceeds the model distilled from a powerful teacher model, reaching or even\nsurpassing the ones of ChatGPT. In addition, we find that models tuned on\ncross-lingual instruction following samples can follow the instruction in the\noutput language without further tuning.\n","authors":["Chong Li","Wen Yang","Jiajun Zhang","Jinliang Lu","Shaonan Wang","Chengqing Zong"],"pdf_url":"https://arxiv.org/pdf/2405.19744v1.pdf","comment":"ACL 2024. Our codes, data and model weights are available at\n  https://github.com/ZNLP/X-Instruction"},{"id":"http://arxiv.org/abs/2405.19740v1","updated":"2024-05-30T06:38:32Z","published":"2024-05-30T06:38:32Z","title":"PertEval: Unveiling Real Knowledge Capacity of LLMs with\n  Knowledge-Invariant Perturbations","summary":"  Expert-designed close-ended benchmarks serve as vital tools in assessing the\nknowledge capacity of large language models (LLMs). Despite their widespread\nuse, concerns have mounted regarding their reliability due to limited test\nscenarios and an unavoidable risk of data contamination. To rectify this, we\npresent PertEval, a toolkit devised for in-depth probing of LLMs' knowledge\ncapacity through knowledge-invariant perturbations. These perturbations employ\nhuman-like restatement techniques to generate on-the-fly test samples from\nstatic benchmarks, meticulously retaining knowledge-critical content while\naltering irrelevant details. Our toolkit further includes a suite of transition\nanalyses that compare performance on raw vs. perturbed test sets to precisely\nassess LLMs' genuine knowledge capacity. Six state-of-the-art LLMs are\nre-evaluated using PertEval. Results reveal significantly inflated performance\nof the LLMs on raw benchmarks, including an absolute 21% overestimation for\nGPT-4. Additionally, through a nuanced response pattern analysis, we discover\nthat PertEval retains LLMs' uncertainty to specious knowledge, potentially\nbeing resolved through rote memorization and leading to inflated performance.\nWe also find that the detailed transition analyses by PertEval could illuminate\nweaknesses in existing LLMs' knowledge mastery and guide the development of\nrefinement. Given these insights, we posit that PertEval can act as an\nessential tool that, when applied alongside any close-ended benchmark, unveils\nthe true knowledge capacity of LLMs, marking a significant step toward more\ntrustworthy LLM evaluation.\n","authors":["Jiatong Li","Renjun Hu","Kunzhe Huang","Yan Zhuang","Qi Liu","Mengxiao Zhu","Xing Shi","Wei Lin"],"pdf_url":"https://arxiv.org/pdf/2405.19740v1.pdf","comment":"23 pages, 12 figures, 10 tables"},{"id":"http://arxiv.org/abs/2405.19737v1","updated":"2024-05-30T06:32:11Z","published":"2024-05-30T06:32:11Z","title":"Beyond Imitation: Learning Key Reasoning Steps from Dual\n  Chain-of-Thoughts in Reasoning Distillation","summary":"  As Large Language Models (LLMs) scale up and gain powerful Chain-of-Thoughts\n(CoTs) reasoning abilities, practical resource constraints drive efforts to\ndistill these capabilities into more compact Smaller Language Models (SLMs). We\nfind that CoTs consist mainly of simple reasoning forms, with a small\nproportion ($\\approx 4.7\\%$) of key reasoning steps that truly impact\nconclusions. However, previous distillation methods typically involve\nsupervised fine-tuning student SLMs only on correct CoTs data produced by\nteacher LLMs, resulting in students struggling to learn the key reasoning\nsteps, instead imitating the teacher's reasoning forms and making errors or\nomissions on these steps. To address these issues, drawing an analogy to human\nlearning, where analyzing mistakes according to correct solutions often reveals\nthe crucial steps leading to successes or failures, we propose\nmistak\\textbf{E}-\\textbf{D}riven key reason\\textbf{I}ng step\ndistilla\\textbf{T}ion (\\textbf{EDIT}), a novel method that further aids SLMs\nlearning key reasoning steps rather than mere simple fine-tuning. Firstly, to\nexpose these crucial steps in CoTs, we design specific prompts to generate dual\nCoTs data with similar reasoning paths but divergent conclusions. Then, we\napply the minimum edit distance algorithm on the dual CoTs data to locate these\nkey steps and optimize the likelihood of these steps. Extensive experiments\nvalidate the effectiveness of EDIT across both in-domain and out-of-domain\nbenchmark reasoning datasets. Further analysis shows that EDIT can generate\nhigh-quality CoTs with more correct key reasoning steps. Notably, we also\nexplore how different mistake patterns affect performance and find that EDIT\nbenefits more from logical errors than from knowledge or mathematical\ncalculation errors in dual CoTs\\footnote{Code can be found at\n\\url{https://github.com/C-W-D/EDIT}}.\n","authors":["Chengwei Dai","Kun Li","Wei Zhou","Songlin Hu"],"pdf_url":"https://arxiv.org/pdf/2405.19737v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.19732v1","updated":"2024-05-30T06:24:14Z","published":"2024-05-30T06:24:14Z","title":"Two Optimizers Are Better Than One: LLM Catalyst for Enhancing\n  Gradient-Based Optimization","summary":"  Learning a skill generally relies on both practical experience by doer and\ninsightful high-level guidance by instructor. Will this strategy also work well\nfor solving complex non-convex optimization problems? Here, a common\ngradient-based optimizer acts like a disciplined doer, making locally optimal\nupdate at each step. Recent methods utilize large language models (LLMs) to\noptimize solutions for concrete problems by inferring from natural language\ninstructions, akin to a high-level instructor. In this paper, we show that\nthese two optimizers are complementary to each other, suggesting a\ncollaborative optimization approach. The gradient-based optimizer and LLM-based\noptimizer are combined in an interleaved manner. We instruct LLMs using task\ndescriptions and timely optimization trajectories recorded during\ngradient-based optimization. Inferred results from LLMs are used as restarting\npoints for the next stage of gradient optimization. By leveraging both the\nlocally rigorous gradient-based optimizer and the high-level deductive\nLLM-based optimizer, our combined optimization method consistently yields\nimprovements over competitive baseline prompt tuning methods. Our results\ndemonstrate the synergistic effect of conventional gradient-based optimization\nand the inference ability of LLMs. The code is released at\nhttps://github.com/guozix/LLM-catalyst.\n","authors":["Zixian Guo","Ming Liu","Zhilong Ji","Jinfeng Bai","Yiwen Guo","Wangmeng Zuo"],"pdf_url":"https://arxiv.org/pdf/2405.19732v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.15316v2","updated":"2024-05-30T06:18:20Z","published":"2023-11-26T14:35:23Z","title":"Sibyl: Sensible Empathetic Dialogue Generation with Visionary\n  Commonsense Knowledge","summary":"  Recently, there has been a heightened interest in building chatbots based on\nLarge Language Models (LLMs) to emulate human-like qualities in dialogues,\nincluding expressing empathy and offering emotional support. Despite having\naccess to commonsense knowledge to better understand the psychological aspects\nand causality of dialogue context, even these powerful LLMs struggle to achieve\nthe goals of empathy and emotional support. As current approaches do not\nadequately anticipate dialogue future, they may mislead language models to\nignore complex dialogue goals of empathy and emotional support, resulting in\nunsupportive responses lacking empathy. To address this issue, we present an\ninnovative framework named Sensible Empathetic Dialogue Generation with\nVisionary Commonsense Knowledge (Sibyl). Designed to concentrate on the\nimminent dialogue future, this paradigm directs LLMs toward the implicit\nrequirements of the conversation, aiming to provide more sensible responses.\nExperimental results demonstrate that incorporating our paradigm for acquiring\ncommonsense knowledge into LLMs comprehensively enhances the quality of their\nresponses.\n","authors":["Lanrui Wang","Jiangnan Li","Chenxu Yang","Zheng Lin","Hongyin Tang","Huan Liu","Xiaolei Huang","Yanan Cao","Jingang Wang","Weiping Wang"],"pdf_url":"https://arxiv.org/pdf/2311.15316v2.pdf","comment":"Work in progress"},{"id":"http://arxiv.org/abs/2405.19716v1","updated":"2024-05-30T05:53:49Z","published":"2024-05-30T05:53:49Z","title":"Enhancing Large Vision Language Models with Self-Training on Image\n  Comprehension","summary":"  Large vision language models (LVLMs) integrate large language models (LLMs)\nwith pre-trained vision encoders, thereby activating the perception capability\nof the model to understand image inputs for different queries and conduct\nsubsequent reasoning. Improving this capability requires high-quality\nvision-language data, which is costly and labor-intensive to acquire.\nSelf-training approaches have been effective in single-modal settings to\nalleviate the need for labeled data by leveraging model's own generation.\nHowever, effective self-training remains a challenge regarding the unique\nvisual perception and reasoning capability of LVLMs. To address this, we\nintroduce Self-Training on Image Comprehension (STIC), which emphasizes a\nself-training approach specifically for image comprehension. First, the model\nself-constructs a preference dataset for image descriptions using unlabeled\nimages. Preferred responses are generated through a step-by-step prompt, while\ndis-preferred responses are generated from either corrupted images or\nmisleading prompts. To further self-improve reasoning on the extracted visual\ninformation, we let the model reuse a small portion of existing\ninstruction-tuning data and append its self-generated image descriptions to the\nprompts. We validate the effectiveness of STIC across seven different\nbenchmarks, demonstrating substantial performance gains of 4.0% on average\nwhile using 70% less supervised fine-tuning data than the current method.\nFurther studies investigate various components of STIC and highlight its\npotential to leverage vast quantities of unlabeled images for self-training.\nCode and data are made publicly available.\n","authors":["Yihe Deng","Pan Lu","Fan Yin","Ziniu Hu","Sheng Shen","James Zou","Kai-Wei Chang","Wei Wang"],"pdf_url":"https://arxiv.org/pdf/2405.19716v1.pdf","comment":"19 pages, 14 figures, 6 tables"},{"id":"http://arxiv.org/abs/2405.19715v1","updated":"2024-05-30T05:49:38Z","published":"2024-05-30T05:49:38Z","title":"SpecDec++: Boosting Speculative Decoding via Adaptive Candidate Lengths","summary":"  Speculative decoding reduces the inference latency of a target large language\nmodel via utilizing a smaller and faster draft model. Its performance depends\non a hyperparameter K -- the candidate length, i.e., the number of candidate\ntokens for the target model to verify in each round. However, previous methods\noften use simple heuristics to choose K, which may result in sub-optimal\nperformance. We study the choice of the candidate length K and formulate it as\na Markov Decision Process. We theoretically show that the optimal policy of\nthis Markov decision process takes the form of a threshold policy, i.e., the\ncurrent speculation should stop and be verified when the probability of getting\na rejection exceeds a threshold value. Motivated by this theory, we propose\nSpecDec++, an enhanced version of speculative decoding that adaptively\ndetermines the candidate length on the fly. We augment the draft model with a\ntrained acceptance prediction head to predict the conditional acceptance\nprobability of the candidate tokens. SpecDec++ will stop the current\nspeculation when the predicted probability that at least one token gets\nrejected exceeds a threshold. We implement SpecDec++ and apply it to the\nllama-2-chat 7B & 70B model pair. Our adaptive method achieves a 2.04x speedup\non the Alpaca dataset (an additional 7.2% improvement over the baseline\nspeculative decoding). On the GSM8K and HumanEval datasets, our method achieves\na 2.26x speedup (9.4% improvement) and 2.23x speedup (11.1% improvement),\nrespectively.\n","authors":["Kaixuan Huang","Xudong Guo","Mengdi Wang"],"pdf_url":"https://arxiv.org/pdf/2405.19715v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.10160v2","updated":"2024-05-30T05:27:35Z","published":"2023-12-15T19:16:21Z","title":"Do LVLMs Understand Charts? Analyzing and Correcting Factual Errors in\n  Chart Captioning","summary":"  Recent advancements in large vision-language models (LVLMs) have led to\nsignificant progress in generating natural language descriptions for visual\ncontent and thus enhancing various applications. One issue with these powerful\nmodels is that they sometimes produce texts that are factually inconsistent\nwith the visual input. While there has been some effort to mitigate such\ninconsistencies in natural image captioning, the factuality of generated\ncaptions for structured document images, such as charts, has not received as\nmuch scrutiny, posing a potential threat to information reliability in critical\napplications. This work delves into the factuality aspect by introducing a\ncomprehensive typology of factual errors in generated chart captions. A\nlarge-scale human annotation effort provides insight into the error patterns\nand frequencies in captions crafted by various chart captioning models,\nultimately forming the foundation of a novel dataset, CHOCOLATE. Our analysis\nreveals that even state-of-the-art models, including GPT-4V, frequently produce\ncaptions laced with factual inaccuracies. In response to this challenge, we\nestablish the new task of Chart Caption Factual Error Correction and introduce\nCHARTVE, a model for visual entailment that outperforms proprietary and\nopen-source LVLMs in evaluating factual consistency. Furthermore, we propose\nC2TFEC, an interpretable two-stage framework that excels at correcting factual\nerrors. This work inaugurates a new domain in factual error correction for\nchart captions, presenting a novel evaluation mechanism, and demonstrating an\neffective approach to ensuring the factuality of generated chart captions. The\ncode and data as well as the continuously updated benchmark can be found at:\nhttps://khuangaf.github.io/CHOCOLATE/.\n","authors":["Kung-Hsiang Huang","Mingyang Zhou","Hou Pong Chan","Yi R. Fung","Zhenhailong Wang","Lingyu Zhang","Shih-Fu Chang","Heng Ji"],"pdf_url":"https://arxiv.org/pdf/2312.10160v2.pdf","comment":"ACL 2024 Findings"},{"id":"http://arxiv.org/abs/2405.19701v1","updated":"2024-05-30T05:26:57Z","published":"2024-05-30T05:26:57Z","title":"Significance of Chain of Thought in Gender Bias Mitigation for\n  English-Dravidian Machine Translation","summary":"  Gender bias in machine translation (MT) systems poses a significant challenge\nto achieving accurate and inclusive translations. This paper examines gender\nbias in machine translation systems for languages such as Telugu and Kannada\nfrom the Dravidian family, analyzing how gender inflections affect translation\naccuracy and neutrality using Google Translate and ChatGPT. It finds that while\nplural forms can reduce bias, individual-centric sentences often maintain the\nbias due to historical stereotypes. The study evaluates the Chain of Thought\nprocessing, noting significant bias mitigation from 80% to 4% in Telugu and\nfrom 40% to 0% in Kannada. It also compares Telugu and Kannada translations,\nemphasizing the need for language specific strategies to address these\nchallenges and suggesting directions for future research to enhance fairness in\nboth data preparation and prompts during inference.\n","authors":["Lavanya Prahallad","Radhika Mamidi"],"pdf_url":"https://arxiv.org/pdf/2405.19701v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.07088v3","updated":"2024-05-30T05:21:23Z","published":"2024-03-11T18:26:02Z","title":"SPA: Towards A Computational Friendly Cloud-Base and On-Devices\n  Collaboration Seq2seq Personalized Generation","summary":"  Large language models(LLMs) have shown its outperforming ability on various\ntasks and question answering. However, LLMs require substantial memory storage\non low-resource devices. More critically, the computational speed on these\ndevices is also severely limited. In this paper, we propose SPA(Side Plugin\nAdaption), a lightweight architecture for fast on-devices inference on the\nconstraints of strict on-devices computation and memory constraints. Compared\nwith other on-devices seq2seq generation, SPA could make a fast and stable\ninference on low-resource constraints, allowing it to obtain cost effiency. Our\nmethod establish an interaction between a pretrained LLMs on-cloud and additive\nparameters on-devices, which could provide the knowledge on both pretrained\nLLMs and featured personal feature. Further more, SPA provides a framework to\nkeep feature-base parameters on low computational devices while leave the\nparameters containing general information on the high computational devices.\n","authors":["Yanming Liu","Xinyue Peng","Jiannan Cao","Le Dai","Xingzu Liu","Weihao Liu","Mingbang Wang"],"pdf_url":"https://arxiv.org/pdf/2403.07088v3.pdf","comment":"15 pages, second version of SPA(Side Plugin Adaption)"},{"id":"http://arxiv.org/abs/2405.11577v3","updated":"2024-05-30T05:13:19Z","published":"2024-05-19T15:00:50Z","title":"A Multi-Perspective Analysis of Memorization in Large Language Models","summary":"  Large Language Models (LLMs), trained on massive corpora with billions of\nparameters, show unprecedented performance in various fields. Though surprised\nby their excellent performances, researchers also noticed some special\nbehaviors of those LLMs. One of those behaviors is memorization, in which LLMs\ncan generate the same content used to train them. Though previous research has\ndiscussed memorization, the memorization of LLMs still lacks explanation,\nespecially the cause of memorization and the dynamics of generating them. In\nthis research, we comprehensively discussed memorization from various\nperspectives and extended the discussion scope to not only just the memorized\ncontent but also less and unmemorized content. Through various studies, we\nfound that: (1) Through experiments, we revealed the relation of memorization\nbetween model size, continuation size, and context size. Further, we showed how\nunmemorized sentences transition to memorized sentences. (2) Through embedding\nanalysis, we showed the distribution and decoding dynamics across model size in\nembedding space for sentences with different memorization scores. The n-gram\nstatistics analysis presents d (3) An analysis over n-gram and entropy decoding\ndynamics discovered a boundary effect when the model starts to generate\nmemorized sentences or unmemorized sentences. (4)We trained a Transformer model\nto predict the memorization of different models, showing that it is possible to\npredict memorizations by context.\n","authors":["Bowen Chen","Namgi Han","Yusuke Miyao"],"pdf_url":"https://arxiv.org/pdf/2405.11577v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.14808v3","updated":"2024-05-30T05:09:25Z","published":"2024-02-22T18:58:28Z","title":"RelayAttention for Efficient Large Language Model Serving with Long\n  System Prompts","summary":"  A practical large language model (LLM) service may involve a long system\nprompt, which specifies the instructions, examples, and knowledge documents of\nthe task and is reused across requests. However, the long system prompt causes\nthroughput/latency bottlenecks as the cost of generating the next token grows\nw.r.t. the sequence length. This paper aims to improve the efficiency of LLM\nservices that involve long system prompts. Our key observation is that handling\nthese system prompts requires heavily redundant memory accesses in existing\ncausal attention computation algorithms. Specifically, for batched requests,\nthe cached hidden states (\\ie, key-value pairs) of system prompts are\ntransferred from off-chip DRAM to on-chip SRAM multiple times, each\ncorresponding to an individual request. To eliminate such a redundancy, we\npropose RelayAttention, an attention algorithm that allows reading these hidden\nstates from DRAM exactly once for a batch of input tokens. RelayAttention is a\nfree lunch: it maintains the generation quality while requiring no model\nretraining, as it is based on a mathematical reformulation of causal attention.\nWe have observed significant performance improvements to a production-level\nsystem, vLLM, through integration with RelayAttention. The improvements are\neven more profound with longer system prompts.\n","authors":["Lei Zhu","Xinjiang Wang","Wayne Zhang","Rynson W. H. Lau"],"pdf_url":"https://arxiv.org/pdf/2402.14808v3.pdf","comment":"accepted by the ACL 2024 main conference"},{"id":"http://arxiv.org/abs/2402.06967v2","updated":"2024-05-30T04:57:36Z","published":"2024-02-10T14:52:52Z","title":"Instruct Once, Chat Consistently in Multiple Rounds: An Efficient Tuning\n  Framework for Dialogue","summary":"  Tuning language models for dialogue generation has been a prevalent paradigm\nfor building capable dialogue agents. Yet, traditional tuning narrowly views\ndialogue generation as resembling other language generation tasks, ignoring the\nrole disparities between two speakers and the multi-round interactive process\nthat dialogues ought to be. Such a manner often leads to unsatisfactory chat\nconsistency for the built agent. In this work, we emphasize the interactive,\ncommunicative nature of dialogue and argue that it is more feasible to model\nthe speaker roles of agent and user separately, enabling the agent to adhere to\nits role consistently. With this in mind, we propose an efficient Multi-round\nInteractive Dialogue Tuning (Midi-Tuning) framework. It models the agent and\nuser individually with two adapters built upon large language models. The\nadapters make use of respective utterances round by round in alternating order\nand they are tuned via a round-level memory caching mechanism. Extensive\nexperiments demonstrate that, our framework performs superior to traditional\nfine-tuning and harbors the tremendous potential for improving dialogue\nconsistency.\n","authors":["Jian Wang","Chak Tou Leong","Jiashuo Wang","Dongding Lin","Wenjie Li","Xiao-Yong Wei"],"pdf_url":"https://arxiv.org/pdf/2402.06967v2.pdf","comment":"Accepted by ACL 2024"},{"id":"http://arxiv.org/abs/2401.02415v2","updated":"2024-05-30T04:45:34Z","published":"2024-01-04T18:59:12Z","title":"LLaMA Pro: Progressive LLaMA with Block Expansion","summary":"  Humans generally acquire new skills without compromising the old; however,\nthe opposite holds for Large Language Models (LLMs), e.g., from LLaMA to\nCodeLLaMA. To this end, we propose a new post-pretraining method for LLMs with\nan expansion of Transformer blocks. We tune the expanded blocks using only new\ncorpus, efficiently and effectively improving the model's knowledge without\ncatastrophic forgetting. In this paper, we experiment on the corpus of code and\nmath, yielding LLaMA Pro-8.3B, a versatile foundation model initialized from\nLLaMA2-7B, excelling in general tasks, programming, and mathematics. LLaMA Pro\nand its instruction-following counterpart (LLaMA Pro-Instruct) achieve advanced\nperformance among various benchmarks, demonstrating superiority over existing\nopen models in the LLaMA family and the immense potential of reasoning and\naddressing diverse tasks as an intelligent agent. Our findings provide valuable\ninsights into integrating natural and programming languages, laying a solid\nfoundation for developing advanced language agents that operate effectively in\nvarious environments.\n","authors":["Chengyue Wu","Yukang Gan","Yixiao Ge","Zeyu Lu","Jiahao Wang","Ye Feng","Ying Shan","Ping Luo"],"pdf_url":"https://arxiv.org/pdf/2401.02415v2.pdf","comment":"Accepted by ACL 2024, Main Conference"},{"id":"http://arxiv.org/abs/2402.10466v4","updated":"2024-05-30T04:19:54Z","published":"2024-02-16T06:13:18Z","title":"Large Language Models as Zero-shot Dialogue State Tracker through\n  Function Calling","summary":"  Large language models (LLMs) are increasingly prevalent in conversational\nsystems due to their advanced understanding and generative capabilities in\ngeneral contexts. However, their effectiveness in task-oriented dialogues\n(TOD), which requires not only response generation but also effective dialogue\nstate tracking (DST) within specific tasks and domains, remains less\nsatisfying. In this work, we propose a novel approach FnCTOD for solving DST\nwith LLMs through function calling. This method improves zero-shot DST,\nallowing adaptation to diverse domains without extensive data collection or\nmodel tuning. Our experimental results demonstrate that our approach achieves\nexceptional performance with both modestly sized open-source and also\nproprietary LLMs: with in-context prompting it enables various 7B or 13B\nparameter models to surpass the previous state-of-the-art (SOTA) achieved by\nChatGPT, and improves ChatGPT's performance beating the SOTA by 5.6% average\njoint goal accuracy (JGA). Individual model results for GPT-3.5 and GPT-4 are\nboosted by 4.8% and 14%, respectively. We also show that by fine-tuning on a\nsmall collection of diverse task-oriented dialogues, we can equip modestly\nsized models, specifically a 13B parameter LLaMA2-Chat model, with\nfunction-calling capabilities and DST performance comparable to ChatGPT while\nmaintaining their chat capabilities. We have made the code publicly available\nat https://github.com/facebookresearch/FnCTOD\n","authors":["Zekun Li","Zhiyu Zoey Chen","Mike Ross","Patrick Huber","Seungwhan Moon","Zhaojiang Lin","Xin Luna Dong","Adithya Sagar","Xifeng Yan","Paul A. Crook"],"pdf_url":"https://arxiv.org/pdf/2402.10466v4.pdf","comment":"ACL 2024 Main. Code available at:\n  https://github.com/facebookresearch/FnCTOD"},{"id":"http://arxiv.org/abs/2402.01869v2","updated":"2024-05-30T04:18:03Z","published":"2024-02-02T19:47:57Z","title":"InferCept: Efficient Intercept Support for Augmented Large Language\n  Model Inference","summary":"  Large language models are increasingly integrated with external environments,\ntools, and agents like ChatGPT plugins to extend their capability beyond\nlanguage-centric tasks. However, today's LLM inference systems are designed for\nstandalone LLMs. They treat each external interaction as the end of LLM\ngeneration and form a new request when the interaction finishes, causing\nunnecessary recomputation of already computed contexts, which accounts for\n37-40% of total model forwarding time. This paper presents InferCept, the first\nLLM inference framework targeting augmented LLMs and supporting the efficient\ninterception of LLM generation. InferCept minimizes the GPU resource waste\ncaused by LLM interceptions and dedicates saved memory for serving more\nrequests. InferCept improves the overall serving throughput by 1.6x-2x and\ncompletes 2x more requests per second compared to the state-of-the-art LLM\ninference systems.\n","authors":["Reyna Abhyankar","Zijian He","Vikranth Srivatsa","Hao Zhang","Yiying Zhang"],"pdf_url":"https://arxiv.org/pdf/2402.01869v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.19670v1","updated":"2024-05-30T03:44:54Z","published":"2024-05-30T03:44:54Z","title":"One Token Can Help! Learning Scalable and Pluggable Virtual Tokens for\n  Retrieval-Augmented Large Language Models","summary":"  Retrieval-augmented generation (RAG) is a promising way to improve large\nlanguage models (LLMs) for generating more factual, accurate, and up-to-date\ncontent. Existing methods either optimize prompts to guide LLMs in leveraging\nretrieved information or directly fine-tune the LLMs to adapt to RAG scenarios.\nAlthough fine-tuning can yield better performance, it often compromises the\nLLMs' general generation capabilities by modifying their parameters. This\nlimitation poses challenges in practical applications, especially when LLMs are\nalready deployed, as parameter adjustments may affect their original\nfunctionality. To address this, we propose a novel method that involves\nlearning scalable and pluggable virtual tokens for RAG. By maintaining the\nLLMs' original parameters and fine-tuning only the embeddings of these\npluggable tokens, our approach not only enhances LLMs' performance but also\npreserves their general generation capacities. Furthermore, we design several\ntraining strategies to improve the scalability, flexibility, and\ngeneralizability of our method. Comprehensive experiments across nine\nquestion-answering tasks demonstrate the superiority of our approach.\n","authors":["Yutao Zhu","Zhaoheng Huang","Zhicheng Dou","Ji-Rong Wen"],"pdf_url":"https://arxiv.org/pdf/2405.19670v1.pdf","comment":"working in progress, repo: https://github.com/DaoD/SPRING/"},{"id":"http://arxiv.org/abs/2405.19660v1","updated":"2024-05-30T03:20:56Z","published":"2024-05-30T03:20:56Z","title":"PATIENT-Ψ: Using Large Language Models to Simulate Patients for\n  Training Mental Health Professionals","summary":"  Mental illness remains one of the most critical public health issues, with a\nsignificant gap between the available mental health support and patient needs.\nMany mental health professionals highlight a disconnect between their training\nand real-world patient interactions, leaving some trainees feeling unprepared\nand potentially affecting their early career success. In this paper, we propose\nPATIENT-{\\Psi}, a novel patient simulation framework for cognitive behavior\ntherapy (CBT) training. To build PATIENT-{\\Psi}, we constructed diverse patient\nprofiles and their corresponding cognitive models based on CBT principles, and\nthen used large language models (LLMs) programmed with the patient cognitive\nmodels to act as a simulated therapy patient. We propose an interactive\ntraining scheme, PATIENT-{\\Psi}-TRAINER, for mental health trainees to practice\na key skill in CBT -- formulating the cognitive model of the patient -- through\nrole-playing a therapy session with PATIENT-{\\Psi}. To evaluate PATIENT-{\\Psi},\nwe conducted a user study of 4 mental health trainees and 10 experts. The\nresults demonstrate that practice using PATIENT-{\\Psi}-TRAINER greatly enhances\nthe perceived skill acquisition and confidence of the trainees beyond existing\nforms of training such as textbooks, videos, and role-play with non-patients.\nBased on the experts' perceptions, PATIENT-{\\Psi} is perceived to be closer to\nreal patient interactions than GPT-4, and PATIENT-{\\Psi}-TRAINER holds strong\npromise to improve trainee competencies. Our pioneering patient simulation\ntraining framework, using LLMs, holds great potential to enhance and advance\nmental health training, ultimately leading to improved patient care and\noutcomes. We will release all our data, code, and the training platform.\n","authors":["Ruiyi Wang","Stephanie Milani","Jamie C. Chiu","Shaun M. Eack","Travis Labrum","Samuel M. Murphy","Nev Jones","Kate Hardy","Hong Shen","Fei Fang","Zhiyu Zoey Chen"],"pdf_url":"https://arxiv.org/pdf/2405.19660v1.pdf","comment":"Work in progress"},{"id":"http://arxiv.org/abs/2405.19648v1","updated":"2024-05-30T03:00:47Z","published":"2024-05-30T03:00:47Z","title":"Detecting Hallucinations in Large Language Model Generation: A Token\n  Probability Approach","summary":"  Concerns regarding the propensity of Large Language Models (LLMs) to produce\ninaccurate outputs, also known as hallucinations, have escalated. Detecting\nthem is vital for ensuring the reliability of applications relying on\nLLM-generated content. Current methods often demand substantial resources and\nrely on extensive LLMs or employ supervised learning with multidimensional\nfeatures or intricate linguistic and semantic analyses difficult to reproduce\nand largely depend on using the same LLM that hallucinated. This paper\nintroduces a supervised learning approach employing two simple classifiers\nutilizing only four numerical features derived from tokens and vocabulary\nprobabilities obtained from other LLM evaluators, which are not necessarily the\nsame. The method yields promising results, surpassing state-of-the-art outcomes\nin multiple tasks across three different benchmarks. Additionally, we provide a\ncomprehensive examination of the strengths and weaknesses of our approach,\nhighlighting the significance of the features utilized and the LLM employed as\nan evaluator. We have released our code publicly at\nhttps://github.com/Baylor-AI/HalluDetect.\n","authors":["Ernesto Quevedo","Jorge Yero","Rachel Koerner","Pablo Rivas","Tomas Cerny"],"pdf_url":"https://arxiv.org/pdf/2405.19648v1.pdf","comment":"ICAI'24 - The 26th Int'l Conf on Artificial Intelligence"},{"id":"http://arxiv.org/abs/2401.06102v3","updated":"2024-05-30T02:52:08Z","published":"2024-01-11T18:33:48Z","title":"Patchscopes: A Unifying Framework for Inspecting Hidden Representations\n  of Language Models","summary":"  Understanding the internal representations of large language models (LLMs)\ncan help explain models' behavior and verify their alignment with human values.\nGiven the capabilities of LLMs in generating human-understandable text, we\npropose leveraging the model itself to explain its internal representations in\nnatural language. We introduce a framework called Patchscopes and show how it\ncan be used to answer a wide range of questions about an LLM's computation. We\nshow that many prior interpretability methods based on projecting\nrepresentations into the vocabulary space and intervening on the LLM\ncomputation can be viewed as instances of this framework. Moreover, several of\ntheir shortcomings such as failure in inspecting early layers or lack of\nexpressivity can be mitigated by Patchscopes. Beyond unifying prior inspection\ntechniques, Patchscopes also opens up new possibilities such as using a more\ncapable model to explain the representations of a smaller model, and multihop\nreasoning error correction.\n","authors":["Asma Ghandeharioun","Avi Caciularu","Adam Pearce","Lucas Dixon","Mor Geva"],"pdf_url":"https://arxiv.org/pdf/2401.06102v3.pdf","comment":"ICML 2024 (to appear)"},{"id":"http://arxiv.org/abs/2405.12107v2","updated":"2024-05-30T02:47:10Z","published":"2024-05-20T15:23:19Z","title":"Imp: Highly Capable Large Multimodal Models for Mobile Devices","summary":"  By harnessing the capabilities of large language models (LLMs), recent large\nmultimodal models (LMMs) have shown remarkable versatility in open-world\nmultimodal understanding. Nevertheless, they are usually parameter-heavy and\ncomputation-intensive, thus hindering their applicability in\nresource-constrained scenarios. To this end, several lightweight LMMs have been\nproposed successively to maximize the capabilities under constrained scale\n(e.g., 3B). Despite the encouraging results achieved by these methods, most of\nthem only focus on one or two aspects of the design space, and the key design\nchoices that influence model capability have not yet been thoroughly\ninvestigated. In this paper, we conduct a systematic study for lightweight LMMs\nfrom the aspects of model architecture, training strategy, and training data.\nBased on our findings, we obtain Imp -- a family of highly capable LMMs at the\n2B-4B scales. Notably, our Imp-3B model steadily outperforms all the existing\nlightweight LMMs of similar size, and even surpasses the state-of-the-art LMMs\nat the 13B scale. With low-bit quantization and resolution reduction\ntechniques, our Imp model can be deployed on a Qualcomm Snapdragon 8Gen3 mobile\nchip with a high inference speed of about 13 tokens/s.\n","authors":["Zhenwei Shao","Zhou Yu","Jun Yu","Xuecheng Ouyang","Lihao Zheng","Zhenbiao Gai","Mingyang Wang","Jiajun Ding"],"pdf_url":"https://arxiv.org/pdf/2405.12107v2.pdf","comment":"fix some typos and correct a few number in the tables"},{"id":"http://arxiv.org/abs/2405.19635v1","updated":"2024-05-30T02:37:35Z","published":"2024-05-30T02:37:35Z","title":"GKT: A Novel Guidance-Based Knowledge Transfer Framework For Efficient\n  Cloud-edge Collaboration LLM Deployment","summary":"  The burgeoning size of Large Language Models (LLMs) has led to enhanced\ncapabilities in generating responses, albeit at the expense of increased\ninference times and elevated resource demands. Existing methods of\nacceleration, predominantly hinged on knowledge distillation, generally\nnecessitate fine-tuning of considerably large models, such as Llama-7B, posing\na challenge for average users. Furthermore, present techniques for expediting\ninference and reducing costs operate independently. To address these issues, we\nintroduce a novel and intuitive Guidance-based Knowledge Transfer (GKT)\nframework. This approach leverages a larger LLM as a ''teacher'' to create\nguidance prompts, paired with a smaller ''student'' model to finalize\nresponses. Remarkably, GKT requires no fine-tuning and doesn't necessitate the\nteacher and student models to have the same vocabulary, allowing for extensive\nbatch generation to accelerate the process while ensuring user customization.\nGKT can be seamlessly integrated into cloud-edge collaboration architectures,\nand is versatile enough for plug-and-play application across various models. It\nexcels in both efficiency and affordability, epitomizing a ''cheap and\ncheerful'' solution. GKT achieves a maximum accuracy improvement of 14.18%,\nalong with a 10.72 times speed-up on GSM8K and an accuracy improvement of 14.00\n% along with a 7.73 times speed-up in CSQA. When utilizing ChatGPT as teacher\nmodel and Llama2-70B as the student model, we can achieve 95.00% of ChatGPT's\nperformance at 52% of the cost. The results highlight substantial enhancements\nin accuracy and processing speed on the GSM8K and CSQA datasets, surpassing the\nperformance of using either the student or teacher models in isolation.\n","authors":["Yao Yao","Zuchao Li","Hai Zhao"],"pdf_url":"https://arxiv.org/pdf/2405.19635v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.15924v3","updated":"2024-05-30T02:13:56Z","published":"2024-05-24T20:32:49Z","title":"SLIDE: A Framework Integrating Small and Large Language Models for\n  Open-Domain Dialogues Evaluation","summary":"  The long-standing one-to-many problem of gold standard responses in\nopen-domain dialogue systems presents challenges for automatic evaluation\nmetrics. Though prior works have demonstrated some success by applying powerful\nLarge Language Models (LLMs), existing approaches still struggle with the\none-to-many problem, and exhibit subpar performance in domain-specific\nscenarios. We assume the commonsense reasoning biases within LLMs may hinder\ntheir performance in domainspecific evaluations. To address both issues, we\npropose a novel framework SLIDE (Small and Large Integrated for Dialogue\nEvaluation), that leverages both a small, specialised model (SLM), and LLMs for\nthe evaluation of open domain dialogues. Our approach introduces several\ntechniques: (1) Contrastive learning to differentiate between robust and\nnon-robust response embeddings; (2) A novel metric for semantic sensitivity\nthat combines embedding cosine distances with similarity learned through neural\nnetworks, and (3) a strategy for incorporating the evaluation results from both\nthe SLM and LLMs. Our empirical results demonstrate that our approach achieves\nstate-of-the-art performance in both the classification and evaluation tasks,\nand additionally the SLIDE evaluator exhibits better correlation with human\njudgements. Our code is available at https://\ngithub.com/hegehongcha/SLIDE-ACL2024.\n","authors":["Kun Zhao","Bohao Yang","Chen Tang","Chenghua Lin","Liang Zhan"],"pdf_url":"https://arxiv.org/pdf/2405.15924v3.pdf","comment":"Accepted by ACL2024 Findings"},{"id":"http://arxiv.org/abs/2405.17743v2","updated":"2024-05-30T02:12:05Z","published":"2024-05-28T01:55:35Z","title":"ORLM: Training Large Language Models for Optimization Modeling","summary":"  Large Language Models (LLMs) have emerged as powerful tools for tackling\ncomplex Operations Research (OR) problem by providing the capacity in\nautomating optimization modeling. However, current methodologies heavily rely\non prompt engineering (e.g., multi-agent cooperation) with proprietary LLMs,\nraising data privacy concerns that could be prohibitive in industry\napplications. To tackle this issue, we propose training open-source LLMs for\noptimization modeling. We identify four critical requirements for the training\ndataset of OR LLMs, design and implement OR-Instruct, a semi-automated process\nfor creating synthetic data tailored to specific requirements. We also\nintroduce the IndustryOR benchmark, the first industrial benchmark for testing\nLLMs on solving real-world OR problems. We apply the data from OR-Instruct to\nvarious open-source LLMs of 7b size (termed as ORLMs), resulting in a\nsignificantly improved capability for optimization modeling. Our\nbest-performing ORLM achieves state-of-the-art performance on the NL4OPT, MAMO,\nand IndustryOR benchmarks. Our code and data are available at\n\\url{https://github.com/Cardinal-Operations/ORLM}.\n","authors":["Zhengyang Tang","Chenyu Huang","Xin Zheng","Shixi Hu","Zizhuo Wang","Dongdong Ge","Benyou Wang"],"pdf_url":"https://arxiv.org/pdf/2405.17743v2.pdf","comment":"Work in progress"},{"id":"http://arxiv.org/abs/2405.19616v1","updated":"2024-05-30T02:09:51Z","published":"2024-05-30T02:09:51Z","title":"Easy Problems That LLMs Get Wrong","summary":"  We introduce a comprehensive Linguistic Benchmark designed to evaluate the\nlimitations of Large Language Models (LLMs) in domains such as logical\nreasoning, spatial intelligence, and linguistic understanding, among others.\nThrough a series of straightforward questions, it uncovers the significant\nlimitations of well-regarded models to perform tasks that humans manage with\nease. It also highlights the potential of prompt engineering to mitigate some\nerrors and underscores the necessity for better training methodologies. Our\nfindings stress the importance of grounding LLMs with human reasoning and\ncommon sense, emphasising the need for human-in-the-loop for enterprise\napplications. We hope this work paves the way for future research to enhance\nthe usefulness and reliability of new models.\n","authors":["Sean Williams","James Huckle"],"pdf_url":"https://arxiv.org/pdf/2405.19616v1.pdf","comment":"AutogenAI Ltd. Associated code at\n  https://github.com/autogenai/easy-problems-that-llms-get-wrong"},{"id":"http://arxiv.org/abs/2402.01349v2","updated":"2024-05-30T01:57:14Z","published":"2024-02-02T12:07:00Z","title":"Beyond the Answers: Reviewing the Rationality of Multiple Choice\n  Question Answering for the Evaluation of Large Language Models","summary":"  In the field of natural language processing (NLP), Large Language Models\n(LLMs) have precipitated a paradigm shift, markedly enhancing performance in\nnatural language generation tasks. Despite these advancements, the\ncomprehensive evaluation of LLMs remains an inevitable challenge for the\ncommunity. Recently, the utilization of Multiple Choice Question Answering\n(MCQA) as a benchmark for LLMs has gained considerable traction. This study\nfirst investigates the limitations of MCQA as an evaluation method for LLMs and\nthen analyzes the fundamental reason for the limitations of MCQA, that while\nLLMs may select the correct answers, it is possible that they also recognize\nother wrong options as correct. Finally, we propose a dataset augmenting method\nfor Multiple-Choice Questions (MCQs), MCQA+, that can more accurately reflect\nthe performance of the model, which underscores the need for more robust\nevaluation mechanisms in assessing the performance of LLMs.\n","authors":["Haochun Wang","Sendong Zhao","Zewen Qiang","Nuwa Xi","Bing Qin","Ting Liu"],"pdf_url":"https://arxiv.org/pdf/2402.01349v2.pdf","comment":"17 pages, 8 figures"},{"id":"http://arxiv.org/abs/2312.17267v4","updated":"2024-05-30T01:56:51Z","published":"2023-12-26T14:16:16Z","title":"Enhancing Low-Resource Relation Representations through Multi-View\n  Decoupling","summary":"  Recently, prompt-tuning with pre-trained language models (PLMs) has\ndemonstrated the significantly enhancing ability of relation extraction (RE)\ntasks. However, in low-resource scenarios, where the available training data is\nscarce, previous prompt-based methods may still perform poorly for prompt-based\nrepresentation learning due to a superficial understanding of the relation. To\nthis end, we highlight the importance of learning high-quality relation\nrepresentation in low-resource scenarios for RE, and propose a novel\nprompt-based relation representation method, named MVRE\n(\\underline{M}ulti-\\underline{V}iew \\underline{R}elation\n\\underline{E}xtraction), to better leverage the capacity of PLMs to improve the\nperformance of RE within the low-resource prompt-tuning paradigm. Specifically,\nMVRE decouples each relation into different perspectives to encompass\nmulti-view relation representations for maximizing the likelihood during\nrelation inference. Furthermore, we also design a Global-Local loss and a\nDynamic-Initialization method for better alignment of the multi-view\nrelation-representing virtual words, containing the semantics of relation\nlabels during the optimization learning process and initialization. Extensive\nexperiments on three benchmark datasets show that our method can achieve\nstate-of-the-art in low-resource settings.\n","authors":["Chenghao Fan","Wei Wei","Xiaoye Qu","Zhenyi Lu","Wenfeng Xie","Yu Cheng","Dangyang Chen"],"pdf_url":"https://arxiv.org/pdf/2312.17267v4.pdf","comment":"Accepted to AAAI 2024"},{"id":"http://arxiv.org/abs/2405.19597v1","updated":"2024-05-30T01:27:43Z","published":"2024-05-30T01:27:43Z","title":"SVFT: Parameter-Efficient Fine-Tuning with Singular Vectors","summary":"  Popular parameter-efficient fine-tuning (PEFT) methods, such as LoRA and its\nvariants, freeze pre-trained model weights \\(W\\) and inject learnable matrices\n\\(\\Delta W\\). These \\(\\Delta W\\) matrices are structured for efficient\nparameterization, often using techniques like low-rank approximations or\nscaling vectors. However, these methods typically show a performance gap\ncompared to full fine-tuning. Although recent PEFT methods have narrowed this\ngap, they do so at the cost of additional learnable parameters. We propose\nSVFT, a simple approach that fundamentally differs from existing methods: the\nstructure imposed on \\(\\Delta W\\) depends on the specific weight matrix \\(W\\).\nSpecifically, SVFT updates \\(W\\) as a sparse combination of outer products of\nits singular vectors, training only the coefficients (scales) of these sparse\ncombinations. This approach allows fine-grained control over expressivity\nthrough the number of coefficients. Extensive experiments on language and\nvision benchmarks show that SVFT recovers up to 96% of full fine-tuning\nperformance while training only 0.006 to 0.25% of parameters, outperforming\nexisting methods that only recover up to 85% performance using 0.03 to 0.8% of\nthe trainable parameter budget.\n","authors":["Vijay Lingam","Atula Tejaswi","Aditya Vavre","Aneesh Shetty","Gautham Krishna Gudur","Joydeep Ghosh","Alex Dimakis","Eunsol Choi","Aleksandar Bojchevski","Sujay Sanghavi"],"pdf_url":"https://arxiv.org/pdf/2405.19597v1.pdf","comment":"17 pages, 5 figures, 14 tables"},{"id":"http://arxiv.org/abs/2405.19592v1","updated":"2024-05-30T01:11:35Z","published":"2024-05-30T01:11:35Z","title":"Why Larger Language Models Do In-context Learning Differently?","summary":"  Large language models (LLM) have emerged as a powerful tool for AI, with the\nkey ability of in-context learning (ICL), where they can perform well on unseen\ntasks based on a brief series of task examples without necessitating any\nadjustments to the model parameters. One recent interesting mysterious\nobservation is that models of different scales may have different ICL\nbehaviors: larger models tend to be more sensitive to noise in the test\ncontext. This work studies this observation theoretically aiming to improve the\nunderstanding of LLM and ICL. We analyze two stylized settings: (1) linear\nregression with one-layer single-head linear transformers and (2) parity\nclassification with two-layer multiple attention heads transformers (non-linear\ndata and non-linear model). In both settings, we give closed-form optimal\nsolutions and find that smaller models emphasize important hidden features\nwhile larger ones cover more hidden features; thus, smaller models are more\nrobust to noise while larger ones are more easily distracted, leading to\ndifferent ICL behaviors. This sheds light on where transformers pay attention\nto and how that affects ICL. Preliminary experimental results on large base and\nchat models provide positive support for our analysis.\n","authors":["Zhenmei Shi","Junyi Wei","Zhuoyan Xu","Yingyu Liang"],"pdf_url":"https://arxiv.org/pdf/2405.19592v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2112.07783v2","updated":"2024-05-30T00:26:06Z","published":"2021-12-14T23:06:21Z","title":"Online antisemitism across platforms","summary":"  We created a fine-grained AI system for the detection of antisemitism. This\nExplainable AI will identify English and German anti-Semitic expressions of\ndehumanization, verbal aggression and conspiracies in online social media\nmessages across platforms, to support high-level decision making.\n","authors":["Tom De Smedt"],"pdf_url":"https://arxiv.org/pdf/2112.07783v2.pdf","comment":"6 pages"},{"id":"http://arxiv.org/abs/2405.15525v2","updated":"2024-05-30T00:08:51Z","published":"2024-05-24T13:12:14Z","title":"Sparse Matrix in Large Language Model Fine-tuning","summary":"  LoRA and its variants have become popular parameter-efficient fine-tuning\n(PEFT) methods due to their ability to avoid excessive computational costs.\nHowever, an accuracy gap often exists between PEFT methods and full fine-tuning\n(FT), and this gap has yet to be systematically studied. In this work, we\nintroduce a method for selecting sparse sub-matrices that aim to minimize the\nperformance gap between PEFT vs. full fine-tuning (FT) while also reducing both\nfine-tuning computational cost and memory cost. Our Sparse Matrix Tuning (SMT)\nmethod begins by identifying the most significant sub-matrices in the gradient\nupdate, updating only these blocks during the fine-tuning process. In our\nexperiments, we demonstrate that SMT consistently surpasses other PEFT baseline\n(e.g. LoRA and DoRA) in fine-tuning popular large language models such as LLaMA\nacross a broad spectrum of tasks, while reducing the GPU memory footprint by\n67% compared to FT. We also examine how the performance of LoRA and DoRA tends\nto plateau and decline as the number of trainable parameters increases, in\ncontrast, our SMT method does not suffer from such issue.\n","authors":["Haoze He","Juncheng Billy Li","Xuan Jiang","Heather Miller"],"pdf_url":"https://arxiv.org/pdf/2405.15525v2.pdf","comment":"14 pages"},{"id":"http://arxiv.org/abs/2405.20331v1","updated":"2024-05-30T17:59:04Z","published":"2024-05-30T17:59:04Z","title":"CoSy: Evaluating Textual Explanations of Neurons","summary":"  A crucial aspect of understanding the complex nature of Deep Neural Networks\n(DNNs) is the ability to explain learned concepts within their latent\nrepresentations. While various methods exist to connect neurons to textual\ndescriptions of human-understandable concepts, evaluating the quality of these\nexplanation methods presents a major challenge in the field due to a lack of\nunified, general-purpose quantitative evaluation. In this work, we introduce\nCoSy (Concept Synthesis) -- a novel, architecture-agnostic framework to\nevaluate the quality of textual explanations for latent neurons. Given textual\nexplanations, our proposed framework leverages a generative model conditioned\non textual input to create data points representing the textual explanation.\nThen, the neuron's response to these explanation data points is compared with\nthe response to control data points, providing a quality estimate of the given\nexplanation. We ensure the reliability of our proposed framework in a series of\nmeta-evaluation experiments and demonstrate practical value through insights\nfrom benchmarking various concept-based textual explanation methods for\nComputer Vision tasks, showing that tested explanation methods significantly\ndiffer in quality.\n","authors":["Laura Kopf","Philine Lou Bommer","Anna Hedström","Sebastian Lapuschkin","Marina M. -C. Höhne","Kirill Bykov"],"pdf_url":"https://arxiv.org/pdf/2405.20331v1.pdf","comment":"10 pages, 5 figures"},{"id":"http://arxiv.org/abs/2405.20218v1","updated":"2024-05-30T16:19:02Z","published":"2024-05-30T16:19:02Z","title":"ESG-FTSE: A corpus of news articles with ESG relevance labels and use\n  cases","summary":"  We present ESG-FTSE, the first corpus comprised of news articles with\nEnvironmental, Social and Governance (ESG) relevance annotations. In recent\nyears, investors and regulators have pushed ESG investing to the mainstream due\nto the urgency of climate change. This has led to the rise of ESG scores to\nevaluate an investment's credentials as socially responsible. While demand for\nESG scores is high, their quality varies wildly. Quantitative techniques can be\napplied to improve ESG scores, thus, responsible investing. To contribute to\nresource building for ESG and financial text mining, we pioneer the ESG-FTSE\ncorpus. We further present the first of its kind ESG annotation schema. It has\nthree levels: a binary classification (relevant versus irrelevant news\narticles), ESG classification (ESG-related news articles), and target company.\nBoth supervised and unsupervised learning experiments for ESG relevance\ndetection were conducted to demonstrate that the corpus can be used in\ndifferent settings to derive accurate ESG predictions. Keywords: corpus\nannotation, ESG labels, annotation schema, news article, natural language\nprocessing\n","authors":["Mariya Pavlova","Bernard Casey","Miaosen Wang"],"pdf_url":"https://arxiv.org/pdf/2405.20218v1.pdf","comment":"The corpus is available at\n  https://github.com/mariavpavlova/ESG-FTSE-Corpus.\n  https://aclanthology.org/2024.finnlp-1.14/"},{"id":"http://arxiv.org/abs/2405.20018v1","updated":"2024-05-30T12:57:35Z","published":"2024-05-30T12:57:35Z","title":"Safe Multi-agent Reinforcement Learning with Natural Language\n  Constraints","summary":"  The role of natural language constraints in Safe Multi-agent Reinforcement\nLearning (MARL) is crucial, yet often overlooked. While Safe MARL has vast\npotential, especially in fields like robotics and autonomous vehicles, its full\npotential is limited by the need to define constraints in pre-designed\nmathematical terms, which requires extensive domain expertise and reinforcement\nlearning knowledge, hindering its broader adoption. To address this limitation\nand make Safe MARL more accessible and adaptable, we propose a novel approach\nnamed Safe Multi-agent Reinforcement Learning with Natural Language constraints\n(SMALL). Our method leverages fine-tuned language models to interpret and\nprocess free-form textual constraints, converting them into semantic embeddings\nthat capture the essence of prohibited states and behaviours. These embeddings\nare then integrated into the multi-agent policy learning process, enabling\nagents to learn policies that minimize constraint violations while optimizing\nrewards. To evaluate the effectiveness of SMALL, we introduce the LaMaSafe, a\nmulti-task benchmark designed to assess the performance of multiple agents in\nadhering to natural language constraints. Empirical evaluations across various\nenvironments demonstrate that SMALL achieves comparable rewards and\nsignificantly fewer constraint violations, highlighting its effectiveness in\nunderstanding and enforcing natural language constraints.\n","authors":["Ziyan Wang","Meng Fang","Tristan Tomilin","Fei Fang","Yali Du"],"pdf_url":"https://arxiv.org/pdf/2405.20018v1.pdf","comment":"23 pages, 6 figures"},{"id":"http://arxiv.org/abs/2405.20015v1","updated":"2024-05-30T12:50:32Z","published":"2024-05-30T12:50:32Z","title":"Efficient LLM-Jailbreaking by Introducing Visual Modality","summary":"  This paper focuses on jailbreaking attacks against large language models\n(LLMs), eliciting them to generate objectionable content in response to harmful\nuser queries. Unlike previous LLM-jailbreaks that directly orient to LLMs, our\napproach begins by constructing a multimodal large language model (MLLM)\nthrough the incorporation of a visual module into the target LLM. Subsequently,\nwe conduct an efficient MLLM-jailbreak to generate jailbreaking embeddings\nembJS. Finally, we convert the embJS into text space to facilitate the\njailbreaking of the target LLM. Compared to direct LLM-jailbreaking, our\napproach is more efficient, as MLLMs are more vulnerable to jailbreaking than\npure LLM. Additionally, to improve the attack success rate (ASR) of\njailbreaking, we propose an image-text semantic matching scheme to identify a\nsuitable initial input. Extensive experiments demonstrate that our approach\nsurpasses current state-of-the-art methods in terms of both efficiency and\neffectiveness. Moreover, our approach exhibits superior cross-class\njailbreaking capabilities.\n","authors":["Zhenxing Niu","Yuyao Sun","Haodong Ren","Haoxuan Ji","Quan Wang","Xiaoke Ma","Gang Hua","Rong Jin"],"pdf_url":"https://arxiv.org/pdf/2405.20015v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.19988v1","updated":"2024-05-30T12:18:06Z","published":"2024-05-30T12:18:06Z","title":"Video-Language Critic: Transferable Reward Functions for\n  Language-Conditioned Robotics","summary":"  Natural language is often the easiest and most convenient modality for humans\nto specify tasks for robots. However, learning to ground language to behavior\ntypically requires impractical amounts of diverse, language-annotated\ndemonstrations collected on each target robot. In this work, we aim to separate\nthe problem of what to accomplish from how to accomplish it, as the former can\nbenefit from substantial amounts of external observation-only data, and only\nthe latter depends on a specific robot embodiment. To this end, we propose\nVideo-Language Critic, a reward model that can be trained on readily available\ncross-embodiment data using contrastive learning and a temporal ranking\nobjective, and use it to score behavior traces from a separate reinforcement\nlearning actor. When trained on Open X-Embodiment data, our reward model\nenables 2x more sample-efficient policy training on Meta-World tasks than a\nsparse reward only, despite a significant domain gap. Using in-domain data but\nin a challenging task generalization setting on Meta-World, we further\ndemonstrate more sample-efficient training than is possible with prior\nlanguage-conditioned reward models that are either trained with binary\nclassification, use static images, or do not leverage the temporal information\npresent in video data.\n","authors":["Minttu Alakuijala","Reginald McLean","Isaac Woungang","Nariman Farsad","Samuel Kaski","Pekka Marttinen","Kai Yuan"],"pdf_url":"https://arxiv.org/pdf/2405.19988v1.pdf","comment":"10 pages in the main text, 16 pages including references and\n  supplementary materials. 4 figures and 3 tables in the main text, 1 table in\n  supplementary materials"},{"id":"http://arxiv.org/abs/2405.19893v1","updated":"2024-05-30T09:50:38Z","published":"2024-05-30T09:50:38Z","title":"Similarity is Not All You Need: Endowing Retrieval Augmented Generation\n  with Multi Layered Thoughts","summary":"  In recent years, large language models (LLMs) have made remarkable\nachievements in various domains. However, the untimeliness and cost of\nknowledge updates coupled with hallucination issues of LLMs have curtailed\ntheir applications in knowledge intensive tasks, where retrieval augmented\ngeneration (RAG) can be of help. Nevertheless, existing retrieval augmented\nmodels typically use similarity as a bridge between queries and documents and\nfollow a retrieve then read procedure. In this work, we argue that similarity\nis not always the panacea and totally relying on similarity would sometimes\ndegrade the performance of retrieval augmented generation. To this end, we\npropose MetRag, a Multi layEred Thoughts enhanced Retrieval Augmented\nGeneration framework. To begin with, beyond existing similarity oriented\nthought, we embrace a small scale utility model that draws supervision from an\nLLM for utility oriented thought and further come up with a smarter model by\ncomprehensively combining the similarity and utility oriented thoughts.\nFurthermore, given the fact that the retrieved document set tends to be huge\nand using them in isolation makes it difficult to capture the commonalities and\ncharacteristics among them, we propose to make an LLM as a task adaptive\nsummarizer to endow retrieval augmented generation with compactness-oriented\nthought. Finally, with multi layered thoughts from the precedent stages, an LLM\nis called for knowledge augmented generation. Extensive experiments on\nknowledge-intensive tasks have demonstrated the superiority of MetRag.\n","authors":["Chunjing Gan","Dan Yang","Binbin Hu","Hanxiao Zhang","Siyuan Li","Ziqi Liu","Yue Shen","Lin Ju","Zhiqiang Zhang","Jinjie Gu","Lei Liang","Jun Zhou"],"pdf_url":"https://arxiv.org/pdf/2405.19893v1.pdf","comment":"12 pages"},{"id":"http://arxiv.org/abs/2405.19883v1","updated":"2024-05-30T09:42:54Z","published":"2024-05-30T09:42:54Z","title":"From Words to Actions: Unveiling the Theoretical Underpinnings of\n  LLM-Driven Autonomous Systems","summary":"  In this work, from a theoretical lens, we aim to understand why large\nlanguage model (LLM) empowered agents are able to solve decision-making\nproblems in the physical world. To this end, consider a hierarchical\nreinforcement learning (RL) model where the LLM Planner and the Actor perform\nhigh-level task planning and low-level execution, respectively. Under this\nmodel, the LLM Planner navigates a partially observable Markov decision process\n(POMDP) by iteratively generating language-based subgoals via prompting. Under\nproper assumptions on the pretraining data, we prove that the pretrained LLM\nPlanner effectively performs Bayesian aggregated imitation learning (BAIL)\nthrough in-context learning. Additionally, we highlight the necessity for\nexploration beyond the subgoals derived from BAIL by proving that naively\nexecuting the subgoals returned by LLM leads to a linear regret. As a remedy,\nwe introduce an $\\epsilon$-greedy exploration strategy to BAIL, which is proven\nto incur sublinear regret when the pretraining error is small. Finally, we\nextend our theoretical framework to include scenarios where the LLM Planner\nserves as a world model for inferring the transition model of the environment\nand to multi-agent settings, enabling coordination among multiple Actors.\n","authors":["Jianliang He","Siyu Chen","Fengzhuo Zhang","Zhuoran Yang"],"pdf_url":"https://arxiv.org/pdf/2405.19883v1.pdf","comment":"Accepted by ICML 2024"},{"id":"http://arxiv.org/abs/2405.19653v1","updated":"2024-05-30T03:12:04Z","published":"2024-05-30T03:12:04Z","title":"SysCaps: Language Interfaces for Simulation Surrogates of Complex\n  Systems","summary":"  Data-driven simulation surrogates help computational scientists study complex\nsystems. They can also help inform impactful policy decisions. We introduce a\nlearning framework for surrogate modeling where language is used to interface\nwith the underlying system being simulated. We call a language description of a\nsystem a \"system caption\", or SysCap. To address the lack of datasets of paired\nnatural language SysCaps and simulation runs, we use large language models\n(LLMs) to synthesize high-quality captions. Using our framework, we train\nmultimodal text and timeseries regression models for two real-world simulators\nof complex energy systems. Our experiments demonstrate the feasibility of\ndesigning language interfaces for real-world surrogate models at comparable\naccuracy to standard baselines. We qualitatively and quantitatively show that\nSysCaps unlock text-prompt-style surrogate modeling and new generalization\nabilities beyond what was previously possible. We will release the generated\nSysCaps datasets and our code to support follow-on studies.\n","authors":["Patrick Emami","Zhaonan Li","Saumya Sinha","Truc Nguyen"],"pdf_url":"https://arxiv.org/pdf/2405.19653v1.pdf","comment":"17 pages. Under review"},{"id":"http://arxiv.org/abs/2405.20541v1","updated":"2024-05-30T23:50:20Z","published":"2024-05-30T23:50:20Z","title":"Perplexed by Perplexity: Perplexity-Based Data Pruning With Small\n  Reference Models","summary":"  In this work, we investigate whether small language models can determine\nhigh-quality subsets of large-scale text datasets that improve the performance\nof larger language models. While existing work has shown that pruning based on\nthe perplexity of a larger model can yield high-quality data, we investigate\nwhether smaller models can be used for perplexity-based pruning and how pruning\nis affected by the domain composition of the data being pruned. We demonstrate\nthat for multiple dataset compositions, perplexity-based pruning of pretraining\ndata can \\emph{significantly} improve downstream task performance: pruning\nbased on perplexities computed with a 125 million parameter model improves the\naverage performance on downstream tasks of a 3 billion parameter model by up to\n2.04 and achieves up to a $1.45\\times$ reduction in pretraining steps to reach\ncommensurate baseline performance. Furthermore, we demonstrate that such\nperplexity-based data pruning also yields downstream performance gains in the\nover-trained and data-constrained regimes.\n","authors":["Zachary Ankner","Cody Blakeney","Kartik Sreenivasan","Max Marion","Matthew L. Leavitt","Mansheej Paul"],"pdf_url":"https://arxiv.org/pdf/2405.20541v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.20535v1","updated":"2024-05-30T23:20:25Z","published":"2024-05-30T23:20:25Z","title":"Unveiling the Impact of Coding Data Instruction Fine-Tuning on Large\n  Language Models Reasoning","summary":"  Instruction Fine-Tuning (IFT) significantly enhances the zero-shot\ncapabilities of pretrained Large Language Models (LLMs). While coding data is\nknown to boost reasoning abilities during LLM pretraining, its role in\nactivating internal reasoning capacities during IFT remains understudied. This\npaper investigates a key question: How does coding data impact LLMs' reasoning\ncapacities during the IFT stage? To explore this, we thoroughly examine the\nimpact of coding data across different coding data proportions, model families,\nsizes, and reasoning domains, from various perspectives. Specifically, we\ncreate three IFT datasets with increasing coding data proportions, fine-tune\nsix LLM backbones across different families and scales on these datasets,\nevaluate the tuned models' performance across twelve tasks in three reasoning\ndomains, and analyze the outcomes from three broad-to-granular perspectives:\noverall, domain-level, and task-specific. Our holistic analysis provides\nvaluable insights in each perspective. First, coding data tuning enhances the\noverall reasoning capabilities of LLMs across different model families and\nscales. Moreover, the effect of coding data varies among different domains but\nshows consistent trends across model families and scales within each domain.\nAdditionally, coding data generally yields comparable task-specific benefits\nacross different model families, with the optimal coding data proportions in\nIFT datasets being task-specific.\n","authors":["Xinlu Zhang","Zhiyu Zoey Chen","Xi Ye","Xianjun Yang","Lichang Chen","William Yang Wang","Linda Ruth Petzold"],"pdf_url":"https://arxiv.org/pdf/2405.20535v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.05660v2","updated":"2024-05-30T23:10:00Z","published":"2023-09-11T17:56:57Z","title":"Hypothesis Search: Inductive Reasoning with Language Models","summary":"  Inductive reasoning is a core problem-solving capacity: humans can identify\nunderlying principles from a few examples, which robustly generalize to novel\nscenarios. Recent work evaluates large language models (LLMs) on inductive\nreasoning tasks by directly prompting them yielding \"in context learning.\" This\nworks well for straightforward inductive tasks but performs poorly on complex\ntasks such as the Abstraction and Reasoning Corpus (ARC). In this work, we\npropose to improve the inductive reasoning ability of LLMs by generating\nexplicit hypotheses at multiple levels of abstraction: we prompt the LLM to\npropose multiple abstract hypotheses about the problem, in natural language,\nthen implement the natural language hypotheses as concrete Python programs.\nThese programs can be verified by running on observed examples and generalized\nto novel inputs. To reduce the hypothesis search space, we explore steps to\nfilter the set of hypotheses to implement: we either ask the LLM to summarize\nthem into a smaller set of hypotheses or ask human annotators to select a\nsubset. We verify our pipeline's effectiveness on the ARC visual inductive\nreasoning benchmark, its variant 1D-ARC, string transformation dataset SyGuS,\nand list transformation dataset List Functions. On a random 100-problem subset\nof ARC, our automated pipeline using LLM summaries achieves 30% accuracy,\noutperforming the direct prompting baseline (accuracy of 17%). With the minimal\nhuman input of selecting from LLM-generated candidates, performance is boosted\nto 33%. Our ablations show that both abstract hypothesis generation and\nconcrete program representations benefit LLMs on inductive reasoning tasks.\n","authors":["Ruocheng Wang","Eric Zelikman","Gabriel Poesia","Yewen Pu","Nick Haber","Noah D. Goodman"],"pdf_url":"https://arxiv.org/pdf/2309.05660v2.pdf","comment":"ICLR 2024. The first two authors contributed equally. Code:\n  https://github.com/Relento/hypothesis_search"},{"id":"http://arxiv.org/abs/2405.20529v1","updated":"2024-05-30T23:04:53Z","published":"2024-05-30T23:04:53Z","title":"An Automatic Question Usability Evaluation Toolkit","summary":"  Evaluating multiple-choice questions (MCQs) involves either labor intensive\nhuman assessments or automated methods that prioritize readability, often\noverlooking deeper question design flaws. To address this issue, we introduce\nthe Scalable Automatic Question Usability Evaluation Toolkit (SAQUET), an\nopen-source tool that leverages the Item-Writing Flaws (IWF) rubric for a\ncomprehensive and automated quality evaluation of MCQs. By harnessing the\nlatest in large language models such as GPT-4, advanced word embeddings, and\nTransformers designed to analyze textual complexity, SAQUET effectively\npinpoints and assesses a wide array of flaws in MCQs. We first demonstrate the\ndiscrepancy between commonly used automated evaluation metrics and the human\nassessment of MCQ quality. Then we evaluate SAQUET on a diverse dataset of MCQs\nacross the five domains of Chemistry, Statistics, Computer Science, Humanities,\nand Healthcare, showing how it effectively distinguishes between flawed and\nflawless questions, providing a level of analysis beyond what is achievable\nwith traditional metrics. With an accuracy rate of over 94% in detecting the\npresence of flaws identified by human evaluators, our findings emphasize the\nlimitations of existing evaluation methods and showcase potential in improving\nthe quality of educational assessments.\n","authors":["Steven Moore","Eamon Costello","Huy A. Nguyen","John Stamper"],"pdf_url":"https://arxiv.org/pdf/2405.20529v1.pdf","comment":"Artificial Intelligence in Education 2024"},{"id":"http://arxiv.org/abs/2405.20527v1","updated":"2024-05-30T23:01:10Z","published":"2024-05-30T23:01:10Z","title":"Towards Ontology-Enhanced Representation Learning for Large Language\n  Models","summary":"  Taking advantage of the widespread use of ontologies to organise and\nharmonize knowledge across several distinct domains, this paper proposes a\nnovel approach to improve an embedding-Large Language Model (embedding-LLM) of\ninterest by infusing the knowledge formalized by a reference ontology:\nontological knowledge infusion aims at boosting the ability of the considered\nLLM to effectively model the knowledge domain described by the infused\nontology. The linguistic information (i.e. concept synonyms and descriptions)\nand structural information (i.e. is-a relations) formalized by the ontology are\nutilized to compile a comprehensive set of concept definitions, with the\nassistance of a powerful generative LLM (i.e. GPT-3.5-turbo). These concept\ndefinitions are then employed to fine-tune the target embedding-LLM using a\ncontrastive learning framework. To demonstrate and evaluate the proposed\napproach, we utilize the biomedical disease ontology MONDO. The results show\nthat embedding-LLMs enhanced by ontological disease knowledge exhibit an\nimproved capability to effectively evaluate the similarity of in-domain\nsentences from biomedical documents mentioning diseases, without compromising\ntheir out-of-domain performance.\n","authors":["Francesco Ronzano","Jay Nanavati"],"pdf_url":"https://arxiv.org/pdf/2405.20527v1.pdf","comment":"14 pages, 1 figure"},{"id":"http://arxiv.org/abs/2405.20526v1","updated":"2024-05-30T22:57:49Z","published":"2024-05-30T22:57:49Z","title":"Automated Generation and Tagging of Knowledge Components from\n  Multiple-Choice Questions","summary":"  Knowledge Components (KCs) linked to assessments enhance the measurement of\nstudent learning, enrich analytics, and facilitate adaptivity. However,\ngenerating and linking KCs to assessment items requires significant effort and\ndomain-specific knowledge. To streamline this process for higher-education\ncourses, we employed GPT-4 to generate KCs for multiple-choice questions (MCQs)\nin Chemistry and E-Learning. We analyzed discrepancies between the KCs\ngenerated by the Large Language Model (LLM) and those made by humans through\nevaluation from three domain experts in each subject area. This evaluation\naimed to determine whether, in instances of non-matching KCs, evaluators showed\na preference for the LLM-generated KCs over their human-created counterparts.\nWe also developed an ontology induction algorithm to cluster questions that\nassess similar KCs based on their content. Our most effective LLM strategy\naccurately matched KCs for 56% of Chemistry and 35% of E-Learning MCQs, with\neven higher success when considering the top five KC suggestions. Human\nevaluators favored LLM-generated KCs, choosing them over human-assigned ones\napproximately two-thirds of the time, a preference that was statistically\nsignificant across both domains. Our clustering algorithm successfully grouped\nquestions by their underlying KCs without needing explicit labels or contextual\ninformation. This research advances the automation of KC generation and\nclassification for assessment items, alleviating the need for student data or\npredefined KC labels.\n","authors":["Steven Moore","Robin Schmucker","Tom Mitchell","John Stamper"],"pdf_url":"https://arxiv.org/pdf/2405.20526v1.pdf","comment":"Learning @ Scale 2024"},{"id":"http://arxiv.org/abs/2405.07960v3","updated":"2024-05-30T22:56:17Z","published":"2024-05-13T17:38:53Z","title":"AgentClinic: a multimodal agent benchmark to evaluate AI in simulated\n  clinical environments","summary":"  Diagnosing and managing a patient is a complex, sequential decision making\nprocess that requires physicians to obtain information -- such as which tests\nto perform -- and to act upon it. Recent advances in artificial intelligence\n(AI) and large language models (LLMs) promise to profoundly impact clinical\ncare. However, current evaluation schemes overrely on static medical\nquestion-answering benchmarks, falling short on interactive decision-making\nthat is required in real-life clinical work. Here, we present AgentClinic: a\nmultimodal benchmark to evaluate LLMs in their ability to operate as agents in\nsimulated clinical environments. In our benchmark, the doctor agent must\nuncover the patient's diagnosis through dialogue and active data collection. We\npresent two open medical agent benchmarks: a multimodal image and dialogue\nenvironment, AgentClinic-NEJM, and a dialogue-only environment,\nAgentClinic-MedQA. We embed cognitive and implicit biases both in patient and\ndoctor agents to emulate realistic interactions between biased agents. We find\nthat introducing bias leads to large reductions in diagnostic accuracy of the\ndoctor agents, as well as reduced compliance, confidence, and follow-up\nconsultation willingness in patient agents. Evaluating a suite of\nstate-of-the-art LLMs, we find that several models that excel in benchmarks\nlike MedQA are performing poorly in AgentClinic-MedQA. We find that the LLM\nused in the patient agent is an important factor for performance in the\nAgentClinic benchmark. We show that both having limited interactions as well as\ntoo many interaction reduces diagnostic accuracy in doctor agents. The code and\ndata for this work is publicly available at https://AgentClinic.github.io.\n","authors":["Samuel Schmidgall","Rojin Ziaei","Carl Harris","Eduardo Reis","Jeffrey Jopling","Michael Moor"],"pdf_url":"https://arxiv.org/pdf/2405.07960v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.20512v1","updated":"2024-05-30T22:08:20Z","published":"2024-05-30T22:08:20Z","title":"How Multilingual Are Large Language Models Fine-Tuned for Translation?","summary":"  A new paradigm for machine translation has recently emerged: fine-tuning\nlarge language models (LLM) on parallel text has been shown to outperform\ndedicated translation systems trained in a supervised fashion on much larger\namounts of parallel data (Xu et al., 2024a; Alves et al., 2024). However, it\nremains unclear whether this paradigm can enable massively multilingual machine\ntranslation or whether it requires fine-tuning dedicated models for a small\nnumber of language pairs. How does translation fine-tuning impact the MT\ncapabilities of LLMs for zero-shot languages, zero-shot language pairs, and\ntranslation tasks that do not involve English? To address these questions, we\nconduct an extensive empirical evaluation of the translation quality of the\nTOWER family of language models (Alves et al., 2024) on 132 translation tasks\nfrom the multi-parallel FLORES-200 data. We find that translation fine-tuning\nimproves translation quality even for zero-shot languages on average, but that\nthe impact is uneven depending on the language pairs involved. These results\ncall for further research to effectively enable massively multilingual\ntranslation with LLMs.\n","authors":["Aquia Richburg","Marine Carpuat"],"pdf_url":"https://arxiv.org/pdf/2405.20512v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.20505v1","updated":"2024-05-30T21:51:01Z","published":"2024-05-30T21:51:01Z","title":"SPOT: Text Source Prediction from Originality Score Thresholding","summary":"  The wide acceptance of large language models (LLMs) has unlocked new\napplications and social risks. Popular countermeasures aim at detecting\nmisinformation, usually involve domain specific models trained to recognize the\nrelevance of any information. Instead of evaluating the validity of the\ninformation, we propose to investigate LLM generated text from the perspective\nof trust. In this study, we define trust as the ability to know if an input\ntext was generated by a LLM or a human. To do so, we design SPOT, an efficient\nmethod, that classifies the source of any, standalone, text input based on\noriginality score. This score is derived from the prediction of a given LLM to\ndetect other LLMs. We empirically demonstrate the robustness of the method to\nthe architecture, training data, evaluation data, task and compression of\nmodern LLMs.\n","authors":["Edouard Yvinec","Gabriel Kasser"],"pdf_url":"https://arxiv.org/pdf/2405.20505v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.20495v1","updated":"2024-05-30T21:36:12Z","published":"2024-05-30T21:36:12Z","title":"Transfer Q Star: Principled Decoding for LLM Alignment","summary":"  Aligning foundation models is essential for their safe and trustworthy\ndeployment. However, traditional fine-tuning methods are computationally\nintensive and require updating billions of model parameters. A promising\nalternative, alignment via decoding, adjusts the response distribution directly\nwithout model updates to maximize a target reward $r$, thus providing a\nlightweight and adaptable framework for alignment. However, principled decoding\nmethods rely on oracle access to an optimal Q-function ($Q^*$), which is often\nunavailable in practice. Hence, prior SoTA methods either approximate this\n$Q^*$ using $Q^{\\pi_{\\texttt{sft}}}$ (derived from the reference $\\texttt{SFT}$\nmodel) or rely on short-term rewards, resulting in sub-optimal decoding\nperformance. In this work, we propose Transfer $Q^*$, which implicitly\nestimates the optimal value function for a target reward $r$ through a baseline\nmodel $\\rho_{\\texttt{BL}}$ aligned with a baseline reward $\\rho_{\\texttt{BL}}$\n(which can be different from the target reward $r$). Theoretical analyses of\nTransfer $Q^*$ provide a rigorous characterization of its optimality, deriving\nan upper bound on the sub-optimality gap and identifying a hyperparameter to\ncontrol the deviation from the pre-trained reference $\\texttt{SFT}$ model based\non user needs. Our approach significantly reduces the sub-optimality gap\nobserved in prior SoTA methods and demonstrates superior empirical performance\nacross key metrics such as coherence, diversity, and quality in extensive tests\non several synthetic and real datasets.\n","authors":["Souradip Chakraborty","Soumya Suvra Ghosal","Ming Yin","Dinesh Manocha","Mengdi Wang","Amrit Singh Bedi","Furong Huang"],"pdf_url":"https://arxiv.org/pdf/2405.20495v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.20485v1","updated":"2024-05-30T21:19:24Z","published":"2024-05-30T21:19:24Z","title":"Phantom: General Trigger Attacks on Retrieval Augmented Language\n  Generation","summary":"  Retrieval Augmented Generation (RAG) expands the capabilities of modern large\nlanguage models (LLMs) in chatbot applications, enabling developers to adapt\nand personalize the LLM output without expensive training or fine-tuning. RAG\nsystems use an external knowledge database to retrieve the most relevant\ndocuments for a given query, providing this context to the LLM generator. While\nRAG achieves impressive utility in many applications, its adoption to enable\npersonalized generative models introduces new security risks. In this work, we\npropose new attack surfaces for an adversary to compromise a victim's RAG\nsystem, by injecting a single malicious document in its knowledge database. We\ndesign Phantom, general two-step attack framework against RAG augmented LLMs.\nThe first step involves crafting a poisoned document designed to be retrieved\nby the RAG system within the top-k results only when an adversarial trigger, a\nspecific sequence of words acting as backdoor, is present in the victim's\nqueries. In the second step, a specially crafted adversarial string within the\npoisoned document triggers various adversarial attacks in the LLM generator,\nincluding denial of service, reputation damage, privacy violations, and harmful\nbehaviors. We demonstrate our attacks on multiple LLM architectures, including\nGemma, Vicuna, and Llama.\n","authors":["Harsh Chaudhari","Giorgio Severi","John Abascal","Matthew Jagielski","Christopher A. Choquette-Choo","Milad Nasr","Cristina Nita-Rotaru","Alina Oprea"],"pdf_url":"https://arxiv.org/pdf/2405.20485v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.02691v3","updated":"2024-05-30T21:16:29Z","published":"2023-09-06T03:54:57Z","title":"A Joint Study of Phrase Grounding and Task Performance in Vision and\n  Language Models","summary":"  Key to tasks that require reasoning about natural language in visual contexts\nis grounding words and phrases to image regions. However, observing this\ngrounding in contemporary models is complex, even if it is generally expected\nto take place if the task is addressed in a way that is conductive to\ngeneralization. We propose a framework to jointly study task performance and\nphrase grounding, and propose three benchmarks to study the relation between\nthe two. Our results show that contemporary models demonstrate inconsistency\nbetween their ability to ground phrases and solve tasks. We show how this can\nbe addressed through brute-force training on ground phrasing annotations, and\nanalyze the dynamics it creates. Code and at available at\nhttps://github.com/lil-lab/phrase_grounding.\n","authors":["Noriyuki Kojima","Hadar Averbuch-Elor","Yoav Artzi"],"pdf_url":"https://arxiv.org/pdf/2309.02691v3.pdf","comment":"This was published in TMLR in 2024, on January 24th"},{"id":"http://arxiv.org/abs/2402.03299v4","updated":"2024-05-30T21:14:26Z","published":"2024-02-05T18:54:43Z","title":"GUARD: Role-playing to Generate Natural-language Jailbreakings to Test\n  Guideline Adherence of Large Language Models","summary":"  The discovery of \"jailbreaks\" to bypass safety filters of Large Language\nModels (LLMs) and harmful responses have encouraged the community to implement\nsafety measures. One major safety measure is to proactively test the LLMs with\njailbreaks prior to the release. Therefore, such testing will require a method\nthat can generate jailbreaks massively and efficiently. In this paper, we\nfollow a novel yet intuitive strategy to generate jailbreaks in the style of\nthe human generation. We propose a role-playing system that assigns four\ndifferent roles to the user LLMs to collaborate on new jailbreaks. Furthermore,\nwe collect existing jailbreaks and split them into different independent\ncharacteristics using clustering frequency and semantic patterns sentence by\nsentence. We organize these characteristics into a knowledge graph, making them\nmore accessible and easier to retrieve. Our system of different roles will\nleverage this knowledge graph to generate new jailbreaks, which have proved\neffective in inducing LLMs to generate unethical or guideline-violating\nresponses. In addition, we also pioneer a setting in our system that will\nautomatically follow the government-issued guidelines to generate jailbreaks to\ntest whether LLMs follow the guidelines accordingly. We refer to our system as\nGUARD (Guideline Upholding through Adaptive Role-play Diagnostics). We have\nempirically validated the effectiveness of GUARD on three cutting-edge\nopen-sourced LLMs (Vicuna-13B, LongChat-7B, and Llama-2-7B), as well as a\nwidely-utilized commercial LLM (ChatGPT). Moreover, our work extends to the\nrealm of vision language models (MiniGPT-v2 and Gemini Vision Pro), showcasing\nGUARD's versatility and contributing valuable insights for the development of\nsafer, more reliable LLM-based applications across diverse modalities.\n","authors":["Haibo Jin","Ruoxi Chen","Andy Zhou","Yang Zhang","Haohan Wang"],"pdf_url":"https://arxiv.org/pdf/2402.03299v4.pdf","comment":"28 papges"},{"id":"http://arxiv.org/abs/2405.20477v1","updated":"2024-05-30T20:56:41Z","published":"2024-05-30T20:56:41Z","title":"Automated Focused Feedback Generation for Scientific Writing Assistance","summary":"  Scientific writing is a challenging task, particularly for novice researchers\nwho often rely on feedback from experienced peers. Recent work has primarily\nfocused on improving surface form and style rather than manuscript content. In\nthis paper, we propose a novel task: automated focused feedback generation for\nscientific writing assistance. We present SWIF$^{2}$T: a Scientific WrIting\nFocused Feedback Tool. It is designed to generate specific, actionable and\ncoherent comments, which identify weaknesses in a scientific paper and/or\npropose revisions to it. Our approach consists of four components - planner,\ninvestigator, reviewer and controller - leveraging multiple Large Language\nModels (LLMs) to implement them. We compile a dataset of 300 peer reviews\nciting weaknesses in scientific papers and conduct human evaluation. The\nresults demonstrate the superiority in specificity, reading comprehension, and\noverall helpfulness of SWIF$^{2}$T's feedback compared to other approaches. In\nour analysis, we also identified cases where automatically generated reviews\nwere judged better than human ones, suggesting opportunities for integration of\nAI-generated feedback in scientific writing.\n","authors":["Eric Chamoun","Michael Schlichktrull","Andreas Vlachos"],"pdf_url":"https://arxiv.org/pdf/2405.20477v1.pdf","comment":"Accepted to ACL 2024 (Findings)"},{"id":"http://arxiv.org/abs/2405.20468v1","updated":"2024-05-30T20:34:37Z","published":"2024-05-30T20:34:37Z","title":"Extending the Massive Text Embedding Benchmark to French","summary":"  In recent years, numerous embedding models have been made available and\nwidely used for various NLP tasks. Choosing a model that performs well for\nseveral tasks in English has been largely simplified by the Massive Text\nEmbedding Benchmark (MTEB), but extensions to other languages remain\nchallenging. This is why we expand MTEB to propose the first massive benchmark\nof sentence embeddings for French. Not only we gather 22 existing datasets in\nan easy-to-use interface, but we also create three new French datasets for a\nglobal evaluation over 8 different tasks. We perform a large scale comparison\nwith 46 carefully selected embedding models, conduct comprehensive statistical\ntests, and analyze the correlation between model performance and many of their\ncharacteristics. We find out that even if no model is the best on all tasks,\nlarge multilingual models pre-trained on sentence similarity perform\nparticularly well. Our work comes with open-source code, new datasets and a\npublic leaderboard.\n","authors":["Mathieu Ciancone","Imene Kerboua","Marion Schaeffer","Wissam Siblini"],"pdf_url":"https://arxiv.org/pdf/2405.20468v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.11052v3","updated":"2024-05-30T20:28:08Z","published":"2024-01-19T23:00:31Z","title":"Mining experimental data from Materials Science literature with Large\n  Language Models: an evaluation study","summary":"  This study is dedicated to assessing the capabilities of large language\nmodels (LLMs) such as GPT-3.5-Turbo, GPT-4, and GPT-4-Turbo in extracting\nstructured information from scientific documents in materials science. To this\nend, we primarily focus on two critical tasks of information extraction: (i) a\nnamed entity recognition (NER) of studied materials and physical properties and\n(ii) a relation extraction (RE) between these entities. Due to the evident lack\nof datasets within Materials Informatics (MI), we evaluated using SuperMat,\nbased on superconductor research, and MeasEval, a generic measurement\nevaluation corpus. The performance of LLMs in executing these tasks is\nbenchmarked against traditional models based on the BERT architecture and\nrule-based approaches (baseline). We introduce a novel methodology for the\ncomparative analysis of intricate material expressions, emphasising the\nstandardisation of chemical formulas to tackle the complexities inherent in\nmaterials science information assessment. For NER, LLMs fail to outperform the\nbaseline with zero-shot prompting and exhibit only limited improvement with\nfew-shot prompting. However, a GPT-3.5-Turbo fine-tuned with the appropriate\nstrategy for RE outperforms all models, including the baseline. Without any\nfine-tuning, GPT-4 and GPT-4-Turbo display remarkable reasoning and\nrelationship extraction capabilities after being provided with merely a couple\nof examples, surpassing the baseline. Overall, the results suggest that\nalthough LLMs demonstrate relevant reasoning skills in connecting concepts,\nspecialised models are currently a better choice for tasks requiring extracting\ncomplex domain-specific entities like materials. These insights provide initial\nguidance applicable to other materials science sub-domains in future work.\n","authors":["Luca Foppiano","Guillaume Lambard","Toshiyuki Amagasa","Masashi Ishii"],"pdf_url":"https://arxiv.org/pdf/2401.11052v3.pdf","comment":"40 pages: 5 figures and 1 table in the body. 32 Tables in the\n  Appendix / Supplementary materials"},{"id":"http://arxiv.org/abs/2405.20461v1","updated":"2024-05-30T20:16:27Z","published":"2024-05-30T20:16:27Z","title":"Scalable Detection of Salient Entities in News Articles","summary":"  News articles typically mention numerous entities, a large fraction of which\nare tangential to the story. Detecting the salience of entities in articles is\nthus important to applications such as news search, analysis and summarization.\nIn this work, we explore new approaches for efficient and effective salient\nentity detection by fine-tuning pretrained transformer models with\nclassification heads that use entity tags or contextualized entity\nrepresentations directly. Experiments show that these straightforward\ntechniques dramatically outperform prior work across datasets with varying\nsizes and salience definitions. We also study knowledge distillation techniques\nto effectively reduce the computational cost of these models without affecting\ntheir accuracy. Finally, we conduct extensive analyses and ablation experiments\nto characterize the behavior of the proposed models.\n","authors":["Eliyar Asgarieh","Kapil Thadani","Neil O'Hare"],"pdf_url":"https://arxiv.org/pdf/2405.20461v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.15713v2","updated":"2024-05-30T19:55:58Z","published":"2024-01-28T17:34:42Z","title":"Contrastive Learning and Mixture of Experts Enables Precise Vector\n  Embeddings","summary":"  The advancement of transformer neural networks has significantly elevated the\ncapabilities of sentence similarity models, but they struggle with highly\ndiscriminative tasks and produce sub-optimal representations of important\ndocuments like scientific literature. With the increased reliance on retrieval\naugmentation and search, representing diverse documents as concise and\ndescriptive vectors is crucial. This paper improves upon the vectors embeddings\nof scientific literature by assembling niche datasets using co-citations as a\nsimilarity metric, focusing on biomedical domains. We apply a novel Mixture of\nExperts (MoE) extension pipeline to pretrained BERT models, where every\nmulti-layer perceptron section is enlarged and copied into multiple distinct\nexperts. Our MoE variants perform well over $N$ scientific domains with $N$\ndedicated experts, whereas standard BERT models excel in only one domain.\nNotably, extending just a single transformer block to MoE captures 85% of the\nbenefit seen from full MoE extension at every layer. This holds promise for\nversatile and efficient One-Size-Fits-All transformer networks for numerically\nrepresenting diverse inputs. Our methodology marks significant advancements in\nrepresenting scientific text and holds promise for enhancing vector database\nsearch and compilation.\n","authors":["Logan Hallee","Rohan Kapur","Arjun Patel","Jason P. Gleghorn","Bohdan Khomtchouk"],"pdf_url":"https://arxiv.org/pdf/2401.15713v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.09723v3","updated":"2024-05-30T19:40:21Z","published":"2024-02-15T05:31:13Z","title":"Efficient Prompt Optimization Through the Lens of Best Arm\n  Identification","summary":"  The remarkable instruction-following capability of large language models\n(LLMs) has sparked a growing interest in automatically finding good prompts,\ni.e., prompt optimization. Most existing works follow the scheme of selecting\nfrom a pre-generated pool of candidate prompts. However, these designs mainly\nfocus on the generation strategy, while limited attention has been paid to the\nselection method. Especially, the cost incurred during the selection (e.g.,\naccessing LLM and evaluating the responses) is rarely explicitly considered. To\novercome this limitation, this work provides a principled framework, TRIPLE, to\nefficiently perform prompt selection under an explicit budget constraint.\nTRIPLE is built on a novel connection established between prompt optimization\nand fixed-budget best arm identification (BAI-FB) in multi-armed bandits (MAB);\nthus, it is capable of leveraging the rich toolbox from BAI-FB systematically\nand also incorporating unique characteristics of prompt optimization. Extensive\nexperiments on multiple well-adopted tasks using various LLMs demonstrate the\nremarkable performance improvement of TRIPLE over baselines while satisfying\nthe limited budget constraints. As an extension, variants of TRIPLE are\nproposed to efficiently select examples for few-shot prompts, also achieving\nsuperior empirical performance.\n","authors":["Chengshuai Shi","Kun Yang","Zihan Chen","Jundong Li","Jing Yang","Cong Shen"],"pdf_url":"https://arxiv.org/pdf/2402.09723v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.15793v2","updated":"2024-05-30T19:09:01Z","published":"2024-05-06T17:41:33Z","title":"SWE-agent: Agent-Computer Interfaces Enable Automated Software\n  Engineering","summary":"  Language model (LM) agents are increasingly being used to automate\ncomplicated tasks in digital environments. Just as humans benefit from powerful\nsoftware applications, such as integrated development environments, for complex\ntasks like software engineering, we posit that LM agents represent a new\ncategory of end users with their own needs and abilities, and would benefit\nfrom specially-built interfaces to the software they use. We investigate how\ninterface design affects the performance of language model agents. As a result\nof this exploration, we introduce SWE-agent: a system that facilitates LM\nagents to autonomously use computers to solve software engineering tasks.\nSWE-agent's custom agent-computer interface (ACI) significantly enhances an\nagent's ability to create and edit code files, navigate entire repositories,\nand execute tests and other programs. We evaluate SWE-agent on SWE-bench and\nHumanEvalFix, achieving state-of-the-art performance on both with a pass@1 rate\nof 12.5% and 87.7%, respectively, far exceeding the previous state-of-the-art\nachieved with non-interactive LMs. Finally, we provide insight on how the\ndesign of the ACI can impact agents' behavior and performance.\n","authors":["John Yang","Carlos E. Jimenez","Alexander Wettig","Kilian Lieret","Shunyu Yao","Karthik Narasimhan","Ofir Press"],"pdf_url":"https://arxiv.org/pdf/2405.15793v2.pdf","comment":"Code, data, and demo available at https://swe-agent.com"},{"id":"http://arxiv.org/abs/2405.20419v1","updated":"2024-05-30T18:53:53Z","published":"2024-05-30T18:53:53Z","title":"Enhancing Antibiotic Stewardship using a Natural Language Approach for\n  Better Feature Representation","summary":"  The rapid emergence of antibiotic-resistant bacteria is recognized as a\nglobal healthcare crisis, undermining the efficacy of life-saving antibiotics.\nThis crisis is driven by the improper and overuse of antibiotics, which\nescalates bacterial resistance. In response, this study explores the use of\nclinical decision support systems, enhanced through the integration of\nelectronic health records (EHRs), to improve antibiotic stewardship. However,\nEHR systems present numerous data-level challenges, complicating the effective\nsynthesis and utilization of data. In this work, we transform EHR data into a\nserialized textual representation and employ pretrained foundation models to\ndemonstrate how this enhanced feature representation can aid in antibiotic\nsusceptibility predictions. Our results suggest that this text representation,\ncombined with foundation models, provides a valuable tool to increase\ninterpretability and support antibiotic stewardship efforts.\n","authors":["Simon A. Lee","Trevor Brokowski","Jeffrey N. Chiang"],"pdf_url":"https://arxiv.org/pdf/2405.20419v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.13350v2","updated":"2024-05-30T18:42:45Z","published":"2024-05-22T05:12:35Z","title":"Efficacy of ByT5 in Multilingual Translation of Biblical Texts for\n  Underrepresented Languages","summary":"  This study presents the development and evaluation of a ByT5-based\nmultilingual translation model tailored for translating the Bible into\nunderrepresented languages. Utilizing the comprehensive Johns Hopkins\nUniversity Bible Corpus, we trained the model to capture the intricate nuances\nof character-based and morphologically rich languages. Our results, measured by\nthe BLEU score and supplemented with sample translations, suggest the model can\nimprove accessibility to sacred texts. It effectively handles the distinctive\nbiblical lexicon and structure, thus bridging the linguistic divide. The study\nalso discusses the model's limitations and suggests pathways for future\nenhancements, focusing on expanding access to sacred literature across\nlinguistic boundaries.\n","authors":["Corinne Aars","Lauren Adams","Xiaokan Tian","Zhaoyu Wang","Colton Wismer","Jason Wu","Pablo Rivas","Korn Sooksatra","Matthew Fendt"],"pdf_url":"https://arxiv.org/pdf/2405.13350v2.pdf","comment":"LXAI Workshop at the 2024 Annual Conference of the North American\n  Chapter of the Association for Computational Linguistics (NAACL 2024)"},{"id":"http://arxiv.org/abs/2405.20413v1","updated":"2024-05-30T18:38:36Z","published":"2024-05-30T18:38:36Z","title":"Jailbreaking Large Language Models Against Moderation Guardrails via\n  Cipher Characters","summary":"  Large Language Models (LLMs) are typically harmless but remain vulnerable to\ncarefully crafted prompts known as ``jailbreaks'', which can bypass protective\nmeasures and induce harmful behavior. Recent advancements in LLMs have\nincorporated moderation guardrails that can filter outputs, which trigger\nprocessing errors for certain malicious questions. Existing red-teaming\nbenchmarks often neglect to include questions that trigger moderation\nguardrails, making it difficult to evaluate jailbreak effectiveness. To address\nthis issue, we introduce JAMBench, a harmful behavior benchmark designed to\ntrigger and evaluate moderation guardrails. JAMBench involves 160 manually\ncrafted instructions covering four major risk categories at multiple severity\nlevels. Furthermore, we propose a jailbreak method, JAM (Jailbreak Against\nModeration), designed to attack moderation guardrails using jailbreak prefixes\nto bypass input-level filters and a fine-tuned shadow model functionally\nequivalent to the guardrail model to generate cipher characters to bypass\noutput-level filters. Our extensive experiments on four LLMs demonstrate that\nJAM achieves higher jailbreak success ($\\sim$ $\\times$ 19.88) and lower\nfiltered-out rates ($\\sim$ $\\times$ 1/6) than baselines.\n","authors":["Haibo Jin","Andy Zhou","Joe D. Menke","Haohan Wang"],"pdf_url":"https://arxiv.org/pdf/2405.20413v1.pdf","comment":"20 pages"},{"id":"http://arxiv.org/abs/2405.20410v1","updated":"2024-05-30T18:28:31Z","published":"2024-05-30T18:28:31Z","title":"SeamlessExpressiveLM: Speech Language Model for Expressive\n  Speech-to-Speech Translation with Chain-of-Thought","summary":"  Expressive speech-to-speech translation (S2ST) is a key research topic in\nseamless communication, which focuses on the preservation of semantics and\nspeaker vocal style in translated speech. Early works synthesized speaker style\naligned speech in order to directly learn the mapping from speech to target\nspeech spectrogram. Without reliance on style aligned data, recent studies\nleverage the advances of language modeling (LM) and build cascaded LMs on\nsemantic and acoustic tokens. This work proposes SeamlessExpressiveLM, a single\nspeech language model for expressive S2ST. We decompose the complex\nsource-to-target speech mapping into intermediate generation steps with\nchain-of-thought prompting. The model is first guided to translate target\nsemantic content and then transfer the speaker style to multi-stream acoustic\nunits. Evaluated on Spanish-to-English and Hungarian-to-English translations,\nSeamlessExpressiveLM outperforms cascaded LMs in both semantic quality and\nstyle transfer, meanwhile achieving better parameter efficiency.\n","authors":["Hongyu Gong","Bandhav Veluri"],"pdf_url":"https://arxiv.org/pdf/2405.20410v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.20404v1","updated":"2024-05-30T18:16:41Z","published":"2024-05-30T18:16:41Z","title":"XPrompt:Explaining Large Language Model's Generation via Joint Prompt\n  Attribution","summary":"  Large Language Models (LLMs) have demonstrated impressive performances in\ncomplex text generation tasks. However, the contribution of the input prompt to\nthe generated content still remains obscure to humans, underscoring the\nnecessity of elucidating and explaining the causality between input and output\npairs. Existing works for providing prompt-specific explanation often confine\nmodel output to be classification or next-word prediction. Few initial attempts\naiming to explain the entire language generation often treat input prompt texts\nindependently, ignoring their combinatorial effects on the follow-up\ngeneration. In this study, we introduce a counterfactual explanation framework\nbased on joint prompt attribution, XPrompt, which aims to explain how a few\nprompt texts collaboratively influences the LLM's complete generation.\nParticularly, we formulate the task of prompt attribution for generation\ninterpretation as a combinatorial optimization problem, and introduce a\nprobabilistic algorithm to search for the casual input combination in the\ndiscrete space. We define and utilize multiple metrics to evaluate the produced\nexplanations, demonstrating both faithfulness and efficiency of our framework.\n","authors":["Yurui Chang","Bochuan Cao","Yujia Wang","Jinghui Chen","Lu Lin"],"pdf_url":"https://arxiv.org/pdf/2405.20404v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.20362v1","updated":"2024-05-30T17:56:05Z","published":"2024-05-30T17:56:05Z","title":"Hallucination-Free? Assessing the Reliability of Leading AI Legal\n  Research Tools","summary":"  Legal practice has witnessed a sharp rise in products incorporating\nartificial intelligence (AI). Such tools are designed to assist with a wide\nrange of core legal tasks, from search and summarization of caselaw to document\ndrafting. But the large language models used in these tools are prone to\n\"hallucinate,\" or make up false information, making their use risky in\nhigh-stakes domains. Recently, certain legal research providers have touted\nmethods such as retrieval-augmented generation (RAG) as \"eliminating\"\n(Casetext, 2023) or \"avoid[ing]\" hallucinations (Thomson Reuters, 2023), or\nguaranteeing \"hallucination-free\" legal citations (LexisNexis, 2023). Because\nof the closed nature of these systems, systematically assessing these claims is\nchallenging. In this article, we design and report on the first preregistered\nempirical evaluation of AI-driven legal research tools. We demonstrate that the\nproviders' claims are overstated. While hallucinations are reduced relative to\ngeneral-purpose chatbots (GPT-4), we find that the AI research tools made by\nLexisNexis (Lexis+ AI) and Thomson Reuters (Westlaw AI-Assisted Research and\nAsk Practical Law AI) each hallucinate between 17% and 33% of the time. We also\ndocument substantial differences between systems in responsiveness and\naccuracy. Our article makes four key contributions. It is the first to assess\nand report the performance of RAG-based proprietary legal AI tools. Second, it\nintroduces a comprehensive, preregistered dataset for identifying and\nunderstanding vulnerabilities in these systems. Third, it proposes a clear\ntypology for differentiating between hallucinations and accurate legal\nresponses. Last, it provides evidence to inform the responsibilities of legal\nprofessionals in supervising and verifying AI outputs, which remains a central\nopen question for the responsible integration of AI into law.\n","authors":["Varun Magesh","Faiz Surani","Matthew Dahl","Mirac Suzgun","Christopher D. Manning","Daniel E. Ho"],"pdf_url":"https://arxiv.org/pdf/2405.20362v1.pdf","comment":"Our dataset, tool outputs, and labels will be made available upon\n  publication. This version of the manuscript (May 30, 2024) is updated to\n  reflect an evaluation of Westlaw's AI-Assisted Research"}],"Computer Vision and Pattern Recognition":[{"id":"http://arxiv.org/abs/2405.20343v1","updated":"2024-05-30T17:59:54Z","published":"2024-05-30T17:59:54Z","title":"Unique3D: High-Quality and Efficient 3D Mesh Generation from a Single\n  Image","summary":"  In this work, we introduce Unique3D, a novel image-to-3D framework for\nefficiently generating high-quality 3D meshes from single-view images,\nfeaturing state-of-the-art generation fidelity and strong generalizability.\nPrevious methods based on Score Distillation Sampling (SDS) can produce\ndiversified 3D results by distilling 3D knowledge from large 2D diffusion\nmodels, but they usually suffer from long per-case optimization time with\ninconsistent issues. Recent works address the problem and generate better 3D\nresults either by finetuning a multi-view diffusion model or training a fast\nfeed-forward model. However, they still lack intricate textures and complex\ngeometries due to inconsistency and limited generated resolution. To\nsimultaneously achieve high fidelity, consistency, and efficiency in single\nimage-to-3D, we propose a novel framework Unique3D that includes a multi-view\ndiffusion model with a corresponding normal diffusion model to generate\nmulti-view images with their normal maps, a multi-level upscale process to\nprogressively improve the resolution of generated orthographic multi-views, as\nwell as an instant and consistent mesh reconstruction algorithm called ISOMER,\nwhich fully integrates the color and geometric priors into mesh results.\nExtensive experiments demonstrate that our Unique3D significantly outperforms\nother image-to-3D baselines in terms of geometric and textural details.\n","authors":["Kailu Wu","Fangfu Liu","Zhihan Cai","Runjie Yan","Hanyang Wang","Yating Hu","Yueqi Duan","Kaisheng Ma"],"pdf_url":"https://arxiv.org/pdf/2405.20343v1.pdf","comment":"Project page: https://wukailu.github.io/Unique3D"},{"id":"http://arxiv.org/abs/2405.20340v1","updated":"2024-05-30T17:59:50Z","published":"2024-05-30T17:59:50Z","title":"MotionLLM: Understanding Human Behaviors from Human Motions and Videos","summary":"  This study delves into the realm of multi-modality (i.e., video and motion\nmodalities) human behavior understanding by leveraging the powerful\ncapabilities of Large Language Models (LLMs). Diverging from recent LLMs\ndesigned for video-only or motion-only understanding, we argue that\nunderstanding human behavior necessitates joint modeling from both videos and\nmotion sequences (e.g., SMPL sequences) to capture nuanced body part dynamics\nand semantics effectively. In light of this, we present MotionLLM, a\nstraightforward yet effective framework for human motion understanding,\ncaptioning, and reasoning. Specifically, MotionLLM adopts a unified\nvideo-motion training strategy that leverages the complementary advantages of\nexisting coarse video-text data and fine-grained motion-text data to glean rich\nspatial-temporal insights. Furthermore, we collect a substantial dataset,\nMoVid, comprising diverse videos, motions, captions, and instructions.\nAdditionally, we propose the MoVid-Bench, with carefully manual annotations,\nfor better evaluation of human behavior understanding on video and motion.\nExtensive experiments show the superiority of MotionLLM in the caption,\nspatial-temporal comprehension, and reasoning ability.\n","authors":["Ling-Hao Chen","Shunlin Lu","Ailing Zeng","Hao Zhang","Benyou Wang","Ruimao Zhang","Lei Zhang"],"pdf_url":"https://arxiv.org/pdf/2405.20340v1.pdf","comment":"MotionLLM version 1.0, project page see https://lhchen.top/MotionLLM"},{"id":"http://arxiv.org/abs/2405.20339v1","updated":"2024-05-30T17:59:47Z","published":"2024-05-30T17:59:47Z","title":"Visual Perception by Large Language Model's Weights","summary":"  Existing Multimodal Large Language Models (MLLMs) follow the paradigm that\nperceives visual information by aligning visual features with the input space\nof Large Language Models (LLMs), and concatenating visual tokens with text\ntokens to form a unified sequence input for LLMs. These methods demonstrate\npromising results on various vision-language tasks but are limited by the high\ncomputational effort due to the extended input sequence resulting from the\ninvolvement of visual tokens. In this paper, instead of input space alignment,\nwe propose a novel parameter space alignment paradigm that represents visual\ninformation as model weights. For each input image, we use a vision encoder to\nextract visual features, convert features into perceptual weights, and merge\nthe perceptual weights with LLM's weights. In this way, the input of LLM does\nnot require visual tokens, which reduces the length of the input sequence and\ngreatly improves efficiency. Following this paradigm, we propose VLoRA with the\nperceptual weights generator. The perceptual weights generator is designed to\nconvert visual features to perceptual weights with low-rank property,\nexhibiting a form similar to LoRA. The experimental results show that our VLoRA\nachieves comparable performance on various benchmarks for MLLMs, while\nsignificantly reducing the computational costs for both training and inference.\nThe code and models will be made open-source.\n","authors":["Feipeng Ma","Hongwei Xue","Guangting Wang","Yizhou Zhou","Fengyun Rao","Shilin Yan","Yueyi Zhang","Siying Wu","Mike Zheng Shou","Xiaoyan Sun"],"pdf_url":"https://arxiv.org/pdf/2405.20339v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.20337v1","updated":"2024-05-30T17:59:42Z","published":"2024-05-30T17:59:42Z","title":"OccSora: 4D Occupancy Generation Models as World Simulators for\n  Autonomous Driving","summary":"  Understanding the evolution of 3D scenes is important for effective\nautonomous driving. While conventional methods mode scene development with the\nmotion of individual instances, world models emerge as a generative framework\nto describe the general scene dynamics. However, most existing methods adopt an\nautoregressive framework to perform next-token prediction, which suffer from\ninefficiency in modeling long-term temporal evolutions. To address this, we\npropose a diffusion-based 4D occupancy generation model, OccSora, to simulate\nthe development of the 3D world for autonomous driving. We employ a 4D scene\ntokenizer to obtain compact discrete spatial-temporal representations for 4D\noccupancy input and achieve high-quality reconstruction for long-sequence\noccupancy videos. We then learn a diffusion transformer on the spatial-temporal\nrepresentations and generate 4D occupancy conditioned on a trajectory prompt.\nWe conduct extensive experiments on the widely used nuScenes dataset with Occ3D\noccupancy annotations. OccSora can generate 16s-videos with authentic 3D layout\nand temporal consistency, demonstrating its ability to understand the spatial\nand temporal distributions of driving scenes. With trajectory-aware 4D\ngeneration, OccSora has the potential to serve as a world simulator for the\ndecision-making of autonomous driving. Code is available at:\nhttps://github.com/wzzheng/OccSora.\n","authors":["Lening Wang","Wenzhao Zheng","Yilong Ren","Han Jiang","Zhiyong Cui","Haiyang Yu","Jiwen Lu"],"pdf_url":"https://arxiv.org/pdf/2405.20337v1.pdf","comment":"Code is available at: https://github.com/wzzheng/OccSora"},{"id":"http://arxiv.org/abs/2405.20336v1","updated":"2024-05-30T17:59:39Z","published":"2024-05-30T17:59:39Z","title":"RapVerse: Coherent Vocals and Whole-Body Motions Generations from Text","summary":"  In this work, we introduce a challenging task for simultaneously generating\n3D holistic body motions and singing vocals directly from textual lyrics\ninputs, advancing beyond existing works that typically address these two\nmodalities in isolation. To facilitate this, we first collect the RapVerse\ndataset, a large dataset containing synchronous rapping vocals, lyrics, and\nhigh-quality 3D holistic body meshes. With the RapVerse dataset, we investigate\nthe extent to which scaling autoregressive multimodal transformers across\nlanguage, audio, and motion can enhance the coherent and realistic generation\nof vocals and whole-body human motions. For modality unification, a\nvector-quantized variational autoencoder is employed to encode whole-body\nmotion sequences into discrete motion tokens, while a vocal-to-unit model is\nleveraged to obtain quantized audio tokens preserving content, prosodic\ninformation, and singer identity. By jointly performing transformer modeling on\nthese three modalities in a unified way, our framework ensures a seamless and\nrealistic blend of vocals and human motions. Extensive experiments demonstrate\nthat our unified generation framework not only produces coherent and realistic\nsinging vocals alongside human motions directly from textual inputs but also\nrivals the performance of specialized single-modality generation systems,\nestablishing new benchmarks for joint vocal-motion generation. The project page\nis available for research purposes at https://vis-www.cs.umass.edu/RapVerse.\n","authors":["Jiaben Chen","Xin Yan","Yihang Chen","Siyuan Cen","Qinwei Ma","Haoyu Zhen","Kaizhi Qian","Lie Lu","Chuang Gan"],"pdf_url":"https://arxiv.org/pdf/2405.20336v1.pdf","comment":"Project website: https://vis-www.cs.umass.edu/RapVerse"},{"id":"http://arxiv.org/abs/2405.20334v1","updated":"2024-05-30T17:59:24Z","published":"2024-05-30T17:59:24Z","title":"VividDream: Generating 3D Scene with Ambient Dynamics","summary":"  We introduce VividDream, a method for generating explorable 4D scenes with\nambient dynamics from a single input image or text prompt. VividDream first\nexpands an input image into a static 3D point cloud through iterative\ninpainting and geometry merging. An ensemble of animated videos is then\ngenerated using video diffusion models with quality refinement techniques and\nconditioned on renderings of the static 3D scene from the sampled camera\ntrajectories. We then optimize a canonical 4D scene representation using an\nanimated video ensemble, with per-video motion embeddings and visibility masks\nto mitigate inconsistencies. The resulting 4D scene enables free-view\nexploration of a 3D scene with plausible ambient scene dynamics. Experiments\ndemonstrate that VividDream can provide human viewers with compelling 4D\nexperiences generated based on diverse real images and text prompts.\n","authors":["Yao-Chih Lee","Yi-Ting Chen","Andrew Wang","Ting-Hsuan Liao","Brandon Y. Feng","Jia-Bin Huang"],"pdf_url":"https://arxiv.org/pdf/2405.20334v1.pdf","comment":"Project page: https://vivid-dream-4d.github.io"},{"id":"http://arxiv.org/abs/2405.20333v1","updated":"2024-05-30T17:59:10Z","published":"2024-05-30T17:59:10Z","title":"SurgiTrack: Fine-Grained Multi-Class Multi-Tool Tracking in Surgical\n  Videos","summary":"  Accurate tool tracking is essential for the success of computer-assisted\nintervention. Previous efforts often modeled tool trajectories rigidly,\noverlooking the dynamic nature of surgical procedures, especially tracking\nscenarios like out-of-body and out-of-camera views. Addressing this limitation,\nthe new CholecTrack20 dataset provides detailed labels that account for\nmultiple tool trajectories in three perspectives: (1) intraoperative, (2)\nintracorporeal, and (3) visibility, representing the different types of\ntemporal duration of tool tracks. These fine-grained labels enhance tracking\nflexibility but also increase the task complexity. Re-identifying tools after\nocclusion or re-insertion into the body remains challenging due to high visual\nsimilarity, especially among tools of the same category. This work recognizes\nthe critical role of the tool operators in distinguishing tool track instances,\nespecially those belonging to the same tool category. The operators'\ninformation are however not explicitly captured in surgical videos. We\ntherefore propose SurgiTrack, a novel deep learning method that leverages\nYOLOv7 for precise tool detection and employs an attention mechanism to model\nthe originating direction of the tools, as a proxy to their operators, for tool\nre-identification. To handle diverse tool trajectory perspectives, SurgiTrack\nemploys a harmonizing bipartite matching graph, minimizing conflicts and\nensuring accurate tool identity association. Experimental results on\nCholecTrack20 demonstrate SurgiTrack's effectiveness, outperforming baselines\nand state-of-the-art methods with real-time inference capability. This work\nsets a new standard in surgical tool tracking, providing dynamic trajectories\nfor more adaptable and precise assistance in minimally invasive surgeries.\n","authors":["Chinedu Innocent Nwoye","Nicolas Padoy"],"pdf_url":"https://arxiv.org/pdf/2405.20333v1.pdf","comment":"15 pages, 7 figures, 9 tables, 1 video. Supplementary video available\n  at: https://vimeo.com/951853260"},{"id":"http://arxiv.org/abs/2405.20330v1","updated":"2024-05-30T17:59:02Z","published":"2024-05-30T17:59:02Z","title":"4DHands: Reconstructing Interactive Hands in 4D with Transformers","summary":"  In this paper, we introduce 4DHands, a robust approach to recovering\ninteractive hand meshes and their relative movement from monocular inputs. Our\napproach addresses two major limitations of previous methods: lacking a unified\nsolution for handling various hand image inputs and neglecting the positional\nrelationship of two hands within images. To overcome these challenges, we\ndevelop a transformer-based architecture with novel tokenization and feature\nfusion strategies. Specifically, we propose a Relation-aware Two-Hand\nTokenization (RAT) method to embed positional relation information into the\nhand tokens. In this way, our network can handle both single-hand and two-hand\ninputs and explicitly leverage relative hand positions, facilitating the\nreconstruction of intricate hand interactions in real-world scenarios. As such\ntokenization indicates the relative relationship of two hands, it also supports\nmore effective feature fusion. To this end, we further develop a\nSpatio-temporal Interaction Reasoning (SIR) module to fuse hand tokens in 4D\nwith attention and decode them into 3D hand meshes and relative temporal\nmovements. The efficacy of our approach is validated on several benchmark\ndatasets. The results on in-the-wild videos and real-world scenarios\ndemonstrate the superior performances of our approach for interactive hand\nreconstruction. More video results can be found on the project page:\nhttps://4dhands.github.io.\n","authors":["Dixuan Lin","Yuxiang Zhang","Mengcheng Li","Yebin Liu","Wei Jing","Qi Yan","Qianying Wang","Hongwen Zhang"],"pdf_url":"https://arxiv.org/pdf/2405.20330v1.pdf","comment":"More demo videos can be seen at our project page:\n  https://4dhands.github.io"},{"id":"http://arxiv.org/abs/2405.20327v1","updated":"2024-05-30T17:58:00Z","published":"2024-05-30T17:58:00Z","title":"GECO: Generative Image-to-3D within a SECOnd","summary":"  3D generation has seen remarkable progress in recent years. Existing\ntechniques, such as score distillation methods, produce notable results but\nrequire extensive per-scene optimization, impacting time efficiency.\nAlternatively, reconstruction-based approaches prioritize efficiency but\ncompromise quality due to their limited handling of uncertainty. We introduce\nGECO, a novel method for high-quality 3D generative modeling that operates\nwithin a second. Our approach addresses the prevalent issues of uncertainty and\ninefficiency in current methods through a two-stage approach. In the initial\nstage, we train a single-step multi-view generative model with score\ndistillation. Then, a second-stage distillation is applied to address the\nchallenge of view inconsistency from the multi-view prediction. This two-stage\nprocess ensures a balanced approach to 3D generation, optimizing both quality\nand efficiency. Our comprehensive experiments demonstrate that GECO achieves\nhigh-quality image-to-3D generation with an unprecedented level of efficiency.\n","authors":["Chen Wang","Jiatao Gu","Xiaoxiao Long","Yuan Liu","Lingjie Liu"],"pdf_url":"https://arxiv.org/pdf/2405.20327v1.pdf","comment":"Project Page: https://cwchenwang.github.io/geco"},{"id":"http://arxiv.org/abs/2405.20325v1","updated":"2024-05-30T17:57:30Z","published":"2024-05-30T17:57:30Z","title":"MotionFollower: Editing Video Motion via Lightweight Score-Guided\n  Diffusion","summary":"  Despite impressive advancements in diffusion-based video editing models in\naltering video attributes, there has been limited exploration into modifying\nmotion information while preserving the original protagonist's appearance and\nbackground. In this paper, we propose MotionFollower, a lightweight\nscore-guided diffusion model for video motion editing. To introduce conditional\ncontrols to the denoising process, MotionFollower leverages two of our proposed\nlightweight signal controllers, one for poses and the other for appearances,\nboth of which consist of convolution blocks without involving heavy attention\ncalculations. Further, we design a score guidance principle based on a\ntwo-branch architecture, including the reconstruction and editing branches,\nwhich significantly enhance the modeling capability of texture details and\ncomplicated backgrounds. Concretely, we enforce several consistency\nregularizers and losses during the score estimation. The resulting gradients\nthus inject appropriate guidance to the intermediate latents, forcing the model\nto preserve the original background details and protagonists' appearances\nwithout interfering with the motion modification. Experiments demonstrate the\ncompetitive motion editing ability of MotionFollower qualitatively and\nquantitatively. Compared with MotionEditor, the most advanced motion editing\nmodel, MotionFollower achieves an approximately 80% reduction in GPU memory\nwhile delivering superior motion editing performance and exclusively supporting\nlarge camera movements and actions.\n","authors":["Shuyuan Tu","Qi Dai","Zihao Zhang","Sicheng Xie","Zhi-Qi Cheng","Chong Luo","Xintong Han","Zuxuan Wu","Yu-Gang Jiang"],"pdf_url":"https://arxiv.org/pdf/2405.20325v1.pdf","comment":"23 pages, 18 figures. Project page at\n  https://francis-rings.github.io/MotionFollower/"},{"id":"http://arxiv.org/abs/2405.20324v1","updated":"2024-05-30T17:57:26Z","published":"2024-05-30T17:57:26Z","title":"Don't drop your samples! Coherence-aware training benefits Conditional\n  diffusion","summary":"  Conditional diffusion models are powerful generative models that can leverage\nvarious types of conditional information, such as class labels, segmentation\nmasks, or text captions. However, in many real-world scenarios, conditional\ninformation may be noisy or unreliable due to human annotation errors or weak\nalignment. In this paper, we propose the Coherence-Aware Diffusion (CAD), a\nnovel method that integrates coherence in conditional information into\ndiffusion models, allowing them to learn from noisy annotations without\ndiscarding data. We assume that each data point has an associated coherence\nscore that reflects the quality of the conditional information. We then\ncondition the diffusion model on both the conditional information and the\ncoherence score. In this way, the model learns to ignore or discount the\nconditioning when the coherence is low. We show that CAD is theoretically sound\nand empirically effective on various conditional generation tasks. Moreover, we\nshow that leveraging coherence generates realistic and diverse samples that\nrespect conditional information better than models trained on cleaned datasets\nwhere samples with low coherence have been discarded.\n","authors":["Nicolas Dufour","Victor Besnier","Vicky Kalogeiton","David Picard"],"pdf_url":"https://arxiv.org/pdf/2405.20324v1.pdf","comment":"Accepted at CVPR 2024 as a Highlight. Project page:\n  https://nicolas-dufour.github.io/cad.html"},{"id":"http://arxiv.org/abs/2405.20323v1","updated":"2024-05-30T17:57:08Z","published":"2024-05-30T17:57:08Z","title":"$\\textit{S}^3$Gaussian: Self-Supervised Street Gaussians for Autonomous\n  Driving","summary":"  Photorealistic 3D reconstruction of street scenes is a critical technique for\ndeveloping real-world simulators for autonomous driving. Despite the efficacy\nof Neural Radiance Fields (NeRF) for driving scenes, 3D Gaussian Splatting\n(3DGS) emerges as a promising direction due to its faster speed and more\nexplicit representation. However, most existing street 3DGS methods require\ntracked 3D vehicle bounding boxes to decompose the static and dynamic elements\nfor effective reconstruction, limiting their applications for in-the-wild\nscenarios. To facilitate efficient 3D scene reconstruction without costly\nannotations, we propose a self-supervised street Gaussian\n($\\textit{S}^3$Gaussian) method to decompose dynamic and static elements from\n4D consistency. We represent each scene with 3D Gaussians to preserve the\nexplicitness and further accompany them with a spatial-temporal field network\nto compactly model the 4D dynamics. We conduct extensive experiments on the\nchallenging Waymo-Open dataset to evaluate the effectiveness of our method. Our\n$\\textit{S}^3$Gaussian demonstrates the ability to decompose static and dynamic\nscenes and achieves the best performance without using 3D annotations. Code is\navailable at: https://github.com/nnanhuang/S3Gaussian/.\n","authors":["Nan Huang","Xiaobao Wei","Wenzhao Zheng","Pengju An","Ming Lu","Wei Zhan","Masayoshi Tomizuka","Kurt Keutzer","Shanghang Zhang"],"pdf_url":"https://arxiv.org/pdf/2405.20323v1.pdf","comment":"Code is available at: https://github.com/nnanhuang/S3Gaussian/"},{"id":"http://arxiv.org/abs/2405.20321v1","updated":"2024-05-30T17:56:54Z","published":"2024-05-30T17:56:54Z","title":"Vision-based Manipulation from Single Human Video with Open-World Object\n  Graphs","summary":"  We present an object-centric approach to empower robots to learn vision-based\nmanipulation skills from human videos. We investigate the problem of imitating\nrobot manipulation from a single human video in the open-world setting, where a\nrobot must learn to manipulate novel objects from one video demonstration. We\nintroduce ORION, an algorithm that tackles the problem by extracting an\nobject-centric manipulation plan from a single RGB-D video and deriving a\npolicy that conditions on the extracted plan. Our method enables the robot to\nlearn from videos captured by daily mobile devices such as an iPad and\ngeneralize the policies to deployment environments with varying visual\nbackgrounds, camera angles, spatial layouts, and novel object instances. We\nsystematically evaluate our method on both short-horizon and long-horizon\ntasks, demonstrating the efficacy of ORION in learning from a single human\nvideo in the open world. Videos can be found in the project website\nhttps://ut-austin-rpl.github.io/ORION-release.\n","authors":["Yifeng Zhu","Arisrei Lim","Peter Stone","Yuke Zhu"],"pdf_url":"https://arxiv.org/pdf/2405.20321v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.20320v1","updated":"2024-05-30T17:56:04Z","published":"2024-05-30T17:56:04Z","title":"Improving the Training of Rectified Flows","summary":"  Diffusion models have shown great promise for image and video generation, but\nsampling from state-of-the-art models requires expensive numerical integration\nof a generative ODE. One approach for tackling this problem is rectified flows,\nwhich iteratively learn smooth ODE paths that are less susceptible to\ntruncation error. However, rectified flows still require a relatively large\nnumber of function evaluations (NFEs). In this work, we propose improved\ntechniques for training rectified flows, allowing them to compete with\nknowledge distillation methods even in the low NFE setting. Our main insight is\nthat under realistic settings, a single iteration of the Reflow algorithm for\ntraining rectified flows is sufficient to learn nearly straight trajectories;\nhence, the current practice of using multiple Reflow iterations is unnecessary.\nWe thus propose techniques to improve one-round training of rectified flows,\nincluding a U-shaped timestep distribution and LPIPS-Huber premetric. With\nthese techniques, we improve the FID of the previous 2-rectified flow by up to\n72% in the 1 NFE setting on CIFAR-10. On ImageNet 64$\\times$64, our improved\nrectified flow outperforms the state-of-the-art distillation methods such as\nconsistency distillation and progressive distillation in both one-step and\ntwo-step settings and rivals the performance of improved consistency training\n(iCT) in FID. Code is available at https://github.com/sangyun884/rfpp.\n","authors":["Sangyun Lee","Zinan Lin","Giulia Fanti"],"pdf_url":"https://arxiv.org/pdf/2405.20320v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.20319v1","updated":"2024-05-30T17:55:46Z","published":"2024-05-30T17:55:46Z","title":"ParSEL: Parameterized Shape Editing with Language","summary":"  The ability to edit 3D assets from natural language presents a compelling\nparadigm to aid in the democratization of 3D content creation. However, while\nnatural language is often effective at communicating general intent, it is\npoorly suited for specifying precise manipulation. To address this gap, we\nintroduce ParSEL, a system that enables controllable editing of high-quality 3D\nassets from natural language. Given a segmented 3D mesh and an editing request,\nParSEL produces a parameterized editing program. Adjusting the program\nparameters allows users to explore shape variations with a precise control over\nthe magnitudes of edits. To infer editing programs which align with an input\nedit request, we leverage the abilities of large-language models (LLMs).\nHowever, while we find that LLMs excel at identifying initial edit operations,\nthey often fail to infer complete editing programs, and produce outputs that\nviolate shape semantics. To overcome this issue, we introduce Analytical Edit\nPropagation (AEP), an algorithm which extends a seed edit with additional\noperations until a complete editing program has been formed. Unlike prior\nmethods, AEP searches for analytical editing operations compatible with a range\nof possible user edits through the integration of computer algebra systems for\ngeometric analysis. Experimentally we demonstrate ParSEL's effectiveness in\nenabling controllable editing of 3D objects through natural language requests\nover alternative system designs.\n","authors":["Aditya Ganeshan","Ryan Y. Huang","Xianghao Xu","R. Kenny Jones","Daniel Ritchie"],"pdf_url":"https://arxiv.org/pdf/2405.20319v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.20310v1","updated":"2024-05-30T17:52:52Z","published":"2024-05-30T17:52:52Z","title":"A Pixel Is Worth More Than One 3D Gaussians in Single-View 3D\n  Reconstruction","summary":"  Learning 3D scene representation from a single-view image is a long-standing\nfundamental problem in computer vision, with the inherent ambiguity in\npredicting contents unseen from the input view. Built on the recently proposed\n3D Gaussian Splatting (3DGS), the Splatter Image method has made promising\nprogress on fast single-image novel view synthesis via learning a single 3D\nGaussian for each pixel based on the U-Net feature map of an input image.\nHowever, it has limited expressive power to represent occluded components that\nare not observable in the input view. To address this problem, this paper\npresents a Hierarchical Splatter Image method in which a pixel is worth more\nthan one 3D Gaussians. Specifically,\n  each pixel is represented by a parent 3D Gaussian and a small number of child\n3D Gaussians. Parent 3D Gaussians are learned as done in the vanilla Splatter\nImage. Child 3D Gaussians are learned via a lightweight Multi-Layer Perceptron\n(MLP) which takes as input the projected image features of a parent 3D Gaussian\nand the embedding of a target camera view. Both parent and child 3D Gaussians\nare learned end-to-end in a stage-wise way. The joint condition of input image\nfeatures from eyes of the parent Gaussians and the target camera position\nfacilitates learning to allocate child Gaussians to ``see the unseen'',\nrecovering the occluded details that are often missed by parent Gaussians.\n  In experiments, the proposed method is tested on the ShapeNet-SRN and CO3D\ndatasets with state-of-the-art performance obtained, especially showing\npromising capabilities of reconstructing occluded contents in the input view.\n","authors":["Jianghao Shen","Tianfu Wu"],"pdf_url":"https://arxiv.org/pdf/2405.20310v1.pdf","comment":"preprint, under review"},{"id":"http://arxiv.org/abs/2209.15210v5","updated":"2024-05-30T17:51:36Z","published":"2022-09-30T03:40:10Z","title":"Multi-Prompt Alignment for Multi-Source Unsupervised Domain Adaptation","summary":"  Most existing methods for unsupervised domain adaptation (UDA) rely on a\nshared network to extract domain-invariant features. However, when facing\nmultiple source domains, optimizing such a network involves updating the\nparameters of the entire network, making it both computationally expensive and\nchallenging, particularly when coupled with min-max objectives. Inspired by\nrecent advances in prompt learning that adapts high-capacity models for\ndownstream tasks in a computationally economic way, we introduce Multi-Prompt\nAlignment (MPA), a simple yet efficient framework for multi-source UDA. Given a\nsource and target domain pair, MPA first trains an individual prompt to\nminimize the domain gap through a contrastive loss. Then, MPA denoises the\nlearned prompts through an auto-encoding process and aligns them by maximizing\nthe agreement of all the reconstructed prompts. Moreover, we show that the\nresulting subspace acquired from the auto-encoding process can easily\ngeneralize to a streamlined set of target domains, making our method more\nefficient for practical usage. Extensive experiments show that MPA achieves\nstate-of-the-art results on three popular datasets with an impressive average\naccuracy of 54.1% on DomainNet.\n","authors":["Haoran Chen","Xintong Han","Zuxuan Wu","Yu-Gang Jiang"],"pdf_url":"https://arxiv.org/pdf/2209.15210v5.pdf","comment":"NeurIPS 2023 camera-ready version"},{"id":"http://arxiv.org/abs/2405.20305v1","updated":"2024-05-30T17:50:08Z","published":"2024-05-30T17:50:08Z","title":"Can't make an Omelette without Breaking some Eggs: Plausible Action\n  Anticipation using Large Video-Language Models","summary":"  We introduce PlausiVL, a large video-language model for anticipating action\nsequences that are plausible in the real-world. While significant efforts have\nbeen made towards anticipating future actions, prior approaches do not take\ninto account the aspect of plausibility in an action sequence. To address this\nlimitation, we explore the generative capability of a large video-language\nmodel in our work and further, develop the understanding of plausibility in an\naction sequence by introducing two objective functions, a counterfactual-based\nplausible action sequence learning loss and a long-horizon action repetition\nloss. We utilize temporal logical constraints as well as verb-noun action pair\nlogical constraints to create implausible/counterfactual action sequences and\nuse them to train the model with plausible action sequence learning loss. This\nloss helps the model to differentiate between plausible and not plausible\naction sequences and also helps the model to learn implicit temporal cues\ncrucial for the task of action anticipation. The long-horizon action repetition\nloss puts a higher penalty on the actions that are more prone to repetition\nover a longer temporal window. With this penalization, the model is able to\ngenerate diverse, plausible action sequences. We evaluate our approach on two\nlarge-scale datasets, Ego4D and EPIC-Kitchens-100, and show improvements on the\ntask of action anticipation.\n","authors":["Himangi Mittal","Nakul Agarwal","Shao-Yuan Lo","Kwonjoon Lee"],"pdf_url":"https://arxiv.org/pdf/2405.20305v1.pdf","comment":"CVPR 2024"},{"id":"http://arxiv.org/abs/2405.20299v1","updated":"2024-05-30T17:46:23Z","published":"2024-05-30T17:46:23Z","title":"Scaling White-Box Transformers for Vision","summary":"  CRATE, a white-box transformer architecture designed to learn compressed and\nsparse representations, offers an intriguing alternative to standard vision\ntransformers (ViTs) due to its inherent mathematical interpretability. Despite\nextensive investigations into the scaling behaviors of language and vision\ntransformers, the scalability of CRATE remains an open question which this\npaper aims to address. Specifically, we propose CRATE-$\\alpha$, featuring\nstrategic yet minimal modifications to the sparse coding block in the CRATE\narchitecture design, and a light training recipe designed to improve the\nscalability of CRATE. Through extensive experiments, we demonstrate that\nCRATE-$\\alpha$ can effectively scale with larger model sizes and datasets. For\nexample, our CRATE-$\\alpha$-B substantially outperforms the prior best CRATE-B\nmodel accuracy on ImageNet classification by 3.7%, achieving an accuracy of\n83.2%. Meanwhile, when scaling further, our CRATE-$\\alpha$-L obtains an\nImageNet classification accuracy of 85.1%. More notably, these model\nperformance improvements are achieved while preserving, and potentially even\nenhancing the interpretability of learned CRATE models, as we demonstrate\nthrough showing that the learned token representations of increasingly larger\ntrained CRATE-$\\alpha$ models yield increasingly higher-quality unsupervised\nobject segmentation of images. The project page is\nhttps://rayjryang.github.io/CRATE-alpha/.\n","authors":["Jinrui Yang","Xianhang Li","Druv Pai","Yuyin Zhou","Yi Ma","Yaodong Yu","Cihang Xie"],"pdf_url":"https://arxiv.org/pdf/2405.20299v1.pdf","comment":"project page: https://rayjryang.github.io/CRATE-alpha/"},{"id":"http://arxiv.org/abs/2403.01643v2","updated":"2024-05-30T17:46:22Z","published":"2024-03-03T23:40:35Z","title":"You Need to Pay Better Attention: Rethinking the Mathematics of\n  Attention Mechanism","summary":"  Scaled Dot Product Attention (SDPA) is the backbone of many modern\ndeep-learning models. It is so versatile that it has been used in natural\nlanguage, vision, and multi-modal domains with very little change compared to\nits original formulation. This paper discusses why the current formulation is\ninefficient by delving into the mathematical details of the attention\nmechanism. We propose three improvements to mitigate these inefficiencies,\nthereby, introducing three enhanced attention mechanisms: Optimised, Efficient,\nand Super Attention. Optimised and Efficient Attention have one and two matrix\nmultiplications fewer per head, respectively, and 25% and 50% fewer parameters,\nrespectively, than standard SDPA, but perform similarly to standard SDPA in\nboth vision and natural language tasks. They can be used in all applications\nwhere SDPA is used while offering smaller model sizes and faster training and\ninference without noticeable loss in performance. Super Attention introduces a\nnew linear transformation on the values, transforming them from the left. It\noutperforms standard SPDA on vision and natural language tasks by up to 17%\nwhile having one fewer matrix multiplication per head and 25% fewer parameters\nthan standard SDPA. Consequently, it is also faster than standard SDPA. Super\nAttention is ideal in applications where the attention layer's context length\nis fixed, such as Vision Transformers. In addition to providing mathematical\nreasoning, we evaluate the presented attention mechanisms on several datasets\nincluding MNIST, CIFAR100, ImageNet, IMDB Movie Reviews, and Amazon Reviews\ndatasets, as well as combined Europarl and Anki English-Spanish datasets for\nneural machine translation.\n","authors":["Mehran Hosseini","Peyman Hosseini"],"pdf_url":"https://arxiv.org/pdf/2403.01643v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.20291v1","updated":"2024-05-30T17:41:32Z","published":"2024-05-30T17:41:32Z","title":"Unveiling and Mitigating Backdoor Vulnerabilities based on Unlearning\n  Weight Changes and Backdoor Activeness","summary":"  The security threat of backdoor attacks is a central concern for deep neural\nnetworks (DNNs). Recently, without poisoned data, unlearning models with clean\ndata and then learning a pruning mask have contributed to backdoor defense.\nAdditionally, vanilla fine-tuning with those clean data can help recover the\nlost clean accuracy. However, the behavior of clean unlearning is still\nunder-explored, and vanilla fine-tuning unintentionally induces back the\nbackdoor effect. In this work, we first investigate model unlearning from the\nperspective of weight changes and gradient norms, and find two interesting\nobservations in the backdoored model: 1) the weight changes between poison and\nclean unlearning are positively correlated, making it possible for us to\nidentify the backdoored-related neurons without using poisoned data; 2) the\nneurons of the backdoored model are more active (i.e., larger changes in\ngradient norm) than those in the clean model, suggesting the need to suppress\nthe gradient norm during fine-tuning. Then, we propose an effective two-stage\ndefense method. In the first stage, an efficient Neuron Weight Change\n(NWC)-based Backdoor Reinitialization is proposed based on observation 1). In\nthe second stage, based on observation 2), we design an Activeness-Aware\nFine-Tuning to replace the vanilla fine-tuning. Extensive experiments,\ninvolving eight backdoor attacks on three benchmark datasets, demonstrate the\nsuperior performance of our proposed method compared to recent state-of-the-art\nbackdoor defense approaches.\n","authors":["Weilin Lin","Li Liu","Shaokui Wei","Jianze Li","Hui Xiong"],"pdf_url":"https://arxiv.org/pdf/2405.20291v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.20283v1","updated":"2024-05-30T17:35:49Z","published":"2024-05-30T17:35:49Z","title":"TetSphere Splatting: Representing High-Quality Geometry with Lagrangian\n  Volumetric Meshes","summary":"  We present TetSphere splatting, an explicit, Lagrangian representation for\nreconstructing 3D shapes with high-quality geometry. In contrast to\nconventional object reconstruction methods which predominantly use Eulerian\nrepresentations, including both neural implicit (e.g., NeRF, NeuS) and explicit\nrepresentations (e.g., DMTet), and often struggle with high computational\ndemands and suboptimal mesh quality, TetSphere splatting utilizes an underused\nbut highly effective geometric primitive -- tetrahedral meshes. This approach\ndirectly yields superior mesh quality without relying on neural networks or\npost-processing. It deforms multiple initial tetrahedral spheres to accurately\nreconstruct the 3D shape through a combination of differentiable rendering and\ngeometric energy optimization, resulting in significant computational\nefficiency. Serving as a robust and versatile geometry representation,\nTet-Sphere splatting seamlessly integrates into diverse applications, including\nsingle-view 3D reconstruction, image-/text-to-3D content generation.\nExperimental results demonstrate that TetSphere splatting outperforms existing\nrepresentations, delivering faster optimization speed, enhanced mesh quality,\nand reliable preservation of thin structures.\n","authors":["Minghao Guo","Bohan Wang","Kaiming He","Wojciech Matusik"],"pdf_url":"https://arxiv.org/pdf/2405.20283v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.20282v1","updated":"2024-05-30T17:34:40Z","published":"2024-05-30T17:34:40Z","title":"SemFlow: Binding Semantic Segmentation and Image Synthesis via Rectified\n  Flow","summary":"  Semantic segmentation and semantic image synthesis are two representative\ntasks in visual perception and generation. While existing methods consider them\nas two distinct tasks, we propose a unified diffusion-based framework (SemFlow)\nand model them as a pair of reverse problems. Specifically, motivated by\nrectified flow theory, we train an ordinary differential equation (ODE) model\nto transport between the distributions of real images and semantic masks. As\nthe training object is symmetric, samples belonging to the two distributions,\nimages and semantic masks, can be effortlessly transferred reversibly. For\nsemantic segmentation, our approach solves the contradiction between the\nrandomness of diffusion outputs and the uniqueness of segmentation results. For\nimage synthesis, we propose a finite perturbation approach to enhance the\ndiversity of generated results without changing the semantic categories.\nExperiments show that our SemFlow achieves competitive results on semantic\nsegmentation and semantic image synthesis tasks. We hope this simple framework\nwill motivate people to rethink the unification of low-level and high-level\nvision. Project page: https://github.com/wang-chaoyang/SemFlow.\n","authors":["Chaoyang Wang","Xiangtai Li","Lu Qi","Henghui Ding","Yunhai Tong","Ming-Hsuan Yang"],"pdf_url":"https://arxiv.org/pdf/2405.20282v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.20279v1","updated":"2024-05-30T17:33:10Z","published":"2024-05-30T17:33:10Z","title":"CV-VAE: A Compatible Video VAE for Latent Generative Video Models","summary":"  Spatio-temporal compression of videos, utilizing networks such as Variational\nAutoencoders (VAE), plays a crucial role in OpenAI's SORA and numerous other\nvideo generative models. For instance, many LLM-like video models learn the\ndistribution of discrete tokens derived from 3D VAEs within the VQVAE\nframework, while most diffusion-based video models capture the distribution of\ncontinuous latent extracted by 2D VAEs without quantization. The temporal\ncompression is simply realized by uniform frame sampling which results in\nunsmooth motion between consecutive frames. Currently, there lacks of a\ncommonly used continuous video (3D) VAE for latent diffusion-based video models\nin the research community. Moreover, since current diffusion-based approaches\nare often implemented using pre-trained text-to-image (T2I) models, directly\ntraining a video VAE without considering the compatibility with existing T2I\nmodels will result in a latent space gap between them, which will take huge\ncomputational resources for training to bridge the gap even with the T2I models\nas initialization. To address this issue, we propose a method for training a\nvideo VAE of latent video models, namely CV-VAE, whose latent space is\ncompatible with that of a given image VAE, e.g., image VAE of Stable Diffusion\n(SD). The compatibility is achieved by the proposed novel latent space\nregularization, which involves formulating a regularization loss using the\nimage VAE. Benefiting from the latent space compatibility, video models can be\ntrained seamlessly from pre-trained T2I or video models in a truly\nspatio-temporally compressed latent space, rather than simply sampling video\nframes at equal intervals. With our CV-VAE, existing video models can generate\nfour times more frames with minimal finetuning. Extensive experiments are\nconducted to demonstrate the effectiveness of the proposed video VAE.\n","authors":["Sijie Zhao","Yong Zhang","Xiaodong Cun","Shaoshu Yang","Muyao Niu","Xiaoyu Li","Wenbo Hu","Ying Shan"],"pdf_url":"https://arxiv.org/pdf/2405.20279v1.pdf","comment":"Project Page: https://ailab-cvc.github.io/cvvae/index.html"},{"id":"http://arxiv.org/abs/2405.20271v1","updated":"2024-05-30T17:26:02Z","published":"2024-05-30T17:26:02Z","title":"ETHER: Efficient Finetuning of Large-Scale Models with Hyperplane\n  Reflections","summary":"  Parameter-efficient finetuning (PEFT) has become ubiquitous to adapt\nfoundation models to downstream task requirements while retaining their\ngeneralization ability. However, the amount of additionally introduced\nparameters and compute for successful adaptation and hyperparameter searches\ncan explode quickly, especially when deployed at scale to serve numerous\nindividual requests. To ensure effective, parameter-efficient, and\nhyperparameter-robust adaptation, we propose the ETHER transformation family,\nwhich performs Efficient fineTuning via HypErplane Reflections. By design,\nETHER transformations require a minimal number of parameters, are less likely\nto deteriorate model performance, and exhibit robustness to hyperparameter and\nlearning rate choices. In particular, we introduce ETHER and its relaxation\nETHER+, which match or outperform existing PEFT methods with significantly\nfewer parameters ($\\sim$$10$-$100$ times lower than LoRA or OFT) across\nmultiple image synthesis and natural language tasks without exhaustive\nhyperparameter tuning. Finally, we investigate the recent emphasis on\nHyperspherical Energy retention for adaptation and raise questions on its\npractical utility. The code is available at https://github.com/mwbini/ether.\n","authors":["Massimo Bini","Karsten Roth","Zeynep Akata","Anna Khoreva"],"pdf_url":"https://arxiv.org/pdf/2405.20271v1.pdf","comment":"Accepted to ICML 2024. Code available at\n  https://github.com/mwbini/ether"},{"id":"http://arxiv.org/abs/2405.20259v1","updated":"2024-05-30T17:09:05Z","published":"2024-05-30T17:09:05Z","title":"FaceMixup: Enhancing Facial Expression Recognition through Mixed Face\n  Regularization","summary":"  The proliferation of deep learning solutions and the scarcity of large\nannotated datasets pose significant challenges in real-world applications.\nVarious strategies have been explored to overcome this challenge, with data\naugmentation (DA) approaches emerging as prominent solutions. DA approaches\ninvolve generating additional examples by transforming existing labeled data,\nthereby enriching the dataset and helping deep learning models achieve improved\ngeneralization without succumbing to overfitting. In real applications, where\nsolutions based on deep learning are widely used, there is facial expression\nrecognition (FER), which plays an essential role in human communication,\nimproving a range of knowledge areas (e.g., medicine, security, and marketing).\nIn this paper, we propose a simple and comprehensive face data augmentation\napproach based on mixed face component regularization that outperforms the\nclassical DA approaches from the literature, including the MixAugment which is\na specific approach for the target task in two well-known FER datasets existing\nin the literature.\n","authors":["Fabio A. Faria","Mateus M. Souza","Raoni F. da S. Teixeira","Mauricio P. Segundo"],"pdf_url":"https://arxiv.org/pdf/2405.20259v1.pdf","comment":"29 pages, 9 figures, paper is under review on journal"},{"id":"http://arxiv.org/abs/2405.20247v1","updated":"2024-05-30T16:58:34Z","published":"2024-05-30T16:58:34Z","title":"KerasCV and KerasNLP: Vision and Language Power-Ups","summary":"  We present the Keras domain packages KerasCV and KerasNLP, extensions of the\nKeras API for Computer Vision and Natural Language Processing workflows,\ncapable of running on either JAX, TensorFlow, or PyTorch. These domain packages\nare designed to enable fast experimentation, with a focus on ease-of-use and\nperformance. We adopt a modular, layered design: at the library's lowest level\nof abstraction, we provide building blocks for creating models and data\npreprocessing pipelines, and at the library's highest level of abstraction, we\nprovide pretrained ``task\" models for popular architectures such as Stable\nDiffusion, YOLOv8, GPT2, BERT, Mistral, CLIP, Gemma, T5, etc. Task models have\nbuilt-in preprocessing, pretrained weights, and can be fine-tuned on raw\ninputs. To enable efficient training, we support XLA compilation for all\nmodels, and run all preprocessing via a compiled graph of TensorFlow operations\nusing the tf.data API. The libraries are fully open-source (Apache 2.0 license)\nand available on GitHub.\n","authors":["Matthew Watson","Divyashree Shivakumar Sreepathihalli","Francois Chollet","Martin Gorner","Kiranbir Sodhia","Ramesh Sampath","Tirth Patel","Haifeng Jin","Neel Kovelamudi","Gabriel Rasskin","Samaneh Saadat","Luke Wood","Chen Qian","Jonathan Bischof","Ian Stenbit"],"pdf_url":"https://arxiv.org/pdf/2405.20247v1.pdf","comment":"Submitted to Journal of Machine Learning Open Source Software"},{"id":"http://arxiv.org/abs/2405.16470v2","updated":"2024-05-30T16:57:57Z","published":"2024-05-26T07:45:12Z","title":"Image Deraining with Frequency-Enhanced State Space Model","summary":"  Removing rain artifacts in images is recognized as a significant issue. In\nthis field, deep learning-based approaches, such as convolutional neural\nnetworks (CNNs) and Transformers, have succeeded. Recently, State Space Models\n(SSMs) have exhibited superior performance across various tasks in both natural\nlanguage processing and image processing due to their ability to model\nlong-range dependencies. This study introduces SSM to rain removal and proposes\na Deraining Frequency-Enhanced State Space Model (DFSSM). To effectively remove\nrain streaks, which produce high-intensity frequency components in specific\ndirections, we employ frequency domain processing concurrently with SSM.\nAdditionally, we develop a novel mixed-scale gated-convolutional block, which\nuses convolutions with multiple kernel sizes to capture various scale\ndegradations effectively and integrates a gating mechanism to manage the flow\nof information. Finally, experiments on synthetic and real-world rainy image\ndatasets show that our method surpasses state-of-the-art methods.\n","authors":["Shugo Yamashita","Masaaki Ikehara"],"pdf_url":"https://arxiv.org/pdf/2405.16470v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.20222v1","updated":"2024-05-30T16:22:22Z","published":"2024-05-30T16:22:22Z","title":"MOFA-Video: Controllable Image Animation via Generative Motion Field\n  Adaptions in Frozen Image-to-Video Diffusion Model","summary":"  We present MOFA-Video, an advanced controllable image animation method that\ngenerates video from the given image using various additional controllable\nsignals (such as human landmarks reference, manual trajectories, and another\neven provided video) or their combinations. This is different from previous\nmethods which only can work on a specific motion domain or show weak control\nabilities with diffusion prior. To achieve our goal, we design several\ndomain-aware motion field adapters (\\ie, MOFA-Adapters) to control the\ngenerated motions in the video generation pipeline. For MOFA-Adapters, we\nconsider the temporal motion consistency of the video and generate the dense\nmotion flow from the given sparse control conditions first, and then, the\nmulti-scale features of the given image are wrapped as a guided feature for\nstable video diffusion generation. We naively train two motion adapters for the\nmanual trajectories and the human landmarks individually since they both\ncontain sparse information about the control. After training, the MOFA-Adapters\nin different domains can also work together for more controllable video\ngeneration.\n","authors":["Muyao Niu","Xiaodong Cun","Xintao Wang","Yong Zhang","Ying Shan","Yinqiang Zheng"],"pdf_url":"https://arxiv.org/pdf/2405.20222v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.20216v1","updated":"2024-05-30T16:18:05Z","published":"2024-05-30T16:18:05Z","title":"Boost Your Own Human Image Generation Model via Direct Preference\n  Optimization with AI Feedback","summary":"  The generation of high-quality human images through text-to-image (T2I)\nmethods is a significant yet challenging task. Distinct from general image\ngeneration, human image synthesis must satisfy stringent criteria related to\nhuman pose, anatomy, and alignment with textual prompts, making it particularly\ndifficult to achieve realistic results. Recent advancements in T2I generation\nbased on diffusion models have shown promise, yet challenges remain in meeting\nhuman-specific preferences. In this paper, we introduce a novel approach\ntailored specifically for human image generation utilizing Direct Preference\nOptimization (DPO). Specifically, we introduce an efficient method for\nconstructing a specialized DPO dataset for training human image generation\nmodels without the need for costly human feedback. We also propose a modified\nloss function that enhances the DPO training process by minimizing artifacts\nand improving image fidelity. Our method demonstrates its versatility and\neffectiveness in generating human images, including personalized text-to-image\ngeneration. Through comprehensive evaluations, we show that our approach\nsignificantly advances the state of human image generation, achieving superior\nresults in terms of natural anatomies, poses, and text-image alignment.\n","authors":["Sanghyeon Na","Yonggyu Kim","Hyunjoon Lee"],"pdf_url":"https://arxiv.org/pdf/2405.20216v1.pdf","comment":"28 pages, 18 figures"},{"id":"http://arxiv.org/abs/2405.20204v1","updated":"2024-05-30T16:07:54Z","published":"2024-05-30T16:07:54Z","title":"Jina CLIP: Your CLIP Model Is Also Your Text Retriever","summary":"  Contrastive Language-Image Pretraining (CLIP) is widely used to train models\nto align images and texts in a common embedding space by mapping them to\nfixed-sized vectors. These models are key to multimodal information retrieval\nand related tasks. However, CLIP models generally underperform in text-only\ntasks compared to specialized text models. This creates inefficiencies for\ninformation retrieval systems that keep separate embeddings and models for\ntext-only and multimodal tasks. We propose a novel, multi-task contrastive\ntraining method to address this issue, which we use to train the jina-clip-v1\nmodel to achieve the state-of-the-art performance on both text-image and\ntext-text retrieval tasks.\n","authors":["Andreas Koukounas","Georgios Mastrapas","Michael Günther","Bo Wang","Scott Martens","Isabelle Mohr","Saba Sturua","Mohammad Kalim Akram","Joan Fontanals Martínez","Saahil Ognawala","Susana Guzman","Maximilian Werk","Nan Wang","Han Xiao"],"pdf_url":"https://arxiv.org/pdf/2405.20204v1.pdf","comment":"4 pages, ICML2024 workshop submission"},{"id":"http://arxiv.org/abs/2405.20188v1","updated":"2024-05-30T15:55:04Z","published":"2024-05-30T15:55:04Z","title":"SPARE: Symmetrized Point-to-Plane Distance for Robust Non-Rigid\n  Registration","summary":"  Existing optimization-based methods for non-rigid registration typically\nminimize an alignment error metric based on the point-to-point or\npoint-to-plane distance between corresponding point pairs on the source surface\nand target surface. However, these metrics can result in slow convergence or a\nloss of detail. In this paper, we propose SPARE, a novel formulation that\nutilizes a symmetrized point-to-plane distance for robust non-rigid\nregistration. The symmetrized point-to-plane distance relies on both the\npositions and normals of the corresponding points, resulting in a more accurate\napproximation of the underlying geometry and can achieve higher accuracy than\nexisting methods. To solve this optimization problem efficiently, we propose an\nalternating minimization solver using a majorization-minimization strategy.\nMoreover, for effective initialization of the solver, we incorporate a\ndeformation graph-based coarse alignment that improves registration quality and\nefficiency. Extensive experiments show that the proposed method greatly\nimproves the accuracy of non-rigid registration problems and maintains\nrelatively high solution efficiency. The code is publicly available at\nhttps://github.com/yaoyx689/spare.\n","authors":["Yuxin Yao","Bailin Deng","Junhui Hou","Juyong Zhang"],"pdf_url":"https://arxiv.org/pdf/2405.20188v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.20180v1","updated":"2024-05-30T15:48:04Z","published":"2024-05-30T15:48:04Z","title":"Transformers and Slot Encoding for Sample Efficient Physical World\n  Modelling","summary":"  World modelling, i.e. building a representation of the rules that govern the\nworld so as to predict its evolution, is an essential ability for any agent\ninteracting with the physical world. Recent applications of the Transformer\narchitecture to the problem of world modelling from video input show notable\nimprovements in sample efficiency. However, existing approaches tend to work\nonly at the image level thus disregarding that the environment is composed of\nobjects interacting with each other. In this paper, we propose an architecture\ncombining Transformers for world modelling with the slot-attention paradigm, an\napproach for learning representations of objects appearing in a scene. We\ndescribe the resulting neural architecture and report experimental results\nshowing an improvement over the existing solutions in terms of sample\nefficiency and a reduction of the variation of the performance over the\ntraining examples. The code for our architecture and experiments is available\nat https://github.com/torchipeppo/transformers-and-slot-encoding-for-wm\n","authors":["Francesco Petri","Luigi Asprino","Aldo Gangemi"],"pdf_url":"https://arxiv.org/pdf/2405.20180v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.20161v1","updated":"2024-05-30T15:33:32Z","published":"2024-05-30T15:33:32Z","title":"Landslide mapping from Sentinel-2 imagery through change detection","summary":"  Landslides are one of the most critical and destructive geohazards.\nWidespread development of human activities and settlements combined with the\neffects of climate change on weather are resulting in a high increase in the\nfrequency and destructive power of landslides, making them a major threat to\nhuman life and the economy. In this paper, we explore methodologies to map\nnewly-occurred landslides using Sentinel-2 imagery automatically. All\napproaches presented are framed as a bi-temporal change detection problem,\nrequiring only a pair of Sentinel-2 images, taken respectively before and after\na landslide-triggering event. Furthermore, we introduce a novel deep learning\narchitecture for fusing Sentinel-2 bi-temporal image pairs with Digital\nElevation Model (DEM) data, showcasing its promising performances w.r.t. other\nchange detection models in the literature. As a parallel task, we address\nlimitations in existing datasets by creating a novel geodatabase, which\nincludes manually validated open-access landslide inventories over\nheterogeneous ecoregions of the world. We release both code and dataset with an\nopen-source license.\n","authors":["Tommaso Monopoli","Fabio Montello","Claudio Rossi"],"pdf_url":"https://arxiv.org/pdf/2405.20161v1.pdf","comment":"to be published in IEEE IGARSS 2024 conference proceedings"},{"id":"http://arxiv.org/abs/2405.20155v1","updated":"2024-05-30T15:30:38Z","published":"2024-05-30T15:30:38Z","title":"MotionDreamer: Zero-Shot 3D Mesh Animation from Video Diffusion Models","summary":"  Animation techniques bring digital 3D worlds and characters to life. However,\nmanual animation is tedious and automated techniques are often specialized to\nnarrow shape classes. In our work, we propose a technique for automatic\nre-animation of arbitrary 3D shapes based on a motion prior extracted from a\nvideo diffusion model. Unlike existing 4D generation methods, we focus solely\non the motion, and we leverage an explicit mesh-based representation compatible\nwith existing computer-graphics pipelines. Furthermore, our utilization of\ndiffusion features enhances accuracy of our motion fitting. We analyze efficacy\nof these features for animation fitting and we experimentally validate our\napproach for two different diffusion models and four animation models. Finally,\nwe demonstrate that our time-efficient zero-shot method achieves a superior\nperformance re-animating a diverse set of 3D shapes when compared to existing\ntechniques in a user study. The project website is located at\nhttps://lukas.uzolas.com/MotionDreamer.\n","authors":["Lukas Uzolas","Elmar Eisemann","Petr Kellnhofer"],"pdf_url":"https://arxiv.org/pdf/2405.20155v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.20152v1","updated":"2024-05-30T15:27:56Z","published":"2024-05-30T15:27:56Z","title":"Uncovering Bias in Large Vision-Language Models at Scale with\n  Counterfactuals","summary":"  With the advent of Large Language Models (LLMs) possessing increasingly\nimpressive capabilities, a number of Large Vision-Language Models (LVLMs) have\nbeen proposed to augment LLMs with visual inputs. Such models condition\ngenerated text on both an input image and a text prompt, enabling a variety of\nuse cases such as visual question answering and multimodal chat. While prior\nstudies have examined the social biases contained in text generated by LLMs,\nthis topic has been relatively unexplored in LVLMs. Examining social biases in\nLVLMs is particularly challenging due to the confounding contributions of bias\ninduced by information contained across the text and visual modalities. To\naddress this challenging problem, we conduct a large-scale study of text\ngenerated by different LVLMs under counterfactual changes to input images.\nSpecifically, we present LVLMs with identical open-ended text prompts while\nconditioning on images from different counterfactual sets, where each set\ncontains images which are largely identical in their depiction of a common\nsubject (e.g., a doctor), but vary only in terms of intersectional social\nattributes (e.g., race and gender). We comprehensively evaluate the text\nproduced by different models under this counterfactual generation setting at\nscale, producing over 57 million responses from popular LVLMs. Our\nmulti-dimensional analysis reveals that social attributes such as race, gender,\nand physical characteristics depicted in input images can significantly\ninfluence the generation of toxic content, competency-associated words, harmful\nstereotypes, and numerical ratings of depicted individuals. We additionally\nexplore the relationship between social bias in LVLMs and their corresponding\nLLMs, as well as inference-time strategies to mitigate bias.\n","authors":["Phillip Howard","Kathleen C. Fraser","Anahita Bhiwandiwalla","Svetlana Kiritchenko"],"pdf_url":"https://arxiv.org/pdf/2405.20152v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.20141v1","updated":"2024-05-30T15:16:06Z","published":"2024-05-30T15:16:06Z","title":"OpenDAS: Domain Adaptation for Open-Vocabulary Segmentation","summary":"  The advent of Vision Language Models (VLMs) transformed image understanding\nfrom closed-set classifications to dynamic image-language interactions,\nenabling open-vocabulary segmentation. Despite this flexibility, VLMs often\nfall behind closed-set classifiers in accuracy due to their reliance on\nambiguous image captions and lack of domain-specific knowledge. We, therefore,\nintroduce a new task domain adaptation for open-vocabulary segmentation,\nenhancing VLMs with domain-specific priors while preserving their\nopen-vocabulary nature. Existing adaptation methods, when applied to\nsegmentation tasks, improve performance on training queries but can reduce VLM\nperformance on zero-shot text inputs. To address this shortcoming, we propose\nan approach that combines parameter-efficient prompt tuning with a\ntriplet-loss-based training strategy. This strategy is designed to enhance\nopen-vocabulary generalization while adapting to the visual domain. Our results\noutperform other parameter-efficient adaptation strategies in open-vocabulary\nsegment classification tasks across indoor and outdoor datasets. Notably, our\napproach is the only one that consistently surpasses the original VLM on\nzero-shot queries. Our adapted VLMs can be plug-and-play integrated into\nexisting open-vocabulary segmentation pipelines, improving OV-Seg by +6.0% mIoU\non ADE20K, and OpenMask3D by +4.1% AP on ScanNet++ Offices without any changes\nto the methods.\n","authors":["Gonca Yilmaz","Songyou Peng","Francis Engelmann","Marc Pollefeys","Hermann Blum"],"pdf_url":"https://arxiv.org/pdf/2405.20141v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.20136v1","updated":"2024-05-30T15:12:18Z","published":"2024-05-30T15:12:18Z","title":"A Multimodal Dangerous State Recognition and Early Warning System for\n  Elderly with Intermittent Dementia","summary":"  In response to the social issue of the increasing number of elderly\nvulnerable groups going missing due to the aggravating aging population in\nChina, our team has developed a wearable anti-loss device and intelligent early\nwarning system for elderly individuals with intermittent dementia using\nartificial intelligence and IoT technology. This system comprises an anti-loss\nsmart helmet, a cloud computing module, and an intelligent early warning\napplication on the caregiver's mobile device. The smart helmet integrates a\nminiature camera module, a GPS module, and a 5G communication module to collect\nfirst-person images and location information of the elderly. Data is\ntransmitted remotely via 5G, FTP, and TCP protocols. In the cloud computing\nmodule, our team has proposed for the first time a multimodal dangerous state\nrecognition network based on scene and location information to accurately\nassess the risk of elderly individuals going missing. Finally, the application\nsoftware interface designed for the caregiver's mobile device implements\nmulti-level early warnings. The system developed by our team requires no\noperation or response from the elderly, achieving fully automatic environmental\nperception, risk assessment, and proactive alarming. This overcomes the\nlimitations of traditional monitoring devices, which require active operation\nand response, thus avoiding the issue of the digital divide for the elderly. It\neffectively prevents accidental loss and potential dangers for elderly\nindividuals with dementia.\n","authors":["Liyun Deng","Lei Jin","Guangcheng Wang","Quan Shi","Han Wang"],"pdf_url":"https://arxiv.org/pdf/2405.20136v1.pdf","comment":"13 pages,9 figures"},{"id":"http://arxiv.org/abs/2401.17981v2","updated":"2024-05-30T15:09:49Z","published":"2024-01-31T16:38:32Z","title":"Enhancing Multimodal Large Language Models with Vision Detection Models:\n  An Empirical Study","summary":"  Despite the impressive capabilities of Multimodal Large Language Models\n(MLLMs) in integrating text and image modalities, challenges remain in\naccurately interpreting detailed visual elements. This paper presents an\nempirical study on enhancing MLLMs with state-of-the-art (SOTA) object\ndetection and Optical Character Recognition (OCR) models to improve\nfine-grained understanding and reduce hallucination in responses. We\ninvestigate the embedding-based infusion of textual detection information, the\nimpact of such infusion on MLLMs' original abilities, and the\ninterchangeability of detection models. We conduct systematic and extensive\nexperiments with representative models such as LLaVA-1.5, DINO, PaddleOCRv2,\nand Grounding DINO, revealing that our simple yet general approach not only\nrefines MLLMs' performance in fine-grained visual tasks but also maintains\ntheir original strengths. Notably, the enhanced LLaVA-1.5 outperforms its\noriginal 7B/13B models on all 10 benchmarks, achieving an improvement of up to\n12.5% on the normalized average score. We release our codes to facilitate\nfurther exploration into the fine-grained multimodal capabilities of MLLMs.\n","authors":["Qirui Jiao","Daoyuan Chen","Yilun Huang","Yaliang Li","Ying Shen"],"pdf_url":"https://arxiv.org/pdf/2401.17981v2.pdf","comment":"25 pages, 18 tables, 7 figures"},{"id":"http://arxiv.org/abs/2405.20126v1","updated":"2024-05-30T15:07:30Z","published":"2024-05-30T15:07:30Z","title":"Federated and Transfer Learning for Cancer Detection Based on Image\n  Analysis","summary":"  This review article discusses the roles of federated learning (FL) and\ntransfer learning (TL) in cancer detection based on image analysis. These two\nstrategies powered by machine learning have drawn a lot of attention due to\ntheir potential to increase the precision and effectiveness of cancer diagnosis\nin light of the growing importance of machine learning techniques in cancer\ndetection. FL enables the training of machine learning models on data\ndistributed across multiple sites without the need for centralized data\nsharing, while TL allows for the transfer of knowledge from one task to\nanother. A comprehensive assessment of the two methods, including their\nstrengths, and weaknesses is presented. Moving on, their applications in cancer\ndetection are discussed, including potential directions for the future.\nFinally, this article offers a thorough description of the functions of TL and\nFL in image-based cancer detection. The authors also make insightful\nsuggestions for additional study in this rapidly developing area.\n","authors":["Amine Bechar","Youssef Elmir","Yassine Himeur","Rafik Medjoudj","Abbes Amira"],"pdf_url":"https://arxiv.org/pdf/2405.20126v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.05168v2","updated":"2024-05-30T14:55:45Z","published":"2024-01-10T14:03:05Z","title":"CLIP-Guided Source-Free Object Detection in Aerial Images","summary":"  Domain adaptation is crucial in aerial imagery, as the visual representation\nof these images can significantly vary based on factors such as geographic\nlocation, time, and weather conditions. Additionally, high-resolution aerial\nimages often require substantial storage space and may not be readily\naccessible to the public. To address these challenges, we propose a novel\nSource-Free Object Detection (SFOD) method. Specifically, our approach begins\nwith a self-training framework, which significantly enhances the performance of\nbaseline methods. To alleviate the noisy labels in self-training, we utilize\nContrastive Language-Image Pre-training (CLIP) to guide the generation of\npseudo-labels, termed CLIP-guided Aggregation (CGA). By leveraging CLIP's\nzero-shot classification capability, we aggregate its scores with the original\npredicted bounding boxes, enabling us to obtain refined scores for the\npseudo-labels. To validate the effectiveness of our method, we constructed two\nnew datasets from different domains based on the DIOR dataset, named DIOR-C and\nDIOR-Cloudy. Experimental results demonstrate that our method outperforms other\ncomparative algorithms. The code is available at\nhttps://github.com/Lans1ng/SFOD-RS.\n","authors":["Nanqing Liu","Xun Xu","Yongyi Su","Chengxin Liu","Peiliang Gong","Heng-Chao Li"],"pdf_url":"https://arxiv.org/pdf/2401.05168v2.pdf","comment":"Accepted by IGARSS2024"},{"id":"http://arxiv.org/abs/2405.20117v1","updated":"2024-05-30T14:54:26Z","published":"2024-05-30T14:54:26Z","title":"Infinite 3D Landmarks: Improving Continuous 2D Facial Landmark Detection","summary":"  In this paper, we examine 3 important issues in the practical use of\nstate-of-the-art facial landmark detectors and show how a combination of\nspecific architectural modifications can directly improve their accuracy and\ntemporal stability. First, many facial landmark detectors require face\nnormalization as a preprocessing step, which is accomplished by a\nseparately-trained neural network that crops and resizes the face in the input\nimage. There is no guarantee that this pre-trained network performs the optimal\nface normalization for landmark detection. We instead analyze the use of a\nspatial transformer network that is trained alongside the landmark detector in\nan unsupervised manner, and jointly learn optimal face normalization and\nlandmark detection. Second, we show that modifying the output head of the\nlandmark predictor to infer landmarks in a canonical 3D space can further\nimprove accuracy. To convert the predicted 3D landmarks into screen-space, we\nadditionally predict the camera intrinsics and head pose from the input image.\nAs a side benefit, this allows to predict the 3D face shape from a given image\nonly using 2D landmarks as supervision, which is useful in determining landmark\nvisibility among other things. Finally, when training a landmark detector on\nmultiple datasets at the same time, annotation inconsistencies across datasets\nforces the network to produce a suboptimal average. We propose to add a\nsemantic correction network to address this issue. This additional lightweight\nneural network is trained alongside the landmark detector, without requiring\nany additional supervision. While the insights of this paper can be applied to\nmost common landmark detectors, we specifically target a recently-proposed\ncontinuous 2D landmark detector to demonstrate how each of our additions leads\nto meaningful improvements over the state-of-the-art on standard benchmarks.\n","authors":["Prashanth Chandran","Gaspard Zoss","Paulo Gotardo","Derek Bradley"],"pdf_url":"https://arxiv.org/pdf/2405.20117v1.pdf","comment":"12 pages, 13 figures"},{"id":"http://arxiv.org/abs/2405.20112v1","updated":"2024-05-30T14:49:54Z","published":"2024-05-30T14:49:54Z","title":"RIGID: A Training-free and Model-Agnostic Framework for Robust\n  AI-Generated Image Detection","summary":"  The rapid advances in generative AI models have empowered the creation of\nhighly realistic images with arbitrary content, raising concerns about\npotential misuse and harm, such as Deepfakes. Current research focuses on\ntraining detectors using large datasets of generated images. However, these\ntraining-based solutions are often computationally expensive and show limited\ngeneralization to unseen generated images. In this paper, we propose a\ntraining-free method to distinguish between real and AI-generated images. We\nfirst observe that real images are more robust to tiny noise perturbations than\nAI-generated images in the representation space of vision foundation models.\nBased on this observation, we propose RIGID, a training-free and model-agnostic\nmethod for robust AI-generated image detection. RIGID is a simple yet effective\napproach that identifies whether an image is AI-generated by comparing the\nrepresentation similarity between the original and the noise-perturbed\ncounterpart. Our evaluation on a diverse set of AI-generated images and\nbenchmarks shows that RIGID significantly outperforms existing trainingbased\nand training-free detectors. In particular, the average performance of RIGID\nexceeds the current best training-free method by more than 25%. Importantly,\nRIGID exhibits strong generalization across different image generation methods\nand robustness to image corruptions.\n","authors":["Zhiyuan He","Pin-Yu Chen","Tsung-Yi Ho"],"pdf_url":"https://arxiv.org/pdf/2405.20112v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.20109v1","updated":"2024-05-30T14:45:02Z","published":"2024-05-30T14:45:02Z","title":"FMARS: Annotating Remote Sensing Images for Disaster Management using\n  Foundation Models","summary":"  Very-High Resolution (VHR) remote sensing imagery is increasingly accessible,\nbut often lacks annotations for effective machine learning applications. Recent\nfoundation models like GroundingDINO and Segment Anything (SAM) provide\nopportunities to automatically generate annotations. This study introduces\nFMARS (Foundation Model Annotations in Remote Sensing), a methodology\nleveraging VHR imagery and foundation models for fast and robust annotation. We\nfocus on disaster management and provide a large-scale dataset with labels\nobtained from pre-event imagery over 19 disaster events, derived from the Maxar\nOpen Data initiative. We train segmentation models on the generated labels,\nusing Unsupervised Domain Adaptation (UDA) techniques to increase\ntransferability to real-world scenarios. Our results demonstrate the\neffectiveness of leveraging foundation models to automatically annotate remote\nsensing data at scale, enabling robust downstream models for critical\napplications. Code and dataset are available at\n\\url{https://github.com/links-ads/igarss-fmars}.\n","authors":["Edoardo Arnaudo","Jacopo Lungo Vaschetti","Lorenzo Innocenti","Luca Barco","Davide Lisi","Vanina Fissore","Claudio Rossi"],"pdf_url":"https://arxiv.org/pdf/2405.20109v1.pdf","comment":"Accepted at IGARSS 2024, 5 pages"},{"id":"http://arxiv.org/abs/2405.20093v1","updated":"2024-05-30T14:31:46Z","published":"2024-05-30T14:31:46Z","title":"Rapid Wildfire Hotspot Detection Using Self-Supervised Learning on\n  Temporal Remote Sensing Data","summary":"  Rapid detection and well-timed intervention are essential to mitigate the\nimpacts of wildfires. Leveraging remote sensed data from satellite networks and\nadvanced AI models to automatically detect hotspots (i.e., thermal anomalies\ncaused by active fires) is an effective way to build wildfire monitoring\nsystems. In this work, we propose a novel dataset containing time series of\nremotely sensed data related to European fire events and a Self-Supervised\nLearning (SSL)-based model able to analyse multi-temporal data and identify\nhotspots in potentially near real time. We train and evaluate the performance\nof our model using our dataset and Thraws, a dataset of thermal anomalies\nincluding several fire events, obtaining an F1 score of 63.58.\n","authors":["Luca Barco","Angelica Urbanelli","Claudio Rossi"],"pdf_url":"https://arxiv.org/pdf/2405.20093v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.20091v1","updated":"2024-05-30T14:27:40Z","published":"2024-05-30T14:27:40Z","title":"Visual Attention Analysis in Online Learning","summary":"  In this paper, we present an approach in the Multimodal Learning Analytics\nfield. Within this approach, we have developed a tool to visualize and analyze\neye movement data collected during learning sessions in online courses. The\ntool is named VAAD (an acronym for Visual Attention Analysis Dashboard). These\neye movement data have been gathered using an eye-tracker and subsequently\nprocessed and visualized for interpretation. The purpose of the tool is to\nconduct a descriptive analysis of the data by facilitating its visualization,\nenabling the identification of differences and learning patterns among various\nlearner populations. Additionally, it integrates a predictive module capable of\nanticipating learner activities during a learning session. Consequently, VAAD\nholds the potential to offer valuable insights into online learning behaviors\nfrom both descriptive and predictive perspectives.\n","authors":["Navarro Miriam","Becerra Álvaro","Daza Roberto","Cobos Ruth","Morales Aythami","Fierrez Julian"],"pdf_url":"https://arxiv.org/pdf/2405.20091v1.pdf","comment":"Accepted in CEDI 2024 (VII Congreso Espa\\~nol de Inform\\'atica), A\n  Coru\\~na, Spain"},{"id":"http://arxiv.org/abs/2405.20090v1","updated":"2024-05-30T14:27:20Z","published":"2024-05-30T14:27:20Z","title":"Typography Leads Semantic Diversifying: Amplifying Adversarial\n  Transferability across Multimodal Large Language Models","summary":"  Following the advent of the Artificial Intelligence (AI) era of large models,\nMultimodal Large Language Models (MLLMs) with the ability to understand\ncross-modal interactions between vision and text have attracted wide attention.\nAdversarial examples with human-imperceptible perturbation are shown to possess\na characteristic known as transferability, which means that a perturbation\ngenerated by one model could also mislead another different model. Augmenting\nthe diversity in input data is one of the most significant methods for\nenhancing adversarial transferability. This method has been certified as a way\nto significantly enlarge the threat impact under black-box conditions. Research\nworks also demonstrate that MLLMs can be exploited to generate adversarial\nexamples in the white-box scenario. However, the adversarial transferability of\nsuch perturbations is quite limited, failing to achieve effective black-box\nattacks across different models. In this paper, we propose the\nTypographic-based Semantic Transfer Attack (TSTA), which is inspired by: (1)\nMLLMs tend to process semantic-level information; (2) Typographic Attack could\neffectively distract the visual information captured by MLLMs. In the scenarios\nof Harmful Word Insertion and Important Information Protection, our TSTA\ndemonstrates superior performance.\n","authors":["Hao Cheng","Erjia Xiao","Jiahang Cao","Le Yang","Kaidi Xu","Jindong Gu","Renjing Xu"],"pdf_url":"https://arxiv.org/pdf/2405.20090v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.14870v2","updated":"2024-05-30T14:23:21Z","published":"2024-05-23T17:59:57Z","title":"An Empirical Study of Training State-of-the-Art LiDAR Segmentation\n  Models","summary":"  In the rapidly evolving field of autonomous driving, precise segmentation of\nLiDAR data is crucial for understanding complex 3D environments. Traditional\napproaches often rely on disparate, standalone codebases, hindering unified\nadvancements and fair benchmarking across models. To address these challenges,\nwe introduce MMDetection3D-lidarseg, a comprehensive toolbox designed for the\nefficient training and evaluation of state-of-the-art LiDAR segmentation\nmodels. We support a wide range of segmentation models and integrate advanced\ndata augmentation techniques to enhance robustness and generalization.\nAdditionally, the toolbox provides support for multiple leading sparse\nconvolution backends, optimizing computational efficiency and performance. By\nfostering a unified framework, MMDetection3D-lidarseg streamlines development\nand benchmarking, setting new standards for research and application. Our\nextensive benchmark experiments on widely-used datasets demonstrate the\neffectiveness of the toolbox. The codebase and trained models have been\npublicly available, promoting further research and innovation in the field of\nLiDAR segmentation for autonomous driving.\n","authors":["Jiahao Sun","Chunmei Qing","Xiang Xu","Lingdong Kong","Youquan Liu","Li Li","Chenming Zhu","Jingwei Zhang","Zeqi Xiao","Runnan Chen","Tai Wang","Wenwei Zhang","Kai Chen"],"pdf_url":"https://arxiv.org/pdf/2405.14870v2.pdf","comment":"Preprint; 17 pages, 4 figures, 7 tables; Code at\n  https://github.com/open-mmlab/mmdetection3d"},{"id":"http://arxiv.org/abs/2405.20084v1","updated":"2024-05-30T14:14:39Z","published":"2024-05-30T14:14:39Z","title":"Estimating Human Poses Across Datasets: A Unified Skeleton and\n  Multi-Teacher Distillation Approach","summary":"  Human pose estimation is a key task in computer vision with various\napplications such as activity recognition and interactive systems. However, the\nlack of consistency in the annotated skeletons across different datasets poses\nchallenges in developing universally applicable models. To address this\nchallenge, we propose a novel approach integrating multi-teacher knowledge\ndistillation with a unified skeleton representation. Our networks are jointly\ntrained on the COCO and MPII datasets, containing 17 and 16 keypoints,\nrespectively. We demonstrate enhanced adaptability by predicting an extended\nset of 21 keypoints, 4 (COCO) and 5 (MPII) more than original annotations,\nimproving cross-dataset generalization. Our joint models achieved an average\naccuracy of 70.89 and 76.40, compared to 53.79 and 55.78 when trained on a\nsingle dataset and evaluated on both. Moreover, we also evaluate all 21\npredicted points by our two models by reporting an AP of 66.84 and 72.75 on the\nHalpe dataset. This highlights the potential of our technique to address one of\nthe most pressing challenges in pose estimation research and application - the\ninconsistency in skeletal annotations.\n","authors":["Muhammad Saif Ullah Khan","Dhavalkumar Limbachiya","Didier Stricker","Muhammad Zeshan Afzal"],"pdf_url":"https://arxiv.org/pdf/2405.20084v1.pdf","comment":"15 pages (with references)"},{"id":"http://arxiv.org/abs/2405.18751v2","updated":"2024-05-30T14:13:05Z","published":"2024-05-29T04:29:12Z","title":"On the Limits of Multi-modal Meta-Learning with Auxiliary Task\n  Modulation Using Conditional Batch Normalization","summary":"  Few-shot learning aims to learn representations that can tackle novel tasks\ngiven a small number of examples. Recent studies show that cross-modal learning\ncan improve representations for few-shot classification. More specifically,\nlanguage is a rich modality that can be used to guide visual learning. In this\nwork, we experiment with a multi-modal architecture for few-shot learning that\nconsists of three components: a classifier, an auxiliary network, and a bridge\nnetwork. While the classifier performs the main classification task, the\nauxiliary network learns to predict language representations from the same\ninput, and the bridge network transforms high-level features of the auxiliary\nnetwork into modulation parameters for layers of the few-shot classifier using\nconditional batch normalization. The bridge should encourage a form of\nlightweight semantic alignment between language and vision which could be\nuseful for the classifier. However, after evaluating the proposed approach on\ntwo popular few-shot classification benchmarks we find that a) the improvements\ndo not reproduce across benchmarks, and b) when they do, the improvements are\ndue to the additional compute and parameters introduced by the bridge network.\nWe contribute insights and recommendations for future work in multi-modal\nmeta-learning, especially when using language representations.\n","authors":["Jordi Armengol-Estapé","Vincent Michalski","Ramnath Kumar","Pierre-Luc St-Charles","Doina Precup","Samira Ebrahimi Kahou"],"pdf_url":"https://arxiv.org/pdf/2405.18751v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.20081v1","updated":"2024-05-30T14:11:27Z","published":"2024-05-30T14:11:27Z","title":"NoiseBoost: Alleviating Hallucination with Noise Perturbation for\n  Multimodal Large Language Models","summary":"  Multimodal large language models (MLLMs) contribute a powerful mechanism to\nunderstanding visual information building on large language models. However,\nMLLMs are notorious for suffering from hallucinations, especially when\ngenerating lengthy, detailed descriptions for images. Our analysis reveals that\nhallucinations stem from the inherent summarization mechanism of large language\nmodels, leading to excessive dependence on linguistic tokens while neglecting\nvision information. In this paper, we propose NoiseBoost, a broadly applicable\nand simple method for alleviating hallucinations for MLLMs through the\nintegration of noise feature perturbations. Noise perturbation acts as a\nregularizer, facilitating a balanced distribution of attention weights among\nvisual and linguistic tokens. Despite its simplicity, NoiseBoost consistently\nenhances the performance of MLLMs across common training strategies, including\nsupervised fine-tuning and reinforcement learning. Further, NoiseBoost\npioneerly enables semi-supervised learning for MLLMs, unleashing the power of\nunlabeled data. Comprehensive experiments demonstrate that NoiseBoost improves\ndense caption accuracy by 8.1% with human evaluation and achieves comparable\nresults with 50% of the data by mining unlabeled data. Code and models are\navailable at https://kaiwu5.github.io/noiseboost.\n","authors":["Kai Wu","Boyuan Jiang","Zhengkai Jiang","Qingdong He","Donghao Luo","Shengzhi Wang","Qingwen Liu","Chengjie Wang"],"pdf_url":"https://arxiv.org/pdf/2405.20081v1.pdf","comment":"updating"},{"id":"http://arxiv.org/abs/2405.20072v1","updated":"2024-05-30T14:02:40Z","published":"2024-05-30T14:02:40Z","title":"Faces of the Mind: Unveiling Mental Health States Through Facial\n  Expressions in 11,427 Adolescents","summary":"  Mood disorders, including depression and anxiety, often manifest through\nfacial expressions. While previous research has explored the connection between\nfacial features and emotions, machine learning algorithms for estimating mood\ndisorder severity have been hindered by small datasets and limited real-world\napplication. To address this gap, we analyzed facial videos of 11,427\nparticipants, a dataset two orders of magnitude larger than previous studies.\nThis comprehensive collection includes standardized facial expression videos\nfrom reading tasks, along with a detailed psychological scale that measures\ndepression, anxiety, and stress. By examining the relationships among these\nemotional states and employing clustering analysis, we identified distinct\nsubgroups embodying different emotional profiles. We then trained tree-based\nclassifiers and deep learning models to estimate emotional states from facial\nfeatures. Results indicate that models previously effective on small datasets\nexperienced decreased performance when applied to our large dataset,\nhighlighting the importance of data scale and mitigating overfitting in\npractical settings. Notably, our study identified subtle shifts in pupil\ndynamics and gaze orientation as potential markers of mood disorders, providing\nvaluable information on the interaction between facial expressions and mental\nhealth. This research marks the first large-scale and comprehensive\ninvestigation of facial expressions in the context of mental health, laying the\ngroundwork for future data-driven advancements in this field.\n","authors":["Xiao Xu","Keyin Zhou","Yan Zhang","Yang Wang","Fei Wang","Xizhe Zhang"],"pdf_url":"https://arxiv.org/pdf/2405.20072v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.20067v1","updated":"2024-05-30T13:56:58Z","published":"2024-05-30T13:56:58Z","title":"N-Dimensional Gaussians for Fitting of High Dimensional Functions","summary":"  In the wake of many new ML-inspired approaches for reconstructing and\nrepresenting high-quality 3D content, recent hybrid and explicitly learned\nrepresentations exhibit promising performance and quality characteristics.\nHowever, their scaling to higher dimensions is challenging, e.g. when\naccounting for dynamic content with respect to additional parameters such as\nmaterial properties, illumination, or time. In this paper, we tackle these\nchallenges for an explicit representations based on Gaussian mixture models.\nWith our solutions, we arrive at efficient fitting of compact N-dimensional\nGaussian mixtures and enable efficient evaluation at render time: For fast\nfitting and evaluation, we introduce a high-dimensional culling scheme that\nefficiently bounds N-D Gaussians, inspired by Locality Sensitive Hashing. For\nadaptive refinement yet compact representation, we introduce a loss-adaptive\ndensity control scheme that incrementally guides the use of additional capacity\ntowards missing details. With these tools we can for the first time represent\ncomplex appearance that depends on many input dimensions beyond position or\nviewing angle within a compact, explicit representation optimized in minutes\nand rendered in milliseconds.\n","authors":["Stavros Diolatzis","Tobias Zirr","Alexandr Kuznetsov","Georgios Kopanas","Anton Kaplanyan"],"pdf_url":"https://arxiv.org/pdf/2405.20067v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.20062v1","updated":"2024-05-30T13:50:39Z","published":"2024-05-30T13:50:39Z","title":"Can the accuracy bias by facial hairstyle be reduced through balancing\n  the training data?","summary":"  Appearance of a face can be greatly altered by growing a beard and mustache.\nThe facial hairstyles in a pair of images can cause marked changes to the\nimpostor distribution and the genuine distribution. Also, different\ndistributions of facial hairstyle across demographics could cause a false\nimpression of relative accuracy across demographics. We first show that, even\nthough larger training sets boost the recognition accuracy on all facial\nhairstyles, accuracy variations caused by facial hairstyles persist regardless\nof the size of the training set. Then, we analyze the impact of having\ndifferent fractions of the training data represent facial hairstyles. We\ncreated balanced training sets using a set of identities available in\nWebface42M that both have clean-shaven and facial hair images. We find that,\neven when a face recognition model is trained with a balanced clean-shaven /\nfacial hair training set, accuracy variation on the test data does not\ndiminish. Next, data augmentation is employed to further investigate the effect\nof facial hair distribution in training data by manipulating facial hair pixels\nwith the help of facial landmark points and a facial hair segmentation model.\nOur results show facial hair causes an accuracy gap between clean-shaven and\nfacial hair images, and this impact can be significantly different between\nAfrican-Americans and Caucasians.\n","authors":["Kagan Ozturk","Haiyu Wu","Kevin W. Bowyer"],"pdf_url":"https://arxiv.org/pdf/2405.20062v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.20058v1","updated":"2024-05-30T13:46:56Z","published":"2024-05-30T13:46:56Z","title":"Enhancing Plant Disease Detection: A Novel CNN-Based Approach with\n  Tensor Subspace Learning and HOWSVD-MD","summary":"  Machine learning has revolutionized the field of agricultural science,\nparticularly in the early detection and management of plant diseases, which are\ncrucial for maintaining crop health and productivity. Leveraging advanced\nalgorithms and imaging technologies, researchers are now able to identify and\nclassify plant diseases with unprecedented accuracy and speed. Effective\nmanagement of tomato diseases is crucial for enhancing agricultural\nproductivity. The development and application of tomato disease classification\nmethods are central to this objective. This paper introduces a cutting-edge\ntechnique for the detection and classification of tomato leaf diseases,\nutilizing insights from the latest pre-trained Convolutional Neural Network\n(CNN) models. We propose a sophisticated approach within the domain of tensor\nsubspace learning, known as Higher-Order Whitened Singular Value Decomposition\n(HOWSVD), designed to boost the discriminatory power of the system. Our\napproach to Tensor Subspace Learning is methodically executed in two phases,\nbeginning with HOWSVD and culminating in Multilinear Discriminant Analysis\n(MDA). The efficacy of this innovative method was rigorously tested through\ncomprehensive experiments on two distinct datasets, namely PlantVillage and the\nTaiwan dataset. The findings reveal that HOWSVD-MDA outperforms existing\nmethods, underscoring its capability to markedly enhance the precision and\ndependability of diagnosing tomato leaf diseases. For instance, up to 98.36\\%\nand 89.39\\% accuracy scores have been achieved under PlantVillage and the\nTaiwan datasets, respectively.\n","authors":["Abdelmalik Ouamane","Ammar Chouchane","Yassine Himeur","Abderrazak Debilou","Abbes Amira","Shadi Atalla","Wathiq Mansoor","Hussain Al Ahmad"],"pdf_url":"https://arxiv.org/pdf/2405.20058v1.pdf","comment":"17 pages, 9 figures and 8 tables"},{"id":"http://arxiv.org/abs/2306.08970v2","updated":"2024-05-30T13:46:34Z","published":"2023-06-15T09:05:36Z","title":"An Efficient and Multi-private Key Secure Aggregation for Federated\n  Learning","summary":"  With the emergence of privacy leaks in federated learning, secure aggregation\nprotocols that mainly adopt either homomorphic encryption or threshold secret\nsharing have been widely developed for federated learning to protect the\nprivacy of the local training data of each client. However, these existing\nprotocols suffer from many shortcomings, such as the dependence on a trusted\nthird party, the vulnerability to clients being corrupted, low efficiency, the\ntrade-off between security and fault tolerance, etc. To solve these\ndisadvantages, we propose an efficient and multi-private key secure aggregation\nscheme for federated learning. Specifically, we skillfully modify the variant\nElGamal encryption technique to achieve homomorphic addition operation, which\nhas two important advantages: 1) The server and each client can freely select\npublic and private keys without introducing a trust third party and 2) Compared\nto the variant ElGamal encryption, the plaintext space is relatively large,\nwhich is more suitable for the deep model. Besides, for the high dimensional\ndeep model parameter, we introduce a super-increasing sequence to compress\nmulti-dimensional data into 1-D, which can greatly reduce encryption and\ndecryption times as well as communication for ciphertext transmission. Detailed\nsecurity analyses show that our proposed scheme achieves the semantic security\nof both individual local gradients and the aggregated result while achieving\noptimal robustness in tolerating both client collusion and dropped clients.\nExtensive simulations demonstrate that the accuracy of our scheme is almost the\nsame as the non-private approach, while the efficiency of our scheme is much\nbetter than the state-of-the-art homomorphic encryption-based secure\naggregation schemes. More importantly, the efficiency advantages of our scheme\nwill become increasingly prominent as the number of model parameters increases.\n","authors":["Xue Yang","Zifeng Liu","Xiaohu Tang","Rongxing Lu","Bo Liu"],"pdf_url":"https://arxiv.org/pdf/2306.08970v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.17817v2","updated":"2024-05-30T13:40:23Z","published":"2024-05-28T04:29:10Z","title":"Benchmarking Skeleton-based Motion Encoder Models for Clinical\n  Applications: Estimating Parkinson's Disease Severity in Walking Sequences","summary":"  This study investigates the application of general human motion encoders\ntrained on large-scale human motion datasets for analyzing gait patterns in PD\npatients. Although these models have learned a wealth of human biomechanical\nknowledge, their effectiveness in analyzing pathological movements, such as\nparkinsonian gait, has yet to be fully validated. We propose a comparative\nframework and evaluate six pre-trained state-of-the-art human motion encoder\nmodels on their ability to predict the Movement Disorder Society - Unified\nParkinson's Disease Rating Scale (MDS-UPDRS-III) gait scores from motion\ncapture data. We compare these against a traditional gait feature-based\npredictive model in a recently released large public PD dataset, including PD\npatients on and off medication. The feature-based model currently shows higher\nweighted average accuracy, precision, recall, and F1-score. Motion encoder\nmodels with closely comparable results demonstrate promise for scalability and\nefficiency in clinical settings. This potential is underscored by the enhanced\nperformance of the encoder model upon fine-tuning on PD training set. Four of\nthe six human motion models examined provided prediction scores that were\nsignificantly different between on- and off-medication states. This finding\nreveals the sensitivity of motion encoder models to nuanced clinical changes.\nIt also underscores the necessity for continued customization of these models\nto better capture disease-specific features, thereby reducing the reliance on\nlabor-intensive feature engineering. Lastly, we establish a benchmark for the\nanalysis of skeleton-based motion encoder models in clinical settings. To the\nbest of our knowledge, this is the first study to provide a benchmark that\nenables state-of-the-art models to be tested and compete in a clinical context.\nCodes and benchmark leaderboard are available at code.\n","authors":["Vida Adeli","Soroush Mehraban","Irene Ballester","Yasamin Zarghami","Andrea Sabo","Andrea Iaboni","Babak Taati"],"pdf_url":"https://arxiv.org/pdf/2405.17817v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.04173v2","updated":"2024-05-30T13:38:18Z","published":"2024-03-07T03:07:59Z","title":"Image Coding for Machines with Edge Information Learning Using Segment\n  Anything","summary":"  Image Coding for Machines (ICM) is an image compression technique for image\nrecognition.\n  This technique is essential due to the growing demand for image recognition\nAI.\n  In this paper, we propose a method for ICM that focuses on encoding and\ndecoding only the edge information of object parts in an image, which we call\nSA-ICM.\n  This is an Learned Image Compression (LIC) model trained using edge\ninformation created by Segment Anything.\n  Our method can be used for image recognition models with various tasks.\n  SA-ICM is also robust to changes in input data, making it effective for a\nvariety of use cases.\n  Additionally, our method provides benefits from a privacy point of view, as\nit removes human facial information on the encoder's side, thus protecting\none's privacy.\n  Furthermore, this LIC model training method can be used to train Neural\nRepresentations for Videos (NeRV), which is a video compression model.\n  By training NeRV using edge information created by Segment Anything, it is\npossible to create a NeRV that is effective for image recognition (SA-NeRV).\n  Experimental results confirm the advantages of SA-ICM, presenting the best\nperformance in image compression for image recognition.\n  We also show that SA-NeRV is superior to ordinary NeRV in video compression\nfor machines.\n","authors":["Takahiro Shindo","Kein Yamada","Taiju Watanabe","Hiroshi Watanabe"],"pdf_url":"https://arxiv.org/pdf/2403.04173v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.19149v2","updated":"2024-05-30T13:26:43Z","published":"2024-05-29T14:52:10Z","title":"CaLa: Complementary Association Learning for Augmenting Composed Image\n  Retrieval","summary":"  Composed Image Retrieval (CIR) involves searching for target images based on\nan image-text pair query. While current methods treat this as a query-target\nmatching problem, we argue that CIR triplets contain additional associations\nbeyond this primary relation. In our paper, we identify two new relations\nwithin triplets, treating each triplet as a graph node. Firstly, we introduce\nthe concept of text-bridged image alignment, where the query text serves as a\nbridge between the query image and the target image. We propose a hinge-based\ncross-attention mechanism to incorporate this relation into network learning.\nSecondly, we explore complementary text reasoning, considering CIR as a form of\ncross-modal retrieval where two images compose to reason about complementary\ntext. To integrate these perspectives effectively, we design a twin\nattention-based compositor. By combining these complementary associations with\nthe explicit query pair-target image relation, we establish a comprehensive set\nof constraints for CIR. Our framework, CaLa (Complementary Association Learning\nfor Augmenting Composed Image Retrieval), leverages these insights. We evaluate\nCaLa on CIRR and FashionIQ benchmarks with multiple backbones, demonstrating\nits superiority in composed image retrieval.\n","authors":["Xintong Jiang","Yaxiong Wang","Mengjian Li","Yujiao Wu","Bingwen Hu","Xueming Qian"],"pdf_url":"https://arxiv.org/pdf/2405.19149v2.pdf","comment":"To appear at SIGIR 2024. arXiv admin note: text overlap with\n  arXiv:2309.02169"},{"id":"http://arxiv.org/abs/2405.20044v1","updated":"2024-05-30T13:25:25Z","published":"2024-05-30T13:25:25Z","title":"A Point-Neighborhood Learning Framework for Nasal Endoscope Image\n  Segmentation","summary":"  The lesion segmentation on endoscopic images is challenging due to its\ncomplex and ambiguous features. Fully-supervised deep learning segmentation\nmethods can receive good performance based on entirely pixel-level labeled\ndataset but greatly increase experts' labeling burden. Semi-supervised and\nweakly supervised methods can ease labeling burden, but heavily strengthen the\nlearning difficulty. To alleviate this difficulty, weakly semi-supervised\nsegmentation adopts a new annotation protocol of adding a large number of point\nannotation samples into a few pixel-level annotation samples. However, existing\nmethods only mine points' limited information while ignoring reliable prior\nsurrounding the point annotations. In this paper, we propose a weakly\nsemi-supervised method called Point-Neighborhood Learning (PNL) framework. To\nmine the prior of the pixels surrounding the annotated point, we transform a\nsingle-point annotation into a circular area named a point-neighborhood. We\npropose point-neighborhood supervision loss and pseudo-label scoring mechanism\nto enhance training supervision. Point-neighborhoods are also used to augment\nthe data diversity. Our method greatly improves performance without changing\nthe structure of segmentation network. Comprehensive experiments show the\nsuperiority of our method over the other existing methods, demonstrating its\neffectiveness in point-annotated medical images. The project code will be\navailable on: https://github.com/ParryJay/PNL.\n","authors":["Pengyu Jie","Wanquan Liu","Chenqiang Gao","Yihui Wen","Rui He","Pengcheng Li","Jintao Zhang","Deyu Meng"],"pdf_url":"https://arxiv.org/pdf/2405.20044v1.pdf","comment":"10 pages, 10 figures,"},{"id":"http://arxiv.org/abs/2405.20031v1","updated":"2024-05-30T13:16:17Z","published":"2024-05-30T13:16:17Z","title":"Structure Gaussian SLAM with Manhattan World Hypothesis","summary":"  Gaussian SLAM systems have made significant advancements in improving the\nefficiency and fidelity of real-time reconstructions. However, these systems\noften encounter incomplete reconstructions in complex indoor environments,\ncharacterized by substantial holes due to unobserved geometry caused by\nobstacles or limited view angles. To address this challenge, we present\nManhattan Gaussian SLAM (MG-SLAM), an RGB-D system that leverages the Manhattan\nWorld hypothesis to enhance geometric accuracy and completeness. By seamlessly\nintegrating fused line segments derived from structured scenes, MG-SLAM ensures\nrobust tracking in textureless indoor areas. Moreover, The extracted lines and\nplanar surface assumption allow strategic interpolation of new Gaussians in\nregions of missing geometry, enabling efficient scene completion. Extensive\nexperiments conducted on both synthetic and real-world scenes demonstrate that\nthese advancements enable our method to achieve state-of-the-art performance,\nmarking a substantial improvement in the capabilities of Gaussian SLAM systems.\n","authors":["Shuhong Liu","Heng Zhou","Liuzhuozheng Li","Yun Liu","Tianchen Deng","Yiming Zhou","Mingrui Li"],"pdf_url":"https://arxiv.org/pdf/2405.20031v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.20030v1","updated":"2024-05-30T13:15:18Z","published":"2024-05-30T13:15:18Z","title":"EMAG: Ego-motion Aware and Generalizable 2D Hand Forecasting from\n  Egocentric Videos","summary":"  Predicting future human behavior from egocentric videos is a challenging but\ncritical task for human intention understanding. Existing methods for\nforecasting 2D hand positions rely on visual representations and mainly focus\non hand-object interactions. In this paper, we investigate the hand forecasting\ntask and tackle two significant issues that persist in the existing methods:\n(1) 2D hand positions in future frames are severely affected by ego-motions in\negocentric videos; (2) prediction based on visual information tends to overfit\nto background or scene textures, posing a challenge for generalization on novel\nscenes or human behaviors. To solve the aforementioned problems, we propose\nEMAG, an ego-motion-aware and generalizable 2D hand forecasting method. In\nresponse to the first problem, we propose a method that considers ego-motion,\nrepresented by a sequence of homography matrices of two consecutive frames. We\nfurther leverage modalities such as optical flow, trajectories of hands and\ninteracting objects, and ego-motions, thereby alleviating the second issue.\nExtensive experiments on two large-scale egocentric video datasets, Ego4D and\nEPIC-Kitchens 55, verify the effectiveness of the proposed method. In\nparticular, our model outperforms prior methods by $7.0$\\% on cross-dataset\nevaluations. Project page: https://masashi-hatano.github.io/EMAG/\n","authors":["Masashi Hatano","Ryo Hachiuma","Hideo Saito"],"pdf_url":"https://arxiv.org/pdf/2405.20030v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.20025v1","updated":"2024-05-30T13:11:08Z","published":"2024-05-30T13:11:08Z","title":"From Forest to Zoo: Great Ape Behavior Recognition with ChimpBehave","summary":"  This paper addresses the significant challenge of recognizing behaviors in\nnon-human primates, specifically focusing on chimpanzees. Automated behavior\nrecognition is crucial for both conservation efforts and the advancement of\nbehavioral research. However, it is significantly hindered by the\nlabor-intensive process of manual video annotation. Despite the availability of\nlarge-scale animal behavior datasets, the effective application of machine\nlearning models across varied environmental settings poses a critical\nchallenge, primarily due to the variability in data collection contexts and the\nspecificity of annotations.\n  In this paper, we introduce ChimpBehave, a novel dataset featuring over 2\nhours of video (approximately 193,000 video frames) of zoo-housed chimpanzees,\nmeticulously annotated with bounding boxes and behavior labels for action\nrecognition. ChimpBehave uniquely aligns its behavior classes with existing\ndatasets, allowing for the study of domain adaptation and cross-dataset\ngeneralization methods between different visual settings. Furthermore, we\nbenchmark our dataset using a state-of-the-art CNN-based action recognition\nmodel, providing the first baseline results for both within and cross-dataset\nsettings. The dataset, models, and code can be accessed at:\nhttps://github.com/MitchFuchs/ChimpBehave\n","authors":["Michael Fuchs","Emilie Genty","Adrian Bangerter","Klaus Zuberbühler","Paul Cotofrei"],"pdf_url":"https://arxiv.org/pdf/2405.20025v1.pdf","comment":"CV4Animals: Computer Vision for Animal Behavior Tracking and Modeling\n  In conjunction with Computer Vision and Pattern Recognition 2024"},{"id":"http://arxiv.org/abs/2401.14535v2","updated":"2024-05-30T13:09:47Z","published":"2024-01-25T22:01:07Z","title":"CaRiNG: Learning Temporal Causal Representation under Non-Invertible\n  Generation Process","summary":"  Identifying the underlying time-delayed latent causal processes in sequential\ndata is vital for grasping temporal dynamics and making downstream reasoning.\nWhile some recent methods can robustly identify these latent causal variables,\nthey rely on strict assumptions about the invertible generation process from\nlatent variables to observed data. However, these assumptions are often hard to\nsatisfy in real-world applications containing information loss. For instance,\nthe visual perception process translates a 3D space into 2D images, or the\nphenomenon of persistence of vision incorporates historical data into current\nperceptions. To address this challenge, we establish an identifiability theory\nthat allows for the recovery of independent latent components even when they\ncome from a nonlinear and non-invertible mix. Using this theory as a\nfoundation, we propose a principled approach, CaRiNG, to learn the CAusal\nRepresentatIon of Non-invertible Generative temporal data with identifiability\nguarantees. Specifically, we utilize temporal context to recover lost latent\ninformation and apply the conditions in our theory to guide the training\nprocess. Through experiments conducted on synthetic datasets, we validate that\nour CaRiNG method reliably identifies the causal process, even when the\ngeneration process is non-invertible. Moreover, we demonstrate that our\napproach considerably improves temporal understanding and reasoning in\npractical applications.\n","authors":["Guangyi Chen","Yifan Shen","Zhenhao Chen","Xiangchen Song","Yuewen Sun","Weiran Yao","Xiao Liu","Kun Zhang"],"pdf_url":"https://arxiv.org/pdf/2401.14535v2.pdf","comment":"To appear at ICML 2024, 24 pages"},{"id":"http://arxiv.org/abs/2402.07865v2","updated":"2024-05-30T13:08:48Z","published":"2024-02-12T18:21:14Z","title":"Prismatic VLMs: Investigating the Design Space of Visually-Conditioned\n  Language Models","summary":"  Visually-conditioned language models (VLMs) have seen growing adoption in\napplications such as visual dialogue, scene understanding, and robotic task\nplanning; adoption that has fueled a wealth of new models such as LLaVa,\nInstructBLIP, and PaLI-3. Despite the volume of new releases, key design\ndecisions around image preprocessing, architecture, and optimization are\nunder-explored, making it challenging to understand what factors account for\nmodel performance $-$ a challenge further complicated by the lack of objective,\nconsistent evaluations. To address these gaps, we first compile a suite of\nstandardized evaluations spanning visual question answering, object\nlocalization, and challenge sets that probe properties such as hallucination;\nevaluations that provide fine-grained insight VLM capabilities. Second, we\nrigorously investigate VLMs along key design axes, including pretrained visual\nrepresentations and training from base vs. instruct-tuned language models,\namongst others. We couple our analysis with three resource contributions: (1) a\nunified framework for evaluating VLMs, (2) optimized, flexible training code,\nand (3) checkpoints for all models, including a family of VLMs at the 7-13B\nscale that strictly outperform InstructBLIP and LLaVa v1.5, the\nstate-of-the-art in open VLMs.\n","authors":["Siddharth Karamcheti","Suraj Nair","Ashwin Balakrishna","Percy Liang","Thomas Kollar","Dorsa Sadigh"],"pdf_url":"https://arxiv.org/pdf/2402.07865v2.pdf","comment":"Published at ICML 2024. 22 pages, 11 figures. Training code and\n  models: https://github.com/TRI-ML/prismatic-vlms. Evaluation code:\n  https://github.com/TRI-ML/vlm-evaluation"},{"id":"http://arxiv.org/abs/2201.04435v3","updated":"2024-05-30T13:07:43Z","published":"2022-01-12T12:09:24Z","title":"Beyond the Visible: A Survey on Cross-spectral Face Recognition","summary":"  Cross-spectral face recognition (CFR) refers to recognizing individuals using\nface images stemming from different spectral bands, such as infrared vs.\nvisible. While CFR is inherently more challenging than classical face\nrecognition due to significant variation in facial appearance caused by the\nmodality gap, it is useful in many scenarios including night-vision biometrics\nand detecting presentation attacks. Recent advances in convolutional neural\nnetworks (CNNs) have resulted in significant improvement in the performance of\nCFR systems. Given these developments, the contributions of this survey are\nthree-fold. First, we provide an overview of CFR, by formalizing the CFR\nproblem and presenting related applications. Secondly, we discuss the\nappropriate spectral bands for face recognition and discuss recent CFR methods,\nplacing emphasis on deep neural networks. In particular we describe techniques\nthat have been proposed to extract and compare heterogeneous features emerging\nfrom different spectral bands. We also discuss the datasets that have been used\nfor evaluating CFR methods. Finally, we discuss the challenges and future lines\nof research on this topic.\n","authors":["David Anghelone","Cunjian Chen","Arun Ross","Antitza Dantcheva"],"pdf_url":"https://arxiv.org/pdf/2201.04435v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.17835v3","updated":"2024-05-30T12:55:14Z","published":"2024-05-28T05:14:57Z","title":"Deform3DGS: Flexible Deformation for Fast Surgical Scene Reconstruction\n  with Gaussian Splatting","summary":"  Tissue deformation poses a key challenge for accurate surgical scene\nreconstruction. Despite yielding high reconstruction quality, existing methods\nsuffer from slow rendering speeds and long training times, limiting their\nintraoperative applicability. Motivated by recent progress in 3D Gaussian\nSplatting, an emerging technology in real-time 3D rendering, this work presents\na novel fast reconstruction framework, termed Deform3DGS, for deformable\ntissues during endoscopic surgery. Specifically, we introduce 3D GS into\nsurgical scenes by integrating a point cloud initialization to improve\nreconstruction. Furthermore, we propose a novel flexible deformation modeling\nscheme (FDM) to learn tissue deformation dynamics at the level of individual\nGaussians. Our FDM can model the surface deformation with efficient\nrepresentations, allowing for real-time rendering performance. More\nimportantly, FDM significantly accelerates surgical scene reconstruction,\ndemonstrating considerable clinical values, particularly in intraoperative\nsettings where time efficiency is crucial. Experiments on DaVinci robotic\nsurgery videos indicate the efficacy of our approach, showcasing superior\nreconstruction fidelity PSNR: (37.90) and rendering speed (338.8 FPS) while\nsubstantially reducing training time to only 1 minute/scene. Our code is\navailable at https://github.com/jinlab-imvr/Deform3DGS.\n","authors":["Shuojue Yang","Qian Li","Daiyun Shen","Bingchen Gong","Qi Dou","Yueming Jin"],"pdf_url":"https://arxiv.org/pdf/2405.17835v3.pdf","comment":"Early accepted at MICCAI 2024, 10 pages, 2 figures"},{"id":"http://arxiv.org/abs/2405.20008v1","updated":"2024-05-30T12:45:34Z","published":"2024-05-30T12:45:34Z","title":"Sharing Key Semantics in Transformer Makes Efficient Image Restoration","summary":"  Image Restoration (IR), a classic low-level vision task, has witnessed\nsignificant advancements through deep models that effectively model global\ninformation. Notably, the Vision Transformers (ViTs) emergence has further\npropelled these advancements. When computing, the self-attention mechanism, a\ncornerstone of ViTs, tends to encompass all global cues, even those from\nsemantically unrelated objects or regions. This inclusivity introduces\ncomputational inefficiencies, particularly noticeable with high input\nresolution, as it requires processing irrelevant information, thereby impeding\nefficiency. Additionally, for IR, it is commonly noted that small segments of a\ndegraded image, particularly those closely aligned semantically, provide\nparticularly relevant information to aid in the restoration process, as they\ncontribute essential contextual cues crucial for accurate reconstruction. To\naddress these challenges, we propose boosting IR's performance by sharing the\nkey semantics via Transformer for IR (i.e., SemanIR) in this paper.\nSpecifically, SemanIR initially constructs a sparse yet comprehensive\nkey-semantic dictionary within each transformer stage by establishing essential\nsemantic connections for every degraded patch. Subsequently, this dictionary is\nshared across all subsequent transformer blocks within the same stage. This\nstrategy optimizes attention calculation within each block by focusing\nexclusively on semantically related components stored in the key-semantic\ndictionary. As a result, attention calculation achieves linear computational\ncomplexity within each window. Extensive experiments across 6 IR tasks confirm\nthe proposed SemanIR's state-of-the-art performance, quantitatively and\nqualitatively showcasing advancements.\n","authors":["Bin Ren","Yawei Li","Jingyun Liang","Rakesh Ranjan","Mengyuan Liu","Rita Cucchiara","Luc Van Gool","Ming-Hsuan Yang","Nicu Sebe"],"pdf_url":"https://arxiv.org/pdf/2405.20008v1.pdf","comment":"9 pages"},{"id":"http://arxiv.org/abs/2405.19996v1","updated":"2024-05-30T12:32:35Z","published":"2024-05-30T12:32:35Z","title":"DP-IQA: Utilizing Diffusion Prior for Blind Image Quality Assessment in\n  the Wild","summary":"  Image quality assessment (IQA) plays a critical role in selecting\nhigh-quality images and guiding compression and enhancement methods in a series\nof applications. The blind IQA, which assesses the quality of in-the-wild\nimages containing complex authentic distortions without reference images, poses\ngreater challenges. Existing methods are limited to modeling a uniform\ndistribution with local patches and are bothered by the gap between low and\nhigh-level visions (caused by widely adopted pre-trained classification\nnetworks). In this paper, we propose a novel IQA method called diffusion\npriors-based IQA (DP-IQA), which leverages the prior knowledge from the\npre-trained diffusion model with its excellent powers to bridge semantic gaps\nin the perception of the visual quality of images. Specifically, we use\npre-trained stable diffusion as the backbone, extract multi-level features from\nthe denoising U-Net during the upsampling process at a specified timestep, and\ndecode them to estimate the image quality score. The text and image adapters\nare adopted to mitigate the domain gap for downstream tasks and correct the\ninformation loss caused by the variational autoencoder bottleneck. Finally, we\ndistill the knowledge in the above model into a CNN-based student model,\nsignificantly reducing the parameter to enhance applicability, with the student\nmodel performing similarly or even better than the teacher model surprisingly.\nExperimental results demonstrate that our DP-IQA achieves state-of-the-art\nresults on various in-the-wild datasets with better generalization capability,\nwhich shows the superiority of our method in global modeling and utilizing the\nhierarchical feature clues of diffusion for evaluating image quality.\n","authors":["Honghao Fu","Yufei Wang","Wenhan Yang","Bihan Wen"],"pdf_url":"https://arxiv.org/pdf/2405.19996v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.19990v1","updated":"2024-05-30T12:22:06Z","published":"2024-05-30T12:22:06Z","title":"DiffPhysBA: Diffusion-based Physical Backdoor Attack against Person\n  Re-Identification in Real-World","summary":"  Person Re-Identification (ReID) systems pose a significant security risk from\nbackdoor attacks, allowing adversaries to evade tracking or impersonate others.\nBeyond recognizing this issue, we investigate how backdoor attacks can be\ndeployed in real-world scenarios, where a ReID model is typically trained on\ndata collected in the digital domain and then deployed in a physical\nenvironment. This attack scenario requires an attack flow that embeds backdoor\ntriggers in the digital domain realistically enough to also activate the buried\nbackdoor in person ReID models in the physical domain. This paper realizes this\nattack flow by leveraging a diffusion model to generate realistic accessories\non pedestrian images (e.g., bags, hats, etc.) as backdoor triggers. However,\nthe noticeable domain gap between the triggers generated by the off-the-shelf\ndiffusion model and their physical counterparts results in a low attack success\nrate. Therefore, we introduce a novel diffusion-based physical backdoor attack\n(DiffPhysBA) method that adopts a training-free similarity-guided sampling\nprocess to enhance the resemblance between generated and physical triggers.\nConsequently, DiffPhysBA can generate realistic attributes as semantic-level\ntriggers in the digital domain and provides higher physical ASR compared to the\ndirect paste method by 25.6% on the real-world test set. Through evaluations on\nnewly proposed real-world and synthetic ReID test sets, DiffPhysBA demonstrates\nan impressive success rate exceeding 90% in both the digital and physical\ndomains. Notably, it excels in digital stealth metrics and can effectively\nevade state-of-the-art defense methods.\n","authors":["Wenli Sun","Xinyang Jiang","Dongsheng Li","Cairong Zhao"],"pdf_url":"https://arxiv.org/pdf/2405.19990v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.15240v2","updated":"2024-05-30T12:14:05Z","published":"2024-05-24T06:06:41Z","title":"Towards Real World Debiasing: A Fine-grained Analysis On Spurious\n  Correlation","summary":"  Spurious correlations in training data significantly hinder the\ngeneralization capability of machine learning models when faced with\ndistribution shifts in real-world scenarios. To tackle the problem, numerous\ndebias approaches have been proposed and benchmarked on datasets intentionally\ndesigned with severe biases. However, it remains to be asked: \\textit{1. Do\nexisting benchmarks really capture biases in the real world? 2. Can existing\ndebias methods handle biases in the real world?} To answer the questions, we\nrevisit biased distributions in existing benchmarks and real-world datasets,\nand propose a fine-grained framework for analyzing dataset bias by\ndisentangling it into the magnitude and prevalence of bias. We observe and\ntheoretically demonstrate that existing benchmarks poorly represent real-world\nbiases. We further introduce two novel biased distributions to bridge this gap,\nforming a nuanced evaluation framework for real-world debiasing. Building upon\nthese results, we evaluate existing debias methods with our evaluation\nframework. Results show that existing methods are incapable of handling\nreal-world biases. Through in-depth analysis, we propose a simple yet effective\napproach that can be easily applied to existing debias methods, named Debias in\nDestruction (DiD). Empirical results demonstrate the superiority of DiD,\nimproving the performance of existing methods on all types of biases within the\nproposed evaluation framework.\n","authors":["Zhibo Wang","Peng Kuang","Zhixuan Chu","Jingyi Wang","Kui Ren"],"pdf_url":"https://arxiv.org/pdf/2405.15240v2.pdf","comment":"9 pages of main paper, 10 pages of appendix"},{"id":"http://arxiv.org/abs/2308.14746v3","updated":"2024-05-30T11:52:33Z","published":"2023-08-28T17:55:33Z","title":"CoVR: Learning Composed Video Retrieval from Web Video Captions","summary":"  Composed Image Retrieval (CoIR) has recently gained popularity as a task that\nconsiders both text and image queries together, to search for relevant images\nin a database. Most CoIR approaches require manually annotated datasets,\ncomprising image-text-image triplets, where the text describes a modification\nfrom the query image to the target image. However, manual curation of CoIR\ntriplets is expensive and prevents scalability. In this work, we instead\npropose a scalable automatic dataset creation methodology that generates\ntriplets given video-caption pairs, while also expanding the scope of the task\nto include composed video retrieval (CoVR). To this end, we mine paired videos\nwith a similar caption from a large database, and leverage a large language\nmodel to generate the corresponding modification text. Applying this\nmethodology to the extensive WebVid2M collection, we automatically construct\nour WebVid-CoVR dataset, resulting in 1.6 million triplets. Moreover, we\nintroduce a new benchmark for CoVR with a manually annotated evaluation set,\nalong with baseline results. Our experiments further demonstrate that training\na CoVR model on our dataset effectively transfers to CoIR, leading to improved\nstate-of-the-art performance in the zero-shot setup on both the CIRR and\nFashionIQ benchmarks. Our code, datasets, and models are publicly available at\nhttps://imagine.enpc.fr/~ventural/covr.\n","authors":["Lucas Ventura","Antoine Yang","Cordelia Schmid","Gül Varol"],"pdf_url":"https://arxiv.org/pdf/2308.14746v3.pdf","comment":"AAAI 2024, Updated the results on CIRR with the correct evaluation.\n  Project page: Project page: https://imagine.enpc.fr/~ventural/covr/"},{"id":"http://arxiv.org/abs/2405.18416v2","updated":"2024-05-30T11:52:04Z","published":"2024-05-28T17:57:12Z","title":"3D StreetUnveiler with Semantic-Aware 2DGS","summary":"  Unveiling an empty street from crowded observations captured by in-car\ncameras is crucial for autonomous driving. However, removing all temporarily\nstatic objects, such as stopped vehicles and standing pedestrians, presents a\nsignificant challenge. Unlike object-centric 3D inpainting, which relies on\nthorough observation in a small scene, street scene cases involve long\ntrajectories that differ from previous 3D inpainting tasks. The camera-centric\nmoving environment of captured videos further complicates the task due to the\nlimited degree and time duration of object observation. To address these\nobstacles, we introduce StreetUnveiler to reconstruct an empty street.\nStreetUnveiler learns a 3D representation of the empty street from crowded\nobservations. Our representation is based on the hard-label semantic 2D\nGaussian Splatting (2DGS) for its scalability and ability to identify Gaussians\nto be removed. We inpaint rendered image after removing unwanted Gaussians to\nprovide pseudo-labels and subsequently re-optimize the 2DGS. Given its temporal\ncontinuous movement, we divide the empty street scene into observed,\npartial-observed, and unobserved regions, which we propose to locate through a\nrendered alpha map. This decomposition helps us to minimize the regions that\nneed to be inpainted. To enhance the temporal consistency of the inpainting, we\nintroduce a novel time-reversal framework to inpaint frames in reverse order\nand use later frames as references for earlier frames to fully utilize the\nlong-trajectory observations. Our experiments conducted on the street scene\ndataset successfully reconstructed a 3D representation of the empty street. The\nmesh representation of the empty street can be extracted for further\napplications. The project page and more visualizations can be found at:\nhttps://streetunveiler.github.io\n","authors":["Jingwei Xu","Yikai Wang","Yiqun Zhao","Yanwei Fu","Shenghua Gao"],"pdf_url":"https://arxiv.org/pdf/2405.18416v2.pdf","comment":"Project page: https://streetunveiler.github.io"},{"id":"http://arxiv.org/abs/2402.03286v3","updated":"2024-05-30T11:42:15Z","published":"2024-02-05T18:42:34Z","title":"Training-Free Consistent Text-to-Image Generation","summary":"  Text-to-image models offer a new level of creative flexibility by allowing\nusers to guide the image generation process through natural language. However,\nusing these models to consistently portray the same subject across diverse\nprompts remains challenging. Existing approaches fine-tune the model to teach\nit new words that describe specific user-provided subjects or add image\nconditioning to the model. These methods require lengthy per-subject\noptimization or large-scale pre-training. Moreover, they struggle to align\ngenerated images with text prompts and face difficulties in portraying multiple\nsubjects. Here, we present ConsiStory, a training-free approach that enables\nconsistent subject generation by sharing the internal activations of the\npretrained model. We introduce a subject-driven shared attention block and\ncorrespondence-based feature injection to promote subject consistency between\nimages. Additionally, we develop strategies to encourage layout diversity while\nmaintaining subject consistency. We compare ConsiStory to a range of baselines,\nand demonstrate state-of-the-art performance on subject consistency and text\nalignment, without requiring a single optimization step. Finally, ConsiStory\ncan naturally extend to multi-subject scenarios, and even enable training-free\npersonalization for common objects.\n","authors":["Yoad Tewel","Omri Kaduri","Rinon Gal","Yoni Kasten","Lior Wolf","Gal Chechik","Yuval Atzmon"],"pdf_url":"https://arxiv.org/pdf/2402.03286v3.pdf","comment":"Accepted to journal track of SIGGRAPH 2024 (TOG). Project page is at\n  https://consistory-paper.github.io"},{"id":"http://arxiv.org/abs/2306.17574v2","updated":"2024-05-30T11:33:08Z","published":"2023-06-30T11:49:00Z","title":"SpATr: MoCap 3D Human Action Recognition based on Spiral Auto-encoder\n  and Transformer Network","summary":"  Recent technological advancements have significantly expanded the potential\nof human action recognition through harnessing the power of 3D data. This data\nprovides a richer understanding of actions, including depth information that\nenables more accurate analysis of spatial and temporal characteristics. In this\ncontext, We study the challenge of 3D human action recognition.Unlike prior\nmethods, that rely on sampling 2D depth images, skeleton points, or point\nclouds, often leading to substantial memory requirements and the ability to\nhandle only short sequences, we introduce a novel approach for 3D human action\nrecognition, denoted as SpATr (Spiral Auto-encoder and Transformer Network),\nspecifically designed for fixed-topology mesh sequences. The SpATr model\ndisentangles space and time in the mesh sequences. A lightweight auto-encoder,\nbased on spiral convolutions, is employed to extract spatial geometrical\nfeatures from each 3D mesh. These convolutions are lightweight and specifically\ndesigned for fix-topology mesh data. Subsequently, a temporal transformer,\nbased on self-attention, captures the temporal context within the feature\nsequence. The self-attention mechanism enables long-range dependencies\ncapturing and parallel processing, ensuring scalability for long sequences. The\nproposed method is evaluated on three prominent 3D human action datasets:\nBabel, MoVi, and BMLrub, from the Archive of Motion Capture As Surface Shapes\n(AMASS). Our results analysis demonstrates the competitive performance of our\nSpATr model in 3D human action recognition while maintaining efficient memory\nusage. The code and the training results will soon be made publicly available\nat https://github.com/h-bouzid/spatr.\n","authors":["Hamza Bouzid","Lahoucine Ballihi"],"pdf_url":"https://arxiv.org/pdf/2306.17574v2.pdf","comment":"Accepted in CVIU"},{"id":"http://arxiv.org/abs/2405.19957v1","updated":"2024-05-30T11:23:01Z","published":"2024-05-30T11:23:01Z","title":"PLA4D: Pixel-Level Alignments for Text-to-4D Gaussian Splatting","summary":"  As text-conditioned diffusion models (DMs) achieve breakthroughs in image,\nvideo, and 3D generation, the research community's focus has shifted to the\nmore challenging task of text-to-4D synthesis, which introduces a temporal\ndimension to generate dynamic 3D objects. In this context, we identify Score\nDistillation Sampling (SDS), a widely used technique for text-to-3D synthesis,\nas a significant hindrance to text-to-4D performance due to its Janus-faced and\ntexture-unrealistic problems coupled with high computational costs. In this\npaper, we propose \\textbf{P}ixel-\\textbf{L}evel \\textbf{A}lignments for\nText-to-\\textbf{4D} Gaussian Splatting (\\textbf{PLA4D}), a novel method that\nutilizes text-to-video frames as explicit pixel alignment targets to generate\nstatic 3D objects and inject motion into them. Specifically, we introduce Focal\nAlignment to calibrate camera poses for rendering and GS-Mesh Contrastive\nLearning to distill geometry priors from rendered image contrasts at the pixel\nlevel. Additionally, we develop Motion Alignment using a deformation network to\ndrive changes in Gaussians and implement Reference Refinement for smooth 4D\nobject surfaces. These techniques enable 4D Gaussian Splatting to align\ngeometry, texture, and motion with generated videos at the pixel level.\nCompared to previous methods, PLA4D produces synthesized outputs with better\ntexture details in less time and effectively mitigates the Janus-faced problem.\nPLA4D is fully implemented using open-source models, offering an accessible,\nuser-friendly, and promising direction for 4D digital content creation. Our\nproject page:\n\\href{https://github.com/MiaoQiaowei/PLA4D.github.io}{https://github.com/MiaoQiaowei/PLA4D.github.io}.\n","authors":["Qiaowei Miao","Yawei Luo","Yi Yang"],"pdf_url":"https://arxiv.org/pdf/2405.19957v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.06681v2","updated":"2024-05-30T11:16:49Z","published":"2024-03-11T12:56:36Z","title":"Trustworthy Partial Label Learning with Out-of-distribution Detection","summary":"  Partial Label Learning (PLL) grapples with learning from ambiguously labelled\ndata, and it has been successfully applied in fields such as image recognition.\nNevertheless, traditional PLL methods rely on the closed-world assumption,\nwhich can be limiting in open-world scenarios and negatively impact model\nperformance and generalization. To tackle these challenges, our study\nintroduces a novel method called PLL-OOD, which is the first to incorporate\nOut-of-Distribution (OOD) detection into the PLL framework. PLL-OOD\nsignificantly enhances model adaptability and accuracy by merging\nself-supervised learning with partial label loss and pioneering the\nPartial-Energy (PE) score for OOD detection. This approach improves data\nfeature representation and effectively disambiguates candidate labels, using a\ndynamic label confidence matrix to refine predictions. The PE score, adjusted\nby label confidence, precisely identifies OOD instances, optimizing model\ntraining towards in-distribution data. This innovative method markedly boosts\nPLL model robustness and performance in open-world settings. To validate our\napproach, we conducted a comprehensive comparative experiment combining the\nexisting state-of-the-art PLL model with multiple OOD scores on the CIFAR-10\nand CIFAR-100 datasets with various OOD datasets. The results demonstrate that\nthe proposed PLL-OOD framework is highly effective and effectiveness\noutperforms existing models, showcasing its superiority and effectiveness.\n","authors":["Jintao Huang","Yiu-Ming Cheung"],"pdf_url":"https://arxiv.org/pdf/2403.06681v2.pdf","comment":"There are many errors in the Abstract, Introduction, Related Work,\n  Proposed Method, Experiment and References of this paper, which need to be\n  further corrected to avoid misleading. Therefore, it needs to be withdrawn"},{"id":"http://arxiv.org/abs/2405.19949v1","updated":"2024-05-30T11:11:54Z","published":"2024-05-30T11:11:54Z","title":"Hyper-Transformer for Amodal Completion","summary":"  Amodal object completion is a complex task that involves predicting the\ninvisible parts of an object based on visible segments and background\ninformation. Learning shape priors is crucial for effective amodal completion,\nbut traditional methods often rely on two-stage processes or additional\ninformation, leading to inefficiencies and potential error accumulation. To\naddress these shortcomings, we introduce a novel framework named the\nHyper-Transformer Amodal Network (H-TAN). This framework utilizes a hyper\ntransformer equipped with a dynamic convolution head to directly learn shape\npriors and accurately predict amodal masks. Specifically, H-TAN uses a\ndual-branch structure to extract multi-scale features from both images and\nmasks. The multi-scale features from the image branch guide the hyper\ntransformer in learning shape priors and in generating the weights for dynamic\nconvolution tailored to each instance. The dynamic convolution head then uses\nthe features from the mask branch to predict precise amodal masks. We\nextensively evaluate our model on three benchmark datasets: KINS, COCOA-cls,\nand D2SA, where H-TAN demonstrated superior performance compared to existing\nmethods. Additional experiments validate the effectiveness and stability of the\nnovel hyper transformer in our framework.\n","authors":["Jianxiong Gao","Xuelin Qian","Longfei Liang","Junwei Han","Yanwei Fu"],"pdf_url":"https://arxiv.org/pdf/2405.19949v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.19943v1","updated":"2024-05-30T11:03:27Z","published":"2024-05-30T11:03:27Z","title":"Multi-View People Detection in Large Scenes via Supervised View-Wise\n  Contribution Weighting","summary":"  Recent deep learning-based multi-view people detection (MVD) methods have\nshown promising results on existing datasets. However, current methods are\nmainly trained and evaluated on small, single scenes with a limited number of\nmulti-view frames and fixed camera views. As a result, these methods may not be\npractical for detecting people in larger, more complex scenes with severe\nocclusions and camera calibration errors. This paper focuses on improving\nmulti-view people detection by developing a supervised view-wise contribution\nweighting approach that better fuses multi-camera information under large\nscenes. Besides, a large synthetic dataset is adopted to enhance the model's\ngeneralization ability and enable more practical evaluation and comparison. The\nmodel's performance on new testing scenes is further improved with a simple\ndomain adaptation technique. Experimental results demonstrate the effectiveness\nof our approach in achieving promising cross-scene multi-view people detection\nperformance. See code here: https://vcc.tech/research/2024/MVD.\n","authors":["Qi Zhang","Yunfei Gong","Daijie Chen","Antoni B. Chan","Hui Huang"],"pdf_url":"https://arxiv.org/pdf/2405.19943v1.pdf","comment":"AAAI 2024"},{"id":"http://arxiv.org/abs/2405.18762v2","updated":"2024-05-30T10:58:56Z","published":"2024-05-29T05:04:07Z","title":"Inpaint Biases: A Pathway to Accurate and Unbiased Image Generation","summary":"  This paper examines the limitations of advanced text-to-image models in\naccurately rendering unconventional concepts which are scarcely represented or\nabsent in their training datasets. We identify how these limitations not only\nconfine the creative potential of these models but also pose risks of\nreinforcing stereotypes. To address these challenges, we introduce the Inpaint\nBiases framework, which employs user-defined masks and inpainting techniques to\nenhance the accuracy of image generation, particularly for novel or\ninaccurately rendered objects. Through experimental validation, we demonstrate\nhow this framework significantly improves the fidelity of generated images to\nthe user's intent, thereby expanding the models' creative capabilities and\nmitigating the risk of perpetuating biases. Our study contributes to the\nadvancement of text-to-image models as unbiased, versatile tools for creative\nexpression.\n","authors":["Jiyoon Myung","Jihyeon Park"],"pdf_url":"https://arxiv.org/pdf/2405.18762v2.pdf","comment":"Paper accepted in CVPRW 2024"},{"id":"http://arxiv.org/abs/2405.19931v1","updated":"2024-05-30T10:47:48Z","published":"2024-05-30T10:47:48Z","title":"Exploring Diffusion Models' Corruption Stage in Few-Shot Fine-tuning and\n  Mitigating with Bayesian Neural Networks","summary":"  Few-shot fine-tuning of Diffusion Models (DMs) is a key advancement,\nsignificantly reducing training costs and enabling personalized AI\napplications. However, we explore the training dynamics of DMs and observe an\nunanticipated phenomenon: during the training process, image fidelity initially\nimproves, then unexpectedly deteriorates with the emergence of noisy patterns,\nonly to recover later with severe overfitting. We term the stage with generated\nnoisy patterns as corruption stage. To understand this corruption stage, we\nbegin by theoretically modeling the one-shot fine-tuning scenario, and then\nextend this modeling to more general cases. Through this modeling, we identify\nthe primary cause of this corruption stage: a narrowed learning distribution\ninherent in the nature of few-shot fine-tuning. To tackle this, we apply\nBayesian Neural Networks (BNNs) on DMs with variational inference to implicitly\nbroaden the learned distribution, and present that the learning target of the\nBNNs can be naturally regarded as an expectation of the diffusion loss and a\nfurther regularization with the pretrained DMs. This approach is highly\ncompatible with current few-shot fine-tuning methods in DMs and does not\nintroduce any extra inference costs. Experimental results demonstrate that our\nmethod significantly mitigates corruption, and improves the fidelity, quality\nand diversity of the generated images in both object-driven and subject-driven\ngeneration tasks.\n","authors":["Xiaoyu Wu","Jiaru Zhang","Yang Hua","Bohan Lyu","Hao Wang","Tao Song","Haibing Guan"],"pdf_url":"https://arxiv.org/pdf/2405.19931v1.pdf","comment":"Preprint. Under review"},{"id":"http://arxiv.org/abs/2309.15703v3","updated":"2024-05-30T10:46:20Z","published":"2023-09-27T14:46:01Z","title":"Physics-Based Rigid Body Object Tracking and Friction Filtering From\n  RGB-D Videos","summary":"  Physics-based understanding of object interactions from sensory observations\nis an essential capability in augmented reality and robotics. It enables to\ncapture the properties of a scene for simulation and control. In this paper, we\npropose a novel approach for real-to-sim which tracks rigid objects in 3D from\nRGB-D images and infers physical properties of the objects. We use a\ndifferentiable physics simulation as state-transition model in an Extended\nKalman Filter which can model contact and friction for arbitrary mesh-based\nshapes and in this way estimate physically plausible trajectories. We\ndemonstrate that our approach can filter position, orientation, velocities, and\nconcurrently can estimate the coefficient of friction of the objects. We\nanalyze our approach on various sliding scenarios in synthetic image sequences\nof single objects and colliding objects. We also demonstrate and evaluate our\napproach on a real-world dataset. We make our novel benchmark datasets publicly\navailable to foster future research in this novel problem setting and\ncomparison with our method.\n","authors":["Rama Krishna Kandukuri","Michael Strecke","Joerg Stueckler"],"pdf_url":"https://arxiv.org/pdf/2309.15703v3.pdf","comment":"33 pages, 35 figures, accepted for publication at 3DV 2024, includes\n  supplementary material of the conference submission"},{"id":"http://arxiv.org/abs/2405.19203v2","updated":"2024-05-30T10:38:09Z","published":"2024-05-29T15:43:49Z","title":"$E^{3}$Gen: Efficient, Expressive and Editable Avatars Generation","summary":"  This paper aims to introduce 3D Gaussian for efficient, expressive, and\neditable digital avatar generation. This task faces two major challenges: (1)\nThe unstructured nature of 3D Gaussian makes it incompatible with current\ngeneration pipelines; (2) the expressive animation of 3D Gaussian in a\ngenerative setting that involves training with multiple subjects remains\nunexplored. In this paper, we propose a novel avatar generation method named\n$E^3$Gen, to effectively address these challenges. First, we propose a novel\ngenerative UV features plane representation that encodes unstructured 3D\nGaussian onto a structured 2D UV space defined by the SMPL-X parametric model.\nThis novel representation not only preserves the representation ability of the\noriginal 3D Gaussian but also introduces a shared structure among subjects to\nenable generative learning of the diffusion model. To tackle the second\nchallenge, we propose a part-aware deformation module to achieve robust and\naccurate full-body expressive pose control. Extensive experiments demonstrate\nthat our method achieves superior performance in avatar generation and enables\nexpressive full-body pose control and editing. Our project page is\nhttps://olivia23333.github.io/E3Gen.\n","authors":["Weitian Zhang","Yichao Yan","Yunhui Liu","Xingdong Sheng","Xiaokang Yang"],"pdf_url":"https://arxiv.org/pdf/2405.19203v2.pdf","comment":"Project Page: https://olivia23333.github.io/E3Gen"},{"id":"http://arxiv.org/abs/2405.19921v1","updated":"2024-05-30T10:33:14Z","published":"2024-05-30T10:33:14Z","title":"MCDS-VSS: Moving Camera Dynamic Scene Video Semantic Segmentation by\n  Filtering with Self-Supervised Geometry and Motion","summary":"  Autonomous systems, such as self-driving cars, rely on reliable semantic\nenvironment perception for decision making. Despite great advances in video\nsemantic segmentation, existing approaches ignore important inductive biases\nand lack structured and interpretable internal representations. In this work,\nwe propose MCDS-VSS, a structured filter model that learns in a self-supervised\nmanner to estimate scene geometry and ego-motion of the camera, while also\nestimating the motion of external objects. Our model leverages these\nrepresentations to improve the temporal consistency of semantic segmentation\nwithout sacrificing segmentation accuracy. MCDS-VSS follows a prediction-fusion\napproach in which scene geometry and camera motion are first used to compensate\nfor ego-motion, then residual flow is used to compensate motion of dynamic\nobjects, and finally the predicted scene features are fused with the current\nfeatures to obtain a temporally consistent scene segmentation. Our model parses\nautomotive scenes into multiple decoupled interpretable representations such as\nscene geometry, ego-motion, and object motion. Quantitative evaluation shows\nthat MCDS-VSS achieves superior temporal consistency on video sequences while\nretaining competitive segmentation performance.\n","authors":["Angel Villar-Corrales","Moritz Austermann","Sven Behnke"],"pdf_url":"https://arxiv.org/pdf/2405.19921v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.19917v1","updated":"2024-05-30T10:30:07Z","published":"2024-05-30T10:30:07Z","title":"Multimodal Cross-Domain Few-Shot Learning for Egocentric Action\n  Recognition","summary":"  We address a novel cross-domain few-shot learning task (CD-FSL) with\nmultimodal input and unlabeled target data for egocentric action recognition.\nThis paper simultaneously tackles two critical challenges associated with\negocentric action recognition in CD-FSL settings: (1) the extreme domain gap in\negocentric videos (\\eg, daily life vs. industrial domain) and (2) the\ncomputational cost for real-world applications. We propose MM-CDFSL, a\ndomain-adaptive and computationally efficient approach designed to enhance\nadaptability to the target domain and improve inference speed. To address the\nfirst challenge, we propose the incorporation of multimodal distillation into\nthe student RGB model using teacher models. Each teacher model is trained\nindependently on source and target data for its respective modality. Leveraging\nonly unlabeled target data during multimodal distillation enhances the student\nmodel's adaptability to the target domain. We further introduce ensemble masked\ninference, a technique that reduces the number of input tokens through masking.\nIn this approach, ensemble prediction mitigates the performance degradation\ncaused by masking, effectively addressing the second issue. Our approach\noutperformed the state-of-the-art CD-FSL approaches with a substantial margin\non multiple egocentric datasets, improving by an average of 6.12/6.10 points\nfor 1-shot/5-shot settings while achieving $2.2$ times faster inference speed.\nProject page: https://masashi-hatano.github.io/MM-CDFSL/\n","authors":["Masashi Hatano","Ryo Hachiuma","Ryo Fuji","Hideo Saito"],"pdf_url":"https://arxiv.org/pdf/2405.19917v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.19914v1","updated":"2024-05-30T10:25:50Z","published":"2024-05-30T10:25:50Z","title":"Towards RGB-NIR Cross-modality Image Registration and Beyond","summary":"  This paper focuses on the area of RGB(visible)-NIR(near-infrared)\ncross-modality image registration, which is crucial for many downstream vision\ntasks to fully leverage the complementary information present in visible and\ninfrared images. In this field, researchers face two primary challenges - the\nabsence of a correctly-annotated benchmark with viewpoint variations for\nevaluating RGB-NIR cross-modality registration methods and the problem of\ninconsistent local features caused by the appearance discrepancy between\nRGB-NIR cross-modality images. To address these challenges, we first present\nthe RGB-NIR Image Registration (RGB-NIR-IRegis) benchmark, which, for the first\ntime, enables fair and comprehensive evaluations for the task of RGB-NIR\ncross-modality image registration. Evaluations of previous methods highlight\nthe significant challenges posed by our RGB-NIR-IRegis benchmark, especially on\nRGB-NIR image pairs with viewpoint variations. To analyze the causes of the\nunsatisfying performance, we then design several metrics to reveal the toxic\nimpact of inconsistent local features between visible and infrared images on\nthe model performance. This further motivates us to develop a baseline method\nnamed Semantic Guidance Transformer (SGFormer), which utilizes high-level\nsemantic guidance to mitigate the negative impact of local inconsistent\nfeatures. Despite the simplicity of our motivation, extensive experimental\nresults show the effectiveness of our method.\n","authors":["Huadong Li","Shichao Dong","Jin Wang","Rong Fu","Minhao Jing","Jiajun Liang","Haoqiang Fan","Renhe Ji"],"pdf_url":"https://arxiv.org/pdf/2405.19914v1.pdf","comment":"18 pages, 7 figures"},{"id":"http://arxiv.org/abs/2306.00530v3","updated":"2024-05-30T10:18:44Z","published":"2023-06-01T10:29:58Z","title":"CL-MRI: Self-Supervised Contrastive Learning to Improve the Accuracy of\n  Undersampled MRI Reconstruction","summary":"  In Magnetic Resonance Imaging (MRI), image acquisitions are often\nundersampled in the measurement domain to accelerate the scanning process, at\nthe expense of image quality. However, image quality is a crucial factor that\ninfluences the accuracy of clinical diagnosis; hence, high-quality image\nreconstruction from undersampled measurements has been a key area of research.\nRecently, deep learning (DL) methods have emerged as the state-of-the-art for\nMRI reconstruction, typically involving deep neural networks to transform\nundersampled MRI images into high-quality MRI images through data-driven\nprocesses. Nevertheless, there is clear and significant room for improvement in\nundersampled DL MRI reconstruction to meet the high standards required for\nclinical diagnosis, in terms of eliminating aliasing artifacts and reducing\nimage noise. In this paper, we introduce a self-supervised pretraining\nprocedure using contrastive learning to improve the accuracy of undersampled DL\nMRI reconstruction. We use contrastive learning to transform the MRI image\nrepresentations into a latent space that maximizes mutual information among\ndifferent undersampled representations and optimizes the information content at\nthe input of the downstream DL reconstruction models. Our experiments\ndemonstrate improved reconstruction accuracy across a range of acceleration\nfactors and datasets, both quantitatively and qualitatively. Furthermore, our\nextended experiments validate the proposed framework's robustness under\nadversarial conditions, such as measurement noise, different k-space sampling\npatterns, and pathological abnormalities, and also prove the transfer learning\ncapabilities on MRI datasets with completely different anatomy. Additionally,\nwe conducted experiments to visualize and analyze the properties of the\nproposed MRI contrastive learning latent space.\n","authors":["Mevan Ekanayake","Zhifeng Chen","Mehrtash Harandi","Gary Egan","Zhaolin Chen"],"pdf_url":"https://arxiv.org/pdf/2306.00530v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.00618v2","updated":"2024-05-30T10:16:04Z","published":"2024-03-31T09:10:32Z","title":"A Multi-Branched Radial Basis Network Approach to Predicting Complex\n  Chaotic Behaviours","summary":"  In this study, we propose a multi branched network approach to predict the\ndynamics of a physics attractor characterized by intricate and chaotic\nbehavior. We introduce a unique neural network architecture comprised of Radial\nBasis Function (RBF) layers combined with an attention mechanism designed to\neffectively capture nonlinear inter-dependencies inherent in the attractor's\ntemporal evolution. Our results demonstrate successful prediction of the\nattractor's trajectory across 100 predictions made using a real-world dataset\nof 36,700 time-series observations encompassing approximately 28 minutes of\nactivity. To further illustrate the performance of our proposed technique, we\nprovide comprehensive visualizations depicting the attractor's original and\npredicted behaviors alongside quantitative measures comparing observed versus\nestimated outcomes. Overall, this work showcases the potential of advanced\nmachine learning algorithms in elucidating hidden structures in complex\nphysical systems while offering practical applications in various domains\nrequiring accurate short-term forecasting capabilities.\n","authors":["Aarush Sinha"],"pdf_url":"https://arxiv.org/pdf/2404.00618v2.pdf","comment":"9 pages, 6 figures"},{"id":"http://arxiv.org/abs/2402.02407v2","updated":"2024-05-30T10:13:13Z","published":"2024-02-04T08:57:42Z","title":"Defining Neural Network Architecture through Polytope Structures of\n  Dataset","summary":"  Current theoretical and empirical research in neural networks suggests that\ncomplex datasets require large network architectures for thorough\nclassification, yet the precise nature of this relationship remains unclear.\nThis paper tackles this issue by defining upper and lower bounds for neural\nnetwork widths, which are informed by the polytope structure of the dataset in\nquestion. We also delve into the application of these principles to simplicial\ncomplexes and specific manifold shapes, explaining how the requirement for\nnetwork width varies in accordance with the geometric complexity of the\ndataset. Moreover, we develop an algorithm to investigate a converse situation\nwhere the polytope structure of a dataset can be inferred from its\ncorresponding trained neural networks. Through our algorithm, it is established\nthat popular datasets such as MNIST, Fashion-MNIST, and CIFAR10 can be\nefficiently encapsulated using no more than two polytopes with a small number\nof faces.\n","authors":["Sangmin Lee","Abbas Mammadov","Jong Chul Ye"],"pdf_url":"https://arxiv.org/pdf/2402.02407v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.19899v1","updated":"2024-05-30T09:55:19Z","published":"2024-05-30T09:55:19Z","title":"Open-Set Domain Adaptation for Semantic Segmentation","summary":"  Unsupervised domain adaptation (UDA) for semantic segmentation aims to\ntransfer the pixel-wise knowledge from the labeled source domain to the\nunlabeled target domain. However, current UDA methods typically assume a shared\nlabel space between source and target, limiting their applicability in\nreal-world scenarios where novel categories may emerge in the target domain. In\nthis paper, we introduce Open-Set Domain Adaptation for Semantic Segmentation\n(OSDA-SS) for the first time, where the target domain includes unknown classes.\nWe identify two major problems in the OSDA-SS scenario as follows: 1) the\nexisting UDA methods struggle to predict the exact boundary of the unknown\nclasses, and 2) they fail to accurately predict the shape of the unknown\nclasses. To address these issues, we propose Boundary and Unknown Shape-Aware\nopen-set domain adaptation, coined BUS. Our BUS can accurately discern the\nboundaries between known and unknown classes in a contrastive manner using a\nnovel dilation-erosion-based contrastive loss. In addition, we propose\nOpenReMix, a new domain mixing augmentation method that guides our model to\neffectively learn domain and size-invariant features for improving the shape\ndetection of the known and unknown classes. Through extensive experiments, we\ndemonstrate that our proposed BUS effectively detects unknown classes in the\nchallenging OSDA-SS scenario compared to the previous methods by a large\nmargin. The code is available at https://github.com/KHU-AGI/BUS.\n","authors":["Seun-An Choe","Ah-Hyung Shin","Keon-Hee Park","Jinwoo Choi","Gyeong-Moon Park"],"pdf_url":"https://arxiv.org/pdf/2405.19899v1.pdf","comment":"14 pages, 5 figures, 13 tables, CVPR 2024 Poster"},{"id":"http://arxiv.org/abs/2403.16539v2","updated":"2024-05-30T09:42:26Z","published":"2024-03-25T08:31:14Z","title":"Data-Efficient 3D Visual Grounding via Order-Aware Referring","summary":"  3D visual grounding aims to identify the target object within a 3D point\ncloud scene referred to by a natural language description. Previous works\nusually require significant data relating to point color and their descriptions\nto exploit the corresponding complicated verbo-visual relations. In our work,\nwe introduce Vigor, a novel Data-Efficient 3D Visual Grounding framework via\nOrder-aware Referring. Vigor leverages LLM to produce a desirable referential\norder from the input description for 3D visual grounding. With the proposed\nstacked object-referring blocks, the predicted anchor objects in the above\norder allow one to locate the target object progressively without supervision\non the identities of anchor objects or exact relations between anchor/target\nobjects. In addition, we present an order-aware warm-up training strategy,\nwhich augments referential orders for pre-training the visual grounding\nframework. This allows us to better capture the complex verbo-visual relations\nand benefit the desirable data-efficient learning scheme. Experimental results\non the NR3D and ScanRefer datasets demonstrate our superiority in low-resource\nscenarios. In particular, Vigor surpasses current state-of-the-art frameworks\nby 9.3% and 7.6% grounding accuracy under 1% data and 10% data settings on the\nNR3D dataset, respectively.\n","authors":["Tung-Yu Wu","Sheng-Yu Huang","Yu-Chiang Frank Wang"],"pdf_url":"https://arxiv.org/pdf/2403.16539v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.19882v1","updated":"2024-05-30T09:41:10Z","published":"2024-05-30T09:41:10Z","title":"PixOOD: Pixel-Level Out-of-Distribution Detection","summary":"  We propose a dense image prediction out-of-distribution detection algorithm,\ncalled PixOOD, which does not require training on samples of anomalous data and\nis not designed for a specific application which avoids traditional training\nbiases. In order to model the complex intra-class variability of the\nin-distribution data at the pixel level, we propose an online data condensation\nalgorithm which is more robust than standard K-means and is easily trainable\nthrough SGD. We evaluate PixOOD on a wide range of problems. It achieved\nstate-of-the-art results on four out of seven datasets, while being competitive\non the rest. The source code is available at https://github.com/vojirt/PixOOD.\n","authors":["Tomáš Vojíř","Jan Šochman","Jiří Matas"],"pdf_url":"https://arxiv.org/pdf/2405.19882v1.pdf","comment":"under review at ECCV 2024"},{"id":"http://arxiv.org/abs/2405.19876v1","updated":"2024-05-30T09:30:28Z","published":"2024-05-30T09:30:28Z","title":"IReNe: Instant Recoloring in Neural Radiance Fields","summary":"  Advances in NERFs have allowed for 3D scene reconstructions and novel view\nsynthesis. Yet, efficiently editing these representations while retaining\nphotorealism is an emerging challenge. Recent methods face three primary\nlimitations: they're slow for interactive use, lack precision at object\nboundaries, and struggle to ensure multi-view consistency. We introduce IReNe\nto address these limitations, enabling swift, near real-time color editing in\nNeRF. Leveraging a pre-trained NeRF model and a single training image with\nuser-applied color edits, IReNe swiftly adjusts network parameters in seconds.\nThis adjustment allows the model to generate new scene views, accurately\nrepresenting the color changes from the training image while also controlling\nobject boundaries and view-specific effects. Object boundary control is\nachieved by integrating a trainable segmentation module into the model. The\nprocess gains efficiency by retraining only the weights of the last network\nlayer. We observed that neurons in this layer can be classified into those\nresponsible for view-dependent appearance and those contributing to diffuse\nappearance. We introduce an automated classification approach to identify these\nneuron types and exclusively fine-tune the weights of the diffuse neurons. This\nfurther accelerates training and ensures consistent color edits across\ndifferent views. A thorough validation on a new dataset, with edited object\ncolors, shows significant quantitative and qualitative advancements over\ncompetitors, accelerating speeds by 5x to 500x.\n","authors":["Alessio Mazzucchelli","Adrian Garcia-Garcia","Elena Garces","Fernando Rivas-Manzaneque","Francesc Moreno-Noguer","Adrian Penate-Sanchez"],"pdf_url":"https://arxiv.org/pdf/2405.19876v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.03917v3","updated":"2024-05-30T09:15:06Z","published":"2024-02-06T11:35:02Z","title":"Elastic Feature Consolidation for Cold Start Exemplar-Free Incremental\n  Learning","summary":"  Exemplar-Free Class Incremental Learning (EFCIL) aims to learn from a\nsequence of tasks without having access to previous task data. In this paper,\nwe consider the challenging Cold Start scenario in which insufficient data is\navailable in the first task to learn a high-quality backbone. This is\nespecially challenging for EFCIL since it requires high plasticity, which\nresults in feature drift which is difficult to compensate for in the\nexemplar-free setting. To address this problem, we propose a simple and\neffective approach that consolidates feature representations by regularizing\ndrift in directions highly relevant to previous tasks and employs prototypes to\nreduce task-recency bias. Our method, called Elastic Feature Consolidation\n(EFC), exploits a tractable second-order approximation of feature drift based\non an Empirical Feature Matrix (EFM). The EFM induces a pseudo-metric in\nfeature space which we use to regularize feature drift in important directions\nand to update Gaussian prototypes used in a novel asymmetric cross entropy loss\nwhich effectively balances prototype rehearsal with data from new tasks.\nExperimental results on CIFAR-100, Tiny-ImageNet, ImageNet-Subset and\nImageNet-1K demonstrate that Elastic Feature Consolidation is better able to\nlearn new tasks by maintaining model plasticity and significantly outperform\nthe state-of-the-art.\n","authors":["Simone Magistri","Tomaso Trinci","Albin Soutif-Cormerais","Joost van de Weijer","Andrew D. Bagdanov"],"pdf_url":"https://arxiv.org/pdf/2402.03917v3.pdf","comment":"Accepted at Twelfth International Conference on Learning\n  Representations (ICLR 2024)"},{"id":"http://arxiv.org/abs/2405.19861v1","updated":"2024-05-30T09:10:33Z","published":"2024-05-30T09:10:33Z","title":"Hierarchical Object-Centric Learning with Capsule Networks","summary":"  Capsule networks (CapsNets) were introduced to address convolutional neural\nnetworks limitations, learning object-centric representations that are more\nrobust, pose-aware, and interpretable. They organize neurons into groups called\ncapsules, where each capsule encodes the instantiation parameters of an object\nor one of its parts. Moreover, a routing algorithm connects capsules in\ndifferent layers, thereby capturing hierarchical part-whole relationships in\nthe data.\n  This thesis investigates the intriguing aspects of CapsNets and focuses on\nthree key questions to unlock their full potential. First, we explore the\neffectiveness of the routing algorithm, particularly in small-sized networks.\nWe propose a novel method that anneals the number of routing iterations during\ntraining, enhancing performance in architectures with fewer parameters.\n  Secondly, we investigate methods to extract more effective first-layer\ncapsules, also known as primary capsules. By exploiting pruned backbones, we\naim to improve computational efficiency by reducing the number of capsules\nwhile achieving high generalization. This approach reduces CapsNets memory\nrequirements and computational effort.\n  Third, we explore part-relationship learning in CapsNets. Through extensive\nresearch, we demonstrate that capsules with low entropy can extract more\nconcise and discriminative part-whole relationships compared to traditional\ncapsule networks, even with reasonable network sizes.\n  Lastly, we showcase how CapsNets can be utilized in real-world applications,\nincluding autonomous localization of unmanned aerial vehicles, quaternion-based\nrotations prediction in synthetic datasets, and lung nodule segmentation in\nbiomedical imaging.\n  The findings presented in this thesis contribute to a deeper understanding of\nCapsNets and highlight their potential to address complex computer vision\nchallenges.\n","authors":["Riccardo Renzulli"],"pdf_url":"https://arxiv.org/pdf/2405.19861v1.pdf","comment":"Updated version of my PhD thesis (Nov 2023), with fixed typos. Will\n  keep updated as new typos are discovered!"},{"id":"http://arxiv.org/abs/2405.19092v2","updated":"2024-05-30T09:06:07Z","published":"2024-05-29T13:54:12Z","title":"Benchmarking and Improving Detail Image Caption","summary":"  Image captioning has long been regarded as a fundamental task in visual\nunderstanding. Recently, however, few large vision-language model (LVLM)\nresearch discusses model's image captioning performance because of the outdated\nshort-caption benchmarks and unreliable evaluation metrics. In this work, we\npropose to benchmark detail image caption task by curating high-quality\nevaluation datasets annotated by human experts, GPT-4V and Gemini-1.5-Pro. We\nalso design a more reliable caption evaluation metric called CAPTURE (CAPtion\nevaluation by exTracting and coUpling coRE information). CAPTURE extracts\nvisual elements, e.g., objects, attributes and relations from captions, and\nthen matches these elements through three stages, achieving the highest\nconsistency with expert judgements over other rule-based or model-based caption\nmetrics. The proposed benchmark and metric provide reliable evaluation for\nLVLM's detailed image captioning ability. Guided by this evaluation, we further\nexplore to unleash LVLM's detail caption capabilities by synthesizing\nhigh-quality data through a five-stage data construction pipeline. Our pipeline\nonly uses a given LVLM itself and other open-source tools, without any human or\nGPT-4V annotation in the loop. Experiments show that the proposed data\nconstruction strategy significantly improves model-generated detail caption\ndata quality for LVLMs with leading performance, and the data quality can be\nfurther improved in a self-looping paradigm. All code and dataset will be\npublicly available at https://github.com/foundation-multimodal-models/CAPTURE.\n","authors":["Hongyuan Dong","Jiawen Li","Bohong Wu","Jiacong Wang","Yuan Zhang","Haoyuan Guo"],"pdf_url":"https://arxiv.org/pdf/2405.19092v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.19854v1","updated":"2024-05-30T09:03:23Z","published":"2024-05-30T09:03:23Z","title":"RTGen: Generating Region-Text Pairs for Open-Vocabulary Object Detection","summary":"  Open-vocabulary object detection (OVD) requires solid modeling of the\nregion-semantic relationship, which could be learned from massive region-text\npairs. However, such data is limited in practice due to significant annotation\ncosts. In this work, we propose RTGen to generate scalable open-vocabulary\nregion-text pairs and demonstrate its capability to boost the performance of\nopen-vocabulary object detection. RTGen includes both text-to-region and\nregion-to-text generation processes on scalable image-caption data. The\ntext-to-region generation is powered by image inpainting, directed by our\nproposed scene-aware inpainting guider for overall layout harmony. For\nregion-to-text generation, we perform multiple region-level image captioning\nwith various prompts and select the best matching text according to CLIP\nsimilarity. To facilitate detection training on region-text pairs, we also\nintroduce a localization-aware region-text contrastive loss that learns object\nproposals tailored with different localization qualities. Extensive experiments\ndemonstrate that our RTGen can serve as a scalable, semantically rich, and\neffective source for open-vocabulary object detection and continue to improve\nthe model performance when more data is utilized, delivering superior\nperformance compared to the existing state-of-the-art methods.\n","authors":["Fangyi Chen","Han Zhang","Zhantao Yang","Hao Chen","Kai Hu","Marios Savvides"],"pdf_url":"https://arxiv.org/pdf/2405.19854v1.pdf","comment":"Technical report"},{"id":"http://arxiv.org/abs/2405.19833v1","updated":"2024-05-30T08:44:12Z","published":"2024-05-30T08:44:12Z","title":"KITRO: Refining Human Mesh by 2D Clues and Kinematic-tree Rotation","summary":"  2D keypoints are commonly used as an additional cue to refine estimated 3D\nhuman meshes. Current methods optimize the pose and shape parameters with a\nreprojection loss on the provided 2D keypoints. Such an approach, while simple\nand intuitive, has limited effectiveness because the optimal solution is hard\nto find in ambiguous parameter space and may sacrifice depth. Additionally,\ndivergent gradients from distal joints complicate and deviate the refinement of\nproximal joints in the kinematic chain. To address these, we introduce\nKinematic-Tree Rotation (KITRO), a novel mesh refinement strategy that\nexplicitly models depth and human kinematic-tree structure. KITRO treats\nrefinement from a bone-wise perspective. Unlike previous methods which perform\ngradient-based optimizations, our method calculates bone directions in closed\nform. By accounting for the 2D pose, bone length, and parent joint's depth, the\ncalculation results in two possible directions for each child joint. We then\nuse a decision tree to trace binary choices for all bones along the human\nskeleton's kinematic-tree to select the most probable hypothesis. Our\nexperiments across various datasets and baseline models demonstrate that KITRO\nsignificantly improves 3D joint estimation accuracy and achieves an ideal 2D\nfit simultaneously. Our code available at: https://github.com/MartaYang/KITRO.\n","authors":["Fengyuan Yang","Kerui Gu","Angela Yao"],"pdf_url":"https://arxiv.org/pdf/2405.19833v1.pdf","comment":"Accepted by CVPR24"},{"id":"http://arxiv.org/abs/2210.14562v2","updated":"2024-05-30T08:38:50Z","published":"2022-10-26T08:46:24Z","title":"FairCLIP: Social Bias Elimination based on Attribute Prototype Learning\n  and Representation Neutralization","summary":"  The Vision-Language Pre-training (VLP) models like CLIP have gained\npopularity in recent years. However, many works found that the social biases\nhidden in CLIP easily manifest in downstream tasks, especially in image\nretrieval, which can have harmful effects on human society. In this work, we\npropose FairCLIP to eliminate the social bias in CLIP-based image retrieval\nwithout damaging the retrieval performance achieving the compatibility between\nthe debiasing effect and the retrieval performance. FairCLIP is divided into\ntwo steps: Attribute Prototype Learning (APL) and Representation Neutralization\n(RN). In the first step, we extract the concepts needed for debiasing in CLIP.\nWe use the query with learnable word vector prefixes as the extraction\nstructure. In the second step, we first divide the attributes into target and\nbias attributes. By analysis, we find that both attributes have an impact on\nthe bias. Therefore, we try to eliminate the bias by using Re-Representation\nMatrix (RRM) to achieve the neutralization of the representation. We compare\nthe debiasing effect and retrieval performance with other methods, and\nexperiments demonstrate that FairCLIP can achieve the best compatibility.\nAlthough FairCLIP is used to eliminate bias in image retrieval, it achieves the\nneutralization of the representation which is common to all CLIP downstream\ntasks. This means that FairCLIP can be applied as a general debiasing method\nfor other fairness issues related to CLIP.\n","authors":["Junyang Wang","Yi Zhang","Jitao Sang"],"pdf_url":"https://arxiv.org/pdf/2210.14562v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.19822v1","updated":"2024-05-30T08:31:01Z","published":"2024-05-30T08:31:01Z","title":"Improving Object Detector Training on Synthetic Data by Starting With a\n  Strong Baseline Methodology","summary":"  Collecting and annotating real-world data for the development of object\ndetection models is a time-consuming and expensive process. In the military\ndomain in particular, data collection can also be dangerous or infeasible.\nTraining models on synthetic data may provide a solution for cases where access\nto real-world training data is restricted. However, bridging the reality gap\nbetween synthetic and real data remains a challenge. Existing methods usually\nbuild on top of baseline Convolutional Neural Network (CNN) models that have\nbeen shown to perform well when trained on real data, but have limited ability\nto perform well when trained on synthetic data. For example, some architectures\nallow for fine-tuning with the expectation of large quantities of training data\nand are prone to overfitting on synthetic data. Related work usually ignores\nvarious best practices from object detection on real data, e.g. by training on\nsynthetic data from a single environment with relatively little variation. In\nthis paper we propose a methodology for improving the performance of a\npre-trained object detector when training on synthetic data. Our approach\nfocuses on extracting the salient information from synthetic data without\nforgetting useful features learned from pre-training on real images. Based on\nthe state of the art, we incorporate data augmentation methods and a\nTransformer backbone. Besides reaching relatively strong performance without\nany specialized synthetic data transfer methods, we show that our methods\nimprove the state of the art on synthetic data trained object detection for the\nRarePlanes and DGTA-VisDrone datasets, and reach near-perfect performance on an\nin-house vehicle detection dataset.\n","authors":["Frank A. Ruis","Alma M. Liezenga","Friso G. Heslinga","Luca Ballan","Thijs A. Eker","Richard J. M. den Hollander","Martin C. van Leeuwen","Judith Dijk","Wyke Huizinga"],"pdf_url":"https://arxiv.org/pdf/2405.19822v1.pdf","comment":"Submitted to and presented at SPIE Defense + Commercial Sensing 2024,\n  13 pages, 4 figures, 3 tables"},{"id":"http://arxiv.org/abs/2405.17825v2","updated":"2024-05-30T08:28:32Z","published":"2024-05-28T04:47:54Z","title":"Diffusion Model Patching via Mixture-of-Prompts","summary":"  We present Diffusion Model Patching (DMP), a simple method to boost the\nperformance of pre-trained diffusion models that have already reached\nconvergence, with a negligible increase in parameters. DMP inserts a small,\nlearnable set of prompts into the model's input space while keeping the\noriginal model frozen. The effectiveness of DMP is not merely due to the\naddition of parameters but stems from its dynamic gating mechanism, which\nselects and combines a subset of learnable prompts at every step of the\ngenerative process (e.g., reverse denoising steps). This strategy, which we\nterm \"mixture-of-prompts\", enables the model to draw on the distinct expertise\nof each prompt, essentially \"patching\" the model's functionality at every step\nwith minimal yet specialized parameters. Uniquely, DMP enhances the model by\nfurther training on the same dataset on which it was originally trained, even\nin a scenario where significant improvements are typically not expected due to\nmodel convergence. Experiments show that DMP significantly enhances the\nconverged FID of DiT-L/2 on FFHQ 256x256 by 10.38%, achieved with only a 1.43%\nparameter increase and 50K additional training iterations.\n","authors":["Seokil Ham","Sangmin Woo","Jin-Young Kim","Hyojun Go","Byeongjun Park","Changick Kim"],"pdf_url":"https://arxiv.org/pdf/2405.17825v2.pdf","comment":"Project page: https://sangminwoo.github.io/DMP/"},{"id":"http://arxiv.org/abs/2405.19819v1","updated":"2024-05-30T08:26:47Z","published":"2024-05-30T08:26:47Z","title":"Gated Fields: Learning Scene Reconstruction from Gated Videos","summary":"  Reconstructing outdoor 3D scenes from temporal observations is a challenge\nthat recent work on neural fields has offered a new avenue for. However,\nexisting methods that recover scene properties, such as geometry, appearance,\nor radiance, solely from RGB captures often fail when handling poorly-lit or\ntexture-deficient regions. Similarly, recovering scenes with scanning LiDAR\nsensors is also difficult due to their low angular sampling rate which makes\nrecovering expansive real-world scenes difficult. Tackling these gaps, we\nintroduce Gated Fields - a neural scene reconstruction method that utilizes\nactive gated video sequences. To this end, we propose a neural rendering\napproach that seamlessly incorporates time-gated capture and illumination. Our\nmethod exploits the intrinsic depth cues in the gated videos, achieving precise\nand dense geometry reconstruction irrespective of ambient illumination\nconditions. We validate the method across day and night scenarios and find that\nGated Fields compares favorably to RGB and LiDAR reconstruction methods. Our\ncode and datasets are available at https://light.princeton.edu/gatedfields/.\n","authors":["Andrea Ramazzina","Stefanie Walz","Pragyan Dahal","Mario Bijelic","Felix Heide"],"pdf_url":"https://arxiv.org/pdf/2405.19819v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.19818v1","updated":"2024-05-30T08:25:21Z","published":"2024-05-30T08:25:21Z","title":"WebUOT-1M: Advancing Deep Underwater Object Tracking with A\n  Million-Scale Benchmark","summary":"  Underwater object tracking (UOT) is a foundational task for identifying and\ntracing submerged entities in underwater video sequences. However, current UOT\ndatasets suffer from limitations in scale, diversity of target categories and\nscenarios covered, hindering the training and evaluation of modern tracking\nalgorithms. To bridge this gap, we take the first step and introduce WebUOT-1M,\n\\ie, the largest public UOT benchmark to date, sourced from complex and\nrealistic underwater environments. It comprises 1.1 million frames across 1,500\nvideo clips filtered from 408 target categories, largely surpassing previous\nUOT datasets, \\eg, UVOT400. Through meticulous manual annotation and\nverification, we provide high-quality bounding boxes for underwater targets.\nAdditionally, WebUOT-1M includes language prompts for video sequences,\nexpanding its application areas, \\eg, underwater vision-language tracking. Most\nexisting trackers are tailored for open-air environments, leading to\nperformance degradation when applied to UOT due to domain gaps. Retraining and\nfine-tuning these trackers are challenging due to sample imbalances and limited\nreal-world underwater datasets. To tackle these challenges, we propose a novel\nomni-knowledge distillation framework based on WebUOT-1M, incorporating various\nstrategies to guide the learning of the student Transformer. To the best of our\nknowledge, this framework is the first to effectively transfer open-air domain\nknowledge to the UOT model through knowledge distillation, as demonstrated by\nresults on both existing UOT datasets and the newly proposed WebUOT-1M.\nFurthermore, we comprehensively evaluate WebUOT-1M using 30 deep trackers,\nshowcasing its value as a benchmark for UOT research by presenting new\nchallenges and opportunities for future studies. The complete dataset, codes\nand tracking results, will be made publicly available.\n","authors":["Chunhui Zhang","Li Liu","Guanjie Huang","Hao Wen","Xi Zhou","Yanfeng Wang"],"pdf_url":"https://arxiv.org/pdf/2405.19818v1.pdf","comment":"GitHub project:\n  https://github.com/983632847/Awesome-Multimodal-Object-Tracking"},{"id":"http://arxiv.org/abs/2405.19817v1","updated":"2024-05-30T08:24:00Z","published":"2024-05-30T08:24:00Z","title":"Performance Examination of Symbolic Aggregate Approximation in IoT\n  Applications","summary":"  Symbolic Aggregate approXimation (SAX) is a common dimensionality reduction\napproach for time-series data which has been employed in a variety of domains,\nincluding classification and anomaly detection in time-series data. Domains\nalso include shape recognition where the shape outline is converted into\ntime-series data forinstance epoch classification of archived arrowheads. In\nthis paper we propose a dimensionality reduction and shape recognition approach\nbased on the SAX algorithm, an application which requires responses on cost\nefficient, IoT-like, platforms. The challenge is largely dealing with the\ncomputational expense of the SAX algorithm in IoT-like applications, from\nsimple time-series dimension reduction through shape recognition. The approach\nis based on lowering the dimensional space while capturing and preserving the\nmost representative features of the shape. We present three scenarios of\nincreasing computational complexity backing up our statements with measurement\nof performance characteristics\n","authors":["Suzana Veljanovska","Hans Dermot Doran"],"pdf_url":"https://arxiv.org/pdf/2405.19817v1.pdf","comment":"Embedded World Conference, Nuremberg, 2024"},{"id":"http://arxiv.org/abs/2405.19794v1","updated":"2024-05-30T08:02:05Z","published":"2024-05-30T08:02:05Z","title":"Video Question Answering for People with Visual Impairments Using an\n  Egocentric 360-Degree Camera","summary":"  This paper addresses the daily challenges encountered by visually impaired\nindividuals, such as limited access to information, navigation difficulties,\nand barriers to social interaction. To alleviate these challenges, we introduce\na novel visual question answering dataset. Our dataset offers two significant\nadvancements over previous datasets: Firstly, it features videos captured using\na 360-degree egocentric wearable camera, enabling observation of the entire\nsurroundings, departing from the static image-centric nature of prior datasets.\nSecondly, unlike datasets centered on singular challenges, ours addresses\nmultiple real-life obstacles simultaneously through an innovative\nvisual-question answering framework. We validate our dataset using various\nstate-of-the-art VideoQA methods and diverse metrics. Results indicate that\nwhile progress has been made, satisfactory performance levels for AI-powered\nassistive services remain elusive for visually impaired individuals.\nAdditionally, our evaluation highlights the distinctive features of the\nproposed dataset, featuring ego-motion in videos captured via 360-degree\ncameras across varied scenarios.\n","authors":["Inpyo Song","Minjun Joo","Joonhyung Kwon","Jangwon Lee"],"pdf_url":"https://arxiv.org/pdf/2405.19794v1.pdf","comment":"CVPR2024 EgoVis Workshop"},{"id":"http://arxiv.org/abs/2403.16794v2","updated":"2024-05-30T07:53:21Z","published":"2024-03-25T14:13:09Z","title":"CurbNet: Curb Detection Framework Based on LiDAR Point Cloud\n  Segmentation","summary":"  Curb detection is a crucial function in intelligent driving, essential for\ndetermining drivable areas on the road. However, the complexity of road\nenvironments makes curb detection challenging. This paper introduces CurbNet, a\nnovel framework for curb detection utilizing point cloud segmentation. To\naddress the lack of comprehensive curb datasets with 3D annotations, we have\ndeveloped the 3D-Curb dataset based on SemanticKITTI, currently the largest and\nmost diverse collection of curb point clouds. Recognizing that the primary\ncharacteristic of curbs is height variation, our approach leverages spatially\nrich 3D point clouds for training. To tackle the challenges posed by the uneven\ndistribution of curb features on the xy-plane and their dependence on\nhigh-frequency features along the z-axis, we introduce the Multi-Scale and\nChannel Attention (MSCA) module, a customized solution designed to optimize\ndetection performance. Additionally, we propose an adaptive weighted loss\nfunction group specifically formulated to counteract the imbalance in the\ndistribution of curb point clouds relative to other categories. Extensive\nexperiments conducted on 2 major datasets demonstrate that our method surpasses\nexisting benchmarks set by leading curb detection and point cloud segmentation\nmodels. Through the post-processing refinement of the detection results, we\nhave significantly reduced noise in curb detection, thereby improving precision\nby 4.5 points. Similarly, our tolerance experiments also achieved\nstate-of-the-art results. Furthermore, real-world experiments and dataset\nanalyses mutually validate each other, reinforcing CurbNet's superior detection\ncapability and robust generalizability. The project website is available at:\nhttps://github.com/guoyangzhao/CurbNet/.\n","authors":["Guoyang Zhao","Fulong Ma","Weiqing Qi","Yuxuan Liu","Ming Liu"],"pdf_url":"https://arxiv.org/pdf/2403.16794v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.19783v1","updated":"2024-05-30T07:48:32Z","published":"2024-05-30T07:48:32Z","title":"Instruction-Guided Visual Masking","summary":"  Instruction following is crucial in contemporary LLM. However, when extended\nto multimodal setting, it often suffers from misalignment between specific\ntextual instruction and targeted local region of an image. To achieve more\naccurate and nuanced multimodal instruction following, we introduce\nInstruction-guided Visual Masking (IVM), a new versatile visual grounding model\nthat is compatible with diverse multimodal models, such as LMM and robot model.\nBy constructing visual masks for instruction-irrelevant regions, IVM-enhanced\nmultimodal models can effectively focus on task-relevant image regions to\nbetter align with complex instructions. Specifically, we design a visual\nmasking data generation pipeline and create an IVM-Mix-1M dataset with 1\nmillion image-instruction pairs. We further introduce a new learning technique,\nDiscriminator Weighted Supervised Learning (DWSL) for preferential IVM training\nthat prioritizes high-quality data samples. Experimental results on generic\nmultimodal tasks such as VQA and embodied robotic control demonstrate the\nversatility of IVM, which as a plug-and-play tool, significantly boosts the\nperformance of diverse multimodal models, yielding new state-of-the-art results\nacross challenging multimodal benchmarks. Code is available at\nhttps://github.com/2toinf/IVM.\n","authors":["Jinliang Zheng","Jianxiong Li","Sijie Cheng","Yinan Zheng","Jiaming Li","Jihao Liu","Yu Liu","Jingjing Liu","Xianyuan Zhan"],"pdf_url":"https://arxiv.org/pdf/2405.19783v1.pdf","comment":"preprint, 21 pages"},{"id":"http://arxiv.org/abs/2405.19775v1","updated":"2024-05-30T07:41:07Z","published":"2024-05-30T07:41:07Z","title":"Puff-Net: Efficient Style Transfer with Pure Content and Style Feature\n  Fusion Network","summary":"  Style transfer aims to render an image with the artistic features of a style\nimage, while maintaining the original structure. Various methods have been put\nforward for this task, but some challenges still exist. For instance, it is\ndifficult for CNN-based methods to handle global information and long-range\ndependencies between input images, for which transformer-based methods have\nbeen proposed. Although transformers can better model the relationship between\ncontent and style images, they require high-cost hardware and time-consuming\ninference. To address these issues, we design a novel transformer model that\nincludes only the encoder, thus significantly reducing the computational cost.\nIn addition, we also find that existing style transfer methods may lead to\nimages under-stylied or missing content. In order to achieve better\nstylization, we design a content feature extractor and a style feature\nextractor, based on which pure content and style images can be fed to the\ntransformer. Finally, we propose a novel network termed Puff-Net, i.e., pure\ncontent and style feature fusion network. Through qualitative and quantitative\nexperiments, we demonstrate the advantages of our model compared to\nstate-of-the-art ones in the literature.\n","authors":["Sizhe Zheng","Pan Gao","Peng Zhou","Jie Qin"],"pdf_url":"https://arxiv.org/pdf/2405.19775v1.pdf","comment":"11 pages, 11 figures, to be published in IEEE Conference on Computer\n  Vision and Pattern Recognition (CVPR 2024)"},{"id":"http://arxiv.org/abs/2405.19773v1","updated":"2024-05-30T07:38:58Z","published":"2024-05-30T07:38:58Z","title":"VQA Training Sets are Self-play Environments for Generating Few-shot\n  Pools","summary":"  Large-language models and large-vision models are increasingly capable of\nsolving compositional reasoning tasks, as measured by breakthroughs in\nvisual-question answering benchmarks. However, state-of-the-art solutions often\ninvolve careful construction of large pre-training and fine-tuning datasets,\nwhich can be expensive. The use of external tools, whether other ML models,\nsearch engines, or APIs, can significantly improve performance by breaking down\nhigh-level reasoning questions into sub-questions that are answerable by\nindividual tools, but this approach has similar dataset construction costs to\nteach fine-tuned models how to use the available tools. We propose a technique\nin which existing training sets can be directly used for constructing\ncomputational environments with task metrics as rewards. This enables a model\nto autonomously teach itself to use itself or another model as a tool. By doing\nso, we augment training sets by integrating external signals. The proposed\nmethod starts with zero-shot prompts and iteratively refines them by selecting\nfew-shot examples that maximize the task metric on the training set. Our\nexperiments showcase how Gemini learns how to use itself, or another smaller\nand specialized model such as ScreenAI, to iteratively improve performance on\ntraining sets. Our approach successfully generalizes and improves upon zeroshot\nperformance on charts, infographics, and document visual question-answering\ndatasets\n","authors":["Tautvydas Misiunas","Hassan Mansoor","Jasper Uijlings","Oriana Riva","Victor Carbune"],"pdf_url":"https://arxiv.org/pdf/2405.19773v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.19769v1","updated":"2024-05-30T07:34:05Z","published":"2024-05-30T07:34:05Z","title":"All-In-One Medical Image Restoration via Task-Adaptive Routing","summary":"  Although single-task medical image restoration (MedIR) has witnessed\nremarkable success, the limited generalizability of these methods poses a\nsubstantial obstacle to wider application. In this paper, we focus on the task\nof all-in-one medical image restoration, aiming to address multiple distinct\nMedIR tasks with a single universal model. Nonetheless, due to significant\ndifferences between different MedIR tasks, training a universal model often\nencounters task interference issues, where different tasks with shared\nparameters may conflict with each other in the gradient update direction. This\ntask interference leads to deviation of the model update direction from the\noptimal path, thereby affecting the model's performance. To tackle this issue,\nwe propose a task-adaptive routing strategy, allowing conflicting tasks to\nselect different network paths in spatial and channel dimensions, thereby\nmitigating task interference. Experimental results demonstrate that our\nproposed \\textbf{A}ll-in-one \\textbf{M}edical \\textbf{I}mage\n\\textbf{R}estoration (\\textbf{AMIR}) network achieves state-of-the-art\nperformance in three MedIR tasks: MRI super-resolution, CT denoising, and PET\nsynthesis, both in single-task and all-in-one settings. The code and data will\nbe available at\n\\href{https://github.com/Yaziwel/All-In-One-Medical-Image-Restoration-via-Task-Adaptive-Routing.git}{https://github.com/Yaziwel/AMIR}.\n","authors":["Zhiwen Yang","Haowei Chen","Ziniu Qian","Yang Yi","Hui Zhang","Dan Zhao","Bingzheng Wei","Yan Xu"],"pdf_url":"https://arxiv.org/pdf/2405.19769v1.pdf","comment":"This article has been early accepted by MICCAI 2024"},{"id":"http://arxiv.org/abs/2405.19765v1","updated":"2024-05-30T07:25:23Z","published":"2024-05-30T07:25:23Z","title":"Towards Unified Multi-granularity Text Detection with Interactive\n  Attention","summary":"  Existing OCR engines or document image analysis systems typically rely on\ntraining separate models for text detection in varying scenarios and\ngranularities, leading to significant computational complexity and resource\ndemands. In this paper, we introduce \"Detect Any Text\" (DAT), an advanced\nparadigm that seamlessly unifies scene text detection, layout analysis, and\ndocument page detection into a cohesive, end-to-end model. This design enables\nDAT to efficiently manage text instances at different granularities, including\n*word*, *line*, *paragraph* and *page*. A pivotal innovation in DAT is the\nacross-granularity interactive attention module, which significantly enhances\nthe representation learning of text instances at varying granularities by\ncorrelating structural information across different text queries. As a result,\nit enables the model to achieve mutually beneficial detection performances\nacross multiple text granularities. Additionally, a prompt-based segmentation\nmodule refines detection outcomes for texts of arbitrary curvature and complex\nlayouts, thereby improving DAT's accuracy and expanding its real-world\napplicability. Experimental results demonstrate that DAT achieves\nstate-of-the-art performances across a variety of text-related benchmarks,\nincluding multi-oriented/arbitrarily-shaped scene text detection, document\nlayout analysis and page detection tasks.\n","authors":["Xingyu Wan","Chengquan Zhang","Pengyuan Lyu","Sen Fan","Zihan Ni","Kun Yao","Errui Ding","Jingdong Wang"],"pdf_url":"https://arxiv.org/pdf/2405.19765v1.pdf","comment":"ICML 2024"},{"id":"http://arxiv.org/abs/2405.19754v1","updated":"2024-05-30T07:02:50Z","published":"2024-05-30T07:02:50Z","title":"Mitigating annotation shift in cancer classification using single image\n  generative models","summary":"  Artificial Intelligence (AI) has emerged as a valuable tool for assisting\nradiologists in breast cancer detection and diagnosis. However, the success of\nAI applications in this domain is restricted by the quantity and quality of\navailable data, posing challenges due to limited and costly data annotation\nprocedures that often lead to annotation shifts. This study simulates, analyses\nand mitigates annotation shifts in cancer classification in the breast\nmammography domain. First, a high-accuracy cancer risk prediction model is\ndeveloped, which effectively distinguishes benign from malignant lesions. Next,\nmodel performance is used to quantify the impact of annotation shift. We\nuncover a substantial impact of annotation shift on multiclass classification\nperformance particularly for malignant lesions. We thus propose a training data\naugmentation approach based on single-image generative models for the affected\nclass, requiring as few as four in-domain annotations to considerably mitigate\nannotation shift, while also addressing dataset imbalance. Lastly, we further\nincrease performance by proposing and validating an ensemble architecture based\non multiple models trained under different data augmentation regimes. Our study\noffers key insights into annotation shift in deep learning breast cancer\nclassification and explores the potential of single-image generative models to\novercome domain shift challenges.\n","authors":["Marta Buetas Arcas","Richard Osuala","Karim Lekadir","Oliver Díaz"],"pdf_url":"https://arxiv.org/pdf/2405.19754v1.pdf","comment":"Preprint of paper accepted at SPIE IWBI 2024 Conference"},{"id":"http://arxiv.org/abs/2405.19751v1","updated":"2024-05-30T06:56:11Z","published":"2024-05-30T06:56:11Z","title":"HQ-DiT: Efficient Diffusion Transformer with FP4 Hybrid Quantization","summary":"  Diffusion Transformers (DiTs) have recently gained substantial attention in\nboth industrial and academic fields for their superior visual generation\ncapabilities, outperforming traditional diffusion models that use U-Net.\nHowever,the enhanced performance of DiTs also comes with high parameter counts\nand implementation costs, seriously restricting their use on resource-limited\ndevices such as mobile phones. To address these challenges, we introduce the\nHybrid Floating-point Quantization for DiT(HQ-DiT), an efficient post-training\nquantization method that utilizes 4-bit floating-point (FP) precision on both\nweights and activations for DiT inference. Compared to fixed-point quantization\n(e.g., INT8), FP quantization, complemented by our proposed clipping range\nselection mechanism, naturally aligns with the data distribution within DiT,\nresulting in a minimal quantization error. Furthermore, HQ-DiT also implements\na universal identity mathematical transform to mitigate the serious\nquantization error caused by the outliers. The experimental results demonstrate\nthat DiT can achieve extremely low-precision quantization (i.e., 4 bits) with\nnegligible impact on performance. Our approach marks the first instance where\nboth weights and activations in DiTs are quantized to just 4 bits, with only a\n0.12 increase in sFID on ImageNet.\n","authors":["Wenxuan Liu","Saiqian Zhang"],"pdf_url":"https://arxiv.org/pdf/2405.19751v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.07865v3","updated":"2024-05-30T06:55:15Z","published":"2023-12-13T03:04:22Z","title":"SimAC: A Simple Anti-Customization Method for Protecting Face Privacy\n  against Text-to-Image Synthesis of Diffusion Models","summary":"  Despite the success of diffusion-based customization methods on visual\ncontent creation, increasing concerns have been raised about such techniques\nfrom both privacy and political perspectives. To tackle this issue, several\nanti-customization methods have been proposed in very recent months,\npredominantly grounded in adversarial attacks. Unfortunately, most of these\nmethods adopt straightforward designs, such as end-to-end optimization with a\nfocus on adversarially maximizing the original training loss, thereby\nneglecting nuanced internal properties intrinsic to the diffusion model, and\neven leading to ineffective optimization in some diffusion time steps.In this\npaper, we strive to bridge this gap by undertaking a comprehensive exploration\nof these inherent properties, to boost the performance of current\nanti-customization approaches. Two aspects of properties are investigated: 1)\nWe examine the relationship between time step selection and the model's\nperception in the frequency domain of images and find that lower time steps can\ngive much more contributions to adversarial noises. This inspires us to propose\nan adaptive greedy search for optimal time steps that seamlessly integrates\nwith existing anti-customization methods. 2) We scrutinize the roles of\nfeatures at different layers during denoising and devise a sophisticated\nfeature-based optimization framework for anti-customization.Experiments on\nfacial benchmarks demonstrate that our approach significantly increases\nidentity disruption, thereby protecting user privacy and copyright. Our code is\navailable at: https://github.com/somuchtome/SimAC.\n","authors":["Feifei Wang","Zhentao Tan","Tianyi Wei","Yue Wu","Qidong Huang"],"pdf_url":"https://arxiv.org/pdf/2312.07865v3.pdf","comment":"Accepted by CVPR2024"},{"id":"http://arxiv.org/abs/2405.15549v2","updated":"2024-05-30T06:52:37Z","published":"2024-05-24T13:35:56Z","title":"SEP: Self-Enhanced Prompt Tuning for Visual-Language Model","summary":"  Prompt tuning based on Context Optimization (CoOp) effectively adapts\nvisual-language models (VLMs) to downstream tasks by inferring additional\nlearnable prompt tokens. However, these tokens are less discriminative as they\nare independent of the pre-trained tokens and fail to capture input-specific\nknowledge, such as class-aware textual or instance-aware visual knowledge.\nLeveraging the discriminative and generalization capabilities inherent in\npre-trained tokens, we introduce a novel approach named Self-Enhanced Prompt\nTuning (SEP). The core principle of SEP involves adapting the learnable prompt\ntokens at each encoder layer from the corresponding self-pretrained tokens,\nthereby explicitly incorporating discriminative prior knowledge to enhance both\ntextual-level and visual-level embeddings. Furthermore, SEP's self-enhanced\ntokens not only boost discrimination but also mitigate domain shifts in unseen\ndomains, enhancing generalization. In practice, SEP selects several\nrepresentative tokens from all pre-trained tokens for each input data at every\nlayer of the text/visual encoders. Subsequently, a Token Fusion Module (TFM) is\nintroduced to generate a self-enhanced token by merging these representative\ntokens with the learnable tokens using a cross-attention mechanism. This\nself-enhanced token is then concatenated with all pre-trained tokens, serving\nas input for subsequent encoder layers to produce the relevant embeddings.\nComprehensive evaluations across various benchmarks and tasks confirm SEP's\nefficacy in prompt tuning. Code: \\href{Code}{https://github.com/htyao89/SEP}.\n","authors":["Hantao Yao","Rui Zhang","Lu Yu","Changsheng Xu"],"pdf_url":"https://arxiv.org/pdf/2405.15549v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.19746v1","updated":"2024-05-30T06:49:59Z","published":"2024-05-30T06:49:59Z","title":"DenseSeg: Joint Learning for Semantic Segmentation and Landmark\n  Detection Using Dense Image-to-Shape Representation","summary":"  Purpose: Semantic segmentation and landmark detection are fundamental tasks\nof medical image processing, facilitating further analysis of anatomical\nobjects. Although deep learning-based pixel-wise classification has set a\nnew-state-of-the-art for segmentation, it falls short in landmark detection, a\nstrength of shape-based approaches.\n  Methods: In this work, we propose a dense image-to-shape representation that\nenables the joint learning of landmarks and semantic segmentation by employing\na fully convolutional architecture. Our method intuitively allows the\nextraction of arbitrary landmarks due to its representation of anatomical\ncorrespondences. We benchmark our method against the state-of-the-art for\nsemantic segmentation (nnUNet), a shape-based approach employing geometric deep\nlearning and a CNN-based method for landmark detection.\n  Results: We evaluate our method on two medical dataset: one common benchmark\nfeaturing the lungs, heart, and clavicle from thorax X-rays, and another with\n17 different bones in the paediatric wrist. While our method is on pair with\nthe landmark detection baseline in the thorax setting (error in mm of\n$2.6\\pm0.9$ vs $2.7\\pm0.9$), it substantially surpassed it in the more complex\nwrist setting ($1.1\\pm0.6$ vs $1.9\\pm0.5$).\n  Conclusion: We demonstrate that dense geometric shape representation is\nbeneficial for challenging landmark detection tasks and outperforms previous\nstate-of-the-art using heatmap regression. While it does not require explicit\ntraining on the landmarks themselves, allowing for the addition of new\nlandmarks without necessitating retraining.}\n","authors":["Ron Keuth","Lasse Hansen","Maren Balks","Ronja Jäger","Anne-Nele Schröder","Ludger Tüshaus","Mattias Heinrich"],"pdf_url":"https://arxiv.org/pdf/2405.19746v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.19745v1","updated":"2024-05-30T06:47:55Z","published":"2024-05-30T06:47:55Z","title":"GaussianPrediction: Dynamic 3D Gaussian Prediction for Motion\n  Extrapolation and Free View Synthesis","summary":"  Forecasting future scenarios in dynamic environments is essential for\nintelligent decision-making and navigation, a challenge yet to be fully\nrealized in computer vision and robotics. Traditional approaches like video\nprediction and novel-view synthesis either lack the ability to forecast from\narbitrary viewpoints or to predict temporal dynamics. In this paper, we\nintroduce GaussianPrediction, a novel framework that empowers 3D Gaussian\nrepresentations with dynamic scene modeling and future scenario synthesis in\ndynamic environments. GaussianPrediction can forecast future states from any\nviewpoint, using video observations of dynamic scenes. To this end, we first\npropose a 3D Gaussian canonical space with deformation modeling to capture the\nappearance and geometry of dynamic scenes, and integrate the lifecycle property\ninto Gaussians for irreversible deformations. To make the prediction feasible\nand efficient, a concentric motion distillation approach is developed by\ndistilling the scene motion with key points. Finally, a Graph Convolutional\nNetwork is employed to predict the motions of key points, enabling the\nrendering of photorealistic images of future scenarios. Our framework shows\noutstanding performance on both synthetic and real-world datasets,\ndemonstrating its efficacy in predicting and rendering future environments.\n","authors":["Boming Zhao","Yuan Li","Ziyu Sun","Lin Zeng","Yujun Shen","Rui Ma","Yinda Zhang","Hujun Bao","Zhaopeng Cui"],"pdf_url":"https://arxiv.org/pdf/2405.19745v1.pdf","comment":"Accepted to SIGGRAPH 2024 Conference. Project Page:\n  https://zju3dv.github.io/gaussian-prediction/"},{"id":"http://arxiv.org/abs/2405.19743v1","updated":"2024-05-30T06:43:55Z","published":"2024-05-30T06:43:55Z","title":"May the Dance be with You: Dance Generation Framework for Non-Humanoids","summary":"  We hypothesize dance as a motion that forms a visual rhythm from music, where\nthe visual rhythm can be perceived from an optical flow. If an agent can\nrecognize the relationship between visual rhythm and music, it will be able to\ndance by generating a motion to create a visual rhythm that matches the music.\nBased on this, we propose a framework for any kind of non-humanoid agents to\nlearn how to dance from human videos. Our framework works in two processes: (1)\ntraining a reward model which perceives the relationship between optical flow\n(visual rhythm) and music from human dance videos, (2) training the\nnon-humanoid dancer based on that reward model, and reinforcement learning. Our\nreward model consists of two feature encoders for optical flow and music. They\nare trained based on contrastive learning which makes the higher similarity\nbetween concurrent optical flow and music features. With this reward model, the\nagent learns dancing by getting a higher reward when its action creates an\noptical flow whose feature has a higher similarity with the given music\nfeature. Experiment results show that generated dance motion can align with the\nmusic beat properly, and user study result indicates that our framework is more\npreferred by humans compared to the baselines. To the best of our knowledge,\nour work of non-humanoid agents which learn dance from human videos is\nunprecedented. An example video can be found at https://youtu.be/dOUPvo-O3QY.\n","authors":["Hyemin Ahn"],"pdf_url":"https://arxiv.org/pdf/2405.19743v1.pdf","comment":"13 pages, 6 Figures, Rejected at Neurips 2023"},{"id":"http://arxiv.org/abs/2405.09215v2","updated":"2024-05-30T06:33:03Z","published":"2024-05-15T09:47:59Z","title":"Xmodel-VLM: A Simple Baseline for Multimodal Vision Language Model","summary":"  We introduce Xmodel-VLM, a cutting-edge multimodal vision language model. It\nis designed for efficient deployment on consumer GPU servers. Our work directly\nconfronts a pivotal industry issue by grappling with the prohibitive service\ncosts that hinder the broad adoption of large-scale multimodal systems. Through\nrigorous training, we have developed a 1B-scale language model from the ground\nup, employing the LLaVA paradigm for modal alignment. The result, which we call\nXmodel-VLM, is a lightweight yet powerful multimodal vision language model.\nExtensive testing across numerous classic multimodal benchmarks has revealed\nthat despite its smaller size and faster execution, Xmodel-VLM delivers\nperformance comparable to that of larger models. Our model checkpoints and code\nare publicly available on GitHub at https://github.com/XiaoduoAILab/XmodelVLM.\n","authors":["Wanting Xu","Yang Liu","Langping He","Xucheng Huang","Ling Jiang"],"pdf_url":"https://arxiv.org/pdf/2405.09215v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.19735v1","updated":"2024-05-30T06:31:03Z","published":"2024-05-30T06:31:03Z","title":"Twin Deformable Point Convolutions for Point Cloud Semantic Segmentation\n  in Remote Sensing Scenes","summary":"  Thanks to the application of deep learning technology in point cloud\nprocessing of the remote sensing field, point cloud segmentation has become a\nresearch hotspot in recent years, which can be applied to real-world 3D, smart\ncities, and other fields. Although existing solutions have made unprecedented\nprogress, they ignore the inherent characteristics of point clouds in remote\nsensing fields that are strictly arranged according to latitude, longitude, and\naltitude, which brings great convenience to the segmentation of point clouds in\nremote sensing fields. To consider this property cleverly, we propose novel\nconvolution operators, termed Twin Deformable point Convolutions (TDConvs),\nwhich aim to achieve adaptive feature learning by learning deformable sampling\npoints in the latitude-longitude plane and altitude direction, respectively.\nFirst, to model the characteristics of the latitude-longitude plane, we propose\na Cylinder-wise Deformable point Convolution (CyDConv) operator, which\ngenerates a two-dimensional cylinder map by constructing a cylinder-like grid\nin the latitude-longitude direction. Furthermore, to better integrate the\nfeatures of the latitude-longitude plane and the spatial geometric features, we\nperform a multi-scale fusion of the extracted latitude-longitude features and\nspatial geometric features, and realize it through the aggregation of adjacent\npoint features of different scales. In addition, a Sphere-wise Deformable point\nConvolution (SpDConv) operator is introduced to adaptively offset the sampling\npoints in three-dimensional space by constructing a sphere grid structure,\naiming at modeling the characteristics in the altitude direction. Experiments\non existing popular benchmarks conclude that our TDConvs achieve the best\nsegmentation performance, surpassing the existing state-of-the-art methods.\n","authors":["Yong-Qiang Mao","Hanbo Bi","Xuexue Li","Kaiqiang Chen","Zhirui Wang","Xian Sun","Kun Fu"],"pdf_url":"https://arxiv.org/pdf/2405.19735v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.19732v1","updated":"2024-05-30T06:24:14Z","published":"2024-05-30T06:24:14Z","title":"Two Optimizers Are Better Than One: LLM Catalyst for Enhancing\n  Gradient-Based Optimization","summary":"  Learning a skill generally relies on both practical experience by doer and\ninsightful high-level guidance by instructor. Will this strategy also work well\nfor solving complex non-convex optimization problems? Here, a common\ngradient-based optimizer acts like a disciplined doer, making locally optimal\nupdate at each step. Recent methods utilize large language models (LLMs) to\noptimize solutions for concrete problems by inferring from natural language\ninstructions, akin to a high-level instructor. In this paper, we show that\nthese two optimizers are complementary to each other, suggesting a\ncollaborative optimization approach. The gradient-based optimizer and LLM-based\noptimizer are combined in an interleaved manner. We instruct LLMs using task\ndescriptions and timely optimization trajectories recorded during\ngradient-based optimization. Inferred results from LLMs are used as restarting\npoints for the next stage of gradient optimization. By leveraging both the\nlocally rigorous gradient-based optimizer and the high-level deductive\nLLM-based optimizer, our combined optimization method consistently yields\nimprovements over competitive baseline prompt tuning methods. Our results\ndemonstrate the synergistic effect of conventional gradient-based optimization\nand the inference ability of LLMs. The code is released at\nhttps://github.com/guozix/LLM-catalyst.\n","authors":["Zixian Guo","Ming Liu","Zhilong Ji","Jinfeng Bai","Yiwen Guo","Wangmeng Zuo"],"pdf_url":"https://arxiv.org/pdf/2405.19732v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.19727v1","updated":"2024-05-30T06:19:01Z","published":"2024-05-30T06:19:01Z","title":"Automatic Dance Video Segmentation for Understanding Choreography","summary":"  Segmenting dance video into short movements is a popular way to easily\nunderstand dance choreography. However, it is currently done manually and\nrequires a significant amount of effort by experts. That is, even if many dance\nvideos are available on social media (e.g., TikTok and YouTube), it remains\ndifficult for people, especially novices, to casually watch short video\nsegments to practice dance choreography. In this paper, we propose a method to\nautomatically segment a dance video into each movement. Given a dance video as\ninput, we first extract visual and audio features: the former is computed from\nthe keypoints of the dancer in the video, and the latter is computed from the\nMel spectrogram of the music in the video. Next, these features are passed to a\nTemporal Convolutional Network (TCN), and segmentation points are estimated by\npicking peaks of the network output. To build our training dataset, we annotate\nsegmentation points to dance videos in the AIST Dance Video Database, which is\na shared database containing original street dance videos with\ncopyright-cleared dance music. The evaluation study shows that the proposed\nmethod (i.e., combining the visual and audio features) can estimate\nsegmentation points with high accuracy. In addition, we developed an\napplication to help dancers practice choreography using the proposed method.\n","authors":["Koki Endo","Shuhei Tsuchida","Tsukasa Fukusato","Takeo Igarashi"],"pdf_url":"https://arxiv.org/pdf/2405.19727v1.pdf","comment":"9 pages, 11 figures"},{"id":"http://arxiv.org/abs/2405.19726v1","updated":"2024-05-30T06:16:33Z","published":"2024-05-30T06:16:33Z","title":"Streaming Video Diffusion: Online Video Editing with Diffusion Models","summary":"  We present a novel task called online video editing, which is designed to\nedit \\textbf{streaming} frames while maintaining temporal consistency. Unlike\nexisting offline video editing assuming all frames are pre-established and\naccessible, online video editing is tailored to real-life applications such as\nlive streaming and online chat, requiring (1) fast continual step inference,\n(2) long-term temporal modeling, and (3) zero-shot video editing capability. To\nsolve these issues, we propose Streaming Video Diffusion (SVDiff), which\nincorporates the compact spatial-aware temporal recurrence into off-the-shelf\nStable Diffusion and is trained with the segment-level scheme on large-scale\nlong videos. This simple yet effective setup allows us to obtain a single model\nthat is capable of executing a broad range of videos and editing each streaming\nframe with temporal coherence. Our experiments indicate that our model can edit\nlong, high-quality videos with remarkable results, achieving a real-time\ninference speed of 15.2 FPS at a resolution of 512x512.\n","authors":["Feng Chen","Zhen Yang","Bohan Zhuang","Qi Wu"],"pdf_url":"https://arxiv.org/pdf/2405.19726v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.19725v1","updated":"2024-05-30T06:15:08Z","published":"2024-05-30T06:15:08Z","title":"Quantum Visual Feature Encoding Revisited","summary":"  Although quantum machine learning has been introduced for a while, its\napplications in computer vision are still limited. This paper, therefore,\nrevisits the quantum visual encoding strategies, the initial step in quantum\nmachine learning. Investigating the root cause, we uncover that the existing\nquantum encoding design fails to ensure information preservation of the visual\nfeatures after the encoding process, thus complicating the learning process of\nthe quantum machine learning models. In particular, the problem, termed\n\"Quantum Information Gap\" (QIG), leads to a gap of information between\nclassical and corresponding quantum features. We provide theoretical proof and\npractical demonstrations of that found and underscore the significance of QIG,\nas it directly impacts the performance of quantum machine learning algorithms.\nTo tackle this challenge, we introduce a simple but efficient new loss function\nnamed Quantum Information Preserving (QIP) to minimize this gap, resulting in\nenhanced performance of quantum machine learning algorithms. Extensive\nexperiments validate the effectiveness of our approach, showcasing superior\nperformance compared to current methodologies and consistently achieving\nstate-of-the-art results in quantum modeling.\n","authors":["Xuan-Bac Nguyen","Hoang-Quan Nguyen","Hugh Churchill","Samee U. Khan","Khoa Luu"],"pdf_url":"https://arxiv.org/pdf/2405.19725v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.19723v1","updated":"2024-05-30T06:10:10Z","published":"2024-05-30T06:10:10Z","title":"Encoding and Controlling Global Semantics for Long-form Video Question\n  Answering","summary":"  Seeking answers effectively for long videos is essential to build video\nquestion answering (videoQA) systems. Previous methods adaptively select frames\nand regions from long videos to save computations. However, this fails to\nreason over the whole sequence of video, leading to sub-optimal performance. To\naddress this problem, we introduce a state space layer (SSL) into multi-modal\nTransformer to efficiently integrate global semantics of the video, which\nmitigates the video information loss caused by frame and region selection\nmodules. Our SSL includes a gating unit to enable controllability over the flow\nof global semantics into visual representations. To further enhance the\ncontrollability, we introduce a cross-modal compositional congruence (C^3)\nobjective to encourage global semantics aligned with the question. To\nrigorously evaluate long-form videoQA capacity, we construct two new benchmarks\nEgo-QA and MAD-QA featuring videos of considerably long length, i.e. 17.5\nminutes and 1.9 hours, respectively. Extensive experiments demonstrate the\nsuperiority of our framework on these new as well as existing datasets.\n","authors":["Thong Thanh Nguyen","Zhiyuan Hu","Xiaobao Wu","Cong-Duy T Nguyen","See-Kiong Ng","Anh Tuan Luu"],"pdf_url":"https://arxiv.org/pdf/2405.19723v1.pdf","comment":"Work in progress"},{"id":"http://arxiv.org/abs/2405.19722v1","updated":"2024-05-30T06:07:57Z","published":"2024-05-30T06:07:57Z","title":"QClusformer: A Quantum Transformer-based Framework for Unsupervised\n  Visual Clustering","summary":"  Unsupervised vision clustering, a cornerstone in computer vision, has been\nstudied for decades, yielding significant outcomes across numerous vision\ntasks. However, these algorithms involve substantial computational demands when\nconfronted with vast amounts of unlabeled data. Conversely, Quantum computing\nholds promise in expediting unsupervised algorithms when handling large-scale\ndatabases. In this study, we introduce QClusformer, a pioneering\nTransformer-based framework leveraging Quantum machines to tackle unsupervised\nvision clustering challenges. Specifically, we design the Transformer\narchitecture, including the self-attention module and transformer blocks, from\na Quantum perspective to enable execution on Quantum hardware. In addition, we\npresent QClusformer, a variant based on the Transformer architecture, tailored\nfor unsupervised vision clustering tasks. By integrating these elements into an\nend-to-end framework, QClusformer consistently outperforms previous methods\nrunning on classical computers. Empirical evaluations across diverse\nbenchmarks, including MS-Celeb-1M and DeepFashion, underscore the superior\nperformance of QClusformer compared to state-of-the-art methods.\n","authors":["Xuan-Bac Nguyen","Hoang-Quan Nguyen","Samuel Yen-Chi Chen","Samee U. Khan","Hugh Churchill","Khoa Luu"],"pdf_url":"https://arxiv.org/pdf/2405.19722v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.19718v1","updated":"2024-05-30T06:02:35Z","published":"2024-05-30T06:02:35Z","title":"LED: A Large-scale Real-world Paired Dataset for Event Camera Denoising","summary":"  Event camera has significant advantages in capturing dynamic scene\ninformation while being prone to noise interference, particularly in\nchallenging conditions like low threshold and low illumination. However, most\nexisting research focuses on gentle situations, hindering event camera\napplications in realistic complex scenarios. To tackle this limitation and\nadvance the field, we construct a new paired real-world event denoising dataset\n(LED), including 3K sequences with 18K seconds of high-resolution (1200*680)\nevent streams and showing three notable distinctions compared to others:\ndiverse noise levels and scenes, larger-scale with high-resolution, and\nhigh-quality GT. Specifically, it contains stepped parameters and varying\nillumination with diverse scenarios. Moreover, based on the property of noise\nevents inconsistency and signal events consistency, we propose a novel\neffective denoising framework(DED) using homogeneous dual events to generate\nthe GT with better separating noise from the raw. Furthermore, we design a\nbio-inspired baseline leveraging Leaky-Integrate-and-Fire (LIF) neurons with\ndynamic thresholds to realize accurate denoising. The experimental results\ndemonstrate that the remarkable performance of the proposed approach on\ndifferent datasets.The dataset and code are at https://github.com/Yee-Sing/led.\n","authors":["Yuxing Duan","Shihan Peng","Lin Zhu","Wei Zhang","Yi Chang","Sheng Zhong","Luxin Yan"],"pdf_url":"https://arxiv.org/pdf/2405.19718v1.pdf","comment":"Accepted by CVPR 2024"},{"id":"http://arxiv.org/abs/2405.19716v1","updated":"2024-05-30T05:53:49Z","published":"2024-05-30T05:53:49Z","title":"Enhancing Large Vision Language Models with Self-Training on Image\n  Comprehension","summary":"  Large vision language models (LVLMs) integrate large language models (LLMs)\nwith pre-trained vision encoders, thereby activating the perception capability\nof the model to understand image inputs for different queries and conduct\nsubsequent reasoning. Improving this capability requires high-quality\nvision-language data, which is costly and labor-intensive to acquire.\nSelf-training approaches have been effective in single-modal settings to\nalleviate the need for labeled data by leveraging model's own generation.\nHowever, effective self-training remains a challenge regarding the unique\nvisual perception and reasoning capability of LVLMs. To address this, we\nintroduce Self-Training on Image Comprehension (STIC), which emphasizes a\nself-training approach specifically for image comprehension. First, the model\nself-constructs a preference dataset for image descriptions using unlabeled\nimages. Preferred responses are generated through a step-by-step prompt, while\ndis-preferred responses are generated from either corrupted images or\nmisleading prompts. To further self-improve reasoning on the extracted visual\ninformation, we let the model reuse a small portion of existing\ninstruction-tuning data and append its self-generated image descriptions to the\nprompts. We validate the effectiveness of STIC across seven different\nbenchmarks, demonstrating substantial performance gains of 4.0% on average\nwhile using 70% less supervised fine-tuning data than the current method.\nFurther studies investigate various components of STIC and highlight its\npotential to leverage vast quantities of unlabeled images for self-training.\nCode and data are made publicly available.\n","authors":["Yihe Deng","Pan Lu","Fan Yin","Ziniu Hu","Sheng Shen","James Zou","Kai-Wei Chang","Wei Wang"],"pdf_url":"https://arxiv.org/pdf/2405.19716v1.pdf","comment":"19 pages, 14 figures, 6 tables"},{"id":"http://arxiv.org/abs/2310.17455v2","updated":"2024-05-30T05:53:23Z","published":"2023-10-26T15:01:54Z","title":"OTMatch: Improving Semi-Supervised Learning with Optimal Transport","summary":"  Semi-supervised learning has made remarkable strides by effectively utilizing\na limited amount of labeled data while capitalizing on the abundant information\npresent in unlabeled data. However, current algorithms often prioritize\naligning image predictions with specific classes generated through\nself-training techniques, thereby neglecting the inherent relationships that\nexist within these classes. In this paper, we present a new approach called\nOTMatch, which leverages semantic relationships among classes by employing an\noptimal transport loss function to match distributions. We conduct experiments\non many standard vision and language datasets. The empirical results show\nimprovements in our method above baseline, this demonstrates the effectiveness\nand superiority of our approach in harnessing semantic relationships to enhance\nlearning performance in a semi-supervised setting.\n","authors":["Zhiquan Tan","Kaipeng Zheng","Weiran Huang"],"pdf_url":"https://arxiv.org/pdf/2310.17455v2.pdf","comment":"Accepted at ICML 2024"},{"id":"http://arxiv.org/abs/2405.19712v1","updated":"2024-05-30T05:43:09Z","published":"2024-05-30T05:43:09Z","title":"HINT: Learning Complete Human Neural Representations from Limited\n  Viewpoints","summary":"  No augmented application is possible without animated humanoid avatars. At\nthe same time, generating human replicas from real-world monocular hand-held or\nrobotic sensor setups is challenging due to the limited availability of views.\nPrevious work showed the feasibility of virtual avatars but required the\npresence of 360 degree views of the targeted subject. To address this issue, we\npropose HINT, a NeRF-based algorithm able to learn a detailed and complete\nhuman model from limited viewing angles. We achieve this by introducing a\nsymmetry prior, regularization constraints, and training cues from large human\ndatasets. In particular, we introduce a sagittal plane symmetry prior to the\nappearance of the human, directly supervise the density function of the human\nmodel using explicit 3D body modeling, and leverage a co-learned human\ndigitization network as additional supervision for the unseen angles. As a\nresult, our method can reconstruct complete humans even from a few viewing\nangles, increasing performance by more than 15% PSNR compared to previous\nstate-of-the-art algorithms.\n","authors":["Alessandro Sanvito","Andrea Ramazzina","Stefanie Walz","Mario Bijelic","Felix Heide"],"pdf_url":"https://arxiv.org/pdf/2405.19712v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.19005v2","updated":"2024-05-30T05:42:46Z","published":"2024-05-29T11:42:02Z","title":"Auto-selected Knowledge Adapters for Lifelong Person Re-identification","summary":"  Lifelong Person Re-Identification (LReID) extends traditional ReID by\nrequiring systems to continually learn from non-overlapping datasets across\ndifferent times and locations, adapting to new identities while preserving\nknowledge of previous ones. Existing approaches, either rehearsal-free or\nrehearsal-based, still suffer from the problem of catastrophic forgetting since\nthey try to cram diverse knowledge into one fixed model. To overcome this\nlimitation, we introduce a novel framework AdalReID, that adopts knowledge\nadapters and a parameter-free auto-selection mechanism for lifelong learning.\nConcretely, we incrementally build distinct adapters to learn domain-specific\nknowledge at each step, which can effectively learn and preserve knowledge\nacross different datasets. Meanwhile, the proposed auto-selection strategy\nadaptively calculates the knowledge similarity between the input set and the\nadapters. On the one hand, the appropriate adapters are selected for the inputs\nto process ReID, and on the other hand, the knowledge interaction and fusion\nbetween adapters are enhanced to improve the generalization ability of the\nmodel. Extensive experiments are conducted to demonstrate the superiority of\nour AdalReID, which significantly outperforms SOTAs by about 10$\\sim$20\\% mAP\non both seen and unseen domains.\n","authors":["Xuelin Qian","Ruiqi Wu","Gong Cheng","Junwei Han"],"pdf_url":"https://arxiv.org/pdf/2405.19005v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.09655v3","updated":"2024-05-30T05:42:03Z","published":"2023-11-16T08:17:02Z","title":"Multi-View Spectrogram Transformer for Respiratory Sound Classification","summary":"  Deep neural networks have been applied to audio spectrograms for respiratory\nsound classification. Existing models often treat the spectrogram as a\nsynthetic image while overlooking its physical characteristics. In this paper,\na Multi-View Spectrogram Transformer (MVST) is proposed to embed different\nviews of time-frequency characteristics into the vision transformer.\nSpecifically, the proposed MVST splits the mel-spectrogram into different sized\npatches, representing the multi-view acoustic elements of a respiratory sound.\nThese patches and positional embeddings are then fed into transformer encoders\nto extract the attentional information among patches through a self-attention\nmechanism. Finally, a gated fusion scheme is designed to automatically weigh\nthe multi-view features to highlight the best one in a specific scenario.\nExperimental results on the ICBHI dataset demonstrate that the proposed MVST\nsignificantly outperforms state-of-the-art methods for classifying respiratory\nsounds.\n","authors":["Wentao He","Yuchen Yan","Jianfeng Ren","Ruibin Bai","Xudong Jiang"],"pdf_url":"https://arxiv.org/pdf/2311.09655v3.pdf","comment":"The paper was published at ICASSP 2024"},{"id":"http://arxiv.org/abs/2405.19708v1","updated":"2024-05-30T05:36:32Z","published":"2024-05-30T05:36:32Z","title":"Text Guided Image Editing with Automatic Concept Locating and Forgetting","summary":"  With the advancement of image-to-image diffusion models guided by text,\nsignificant progress has been made in image editing. However, a persistent\nchallenge remains in seamlessly incorporating objects into images based on\ntextual instructions, without relying on extra user-provided guidance. Text and\nimages are inherently distinct modalities, bringing out difficulties in fully\ncapturing the semantic intent conveyed through language and accurately\ntranslating that into the desired visual modifications. Therefore, text-guided\nimage editing models often produce generations with residual object attributes\nthat do not fully align with human expectations. To address this challenge, the\nmodels should comprehend the image content effectively away from a disconnect\nbetween the provided textual editing prompts and the actual modifications made\nto the image. In our paper, we propose a novel method called Locate and Forget\n(LaF), which effectively locates potential target concepts in the image for\nmodification by comparing the syntactic trees of the target prompt and scene\ndescriptions in the input image, intending to forget their existence clues in\nthe generated image. Compared to the baselines, our method demonstrates its\nsuperiority in text-guided image editing tasks both qualitatively and\nquantitatively.\n","authors":["Jia Li","Lijie Hu","Zhixian He","Jingfeng Zhang","Tianhang Zheng","Di Wang"],"pdf_url":"https://arxiv.org/pdf/2405.19708v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.19707v1","updated":"2024-05-30T05:36:12Z","published":"2024-05-30T05:36:12Z","title":"DeMamba: AI-Generated Video Detection on Million-Scale GenVideo\n  Benchmark","summary":"  Recently, video generation techniques have advanced rapidly. Given the\npopularity of video content on social media platforms, these models intensify\nconcerns about the spread of fake information. Therefore, there is a growing\ndemand for detectors capable of distinguishing between fake AI-generated videos\nand mitigating the potential harm caused by fake information. However, the lack\nof large-scale datasets from the most advanced video generators poses a barrier\nto the development of such detectors. To address this gap, we introduce the\nfirst AI-generated video detection dataset, GenVideo. It features the following\ncharacteristics: (1) a large volume of videos, including over one million\nAI-generated and real videos collected; (2) a rich diversity of generated\ncontent and methodologies, covering a broad spectrum of video categories and\ngeneration techniques. We conducted extensive studies of the dataset and\nproposed two evaluation methods tailored for real-world-like scenarios to\nassess the detectors' performance: the cross-generator video classification\ntask assesses the generalizability of trained detectors on generators; the\ndegraded video classification task evaluates the robustness of detectors to\nhandle videos that have degraded in quality during dissemination. Moreover, we\nintroduced a plug-and-play module, named Detail Mamba (DeMamba), designed to\nenhance the detectors by identifying AI-generated videos through the analysis\nof inconsistencies in temporal and spatial dimensions. Our extensive\nexperiments demonstrate DeMamba's superior generalizability and robustness on\nGenVideo compared to existing detectors. We believe that the GenVideo dataset\nand the DeMamba module will significantly advance the field of AI-generated\nvideo detection. Our code and dataset will be aviliable at\n\\url{https://github.com/chenhaoxing/DeMamba}.\n","authors":["Haoxing Chen","Yan Hong","Zizheng Huang","Zhuoer Xu","Zhangxuan Gu","Yaohui Li","Jun Lan","Huijia Zhu","Jianfu Zhang","Weiqiang Wang","Huaxiong Li"],"pdf_url":"https://arxiv.org/pdf/2405.19707v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.19703v1","updated":"2024-05-30T05:27:46Z","published":"2024-05-30T05:27:46Z","title":"Towards a Better Evaluation of Out-of-Domain Generalization","summary":"  The objective of Domain Generalization (DG) is to devise algorithms and\nmodels capable of achieving high performance on previously unseen test\ndistributions. In the pursuit of this objective, average measure has been\nemployed as the prevalent measure for evaluating models and comparing\nalgorithms in the existing DG studies. Despite its significance, a\ncomprehensive exploration of the average measure has been lacking and its\nsuitability in approximating the true domain generalization performance has\nbeen questionable. In this study, we carefully investigate the limitations\ninherent in the average measure and propose worst+gap measure as a robust\nalternative. We establish theoretical grounds of the proposed measure by\nderiving two theorems starting from two different assumptions. We conduct\nextensive experimental investigations to compare the proposed worst+gap measure\nwith the conventional average measure. Given the indispensable need to access\nthe true DG performance for studying measures, we modify five existing datasets\nto come up with SR-CMNIST, C-Cats&Dogs, L-CIFAR10, PACS-corrupted, and\nVLCS-corrupted datasets. The experiment results unveil an inferior performance\nof the average measure in approximating the true DG performance and confirm the\nrobustness of the theoretically supported worst+gap measure.\n","authors":["Duhun Hwang","Suhyun Kang","Moonjung Eo","Jimyeong Kim","Wonjong Rhee"],"pdf_url":"https://arxiv.org/pdf/2405.19703v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.19695v1","updated":"2024-05-30T05:15:38Z","published":"2024-05-30T05:15:38Z","title":"Distribution Aligned Semantics Adaption for Lifelong Person\n  Re-Identification","summary":"  In real-world scenarios, person Re-IDentification (Re-ID) systems need to be\nadaptable to changes in space and time. Therefore, the adaptation of Re-ID\nmodels to new domains while preserving previously acquired knowledge is\ncrucial, known as Lifelong person Re-IDentification (LReID). Advanced LReID\nmethods rely on replaying exemplars from old domains and applying knowledge\ndistillation in logits with old models. However, due to privacy concerns,\nretaining previous data is inappropriate. Additionally, the fine-grained and\nopen-set characteristics of Re-ID limit the effectiveness of the distillation\nparadigm for accumulating knowledge. We argue that a Re-ID model trained on\ndiverse and challenging pedestrian images at a large scale can acquire robust\nand general human semantic knowledge. These semantics can be readily utilized\nas shared knowledge for lifelong applications. In this paper, we identify the\nchallenges and discrepancies associated with adapting a pre-trained model to\neach application domain, and introduce the Distribution Aligned Semantics\nAdaption (DASA) framework. It efficiently adjusts Batch Normalization (BN) to\nmitigate interference from data distribution discrepancy and freezes the\npre-trained convolutional layers to preserve shared knowledge. Additionally, we\npropose the lightweight Semantics Adaption (SA) module, which effectively\nadapts learned semantics to enhance pedestrian representations. Extensive\nexperiments demonstrate the remarkable superiority of our proposed framework\nover advanced LReID methods, and it exhibits significantly reduced storage\nconsumption. DASA presents a novel and cost-effective perspective on\neffectively adapting pre-trained models for LReID.\n","authors":["Qizao Wang","Xuelin Qian","Bin Li","Xiangyang Xue"],"pdf_url":"https://arxiv.org/pdf/2405.19695v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.19689v1","updated":"2024-05-30T05:04:01Z","published":"2024-05-30T05:04:01Z","title":"Uncertainty-aware sign language video retrieval with probability\n  distribution modeling","summary":"  Sign language video retrieval plays a key role in facilitating information\naccess for the deaf community. Despite significant advances in video-text\nretrieval, the complexity and inherent uncertainty of sign language preclude\nthe direct application of these techniques. Previous methods achieve the\nmapping between sign language video and text through fine-grained modal\nalignment. However, due to the scarcity of fine-grained annotation, the\nuncertainty inherent in sign language video is underestimated, limiting the\nfurther development of sign language retrieval tasks. To address this\nchallenge, we propose a novel Uncertainty-aware Probability Distribution\nRetrieval (UPRet), that conceptualizes the mapping process of sign language\nvideo and text in terms of probability distributions, explores their potential\ninterrelationships, and enables flexible mappings. Experiments on three\nbenchmarks demonstrate the effectiveness of our method, which achieves\nstate-of-the-art results on How2Sign (59.1%), PHOENIX-2014T (72.0%), and\nCSL-Daily (78.4%).\n","authors":["Xuan Wu","Hongxiang Li","Yuanjiang Luo","Xuxin Cheng","Xianwei Zhuang","Meng Cao","Keren Fu"],"pdf_url":"https://arxiv.org/pdf/2405.19689v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.19688v1","updated":"2024-05-30T04:57:55Z","published":"2024-05-30T04:57:55Z","title":"DNPM: A Neural Parametric Model for the Synthesis of Facial Geometric\n  Details","summary":"  Parametric 3D models have enabled a wide variety of computer vision and\ngraphics tasks, such as modeling human faces, bodies and hands. In 3D face\nmodeling, 3DMM is the most widely used parametric model, but can't generate\nfine geometric details solely from identity and expression inputs. To tackle\nthis limitation, we propose a neural parametric model named DNPM for the facial\ngeometric details, which utilizes deep neural network to extract latent codes\nfrom facial displacement maps encoding details and wrinkles. Built upon DNPM, a\nnovel 3DMM named Detailed3DMM is proposed, which augments traditional 3DMMs by\nincluding the synthesis of facial details only from the identity and expression\ninputs. Moreover, we show that DNPM and Detailed3DMM can facilitate two\ndownstream applications: speech-driven detailed 3D facial animation and 3D face\nreconstruction from a degraded image. Extensive experiments have shown the\nusefulness of DNPM and Detailed3DMM, and the progressiveness of two proposed\napplications.\n","authors":["Haitao Cao","Baoping Cheng","Qiran Pu","Haocheng Zhang","Bin Luo","Yixiang Zhuang","Juncong Lin","Liyan Chen","Xuan Cheng"],"pdf_url":"https://arxiv.org/pdf/2405.19688v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.19687v1","updated":"2024-05-30T04:57:54Z","published":"2024-05-30T04:57:54Z","title":"Autonomous Driving with Spiking Neural Networks","summary":"  Autonomous driving demands an integrated approach that encompasses\nperception, prediction, and planning, all while operating under strict energy\nconstraints to enhance scalability and environmental sustainability. We present\nSpiking Autonomous Driving (\\name{}), the first unified Spiking Neural Network\n(SNN) to address the energy challenges faced by autonomous driving systems\nthrough its event-driven and energy-efficient nature. SAD is trained end-to-end\nand consists of three main modules: perception, which processes inputs from\nmulti-view cameras to construct a spatiotemporal bird's eye view; prediction,\nwhich utilizes a novel dual-pathway with spiking neurons to forecast future\nstates; and planning, which generates safe trajectories considering predicted\noccupancy, traffic rules, and ride comfort. Evaluated on the nuScenes dataset,\nSAD achieves competitive performance in perception, prediction, and planning\ntasks, while drawing upon the energy efficiency of SNNs. This work highlights\nthe potential of neuromorphic computing to be applied to energy-efficient\nautonomous driving, a critical step toward sustainable and safety-critical\nautomotive technology. Our code is available at\n\\url{https://github.com/ridgerchu/SAD}.\n","authors":["Rui-Jie Zhu","Ziqing Wang","Leilani Gilpin","Jason K. Eshraghian"],"pdf_url":"https://arxiv.org/pdf/2405.19687v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.19684v1","updated":"2024-05-30T04:46:40Z","published":"2024-05-30T04:46:40Z","title":"A Comprehensive Survey on Underwater Image Enhancement Based on Deep\n  Learning","summary":"  Underwater image enhancement (UIE) is a challenging research task in the\nfield of computer vision. Although hundreds of UIE algorithms have been\nproposed, a comprehensive and systematic review is still lacking. To promote\nfuture research, we summarize the UIE task from multiple perspectives. First,\nthe physical models, data construction processes, evaluation metrics, and loss\nfunctions are introduced. Second, according to the contributions brought by\ndifferent literatures, recent proposed algorithms are discussed and classified\nfrom six perspectives, namely network architecture, learning strategy, learning\nstage, assistance task, domain perspective and disentanglement fusion,\nrespectively. Third, considering the inconsistencies in experimental settings\nin different literatures, a comprehensive and fair comparison does not yet\nexist. To this end, we quantitatively and qualitatively evaluate\nstate-of-the-art algorithms on multiple benchmark datasets. Finally, issues\nworthy of further research in the UIE task are raised. A collection of useful\nmaterials is available at https://github.com/YuZhao1999/UIE.\n","authors":["Xiaofeng Cong","Yu Zhao","Jie Gui","Junming Hou","Dacheng Tao"],"pdf_url":"https://arxiv.org/pdf/2405.19684v1.pdf","comment":"A survey on the underwater image enhancement task"},{"id":"http://arxiv.org/abs/2405.16134v2","updated":"2024-05-30T04:45:11Z","published":"2024-05-25T08:57:30Z","title":"Breaking the False Sense of Security in Backdoor Defense through\n  Re-Activation Attack","summary":"  Deep neural networks face persistent challenges in defending against backdoor\nattacks, leading to an ongoing battle between attacks and defenses. While\nexisting backdoor defense strategies have shown promising performance on\nreducing attack success rates, can we confidently claim that the backdoor\nthreat has truly been eliminated from the model? To address it, we\nre-investigate the characteristics of the backdoored models after defense\n(denoted as defense models). Surprisingly, we find that the original backdoors\nstill exist in defense models derived from existing post-training defense\nstrategies, and the backdoor existence is measured by a novel metric called\nbackdoor existence coefficient. It implies that the backdoors just lie dormant\nrather than being eliminated. To further verify this finding, we empirically\nshow that these dormant backdoors can be easily re-activated during inference,\nby manipulating the original trigger with well-designed tiny perturbation using\nuniversal adversarial attack. More practically, we extend our backdoor\nreactivation to black-box scenario, where the defense model can only be queried\nby the adversary during inference, and develop two effective methods, i.e.,\nquery-based and transfer-based backdoor re-activation attacks. The\neffectiveness of the proposed methods are verified on both image classification\nand multimodal contrastive learning (i.e., CLIP) tasks. In conclusion, this\nwork uncovers a critical vulnerability that has never been explored in existing\ndefense strategies, emphasizing the urgency of designing more robust and\nadvanced backdoor defense mechanisms in the future.\n","authors":["Mingli Zhu","Siyuan Liang","Baoyuan Wu"],"pdf_url":"https://arxiv.org/pdf/2405.16134v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.19682v1","updated":"2024-05-30T04:37:57Z","published":"2024-05-30T04:37:57Z","title":"Fully Test-Time Adaptation for Monocular 3D Object Detection","summary":"  Monocular 3D object detection (Mono 3Det) aims to identify 3D objects from a\nsingle RGB image. However, existing methods often assume training and test data\nfollow the same distribution, which may not hold in real-world test scenarios.\nTo address the out-of-distribution (OOD) problems, we explore a new adaptation\nparadigm for Mono 3Det, termed Fully Test-time Adaptation. It aims to adapt a\nwell-trained model to unlabeled test data by handling potential data\ndistribution shifts at test time without access to training data and test\nlabels. However, applying this paradigm in Mono 3Det poses significant\nchallenges due to OOD test data causing a remarkable decline in object\ndetection scores. This decline conflicts with the pre-defined score thresholds\nof existing detection methods, leading to severe object omissions (i.e., rare\npositive detections and many false negatives). Consequently, the limited\npositive detection and plenty of noisy predictions cause test-time adaptation\nto fail in Mono 3Det. To handle this problem, we propose a novel Monocular\nTest-Time Adaptation (MonoTTA) method, based on two new strategies. 1)\nReliability-driven adaptation: we empirically find that high-score objects are\nstill reliable and the optimization of high-score objects can enhance\nconfidence across all detections. Thus, we devise a self-adaptive strategy to\nidentify reliable objects for model adaptation, which discovers potential\nobjects and alleviates omissions. 2) Noise-guard adaptation: since high-score\nobjects may be scarce, we develop a negative regularization term to exploit the\nnumerous low-score objects via negative learning, preventing overfitting to\nnoise and trivial solutions. Experimental results show that MonoTTA brings\nsignificant performance gains for Mono 3Det models in OOD test scenarios,\napproximately 190% gains by average on KITTI and 198% gains on nuScenes.\n","authors":["Hongbin Lin","Yifan Zhang","Shuaicheng Niu","Shuguang Cui","Zhen Li"],"pdf_url":"https://arxiv.org/pdf/2405.19682v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.19678v1","updated":"2024-05-30T04:14:58Z","published":"2024-05-30T04:14:58Z","title":"View-Consistent Hierarchical 3D SegmentationUsing Ultrametric Feature\n  Fields","summary":"  Large-scale vision foundation models such as Segment Anything (SAM)\ndemonstrate impressive performance in zero-shot image segmentation at multiple\nlevels of granularity. However, these zero-shot predictions are rarely\n3D-consistent. As the camera viewpoint changes in a scene, so do the\nsegmentation predictions, as well as the characterizations of ``coarse\" or\n``fine\" granularity. In this work, we address the challenging task of lifting\nmulti-granular and view-inconsistent image segmentations into a hierarchical\nand 3D-consistent representation. We learn a novel feature field within a\nNeural Radiance Field (NeRF) representing a 3D scene, whose segmentation\nstructure can be revealed at different scales by simply using different\nthresholds on feature distance. Our key idea is to learn an ultrametric feature\nspace, which unlike a Euclidean space, exhibits transitivity in distance-based\ngrouping, naturally leading to a hierarchical clustering. Put together, our\nmethod takes view-inconsistent multi-granularity 2D segmentations as input and\nproduces a hierarchy of 3D-consistent segmentations as output. We evaluate our\nmethod and several baselines on synthetic datasets with multi-view images and\nmulti-granular segmentation, showcasing improved accuracy and\nviewpoint-consistency. We additionally provide qualitative examples of our\nmodel's 3D hierarchical segmentations in real world scenes.\\footnote{The code\nand dataset are available at:\n","authors":["Haodi He","Colton Stearns","Adam W. Harley","Leonidas J. Guibas"],"pdf_url":"https://arxiv.org/pdf/2405.19678v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.19675v1","updated":"2024-05-30T04:04:36Z","published":"2024-05-30T04:04:36Z","title":"Knowledge-grounded Adaptation Strategy for Vision-language Models:\n  Building Unique Case-set for Screening Mammograms for Residents Training","summary":"  A visual-language model (VLM) pre-trained on natural images and text pairs\nposes a significant barrier when applied to medical contexts due to domain\nshift. Yet, adapting or fine-tuning these VLMs for medical use presents\nconsiderable hurdles, including domain misalignment, limited access to\nextensive datasets, and high-class imbalances. Hence, there is a pressing need\nfor strategies to effectively adapt these VLMs to the medical domain, as such\nadaptations would prove immensely valuable in healthcare applications. In this\nstudy, we propose a framework designed to adeptly tailor VLMs to the medical\ndomain, employing selective sampling and hard-negative mining techniques for\nenhanced performance in retrieval tasks. We validate the efficacy of our\nproposed approach by implementing it across two distinct VLMs: the in-domain\nVLM (MedCLIP) and out-of-domain VLMs (ALBEF). We assess the performance of\nthese models both in their original off-the-shelf state and after undergoing\nour proposed training strategies, using two extensive datasets containing\nmammograms and their corresponding reports. Our evaluation spans zero-shot,\nfew-shot, and supervised scenarios. Through our approach, we observe a notable\nenhancement in Recall@K performance for the image-text retrieval task.\n","authors":["Aisha Urooj Khan","John Garrett","Tyler Bradshaw","Lonie Salkowski","Jiwoong Jason Jeong","Amara Tariq","Imon Banerjee"],"pdf_url":"https://arxiv.org/pdf/2405.19675v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.19672v1","updated":"2024-05-30T03:56:01Z","published":"2024-05-30T03:56:01Z","title":"CRIS: Collaborative Refinement Integrated with Segmentation for Polyp\n  Segmentation","summary":"  Accurate detection of colorectal cancer and early prevention heavily rely on\nprecise polyp identification during gastrointestinal colonoscopy. Due to\nlimited data, many current state-of-the-art deep learning methods for polyp\nsegmentation often rely on post-processing of masks to reduce noise and enhance\nresults. In this study, we propose an approach that integrates mask refinement\nand binary semantic segmentation, leveraging a novel collaborative training\nstrategy that surpasses current widely-used refinement strategies. We\ndemonstrate the superiority of our approach through comprehensive evaluation on\nestablished benchmark datasets and its successful application across various\nmedical image segmentation architectures.\n","authors":["Ankush Gajanan Arudkar","Bernard J. E. Evans"],"pdf_url":"https://arxiv.org/pdf/2405.19672v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.08816v2","updated":"2024-05-30T03:49:20Z","published":"2024-05-14T17:59:57Z","title":"The RoboDrive Challenge: Drive Anytime Anywhere in Any Condition","summary":"  In the realm of autonomous driving, robust perception under\nout-of-distribution conditions is paramount for the safe deployment of\nvehicles. Challenges such as adverse weather, sensor malfunctions, and\nenvironmental unpredictability can severely impact the performance of\nautonomous systems. The 2024 RoboDrive Challenge was crafted to propel the\ndevelopment of driving perception technologies that can withstand and adapt to\nthese real-world variabilities. Focusing on four pivotal tasks -- BEV\ndetection, map segmentation, semantic occupancy prediction, and multi-view\ndepth estimation -- the competition laid down a gauntlet to innovate and\nenhance system resilience against typical and atypical disturbances. This\nyear's challenge consisted of five distinct tracks and attracted 140 registered\nteams from 93 institutes across 11 countries, resulting in nearly one thousand\nsubmissions evaluated through our servers. The competition culminated in 15\ntop-performing solutions, which introduced a range of innovative approaches\nincluding advanced data augmentation, multi-sensor fusion, self-supervised\nlearning for error correction, and new algorithmic strategies to enhance sensor\nrobustness. These contributions significantly advanced the state of the art,\nparticularly in handling sensor inconsistencies and environmental variability.\nParticipants, through collaborative efforts, pushed the boundaries of current\ntechnologies, showcasing their potential in real-world scenarios. Extensive\nevaluations and analyses provided insights into the effectiveness of these\nsolutions, highlighting key trends and successful strategies for improving the\nresilience of driving perception systems. This challenge has set a new\nbenchmark in the field, providing a rich repository of techniques expected to\nguide future research in this field.\n","authors":["Lingdong Kong","Shaoyuan Xie","Hanjiang Hu","Yaru Niu","Wei Tsang Ooi","Benoit R. Cottereau","Lai Xing Ng","Yuexin Ma","Wenwei Zhang","Liang Pan","Kai Chen","Ziwei Liu","Weichao Qiu","Wei Zhang","Xu Cao","Hao Lu","Ying-Cong Chen","Caixin Kang","Xinning Zhou","Chengyang Ying","Wentao Shang","Xingxing Wei","Yinpeng Dong","Bo Yang","Shengyin Jiang","Zeliang Ma","Dengyi Ji","Haiwen Li","Xingliang Huang","Yu Tian","Genghua Kou","Fan Jia","Yingfei Liu","Tiancai Wang","Ying Li","Xiaoshuai Hao","Yifan Yang","Hui Zhang","Mengchuan Wei","Yi Zhou","Haimei Zhao","Jing Zhang","Jinke Li","Xiao He","Xiaoqiang Cheng","Bingyang Zhang","Lirong Zhao","Dianlei Ding","Fangsheng Liu","Yixiang Yan","Hongming Wang","Nanfei Ye","Lun Luo","Yubo Tian","Yiwei Zuo","Zhe Cao","Yi Ren","Yunfan Li","Wenjie Liu","Xun Wu","Yifan Mao","Ming Li","Jian Liu","Jiayang Liu","Zihan Qin","Cunxi Chu","Jialei Xu","Wenbo Zhao","Junjun Jiang","Xianming Liu","Ziyan Wang","Chiwei Li","Shilong Li","Chendong Yuan","Songyue Yang","Wentao Liu","Peng Chen","Bin Zhou","Yubo Wang","Chi Zhang","Jianhang Sun","Hai Chen","Xiao Yang","Lizhong Wang","Dongyi Fu","Yongchun Lin","Huitong Yang","Haoang Li","Yadan Luo","Xianjing Cheng","Yong Xu"],"pdf_url":"https://arxiv.org/pdf/2405.08816v2.pdf","comment":"ICRA 2024; 32 pages, 24 figures, 5 tables; Code at\n  https://robodrive-24.github.io/"},{"id":"http://arxiv.org/abs/2405.19671v1","updated":"2024-05-30T03:46:59Z","published":"2024-05-30T03:46:59Z","title":"GaussianRoom: Improving 3D Gaussian Splatting with SDF Guidance and\n  Monocular Cues for Indoor Scene Reconstruction","summary":"  Recently, 3D Gaussian Splatting(3DGS) has revolutionized neural rendering\nwith its high-quality rendering and real-time speed. However, when it comes to\nindoor scenes with a significant number of textureless areas, 3DGS yields\nincomplete and noisy reconstruction results due to the poor initialization of\nthe point cloud and under-constrained optimization. Inspired by the continuity\nof signed distance field (SDF), which naturally has advantages in modeling\nsurfaces, we present a unified optimizing framework integrating neural SDF with\n3DGS. This framework incorporates a learnable neural SDF field to guide the\ndensification and pruning of Gaussians, enabling Gaussians to accurately model\nscenes even with poor initialized point clouds. At the same time, the geometry\nrepresented by Gaussians improves the efficiency of the SDF field by piloting\nits point sampling. Additionally, we regularize the optimization with normal\nand edge priors to eliminate geometry ambiguity in textureless areas and\nimprove the details. Extensive experiments in ScanNet and ScanNet++ show that\nour method achieves state-of-the-art performance in both surface reconstruction\nand novel view synthesis.\n","authors":["Haodong Xiang","Xinghui Li","Xiansong Lai","Wanting Zhang","Zhichao Liao","Kai Cheng","Xueping Liu"],"pdf_url":"https://arxiv.org/pdf/2405.19671v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.19669v1","updated":"2024-05-30T03:38:44Z","published":"2024-05-30T03:38:44Z","title":"Texture-guided Coding for Deep Features","summary":"  With the rapid development of machine vision technology in recent years, many\nresearchers have begun to focus on feature compression that is better suited\nfor machine vision tasks. The target of feature compression is deep features,\nwhich arise from convolution in the middle layer of a pre-trained convolutional\nneural network. However, due to the large volume of data and high level of\nabstraction of deep features, their application is primarily limited to\nmachine-centric scenarios, which poses significant constraints in situations\nrequiring human-computer interaction. This paper investigates features and\ntextures and proposes a texture-guided feature compression strategy based on\ntheir characteristics. Specifically, the strategy comprises feature layers and\ntexture layers. The feature layers serve the machine, including a feature\nselection module and a feature reconstruction network. With the assistance of\ntexture images, they selectively compress and transmit channels relevant to\nvisual tasks, reducing feature data while providing high-quality features for\nthe machine. The texture layers primarily serve humans and consist of an image\nreconstruction network. This image reconstruction network leverages features\nand texture images to reconstruct preview images for humans. Our method fully\nexploits the characteristics of texture and features. It eliminates feature\nredundancy, reconstructs high-quality preview images for humans, and supports\ndecision-making. The experimental results demonstrate excellent performance\nwhen employing our proposed method to compress the deep features.\n","authors":["Lei Xiong","Xin Luo","Zihao Wang","Chaofan He","Shuyuan Zhu","Bing Zeng"],"pdf_url":"https://arxiv.org/pdf/2405.19669v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.19668v1","updated":"2024-05-30T03:38:31Z","published":"2024-05-30T03:38:31Z","title":"AutoBreach: Universal and Adaptive Jailbreaking with Efficient\n  Wordplay-Guided Optimization","summary":"  Despite the widespread application of large language models (LLMs) across\nvarious tasks, recent studies indicate that they are susceptible to jailbreak\nattacks, which can render their defense mechanisms ineffective. However,\nprevious jailbreak research has frequently been constrained by limited\nuniversality, suboptimal efficiency, and a reliance on manual crafting. In\nresponse, we rethink the approach to jailbreaking LLMs and formally define\nthree essential properties from the attacker' s perspective, which contributes\nto guiding the design of jailbreak methods. We further introduce AutoBreach, a\nnovel method for jailbreaking LLMs that requires only black-box access.\nInspired by the versatility of wordplay, AutoBreach employs a wordplay-guided\nmapping rule sampling strategy to generate a variety of universal mapping rules\nfor creating adversarial prompts. This generation process leverages LLMs'\nautomatic summarization and reasoning capabilities, thus alleviating the manual\nburden. To boost jailbreak success rates, we further suggest sentence\ncompression and chain-of-thought-based mapping rules to correct errors and\nwordplay misinterpretations in target LLMs. Additionally, we propose a\ntwo-stage mapping rule optimization strategy that initially optimizes mapping\nrules before querying target LLMs to enhance the efficiency of AutoBreach.\nAutoBreach can efficiently identify security vulnerabilities across various\nLLMs, including three proprietary models: Claude-3, GPT-3.5, GPT-4 Turbo, and\ntwo LLMs' web platforms: Bingchat, GPT-4 Web, achieving an average success rate\nof over 80% with fewer than 10 queries\n","authors":["Jiawei Chen","Xiao Yang","Zhengwei Fang","Yu Tian","Yinpeng Dong","Zhaoxia Yin","Hang Su"],"pdf_url":"https://arxiv.org/pdf/2405.19668v1.pdf","comment":"Under review"},{"id":"http://arxiv.org/abs/2403.16578v4","updated":"2024-05-30T03:35:06Z","published":"2024-03-25T09:43:56Z","title":"SegICL: A Multimodal In-context Learning Framework for Enhanced\n  Segmentation in Medical Imaging","summary":"  In the field of medical image segmentation, tackling Out-of-Distribution\n(OOD) segmentation tasks in a cost-effective manner remains a significant\nchallenge. Universal segmentation models is a solution, which aim to generalize\nacross the diverse modality of medical images, yet their effectiveness often\ndiminishes when applied to OOD data modalities and tasks, requiring intricate\nfine-tuning of model for optimal performance. Few-shot learning segmentation\nmethods are typically designed for specific modalities of data and cannot be\ndirectly transferred for use with another modality. Therefore, we introduce\nSegICL, a novel approach leveraging In-Context Learning (ICL) for image\nsegmentation. Unlike existing methods, SegICL has the capability to employ\ntext-guided segmentation and conduct in-context learning with a small set of\nimage-mask pairs, eliminating the need for training the model from scratch or\nfine-tuning for OOD tasks (including OOD modality and dataset). Extensive\nexperimental demonstrates a positive correlation between the number of shots\nand segmentation performance on OOD tasks. The performance of segmentation when\nprovided thre-shots is approximately 1.5 times better than the performance in a\nzero-shot setting. This indicates that SegICL effectively address new\nsegmentation tasks based on contextual information. Additionally, SegICL also\nexhibits comparable performance to mainstream models on OOD and in-distribution\ntasks. Our code will be released after paper review.\n","authors":["Lingdong Shen","Fangxin Shang","Xiaoshuang Huang","Yehui Yang","Haifeng Huang","Shiming Xiang"],"pdf_url":"https://arxiv.org/pdf/2403.16578v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.19988v1","updated":"2024-05-30T12:18:06Z","published":"2024-05-30T12:18:06Z","title":"Video-Language Critic: Transferable Reward Functions for\n  Language-Conditioned Robotics","summary":"  Natural language is often the easiest and most convenient modality for humans\nto specify tasks for robots. However, learning to ground language to behavior\ntypically requires impractical amounts of diverse, language-annotated\ndemonstrations collected on each target robot. In this work, we aim to separate\nthe problem of what to accomplish from how to accomplish it, as the former can\nbenefit from substantial amounts of external observation-only data, and only\nthe latter depends on a specific robot embodiment. To this end, we propose\nVideo-Language Critic, a reward model that can be trained on readily available\ncross-embodiment data using contrastive learning and a temporal ranking\nobjective, and use it to score behavior traces from a separate reinforcement\nlearning actor. When trained on Open X-Embodiment data, our reward model\nenables 2x more sample-efficient policy training on Meta-World tasks than a\nsparse reward only, despite a significant domain gap. Using in-domain data but\nin a challenging task generalization setting on Meta-World, we further\ndemonstrate more sample-efficient training than is possible with prior\nlanguage-conditioned reward models that are either trained with binary\nclassification, use static images, or do not leverage the temporal information\npresent in video data.\n","authors":["Minttu Alakuijala","Reginald McLean","Isaac Woungang","Nariman Farsad","Samuel Kaski","Pekka Marttinen","Kai Yuan"],"pdf_url":"https://arxiv.org/pdf/2405.19988v1.pdf","comment":"10 pages in the main text, 16 pages including references and\n  supplementary materials. 4 figures and 3 tables in the main text, 1 table in\n  supplementary materials"},{"id":"http://arxiv.org/abs/2405.19730v1","updated":"2024-05-30T06:21:34Z","published":"2024-05-30T06:21:34Z","title":"Research on Foundation Model for Spatial Data Intelligence: China's 2024\n  White Paper on Strategic Development of Spatial Data Intelligence","summary":"  This report focuses on spatial data intelligent large models, delving into\nthe principles, methods, and cutting-edge applications of these models. It\nprovides an in-depth discussion on the definition, development history, current\nstatus, and trends of spatial data intelligent large models, as well as the\nchallenges they face. The report systematically elucidates the key technologies\nof spatial data intelligent large models and their applications in urban\nenvironments, aerospace remote sensing, geography, transportation, and other\nscenarios. Additionally, it summarizes the latest application cases of spatial\ndata intelligent large models in themes such as urban development, multimodal\nsystems, remote sensing, smart transportation, and resource environments.\nFinally, the report concludes with an overview and outlook on the development\nprospects of spatial data intelligent large models.\n","authors":["Shaohua Wang","Xing Xie","Yong Li","Danhuai Guo","Zhi Cai","Yu Liu","Yang Yue","Xiao Pan","Feng Lu","Huayi Wu","Zhipeng Gui","Zhiming Ding","Bolong Zheng","Fuzheng Zhang","Tao Qin","Jingyuan Wang","Chuang Tao","Zhengchao Chen","Hao Lu","Jiayi Li","Hongyang Chen","Peng Yue","Wenhao Yu","Yao Yao","Leilei Sun","Yong Zhang","Longbiao Chen","Xiaoping Du","Xiang Li","Xueying Zhang","Kun Qin","Zhaoya Gong","Weihua Dong","Xiaofeng Meng"],"pdf_url":"https://arxiv.org/pdf/2405.19730v1.pdf","comment":"in Chinese language"},{"id":"http://arxiv.org/abs/2405.20525v1","updated":"2024-05-30T22:56:15Z","published":"2024-05-30T22:56:15Z","title":"Comparing Quantum Annealing and Spiking Neuromorphic Computing for\n  Sampling Binary Sparse Coding QUBO Problems","summary":"  We consider the problem of computing a sparse binary representation of an\nimage. To be precise, given an image and an overcomplete, non-orthonormal\nbasis, we aim to find a sparse binary vector indicating the minimal set of\nbasis vectors that when added together best reconstruct the given input. We\nformulate this problem with an $L_2$ loss on the reconstruction error, and an\n$L_0$ (or, equivalently, an $L_1$) loss on the binary vector enforcing\nsparsity. This yields a quadratic binary optimization problem (QUBO), whose\noptimal solution(s) in general is NP-hard to find. The method of unsupervised\nand unnormalized dictionary feature learning for a desired sparsity level to\nbest match the data is presented. Next, we solve the sparse representation QUBO\nby implementing it both on a D-Wave quantum annealer with Pegasus chip\nconnectivity via minor embedding, as well as on the Intel Loihi 2 spiking\nneuromorphic processor. On the quantum annealer, we sample from the sparse\nrepresentation QUBO using parallel quantum annealing combined with quantum\nevolution Monte Carlo, also known as iterated reverse annealing. On Loihi 2, we\nuse a stochastic winner take all network of neurons. The solutions are\nbenchmarked against simulated annealing, a classical heuristic, and the optimal\nsolutions are computed using CPLEX. Iterated reverse quantum annealing performs\nsimilarly to simulated annealing, although simulated annealing is always able\nto sample the optimal solution whereas quantum annealing was not always able\nto. The Loihi 2 solutions that are sampled are on average more sparse than the\nsolutions from any of the other methods. Loihi 2 outperforms a D-Wave quantum\nannealer standard linear-schedule anneal, while iterated reverse quantum\nannealing performs much better than both unmodified linear-schedule quantum\nannealing and iterated warm starting on Loihi 2.\n","authors":["Kyle Henke","Elijah Pelofske","Garrett Kenyon","Georg Hahn"],"pdf_url":"https://arxiv.org/pdf/2405.20525v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.17138v2","updated":"2024-05-30T22:22:54Z","published":"2023-11-28T18:59:06Z","title":"Shadows Don't Lie and Lines Can't Bend! Generative Models don't know\n  Projective Geometry...for now","summary":"  Generative models can produce impressively realistic images. This paper\ndemonstrates that generated images have geometric features different from those\nof real images. We build a set of collections of generated images, prequalified\nto fool simple, signal-based classifiers into believing they are real. We then\nshow that prequalified generated images can be identified reliably by\nclassifiers that only look at geometric properties. We use three such\nclassifiers. All three classifiers are denied access to image pixels, and look\nonly at derived geometric features. The first classifier looks at the\nperspective field of the image, the second looks at lines detected in the\nimage, and the third looks at relations between detected objects and shadows.\nOur procedure detects generated images more reliably than SOTA local signal\nbased detectors, for images from a number of distinct generators. Saliency maps\nsuggest that the classifiers can identify geometric problems reliably. We\nconclude that current generators cannot reliably reproduce geometric properties\nof real images.\n","authors":["Ayush Sarkar","Hanlin Mai","Amitabh Mahapatra","Svetlana Lazebnik","D. A. Forsyth","Anand Bhattad"],"pdf_url":"https://arxiv.org/pdf/2311.17138v2.pdf","comment":"Project Page: https://projective-geometry.github.io | First three\n  authors contributed equally"},{"id":"http://arxiv.org/abs/2405.20513v1","updated":"2024-05-30T22:13:17Z","published":"2024-05-30T22:13:17Z","title":"Deep Modeling of Non-Gaussian Aleatoric Uncertainty","summary":"  Deep learning offers promising new ways to accurately model aleatoric\nuncertainty in robotic estimation systems, particularly when the uncertainty\ndistributions do not conform to traditional assumptions of being fixed and\nGaussian. In this study, we formulate and evaluate three fundamental deep\nlearning approaches for conditional probability density modeling to quantify\nnon-Gaussian aleatoric uncertainty: parametric, discretized, and generative\nmodeling. We systematically compare the respective strengths and weaknesses of\nthese three methods on simulated non-Gaussian densities as well as on\nreal-world terrain-relative navigation data. Our results show that these deep\nlearning methods can accurately capture complex uncertainty patterns,\nhighlighting their potential for improving the reliability and robustness of\nestimation systems.\n","authors":["Aastha Acharya","Caleb Lee","Marissa D'Alonzo","Jared Shamwell","Nisar R. Ahmed","Rebecca Russell"],"pdf_url":"https://arxiv.org/pdf/2405.20513v1.pdf","comment":"8 pages, 7 figures"},{"id":"http://arxiv.org/abs/2405.20510v1","updated":"2024-05-30T21:59:29Z","published":"2024-05-30T21:59:29Z","title":"Physically Compatible 3D Object Modeling from a Single Image","summary":"  We present a computational framework that transforms single images into 3D\nphysical objects. The visual geometry of a physical object in an image is\ndetermined by three orthogonal attributes: mechanical properties, external\nforces, and rest-shape geometry. Existing single-view 3D reconstruction methods\noften overlook this underlying composition, presuming rigidity or neglecting\nexternal forces. Consequently, the reconstructed objects fail to withstand\nreal-world physical forces, resulting in instability or undesirable deformation\n-- diverging from their intended designs as depicted in the image. Our\noptimization framework addresses this by embedding physical compatibility into\nthe reconstruction process. We explicitly decompose the three physical\nattributes and link them through static equilibrium, which serves as a hard\nconstraint, ensuring that the optimized physical shapes exhibit desired\nphysical behaviors. Evaluations on a dataset collected from Objaverse\ndemonstrate that our framework consistently enhances the physical realism of 3D\nmodels over existing methods. The utility of our framework extends to practical\napplications in dynamic simulations and 3D printing, where adherence to\nphysical compatibility is paramount.\n","authors":["Minghao Guo","Bohan Wang","Pingchuan Ma","Tianyuan Zhang","Crystal Elaine Owens","Chuang Gan","Joshua B. Tenenbaum","Kaiming He","Wojciech Matusik"],"pdf_url":"https://arxiv.org/pdf/2405.20510v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.20501v1","updated":"2024-05-30T21:42:54Z","published":"2024-05-30T21:42:54Z","title":"ShelfHelp: Empowering Humans to Perform Vision-Independent Manipulation\n  Tasks with a Socially Assistive Robotic Cane","summary":"  The ability to shop independently, especially in grocery stores, is important\nfor maintaining a high quality of life. This can be particularly challenging\nfor people with visual impairments (PVI). Stores carry thousands of products,\nwith approximately 30,000 new products introduced each year in the US market\nalone, presenting a challenge even for modern computer vision solutions.\nThrough this work, we present a proof-of-concept socially assistive robotic\nsystem we call ShelfHelp, and propose novel technical solutions for enhancing\ninstrumented canes traditionally meant for navigation tasks with additional\ncapability within the domain of shopping. ShelfHelp includes a novel visual\nproduct locator algorithm designed for use in grocery stores and a novel\nplanner that autonomously issues verbal manipulation guidance commands to guide\nthe user during product retrieval. Through a human subjects study, we show the\nsystem's success in locating and providing effective manipulation guidance to\nretrieve desired products with novice users. We compare two autonomous verbal\nguidance modes achieving comparable performance to a human assistance baseline\nand present encouraging findings that validate our system's efficiency and\neffectiveness and through positive subjective metrics including competence,\nintelligence, and ease of use.\n","authors":["Shivendra Agrawal","Suresh Nayak","Ashutosh Naik","Bradley Hayes"],"pdf_url":"https://arxiv.org/pdf/2405.20501v1.pdf","comment":"8 pages, 14 figures and charts"},{"id":"http://arxiv.org/abs/2405.20494v1","updated":"2024-05-30T21:35:48Z","published":"2024-05-30T21:35:48Z","title":"Slight Corruption in Pre-training Data Makes Better Diffusion Models","summary":"  Diffusion models (DMs) have shown remarkable capabilities in generating\nrealistic high-quality images, audios, and videos. They benefit significantly\nfrom extensive pre-training on large-scale datasets, including web-crawled data\nwith paired data and conditions, such as image-text and image-class pairs.\nDespite rigorous filtering, these pre-training datasets often inevitably\ncontain corrupted pairs where conditions do not accurately describe the data.\nThis paper presents the first comprehensive study on the impact of such\ncorruption in pre-training data of DMs. We synthetically corrupt ImageNet-1K\nand CC3M to pre-train and evaluate over 50 conditional DMs. Our empirical\nfindings reveal that various types of slight corruption in pre-training can\nsignificantly enhance the quality, diversity, and fidelity of the generated\nimages across different DMs, both during pre-training and downstream adaptation\nstages. Theoretically, we consider a Gaussian mixture model and prove that\nslight corruption in the condition leads to higher entropy and a reduced\n2-Wasserstein distance to the ground truth of the data distribution generated\nby the corruptly trained DMs. Inspired by our analysis, we propose a simple\nmethod to improve the training of DMs on practical datasets by adding condition\nembedding perturbations (CEP). CEP significantly improves the performance of\nvarious DMs in both pre-training and downstream tasks. We hope that our study\nprovides new insights into understanding the data and pre-training processes of\nDMs.\n","authors":["Hao Chen","Yujin Han","Diganta Misra","Xiang Li","Kai Hu","Difan Zou","Masashi Sugiyama","Jindong Wang","Bhiksha Raj"],"pdf_url":"https://arxiv.org/pdf/2405.20494v1.pdf","comment":"50 pages, 33 figures, 4 tables"},{"id":"http://arxiv.org/abs/2309.02691v3","updated":"2024-05-30T21:16:29Z","published":"2023-09-06T03:54:57Z","title":"A Joint Study of Phrase Grounding and Task Performance in Vision and\n  Language Models","summary":"  Key to tasks that require reasoning about natural language in visual contexts\nis grounding words and phrases to image regions. However, observing this\ngrounding in contemporary models is complex, even if it is generally expected\nto take place if the task is addressed in a way that is conductive to\ngeneralization. We propose a framework to jointly study task performance and\nphrase grounding, and propose three benchmarks to study the relation between\nthe two. Our results show that contemporary models demonstrate inconsistency\nbetween their ability to ground phrases and solve tasks. We show how this can\nbe addressed through brute-force training on ground phrasing annotations, and\nanalyze the dynamics it creates. Code and at available at\nhttps://github.com/lil-lab/phrase_grounding.\n","authors":["Noriyuki Kojima","Hadar Averbuch-Elor","Yoav Artzi"],"pdf_url":"https://arxiv.org/pdf/2309.02691v3.pdf","comment":"This was published in TMLR in 2024, on January 24th"},{"id":"http://arxiv.org/abs/2402.03299v4","updated":"2024-05-30T21:14:26Z","published":"2024-02-05T18:54:43Z","title":"GUARD: Role-playing to Generate Natural-language Jailbreakings to Test\n  Guideline Adherence of Large Language Models","summary":"  The discovery of \"jailbreaks\" to bypass safety filters of Large Language\nModels (LLMs) and harmful responses have encouraged the community to implement\nsafety measures. One major safety measure is to proactively test the LLMs with\njailbreaks prior to the release. Therefore, such testing will require a method\nthat can generate jailbreaks massively and efficiently. In this paper, we\nfollow a novel yet intuitive strategy to generate jailbreaks in the style of\nthe human generation. We propose a role-playing system that assigns four\ndifferent roles to the user LLMs to collaborate on new jailbreaks. Furthermore,\nwe collect existing jailbreaks and split them into different independent\ncharacteristics using clustering frequency and semantic patterns sentence by\nsentence. We organize these characteristics into a knowledge graph, making them\nmore accessible and easier to retrieve. Our system of different roles will\nleverage this knowledge graph to generate new jailbreaks, which have proved\neffective in inducing LLMs to generate unethical or guideline-violating\nresponses. In addition, we also pioneer a setting in our system that will\nautomatically follow the government-issued guidelines to generate jailbreaks to\ntest whether LLMs follow the guidelines accordingly. We refer to our system as\nGUARD (Guideline Upholding through Adaptive Role-play Diagnostics). We have\nempirically validated the effectiveness of GUARD on three cutting-edge\nopen-sourced LLMs (Vicuna-13B, LongChat-7B, and Llama-2-7B), as well as a\nwidely-utilized commercial LLM (ChatGPT). Moreover, our work extends to the\nrealm of vision language models (MiniGPT-v2 and Gemini Vision Pro), showcasing\nGUARD's versatility and contributing valuable insights for the development of\nsafer, more reliable LLM-based applications across diverse modalities.\n","authors":["Haibo Jin","Ruoxi Chen","Andy Zhou","Yang Zhang","Haohan Wang"],"pdf_url":"https://arxiv.org/pdf/2402.03299v4.pdf","comment":"28 papges"},{"id":"http://arxiv.org/abs/2402.01335v2","updated":"2024-05-30T21:04:36Z","published":"2024-02-02T11:40:27Z","title":"Simulator-Free Visual Domain Randomization via Video Games","summary":"  Domain randomization is an effective computer vision technique for improving\ntransferability of vision models across visually distinct domains exhibiting\nsimilar content. Existing approaches, however, rely extensively on tweaking\ncomplex and specialized simulation engines that are difficult to construct,\nsubsequently affecting their feasibility and scalability. This paper introduces\nBehAVE, a video understanding framework that uniquely leverages the plethora of\nexisting commercial video games for domain randomization, without requiring\naccess to their simulation engines. Under BehAVE (1) the inherent rich visual\ndiversity of video games acts as the source of randomization and (2) player\nbehavior -- represented semantically via textual descriptions of actions --\nguides the *alignment* of videos with similar content. We test BehAVE on 25\ngames of the first-person shooter (FPS) genre across various video and text\nfoundation models and we report its robustness for domain randomization. BehAVE\nsuccessfully aligns player behavioral patterns and is able to zero-shot\ntransfer them to multiple unseen FPS games when trained on just one FPS game.\nIn a more challenging setting, BehAVE manages to improve the zero-shot\ntransferability of foundation models to unseen FPS games (up to 22%) even when\ntrained on a game of a different genre (Minecraft). Code and dataset can be\nfound at https://github.com/nrasajski/BehAVE.\n","authors":["Chintan Trivedi","Nemanja Rašajski","Konstantinos Makantasis","Antonios Liapis","Georgios N. Yannakakis"],"pdf_url":"https://arxiv.org/pdf/2402.01335v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2211.10636v5","updated":"2024-05-30T20:58:39Z","published":"2022-11-19T09:57:01Z","title":"EVEREST: Efficient Masked Video Autoencoder by Removing Redundant\n  Spatiotemporal Tokens","summary":"  Masked Video Autoencoder (MVA) approaches have demonstrated their potential\nby significantly outperforming previous video representation learning methods.\nHowever, they waste an excessive amount of computations and memory in\npredicting uninformative tokens/frames due to random masking strategies. (e.g.,\nover 16 nodes with 128 NVIDIA A100 GPUs). To resolve this issue, we exploit the\nunequal information density among the patches in videos and propose EVEREST, a\nsurprisingly efficient MVA approach for video representation learning that\nfinds tokens containing rich motion features and discards uninformative ones\nduring both pre-training and fine-tuning. We further present an\ninformation-intensive frame selection strategy that allows the model to focus\non informative and causal frames with minimal redundancy. Our method\nsignificantly reduces the computation and memory requirements of MVA, enabling\nthe pre-training and fine-tuning on a single machine with 8 GPUs while\nachieving comparable performance to computation- and memory-heavy baselines on\nmultiple benchmarks and the uncurated Ego4D dataset. We hope that our work\ncontributes to reducing the barrier to further research on video understanding.\n","authors":["Sunil Hwang","Jaehong Yoon","Youngwan Lee","Sung Ju Hwang"],"pdf_url":"https://arxiv.org/pdf/2211.10636v5.pdf","comment":"Accepted by ICML 2024"},{"id":"http://arxiv.org/abs/2405.20470v1","updated":"2024-05-30T20:41:12Z","published":"2024-05-30T20:41:12Z","title":"STHN: Deep Homography Estimation for UAV Thermal Geo-localization with\n  Satellite Imagery","summary":"  Accurate geo-localization of Unmanned Aerial Vehicles (UAVs) is crucial for a\nvariety of outdoor applications including search and rescue operations, power\nline inspections, and environmental monitoring. The vulnerability of Global\nNavigation Satellite Systems (GNSS) signals to interference and spoofing\nnecessitates the development of additional robust localization methods for\nautonomous navigation. Visual Geo-localization (VG), leveraging onboard cameras\nand reference satellite maps, offers a promising solution for absolute\nlocalization. Specifically, Thermal Geo-localization (TG), which relies on\nimage-based matching between thermal imagery with satellite databases, stands\nout by utilizing infrared cameras for effective night-time localization.\nHowever, the efficiency and effectiveness of current TG approaches, are\nhindered by dense sampling on satellite maps and geometric noises in thermal\nquery images. To overcome these challenges, in this paper, we introduce STHN, a\nnovel UAV thermal geo-localization approach that employs a coarse-to-fine deep\nhomography estimation method. This method attains reliable thermal\ngeo-localization within a 512-meter radius of the UAV's last known location\neven with a challenging 11% overlap between satellite and thermal images,\ndespite the presence of indistinct textures in thermal imagery and self-similar\npatterns in both spectra. Our research significantly enhances UAV thermal\ngeo-localization performance and robustness against the impacts of geometric\nnoises under low-visibility conditions in the wild. The code will be made\npublicly available.\n","authors":["Jiuhong Xiao","Ning Zhang","Daniel Tortei","Giuseppe Loianno"],"pdf_url":"https://arxiv.org/pdf/2405.20470v1.pdf","comment":"8 pages, 7 figures. This work has been submitted to the IEEE for\n  possible publication. Copyright may be transferred without notice, after\n  which this version may no longer be accessible"},{"id":"http://arxiv.org/abs/2405.20469v1","updated":"2024-05-30T20:37:34Z","published":"2024-05-30T20:37:34Z","title":"Is Synthetic Data all We Need? Benchmarking the Robustness of Models\n  Trained with Synthetic Images","summary":"  A long-standing challenge in developing machine learning approaches has been\nthe lack of high-quality labeled data. Recently, models trained with purely\nsynthetic data, here termed synthetic clones, generated using large-scale\npre-trained diffusion models have shown promising results in overcoming this\nannotation bottleneck. As these synthetic clone models progress, they are\nlikely to be deployed in challenging real-world settings, yet their suitability\nremains understudied. Our work addresses this gap by providing the first\nbenchmark for three classes of synthetic clone models, namely supervised,\nself-supervised, and multi-modal ones, across a range of robustness measures.\nWe show that existing synthetic self-supervised and multi-modal clones are\ncomparable to or outperform state-of-the-art real-image baselines for a range\nof robustness metrics - shape bias, background bias, calibration, etc. However,\nwe also find that synthetic clones are much more susceptible to adversarial and\nreal-world noise than models trained with real data. To address this, we find\nthat combining both real and synthetic data further increases the robustness,\nand that the choice of prompt used for generating synthetic images plays an\nimportant part in the robustness of synthetic clones.\n","authors":["Krishnakant Singh","Thanush Navaratnam","Jannik Holmer","Simone Schaub-Meyer","Stefan Roth"],"pdf_url":"https://arxiv.org/pdf/2405.20469v1.pdf","comment":"Accepted at CVPR 2024 Workshop: SyntaGen-Harnessing Generative Models\n  for Synthetic Visual Datasets. Project page at\n  https://synbenchmark.github.io/SynCloneBenchmark"},{"id":"http://arxiv.org/abs/2405.20465v1","updated":"2024-05-30T20:26:47Z","published":"2024-05-30T20:26:47Z","title":"ENTIRe-ID: An Extensive and Diverse Dataset for Person Re-Identification","summary":"  The growing importance of person reidentification in computer vision has\nhighlighted the need for more extensive and diverse datasets. In response, we\nintroduce the ENTIRe-ID dataset, an extensive collection comprising over 4.45\nmillion images from 37 different cameras in varied environments. This dataset\nis uniquely designed to tackle the challenges of domain variability and model\ngeneralization, areas where existing datasets for person re-identification have\nfallen short. The ENTIRe-ID dataset stands out for its coverage of a wide array\nof real-world scenarios, encompassing various lighting conditions, angles of\nview, and diverse human activities. This design ensures a realistic and robust\ntraining platform for ReID models. The ENTIRe-ID dataset is publicly available\nat https://serdaryildiz.github.io/ENTIRe-ID\n","authors":["Serdar Yildiz","Ahmet Nezih Kasim"],"pdf_url":"https://arxiv.org/pdf/2405.20465v1.pdf","comment":"5 pages, 2024 18th International Conference on Automatic Face and\n  Gesture Recognition (FG)"},{"id":"http://arxiv.org/abs/2405.20462v1","updated":"2024-05-30T20:19:42Z","published":"2024-05-30T20:19:42Z","title":"Multi-Label Guided Soft Contrastive Learning for Efficient Earth\n  Observation Pretraining","summary":"  Self-supervised pretraining on large-scale satellite data has raised great\ninterest in building Earth observation (EO) foundation models. However, many\nimportant resources beyond pure satellite imagery, such as land-cover-land-use\nproducts that provide free global semantic information, as well as vision\nfoundation models that hold strong knowledge of the natural world, tend to be\noverlooked. In this work, we show these free additional resources not only help\nresolve common contrastive learning bottlenecks, but also significantly boost\nthe efficiency and effectiveness of EO pretraining.\n  Specifically, we first propose soft contrastive learning that optimizes\ncross-scene soft similarity based on land-cover-generated multi-label\nsupervision, naturally solving the issue of multiple positive samples and too\nstrict positive matching in complex scenes. Second, we explore cross-domain\ncontinual pretraining for both multispectral and SAR imagery, building\nefficient EO foundation models from strongest vision models such as DINOv2.\nIntegrating simple weight-initialization and Siamese masking strategies into\nour soft contrastive learning framework, we demonstrate impressive continual\npretraining performance even when the input channels and modalities are not\naligned.\n  Without prohibitive training, we produce multispectral and SAR foundation\nmodels that achieve significantly better results in 9 out of 10 downstream\ntasks than most existing SOTA models. For example, our ResNet50/ViT-S achieve\n84.8/85.0 linear probing mAP scores on BigEarthNet-10\\% which are better than\nmost existing ViT-L models; under the same setting, our ViT-B sets a new record\nof 86.8 in multispectral, and 82.5 in SAR, the latter even better than many\nmultispectral models. Dataset and models are available at\nhttps://github.com/zhu-xlab/softcon.\n","authors":["Yi Wang","Conrad M Albrecht","Xiao Xiang Zhu"],"pdf_url":"https://arxiv.org/pdf/2405.20462v1.pdf","comment":"16 pages, 9 figures"},{"id":"http://arxiv.org/abs/2405.20459v1","updated":"2024-05-30T20:12:14Z","published":"2024-05-30T20:12:14Z","title":"On Calibration of Object Detectors: Pitfalls, Evaluation and Baselines","summary":"  Reliable usage of object detectors require them to be calibrated -- a crucial\nproblem that requires careful attention. Recent approaches towards this involve\n(1) designing new loss functions to obtain calibrated detectors by training\nthem from scratch, and (2) post-hoc Temperature Scaling (TS) that learns to\nscale the likelihood of a trained detector to output calibrated predictions.\nThese approaches are then evaluated based on a combination of Detection\nExpected Calibration Error (D-ECE) and Average Precision. In this work, via\nextensive analysis and insights, we highlight that these recent evaluation\nframeworks, evaluation metrics, and the use of TS have notable drawbacks\nleading to incorrect conclusions. As a step towards fixing these issues, we\npropose a principled evaluation framework to jointly measure calibration and\naccuracy of object detectors. We also tailor efficient and easy-to-use post-hoc\ncalibration approaches such as Platt Scaling and Isotonic Regression\nspecifically for object detection task. Contrary to the common notion, our\nexperiments show that once designed and evaluated properly, post-hoc\ncalibrators, which are extremely cheap to build and use, are much more powerful\nand effective than the recent train-time calibration methods. To illustrate,\nD-DETR with our post-hoc Isotonic Regression calibrator outperforms the recent\ntrain-time state-of-the-art calibration method Cal-DETR by more than 7 D-ECE on\nthe COCO dataset. Additionally, we propose improved versions of the recently\nproposed Localization-aware ECE and show the efficacy of our method on these\nmetrics as well. Code is available at:\nhttps://github.com/fiveai/detection_calibration.\n","authors":["Selim Kuzucu","Kemal Oksuz","Jonathan Sadeghi","Puneet K. Dokania"],"pdf_url":"https://arxiv.org/pdf/2405.20459v1.pdf","comment":"31 pages, 8 figures"},{"id":"http://arxiv.org/abs/2405.20443v1","updated":"2024-05-30T19:40:08Z","published":"2024-05-30T19:40:08Z","title":"P-MSDiff: Parallel Multi-Scale Diffusion for Remote Sensing Image\n  Segmentation","summary":"  Diffusion models and multi-scale features are essential components in\nsemantic segmentation tasks that deal with remote-sensing images. They\ncontribute to improved segmentation boundaries and offer significant contextual\ninformation. U-net-like architectures are frequently employed in diffusion\nmodels for segmentation tasks. These architectural designs include dense skip\nconnections that may pose challenges for interpreting intermediate features.\nConsequently, they might not efficiently convey semantic information throughout\nvarious layers of the encoder-decoder architecture. To address these\nchallenges, we propose a new model for semantic segmentation known as the\ndiffusion model with parallel multi-scale branches. This model consists of\nParallel Multiscale Diffusion modules (P-MSDiff) and a Cross-Bridge Linear\nAttention mechanism (CBLA). P-MSDiff enhances the understanding of semantic\ninformation across multiple levels of granularity and detects repetitive\ndistribution data through the integration of recursive denoising branches. It\nfurther facilitates the amalgamation of data by connecting relevant branches to\nthe primary framework to enable concurrent denoising. Furthermore, within the\ninterconnected transformer architecture, the LA module has been substituted\nwith the CBLA module. This module integrates a semidefinite matrix linked to\nthe query into the dot product computation of keys and values. This integration\nenables the adaptation of queries within the LA framework. This adjustment\nenhances the structure for multi-head attention computation, leading to\nenhanced network performance and CBLA is a plug-and-play module. Our model\ndemonstrates superior performance based on the J1 metric on both the UAVid and\nVaihingen Building datasets, showing improvements of 1.60% and 1.40% over\nstrong baseline models, respectively.\n","authors":["Qi Zhang","Guohua Geng","Longquan Yan","Pengbo Zhou","Zhaodi Li","Kang Li","Qinglin Liu"],"pdf_url":"https://arxiv.org/pdf/2405.20443v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2304.01716v4","updated":"2024-05-30T19:21:41Z","published":"2023-04-04T11:25:44Z","title":"Decoupling Dynamic Monocular Videos for Dynamic View Synthesis","summary":"  The challenge of dynamic view synthesis from dynamic monocular videos, i.e.,\nsynthesizing novel views for free viewpoints given a monocular video of a\ndynamic scene captured by a moving camera, mainly lies in accurately modeling\nthe \\textbf{dynamic objects} of a scene using limited 2D frames, each with a\nvarying timestamp and viewpoint. Existing methods usually require pre-processed\n2D optical flow and depth maps by off-the-shelf methods to supervise the\nnetwork, making them suffer from the inaccuracy of the pre-processed\nsupervision and the ambiguity when lifting the 2D information to 3D. In this\npaper, we tackle this challenge in an unsupervised fashion. Specifically, we\ndecouple the motion of the dynamic objects into object motion and camera\nmotion, respectively regularized by proposed unsupervised surface consistency\nand patch-based multi-view constraints. The former enforces the 3D geometric\nsurfaces of moving objects to be consistent over time, while the latter\nregularizes their appearances to be consistent across different viewpoints.\nSuch a fine-grained motion formulation can alleviate the learning difficulty\nfor the network, thus enabling it to produce not only novel views with higher\nquality but also more accurate scene flows and depth than existing methods\nrequiring extra supervision.\n","authors":["Meng You","Junhui Hou"],"pdf_url":"https://arxiv.org/pdf/2304.01716v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.20431v1","updated":"2024-05-30T19:21:33Z","published":"2024-05-30T19:21:33Z","title":"Exploring the Practicality of Federated Learning: A Survey Towards the\n  Communication Perspective","summary":"  Federated Learning (FL) is a promising paradigm that offers significant\nadvancements in privacy-preserving, decentralized machine learning by enabling\ncollaborative training of models across distributed devices without\ncentralizing data. However, the practical deployment of FL systems faces a\nsignificant bottleneck: the communication overhead caused by frequently\nexchanging large model updates between numerous devices and a central server.\nThis communication inefficiency can hinder training speed, model performance,\nand the overall feasibility of real-world FL applications. In this survey, we\ninvestigate various strategies and advancements made in communication-efficient\nFL, highlighting their impact and potential to overcome the communication\nchallenges inherent in FL systems. Specifically, we define measures for\ncommunication efficiency, analyze sources of communication inefficiency in FL\nsystems, and provide a taxonomy and comprehensive review of state-of-the-art\ncommunication-efficient FL methods. Additionally, we discuss promising future\nresearch directions for enhancing the communication efficiency of FL systems.\nBy addressing the communication bottleneck, FL can be effectively applied and\nenable scalable and practical deployment across diverse applications that\nrequire privacy-preserving, decentralized machine learning, such as IoT,\nhealthcare, or finance.\n","authors":["Khiem Le","Nhan Luong-Ha","Manh Nguyen-Duc","Danh Le-Phuoc","Cuong Do","Kok-Seng Wong"],"pdf_url":"https://arxiv.org/pdf/2405.20431v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.20420v1","updated":"2024-05-30T18:55:50Z","published":"2024-05-30T18:55:50Z","title":"Back to the Basics on Predicting Transfer Performance","summary":"  In the evolving landscape of deep learning, selecting the best pre-trained\nmodels from a growing number of choices is a challenge. Transferability scorers\npropose alleviating this scenario, but their recent proliferation, ironically,\nposes the challenge of their own assessment. In this work, we propose both\nrobust benchmark guidelines for transferability scorers, and a well-founded\ntechnique to combine multiple scorers, which we show consistently improves\ntheir results. We extensively evaluate 13 scorers from literature across 11\ndatasets, comprising generalist, fine-grained, and medical imaging datasets. We\nshow that few scorers match the predictive performance of the simple raw metric\nof models on ImageNet, and that all predictors suffer on medical datasets. Our\nresults highlight the potential of combining different information sources for\nreliably predicting transferability across varied domains.\n","authors":["Levy Chaves","Eduardo Valle","Alceu Bissoto","Sandra Avila"],"pdf_url":"https://arxiv.org/pdf/2405.20420v1.pdf","comment":"15 pages, 3 figures, 2 tables"},{"id":"http://arxiv.org/abs/2405.20413v1","updated":"2024-05-30T18:38:36Z","published":"2024-05-30T18:38:36Z","title":"Jailbreaking Large Language Models Against Moderation Guardrails via\n  Cipher Characters","summary":"  Large Language Models (LLMs) are typically harmless but remain vulnerable to\ncarefully crafted prompts known as ``jailbreaks'', which can bypass protective\nmeasures and induce harmful behavior. Recent advancements in LLMs have\nincorporated moderation guardrails that can filter outputs, which trigger\nprocessing errors for certain malicious questions. Existing red-teaming\nbenchmarks often neglect to include questions that trigger moderation\nguardrails, making it difficult to evaluate jailbreak effectiveness. To address\nthis issue, we introduce JAMBench, a harmful behavior benchmark designed to\ntrigger and evaluate moderation guardrails. JAMBench involves 160 manually\ncrafted instructions covering four major risk categories at multiple severity\nlevels. Furthermore, we propose a jailbreak method, JAM (Jailbreak Against\nModeration), designed to attack moderation guardrails using jailbreak prefixes\nto bypass input-level filters and a fine-tuned shadow model functionally\nequivalent to the guardrail model to generate cipher characters to bypass\noutput-level filters. Our extensive experiments on four LLMs demonstrate that\nJAM achieves higher jailbreak success ($\\sim$ $\\times$ 19.88) and lower\nfiltered-out rates ($\\sim$ $\\times$ 1/6) than baselines.\n","authors":["Haibo Jin","Andy Zhou","Joe D. Menke","Haohan Wang"],"pdf_url":"https://arxiv.org/pdf/2405.20413v1.pdf","comment":"20 pages"},{"id":"http://arxiv.org/abs/2405.20392v1","updated":"2024-05-30T18:04:58Z","published":"2024-05-30T18:04:58Z","title":"Can No-Reference Quality-Assessment Methods Serve as Perceptual Losses\n  for Super-Resolution?","summary":"  Perceptual losses play an important role in constructing\ndeep-neural-network-based methods by increasing the naturalness and realism of\nprocessed images and videos. Use of perceptual losses is often limited to\nLPIPS, a fullreference method. Even though deep no-reference\nimage-qualityassessment methods are excellent at predicting human judgment,\nlittle research has examined their incorporation in loss functions. This paper\ninvestigates direct optimization of several video-superresolution models using\nno-reference image-quality-assessment methods as perceptual losses. Our\nexperimental results show that straightforward optimization of these methods\nproduce artifacts, but a special training procedure can mitigate them.\n","authors":["Egor Kashkarov","Egor Chistov","Ivan Molodetskikh","Dmitriy Vatolin"],"pdf_url":"https://arxiv.org/pdf/2405.20392v1.pdf","comment":"4 pages, 3 figures. The first two authors contributed equally to this\n  work"},{"id":"http://arxiv.org/abs/2405.20380v1","updated":"2024-05-30T18:00:03Z","published":"2024-05-30T18:00:03Z","title":"Gradient Inversion of Federated Diffusion Models","summary":"  Diffusion models are becoming defector generative models, which generate\nexceptionally high-resolution image data. Training effective diffusion models\nrequire massive real data, which is privately owned by distributed parties.\nEach data party can collaboratively train diffusion models in a federated\nlearning manner by sharing gradients instead of the raw data. In this paper, we\nstudy the privacy leakage risk of gradient inversion attacks. First, we design\na two-phase fusion optimization, GIDM, to leverage the well-trained generative\nmodel itself as prior knowledge to constrain the inversion search (latent)\nspace, followed by pixel-wise fine-tuning. GIDM is shown to be able to\nreconstruct images almost identical to the original ones. Considering a more\nprivacy-preserving training scenario, we then argue that locally initialized\nprivate training noise $\\epsilon$ and sampling step t may raise additional\nchallenges for the inversion attack. To solve this, we propose a\ntriple-optimization GIDM+ that coordinates the optimization of the unknown\ndata, $\\epsilon$ and $t$. Our extensive evaluation results demonstrate the\nvulnerability of sharing gradient for data protection of diffusion models, even\nhigh-resolution images can be reconstructed with high quality.\n","authors":["Jiyue Huang","Chi Hong","Lydia Y. Chen","Stefanie Roos"],"pdf_url":"https://arxiv.org/pdf/2405.20380v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.20364v1","updated":"2024-05-30T17:59:51Z","published":"2024-05-30T17:59:51Z","title":"Learning 3D Robotics Perception using Inductive Priors","summary":"  Recent advances in deep learning have led to a data-centric intelligence i.e.\nartificially intelligent models unlocking the potential to ingest a large\namount of data and be really good at performing digital tasks such as\ntext-to-image generation, machine-human conversation, and image recognition.\nThis thesis covers the topic of learning with structured inductive bias and\npriors to design approaches and algorithms unlocking the potential of\nprinciple-centric intelligence. Prior knowledge (priors for short), often\navailable in terms of past experience as well as assumptions of how the world\nworks, helps the autonomous agent generalize better and adapt their behavior\nbased on past experience. In this thesis, I demonstrate the use of prior\nknowledge in three different robotics perception problems. 1. object-centric 3D\nreconstruction, 2. vision and language for decision-making, and 3. 3D scene\nunderstanding. To solve these challenging problems, I propose various sources\nof prior knowledge including 1. geometry and appearance priors from synthetic\ndata, 2. modularity and semantic map priors and 3. semantic, structural, and\ncontextual priors. I study these priors for solving robotics 3D perception\ntasks and propose ways to efficiently encode them in deep learning models. Some\npriors are used to warm-start the network for transfer learning, others are\nused as hard constraints to restrict the action space of robotics agents. While\nclassical techniques are brittle and fail to generalize to unseen scenarios and\ndata-centric approaches require a large amount of labeled data, this thesis\naims to build intelligent agents which require very-less real-world data or\ndata acquired only from simulation to generalize to highly dynamic and\ncluttered environments in novel simulations (i.e. sim2sim) or real-world unseen\nenvironments (i.e. sim2real) for a holistic scene understanding of the 3D\nworld.\n","authors":["Muhammad Zubair Irshad"],"pdf_url":"https://arxiv.org/pdf/2405.20364v1.pdf","comment":"Georgia Tech Ph.D. Thesis, December 2023. For more details:\n  https://zubairirshad.com/"}],"Information Retrieval":[{"id":"http://arxiv.org/abs/2403.19546v2","updated":"2024-05-30T16:20:04Z","published":"2024-03-28T16:27:26Z","title":"Croissant: A Metadata Format for ML-Ready Datasets","summary":"  Data is a critical resource for Machine Learning (ML), yet working with data\nremains a key friction point. This paper introduces Croissant, a metadata\nformat for datasets that simplifies how data is used by ML tools and\nframeworks. Croissant makes datasets more discoverable, portable and\ninteroperable, thereby addressing significant challenges in ML data management\nand responsible AI. Croissant is already supported by several popular dataset\nrepositories, spanning hundreds of thousands of datasets, ready to be loaded\ninto the most popular ML frameworks.\n","authors":["Mubashara Akhtar","Omar Benjelloun","Costanza Conforti","Pieter Gijsbers","Joan Giner-Miguelez","Nitisha Jain","Michael Kuchnik","Quentin Lhoest","Pierre Marcenac","Manil Maskey","Peter Mattson","Luis Oala","Pierre Ruyssen","Rajat Shinde","Elena Simperl","Goeffry Thomas","Slava Tykhonov","Joaquin Vanschoren","Jos van der Velde","Steffen Vogler","Carole-Jean Wu"],"pdf_url":"https://arxiv.org/pdf/2403.19546v2.pdf","comment":"Published in Proceedings of ACM SIGMOD/PODS'24 Data Management for\n  End-to-End Machine Learning (DEEM) Workshop\n  https://dl.acm.org/doi/10.1145/3650203.3663326"},{"id":"http://arxiv.org/abs/2405.20204v1","updated":"2024-05-30T16:07:54Z","published":"2024-05-30T16:07:54Z","title":"Jina CLIP: Your CLIP Model Is Also Your Text Retriever","summary":"  Contrastive Language-Image Pretraining (CLIP) is widely used to train models\nto align images and texts in a common embedding space by mapping them to\nfixed-sized vectors. These models are key to multimodal information retrieval\nand related tasks. However, CLIP models generally underperform in text-only\ntasks compared to specialized text models. This creates inefficiencies for\ninformation retrieval systems that keep separate embeddings and models for\ntext-only and multimodal tasks. We propose a novel, multi-task contrastive\ntraining method to address this issue, which we use to train the jina-clip-v1\nmodel to achieve the state-of-the-art performance on both text-image and\ntext-text retrieval tasks.\n","authors":["Andreas Koukounas","Georgios Mastrapas","Michael Günther","Bo Wang","Scott Martens","Isabelle Mohr","Saba Sturua","Mohammad Kalim Akram","Joan Fontanals Martínez","Saahil Ognawala","Susana Guzman","Maximilian Werk","Nan Wang","Han Xiao"],"pdf_url":"https://arxiv.org/pdf/2405.20204v1.pdf","comment":"4 pages, ICML2024 workshop submission"},{"id":"http://arxiv.org/abs/2405.19149v2","updated":"2024-05-30T13:26:43Z","published":"2024-05-29T14:52:10Z","title":"CaLa: Complementary Association Learning for Augmenting Composed Image\n  Retrieval","summary":"  Composed Image Retrieval (CIR) involves searching for target images based on\nan image-text pair query. While current methods treat this as a query-target\nmatching problem, we argue that CIR triplets contain additional associations\nbeyond this primary relation. In our paper, we identify two new relations\nwithin triplets, treating each triplet as a graph node. Firstly, we introduce\nthe concept of text-bridged image alignment, where the query text serves as a\nbridge between the query image and the target image. We propose a hinge-based\ncross-attention mechanism to incorporate this relation into network learning.\nSecondly, we explore complementary text reasoning, considering CIR as a form of\ncross-modal retrieval where two images compose to reason about complementary\ntext. To integrate these perspectives effectively, we design a twin\nattention-based compositor. By combining these complementary associations with\nthe explicit query pair-target image relation, we establish a comprehensive set\nof constraints for CIR. Our framework, CaLa (Complementary Association Learning\nfor Augmenting Composed Image Retrieval), leverages these insights. We evaluate\nCaLa on CIRR and FashionIQ benchmarks with multiple backbones, demonstrating\nits superiority in composed image retrieval.\n","authors":["Xintong Jiang","Yaxiong Wang","Mengjian Li","Yujiao Wu","Bingwen Hu","Xueming Qian"],"pdf_url":"https://arxiv.org/pdf/2405.19149v2.pdf","comment":"To appear at SIGIR 2024. arXiv admin note: text overlap with\n  arXiv:2309.02169"},{"id":"http://arxiv.org/abs/2312.12728v3","updated":"2024-05-30T11:25:08Z","published":"2023-12-20T02:55:15Z","title":"Lookahead: An Inference Acceleration Framework for Large Language Model\n  with Lossless Generation Accuracy","summary":"  As Large Language Models (LLMs) have made significant advancements across\nvarious tasks, such as question answering, translation, text summarization, and\ndialogue systems, the need for accuracy in information becomes crucial,\nespecially for serious financial products serving billions of users like\nAlipay. However, for a real-world product serving millions of users, the\ninference speed of LLMs becomes a critical factor compared to a mere\nexperimental model.\n  Hence, this paper presents a generic framework for accelerating the inference\nprocess, resulting in a substantial increase in speed and cost reduction for\nour LLM-based scenarios, with lossless generation accuracy. In the traditional\ninference process, each token is generated sequentially by the LLM, leading to\na time consumption proportional to the number of generated tokens. To enhance\nthis process, our framework, named \\textit{lookahead}, introduces a\n\\textit{multi-branch} strategy. Instead of generating a single token at a time,\nwe propose a Trie-based retrieval and verification mechanism to be able to\naccept several tokens at a forward step. Our strategy offers two distinct\nadvantages: (1) it guarantees absolute correctness of the output, avoiding any\napproximation algorithms, and (2) the worst-case performance of our approach is\nequivalent to the conventional process. We conduct extensive experiments to\ndemonstrate the significant improvements achieved by applying our inference\nacceleration framework. Our framework is widely deployed in Alipay since April\n2023, and obtain remarkable 2.66x to 6.26x speedup. Our code is available at\nhttps://github.com/alipay/PainlessInferenceAcceleration.\n","authors":["Yao Zhao","Zhitian Xie","Chen Liang","Chenyi Zhuang","Jinjie Gu"],"pdf_url":"https://arxiv.org/pdf/2312.12728v3.pdf","comment":"10 pages, 6 figures"},{"id":"http://arxiv.org/abs/2310.09497v2","updated":"2024-05-30T10:03:27Z","published":"2023-10-14T05:20:02Z","title":"A Setwise Approach for Effective and Highly Efficient Zero-shot Ranking\n  with Large Language Models","summary":"  We propose a novel zero-shot document ranking approach based on Large\nLanguage Models (LLMs): the Setwise prompting approach. Our approach\ncomplements existing prompting approaches for LLM-based zero-shot ranking:\nPointwise, Pairwise, and Listwise. Through the first-of-its-kind comparative\nevaluation within a consistent experimental framework and considering factors\nlike model size, token consumption, latency, among others, we show that\nexisting approaches are inherently characterised by trade-offs between\neffectiveness and efficiency. We find that while Pointwise approaches score\nhigh on efficiency, they suffer from poor effectiveness. Conversely, Pairwise\napproaches demonstrate superior effectiveness but incur high computational\noverhead. Our Setwise approach, instead, reduces the number of LLM inferences\nand the amount of prompt token consumption during the ranking procedure,\ncompared to previous methods. This significantly improves the efficiency of\nLLM-based zero-shot ranking, while also retaining high zero-shot ranking\neffectiveness. We make our code and results publicly available at\n\\url{https://github.com/ielab/llm-rankers}.\n","authors":["Shengyao Zhuang","Honglei Zhuang","Bevan Koopman","Guido Zuccon"],"pdf_url":"https://arxiv.org/pdf/2310.09497v2.pdf","comment":"SIGIR2024 full paper"},{"id":"http://arxiv.org/abs/2208.09612v2","updated":"2024-05-30T07:46:20Z","published":"2022-08-20T06:03:35Z","title":"AntCritic: Argument Mining for Free-Form and Visually-Rich Financial\n  Comments","summary":"  Argument mining aims to detect all possible argumentative components and\nidentify their relationships automatically. As a thriving task in natural\nlanguage processing, there has been a large amount of corpus for academic study\nand application development in this field. However, the research in this area\nis still constrained by the inherent limitations of existing datasets.\nSpecifically, all the publicly available datasets are relatively small in\nscale, and few of them provide information from other modalities to facilitate\nthe learning process. Moreover, the statements and expressions in these corpora\nare usually in a compact form, which restricts the generalization ability of\nmodels. To this end, we collect a novel dataset AntCritic to serve as a helpful\ncomplement to this area, which consists of about 10k free-form and\nvisually-rich financial comments and supports both argument component detection\nand argument relation prediction tasks. Besides, to cope with the challenges\nbrought by scenario expansion, we thoroughly explore the fine-grained relation\nprediction and structure reconstruction scheme and discuss the encoding\nmechanism for visual styles and layouts. On this basis, we design two simple\nbut effective model architectures and conduct various experiments on this\ndataset to provide benchmark performances as a reference and verify the\npracticability of our proposed architecture. We release our data and code in\nthis link, and this dataset follows CC BY-NC-ND 4.0 license.\n","authors":["Huadai Liu","Wenqiang Xu","Xuan Lin","Jingjing Huo","Hong Chen","Zhou Zhao"],"pdf_url":"https://arxiv.org/pdf/2208.09612v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.19749v1","updated":"2024-05-30T06:52:01Z","published":"2024-05-30T06:52:01Z","title":"Generating Query Recommendations via LLMs","summary":"  Query recommendation systems are ubiquitous in modern search engines,\nassisting users in producing effective queries to meet their information needs.\nHowever, these systems require a large amount of data to produce good\nrecommendations, such as a large collection of documents to index and query\nlogs. In particular, query logs and user data are not available in cold start\nscenarios. Query logs are expensive to collect and maintain and require complex\nand time-consuming cascading pipelines for creating, combining, and ranking\nrecommendations. To address these issues, we frame the query recommendation\nproblem as a generative task, proposing a novel approach called Generative\nQuery Recommendation (GQR). GQR uses an LLM as its foundation and does not\nrequire to be trained or fine-tuned to tackle the query recommendation problem.\nWe design a prompt that enables the LLM to understand the specific\nrecommendation task, even using a single example. We then improved our system\nby proposing a version that exploits query logs called Retriever-Augmented GQR\n(RA-GQR). RA-GQr dynamically composes its prompt by retrieving similar queries\nfrom query logs. GQR approaches reuses a pre-existing neural architecture\nresulting in a simpler and more ready-to-market approach, even in a cold start\nscenario. Our proposed GQR obtains state-of-the-art performance in terms of\nNDCG@10 and clarity score against two commercial search engines and the\nprevious state-of-the-art approach on the Robust04 and ClueWeb09B collections,\nimproving on average the NDCG@10 performance up to ~4% on Robust04 and\nClueWeb09B w.r.t the previous best competitor. RA-GQR further improve the\nNDCG@10 obtaining an increase of ~11%, ~6\\% on Robust04 and ClueWeb09B w.r.t\nthe best competitor. Furthermore, our system obtained ~59% of user preferences\nin a blind user study, proving that our method produces the most engaging\nqueries.\n","authors":["Andrea Bacciu","Enrico Palumbo","Andreas Damianou","Nicola Tonellotto","Fabrizio Silvestri"],"pdf_url":"https://arxiv.org/pdf/2405.19749v1.pdf","comment":"Generating Query Recommendations via LLMs"},{"id":"http://arxiv.org/abs/2306.08121v2","updated":"2024-05-30T05:53:39Z","published":"2023-06-13T20:34:15Z","title":"Better Generalization with Semantic IDs: A Case Study in Ranking for\n  Recommendations","summary":"  Randomly-hashed item ids are used ubiquitously in recommendation models.\nHowever, the learned representations from random hashing prevents\ngeneralization across similar items, causing problems of learning unseen and\nlong-tail items, especially when item corpus is large, power-law distributed,\nand evolving dynamically. In this paper, we propose using content-derived\nfeatures as a replacement for random ids. We show that simply replacing ID\nfeatures with content-based embeddings can cause a drop in quality due to\nreduced memorization capability. To strike a good balance of memorization and\ngeneralization, we propose to use Semantic IDs -- a compact discrete item\nrepresentation learned from frozen content embeddings using RQ-VAE that\ncaptures the hierarchy of concepts in items -- as a replacement for random item\nids. Similar to content embeddings, the compactness of Semantic IDs poses a\nproblem of easy adaption in recommendation models. We propose novel methods for\nadapting Semantic IDs in industry-scale ranking models, through hashing\nsub-pieces of of the Semantic-ID sequences. In particular, we find that the\nSentencePiece model that is commonly used in LLM tokenization outperforms\nmanually crafted pieces such as N-grams. To the end, we evaluate our approaches\nin a real-world ranking model for YouTube recommendations. Our experiments\ndemonstrate that Semantic IDs can replace the direct use of video IDs by\nimproving the generalization ability on new and long-tail item slices without\nsacrificing overall model quality.\n","authors":["Anima Singh","Trung Vu","Nikhil Mehta","Raghunandan Keshavan","Maheswaran Sathiamoorthy","Yilin Zheng","Lichan Hong","Lukasz Heldt","Li Wei","Devansh Tandon","Ed H. Chi","Xinyang Yi"],"pdf_url":"https://arxiv.org/pdf/2306.08121v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.19689v1","updated":"2024-05-30T05:04:01Z","published":"2024-05-30T05:04:01Z","title":"Uncertainty-aware sign language video retrieval with probability\n  distribution modeling","summary":"  Sign language video retrieval plays a key role in facilitating information\naccess for the deaf community. Despite significant advances in video-text\nretrieval, the complexity and inherent uncertainty of sign language preclude\nthe direct application of these techniques. Previous methods achieve the\nmapping between sign language video and text through fine-grained modal\nalignment. However, due to the scarcity of fine-grained annotation, the\nuncertainty inherent in sign language video is underestimated, limiting the\nfurther development of sign language retrieval tasks. To address this\nchallenge, we propose a novel Uncertainty-aware Probability Distribution\nRetrieval (UPRet), that conceptualizes the mapping process of sign language\nvideo and text in terms of probability distributions, explores their potential\ninterrelationships, and enables flexible mappings. Experiments on three\nbenchmarks demonstrate the effectiveness of our method, which achieves\nstate-of-the-art results on How2Sign (59.1%), PHOENIX-2014T (72.0%), and\nCSL-Daily (78.4%).\n","authors":["Xuan Wu","Hongxiang Li","Yuanjiang Luo","Xuxin Cheng","Xianwei Zhuang","Meng Cao","Keren Fu"],"pdf_url":"https://arxiv.org/pdf/2405.19689v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.19612v1","updated":"2024-05-30T02:00:03Z","published":"2024-05-30T02:00:03Z","title":"Keyword-driven Retrieval-Augmented Large Language Models for Cold-start\n  User Recommendations","summary":"  Recent advancements in Large Language Models (LLMs) have shown significant\npotential in enhancing recommender systems. However, addressing the cold-start\nrecommendation problem, where users lack historical data, remains a\nconsiderable challenge. In this paper, we introduce KALM4Rec (Keyword-driven\nRetrieval-Augmented Large Language Models for Cold-start User Recommendations),\na novel framework specifically designed to tackle this problem by requiring\nonly a few input keywords from users in a practical scenario of cold-start user\nrestaurant recommendations. KALM4Rec operates in two main stages: candidates\nretrieval and LLM-based candidates re-ranking. In the first stage,\nkeyword-driven retrieval models are used to identify potential candidates,\naddressing LLMs' limitations in processing extensive tokens and reducing the\nrisk of generating misleading information. In the second stage, we employ LLMs\nwith various prompting strategies, including zero-shot and few-shot techniques,\nto re-rank these candidates by integrating multiple examples directly into the\nLLM prompts. Our evaluation, using a Yelp restaurant dataset with user reviews\nfrom three English-speaking cities, shows that our proposed framework\nsignificantly improves recommendation quality. Specifically, the integration of\nin-context instructions with LLMs for re-ranking markedly enhances the\nperformance of the cold-start user recommender system.\n","authors":["Hai-Dang Kieu","Minh Duc Nguyen","Thanh-Son Nguyen","Dung D. Le"],"pdf_url":"https://arxiv.org/pdf/2405.19612v1.pdf","comment":"10 pages, 10 figures, 4 tables"},{"id":"http://arxiv.org/abs/2405.20245v1","updated":"2024-05-30T16:54:42Z","published":"2024-05-30T16:54:42Z","title":"Retrieval Augmented Structured Generation: Business Document Information\n  Extraction As Tool Use","summary":"  Business Document Information Extraction (BDIE) is the problem of\ntransforming a blob of unstructured information (raw text, scanned documents,\netc.) into a structured format that downstream systems can parse and use. It\nhas two main tasks: Key-Information Extraction (KIE) and Line Items Recognition\n(LIR). In this paper, we argue that BDIE is best modeled as a Tool Use problem,\nwhere the tools are these downstream systems. We then present Retrieval\nAugmented Structured Generation (RASG), a novel general framework for BDIE that\nachieves state of the art (SOTA) results on both KIE and LIR tasks on BDIE\nbenchmarks.\n  The contributions of this paper are threefold: (1) We show, with ablation\nbenchmarks, that Large Language Models (LLMs) with RASG are already competitive\nwith or surpasses current SOTA Large Multimodal Models (LMMs) without RASG on\nBDIE benchmarks. (2) We propose a new metric class for Line Items Recognition,\nGeneral Line Items Recognition Metric (GLIRM), that is more aligned with\npractical BDIE use cases compared to existing metrics, such as ANLS*, DocILE,\nand GriTS. (3) We provide a heuristic algorithm for backcalculating bounding\nboxes of predicted line items and tables without the need for vision encoders.\nFinally, we claim that, while LMMs might sometimes offer marginal performance\nbenefits, LLMs + RASG is oftentimes superior given real-world applications and\nconstraints of BDIE.\n","authors":["Franz Louis Cesista","Rui Aguiar","Jason Kim","Paolo Acilo"],"pdf_url":"https://arxiv.org/pdf/2405.20245v1.pdf","comment":"Accepted by IEEE 7th International Conference on Multimedia\n  Information Processing and Retrieval (MIPR), 2024"},{"id":"http://arxiv.org/abs/2405.20468v1","updated":"2024-05-30T20:34:37Z","published":"2024-05-30T20:34:37Z","title":"Extending the Massive Text Embedding Benchmark to French","summary":"  In recent years, numerous embedding models have been made available and\nwidely used for various NLP tasks. Choosing a model that performs well for\nseveral tasks in English has been largely simplified by the Massive Text\nEmbedding Benchmark (MTEB), but extensions to other languages remain\nchallenging. This is why we expand MTEB to propose the first massive benchmark\nof sentence embeddings for French. Not only we gather 22 existing datasets in\nan easy-to-use interface, but we also create three new French datasets for a\nglobal evaluation over 8 different tasks. We perform a large scale comparison\nwith 46 carefully selected embedding models, conduct comprehensive statistical\ntests, and analyze the correlation between model performance and many of their\ncharacteristics. We find out that even if no model is the best on all tasks,\nlarge multilingual models pre-trained on sentence similarity perform\nparticularly well. Our work comes with open-source code, new datasets and a\npublic leaderboard.\n","authors":["Mathieu Ciancone","Imene Kerboua","Marion Schaeffer","Wissam Siblini"],"pdf_url":"https://arxiv.org/pdf/2405.20468v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.20389v1","updated":"2024-05-30T18:00:21Z","published":"2024-05-30T18:00:21Z","title":"Designing an Evaluation Framework for Large Language Models in Astronomy\n  Research","summary":"  Large Language Models (LLMs) are shifting how scientific research is done. It\nis imperative to understand how researchers interact with these models and how\nscientific sub-communities like astronomy might benefit from them. However,\nthere is currently no standard for evaluating the use of LLMs in astronomy.\nTherefore, we present the experimental design for an evaluation study on how\nastronomy researchers interact with LLMs. We deploy a Slack chatbot that can\nanswer queries from users via Retrieval-Augmented Generation (RAG); these\nresponses are grounded in astronomy papers from arXiv. We record and anonymize\nuser questions and chatbot answers, user upvotes and downvotes to LLM\nresponses, user feedback to the LLM, and retrieved documents and similarity\nscores with the query. Our data collection method will enable future dynamic\nevaluations of LLM tools for astronomy.\n","authors":["John F. Wu","Alina Hyk","Kiera McCormick","Christine Ye","Simone Astarita","Elina Baral","Jo Ciuca","Jesse Cranney","Anjalie Field","Kartheik Iyer","Philipp Koehn","Jenn Kotler","Sandor Kruk","Michelle Ntampaka","Charles O'Neill","Joshua E. G. Peek","Sanjib Sharma","Mikaeel Yunus"],"pdf_url":"https://arxiv.org/pdf/2405.20389v1.pdf","comment":"7 pages, 3 figures. Code available at\n  https://github.com/jsalt2024-evaluating-llms-for-astronomy/astro-arxiv-bot"}],"Machine Learning":[{"id":"http://arxiv.org/abs/2405.20343v1","updated":"2024-05-30T17:59:54Z","published":"2024-05-30T17:59:54Z","title":"Unique3D: High-Quality and Efficient 3D Mesh Generation from a Single\n  Image","summary":"  In this work, we introduce Unique3D, a novel image-to-3D framework for\nefficiently generating high-quality 3D meshes from single-view images,\nfeaturing state-of-the-art generation fidelity and strong generalizability.\nPrevious methods based on Score Distillation Sampling (SDS) can produce\ndiversified 3D results by distilling 3D knowledge from large 2D diffusion\nmodels, but they usually suffer from long per-case optimization time with\ninconsistent issues. Recent works address the problem and generate better 3D\nresults either by finetuning a multi-view diffusion model or training a fast\nfeed-forward model. However, they still lack intricate textures and complex\ngeometries due to inconsistency and limited generated resolution. To\nsimultaneously achieve high fidelity, consistency, and efficiency in single\nimage-to-3D, we propose a novel framework Unique3D that includes a multi-view\ndiffusion model with a corresponding normal diffusion model to generate\nmulti-view images with their normal maps, a multi-level upscale process to\nprogressively improve the resolution of generated orthographic multi-views, as\nwell as an instant and consistent mesh reconstruction algorithm called ISOMER,\nwhich fully integrates the color and geometric priors into mesh results.\nExtensive experiments demonstrate that our Unique3D significantly outperforms\nother image-to-3D baselines in terms of geometric and textural details.\n","authors":["Kailu Wu","Fangfu Liu","Zhihan Cai","Runjie Yan","Hanyang Wang","Yating Hu","Yueqi Duan","Kaisheng Ma"],"pdf_url":"https://arxiv.org/pdf/2405.20343v1.pdf","comment":"Project page: https://wukailu.github.io/Unique3D"},{"id":"http://arxiv.org/abs/2405.20341v1","updated":"2024-05-30T17:59:51Z","published":"2024-05-30T17:59:51Z","title":"From Zero to Hero: Cold-Start Anomaly Detection","summary":"  When first deploying an anomaly detection system, e.g., to detect\nout-of-scope queries in chatbots, there are no observed data, making\ndata-driven approaches ineffective. Zero-shot anomaly detection methods offer a\nsolution to such \"cold-start\" cases, but unfortunately they are often not\naccurate enough. This paper studies the realistic but underexplored cold-start\nsetting where an anomaly detection model is initialized using zero-shot\nguidance, but subsequently receives a small number of contaminated observations\n(namely, that may include anomalies). The goal is to make efficient use of both\nthe zero-shot guidance and the observations. We propose ColdFusion, a method\nthat effectively adapts the zero-shot anomaly detector to contaminated\nobservations. To support future development of this new setting, we propose an\nevaluation suite consisting of evaluation protocols and metrics.\n","authors":["Tal Reiss","George Kour","Naama Zwerdling","Ateret Anaby-Tavor","Yedid Hoshen"],"pdf_url":"https://arxiv.org/pdf/2405.20341v1.pdf","comment":"ACL 2024. Our code is available at\n  https://github.com/talreiss/ColdFusion"},{"id":"http://arxiv.org/abs/2405.20331v1","updated":"2024-05-30T17:59:04Z","published":"2024-05-30T17:59:04Z","title":"CoSy: Evaluating Textual Explanations of Neurons","summary":"  A crucial aspect of understanding the complex nature of Deep Neural Networks\n(DNNs) is the ability to explain learned concepts within their latent\nrepresentations. While various methods exist to connect neurons to textual\ndescriptions of human-understandable concepts, evaluating the quality of these\nexplanation methods presents a major challenge in the field due to a lack of\nunified, general-purpose quantitative evaluation. In this work, we introduce\nCoSy (Concept Synthesis) -- a novel, architecture-agnostic framework to\nevaluate the quality of textual explanations for latent neurons. Given textual\nexplanations, our proposed framework leverages a generative model conditioned\non textual input to create data points representing the textual explanation.\nThen, the neuron's response to these explanation data points is compared with\nthe response to control data points, providing a quality estimate of the given\nexplanation. We ensure the reliability of our proposed framework in a series of\nmeta-evaluation experiments and demonstrate practical value through insights\nfrom benchmarking various concept-based textual explanation methods for\nComputer Vision tasks, showing that tested explanation methods significantly\ndiffer in quality.\n","authors":["Laura Kopf","Philine Lou Bommer","Anna Hedström","Sebastian Lapuschkin","Marina M. -C. Höhne","Kirill Bykov"],"pdf_url":"https://arxiv.org/pdf/2405.20331v1.pdf","comment":"10 pages, 5 figures"},{"id":"http://arxiv.org/abs/2405.20324v1","updated":"2024-05-30T17:57:26Z","published":"2024-05-30T17:57:26Z","title":"Don't drop your samples! Coherence-aware training benefits Conditional\n  diffusion","summary":"  Conditional diffusion models are powerful generative models that can leverage\nvarious types of conditional information, such as class labels, segmentation\nmasks, or text captions. However, in many real-world scenarios, conditional\ninformation may be noisy or unreliable due to human annotation errors or weak\nalignment. In this paper, we propose the Coherence-Aware Diffusion (CAD), a\nnovel method that integrates coherence in conditional information into\ndiffusion models, allowing them to learn from noisy annotations without\ndiscarding data. We assume that each data point has an associated coherence\nscore that reflects the quality of the conditional information. We then\ncondition the diffusion model on both the conditional information and the\ncoherence score. In this way, the model learns to ignore or discount the\nconditioning when the coherence is low. We show that CAD is theoretically sound\nand empirically effective on various conditional generation tasks. Moreover, we\nshow that leveraging coherence generates realistic and diverse samples that\nrespect conditional information better than models trained on cleaned datasets\nwhere samples with low coherence have been discarded.\n","authors":["Nicolas Dufour","Victor Besnier","Vicky Kalogeiton","David Picard"],"pdf_url":"https://arxiv.org/pdf/2405.20324v1.pdf","comment":"Accepted at CVPR 2024 as a Highlight. Project page:\n  https://nicolas-dufour.github.io/cad.html"},{"id":"http://arxiv.org/abs/2405.20321v1","updated":"2024-05-30T17:56:54Z","published":"2024-05-30T17:56:54Z","title":"Vision-based Manipulation from Single Human Video with Open-World Object\n  Graphs","summary":"  We present an object-centric approach to empower robots to learn vision-based\nmanipulation skills from human videos. We investigate the problem of imitating\nrobot manipulation from a single human video in the open-world setting, where a\nrobot must learn to manipulate novel objects from one video demonstration. We\nintroduce ORION, an algorithm that tackles the problem by extracting an\nobject-centric manipulation plan from a single RGB-D video and deriving a\npolicy that conditions on the extracted plan. Our method enables the robot to\nlearn from videos captured by daily mobile devices such as an iPad and\ngeneralize the policies to deployment environments with varying visual\nbackgrounds, camera angles, spatial layouts, and novel object instances. We\nsystematically evaluate our method on both short-horizon and long-horizon\ntasks, demonstrating the efficacy of ORION in learning from a single human\nvideo in the open world. Videos can be found in the project website\nhttps://ut-austin-rpl.github.io/ORION-release.\n","authors":["Yifeng Zhu","Arisrei Lim","Peter Stone","Yuke Zhu"],"pdf_url":"https://arxiv.org/pdf/2405.20321v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.20320v1","updated":"2024-05-30T17:56:04Z","published":"2024-05-30T17:56:04Z","title":"Improving the Training of Rectified Flows","summary":"  Diffusion models have shown great promise for image and video generation, but\nsampling from state-of-the-art models requires expensive numerical integration\nof a generative ODE. One approach for tackling this problem is rectified flows,\nwhich iteratively learn smooth ODE paths that are less susceptible to\ntruncation error. However, rectified flows still require a relatively large\nnumber of function evaluations (NFEs). In this work, we propose improved\ntechniques for training rectified flows, allowing them to compete with\nknowledge distillation methods even in the low NFE setting. Our main insight is\nthat under realistic settings, a single iteration of the Reflow algorithm for\ntraining rectified flows is sufficient to learn nearly straight trajectories;\nhence, the current practice of using multiple Reflow iterations is unnecessary.\nWe thus propose techniques to improve one-round training of rectified flows,\nincluding a U-shaped timestep distribution and LPIPS-Huber premetric. With\nthese techniques, we improve the FID of the previous 2-rectified flow by up to\n72% in the 1 NFE setting on CIFAR-10. On ImageNet 64$\\times$64, our improved\nrectified flow outperforms the state-of-the-art distillation methods such as\nconsistency distillation and progressive distillation in both one-step and\ntwo-step settings and rivals the performance of improved consistency training\n(iCT) in FID. Code is available at https://github.com/sangyun884/rfpp.\n","authors":["Sangyun Lee","Zinan Lin","Giulia Fanti"],"pdf_url":"https://arxiv.org/pdf/2405.20320v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.20318v1","updated":"2024-05-30T17:55:28Z","published":"2024-05-30T17:55:28Z","title":"CausalQuest: Collecting Natural Causal Questions for AI Agents","summary":"  Humans have an innate drive to seek out causality. Whether fuelled by\ncuriosity or specific goals, we constantly question why things happen, how they\nare interconnected, and many other related phenomena. To develop AI agents\ncapable of addressing this natural human quest for causality, we urgently need\na comprehensive dataset of natural causal questions. Unfortunately, existing\ndatasets either contain only artificially-crafted questions that do not reflect\nreal AI usage scenarios or have limited coverage of questions from specific\nsources. To address this gap, we present CausalQuest, a dataset of 13,500\nnaturally occurring questions sourced from social networks, search engines, and\nAI assistants. We formalize the definition of causal questions and establish a\ntaxonomy for finer-grained classification. Through a combined effort of human\nannotators and large language models (LLMs), we carefully label the dataset. We\nfind that 42% of the questions humans ask are indeed causal, with the majority\nseeking to understand the causes behind given effects. Using this dataset, we\ntrain efficient classifiers (up to 2.85B parameters) for the binary task of\nidentifying causal questions, achieving high performance with F1 scores of up\nto 0.877. We conclude with a rich set of future research directions that can\nbuild upon our data and models.\n","authors":["Roberto Ceraolo","Dmitrii Kharlapenko","Amélie Reymond","Rada Mihalcea","Mrinmaya Sachan","Bernhard Schölkopf","Zhijing Jin"],"pdf_url":"https://arxiv.org/pdf/2405.20318v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.09919v3","updated":"2024-05-30T17:55:19Z","published":"2024-03-14T23:40:56Z","title":"Recurrent Drafter for Fast Speculative Decoding in Large Language Models","summary":"  In this paper, we introduce an improved approach of speculative decoding\naimed at enhancing the efficiency of serving large language models. Our method\ncapitalizes on the strengths of two established techniques: the classic\ntwo-model speculative decoding approach, and the more recent single-model\napproach, Medusa. Drawing inspiration from Medusa, our approach adopts a\nsingle-model strategy for speculative decoding. However, our method\ndistinguishes itself by employing a single, lightweight draft head with a\nrecurrent dependency design, akin in essence to the small, draft model uses in\nclassic speculative decoding, but without the complexities of the full\ntransformer architecture. And because of the recurrent dependency, we can use\nbeam search to swiftly filter out undesired candidates with the draft head. The\noutcome is a method that combines the simplicity of single-model design and\navoids the need to create a data-dependent tree attention structure only for\ninference in Medusa. We empirically demonstrate the effectiveness of the\nproposed method on several popular open source language models, along with a\ncomprehensive analysis of the trade-offs involved in adopting this approach.\n","authors":["Aonan Zhang","Chong Wang","Yi Wang","Xuanyu Zhang","Yunfei Cheng"],"pdf_url":"https://arxiv.org/pdf/2403.09919v3.pdf","comment":"11 pages, 6 figures"},{"id":"http://arxiv.org/abs/2402.05928v2","updated":"2024-05-30T17:54:02Z","published":"2024-02-08T18:57:42Z","title":"Sharp Rates in Dependent Learning Theory: Avoiding Sample Size Deflation\n  for the Square Loss","summary":"  In this work, we study statistical learning with dependent ($\\beta$-mixing)\ndata and square loss in a hypothesis class $\\mathscr{F}\\subset L_{\\Psi_p}$\nwhere $\\Psi_p$ is the norm $\\|f\\|_{\\Psi_p} \\triangleq \\sup_{m\\geq 1} m^{-1/p}\n\\|f\\|_{L^m} $ for some $p\\in [2,\\infty]$. Our inquiry is motivated by the\nsearch for a sharp noise interaction term, or variance proxy, in learning with\ndependent data. Absent any realizability assumption, typical non-asymptotic\nresults exhibit variance proxies that are deflated multiplicatively by the\nmixing time of the underlying covariates process. We show that whenever the\ntopologies of $L^2$ and $\\Psi_p$ are comparable on our hypothesis class\n$\\mathscr{F}$ -- that is, $\\mathscr{F}$ is a weakly sub-Gaussian class:\n$\\|f\\|_{\\Psi_p} \\lesssim \\|f\\|_{L^2}^\\eta$ for some $\\eta\\in (0,1]$ -- the\nempirical risk minimizer achieves a rate that only depends on the complexity of\nthe class and second order statistics in its leading term. Our result holds\nwhether the problem is realizable or not and we refer to this as a \\emph{near\nmixing-free rate}, since direct dependence on mixing is relegated to an\nadditive higher order term. We arrive at our result by combining the above\nnotion of a weakly sub-Gaussian class with mixed tail generic chaining. This\ncombination allows us to compute sharp, instance-optimal rates for a wide range\nof problems. Examples that satisfy our framework include sub-Gaussian linear\nregression, more general smoothly parameterized function classes, finite\nhypothesis classes, and bounded smoothness classes.\n","authors":["Ingvar Ziemann","Stephen Tu","George J. Pappas","Nikolai Matni"],"pdf_url":"https://arxiv.org/pdf/2402.05928v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.20313v1","updated":"2024-05-30T17:53:50Z","published":"2024-05-30T17:53:50Z","title":"Sequence-Augmented SE(3)-Flow Matching For Conditional Protein Backbone\n  Generation","summary":"  Proteins are essential for almost all biological processes and derive their\ndiverse functions from complex 3D structures, which are in turn determined by\ntheir amino acid sequences. In this paper, we exploit the rich biological\ninductive bias of amino acid sequences and introduce FoldFlow-2, a novel\nsequence-conditioned SE(3)-equivariant flow matching model for protein\nstructure generation. FoldFlow-2 presents substantial new architectural\nfeatures over the previous FoldFlow family of models including a protein large\nlanguage model to encode sequence, a new multi-modal fusion trunk that combines\nstructure and sequence representations, and a geometric transformer based\ndecoder. To increase diversity and novelty of generated samples -- crucial for\nde-novo drug design -- we train FoldFlow-2 at scale on a new dataset that is an\norder of magnitude larger than PDB datasets of prior works, containing both\nknown proteins in PDB and high-quality synthetic structures achieved through\nfiltering. We further demonstrate the ability to align FoldFlow-2 to arbitrary\nrewards, e.g. increasing secondary structures diversity, by introducing a\nReinforced Finetuning (ReFT) objective. We empirically observe that FoldFlow-2\noutperforms previous state-of-the-art protein structure-based generative\nmodels, improving over RFDiffusion in terms of unconditional generation across\nall metrics including designability, diversity, and novelty across all protein\nlengths, as well as exhibiting generalization on the task of equilibrium\nconformation sampling. Finally, we demonstrate that a fine-tuned FoldFlow-2\nmakes progress on challenging conditional design tasks such as designing\nscaffolds for the VHH nanobody.\n","authors":["Guillaume Huguet","James Vuckovic","Kilian Fatras","Eric Thibodeau-Laufer","Pablo Lemos","Riashat Islam","Cheng-Hao Liu","Jarrid Rector-Brooks","Tara Akhound-Sadegh","Michael Bronstein","Alexander Tong","Avishek Joey Bose"],"pdf_url":"https://arxiv.org/pdf/2405.20313v1.pdf","comment":"preprint"},{"id":"http://arxiv.org/abs/2405.20309v1","updated":"2024-05-30T17:52:36Z","published":"2024-05-30T17:52:36Z","title":"Large Language Models Can Self-Improve At Web Agent Tasks","summary":"  Training models to act as agents that can effectively navigate and perform\nactions in a complex environment, such as a web browser, has typically been\nchallenging due to lack of training data. Large language models (LLMs) have\nrecently demonstrated some capability to navigate novel environments as agents\nin a zero-shot or few-shot fashion, purely guided by natural language\ninstructions as prompts. Recent research has also demonstrated LLMs have the\ncapability to exceed their base performance through self-improvement, i.e.\nfine-tuning on data generated by the model itself. In this work, we explore the\nextent to which LLMs can self-improve their performance as agents in\nlong-horizon tasks in a complex environment using the WebArena benchmark. In\nWebArena, an agent must autonomously navigate and perform actions on web pages\nto achieve a specified objective. We explore fine-tuning on three distinct\nsynthetic training data mixtures and achieve a 31\\% improvement in task\ncompletion rate over the base model on the WebArena benchmark through a\nself-improvement procedure. We additionally contribute novel evaluation metrics\nfor assessing the performance, robustness, capabilities, and quality of\ntrajectories of our fine-tuned agent models to a greater degree than simple,\naggregate-level benchmark scores currently used to measure self-improvement.\n","authors":["Ajay Patel","Markus Hofmarcher","Claudiu Leoveanu-Condrei","Marius-Constantin Dinu","Chris Callison-Burch","Sepp Hochreiter"],"pdf_url":"https://arxiv.org/pdf/2405.20309v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.20304v1","updated":"2024-05-30T17:50:04Z","published":"2024-05-30T17:50:04Z","title":"Group Robust Preference Optimization in Reward-free RLHF","summary":"  Adapting large language models (LLMs) for specific tasks usually involves\nfine-tuning through reinforcement learning with human feedback (RLHF) on\npreference data. While these data often come from diverse labelers' groups\n(e.g., different demographics, ethnicities, company teams, etc.), traditional\nRLHF approaches adopt a \"one-size-fits-all\" approach, i.e., they\nindiscriminately assume and optimize a single preference model, thus not being\nrobust to unique characteristics and needs of the various groups. To address\nthis limitation, we propose a novel Group Robust Preference Optimization (GRPO)\nmethod to align LLMs to individual groups' preferences robustly. Our approach\nbuilds upon reward-free direct preference optimization methods, but unlike\nprevious approaches, it seeks a robust policy which maximizes the worst-case\ngroup performance. To achieve this, GRPO adaptively and sequentially weights\nthe importance of different groups, prioritizing groups with worse cumulative\nloss. We theoretically study the feasibility of GRPO and analyze its\nconvergence for the log-linear policy class. By fine-tuning LLMs with GRPO\nusing diverse group-based global opinion data, we significantly improved\nperformance for the worst-performing groups, reduced loss imbalances across\ngroups, and improved probability accuracies compared to non-robust baselines.\n","authors":["Shyam Sundhar Ramesh","Yifan Hu","Iason Chaimalas","Viraj Mehta","Pier Giuseppe Sessa","Haitham Bou Ammar","Ilija Bogunovic"],"pdf_url":"https://arxiv.org/pdf/2405.20304v1.pdf","comment":"Preprint"},{"id":"http://arxiv.org/abs/2403.01643v2","updated":"2024-05-30T17:46:22Z","published":"2024-03-03T23:40:35Z","title":"You Need to Pay Better Attention: Rethinking the Mathematics of\n  Attention Mechanism","summary":"  Scaled Dot Product Attention (SDPA) is the backbone of many modern\ndeep-learning models. It is so versatile that it has been used in natural\nlanguage, vision, and multi-modal domains with very little change compared to\nits original formulation. This paper discusses why the current formulation is\ninefficient by delving into the mathematical details of the attention\nmechanism. We propose three improvements to mitigate these inefficiencies,\nthereby, introducing three enhanced attention mechanisms: Optimised, Efficient,\nand Super Attention. Optimised and Efficient Attention have one and two matrix\nmultiplications fewer per head, respectively, and 25% and 50% fewer parameters,\nrespectively, than standard SDPA, but perform similarly to standard SDPA in\nboth vision and natural language tasks. They can be used in all applications\nwhere SDPA is used while offering smaller model sizes and faster training and\ninference without noticeable loss in performance. Super Attention introduces a\nnew linear transformation on the values, transforming them from the left. It\noutperforms standard SPDA on vision and natural language tasks by up to 17%\nwhile having one fewer matrix multiplication per head and 25% fewer parameters\nthan standard SDPA. Consequently, it is also faster than standard SDPA. Super\nAttention is ideal in applications where the attention layer's context length\nis fixed, such as Vision Transformers. In addition to providing mathematical\nreasoning, we evaluate the presented attention mechanisms on several datasets\nincluding MNIST, CIFAR100, ImageNet, IMDB Movie Reviews, and Amazon Reviews\ndatasets, as well as combined Europarl and Anki English-Spanish datasets for\nneural machine translation.\n","authors":["Mehran Hosseini","Peyman Hosseini"],"pdf_url":"https://arxiv.org/pdf/2403.01643v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.20289v1","updated":"2024-05-30T17:40:11Z","published":"2024-05-30T17:40:11Z","title":"DITTO-2: Distilled Diffusion Inference-Time T-Optimization for Music\n  Generation","summary":"  Controllable music generation methods are critical for human-centered\nAI-based music creation, but are currently limited by speed, quality, and\ncontrol design trade-offs. Diffusion Inference-Time T-optimization (DITTO), in\nparticular, offers state-of-the-art results, but is over 10x slower than\nreal-time, limiting practical use. We propose Distilled Diffusion\nInference-Time T -Optimization (or DITTO-2), a new method to speed up\ninference-time optimization-based control and unlock faster-than-real-time\ngeneration for a wide-variety of applications such as music inpainting,\noutpainting, intensity, melody, and musical structure control. Our method works\nby (1) distilling a pre-trained diffusion model for fast sampling via an\nefficient, modified consistency or consistency trajectory distillation process\n(2) performing inference-time optimization using our distilled model with\none-step sampling as an efficient surrogate optimization task and (3) running a\nfinal multi-step sampling generation (decoding) using our estimated noise\nlatents for best-quality, fast, controllable generation. Through thorough\nevaluation, we find our method not only speeds up generation over 10-20x, but\nsimultaneously improves control adherence and generation quality all at once.\nFurthermore, we apply our approach to a new application of maximizing text\nadherence (CLAP score) and show we can convert an unconditional diffusion model\nwithout text inputs into a model that yields state-of-the-art text control.\nSound examples can be found at https://ditto-music.github.io/ditto2/.\n","authors":["Zachary Novack","Julian McAuley","Taylor Berg-Kirkpatrick","Nicholas Bryan"],"pdf_url":"https://arxiv.org/pdf/2405.20289v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.20287v1","updated":"2024-05-30T17:39:15Z","published":"2024-05-30T17:39:15Z","title":"Flexible SE(2) graph neural networks with applications to PDE surrogates","summary":"  This paper presents a novel approach for constructing graph neural networks\nequivariant to 2D rotations and translations and leveraging them as PDE\nsurrogates on non-gridded domains. We show that aligning the representations\nwith the principal axis allows us to sidestep many constraints while preserving\nSE(2) equivariance. By applying our model as a surrogate for fluid flow\nsimulations and conducting thorough benchmarks against non-equivariant models,\nwe demonstrate significant gains in terms of both data efficiency and accuracy.\n","authors":["Maria Bånkestad","Olof Mogren","Aleksis Pirinen"],"pdf_url":"https://arxiv.org/pdf/2405.20287v1.pdf","comment":"9 pages"},{"id":"http://arxiv.org/abs/2402.05140v2","updated":"2024-05-30T17:37:06Z","published":"2024-02-06T20:11:54Z","title":"Tag-LLM: Repurposing General-Purpose LLMs for Specialized Domains","summary":"  Large Language Models (LLMs) have demonstrated remarkable proficiency in\nunderstanding and generating natural language. However, their capabilities wane\nin highly specialized domains underrepresented in the pretraining corpus, such\nas physical and biomedical sciences. This work explores how to repurpose\ngeneral LLMs into effective task solvers for specialized domains. We introduce\na novel, model-agnostic framework for learning custom input tags, which are\nparameterized as continuous vectors appended to the LLM's embedding layer, to\ncondition the LLM. We design two types of input tags: domain tags are used to\ndelimit specialized representations (e.g., chemical formulas) and provide\ndomain-relevant context; function tags are used to represent specific functions\n(e.g., predicting molecular properties) and compress function-solving\ninstructions. We develop a three-stage protocol to learn these tags using\nauxiliary data and domain knowledge. By explicitly disentangling task domains\nfrom task functions, our method enables zero-shot generalization to unseen\nproblems through diverse combinations of the input tags. It also boosts LLM's\nperformance in various specialized domains, such as predicting protein or\nchemical properties and modeling drug-target interactions, outperforming expert\nmodels tailored to these tasks.\n","authors":["Junhong Shen","Neil Tenenholtz","James Brian Hall","David Alvarez-Melis","Nicolo Fusi"],"pdf_url":"https://arxiv.org/pdf/2402.05140v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.20278v1","updated":"2024-05-30T17:32:46Z","published":"2024-05-30T17:32:46Z","title":"Length independent generalization bounds for deep SSM architectures with\n  stability constraints","summary":"  Many state-of-the-art models trained on long-range sequences, for example S4,\nS5 or LRU, are made of sequential blocks combining State-Space Models (SSMs)\nwith neural networks. In this paper we provide a PAC bound that holds for these\nkind of architectures with stable SSM blocks and does not depend on the length\nof the input sequence. Imposing stability of the SSM blocks is a standard\npractice in the literature, and it is known to help performance. Our results\nprovide a theoretical justification for the use of stable SSM blocks as the\nproposed PAC bound decreases as the degree of stability of the SSM blocks\nincreases.\n","authors":["Dániel Rácz","Mihály Petreczky","Bálint Daróczy"],"pdf_url":"https://arxiv.org/pdf/2405.20278v1.pdf","comment":"25 pages, no figures, under submission"},{"id":"http://arxiv.org/abs/2210.16299v4","updated":"2024-05-30T17:31:41Z","published":"2022-10-28T17:52:18Z","title":"Nonuniqueness and Convergence to Equivalent Solutions in Observer-based\n  Inverse Reinforcement Learning","summary":"  A key challenge in solving the deterministic inverse reinforcement learning\n(IRL) problem online and in real-time is the existence of multiple solutions.\nNonuniqueness necessitates the study of the notion of equivalent solutions,\ni.e., solutions that result in a different cost functional but same feedback\nmatrix, and convergence to such solutions. While offline algorithms that result\nin convergence to equivalent solutions have been developed in the literature,\nonline, real-time techniques that address nonuniqueness are not available. In\nthis paper, a regularized history stack observer that converges to\napproximately equivalent solutions of the IRL problem is developed. Novel\ndata-richness conditions are developed to facilitate the analysis and\nsimulation results are provided to demonstrate the effectiveness of the\ndeveloped technique.\n","authors":["Jared Town","Zachary Morrison","Rushikesh Kamalapurkar"],"pdf_url":"https://arxiv.org/pdf/2210.16299v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.16646v3","updated":"2024-05-30T17:30:42Z","published":"2024-05-26T17:52:58Z","title":"A Provably Effective Method for Pruning Experts in Fine-tuned Sparse\n  Mixture-of-Experts","summary":"  The sparsely gated mixture of experts (MoE) architecture sends different\ninputs to different subnetworks, i.e., experts, through trainable routers. MoE\nreduces the training computation significantly for large models, but its\ndeployment can be still memory or computation expensive for some downstream\ntasks. Model pruning is a popular approach to reduce inference computation, but\nits application in MoE architecture is largely unexplored. To the best of our\nknowledge, this paper provides the first provably efficient technique for\npruning experts in finetuned MoE models. We theoretically prove that\nprioritizing the pruning of the experts with a smaller change of the routers l2\nnorm from the pretrained model guarantees the preservation of test accuracy,\nwhile significantly reducing the model size and the computational requirements.\nAlthough our theoretical analysis is centered on binary classification tasks on\nsimplified MoE architecture, our expert pruning method is verified on large\nvision MoE models such as VMoE and E3MoE finetuned on benchmark datasets such\nas CIFAR10, CIFAR100, and ImageNet.\n","authors":["Mohammed Nowaz Rabbani Chowdhury","Meng Wang","Kaoutar El Maghraoui","Naigang Wang","Pin-Yu Chen","Christopher Carothers"],"pdf_url":"https://arxiv.org/pdf/2405.16646v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.20274v1","updated":"2024-05-30T17:29:15Z","published":"2024-05-30T17:29:15Z","title":"ROAST: Review-level Opinion Aspect Sentiment Target Joint Detection","summary":"  Aspect-Based Sentiment Analysis (ABSA) has experienced tremendous expansion\nand diversity due to various shared tasks spanning several languages and fields\nand organized via SemEval workshops and Germeval. Nonetheless, a few\nshortcomings still need to be addressed, such as the lack of low-resource\nlanguage evaluations and the emphasis on sentence-level analysis. To thoroughly\nassess ABSA techniques in the context of complete reviews, this research\npresents a novel task, Review-Level Opinion Aspect Sentiment Target (ROAST).\nROAST seeks to close the gap between sentence-level and text-level ABSA by\nidentifying every ABSA constituent at the review level. We extend the available\ndatasets to enable ROAST, addressing the drawbacks noted in previous research\nby incorporating low-resource languages, numerous languages, and a variety of\ntopics. Through this effort, ABSA research will be able to cover more ground\nand get a deeper comprehension of the task and its practical application in a\nvariety of languages and domains (https://github.com/RiTUAL-UH/ROAST-ABSA).\n","authors":["Siva Uday Sampreeth Chebolu","Franck Dernoncourt","Nedim Lipka","Thamar Solorio"],"pdf_url":"https://arxiv.org/pdf/2405.20274v1.pdf","comment":"arXiv admin note: text overlap with arXiv:2309.13297"},{"id":"http://arxiv.org/abs/2405.20272v1","updated":"2024-05-30T17:27:44Z","published":"2024-05-30T17:27:44Z","title":"Reconstruction Attacks on Machine Unlearning: Simple Models are\n  Vulnerable","summary":"  Machine unlearning is motivated by desire for data autonomy: a person can\nrequest to have their data's influence removed from deployed models, and those\nmodels should be updated as if they were retrained without the person's data.\nWe show that, counter-intuitively, these updates expose individuals to\nhigh-accuracy reconstruction attacks which allow the attacker to recover their\ndata in its entirety, even when the original models are so simple that privacy\nrisk might not otherwise have been a concern. We show how to mount a\nnear-perfect attack on the deleted data point from linear regression models. We\nthen generalize our attack to other loss functions and architectures, and\nempirically demonstrate the effectiveness of our attacks across a wide range of\ndatasets (capturing both tabular and image data). Our work highlights that\nprivacy risk is significant even for extremely simple model classes when\nindividuals can request deletion of their data from the model.\n","authors":["Martin Bertran","Shuai Tang","Michael Kearns","Jamie Morgenstern","Aaron Roth","Zhiwei Steven Wu"],"pdf_url":"https://arxiv.org/pdf/2405.20272v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.20271v1","updated":"2024-05-30T17:26:02Z","published":"2024-05-30T17:26:02Z","title":"ETHER: Efficient Finetuning of Large-Scale Models with Hyperplane\n  Reflections","summary":"  Parameter-efficient finetuning (PEFT) has become ubiquitous to adapt\nfoundation models to downstream task requirements while retaining their\ngeneralization ability. However, the amount of additionally introduced\nparameters and compute for successful adaptation and hyperparameter searches\ncan explode quickly, especially when deployed at scale to serve numerous\nindividual requests. To ensure effective, parameter-efficient, and\nhyperparameter-robust adaptation, we propose the ETHER transformation family,\nwhich performs Efficient fineTuning via HypErplane Reflections. By design,\nETHER transformations require a minimal number of parameters, are less likely\nto deteriorate model performance, and exhibit robustness to hyperparameter and\nlearning rate choices. In particular, we introduce ETHER and its relaxation\nETHER+, which match or outperform existing PEFT methods with significantly\nfewer parameters ($\\sim$$10$-$100$ times lower than LoRA or OFT) across\nmultiple image synthesis and natural language tasks without exhaustive\nhyperparameter tuning. Finally, we investigate the recent emphasis on\nHyperspherical Energy retention for adaptation and raise questions on its\npractical utility. The code is available at https://github.com/mwbini/ether.\n","authors":["Massimo Bini","Karsten Roth","Zeynep Akata","Anna Khoreva"],"pdf_url":"https://arxiv.org/pdf/2405.20271v1.pdf","comment":"Accepted to ICML 2024. Code available at\n  https://github.com/mwbini/ether"},{"id":"http://arxiv.org/abs/2402.07240v3","updated":"2024-05-30T17:23:03Z","published":"2024-02-11T16:36:48Z","title":"Oja's Algorithm for Sparse PCA","summary":"  Oja's algorithm for streaming Principal Component Analysis (PCA) for $n$\ndatapoints in a $d$ dimensional space achieves the same sin-squared error\n$O(r_\\mathsf{eff}/n)$ as the offline algorithm in $O(d)$ space and $O(nd)$ time\nand a single pass through the datapoints. Here $r_\\mathsf{eff}$ is the\neffective rank (ratio of the trace and the principal eigenvalue of the\npopulation covariance matrix $\\Sigma$). Under this computational budget, we\nconsider the problem of sparse PCA, where the principal eigenvector of $\\Sigma$\nis $s$-sparse, and $r_\\mathsf{eff}$ can be large. In this setting, to our\nknowledge, \\textit{there are no known single-pass algorithms} that achieve the\nminimax error bound in $O(d)$ space and $O(nd)$ time without either requiring\nstrong initialization conditions or assuming further structure (e.g., spiked)\nof the covariance matrix. We show that a simple single-pass procedure that\nthresholds the output of Oja's algorithm (the Oja vector) can achieve the\nminimax error bound under some regularity conditions in $O(d)$ space and\n$O(nd)$ time as long as $r_\\mathsf{eff}=O(n/\\log n)$. We present a nontrivial\nand novel analysis of the entries of the unnormalized Oja vector, which\ninvolves the projection of a product of independent random matrices on a random\ninitial vector. This is completely different from previous analyses of Oja's\nalgorithm and matrix products, which have been done when the $r_\\mathsf{eff}$\nis bounded.\n","authors":["Syamantak Kumar","Purnamrita Sarkar"],"pdf_url":"https://arxiv.org/pdf/2402.07240v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.19327v2","updated":"2024-05-30T17:17:21Z","published":"2024-05-29T17:57:16Z","title":"MAP-Neo: Highly Capable and Transparent Bilingual Large Language Model\n  Series","summary":"  Large Language Models (LLMs) have made great strides in recent years to\nachieve unprecedented performance across different tasks. However, due to\ncommercial interest, the most competitive models like GPT, Gemini, and Claude\nhave been gated behind proprietary interfaces without disclosing the training\ndetails. Recently, many institutions have open-sourced several strong LLMs like\nLLaMA-3, comparable to existing closed-source LLMs. However, only the model's\nweights are provided with most details (e.g., intermediate checkpoints,\npre-training corpus, and training code, etc.) being undisclosed. To improve the\ntransparency of LLMs, the research community has formed to open-source truly\nopen LLMs (e.g., Pythia, Amber, OLMo), where more details (e.g., pre-training\ncorpus and training code) are being provided. These models have greatly\nadvanced the scientific study of these large models including their strengths,\nweaknesses, biases and risks. However, we observe that the existing truly open\nLLMs on reasoning, knowledge, and coding tasks are still inferior to existing\nstate-of-the-art LLMs with similar model sizes. To this end, we open-source\nMAP-Neo, a highly capable and transparent bilingual language model with 7B\nparameters trained from scratch on 4.5T high-quality tokens. Our MAP-Neo is the\nfirst fully open-sourced bilingual LLM with comparable performance compared to\nexisting state-of-the-art LLMs. Moreover, we open-source all details to\nreproduce our MAP-Neo, where the cleaned pre-training corpus, data cleaning\npipeline, checkpoints, and well-optimized training/evaluation framework are\nprovided. Finally, we hope our MAP-Neo will enhance and strengthen the open\nresearch community and inspire more innovations and creativities to facilitate\nthe further improvements of LLMs.\n","authors":["Ge Zhang","Scott Qu","Jiaheng Liu","Chenchen Zhang","Chenghua Lin","Chou Leuang Yu","Danny Pan","Esther Cheng","Jie Liu","Qunshu Lin","Raven Yuan","Tuney Zheng","Wei Pang","Xinrun Du","Yiming Liang","Yinghao Ma","Yizhi Li","Ziyang Ma","Bill Lin","Emmanouil Benetos","Huan Yang","Junting Zhou","Kaijing Ma","Minghao Liu","Morry Niu","Noah Wang","Quehry Que","Ruibo Liu","Sine Liu","Shawn Guo","Soren Gao","Wangchunshu Zhou","Xinyue Zhang","Yizhi Zhou","Yubo Wang","Yuelin Bai","Yuhan Zhang","Yuxiang Zhang","Zenith Wang","Zhenzhu Yang","Zijian Zhao","Jiajun Zhang","Wanli Ouyang","Wenhao Huang","Wenhu Chen"],"pdf_url":"https://arxiv.org/pdf/2405.19327v2.pdf","comment":"https://map-neo.github.io/"},{"id":"http://arxiv.org/abs/2405.17476v3","updated":"2024-05-30T17:15:09Z","published":"2024-05-24T04:56:39Z","title":"How to Leverage Diverse Demonstrations in Offline Imitation Learning","summary":"  Offline Imitation Learning (IL) with imperfect demonstrations has garnered\nincreasing attention owing to the scarcity of expert data in many real-world\ndomains. A fundamental problem in this scenario is how to extract positive\nbehaviors from noisy data. In general, current approaches to the problem select\ndata building on state-action similarity to given expert demonstrations,\nneglecting precious information in (potentially abundant) $\\textit{diverse}$\nstate-actions that deviate from expert ones. In this paper, we introduce a\nsimple yet effective data selection method that identifies positive behaviors\nbased on their resultant states -- a more informative criterion enabling\nexplicit utilization of dynamics information and effective extraction of both\nexpert and beneficial diverse behaviors. Further, we devise a lightweight\nbehavior cloning algorithm capable of leveraging the expert and selected data\ncorrectly. In the experiments, we evaluate our method on a suite of complex and\nhigh-dimensional offline IL benchmarks, including continuous-control and\nvision-based tasks. The results demonstrate that our method achieves\nstate-of-the-art performance, outperforming existing methods on\n$\\textbf{20/21}$ benchmarks, typically by $\\textbf{2-5x}$, while maintaining a\ncomparable runtime to Behavior Cloning ($\\texttt{BC}$).\n","authors":["Sheng Yue","Jiani Liu","Xingyuan Hua","Ju Ren","Sen Lin","Junshan Zhang","Yaoxue Zhang"],"pdf_url":"https://arxiv.org/pdf/2405.17476v3.pdf","comment":"International Conference on Machine Learning (ICML)"},{"id":"http://arxiv.org/abs/2402.01057v2","updated":"2024-05-30T17:14:43Z","published":"2024-02-01T23:06:19Z","title":"Expert Proximity as Surrogate Rewards for Single Demonstration Imitation\n  Learning","summary":"  In this paper, we focus on single-demonstration imitation learning (IL), a\npractical approach for real-world applications where acquiring multiple expert\ndemonstrations is costly or infeasible and the ground truth reward function is\nnot available. In contrast to typical IL settings with multiple demonstrations,\nsingle-demonstration IL involves an agent having access to only one expert\ntrajectory. We highlight the issue of sparse reward signals in this setting and\npropose to mitigate this issue through our proposed Transition\nDiscriminator-based IL (TDIL) method. TDIL is an IRL method designed to address\nreward sparsity by introducing a denser surrogate reward function that\nconsiders environmental dynamics. This surrogate reward function encourages the\nagent to navigate towards states that are proximal to expert states. In\npractice, TDIL trains a transition discriminator to differentiate between valid\nand non-valid transitions in a given environment to compute the surrogate\nrewards. The experiments demonstrate that TDIL outperforms existing IL\napproaches and achieves expert-level performance in the single-demonstration IL\nsetting across five widely adopted MuJoCo benchmarks as well as the \"Adroit\nDoor\" robotic environment.\n","authors":["Chia-Cheng Chiang","Li-Cheng Lan","Wei-Fang Sun","Chien Feng","Cho-Jui Hsieh","Chun-Yi Lee"],"pdf_url":"https://arxiv.org/pdf/2402.01057v2.pdf","comment":"Published at ICML 2024. Code: https://github.com/stanl1y/tdil"},{"id":"http://arxiv.org/abs/2310.13230v5","updated":"2024-05-30T17:13:04Z","published":"2023-10-20T02:40:05Z","title":"Absolute Policy Optimization","summary":"  In recent years, trust region on-policy reinforcement learning has achieved\nimpressive results in addressing complex control tasks and gaming scenarios.\nHowever, contemporary state-of-the-art algorithms within this category\nprimarily emphasize improvement in expected performance, lacking the ability to\ncontrol over the worst-case performance outcomes. To address this limitation,\nwe introduce a novel objective function, optimizing which leads to guaranteed\nmonotonic improvement in the lower probability bound of performance with high\nconfidence. Building upon this groundbreaking theoretical advancement, we\nfurther introduce a practical solution called Absolute Policy Optimization\n(APO). Our experiments demonstrate the effectiveness of our approach across\nchallenging continuous control benchmark tasks and extend its applicability to\nmastering Atari games. Our findings reveal that APO as well as its efficient\nvariation Proximal Absolute Policy Optimization (PAPO) significantly\noutperforms state-of-the-art policy gradient algorithms, resulting in\nsubstantial improvements in worst-case performance, as well as expected\nperformance.\n","authors":["Weiye Zhao","Feihan Li","Yifan Sun","Rui Chen","Tianhao Wei","Changliu Liu"],"pdf_url":"https://arxiv.org/pdf/2310.13230v5.pdf","comment":"published in ICML 2024"},{"id":"http://arxiv.org/abs/2405.17477v3","updated":"2024-05-30T17:11:46Z","published":"2024-05-24T04:57:25Z","title":"OLLIE: Imitation Learning from Offline Pretraining to Online Finetuning","summary":"  In this paper, we study offline-to-online Imitation Learning (IL) that\npretrains an imitation policy from static demonstration data, followed by fast\nfinetuning with minimal environmental interaction. We find the na\\\"ive\ncombination of existing offline IL and online IL methods tends to behave poorly\nin this context, because the initial discriminator (often used in online IL)\noperates randomly and discordantly against the policy initialization, leading\nto misguided policy optimization and $\\textit{unlearning}$ of pretraining\nknowledge. To overcome this challenge, we propose a principled\noffline-to-online IL method, named $\\texttt{OLLIE}$, that simultaneously learns\na near-expert policy initialization along with an $\\textit{aligned\ndiscriminator initialization}$, which can be seamlessly integrated into online\nIL, achieving smooth and fast finetuning. Empirically, $\\texttt{OLLIE}$\nconsistently and significantly outperforms the baseline methods in\n$\\textbf{20}$ challenging tasks, from continuous control to vision-based\ndomains, in terms of performance, demonstration efficiency, and convergence\nspeed. This work may serve as a foundation for further exploration of\npretraining and finetuning in the context of IL.\n","authors":["Sheng Yue","Xingyuan Hua","Ju Ren","Sen Lin","Junshan Zhang","Yaoxue Zhang"],"pdf_url":"https://arxiv.org/pdf/2405.17477v3.pdf","comment":"International Conference on Machine Learning (ICML)"},{"id":"http://arxiv.org/abs/2310.12815v2","updated":"2024-05-30T17:09:56Z","published":"2023-10-19T15:12:09Z","title":"Formalizing and Benchmarking Prompt Injection Attacks and Defenses","summary":"  A prompt injection attack aims to inject malicious instruction/data into the\ninput of an LLM-Integrated Application such that it produces results as an\nattacker desires. Existing works are limited to case studies. As a result, the\nliterature lacks a systematic understanding of prompt injection attacks and\ntheir defenses. We aim to bridge the gap in this work. In particular, we\npropose a framework to formalize prompt injection attacks. Existing attacks are\nspecial cases in our framework. Moreover, based on our framework, we design a\nnew attack by combining existing ones. Using our framework, we conduct a\nsystematic evaluation on 5 prompt injection attacks and 10 defenses with 10\nLLMs and 7 tasks. Our work provides a common benchmark for quantitatively\nevaluating future prompt injection attacks and defenses. To facilitate research\non this topic, we make our platform public at\nhttps://github.com/liu00222/Open-Prompt-Injection.\n","authors":["Yupei Liu","Yuqi Jia","Runpeng Geng","Jinyuan Jia","Neil Zhenqiang Gong"],"pdf_url":"https://arxiv.org/pdf/2310.12815v2.pdf","comment":"To appear in USENIX Security Symposium 2024"},{"id":"http://arxiv.org/abs/2405.20250v1","updated":"2024-05-30T17:02:18Z","published":"2024-05-30T17:02:18Z","title":"Entropy annealing for policy mirror descent in continuous time and space","summary":"  Entropy regularization has been extensively used in policy optimization\nalgorithms to regularize the optimization landscape and accelerate convergence;\nhowever, it comes at the cost of introducing an additional regularization bias.\nThis work quantifies the impact of entropy regularization on the convergence of\npolicy gradient methods for stochastic exit time control problems. We analyze a\ncontinuous-time policy mirror descent dynamics, which updates the policy based\non the gradient of an entropy-regularized value function and adjusts the\nstrength of entropy regularization as the algorithm progresses. We prove that\nwith a fixed entropy level, the dynamics converges exponentially to the optimal\nsolution of the regularized problem. We further show that when the entropy\nlevel decays at suitable polynomial rates, the annealed flow converges to the\nsolution of the unregularized problem at a rate of $\\mathcal O(1/S)$ for\ndiscrete action spaces and, under suitable conditions, at a rate of $\\mathcal\nO(1/\\sqrt{S})$ for general action spaces, with $S$ being the gradient flow\ntime. This paper explains how entropy regularization improves policy\noptimization, even with the true gradient, from the perspective of convergence\nrate.\n","authors":["Deven Sethi","David Šiška","Yufei Zhang"],"pdf_url":"https://arxiv.org/pdf/2405.20250v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2302.09904v3","updated":"2024-05-30T17:00:35Z","published":"2023-02-20T11:02:55Z","title":"WW-FL: Secure and Private Large-Scale Federated Learning","summary":"  Federated learning (FL) is an efficient approach for large-scale distributed\nmachine learning that promises data privacy by keeping training data on client\ndevices. However, recent research has uncovered vulnerabilities in FL,\nimpacting both security and privacy through poisoning attacks and the potential\ndisclosure of sensitive information in individual model updates as well as the\naggregated global model. This paper explores the inadequacies of existing FL\nprotection measures when applied independently, and the challenges of creating\neffective compositions.\n  Addressing these issues, we propose WW-FL, an innovative framework that\ncombines secure multi-party computation (MPC) with hierarchical FL to guarantee\ndata and global model privacy. One notable feature of WW-FL is its capability\nto prevent malicious clients from directly poisoning model parameters,\nconfining them to less destructive data poisoning attacks. We furthermore\nprovide a PyTorch-based FL implementation integrated with Meta's CrypTen MPC\nframework to systematically measure the performance and robustness of WW-FL.\nOur extensive evaluation demonstrates that WW-FL is a promising solution for\nsecure and private large-scale federated learning.\n","authors":["Felix Marx","Thomas Schneider","Ajith Suresh","Tobias Wehrle","Christian Weinert","Hossein Yalame"],"pdf_url":"https://arxiv.org/pdf/2302.09904v3.pdf","comment":"WWFL combines private training and inference with secure aggregation\n  and hierarchical FL to provide end-to-end protection and to facilitate\n  large-scale global deployment"},{"id":"http://arxiv.org/abs/2403.07723v2","updated":"2024-05-30T16:58:52Z","published":"2024-03-12T15:01:17Z","title":"On the Last-Iterate Convergence of Shuffling Gradient Methods","summary":"  Shuffling gradient methods are widely implemented in practice, particularly\nincluding three popular algorithms: Random Reshuffle (RR), Shuffle Once (SO),\nand Incremental Gradient (IG). Compared to the empirical success, the\ntheoretical guarantee of shuffling gradient methods was not well-understood for\na long time. Until recently, the convergence rates had just been established\nfor the average iterate for convex functions and the last iterate for strongly\nconvex problems (using squared distance as the metric). However, when using the\nfunction value gap as the convergence criterion, existing theories cannot\ninterpret the good performance of the last iterate in different settings (e.g.,\nconstrained optimization). To bridge this gap between practice and theory, we\nprove the first last-iterate convergence rates for shuffling gradient methods\nwith respect to the objective value even without strong convexity. Our new\nresults either (nearly) match the existing last-iterate lower bounds or are as\nfast as the previous best upper bounds for the average iterate.\n","authors":["Zijian Liu","Zhengyuan Zhou"],"pdf_url":"https://arxiv.org/pdf/2403.07723v2.pdf","comment":"ICML 2024"},{"id":"http://arxiv.org/abs/2405.20247v1","updated":"2024-05-30T16:58:34Z","published":"2024-05-30T16:58:34Z","title":"KerasCV and KerasNLP: Vision and Language Power-Ups","summary":"  We present the Keras domain packages KerasCV and KerasNLP, extensions of the\nKeras API for Computer Vision and Natural Language Processing workflows,\ncapable of running on either JAX, TensorFlow, or PyTorch. These domain packages\nare designed to enable fast experimentation, with a focus on ease-of-use and\nperformance. We adopt a modular, layered design: at the library's lowest level\nof abstraction, we provide building blocks for creating models and data\npreprocessing pipelines, and at the library's highest level of abstraction, we\nprovide pretrained ``task\" models for popular architectures such as Stable\nDiffusion, YOLOv8, GPT2, BERT, Mistral, CLIP, Gemma, T5, etc. Task models have\nbuilt-in preprocessing, pretrained weights, and can be fine-tuned on raw\ninputs. To enable efficient training, we support XLA compilation for all\nmodels, and run all preprocessing via a compiled graph of TensorFlow operations\nusing the tf.data API. The libraries are fully open-source (Apache 2.0 license)\nand available on GitHub.\n","authors":["Matthew Watson","Divyashree Shivakumar Sreepathihalli","Francois Chollet","Martin Gorner","Kiranbir Sodhia","Ramesh Sampath","Tirth Patel","Haifeng Jin","Neel Kovelamudi","Gabriel Rasskin","Samaneh Saadat","Luke Wood","Chen Qian","Jonathan Bischof","Ian Stenbit"],"pdf_url":"https://arxiv.org/pdf/2405.20247v1.pdf","comment":"Submitted to Journal of Machine Learning Open Source Software"},{"id":"http://arxiv.org/abs/2405.20245v1","updated":"2024-05-30T16:54:42Z","published":"2024-05-30T16:54:42Z","title":"Retrieval Augmented Structured Generation: Business Document Information\n  Extraction As Tool Use","summary":"  Business Document Information Extraction (BDIE) is the problem of\ntransforming a blob of unstructured information (raw text, scanned documents,\netc.) into a structured format that downstream systems can parse and use. It\nhas two main tasks: Key-Information Extraction (KIE) and Line Items Recognition\n(LIR). In this paper, we argue that BDIE is best modeled as a Tool Use problem,\nwhere the tools are these downstream systems. We then present Retrieval\nAugmented Structured Generation (RASG), a novel general framework for BDIE that\nachieves state of the art (SOTA) results on both KIE and LIR tasks on BDIE\nbenchmarks.\n  The contributions of this paper are threefold: (1) We show, with ablation\nbenchmarks, that Large Language Models (LLMs) with RASG are already competitive\nwith or surpasses current SOTA Large Multimodal Models (LMMs) without RASG on\nBDIE benchmarks. (2) We propose a new metric class for Line Items Recognition,\nGeneral Line Items Recognition Metric (GLIRM), that is more aligned with\npractical BDIE use cases compared to existing metrics, such as ANLS*, DocILE,\nand GriTS. (3) We provide a heuristic algorithm for backcalculating bounding\nboxes of predicted line items and tables without the need for vision encoders.\nFinally, we claim that, while LMMs might sometimes offer marginal performance\nbenefits, LLMs + RASG is oftentimes superior given real-world applications and\nconstraints of BDIE.\n","authors":["Franz Louis Cesista","Rui Aguiar","Jason Kim","Paolo Acilo"],"pdf_url":"https://arxiv.org/pdf/2405.20245v1.pdf","comment":"Accepted by IEEE 7th International Conference on Multimedia\n  Information Processing and Retrieval (MIPR), 2024"},{"id":"http://arxiv.org/abs/2405.20237v1","updated":"2024-05-30T16:40:28Z","published":"2024-05-30T16:40:28Z","title":"Training-efficient density quantum machine learning","summary":"  Quantum machine learning requires powerful, flexible and efficiently\ntrainable models to be successful in solving challenging problems. In this\nwork, we present density quantum neural networks, a learning model\nincorporating randomisation over a set of trainable unitaries. These models\ngeneralise quantum neural networks using parameterised quantum circuits, and\nallow a trade-off between expressibility and efficient trainability,\nparticularly on quantum hardware. We demonstrate the flexibility of the\nformalism by applying it to two recently proposed model families. The first are\ncommuting-block quantum neural networks (QNNs) which are efficiently trainable\nbut may be limited in expressibility. The second are orthogonal (Hamming-weight\npreserving) quantum neural networks which provide well-defined and\ninterpretable transformations on data but are challenging to train at scale on\nquantum devices. Density commuting QNNs improve capacity with minimal gradient\ncomplexity overhead, and density orthogonal neural networks admit a\nquadratic-to-constant gradient query advantage with minimal to no performance\nloss. We conduct numerical experiments on synthetic translationally invariant\ndata and MNIST image data with hyperparameter optimisation to support our\nfindings. Finally, we discuss the connection to post-variational quantum neural\nnetworks, measurement-based quantum machine learning and the dropout mechanism.\n","authors":["Brian Coyle","El Amine Cherrat","Nishant Jain","Natansh Mathur","Snehal Raj","Skander Kazdaghli","Iordanis Kerenidis"],"pdf_url":"https://arxiv.org/pdf/2405.20237v1.pdf","comment":"17 pages main text, 9 pages appendices. 9 figures"},{"id":"http://arxiv.org/abs/2405.20236v1","updated":"2024-05-30T16:40:07Z","published":"2024-05-30T16:40:07Z","title":"Disentangling and Mitigating the Impact of Task Similarity for Continual\n  Learning","summary":"  Continual learning of partially similar tasks poses a challenge for\nartificial neural networks, as task similarity presents both an opportunity for\nknowledge transfer and a risk of interference and catastrophic forgetting.\nHowever, it remains unclear how task similarity in input features and readout\npatterns influences knowledge transfer and forgetting, as well as how they\ninteract with common algorithms for continual learning. Here, we develop a\nlinear teacher-student model with latent structure and show analytically that\nhigh input feature similarity coupled with low readout similarity is\ncatastrophic for both knowledge transfer and retention. Conversely, the\nopposite scenario is relatively benign. Our analysis further reveals that\ntask-dependent activity gating improves knowledge retention at the expense of\ntransfer, while task-dependent plasticity gating does not affect either\nretention or transfer performance at the over-parameterized limit. In contrast,\nweight regularization based on the Fisher information metric significantly\nimproves retention, regardless of task similarity, without compromising\ntransfer performance. Nevertheless, its diagonal approximation and\nregularization in the Euclidean space are much less robust against task\nsimilarity. We demonstrate consistent results in a permuted MNIST task with\nlatent variables. Overall, this work provides insights into when continual\nlearning is difficult and how to mitigate it.\n","authors":["Naoki Hiratani"],"pdf_url":"https://arxiv.org/pdf/2405.20236v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.20233v1","updated":"2024-05-30T16:35:30Z","published":"2024-05-30T16:35:30Z","title":"Grokfast: Accelerated Grokking by Amplifying Slow Gradients","summary":"  One puzzling artifact in machine learning dubbed grokking is where delayed\ngeneralization is achieved tenfolds of iterations after near perfect\noverfitting to the training data. Focusing on the long delay itself on behalf\nof machine learning practitioners, our goal is to accelerate generalization of\na model under grokking phenomenon. By regarding a series of gradients of a\nparameter over training iterations as a random signal over time, we can\nspectrally decompose the parameter trajectories under gradient descent into two\ncomponents: the fast-varying, overfitting-yielding component and the\nslow-varying, generalization-inducing component. This analysis allows us to\naccelerate the grokking phenomenon more than $\\times 50$ with only a few lines\nof code that amplifies the slow-varying components of gradients. The\nexperiments show that our algorithm applies to diverse tasks involving images,\nlanguages, and graphs, enabling practical availability of this peculiar\nartifact of sudden generalization. Our code is available at\n\\url{https://github.com/ironjr/grokfast}.\n","authors":["Jaerin Lee","Bong Gyun Kang","Kihoon Kim","Kyoung Mu Lee"],"pdf_url":"https://arxiv.org/pdf/2405.20233v1.pdf","comment":"15 pages, 12 figures. Project page:\n  https://jaerinlee.com/research/grokfast"},{"id":"http://arxiv.org/abs/2405.20231v1","updated":"2024-05-30T16:32:31Z","published":"2024-05-30T16:32:31Z","title":"The Empirical Impact of Neural Parameter Symmetries, or Lack Thereof","summary":"  Many algorithms and observed phenomena in deep learning appear to be affected\nby parameter symmetries -- transformations of neural network parameters that do\nnot change the underlying neural network function. These include linear mode\nconnectivity, model merging, Bayesian neural network inference, metanetworks,\nand several other characteristics of optimization or loss-landscapes. However,\ntheoretical analysis of the relationship between parameter space symmetries and\nthese phenomena is difficult. In this work, we empirically investigate the\nimpact of neural parameter symmetries by introducing new neural network\narchitectures that have reduced parameter space symmetries. We develop two\nmethods, with some provable guarantees, of modifying standard neural networks\nto reduce parameter space symmetries. With these new methods, we conduct a\ncomprehensive experimental study consisting of multiple tasks aimed at\nassessing the effect of removing parameter symmetries. Our experiments reveal\nseveral interesting observations on the empirical impact of parameter\nsymmetries; for instance, we observe linear mode connectivity between our\nnetworks without alignment of weight spaces, and we find that our networks\nallow for faster and more effective Bayesian neural network training.\n","authors":["Derek Lim","Moe Putterman","Robin Walters","Haggai Maron","Stefanie Jegelka"],"pdf_url":"https://arxiv.org/pdf/2405.20231v1.pdf","comment":"27 pages. Preparing code for release"},{"id":"http://arxiv.org/abs/2402.14800v2","updated":"2024-05-30T16:24:16Z","published":"2024-02-22T18:56:07Z","title":"Not All Experts are Equal: Efficient Expert Pruning and Skipping for\n  Mixture-of-Experts Large Language Models","summary":"  A pivotal advancement in the progress of large language models (LLMs) is the\nemergence of the Mixture-of-Experts (MoE) LLMs. Compared to traditional LLMs,\nMoE LLMs can achieve higher performance with fewer parameters, but it is still\nhard to deploy them due to their immense parameter sizes. Different from\nprevious weight pruning methods that rely on specifically designed hardware,\nthis paper mainly aims to enhance the deployment efficiency of MoE LLMs by\nintroducing plug-and-play expert-level sparsification techniques. Specifically,\nwe propose, for the first time to our best knowledge, post-training approaches\nfor task-agnostic and task-specific expert pruning and skipping of MoE LLMs,\ntailored to improve deployment efficiency while maintaining model performance\nacross a wide range of tasks. Extensive experiments show that our proposed\nmethods can simultaneously reduce model sizes and increase the inference speed,\nwhile maintaining satisfactory performance. Data and code will be available at\nhttps://github.com/Lucky-Lance/Expert_Sparsity.\n","authors":["Xudong Lu","Qi Liu","Yuhui Xu","Aojun Zhou","Siyuan Huang","Bo Zhang","Junchi Yan","Hongsheng Li"],"pdf_url":"https://arxiv.org/pdf/2402.14800v2.pdf","comment":"Mixture-of-Experts Large Language Models, ACL2024"},{"id":"http://arxiv.org/abs/2403.19546v2","updated":"2024-05-30T16:20:04Z","published":"2024-03-28T16:27:26Z","title":"Croissant: A Metadata Format for ML-Ready Datasets","summary":"  Data is a critical resource for Machine Learning (ML), yet working with data\nremains a key friction point. This paper introduces Croissant, a metadata\nformat for datasets that simplifies how data is used by ML tools and\nframeworks. Croissant makes datasets more discoverable, portable and\ninteroperable, thereby addressing significant challenges in ML data management\nand responsible AI. Croissant is already supported by several popular dataset\nrepositories, spanning hundreds of thousands of datasets, ready to be loaded\ninto the most popular ML frameworks.\n","authors":["Mubashara Akhtar","Omar Benjelloun","Costanza Conforti","Pieter Gijsbers","Joan Giner-Miguelez","Nitisha Jain","Michael Kuchnik","Quentin Lhoest","Pierre Marcenac","Manil Maskey","Peter Mattson","Luis Oala","Pierre Ruyssen","Rajat Shinde","Elena Simperl","Goeffry Thomas","Slava Tykhonov","Joaquin Vanschoren","Jos van der Velde","Steffen Vogler","Carole-Jean Wu"],"pdf_url":"https://arxiv.org/pdf/2403.19546v2.pdf","comment":"Published in Proceedings of ACM SIGMOD/PODS'24 Data Management for\n  End-to-End Machine Learning (DEEM) Workshop\n  https://dl.acm.org/doi/10.1145/3650203.3663326"},{"id":"http://arxiv.org/abs/2405.20216v1","updated":"2024-05-30T16:18:05Z","published":"2024-05-30T16:18:05Z","title":"Boost Your Own Human Image Generation Model via Direct Preference\n  Optimization with AI Feedback","summary":"  The generation of high-quality human images through text-to-image (T2I)\nmethods is a significant yet challenging task. Distinct from general image\ngeneration, human image synthesis must satisfy stringent criteria related to\nhuman pose, anatomy, and alignment with textual prompts, making it particularly\ndifficult to achieve realistic results. Recent advancements in T2I generation\nbased on diffusion models have shown promise, yet challenges remain in meeting\nhuman-specific preferences. In this paper, we introduce a novel approach\ntailored specifically for human image generation utilizing Direct Preference\nOptimization (DPO). Specifically, we introduce an efficient method for\nconstructing a specialized DPO dataset for training human image generation\nmodels without the need for costly human feedback. We also propose a modified\nloss function that enhances the DPO training process by minimizing artifacts\nand improving image fidelity. Our method demonstrates its versatility and\neffectiveness in generating human images, including personalized text-to-image\ngeneration. Through comprehensive evaluations, we show that our approach\nsignificantly advances the state of human image generation, achieving superior\nresults in terms of natural anatomies, poses, and text-image alignment.\n","authors":["Sanghyeon Na","Yonggyu Kim","Hyunjoon Lee"],"pdf_url":"https://arxiv.org/pdf/2405.20216v1.pdf","comment":"28 pages, 18 figures"},{"id":"http://arxiv.org/abs/2405.20213v1","updated":"2024-05-30T16:16:25Z","published":"2024-05-30T16:16:25Z","title":"PostDoc: Generating Poster from a Long Multimodal Document Using Deep\n  Submodular Optimization","summary":"  A poster from a long input document can be considered as a one-page\neasy-to-read multimodal (text and images) summary presented on a nice template\nwith good design elements. Automatic transformation of a long document into a\nposter is a very less studied but challenging task. It involves content\nsummarization of the input document followed by template generation and\nharmonization. In this work, we propose a novel deep submodular function which\ncan be trained on ground truth summaries to extract multimodal content from the\ndocument and explicitly ensures good coverage, diversity and alignment of text\nand images. Then, we use an LLM based paraphraser and propose to generate a\ntemplate with various design aspects conditioned on the input content. We show\nthe merits of our approach through extensive automated and human evaluations.\n","authors":["Vijay Jaisankar","Sambaran Bandyopadhyay","Kalp Vyas","Varre Chaitanya","Shwetha Somasundaram"],"pdf_url":"https://arxiv.org/pdf/2405.20213v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.14918v2","updated":"2024-05-30T16:04:44Z","published":"2024-05-23T17:13:52Z","title":"AnalogCoder: Analog Circuit Design via Training-Free Code Generation","summary":"  Analog circuit design is a significant task in modern chip technology,\nfocusing on the selection of component types, connectivity, and parameters to\nensure proper circuit functionality. Despite advances made by Large Language\nModels (LLMs) in digital circuit design, the complexity and scarcity of data in\nanalog circuitry pose significant challenges. To mitigate these issues, we\nintroduce AnalogCoder, the first training-free LLM agent for designing analog\ncircuits through Python code generation. Firstly, AnalogCoder incorporates a\nfeedback-enhanced flow with tailored domain-specific prompts, enabling the\nautomated and self-correcting design of analog circuits with a high success\nrate. Secondly, it proposes a circuit tool library to archive successful\ndesigns as reusable modular sub-circuits, simplifying composite circuit\ncreation. Thirdly, extensive experiments on a benchmark designed to cover a\nwide range of analog circuit tasks show that AnalogCoder outperforms other\nLLM-based methods. It has successfully designed 20 circuits, 5 more than\nstandard GPT-4o. We believe AnalogCoder can significantly improve the\nlabor-intensive chip design process, enabling non-experts to design analog\ncircuits efficiently.\n","authors":["Yao Lai","Sungyoung Lee","Guojin Chen","Souradip Poddar","Mengkang Hu","David Z. Pan","Ping Luo"],"pdf_url":"https://arxiv.org/pdf/2405.14918v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.20200v1","updated":"2024-05-30T16:04:35Z","published":"2024-05-30T16:04:35Z","title":"Unified Explanations in Machine Learning Models: A Perturbation Approach","summary":"  A high-velocity paradigm shift towards Explainable Artificial Intelligence\n(XAI) has emerged in recent years. Highly complex Machine Learning (ML) models\nhave flourished in many tasks of intelligence, and the questions have started\nto shift away from traditional metrics of validity towards something deeper:\nWhat is this model telling me about my data, and how is it arriving at these\nconclusions? Inconsistencies between XAI and modeling techniques can have the\nundesirable effect of casting doubt upon the efficacy of these explainability\napproaches. To address these problems, we propose a systematic,\nperturbation-based analysis against a popular, model-agnostic method in XAI,\nSHapley Additive exPlanations (Shap). We devise algorithms to generate relative\nfeature importance in settings of dynamic inference amongst a suite of popular\nmachine learning and deep learning methods, and metrics that allow us to\nquantify how well explanations generated under the static case hold. We propose\na taxonomy for feature importance methodology, measure alignment, and observe\nquantifiable similarity amongst explanation models across several datasets.\n","authors":["Jacob Dineen","Don Kridel","Daniel Dolk","David Castillo"],"pdf_url":"https://arxiv.org/pdf/2405.20200v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.20194v1","updated":"2024-05-30T15:58:22Z","published":"2024-05-30T15:58:22Z","title":"Occam Gradient Descent","summary":"  Deep learning neural network models must be large enough to adapt to their\nproblem domain, while small enough to avoid overfitting training data during\ngradient descent. To balance these competing demands, overprovisioned deep\nlearning models such as transformers are trained for a single epoch on large\ndata sets, and hence inefficient with both computing resources and training\ndata. In response to these inefficiencies, we exploit learning theory to derive\nOccam Gradient Descent, an algorithm that interleaves adaptive reduction of\nmodel size to minimize generalization error, with gradient descent on model\nweights to minimize fitting error. In contrast, traditional gradient descent\ngreedily minimizes fitting error without regard to generalization error. Our\nalgorithm simultaneously descends the space of weights and topological size of\nany neural network without modification, and is effective in our experiments in\noutperforming traditional gradient descent with or without post-train pruning\nin accuracy, compute and model compression.\n","authors":["B. N. Kausik"],"pdf_url":"https://arxiv.org/pdf/2405.20194v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.20180v1","updated":"2024-05-30T15:48:04Z","published":"2024-05-30T15:48:04Z","title":"Transformers and Slot Encoding for Sample Efficient Physical World\n  Modelling","summary":"  World modelling, i.e. building a representation of the rules that govern the\nworld so as to predict its evolution, is an essential ability for any agent\ninteracting with the physical world. Recent applications of the Transformer\narchitecture to the problem of world modelling from video input show notable\nimprovements in sample efficiency. However, existing approaches tend to work\nonly at the image level thus disregarding that the environment is composed of\nobjects interacting with each other. In this paper, we propose an architecture\ncombining Transformers for world modelling with the slot-attention paradigm, an\napproach for learning representations of objects appearing in a scene. We\ndescribe the resulting neural architecture and report experimental results\nshowing an improvement over the existing solutions in terms of sample\nefficiency and a reduction of the variation of the performance over the\ntraining examples. The code for our architecture and experiments is available\nat https://github.com/torchipeppo/transformers-and-slot-encoding-for-wm\n","authors":["Francesco Petri","Luigi Asprino","Aldo Gangemi"],"pdf_url":"https://arxiv.org/pdf/2405.20180v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.20178v1","updated":"2024-05-30T15:47:48Z","published":"2024-05-30T15:47:48Z","title":"Non-intrusive data-driven model order reduction for circuits based on\n  Hammerstein architectures","summary":"  We demonstrate that data-driven system identification techniques can provide\na basis for effective, non-intrusive model order reduction (MOR) for common\ncircuits that are key building blocks in microelectronics. Our approach is\nmotivated by the practical operation of these circuits and utilizes a canonical\nHammerstein architecture. To demonstrate the approach we develop a parsimonious\nHammerstein model for a non-linear CMOS differential amplifier. We train this\nmodel on a combination of direct current (DC) and transient Spice (Xyce)\ncircuit simulation data using a novel sequential strategy to identify the\nstatic nonlinear and linear dynamical parts of the model. Simulation results\nshow that the Hammerstein model is an effective surrogate for the differential\namplifier circuit that accurately and efficiently reproduces its behavior over\na wide range of operating points and input frequencies.\n","authors":["Joshua Hanson","Biliana Paskaleva","Pavel Bochev"],"pdf_url":"https://arxiv.org/pdf/2405.20178v1.pdf","comment":"13 pages, 13 figures; submitted to IEEE Transactions on\n  Computer-Aided Design of Integrated Circuits and Systems"},{"id":"http://arxiv.org/abs/2405.20174v1","updated":"2024-05-30T15:45:03Z","published":"2024-05-30T15:45:03Z","title":"Tropical Expressivity of Neural Networks","summary":"  We propose an algebraic geometric framework to study the expressivity of\nlinear activation neural networks. A particular quantity that has been actively\nstudied in the field of deep learning is the number of linear regions, which\ngives an estimate of the information capacity of the architecture. To study and\nevaluate information capacity and expressivity, we work in the setting of\ntropical geometry -- a combinatorial and polyhedral variant of algebraic\ngeometry -- where there are known connections between tropical rational maps\nand feedforward neural networks. Our work builds on and expands this connection\nto capitalize on the rich theory of tropical geometry to characterize and study\nvarious architectural aspects of neural networks. Our contributions are\nthreefold: we provide a novel tropical geometric approach to selecting sampling\ndomains among linear regions; an algebraic result allowing for a guided\nrestriction of the sampling domain for network architectures with symmetries;\nand an open source library to analyze neural networks as tropical Puiseux\nrational maps. We provide a comprehensive set of proof-of-concept numerical\nexperiments demonstrating the breadth of neural network architectures to which\ntropical geometric theory can be applied to reveal insights on expressivity\ncharacteristics of a network. Our work provides the foundations for the\nadaptation of both theory and existing software from computational tropical\ngeometry and symbolic computation to deep learning.\n","authors":["Shiv Bhatia","Yueqi Cao","Paul Lezeau","Anthea Monod"],"pdf_url":"https://arxiv.org/pdf/2405.20174v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.15159v3","updated":"2024-05-30T15:44:51Z","published":"2024-02-23T07:43:26Z","title":"Machine Unlearning of Pre-trained Large Language Models","summary":"  This study investigates the concept of the `right to be forgotten' within the\ncontext of large language models (LLMs). We explore machine unlearning as a\npivotal solution, with a focus on pre-trained models--a notably\nunder-researched area. Our research delineates a comprehensive framework for\nmachine unlearning in pre-trained LLMs, encompassing a critical analysis of\nseven diverse unlearning methods. Through rigorous evaluation using curated\ndatasets from arXiv, books, and GitHub, we establish a robust benchmark for\nunlearning performance, demonstrating that these methods are over $10^5$ times\nmore computationally efficient than retraining. Our results show that\nintegrating gradient ascent with gradient descent on in-distribution data\nimproves hyperparameter robustness. We also provide detailed guidelines for\nefficient hyperparameter tuning in the unlearning process. Our findings advance\nthe discourse on ethical AI practices, offering substantive insights into the\nmechanics of machine unlearning for pre-trained LLMs and underscoring the\npotential for responsible AI development.\n","authors":["Jin Yao","Eli Chien","Minxin Du","Xinyao Niu","Tianhao Wang","Zezhou Cheng","Xiang Yue"],"pdf_url":"https://arxiv.org/pdf/2402.15159v3.pdf","comment":"ACL 2024 main. Code and data at\n  https://github.com/yaojin17/Unlearning_LLM"},{"id":"http://arxiv.org/abs/2405.20172v1","updated":"2024-05-30T15:44:27Z","published":"2024-05-30T15:44:27Z","title":"Iterative Feature Boosting for Explainable Speech Emotion Recognition","summary":"  In speech emotion recognition (SER), using predefined features without\nconsidering their practical importance may lead to high dimensional datasets,\nincluding redundant and irrelevant information. Consequently, high-dimensional\nlearning often results in decreasing model accuracy while increasing\ncomputational complexity. Our work underlines the importance of carefully\nconsidering and analyzing features in order to build efficient SER systems. We\npresent a new supervised SER method based on an efficient feature engineering\napproach. We pay particular attention to the explainability of results to\nevaluate feature relevance and refine feature sets. This is performed\niteratively through feature evaluation loop, using Shapley values to boost\nfeature selection and improve overall framework performance. Our approach\nallows thus to balance the benefits between model performance and transparency.\nThe proposed method outperforms human-level performance (HLP) and\nstate-of-the-art machine learning methods in emotion recognition on the TESS\ndataset.\n","authors":["Alaa Nfissi","Wassim Bouachir","Nizar Bouguila","Brian Mishara"],"pdf_url":"https://arxiv.org/pdf/2405.20172v1.pdf","comment":"Published in: 2023 International Conference on Machine Learning and\n  Applications (ICMLA)"},{"id":"http://arxiv.org/abs/2405.20165v1","updated":"2024-05-30T15:39:19Z","published":"2024-05-30T15:39:19Z","title":"Randomized Exploration for Reinforcement Learning with Multinomial\n  Logistic Function Approximation","summary":"  We study reinforcement learning with multinomial logistic (MNL) function\napproximation where the underlying transition probability kernel of the Markov\ndecision processes (MDPs) is parametrized by an unknown transition core with\nfeatures of state and action. For the finite horizon episodic setting with\ninhomogeneous state transitions, we propose provably efficient algorithms with\nrandomized exploration having frequentist regret guarantees. For our first\nalgorithm, $\\texttt{RRL-MNL}$, we adapt optimistic sampling to ensure the\noptimism of the estimated value function with sufficient frequency and\nestablish that $\\texttt{RRL-MNL}$ is both statistically and computationally\nefficient, achieving a $\\tilde{O}(\\kappa^{-1} d^{\\frac{3}{2}} H^{\\frac{3}{2}}\n\\sqrt{T})$ frequentist regret bound with constant-time computational cost per\nepisode. Here, $d$ is the dimension of the transition core, $H$ is the horizon\nlength, $T$ is the total number of steps, and $\\kappa$ is a problem-dependent\nconstant. Despite the simplicity and practicality of $\\texttt{RRL-MNL}$, its\nregret bound scales with $\\kappa^{-1}$, which is potentially large in the worst\ncase. To improve the dependence on $\\kappa^{-1}$, we propose\n$\\texttt{ORRL-MNL}$, which estimates the value function using local gradient\ninformation of the MNL transition model. We show that its frequentist regret\nbound is $\\tilde{O}(d^{\\frac{3}{2}} H^{\\frac{3}{2}} \\sqrt{T} + \\kappa^{-1} d^2\nH^2)$. To the best of our knowledge, these are the first randomized RL\nalgorithms for the MNL transition model that achieve both computational and\nstatistical efficiency. Numerical experiments demonstrate the superior\nperformance of the proposed algorithms.\n","authors":["Wooseong Cho","Taehyun Hwang","Joongkyu Lee","Min-hwan Oh"],"pdf_url":"https://arxiv.org/pdf/2405.20165v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.09983v2","updated":"2024-05-30T15:34:10Z","published":"2024-05-16T11:01:09Z","title":"Zero-Shot Hierarchical Classification on the Common Procurement\n  Vocabulary Taxonomy","summary":"  Classifying public tenders is a useful task for both companies that are\ninvited to participate and for inspecting fraudulent activities. To facilitate\nthe task for both participants and public administrations, the European Union\npresented a common taxonomy (Common Procurement Vocabulary, CPV) which is\nmandatory for tenders of certain importance; however, the contracts in which a\nCPV label is mandatory are the minority compared to all the Public\nAdministrations activities. Classifying over a real-world taxonomy introduces\nsome difficulties that can not be ignored. First of all, some fine-grained\nclasses have an insufficient (if any) number of observations in the training\nset, while other classes are far more frequent (even thousands of times) than\nthe average. To overcome those difficulties, we present a zero-shot approach,\nbased on a pre-trained language model that relies only on label description and\nrespects the label taxonomy. To train our proposed model, we used industrial\ndata, which comes from contrattipubblici.org, a service by SpazioDati s.r.l.\nthat collects public contracts stipulated in Italy in the last 25 years.\nResults show that the proposed model achieves better performance in classifying\nlow-frequent classes compared to three different baselines, and is also able to\npredict never-seen classes.\n","authors":["Federico Moiraghi","Matteo Palmonari","Davide Allavena","Federico Morando"],"pdf_url":"https://arxiv.org/pdf/2405.09983v2.pdf","comment":"Full-length version of the short paper accepted at COMPSAC 2024"},{"id":"http://arxiv.org/abs/2307.13885v5","updated":"2024-05-30T15:33:55Z","published":"2023-07-26T01:10:29Z","title":"Characterizing Data Point Vulnerability via Average-Case Robustness","summary":"  Studying the robustness of machine learning models is important to ensure\nconsistent model behaviour across real-world settings. To this end, adversarial\nrobustness is a standard framework, which views robustness of predictions\nthrough a binary lens: either a worst-case adversarial misclassification exists\nin the local region around an input, or it does not. However, this binary\nperspective does not account for the degrees of vulnerability, as data points\nwith a larger number of misclassified examples in their neighborhoods are more\nvulnerable. In this work, we consider a complementary framework for robustness,\ncalled average-case robustness, which measures the fraction of points in a\nlocal region that provides consistent predictions. However, computing this\nquantity is hard, as standard Monte Carlo approaches are inefficient especially\nfor high-dimensional inputs. In this work, we propose the first analytical\nestimators for average-case robustness for multi-class classifiers. We show\nempirically that our estimators are accurate and efficient for standard deep\nlearning models and demonstrate their usefulness for identifying vulnerable\ndata points, as well as quantifying robustness bias of models. Overall, our\ntools provide a complementary view to robustness, improving our ability to\ncharacterize model behaviour.\n","authors":["Tessa Han","Suraj Srinivas","Himabindu Lakkaraju"],"pdf_url":"https://arxiv.org/pdf/2307.13885v5.pdf","comment":"UAI 2024"},{"id":"http://arxiv.org/abs/2405.02235v2","updated":"2024-05-30T15:18:24Z","published":"2024-05-03T16:45:15Z","title":"Learning Optimal Deterministic Policies with Stochastic Policy Gradients","summary":"  Policy gradient (PG) methods are successful approaches to deal with\ncontinuous reinforcement learning (RL) problems. They learn stochastic\nparametric (hyper)policies by either exploring in the space of actions or in\nthe space of parameters. Stochastic controllers, however, are often undesirable\nfrom a practical perspective because of their lack of robustness, safety, and\ntraceability. In common practice, stochastic (hyper)policies are learned only\nto deploy their deterministic version. In this paper, we make a step towards\nthe theoretical understanding of this practice. After introducing a novel\nframework for modeling this scenario, we study the global convergence to the\nbest deterministic policy, under (weak) gradient domination assumptions. Then,\nwe illustrate how to tune the exploration level used for learning to optimize\nthe trade-off between the sample complexity and the performance of the deployed\ndeterministic policy. Finally, we quantitatively compare action-based and\nparameter-based exploration, giving a formal guise to intuitive results.\n","authors":["Alessandro Montenegro","Marco Mussi","Alberto Maria Metelli","Matteo Papini"],"pdf_url":"https://arxiv.org/pdf/2405.02235v2.pdf","comment":"Accepted to ICML 2024"},{"id":"http://arxiv.org/abs/2403.15112v3","updated":"2024-05-30T15:17:55Z","published":"2024-03-22T11:08:48Z","title":"Text clustering with LLM embeddings","summary":"  Text clustering is an important approach for organising the growing amount of\ndigital content, helping to structure and find hidden patterns in uncategorised\ndata. However, the effectiveness of text clustering heavily relies on the\nchoice of textual embeddings and clustering algorithms. We argue that recent\nadvances in large language models (LLMs) can potentially improve this task. In\nthis research, we investigated how different textual embeddings -- particularly\nthose used in LLMs -- and clustering algorithms affect how text datasets are\nclustered. A series of experiments were conducted to assess how embeddings\ninfluence clustering results, the role played by dimensionality reduction\nthrough summarisation, and model size adjustment. Findings reveal that LLM\nembeddings excel at capturing subtleties in structured language, while BERT\nleads the lightweight options in performance. In addition, we observe that\nincreasing model dimensionality and employing summarization techniques do not\nconsistently lead to improvements in clustering efficiency, suggesting that\nthese strategies require careful analysis to use in real-life models. These\nresults highlight a complex balance between the need for refined text\nrepresentation and computational feasibility in text clustering applications.\nThis study extends traditional text clustering frameworks by incorporating\nembeddings from LLMs, providing a path for improved methodologies, while\ninforming new avenues for future research in various types of textual analysis.\n","authors":["Alina Petukhova","João P. Matos-Carvalho","Nuno Fachada"],"pdf_url":"https://arxiv.org/pdf/2403.15112v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.20139v1","updated":"2024-05-30T15:14:24Z","published":"2024-05-30T15:14:24Z","title":"GNN-RAG: Graph Neural Retrieval for Large Language Model Reasoning","summary":"  Knowledge Graphs (KGs) represent human-crafted factual knowledge in the form\nof triplets (head, relation, tail), which collectively form a graph. Question\nAnswering over KGs (KGQA) is the task of answering natural questions grounding\nthe reasoning to the information provided by the KG. Large Language Models\n(LLMs) are the state-of-the-art models for QA tasks due to their remarkable\nability to understand natural language. On the other hand, Graph Neural\nNetworks (GNNs) have been widely used for KGQA as they can handle the complex\ngraph information stored in the KG. In this work, we introduce GNN-RAG, a novel\nmethod for combining language understanding abilities of LLMs with the\nreasoning abilities of GNNs in a retrieval-augmented generation (RAG) style.\nFirst, a GNN reasons over a dense KG subgraph to retrieve answer candidates for\na given question. Second, the shortest paths in the KG that connect question\nentities and answer candidates are extracted to represent KG reasoning paths.\nThe extracted paths are verbalized and given as input for LLM reasoning with\nRAG. In our GNN-RAG framework, the GNN acts as a dense subgraph reasoner to\nextract useful graph information, while the LLM leverages its natural language\nprocessing ability for ultimate KGQA. Furthermore, we develop a retrieval\naugmentation (RA) technique to further boost KGQA performance with GNN-RAG.\nExperimental results show that GNN-RAG achieves state-of-the-art performance in\ntwo widely used KGQA benchmarks (WebQSP and CWQ), outperforming or matching\nGPT-4 performance with a 7B tuned LLM. In addition, GNN-RAG excels on multi-hop\nand multi-entity questions outperforming competing approaches by 8.9--15.5%\npoints at answer F1.\n","authors":["Costas Mavromatis","George Karypis"],"pdf_url":"https://arxiv.org/pdf/2405.20139v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2205.15403v4","updated":"2024-05-30T15:11:45Z","published":"2022-05-30T20:00:19Z","title":"Neural Optimal Transport with General Cost Functionals","summary":"  We introduce a novel neural network-based algorithm to compute optimal\ntransport (OT) plans for general cost functionals. In contrast to common\nEuclidean costs, i.e., $\\ell^1$ or $\\ell^2$, such functionals provide more\nflexibility and allow using auxiliary information, such as class labels, to\nconstruct the required transport map. Existing methods for general costs are\ndiscrete and have limitations in practice, i.e. they do not provide an\nout-of-sample estimation. We address the challenge of designing a continuous OT\napproach for general costs that generalizes to new data points in\nhigh-dimensional spaces, such as images. Additionally, we provide the\ntheoretical error analysis for our recovered transport plans. As an\napplication, we construct a cost functional to map data distributions while\npreserving the class-wise structure.\n","authors":["Arip Asadulaev","Alexander Korotin","Vage Egiazarian","Petr Mokrov","Evgeny Burnaev"],"pdf_url":"https://arxiv.org/pdf/2205.15403v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.20127v1","updated":"2024-05-30T15:07:30Z","published":"2024-05-30T15:07:30Z","title":"SPAM: Stochastic Proximal Point Method with Momentum Variance Reduction\n  for Non-convex Cross-Device Federated Learning","summary":"  Cross-device training is a crucial subfield of federated learning, where the\nnumber of clients can reach into the billions. Standard approaches and local\nmethods are prone to issues such as client drift and insensitivity to data\nsimilarities. We propose a novel algorithm (SPAM) for cross-device federated\nlearning with non-convex losses, which solves both issues. We provide sharp\nanalysis under second-order (Hessian) similarity, a condition satisfied by a\nvariety of machine learning problems in practice. Additionally, we extend our\nresults to the partial participation setting, where a cohort of selected\nclients communicate with the server at each communication round. Our method is\nthe first in its kind, that does not require the smoothness of the objective\nand provably benefits from clients having similar data.\n","authors":["Avetik Karagulyan","Egor Shulgin","Abdurakhmon Sadiev","Peter Richtárik"],"pdf_url":"https://arxiv.org/pdf/2405.20127v1.pdf","comment":"The main part of the paper is around 9 pages. It contains the\n  proposed algorithms, the main theoretical results and the experimental\n  setting. The proofs of the main results and other technicalities are deferred\n  to the Appendix"},{"id":"http://arxiv.org/abs/2403.07262v2","updated":"2024-05-30T15:04:42Z","published":"2024-03-12T02:43:41Z","title":"A2PO: Towards Effective Offline Reinforcement Learning from an\n  Advantage-aware Perspective","summary":"  Offline reinforcement learning endeavors to leverage offline datasets to\ncraft effective agent policy without online interaction, which imposes proper\nconservative constraints with the support of behavior policies to tackle the\nout-of-distribution problem. However, existing works often suffer from the\nconstraint conflict issue when offline datasets are collected from multiple\nbehavior policies, i.e., different behavior policies may exhibit inconsistent\nactions with distinct returns across the state space. To remedy this issue,\nrecent advantage-weighted methods prioritize samples with high advantage values\nfor agent training while inevitably ignoring the diversity of behavior policy.\nIn this paper, we introduce a novel Advantage-Aware Policy Optimization (A2PO)\nmethod to explicitly construct advantage-aware policy constraints for offline\nlearning under mixed-quality datasets. Specifically, A2PO employs a conditional\nvariational auto-encoder to disentangle the action distributions of intertwined\nbehavior policies by modeling the advantage values of all training data as\nconditional variables. Then the agent can follow such disentangled action\ndistribution constraints to optimize the advantage-aware policy towards high\nadvantage values. Extensive experiments conducted on both the single-quality\nand mixed-quality datasets of the D4RL benchmark demonstrate that A2PO yields\nresults superior to the counterparts. Our code will be made publicly available.\n","authors":["Yunpeng Qing","Shunyu liu","Jingyuan Cong","Kaixuan Chen","Yihe Zhou","Mingli Song"],"pdf_url":"https://arxiv.org/pdf/2403.07262v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2205.12880v2","updated":"2024-05-30T15:04:27Z","published":"2022-05-25T15:58:34Z","title":"Trust-based Consensus in Multi-Agent Reinforcement Learning Systems","summary":"  An often neglected issue in multi-agent reinforcement learning (MARL) is the\npotential presence of unreliable agents in the environment whose deviations\nfrom expected behavior can prevent a system from accomplishing its intended\ntasks. In particular, consensus is a fundamental underpinning problem of\ncooperative distributed multi-agent systems. Consensus requires different\nagents, situated in a decentralized communication network, to reach an\nagreement out of a set of initial proposals that they put forward.\nLearning-based agents should adopt a protocol that allows them to reach\nconsensus despite having one or more unreliable agents in the system. This\npaper investigates the problem of unreliable agents in MARL, considering\nconsensus as a case study. Echoing established results in the distributed\nsystems literature, our experiments show that even a moderate fraction of such\nagents can greatly impact the ability of reaching consensus in a networked\nenvironment. We propose Reinforcement Learning-based Trusted Consensus (RLTC),\na decentralized trust mechanism, in which agents can independently decide which\nneighbors to communicate with. We empirically demonstrate that our trust\nmechanism is able to handle unreliable agents effectively, as evidenced by\nhigher consensus success rates.\n","authors":["Ho Long Fung","Victor-Alexandru Darvariu","Stephen Hailes","Mirco Musolesi"],"pdf_url":"https://arxiv.org/pdf/2205.12880v2.pdf","comment":"Accepted for publication in proceedings of the first Reinforcement\n  Learning Conference (RLC 2024)"},{"id":"http://arxiv.org/abs/2405.14852v2","updated":"2024-05-30T15:01:49Z","published":"2024-05-23T17:57:04Z","title":"PV-Tuning: Beyond Straight-Through Estimation for Extreme LLM\n  Compression","summary":"  There has been significant interest in \"extreme\" compression of large\nlanguage models (LLMs), i.e., to 1-2 bits per parameter, which allows such\nmodels to be executed efficiently on resource-constrained devices. Existing\nwork focused on improved one-shot quantization techniques and weight\nrepresentations; yet, purely post-training approaches are reaching diminishing\nreturns in terms of the accuracy-vs-bit-width trade-off. State-of-the-art\nquantization methods such as QuIP# and AQLM include fine-tuning (part of) the\ncompressed parameters over a limited amount of calibration data; however, such\nfine-tuning techniques over compressed weights often make exclusive use of\nstraight-through estimators (STE), whose performance is not well-understood in\nthis setting. In this work, we question the use of STE for extreme LLM\ncompression, showing that it can be sub-optimal, and perform a systematic study\nof quantization-aware fine-tuning strategies for LLMs. We propose PV-Tuning - a\nrepresentation-agnostic framework that generalizes and improves upon existing\nfine-tuning strategies, and provides convergence guarantees in restricted\ncases. On the practical side, when used for 1-2 bit vector quantization,\nPV-Tuning outperforms prior techniques for highly-performant models such as\nLlama and Mistral. Using PV-Tuning, we achieve the first Pareto-optimal\nquantization for Llama 2 family models at 2 bits per parameter.\n","authors":["Vladimir Malinovskii","Denis Mazur","Ivan Ilin","Denis Kuznedelev","Konstantin Burlachenko","Kai Yi","Dan Alistarh","Peter Richtarik"],"pdf_url":"https://arxiv.org/pdf/2405.14852v2.pdf","comment":"Preprint"},{"id":"http://arxiv.org/abs/2405.20124v1","updated":"2024-05-30T15:01:18Z","published":"2024-05-30T15:01:18Z","title":"A Geometric Unification of Distributionally Robust Covariance\n  Estimators: Shrinking the Spectrum by Inflating the Ambiguity Set","summary":"  The state-of-the-art methods for estimating high-dimensional covariance\nmatrices all shrink the eigenvalues of the sample covariance matrix towards a\ndata-insensitive shrinkage target. The underlying shrinkage transformation is\neither chosen heuristically - without compelling theoretical justification - or\noptimally in view of restrictive distributional assumptions. In this paper, we\npropose a principled approach to construct covariance estimators without\nimposing restrictive assumptions. That is, we study distributionally robust\ncovariance estimation problems that minimize the worst-case Frobenius error\nwith respect to all data distributions close to a nominal distribution, where\nthe proximity of distributions is measured via a divergence on the space of\ncovariance matrices. We identify mild conditions on this divergence under which\nthe resulting minimizers represent shrinkage estimators. We show that the\ncorresponding shrinkage transformations are intimately related to the\ngeometrical properties of the underlying divergence. We also prove that our\nrobust estimators are efficiently computable and asymptotically consistent and\nthat they enjoy finite-sample performance guarantees. We exemplify our general\nmethodology by synthesizing explicit estimators induced by the\nKullback-Leibler, Fisher-Rao, and Wasserstein divergences. Numerical\nexperiments based on synthetic and real data show that our robust estimators\nare competitive with state-of-the-art estimators.\n","authors":["Man-Chung Yue","Yves Rychener","Daniel Kuhn","Viet Anh Nguyen"],"pdf_url":"https://arxiv.org/pdf/2405.20124v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.20114v1","updated":"2024-05-30T14:51:57Z","published":"2024-05-30T14:51:57Z","title":"Near Optimal Decentralized Optimization with Compression and Momentum\n  Tracking","summary":"  Communication efficiency has garnered significant attention as it is\nconsidered the main bottleneck for large-scale decentralized Machine Learning\napplications in distributed and federated settings. In this regime, clients are\nrestricted to transmitting small amounts of quantized information to their\nneighbors over a communication graph. Numerous endeavors have been made to\naddress this challenging problem by developing algorithms with compressed\ncommunication for decentralized non-convex optimization problems. Despite\nconsiderable efforts, the current results suffer from various issues such as\nnon-scalability with the number of clients, requirements for large batches, or\nbounded gradient assumption. In this paper, we introduce MoTEF, a novel\napproach that integrates communication compression with Momentum Tracking and\nError Feedback. Our analysis demonstrates that MoTEF achieves most of the\ndesired properties, and significantly outperforms existing methods under\narbitrary data heterogeneity. We provide numerical experiments to validate our\ntheoretical findings and confirm the practical superiority of MoTEF.\n","authors":["Rustem Islamov","Yuan Gao","Sebastian U. Stich"],"pdf_url":"https://arxiv.org/pdf/2405.20114v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.01567v2","updated":"2024-05-30T14:49:45Z","published":"2024-02-02T17:00:17Z","title":"Understanding Adam Optimizer via Online Learning of Updates: Adam is\n  FTRL in Disguise","summary":"  Despite the success of the Adam optimizer in practice, the theoretical\nunderstanding of its algorithmic components still remains limited. In\nparticular, most existing analyses of Adam show the convergence rate that can\nbe simply achieved by non-adative algorithms like SGD. In this work, we provide\na different perspective based on online learning that underscores the\nimportance of Adam's algorithmic components. Inspired by Cutkosky et al.\n(2023), we consider the framework called online learning of updates/increments,\nwhere we choose the updates/increments of an optimizer based on an online\nlearner. With this framework, the design of a good optimizer is reduced to the\ndesign of a good online learner. Our main observation is that Adam corresponds\nto a principled online learning framework called Follow-the-Regularized-Leader\n(FTRL). Building on this observation, we study the benefits of its algorithmic\ncomponents from the online learning perspective.\n","authors":["Kwangjun Ahn","Zhiyu Zhang","Yunbum Kook","Yan Dai"],"pdf_url":"https://arxiv.org/pdf/2402.01567v2.pdf","comment":"Accepted at ICML 2024"},{"id":"http://arxiv.org/abs/2310.12942v5","updated":"2024-05-30T14:49:25Z","published":"2023-10-19T17:39:47Z","title":"On the Representational Capacity of Recurrent Neural Language Models","summary":"  This work investigates the computational expressivity of language models\n(LMs) based on recurrent neural networks (RNNs). Siegelmann and Sontag (1992)\nfamously showed that RNNs with rational weights and hidden states and unbounded\ncomputation time are Turing complete. However, LMs define weightings over\nstrings in addition to just (unweighted) language membership and the analysis\nof the computational power of RNN LMs (RLMs) should reflect this. We extend the\nTuring completeness result to the probabilistic case, showing how a rationally\nweighted RLM with unbounded computation time can simulate any deterministic\nprobabilistic Turing machine (PTM) with rationally weighted transitions. Since,\nin practice, RLMs work in real-time, processing a symbol at every time step, we\ntreat the above result as an upper bound on the expressivity of RLMs. We also\nprovide a lower bound by showing that under the restriction to real-time\ncomputation, such models can simulate deterministic real-time rational PTMs.\n","authors":["Franz Nowak","Anej Svete","Li Du","Ryan Cotterell"],"pdf_url":"https://arxiv.org/pdf/2310.12942v5.pdf","comment":"Added requirement for non-negative probabilities to definitions 2.3\n  and 3.1, fixed typos"},{"id":"http://arxiv.org/abs/2312.07252v2","updated":"2024-05-30T14:48:06Z","published":"2023-12-12T13:28:53Z","title":"Identifying Drivers of Predictive Aleatoric Uncertainty","summary":"  Explainability and uncertainty quantification are two pillars of trustable\nartificial intelligence. However, the reasoning behind uncertainty estimates is\ngenerally left unexplained. Identifying the drivers of uncertainty complements\nexplanations of point predictions in recognizing model limitations and enhances\ntrust in decisions and their communication. So far, explanations of\nuncertainties have been rarely studied. The few exceptions rely on Bayesian\nneural networks or technically intricate approaches, such as auxiliary\ngenerative models, thereby hindering their broad adoption. We present a simple\napproach to explain predictive aleatoric uncertainties. We estimate uncertainty\nas predictive variance by adapting a neural network with a Gaussian output\ndistribution. Subsequently, we apply out-of-the-box explainers to the model's\nvariance output. This approach can explain uncertainty influences more reliably\nthan literature baselines, which we evaluate in a synthetic setting with a\nknown data-generating process. We further adapt multiple metrics from\nconventional XAI research to uncertainty explanations. We quantify our findings\nwith a nuanced benchmark analysis that includes real-world datasets. Finally,\nwe apply our approach to an age regression model and discover reasonable\nsources of uncertainty. Overall, we explain uncertainty estimates with little\nmodifications to the model architecture and demonstrate that our approach\ncompetes effectively with more intricate methods.\n","authors":["Pascal Iversen","Simon Witzke","Katharina Baum","Bernhard Y. Renard"],"pdf_url":"https://arxiv.org/pdf/2312.07252v2.pdf","comment":"Simon Witzke and Pascal Iversen contributed equally"},{"id":"http://arxiv.org/abs/2405.20094v1","updated":"2024-05-30T14:32:06Z","published":"2024-05-30T14:32:06Z","title":"Low-dimensional approximations of the conditional law of Volterra\n  processes: a non-positive curvature approach","summary":"  Predicting the conditional evolution of Volterra processes with stochastic\nvolatility is a crucial challenge in mathematical finance. While deep neural\nnetwork models offer promise in approximating the conditional law of such\nprocesses, their effectiveness is hindered by the curse of dimensionality\ncaused by the infinite dimensionality and non-smooth nature of these problems.\nTo address this, we propose a two-step solution. Firstly, we develop a stable\ndimension reduction technique, projecting the law of a reasonably broad class\nof Volterra process onto a low-dimensional statistical manifold of non-positive\nsectional curvature. Next, we introduce a sequentially deep learning model\ntailored to the manifold's geometry, which we show can approximate the\nprojected conditional law of the Volterra process. Our model leverages an\nauxiliary hypernetwork to dynamically update its internal parameters, allowing\nit to encode non-stationary dynamics of the Volterra process, and it can be\ninterpreted as a gating mechanism in a mixture of expert models where each\nexpert is specialized at a specific point in time. Our hypernetwork further\nallows us to achieve approximation rates that would seemingly only be possible\nwith very large networks.\n","authors":["Reza Arabpour","John Armstrong","Luca Galimberti","Anastasis Kratsios","Giulia Livieri"],"pdf_url":"https://arxiv.org/pdf/2405.20094v1.pdf","comment":"Main body: 25 Pages, Appendices 29 Pages, 14 Tables, 6 Figures"},{"id":"http://arxiv.org/abs/2405.20091v1","updated":"2024-05-30T14:27:40Z","published":"2024-05-30T14:27:40Z","title":"Visual Attention Analysis in Online Learning","summary":"  In this paper, we present an approach in the Multimodal Learning Analytics\nfield. Within this approach, we have developed a tool to visualize and analyze\neye movement data collected during learning sessions in online courses. The\ntool is named VAAD (an acronym for Visual Attention Analysis Dashboard). These\neye movement data have been gathered using an eye-tracker and subsequently\nprocessed and visualized for interpretation. The purpose of the tool is to\nconduct a descriptive analysis of the data by facilitating its visualization,\nenabling the identification of differences and learning patterns among various\nlearner populations. Additionally, it integrates a predictive module capable of\nanticipating learner activities during a learning session. Consequently, VAAD\nholds the potential to offer valuable insights into online learning behaviors\nfrom both descriptive and predictive perspectives.\n","authors":["Navarro Miriam","Becerra Álvaro","Daza Roberto","Cobos Ruth","Morales Aythami","Fierrez Julian"],"pdf_url":"https://arxiv.org/pdf/2405.20091v1.pdf","comment":"Accepted in CEDI 2024 (VII Congreso Espa\\~nol de Inform\\'atica), A\n  Coru\\~na, Spain"},{"id":"http://arxiv.org/abs/2405.20086v1","updated":"2024-05-30T14:16:32Z","published":"2024-05-30T14:16:32Z","title":"Analysis of a multi-target linear shrinkage covariance estimator","summary":"  Multi-target linear shrinkage is an extension of the standard single-target\nlinear shrinkage for covariance estimation. We combine several constant\nmatrices - the targets - with the sample covariance matrix. We derive the\noracle and a \\textit{bona fide} multi-target linear shrinkage estimator with\nexact and empirical mean. In both settings, we proved its convergence towards\nthe oracle under Kolmogorov asymptotics. Finally, we show empirically that it\noutperforms other standard estimators in various situations.\n","authors":["Benoit Oriol"],"pdf_url":"https://arxiv.org/pdf/2405.20086v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.20085v1","updated":"2024-05-30T14:16:19Z","published":"2024-05-30T14:16:19Z","title":"Soft Partitioning of Latent Space for Semantic Channel Equalization","summary":"  Semantic channel equalization has emerged as a solution to address language\nmismatch in multi-user semantic communications. This approach aims to align the\nlatent spaces of an encoder and a decoder which were not jointly trained and it\nrelies on a partition of the semantic (latent) space into atoms based on the\nthe semantic meaning. In this work we explore the role of the semantic space\npartition in scenarios where the task structure involves a one-to-many mapping\nbetween the semantic space and the action space. In such scenarios,\npartitioning based on hard inference results results in loss of information\nwhich degrades the equalization performance. We propose a soft criterion to\nderive the atoms of the partition which leverages the soft decoder's output and\noffers a more comprehensive understanding of the semantic space's structure.\nThrough empirical validation, we demonstrate that soft partitioning yields a\nmore descriptive and regular partition of the space, consequently enhancing the\nperformance of the equalization algorithm.\n","authors":["Tomás Huttebraucker","Mohamed Sana","Emilio Calvanese Strinati"],"pdf_url":"https://arxiv.org/pdf/2405.20085v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.15174v3","updated":"2024-05-30T14:15:22Z","published":"2023-05-24T14:06:02Z","title":"Simultaneous identification of models and parameters of scientific\n  simulators","summary":"  Many scientific models are composed of multiple discrete components, and\nscientists often make heuristic decisions about which components to include.\nBayesian inference provides a mathematical framework for systematically\nselecting model components, but defining prior distributions over model\ncomponents and developing associated inference schemes has been challenging. We\napproach this problem in a simulation-based inference framework: We define\nmodel priors over candidate components and, from model simulations, train\nneural networks to infer joint probability distributions over both model\ncomponents and associated parameters. Our method, simulation-based model\ninference (SBMI), represents distributions over model components as a\nconditional mixture of multivariate binary distributions in the Grassmann\nformalism. SBMI can be applied to any compositional stochastic simulator\nwithout requiring likelihood evaluations. We evaluate SBMI on a simple time\nseries model and on two scientific models from neuroscience, and show that it\ncan discover multiple data-consistent model configurations, and that it reveals\nnon-identifiable model components and parameters. SBMI provides a powerful tool\nfor data-driven scientific inquiry which will allow scientists to identify\nessential model components and make uncertainty-informed modelling decisions.\n","authors":["Cornelius Schröder","Jakob H. Macke"],"pdf_url":"https://arxiv.org/pdf/2305.15174v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.05680v2","updated":"2024-05-30T14:12:54Z","published":"2024-02-08T13:58:16Z","title":"Interpretable classifiers for tabular data via discretization and\n  feature selection","summary":"  We introduce a method for computing immediately human interpretable yet\naccurate classifiers from tabular data. The classifiers obtained are short\nBoolean formulas, computed via first discretizing the original data and then\nusing feature selection coupled with a very fast algorithm for producing the\nbest possible Boolean classifier for the setting. We demonstrate the approach\nvia 13 experiments, obtaining results with accuracies comparable to ones\nobtained via random forests, XGBoost, and existing results for the same\ndatasets in the literature. In most cases, the accuracy of our method is in\nfact similar to that of the reference methods, even though the main objective\nof our study is the immediate interpretability of our classifiers. We also\nprove a new result on the probability that the classifier we obtain from\nreal-life data corresponds to the ideally best classifier with respect to the\nbackground distribution the data comes from.\n","authors":["Reijo Jaakkola","Tomi Janhunen","Antti Kuusisto","Masood Feyzbakhsh Rankooh","Miikka Vilander"],"pdf_url":"https://arxiv.org/pdf/2402.05680v2.pdf","comment":"Changes in relation to version 1: more thorough and detailed\n  experiments, general corrections and refinements"},{"id":"http://arxiv.org/abs/2405.20082v1","updated":"2024-05-30T14:11:29Z","published":"2024-05-30T14:11:29Z","title":"Segment, Shuffle, and Stitch: A Simple Mechanism for Improving\n  Time-Series Representations","summary":"  Existing approaches for learning representations of time-series keep the\ntemporal arrangement of the time-steps intact with the presumption that the\noriginal order is the most optimal for learning. However, non-adjacent sections\nof real-world time-series may have strong dependencies. Accordingly we raise\nthe question: Is there an alternative arrangement for time-series which could\nenable more effective representation learning? To address this, we propose a\nsimple plug-and-play mechanism called Segment, Shuffle, and Stitch (S3)\ndesigned to improve time-series representation learning of existing models. S3\nworks by creating non-overlapping segments from the original sequence and\nshuffling them in a learned manner that is the most optimal for the task at\nhand. It then re-attaches the shuffled segments back together and performs a\nlearned weighted sum with the original input to capture both the newly shuffled\nsequence along with the original sequence. S3 is modular and can be stacked to\ncreate various degrees of granularity, and can be added to many forms of neural\narchitectures including CNNs or Transformers with negligible computation\noverhead. Through extensive experiments on several datasets and\nstate-of-the-art baselines, we show that incorporating S3 results in\nsignificant improvements for the tasks of time-series classification and\nforecasting, improving performance on certain datasets by up to 68\\%. We also\nshow that S3 makes the learning more stable with a smoother training loss curve\nand loss landscape compared to the original baseline. The code is available at\nhttps://github.com/shivam-grover/S3-TimeSeries .\n","authors":["Shivam Grover","Amin Jalali","Ali Etemad"],"pdf_url":"https://arxiv.org/pdf/2405.20082v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.20079v1","updated":"2024-05-30T14:09:43Z","published":"2024-05-30T14:09:43Z","title":"Student Answer Forecasting: Transformer-Driven Answer Choice Prediction\n  for Language Learning","summary":"  Intelligent Tutoring Systems (ITS) enhance personalized learning by\npredicting student answers to provide immediate and customized instruction.\nHowever, recent research has primarily focused on the correctness of the answer\nrather than the student's performance on specific answer choices, limiting\ninsights into students' thought processes and potential misconceptions. To\naddress this gap, we present MCQStudentBert, an answer forecasting model that\nleverages the capabilities of Large Language Models (LLMs) to integrate\ncontextual understanding of students' answering history along with the text of\nthe questions and answers. By predicting the specific answer choices students\nare likely to make, practitioners can easily extend the model to new answer\nchoices or remove answer choices for the same multiple-choice question (MCQ)\nwithout retraining the model. In particular, we compare MLP, LSTM, BERT, and\nMistral 7B architectures to generate embeddings from students' past\ninteractions, which are then incorporated into a finetuned BERT's\nanswer-forecasting mechanism. We apply our pipeline to a dataset of language\nlearning MCQ, gathered from an ITS with over 10,000 students to explore the\npredictive accuracy of MCQStudentBert, which incorporates student interaction\npatterns, in comparison to correct answer prediction and traditional\nmastery-learning feature-based approaches. This work opens the door to more\npersonalized content, modularization, and granular support.\n","authors":["Elena Grazia Gado","Tommaso Martorella","Luca Zunino","Paola Mejia-Domenzain","Vinitra Swamy","Jibril Frej","Tanja Käser"],"pdf_url":"https://arxiv.org/pdf/2405.20079v1.pdf","comment":"Accepted as a poster paper at EDM 2024: 17th International Conference\n  on Educational Data Mining in Atlanta, USA"},{"id":"http://arxiv.org/abs/2402.03271v2","updated":"2024-05-30T14:03:35Z","published":"2024-02-05T18:28:44Z","title":"Uncertainty of Thoughts: Uncertainty-Aware Planning Enhances Information\n  Seeking in Large Language Models","summary":"  In the face of uncertainty, the ability to *seek information* is of\nfundamental importance. In many practical applications, such as medical\ndiagnosis and troubleshooting, the information needed to solve the task is not\ninitially given and has to be actively sought by asking follow-up questions\n(for example, a doctor asking a patient for more details about their symptoms).\nIn this work, we introduce Uncertainty of Thoughts (UoT), an algorithm to\naugment large language models with the ability to actively seek information by\nasking effective questions. UoT combines 1) an *uncertainty-aware simulation\napproach* which enables the model to simulate possible future scenarios and how\nlikely they are to occur, 2) *uncertainty-based rewards* motivated by\ninformation gain which incentivizes the model to seek information, and 3) a\n*reward propagation scheme* to select the optimal question to ask in a way that\nmaximizes the expected reward. In experiments on medical diagnosis,\ntroubleshooting, and the `20 Questions` game, UoT achieves an average\nperformance improvement of 38.1% in the rate of successful task completion\nacross multiple LLMs compared with direct prompting and also improves\nefficiency (i.e., the number of questions needed to complete the task). Our\ncode has been released [here](https://github.com/zhiyuanhubj/UoT)\n","authors":["Zhiyuan Hu","Chumin Liu","Xidong Feng","Yilun Zhao","See-Kiong Ng","Anh Tuan Luu","Junxian He","Pang Wei Koh","Bryan Hooi"],"pdf_url":"https://arxiv.org/pdf/2402.03271v2.pdf","comment":"Update Results"},{"id":"http://arxiv.org/abs/2405.20071v1","updated":"2024-05-30T14:01:02Z","published":"2024-05-30T14:01:02Z","title":"A Staged Approach using Machine Learning and Uncertainty Quantification\n  to Predict the Risk of Hip Fracture","summary":"  Despite advancements in medical care, hip fractures impose a significant\nburden on individuals and healthcare systems. This paper focuses on the\nprediction of hip fracture risk in older and middle-aged adults, where falls\nand compromised bone quality are predominant factors. We propose a novel staged\nmodel that combines advanced imaging and clinical data to improve predictive\nperformance. By using CNNs to extract features from hip DXA images, along with\nclinical variables, shape measurements, and texture features, our method\nprovides a comprehensive framework for assessing fracture risk. A staged\nmachine learning-based model was developed using two ensemble models: Ensemble\n1 (clinical variables only) and Ensemble 2 (clinical variables and DXA imaging\nfeatures). This staged approach used uncertainty quantification from Ensemble 1\nto decide if DXA features are necessary for further prediction. Ensemble 2\nexhibited the highest performance, achieving an AUC of 0.9541, an accuracy of\n0.9195, a sensitivity of 0.8078, and a specificity of 0.9427. The staged model\nalso performed well, with an AUC of 0.8486, an accuracy of 0.8611, a\nsensitivity of 0.5578, and a specificity of 0.9249, outperforming Ensemble 1,\nwhich had an AUC of 0.5549, an accuracy of 0.7239, a sensitivity of 0.1956, and\na specificity of 0.8343. Furthermore, the staged model suggested that 54.49% of\npatients did not require DXA scanning. It effectively balanced accuracy and\nspecificity, offering a robust solution when DXA data acquisition is not always\nfeasible. Statistical tests confirmed significant differences between the\nmodels, highlighting the advantages of the advanced modeling strategies. Our\nstaged approach could identify individuals at risk with a high accuracy but\nreduce the unnecessary DXA scanning. It has great promise to guide\ninterventions to prevent hip fractures with reduced cost and radiation.\n","authors":["Anjum Shaik","Kristoffer Larsen","Nancy E. Lane","Chen Zhao","Kuan-Jui Su","Joyce H. Keyak","Qing Tian","Qiuying Sha","Hui Shen","Hong-Wen Deng","Weihua Zhou"],"pdf_url":"https://arxiv.org/pdf/2405.20071v1.pdf","comment":"29 pages, 5 figures, 6 tables"},{"id":"http://arxiv.org/abs/2306.08970v2","updated":"2024-05-30T13:46:34Z","published":"2023-06-15T09:05:36Z","title":"An Efficient and Multi-private Key Secure Aggregation for Federated\n  Learning","summary":"  With the emergence of privacy leaks in federated learning, secure aggregation\nprotocols that mainly adopt either homomorphic encryption or threshold secret\nsharing have been widely developed for federated learning to protect the\nprivacy of the local training data of each client. However, these existing\nprotocols suffer from many shortcomings, such as the dependence on a trusted\nthird party, the vulnerability to clients being corrupted, low efficiency, the\ntrade-off between security and fault tolerance, etc. To solve these\ndisadvantages, we propose an efficient and multi-private key secure aggregation\nscheme for federated learning. Specifically, we skillfully modify the variant\nElGamal encryption technique to achieve homomorphic addition operation, which\nhas two important advantages: 1) The server and each client can freely select\npublic and private keys without introducing a trust third party and 2) Compared\nto the variant ElGamal encryption, the plaintext space is relatively large,\nwhich is more suitable for the deep model. Besides, for the high dimensional\ndeep model parameter, we introduce a super-increasing sequence to compress\nmulti-dimensional data into 1-D, which can greatly reduce encryption and\ndecryption times as well as communication for ciphertext transmission. Detailed\nsecurity analyses show that our proposed scheme achieves the semantic security\nof both individual local gradients and the aggregated result while achieving\noptimal robustness in tolerating both client collusion and dropped clients.\nExtensive simulations demonstrate that the accuracy of our scheme is almost the\nsame as the non-private approach, while the efficiency of our scheme is much\nbetter than the state-of-the-art homomorphic encryption-based secure\naggregation schemes. More importantly, the efficiency advantages of our scheme\nwill become increasingly prominent as the number of model parameters increases.\n","authors":["Xue Yang","Zifeng Liu","Xiaohu Tang","Rongxing Lu","Bo Liu"],"pdf_url":"https://arxiv.org/pdf/2306.08970v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.15769v2","updated":"2024-05-30T13:43:44Z","published":"2023-09-27T16:41:10Z","title":"Algebraic and Statistical Properties of the Ordinary Least Squares\n  Interpolator","summary":"  Deep learning research has uncovered the phenomenon of benign overfitting for\noverparameterized statistical models, which has drawn significant theoretical\ninterest in recent years. Given its simplicity and practicality, the ordinary\nleast squares (OLS) interpolator has become essential to gain foundational\ninsights into this phenomenon. While properties of OLS are well established in\nclassical, underparameterized settings, its behavior in high-dimensional,\noverparameterized regimes is less explored (unlike for ridge or lasso\nregression) though significant progress has been made of late. We contribute to\nthis growing literature by providing fundamental algebraic and statistical\nresults for the minimum $\\ell_2$-norm OLS interpolator. In particular, we\nprovide algebraic equivalents of (i) the leave-$k$-out residual formula, (ii)\nCochran's formula, and (iii) the Frisch-Waugh-Lovell theorem in the\noverparameterized regime. These results aid in understanding the OLS\ninterpolator's ability to generalize and have substantive implications for\ncausal inference. Under the Gauss-Markov model, we present statistical results\nsuch as an extension of the Gauss-Markov theorem and an analysis of variance\nestimation under homoskedastic errors for the overparameterized regime. To\nsubstantiate our theoretical contributions, we conduct simulations that further\nexplore the stochastic properties of the OLS interpolator.\n","authors":["Dennis Shen","Dogyoon Song","Peng Ding","Jasjeet S. Sekhon"],"pdf_url":"https://arxiv.org/pdf/2309.15769v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.20053v1","updated":"2024-05-30T13:38:52Z","published":"2024-05-30T13:38:52Z","title":"Would I Lie To You? Inference Time Alignment of Language Models using\n  Direct Preference Heads","summary":"  Pre-trained Language Models (LMs) exhibit strong zero-shot and in-context\nlearning capabilities; however, their behaviors are often difficult to control.\nBy utilizing Reinforcement Learning from Human Feedback (RLHF), it is possible\nto fine-tune unsupervised LMs to follow instructions and produce outputs that\nreflect human preferences. Despite its benefits, RLHF has been shown to\npotentially harm a language model's reasoning capabilities and introduce\nartifacts such as hallucinations where the model may fabricate facts. To\naddress this issue we introduce Direct Preference Heads (DPH), a fine-tuning\nframework that enables LMs to learn human preference signals through an\nauxiliary reward head without directly affecting the output distribution of the\nlanguage modeling head. We perform a theoretical analysis of our objective\nfunction and find strong ties to Conservative Direct Preference Optimization\n(cDPO). Finally we evaluate our models on GLUE, RACE, and the GPT4All\nevaluation suite and demonstrate that our method produces models which achieve\nhigher scores than those fine-tuned with Supervised Fine-Tuning (SFT) or Direct\nPreference Optimization (DPO) alone.\n","authors":["Avelina Asada Hadji-Kyriacou","Ognjen Arandjelovic"],"pdf_url":"https://arxiv.org/pdf/2405.20053v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.20052v1","updated":"2024-05-30T13:38:28Z","published":"2024-05-30T13:38:28Z","title":"A Hardware-Efficient EMG Decoder with an Attractor-based Neural Network\n  for Next-Generation Hand Prostheses","summary":"  Advancements in neural engineering have enabled the development of Robotic\nProsthetic Hands (RPHs) aimed at restoring hand functionality. Current\ncommercial RPHs offer limited control through basic on/off commands. Recent\nprogresses in machine learning enable finger movement decoding with higher\ndegrees of freedom, yet the high computational complexity of such models limits\ntheir application in portable devices. Future RPH designs must balance\nportability, low power consumption, and high decoding accuracy to be practical\nfor individuals with disabilities. To this end, we introduce a novel\nattractor-based neural network to realize on-chip movement decoding for\nnext-generation portable RPHs. The proposed architecture comprises an encoder,\nan attention layer, an attractor network, and a refinement regressor. We tested\nour model on four healthy subjects and achieved a decoding accuracy of\n80.6\\pm3.3\\%. Our proposed model is over 120 and 50 times more compact compared\nto state-of-the-art LSTM and CNN models, respectively, with comparable (or\nsuperior) decoding accuracy. Therefore, it exhibits minimal hardware complexity\nand can be effectively integrated as a System-on-Chip.\n","authors":["Mohammad Kalbasi","MohammadAli Shaeri","Vincent Alexandre Mendez","Solaiman Shokur","Silvestro Micera","Mahsa Shoaran"],"pdf_url":"https://arxiv.org/pdf/2405.20052v1.pdf","comment":"\\c{opyright} 2024 IEEE. Personal use of this material is permitted.\n  Permission from IEEE must be obtained for all other uses, in any current or\n  future media, including reprinting/republishing this material for advertising\n  or promotional purposes, creating new collective works, for resale or\n  redistribution to servers or lists, or reuse of any copyrighted component of\n  this work in other works"},{"id":"http://arxiv.org/abs/2405.20051v1","updated":"2024-05-30T13:37:53Z","published":"2024-05-30T13:37:53Z","title":"Threshold-Independent Fair Matching through Score Calibration","summary":"  Entity Matching (EM) is a critical task in numerous fields, such as\nhealthcare, finance, and public administration, as it identifies records that\nrefer to the same entity within or across different databases. EM faces\nconsiderable challenges, particularly with false positives and negatives. These\nare typically addressed by generating matching scores and apply thresholds to\nbalance false positives and negatives in various contexts. However, adjusting\nthese thresholds can affect the fairness of the outcomes, a critical factor\nthat remains largely overlooked in current fair EM research. The existing body\nof research on fair EM tends to concentrate on static thresholds, neglecting\ntheir critical impact on fairness. To address this, we introduce a new approach\nin EM using recent metrics for evaluating biases in score based binary\nclassification, particularly through the lens of distributional parity. This\napproach enables the application of various bias metrics like equalized odds,\nequal opportunity, and demographic parity without depending on threshold\nsettings. Our experiments with leading matching methods reveal potential\nbiases, and by applying a calibration technique for EM scores using Wasserstein\nbarycenters, we not only mitigate these biases but also preserve accuracy\nacross real world datasets. This paper contributes to the field of fairness in\ndata cleaning, especially within EM, which is a central task in data cleaning,\nby promoting a method for generating matching scores that reduce biases across\ndifferent thresholds.\n","authors":["Mohammad Hossein Moslemi","Mostafa Milani"],"pdf_url":"https://arxiv.org/pdf/2405.20051v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.20045v1","updated":"2024-05-30T13:27:17Z","published":"2024-05-30T13:27:17Z","title":"Iterative Learning Control of Fast, Nonlinear, Oscillatory Dynamics\n  (Preprint)","summary":"  The sudden onset of deleterious and oscillatory dynamics (often called\ninstabilities) is a known challenge in many fluid, plasma, and aerospace\nsystems. These dynamics are difficult to address because they are nonlinear,\nchaotic, and are often too fast for active control schemes. In this work, we\ndevelop an alternative active controls system using an iterative,\ntrajectory-optimization and parameter-tuning approach based on Iterative\nLearning Control (ILC), Time-Lagged Phase Portraits (TLPP) and Gaussian Process\nRegression (GPR). The novelty of this approach is that it can control a\nsystem's dynamics despite the controller being much slower than the dynamics.\nWe demonstrate this controller on the Lorenz system of equations where it\niteratively adjusts (tunes) the system's input parameters to successfully\nreproduce a desired oscillatory trajectory or state. Additionally, we\ninvestigate the system's dynamical sensitivity to its control parameters,\nidentify continuous and bounded regions of desired dynamical trajectories, and\ndemonstrate that the controller is robust to missing information and\nuncontrollable parameters as long as certain requirements are met. The\ncontroller presented in this work provides a framework for low-speed control\nfor a variety of fast, nonlinear systems that may aid in instability\nsuppression and mitigation.\n","authors":["John W. Brooks","Christine M. Greve"],"pdf_url":"https://arxiv.org/pdf/2405.20045v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.20042v1","updated":"2024-05-30T13:23:02Z","published":"2024-05-30T13:23:02Z","title":"CycleFormer : TSP Solver Based on Language Modeling","summary":"  We propose a new transformer model for the Traveling Salesman Problem (TSP)\ncalled CycleFormer. We identified distinctive characteristics that need to be\nconsidered when applying a conventional transformer model to TSP and aimed to\nfully incorporate these elements into the TSP-specific transformer. Unlike the\ntoken sets in typical language models, which are limited and static, the token\n(node) set in TSP is unlimited and dynamic. To exploit this fact to the\nfullest, we equated the encoder output with the decoder linear layer and\ndirectly connected the context vector of the encoder to the decoder encoding.\nAdditionally, we added a positional encoding to the encoder tokens that\nreflects the two-dimensional nature of TSP, and devised a circular positional\nencoding for the decoder tokens that considers the cyclic properties of a tour.\nBy incorporating these ideas, CycleFormer outperforms state-of-the-art (SOTA)\ntransformer models for TSP from TSP-50 to TSP-500. Notably, on TSP-500, the\noptimality gap was reduced by approximately 2.8 times, from 3.09% to 1.10%,\ncompared to the existing SOTA. The code will be made available at\nhttps://github.com/Giventicket/CycleFormer.\n","authors":["Jieun Yook","Junpyo Seo","Joon Huh","Han Joon Byun","Byung-ro Mooon"],"pdf_url":"https://arxiv.org/pdf/2405.20042v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.20039v1","updated":"2024-05-30T13:19:49Z","published":"2024-05-30T13:19:49Z","title":"Task-Agnostic Machine Learning-Assisted Inference","summary":"  Machine learning (ML) is playing an increasingly important role in scientific\nresearch. In conjunction with classical statistical approaches, ML-assisted\nanalytical strategies have shown great promise in accelerating research\nfindings. This has also opened up a whole new field of methodological research\nfocusing on integrative approaches that leverage both ML and statistics to\ntackle data science challenges. One type of study that has quickly gained\npopularity employs ML to predict unobserved outcomes in massive samples and\nthen uses the predicted outcomes in downstream statistical inference. However,\nexisting methods designed to ensure the validity of this type of\npost-prediction inference are limited to very basic tasks such as linear\nregression analysis. This is because any extension of these approaches to new,\nmore sophisticated statistical tasks requires task-specific algebraic\nderivations and software implementations, which ignores the massive library of\nexisting software tools already developed for complex inference tasks and\nseverely constrains the scope of post-prediction inference in real\napplications. To address this challenge, we propose a novel statistical\nframework for task-agnostic ML-assisted inference. It provides a\npost-prediction inference solution that can be easily plugged into almost any\nestablished data analysis routine. It delivers valid and efficient inference\nthat is robust to arbitrary choices of ML models, while allowing nearly all\nexisting analytical frameworks to be incorporated into the analysis of\nML-predicted outcomes. Through extensive experiments, we showcase the validity,\nversatility, and superiority of our method compared to existing approaches.\n","authors":["Jiacheng Miao","Qiongshi Lu"],"pdf_url":"https://arxiv.org/pdf/2405.20039v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.20029v1","updated":"2024-05-30T13:13:48Z","published":"2024-05-30T13:13:48Z","title":"A Random Forest-based Prediction Model for Turning Points in\n  Antagonistic event-group Competitions","summary":"  At present, most of the prediction studies related to antagonistic\nevent-group competitions focus on the prediction of competition results, and\nless on the prediction of the competition process, which can not provide\nreal-time feedback of the athletes' state information in the actual\ncompetition, and thus can not analyze the changes of the competition situation.\nIn order to solve this problem, this paper proposes a prediction model based on\nRandom Forest for the turning point of the antagonistic event-group. Firstly,\nthe quantitative equation of competitive potential energy is proposed;\nSecondly, the quantitative value of competitive potential energy is obtained by\nusing the dynamic combination of weights method, and the turning point of the\ncompetition situation of the antagonistic event-group is marked according to\nthe quantitative time series graph; Finally, the random forest prediction model\nbased on the optimisation of the KM-SMOTE algorithm and the grid search method\nis established. The experimental analysis shows that: the quantitative equation\nof competitive potential energy can effectively reflect the dynamic situation\nof the competition; The model can effectively predict the turning point of the\ncompetition situation of the antagonistic event-group, and the recall rate of\nthe model in the test set is 86.13%; the model has certain significance for the\nfuture study of the competition situation of the antagonistic event-group.\n","authors":["Zishuo Zhu"],"pdf_url":"https://arxiv.org/pdf/2405.20029v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.20028v1","updated":"2024-05-30T13:13:12Z","published":"2024-05-30T13:13:12Z","title":"A Simple and Adaptive Learning Rate for FTRL in Online Learning with\n  Minimax Regret of $Θ(T^{2/3})$ and its Application to\n  Best-of-Both-Worlds","summary":"  Follow-the-Regularized-Leader (FTRL) is a powerful framework for various\nonline learning problems. By designing its regularizer and learning rate to be\nadaptive to past observations, FTRL is known to work adaptively to various\nproperties of an underlying environment. However, most existing adaptive\nlearning rates are for online learning problems with a minimax regret of\n$\\Theta(\\sqrt{T})$ for the number of rounds $T$, and there are only a few\nstudies on adaptive learning rates for problems with a minimax regret of\n$\\Theta(T^{2/3})$, which include several important problems dealing with\nindirect feedback. To address this limitation, we establish a new adaptive\nlearning rate framework for problems with a minimax regret of\n$\\Theta(T^{2/3})$. Our learning rate is designed by matching the stability,\npenalty, and bias terms that naturally appear in regret upper bounds for\nproblems with a minimax regret of $\\Theta(T^{2/3})$. As applications of this\nframework, we consider two major problems dealing with indirect feedback:\npartial monitoring and graph bandits. We show that FTRL with our learning rate\nand the Tsallis entropy regularizer improves existing Best-of-Both-Worlds\n(BOBW) regret upper bounds, which achieve simultaneous optimality in the\nstochastic and adversarial regimes. The resulting learning rate is surprisingly\nsimple compared to the existing learning rates for BOBW algorithms for problems\nwith a minimax regret of $\\Theta(T^{2/3})$.\n","authors":["Taira Tsuchiya","Shinji Ito"],"pdf_url":"https://arxiv.org/pdf/2405.20028v1.pdf","comment":"31 pages"},{"id":"http://arxiv.org/abs/2401.14535v2","updated":"2024-05-30T13:09:47Z","published":"2024-01-25T22:01:07Z","title":"CaRiNG: Learning Temporal Causal Representation under Non-Invertible\n  Generation Process","summary":"  Identifying the underlying time-delayed latent causal processes in sequential\ndata is vital for grasping temporal dynamics and making downstream reasoning.\nWhile some recent methods can robustly identify these latent causal variables,\nthey rely on strict assumptions about the invertible generation process from\nlatent variables to observed data. However, these assumptions are often hard to\nsatisfy in real-world applications containing information loss. For instance,\nthe visual perception process translates a 3D space into 2D images, or the\nphenomenon of persistence of vision incorporates historical data into current\nperceptions. To address this challenge, we establish an identifiability theory\nthat allows for the recovery of independent latent components even when they\ncome from a nonlinear and non-invertible mix. Using this theory as a\nfoundation, we propose a principled approach, CaRiNG, to learn the CAusal\nRepresentatIon of Non-invertible Generative temporal data with identifiability\nguarantees. Specifically, we utilize temporal context to recover lost latent\ninformation and apply the conditions in our theory to guide the training\nprocess. Through experiments conducted on synthetic datasets, we validate that\nour CaRiNG method reliably identifies the causal process, even when the\ngeneration process is non-invertible. Moreover, we demonstrate that our\napproach considerably improves temporal understanding and reasoning in\npractical applications.\n","authors":["Guangyi Chen","Yifan Shen","Zhenhao Chen","Xiangchen Song","Yuewen Sun","Weiran Yao","Xiao Liu","Kun Zhang"],"pdf_url":"https://arxiv.org/pdf/2401.14535v2.pdf","comment":"To appear at ICML 2024, 24 pages"},{"id":"http://arxiv.org/abs/2402.07865v2","updated":"2024-05-30T13:08:48Z","published":"2024-02-12T18:21:14Z","title":"Prismatic VLMs: Investigating the Design Space of Visually-Conditioned\n  Language Models","summary":"  Visually-conditioned language models (VLMs) have seen growing adoption in\napplications such as visual dialogue, scene understanding, and robotic task\nplanning; adoption that has fueled a wealth of new models such as LLaVa,\nInstructBLIP, and PaLI-3. Despite the volume of new releases, key design\ndecisions around image preprocessing, architecture, and optimization are\nunder-explored, making it challenging to understand what factors account for\nmodel performance $-$ a challenge further complicated by the lack of objective,\nconsistent evaluations. To address these gaps, we first compile a suite of\nstandardized evaluations spanning visual question answering, object\nlocalization, and challenge sets that probe properties such as hallucination;\nevaluations that provide fine-grained insight VLM capabilities. Second, we\nrigorously investigate VLMs along key design axes, including pretrained visual\nrepresentations and training from base vs. instruct-tuned language models,\namongst others. We couple our analysis with three resource contributions: (1) a\nunified framework for evaluating VLMs, (2) optimized, flexible training code,\nand (3) checkpoints for all models, including a family of VLMs at the 7-13B\nscale that strictly outperform InstructBLIP and LLaVa v1.5, the\nstate-of-the-art in open VLMs.\n","authors":["Siddharth Karamcheti","Suraj Nair","Ashwin Balakrishna","Percy Liang","Thomas Kollar","Dorsa Sadigh"],"pdf_url":"https://arxiv.org/pdf/2402.07865v2.pdf","comment":"Published at ICML 2024. 22 pages, 11 figures. Training code and\n  models: https://github.com/TRI-ML/prismatic-vlms. Evaluation code:\n  https://github.com/TRI-ML/vlm-evaluation"},{"id":"http://arxiv.org/abs/2310.00841v3","updated":"2024-05-30T13:03:32Z","published":"2023-10-02T01:30:42Z","title":"Drug Discovery with Dynamic Goal-aware Fragments","summary":"  Fragment-based drug discovery is an effective strategy for discovering drug\ncandidates in the vast chemical space, and has been widely employed in\nmolecular generative models. However, many existing fragment extraction methods\nin such models do not take the target chemical properties into account or rely\non heuristic rules. Additionally, the existing fragment-based generative models\ncannot update the fragment vocabulary with goal-aware fragments newly\ndiscovered during the generation. To this end, we propose a molecular\ngenerative framework for drug discovery, named Goal-aware fragment Extraction,\nAssembly, and Modification (GEAM). GEAM consists of three modules, each\nresponsible for goal-aware fragment extraction, fragment assembly, and fragment\nmodification. The fragment extraction module identifies important fragments\ncontributing to the desired target properties with the information bottleneck\nprinciple, thereby constructing an effective goal-aware fragment vocabulary.\nMoreover, GEAM can explore beyond the initial vocabulary with the fragment\nmodification module, and the exploration is further enhanced through the\ndynamic goal-aware vocabulary update. We experimentally demonstrate that GEAM\neffectively discovers drug candidates through the generative cycle of the three\nmodules in various drug discovery tasks. Our code is available at\nhttps://github.com/SeulLee05/GEAM.\n","authors":["Seul Lee","Seanie Lee","Kenji Kawaguchi","Sung Ju Hwang"],"pdf_url":"https://arxiv.org/pdf/2310.00841v3.pdf","comment":"ICML 2024"},{"id":"http://arxiv.org/abs/2405.20018v1","updated":"2024-05-30T12:57:35Z","published":"2024-05-30T12:57:35Z","title":"Safe Multi-agent Reinforcement Learning with Natural Language\n  Constraints","summary":"  The role of natural language constraints in Safe Multi-agent Reinforcement\nLearning (MARL) is crucial, yet often overlooked. While Safe MARL has vast\npotential, especially in fields like robotics and autonomous vehicles, its full\npotential is limited by the need to define constraints in pre-designed\nmathematical terms, which requires extensive domain expertise and reinforcement\nlearning knowledge, hindering its broader adoption. To address this limitation\nand make Safe MARL more accessible and adaptable, we propose a novel approach\nnamed Safe Multi-agent Reinforcement Learning with Natural Language constraints\n(SMALL). Our method leverages fine-tuned language models to interpret and\nprocess free-form textual constraints, converting them into semantic embeddings\nthat capture the essence of prohibited states and behaviours. These embeddings\nare then integrated into the multi-agent policy learning process, enabling\nagents to learn policies that minimize constraint violations while optimizing\nrewards. To evaluate the effectiveness of SMALL, we introduce the LaMaSafe, a\nmulti-task benchmark designed to assess the performance of multiple agents in\nadhering to natural language constraints. Empirical evaluations across various\nenvironments demonstrate that SMALL achieves comparable rewards and\nsignificantly fewer constraint violations, highlighting its effectiveness in\nunderstanding and enforcing natural language constraints.\n","authors":["Ziyan Wang","Meng Fang","Tristan Tomilin","Fei Fang","Yali Du"],"pdf_url":"https://arxiv.org/pdf/2405.20018v1.pdf","comment":"23 pages, 6 figures"},{"id":"http://arxiv.org/abs/2405.20014v1","updated":"2024-05-30T12:49:34Z","published":"2024-05-30T12:49:34Z","title":"subMFL: Compatiple subModel Generation for Federated Learning in Device\n  Heterogenous Environment","summary":"  Federated Learning (FL) is commonly used in systems with distributed and\nheterogeneous devices with access to varying amounts of data and diverse\ncomputing and storage capacities. FL training process enables such devices to\nupdate the weights of a shared model locally using their local data and then a\ntrusted central server combines all of those models to generate a global model.\nIn this way, a global model is generated while the data remains local to\ndevices to preserve privacy. However, training large models such as Deep Neural\nNetworks (DNNs) on resource-constrained devices can take a prohibitively long\ntime and consume a large amount of energy. In the current process, the\nlow-capacity devices are excluded from the training process, although they\nmight have access to unseen data. To overcome this challenge, we propose a\nmodel compression approach that enables heterogeneous devices with varying\ncomputing capacities to participate in the FL process. In our approach, the\nserver shares a dense model with all devices to train it: Afterwards, the\ntrained model is gradually compressed to obtain submodels with varying levels\nof sparsity to be used as suitable initial global models for\nresource-constrained devices that were not capable of train the first dense\nmodel. This results in an increased participation rate of resource-constrained\ndevices while the transferred weights from the previous round of training are\npreserved. Our validation experiments show that despite reaching about 50 per\ncent global sparsity, generated submodels maintain their accuracy while can be\nshared to increase participation by around 50 per cent.\n","authors":["Zeyneddin Oz","Ceylan Soygul Oz","Abdollah Malekjafarian","Nima Afraz","Fatemeh Golpayegani"],"pdf_url":"https://arxiv.org/pdf/2405.20014v1.pdf","comment":"12 pages, 7 figures, European Conference on Parallel Processing, pp.\n  between 52 and 64, Springer, 2023"},{"id":"http://arxiv.org/abs/2405.20012v1","updated":"2024-05-30T12:48:44Z","published":"2024-05-30T12:48:44Z","title":"FlexiDrop: Theoretical Insights and Practical Advances in Random Dropout\n  Method on GNNs","summary":"  Graph Neural Networks (GNNs) are powerful tools for handling graph-type data.\nRecently, GNNs have been widely applied in various domains, but they also face\nsome issues, such as overfitting, over-smoothing and non-robustness. The\nexisting research indicates that random dropout methods are an effective way to\naddress these issues. However, random dropout methods in GNNs still face\nunresolved problems. Currently, the choice of dropout rate, often determined by\nheuristic or grid search methods, can increase the generalization error,\ncontradicting the principal aims of dropout. In this paper, we propose a novel\nrandom dropout method for GNNs called FlexiDrop. First, we conduct a\ntheoretical analysis of dropout in GNNs using rademacher complexity and\ndemonstrate that the generalization error of traditional random dropout methods\nis constrained by a function related to the dropout rate. Subsequently, we use\nthis function as a regularizer to unify the dropout rate and empirical loss\nwithin a single loss function, optimizing them simultaneously. Therefore, our\nmethod enables adaptive adjustment of the dropout rate and theoretically\nbalances the trade-off between model complexity and generalization ability.\nFurthermore, extensive experimental results on benchmark datasets show that\nFlexiDrop outperforms traditional random dropout methods in GNNs.\n","authors":["Zhiheng Zhou","Sihao Liu","Weichen Zhao"],"pdf_url":"https://arxiv.org/pdf/2405.20012v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.20003v1","updated":"2024-05-30T12:42:05Z","published":"2024-05-30T12:42:05Z","title":"Kernel Language Entropy: Fine-grained Uncertainty Quantification for\n  LLMs from Semantic Similarities","summary":"  Uncertainty quantification in Large Language Models (LLMs) is crucial for\napplications where safety and reliability are important. In particular,\nuncertainty can be used to improve the trustworthiness of LLMs by detecting\nfactually incorrect model responses, commonly called hallucinations.\nCritically, one should seek to capture the model's semantic uncertainty, i.e.,\nthe uncertainty over the meanings of LLM outputs, rather than uncertainty over\nlexical or syntactic variations that do not affect answer correctness. To\naddress this problem, we propose Kernel Language Entropy (KLE), a novel method\nfor uncertainty estimation in white- and black-box LLMs. KLE defines positive\nsemidefinite unit trace kernels to encode the semantic similarities of LLM\noutputs and quantifies uncertainty using the von Neumann entropy. It considers\npairwise semantic dependencies between answers (or semantic clusters),\nproviding more fine-grained uncertainty estimates than previous methods based\non hard clustering of answers. We theoretically prove that KLE generalizes the\nprevious state-of-the-art method called semantic entropy and empirically\ndemonstrate that it improves uncertainty quantification performance across\nmultiple natural language generation datasets and LLM architectures.\n","authors":["Alexander Nikitin","Jannik Kossen","Yarin Gal","Pekka Marttinen"],"pdf_url":"https://arxiv.org/pdf/2405.20003v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.16510v3","updated":"2024-05-30T12:40:06Z","published":"2024-05-26T10:33:17Z","title":"Meta-Task Planning for Language Agents","summary":"  The rapid advancement of neural language models has sparked a new surge of\nintelligent agent research. Unlike traditional agents, large language\nmodel-based agents (LLM agents) have emerged as a promising paradigm for\nachieving artificial general intelligence (AGI) due to their superior reasoning\nand generalization capabilities. Effective planning is crucial for the success\nof LLM agents in real-world tasks, making it a highly pursued topic in the\ncommunity. Current planning methods typically translate tasks into executable\naction sequences. However, determining a feasible or optimal sequence for\ncomplex tasks at fine granularity, which often requires compositing long chains\nof heterogeneous actions, remains challenging. This paper introduces Meta-Task\nPlanning (MTP), a zero-shot methodology for collaborative LLM-based multi-agent\nsystems that simplifies complex task planning by decomposing it into a\nhierarchy of subordinate tasks, or meta-tasks. Each meta-task is then mapped\ninto executable actions. MTP was assessed on two rigorous benchmarks,\nTravelPlanner and API-Bank. Notably, MTP achieved an average $\\sim40\\%$ success\nrate on TravelPlanner, significantly higher than the state-of-the-art (SOTA)\nbaseline ($2.92\\%$), and outperforming $LLM_{api}$-4 with ReAct on API-Bank by\n$\\sim14\\%$, showing the immense potential of integrating LLM with multi-agent\nsystems.\n","authors":["Cong Zhang","Derrick Goh Xin Deik","Dexun Li","Hao Zhang","Yong Liu"],"pdf_url":"https://arxiv.org/pdf/2405.16510v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.19995v1","updated":"2024-05-30T12:32:18Z","published":"2024-05-30T12:32:18Z","title":"Symmetries in Overparametrized Neural Networks: A Mean-Field View","summary":"  We develop a Mean-Field (MF) view of the learning dynamics of\noverparametrized Artificial Neural Networks (NN) under data symmetric in law\nwrt the action of a general compact group $G$. We consider for this a class of\ngeneralized shallow NNs given by an ensemble of $N$ multi-layer units, jointly\ntrained using stochastic gradient descent (SGD) and possibly\nsymmetry-leveraging (SL) techniques, such as Data Augmentation (DA), Feature\nAveraging (FA) or Equivariant Architectures (EA). We introduce the notions of\nweakly and strongly invariant laws (WI and SI) on the parameter space of each\nsingle unit, corresponding, respectively, to $G$-invariant distributions, and\nto distributions supported on parameters fixed by the group action (which\nencode EA). This allows us to define symmetric models compatible with taking\n$N\\to\\infty$ and give an interpretation of the asymptotic dynamics of DA, FA\nand EA in terms of Wasserstein Gradient Flows describing their MF limits. When\nactivations respect the group action, we show that, for symmetric data, DA, FA\nand freely-trained models obey the exact same MF dynamic, which stays in the\nspace of WI laws and minimizes therein the population risk. We also give a\ncounterexample to the general attainability of an optimum over SI laws. Despite\nthis, quite remarkably, we show that the set of SI laws is also preserved by\nthe MF dynamics even when freely trained. This sharply contrasts the finite-$N$\nsetting, in which EAs are generally not preserved by unconstrained SGD. We\nillustrate the validity of our findings as $N$ gets larger in a teacher-student\nexperimental setting, training a student NN to learn from a WI, SI or arbitrary\nteacher model through various SL schemes. We last deduce a data-driven\nheuristic to discover the largest subspace of parameters supporting SI\ndistributions for a problem, that could be used for designing EA with minimal\ngeneralization error.\n","authors":["Javier Maass Martínez","Joaquin Fontbona"],"pdf_url":"https://arxiv.org/pdf/2405.19995v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.19988v1","updated":"2024-05-30T12:18:06Z","published":"2024-05-30T12:18:06Z","title":"Video-Language Critic: Transferable Reward Functions for\n  Language-Conditioned Robotics","summary":"  Natural language is often the easiest and most convenient modality for humans\nto specify tasks for robots. However, learning to ground language to behavior\ntypically requires impractical amounts of diverse, language-annotated\ndemonstrations collected on each target robot. In this work, we aim to separate\nthe problem of what to accomplish from how to accomplish it, as the former can\nbenefit from substantial amounts of external observation-only data, and only\nthe latter depends on a specific robot embodiment. To this end, we propose\nVideo-Language Critic, a reward model that can be trained on readily available\ncross-embodiment data using contrastive learning and a temporal ranking\nobjective, and use it to score behavior traces from a separate reinforcement\nlearning actor. When trained on Open X-Embodiment data, our reward model\nenables 2x more sample-efficient policy training on Meta-World tasks than a\nsparse reward only, despite a significant domain gap. Using in-domain data but\nin a challenging task generalization setting on Meta-World, we further\ndemonstrate more sample-efficient training than is possible with prior\nlanguage-conditioned reward models that are either trained with binary\nclassification, use static images, or do not leverage the temporal information\npresent in video data.\n","authors":["Minttu Alakuijala","Reginald McLean","Isaac Woungang","Nariman Farsad","Samuel Kaski","Pekka Marttinen","Kai Yuan"],"pdf_url":"https://arxiv.org/pdf/2405.19988v1.pdf","comment":"10 pages in the main text, 16 pages including references and\n  supplementary materials. 4 figures and 3 tables in the main text, 1 table in\n  supplementary materials"},{"id":"http://arxiv.org/abs/2405.19985v1","updated":"2024-05-30T12:14:25Z","published":"2024-05-30T12:14:25Z","title":"Targeted Sequential Indirect Experiment Design","summary":"  Scientific hypotheses typically concern specific aspects of complex,\nimperfectly understood or entirely unknown mechanisms, such as the effect of\ngene expression levels on phenotypes or how microbial communities influence\nenvironmental health. Such queries are inherently causal (rather than purely\nassociational), but in many settings, experiments can not be conducted directly\non the target variables of interest, but are indirect. Therefore, they perturb\nthe target variable, but do not remove potential confounding factors. If,\nadditionally, the resulting experimental measurements are multi-dimensional and\nthe studied mechanisms nonlinear, the query of interest is generally not\nidentified. We develop an adaptive strategy to design indirect experiments that\noptimally inform a targeted query about the ground truth mechanism in terms of\nsequentially narrowing the gap between an upper and lower bound on the query.\nWhile the general formulation consists of a bi-level optimization procedure, we\nderive an efficiently estimable analytical kernel-based estimator of the bounds\nfor the causal effect, a query of key interest, and demonstrate the efficacy of\nour approach in confounded, multivariate, nonlinear synthetic settings.\n","authors":["Elisabeth Ailer","Niclas Dern","Jason Hartford","Niki Kilbertus"],"pdf_url":"https://arxiv.org/pdf/2405.19985v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.15240v2","updated":"2024-05-30T12:14:05Z","published":"2024-05-24T06:06:41Z","title":"Towards Real World Debiasing: A Fine-grained Analysis On Spurious\n  Correlation","summary":"  Spurious correlations in training data significantly hinder the\ngeneralization capability of machine learning models when faced with\ndistribution shifts in real-world scenarios. To tackle the problem, numerous\ndebias approaches have been proposed and benchmarked on datasets intentionally\ndesigned with severe biases. However, it remains to be asked: \\textit{1. Do\nexisting benchmarks really capture biases in the real world? 2. Can existing\ndebias methods handle biases in the real world?} To answer the questions, we\nrevisit biased distributions in existing benchmarks and real-world datasets,\nand propose a fine-grained framework for analyzing dataset bias by\ndisentangling it into the magnitude and prevalence of bias. We observe and\ntheoretically demonstrate that existing benchmarks poorly represent real-world\nbiases. We further introduce two novel biased distributions to bridge this gap,\nforming a nuanced evaluation framework for real-world debiasing. Building upon\nthese results, we evaluate existing debias methods with our evaluation\nframework. Results show that existing methods are incapable of handling\nreal-world biases. Through in-depth analysis, we propose a simple yet effective\napproach that can be easily applied to existing debias methods, named Debias in\nDestruction (DiD). Empirical results demonstrate the superiority of DiD,\nimproving the performance of existing methods on all types of biases within the\nproposed evaluation framework.\n","authors":["Zhibo Wang","Peng Kuang","Zhixuan Chu","Jingyi Wang","Kui Ren"],"pdf_url":"https://arxiv.org/pdf/2405.15240v2.pdf","comment":"9 pages of main paper, 10 pages of appendix"},{"id":"http://arxiv.org/abs/2405.04923v2","updated":"2024-05-30T12:04:17Z","published":"2024-05-08T09:45:54Z","title":"DataSP: A Differential All-to-All Shortest Path Algorithm for Learning\n  Costs and Predicting Paths with Context","summary":"  Learning latent costs of transitions on graphs from trajectories\ndemonstrations under various contextual features is challenging but useful for\npath planning. Yet, existing methods either oversimplify cost assumptions or\nscale poorly with the number of observed trajectories. This paper introduces\nDataSP, a differentiable all-to-all shortest path algorithm to facilitate\nlearning latent costs from trajectories. It allows to learn from a large number\nof trajectories in each learning step without additional computation. Complex\nlatent cost functions from contextual features can be represented in the\nalgorithm through a neural network approximation. We further propose a method\nto sample paths from DataSP in order to reconstruct/mimic observed paths'\ndistributions. We prove that the inferred distribution follows the maximum\nentropy principle. We show that DataSP outperforms state-of-the-art\ndifferentiable combinatorial solver and classical machine learning approaches\nin predicting paths on graphs.\n","authors":["Alan A. Lahoud","Erik Schaffernicht","Johannes A. Stork"],"pdf_url":"https://arxiv.org/pdf/2405.04923v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.19978v1","updated":"2024-05-30T12:01:12Z","published":"2024-05-30T12:01:12Z","title":"Domain Adaptation with Cauchy-Schwarz Divergence","summary":"  Domain adaptation aims to use training data from one or multiple source\ndomains to learn a hypothesis that can be generalized to a different, but\nrelated, target domain. As such, having a reliable measure for evaluating the\ndiscrepancy of both marginal and conditional distributions is crucial. We\nintroduce Cauchy-Schwarz (CS) divergence to the problem of unsupervised domain\nadaptation (UDA). The CS divergence offers a theoretically tighter\ngeneralization error bound than the popular Kullback-Leibler divergence. This\nholds for the general case of supervised learning, including multi-class\nclassification and regression. Furthermore, we illustrate that the CS\ndivergence enables a simple estimator on the discrepancy of both marginal and\nconditional distributions between source and target domains in the\nrepresentation space, without requiring any distributional assumptions. We\nprovide multiple examples to illustrate how the CS divergence can be\nconveniently used in both distance metric- or adversarial training-based UDA\nframeworks, resulting in compelling performance.\n","authors":["Wenzhe Yin","Shujian Yu","Yicong Lin","Jie Liu","Jan-Jakob Sonke","Efstratios Gavves"],"pdf_url":"https://arxiv.org/pdf/2405.19978v1.pdf","comment":"Accepted by UAI-24"},{"id":"http://arxiv.org/abs/2405.19977v1","updated":"2024-05-30T11:59:58Z","published":"2024-05-30T11:59:58Z","title":"Consistent Submodular Maximization","summary":"  Maximizing monotone submodular functions under cardinality constraints is a\nclassic optimization task with several applications in data mining and machine\nlearning. In this paper we study this problem in a dynamic environment with\nconsistency constraints: elements arrive in a streaming fashion and the goal is\nmaintaining a constant approximation to the optimal solution while having a\nstable solution (i.e., the number of changes between two consecutive solutions\nis bounded). We provide algorithms in this setting with different trade-offs\nbetween consistency and approximation quality. We also complement our\ntheoretical results with an experimental analysis showing the effectiveness of\nour algorithms in real-world instances.\n","authors":["Paul Dütting","Federico Fusco","Silvio Lattanzi","Ashkan Norouzi-Fard","Morteza Zadimoghaddam"],"pdf_url":"https://arxiv.org/pdf/2405.19977v1.pdf","comment":"To appear at ICML 24"},{"id":"http://arxiv.org/abs/2403.05300v2","updated":"2024-05-30T11:55:49Z","published":"2024-03-08T13:29:46Z","title":"Unity by Diversity: Improved Representation Learning in Multimodal VAEs","summary":"  Variational Autoencoders for multimodal data hold promise for many tasks in\ndata analysis, such as representation learning, conditional generation, and\nimputation. Current architectures either share the encoder output, decoder\ninput, or both across modalities to learn a shared representation. Such\narchitectures impose hard constraints on the model. In this work, we show that\na better latent representation can be obtained by replacing these hard\nconstraints with a soft constraint. We propose a new mixture-of-experts prior,\nsoftly guiding each modality's latent representation towards a shared aggregate\nposterior. This approach results in a superior latent representation and allows\neach encoding to preserve information better from its uncompressed original\nfeatures. In extensive experiments on multiple benchmark datasets and two\nchallenging real-world datasets, we show improved learned latent\nrepresentations and imputation of missing data modalities compared to existing\nmethods.\n","authors":["Thomas M. Sutter","Yang Meng","Andrea Agostini","Daphné Chopard","Norbert Fortin","Julia E. Vogt","Bahbak Shahbaba","Stephan Mandt"],"pdf_url":"https://arxiv.org/pdf/2403.05300v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.19971v1","updated":"2024-05-30T11:55:21Z","published":"2024-05-30T11:55:21Z","title":"GasTrace: Detecting Sandwich Attack Malicious Accounts in Ethereum","summary":"  The openness and transparency of Ethereum transaction data make it easy to be\nexploited by any entities, executing malicious attacks. The sandwich attack\nmanipulates the Automated Market Maker (AMM) mechanism, profiting from\nmanipulating the market price through front or after-running transactions. To\nidentify and prevent sandwich attacks, we propose a cascade classification\nframework GasTrace. GasTrace analyzes various transaction features to detect\nmalicious accounts, notably through the analysis and modeling of Gas features.\nIn the initial classification, we utilize the Support Vector Machine (SVM) with\nthe Radial Basis Function (RBF) kernel to generate the predicted probabilities\nof accounts, further constructing a detailed transaction network. Subsequently,\nthe behavior features are captured by the Graph Attention Network (GAT)\ntechnique in the second classification. Through cascade classification,\nGasTrace can analyze and classify the sandwich attacks. Our experimental\nresults demonstrate that GasTrace achieves a remarkable detection and\ngeneration capability, performing an accuracy of 96.73\\% and an F1 score of\n95.71\\% for identifying sandwich attack accounts.\n","authors":["Zekai Liu","Xiaoqi Li","Hongli Peng","Wenkai Li"],"pdf_url":"https://arxiv.org/pdf/2405.19971v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.19967v1","updated":"2024-05-30T11:46:42Z","published":"2024-05-30T11:46:42Z","title":"Improved Out-of-Scope Intent Classification with Dual Encoding and\n  Threshold-based Re-Classification","summary":"  Detecting out-of-scope user utterances is essential for task-oriented\ndialogues and intent classification. Current methodologies face difficulties\nwith the unpredictable distribution of outliers and often rely on assumptions\nabout data distributions. We present the Dual Encoder for Threshold-Based\nRe-Classification (DETER) to address these challenges. This end-to-end\nframework efficiently detects out-of-scope intents without requiring\nassumptions on data distributions or additional post-processing steps. The core\nof DETER utilizes dual text encoders, the Universal Sentence Encoder (USE) and\nthe Transformer-based Denoising AutoEncoder (TSDAE), to generate user utterance\nembeddings, which are classified through a branched neural architecture.\nFurther, DETER generates synthetic outliers using self-supervision and\nincorporates out-of-scope phrases from open-domain datasets. This approach\nensures a comprehensive training set for out-of-scope detection. Additionally,\na threshold-based re-classification mechanism refines the model's initial\npredictions. Evaluations on the CLINC-150, Stackoverflow, and Banking77\ndatasets demonstrate DETER's efficacy. Our model outperforms previous\nbenchmarks, increasing up to 13% and 5% in F1 score for known and unknown\nintents on CLINC-150 and Stackoverflow, and 16% for known and 24% % for unknown\nintents on Banking77. The source code has been released at\nhttps://github.com/Hossam-Mohammed-tech/Intent\\_Classification\\_OOS.\n","authors":["Hossam M. Zawbaa","Wael Rashwan","Sourav Dutta","Haytham Assem"],"pdf_url":"https://arxiv.org/pdf/2405.19967v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.17106v3","updated":"2024-05-30T11:44:40Z","published":"2024-02-27T00:59:32Z","title":"Achievable Fairness on Your Data With Utility Guarantees","summary":"  In machine learning fairness, training models that minimize disparity across\ndifferent sensitive groups often leads to diminished accuracy, a phenomenon\nknown as the fairness-accuracy trade-off. The severity of this trade-off\ninherently depends on dataset characteristics such as dataset imbalances or\nbiases and therefore, using a uniform fairness requirement across diverse\ndatasets remains questionable. To address this, we present a computationally\nefficient approach to approximate the fairness-accuracy trade-off curve\ntailored to individual datasets, backed by rigorous statistical guarantees. By\nutilizing the You-Only-Train-Once (YOTO) framework, our approach mitigates the\ncomputational burden of having to train multiple models when approximating the\ntrade-off curve. Crucially, we introduce a novel methodology for quantifying\nuncertainty in our estimates, thereby providing practitioners with a robust\nframework for auditing model fairness while avoiding false conclusions due to\nestimation errors. Our experiments spanning tabular (e.g., Adult), image\n(CelebA), and language (Jigsaw) datasets underscore that our approach not only\nreliably quantifies the optimum achievable trade-offs across various data\nmodalities but also helps detect suboptimality in SOTA fairness methods.\n","authors":["Muhammad Faaiz Taufiq","Jean-Francois Ton","Yang Liu"],"pdf_url":"https://arxiv.org/pdf/2402.17106v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.03286v3","updated":"2024-05-30T11:42:15Z","published":"2024-02-05T18:42:34Z","title":"Training-Free Consistent Text-to-Image Generation","summary":"  Text-to-image models offer a new level of creative flexibility by allowing\nusers to guide the image generation process through natural language. However,\nusing these models to consistently portray the same subject across diverse\nprompts remains challenging. Existing approaches fine-tune the model to teach\nit new words that describe specific user-provided subjects or add image\nconditioning to the model. These methods require lengthy per-subject\noptimization or large-scale pre-training. Moreover, they struggle to align\ngenerated images with text prompts and face difficulties in portraying multiple\nsubjects. Here, we present ConsiStory, a training-free approach that enables\nconsistent subject generation by sharing the internal activations of the\npretrained model. We introduce a subject-driven shared attention block and\ncorrespondence-based feature injection to promote subject consistency between\nimages. Additionally, we develop strategies to encourage layout diversity while\nmaintaining subject consistency. We compare ConsiStory to a range of baselines,\nand demonstrate state-of-the-art performance on subject consistency and text\nalignment, without requiring a single optimization step. Finally, ConsiStory\ncan naturally extend to multi-subject scenarios, and even enable training-free\npersonalization for common objects.\n","authors":["Yoad Tewel","Omri Kaduri","Rinon Gal","Yoni Kasten","Lior Wolf","Gal Chechik","Yuval Atzmon"],"pdf_url":"https://arxiv.org/pdf/2402.03286v3.pdf","comment":"Accepted to journal track of SIGGRAPH 2024 (TOG). Project page is at\n  https://consistory-paper.github.io"},{"id":"http://arxiv.org/abs/2405.19961v1","updated":"2024-05-30T11:32:42Z","published":"2024-05-30T11:32:42Z","title":"Collective Variable Free Transition Path Sampling with Generative Flow\n  Network","summary":"  Understanding transition paths between meta-stable states in molecular\nsystems is fundamental for material design and drug discovery. However,\nsampling these paths via molecular dynamics simulations is computationally\nprohibitive due to the high-energy barriers between the meta-stable states.\nRecent machine learning approaches are often restricted to simple systems or\nrely on collective variables (CVs) extracted from expensive domain knowledge.\nIn this work, we propose to leverage generative flow networks (GFlowNets) to\nsample transition paths without relying on CVs. We reformulate the problem as\namortized energy-based sampling over molecular trajectories and train a bias\npotential by minimizing the squared log-ratio between the target distribution\nand the generator, derived from the flow matching objective of GFlowNets. Our\nevaluation on three proteins (Alanine Dipeptide, Polyproline, and Chignolin)\ndemonstrates that our approach, called TPS-GFN, generates more realistic and\ndiverse transition paths than the previous CV-free machine learning approach.\n","authors":["Kiyoung Seong","Seonghyun Park","Seonghwan Kim","Woo Youn Kim","Sungsoo Ahn"],"pdf_url":"https://arxiv.org/pdf/2405.19961v1.pdf","comment":"9 pages, 5 figures, 2 tables"},{"id":"http://arxiv.org/abs/2312.12728v3","updated":"2024-05-30T11:25:08Z","published":"2023-12-20T02:55:15Z","title":"Lookahead: An Inference Acceleration Framework for Large Language Model\n  with Lossless Generation Accuracy","summary":"  As Large Language Models (LLMs) have made significant advancements across\nvarious tasks, such as question answering, translation, text summarization, and\ndialogue systems, the need for accuracy in information becomes crucial,\nespecially for serious financial products serving billions of users like\nAlipay. However, for a real-world product serving millions of users, the\ninference speed of LLMs becomes a critical factor compared to a mere\nexperimental model.\n  Hence, this paper presents a generic framework for accelerating the inference\nprocess, resulting in a substantial increase in speed and cost reduction for\nour LLM-based scenarios, with lossless generation accuracy. In the traditional\ninference process, each token is generated sequentially by the LLM, leading to\na time consumption proportional to the number of generated tokens. To enhance\nthis process, our framework, named \\textit{lookahead}, introduces a\n\\textit{multi-branch} strategy. Instead of generating a single token at a time,\nwe propose a Trie-based retrieval and verification mechanism to be able to\naccept several tokens at a forward step. Our strategy offers two distinct\nadvantages: (1) it guarantees absolute correctness of the output, avoiding any\napproximation algorithms, and (2) the worst-case performance of our approach is\nequivalent to the conventional process. We conduct extensive experiments to\ndemonstrate the significant improvements achieved by applying our inference\nacceleration framework. Our framework is widely deployed in Alipay since April\n2023, and obtain remarkable 2.66x to 6.26x speedup. Our code is available at\nhttps://github.com/alipay/PainlessInferenceAcceleration.\n","authors":["Yao Zhao","Zhitian Xie","Chen Liang","Chenyi Zhuang","Jinjie Gu"],"pdf_url":"https://arxiv.org/pdf/2312.12728v3.pdf","comment":"10 pages, 6 figures"},{"id":"http://arxiv.org/abs/2405.16056v2","updated":"2024-05-30T11:20:22Z","published":"2024-05-25T04:51:41Z","title":"FedSheafHN: Personalized Federated Learning on Graph-structured Data","summary":"  Personalized subgraph Federated Learning (FL) is a task that customizes Graph\nNeural Networks (GNNs) to individual client needs, accommodating diverse data\ndistributions. However, applying hypernetworks in FL, while aiming to\nfacilitate model personalization, often encounters challenges due to inadequate\nrepresentation of client-specific characteristics. To overcome these\nlimitations, we propose a model called FedSheafHN, using enhanced collaboration\ngraph embedding and efficient personalized model parameter generation.\nSpecifically, our model embeds each client's local subgraph into a\nserver-constructed collaboration graph. We utilize sheaf diffusion in the\ncollaboration graph to learn client representations. Our model improves the\nintegration and interpretation of complex client characteristics. Furthermore,\nour model ensures the generation of personalized models through advanced\nhypernetworks optimized for parallel operations across clients. Empirical\nevaluations demonstrate that FedSheafHN outperforms existing methods in most\nscenarios, in terms of client model performance on various graph-structured\ndatasets. It also has fast model convergence and effective new clients\ngeneralization.\n","authors":["Wenfei Liang","Yanan Zhao","Rui She","Yiming Li","Wee Peng Tay"],"pdf_url":"https://arxiv.org/pdf/2405.16056v2.pdf","comment":"This paper was submitted to ICML 2024 in Feb 2024. You can find a\n  record\n  here:https://github.com/CarrieWFF/ICML-2024-submission-recording/blob/main/Screenshot%20of%20FedSheafHN%20submission%20to%20ICML%202024.png"},{"id":"http://arxiv.org/abs/2405.19954v1","updated":"2024-05-30T11:18:52Z","published":"2024-05-30T11:18:52Z","title":"GenKubeSec: LLM-Based Kubernetes Misconfiguration Detection,\n  Localization, Reasoning, and Remediation","summary":"  A key challenge associated with Kubernetes configuration files (KCFs) is that\nthey are often highly complex and error-prone, leading to security\nvulnerabilities and operational setbacks. Rule-based (RB) tools for KCF\nmisconfiguration detection rely on static rule sets, making them inherently\nlimited and unable to detect newly-discovered misconfigurations. RB tools also\nsuffer from misdetection, since mistakes are likely when coding the detection\nrules. Recent methods for detecting and remediating KCF misconfigurations are\nlimited in terms of their scalability and detection coverage, or due to the\nfact that they have high expertise requirements and do not offer automated\nremediation along with misconfiguration detection. Novel approaches that employ\nLLMs in their pipeline rely on API-based, general-purpose, and mainly\ncommercial models. Thus, they pose security challenges, have inconsistent\nclassification performance, and can be costly. In this paper, we propose\nGenKubeSec, a comprehensive and adaptive, LLM-based method, which, in addition\nto detecting a wide variety of KCF misconfigurations, also identifies the exact\nlocation of the misconfigurations and provides detailed reasoning about them,\nalong with suggested remediation. When empirically compared with three\nindustry-standard RB tools, GenKubeSec achieved equivalent precision (0.990)\nand superior recall (0.999). When a random sample of KCFs was examined by a\nKubernetes security expert, GenKubeSec's explanations as to misconfiguration\nlocalization, reasoning and remediation were 100% correct, informative and\nuseful. To facilitate further advancements in this domain, we share the unique\ndataset we collected, a unified misconfiguration index we developed for label\nstandardization, our experimentation code, and GenKubeSec itself as an\nopen-source tool.\n","authors":["Ehud Malul","Yair Meidan","Dudu Mimran","Yuval Elovici","Asaf Shabtai"],"pdf_url":"https://arxiv.org/pdf/2405.19954v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.19950v1","updated":"2024-05-30T11:14:01Z","published":"2024-05-30T11:14:01Z","title":"MM-Lego: Modular Biomedical Multimodal Models with Minimal Fine-Tuning","summary":"  Learning holistic computational representations in physical, chemical or\nbiological systems requires the ability to process information from different\ndistributions and modalities within the same model. Thus, the demand for\nmultimodal machine learning models has sharply risen for modalities that go\nbeyond vision and language, such as sequences, graphs, time series, or tabular\ndata. While there are many available multimodal fusion and alignment\napproaches, most of them require end-to-end training, scale quadratically with\nthe number of modalities, cannot handle cases of high modality imbalance in the\ntraining set, or are highly topology-specific, making them too restrictive for\nmany biomedical learning tasks. This paper presents Multimodal Lego (MM-Lego),\na modular and general-purpose fusion and model merging framework to turn any\nset of encoders into a competitive multimodal model with no or minimal\nfine-tuning. We achieve this by introducing a wrapper for unimodal encoders\nthat enforces lightweight dimensionality assumptions between modalities and\nharmonises their representations by learning features in the frequency domain\nto enable model merging with little signal interference. We show that MM-Lego\n1) can be used as a model merging method which achieves competitive performance\nwith end-to-end fusion models without any fine-tuning, 2) can operate on any\nunimodal encoder, and 3) is a model fusion method that, with minimal\nfine-tuning, achieves state-of-the-art results on six benchmarked multimodal\nbiomedical tasks.\n","authors":["Konstantin Hemker","Nikola Simidjievski","Mateja Jamnik"],"pdf_url":"https://arxiv.org/pdf/2405.19950v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.17823v2","updated":"2024-05-30T11:12:25Z","published":"2024-05-28T04:47:12Z","title":"Spectral Truncation Kernels: Noncommutativity in $C^*$-algebraic Kernel\n  Machines","summary":"  In this paper, we propose a new class of positive definite kernels based on\nthe spectral truncation, which has been discussed in the fields of\nnoncommutative geometry and $C^*$-algebra. We focus on kernels whose inputs and\noutputs are functions and generalize existing kernels, such as polynomial,\nproduct, and separable kernels, by introducing a truncation parameter $n$ that\ndescribes the noncommutativity of the products appearing in the kernels. When\n$n$ goes to infinity, the proposed kernels tend to the existing commutative\nkernels. If $n$ is finite, they exhibit different behavior, and the\nnoncommutativity induces interactions along the data function domain. We show\nthat the truncation parameter $n$ is a governing factor leading to performance\nenhancement: by setting an appropriate $n$, we can balance the representation\npower and the complexity of the representation space. The flexibility of the\nproposed class of kernels allows us to go beyond previous commutative kernels.\n","authors":["Yuka Hashimoto","Ayoub Hafid","Masahiro Ikeda","Hachem Kadri"],"pdf_url":"https://arxiv.org/pdf/2405.17823v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.08595v2","updated":"2024-05-30T11:03:46Z","published":"2023-06-14T15:55:19Z","title":"TensorKrowch: Smooth integration of tensor networks in machine learning","summary":"  Tensor networks are factorizations of high-dimensional tensors into networks\nof smaller tensors. They have applications in physics and mathematics, and\nrecently have been proposed as promising machine learning architectures. To\nease the integration of tensor networks in machine learning pipelines, we\nintroduce TensorKrowch, an open source Python library built on top of PyTorch.\nProviding a user-friendly interface, TensorKrowch allows users to construct any\ntensor network, train it, and integrate it as a layer in more intricate deep\nlearning models. In this paper, we describe the main functionality and basic\nusage of TensorKrowch, and provide technical details on its building blocks and\nthe optimizations performed to achieve efficient operation.\n","authors":["José Ramón Pareja Monturiol","David Pérez-García","Alejandro Pozas-Kerstjens"],"pdf_url":"https://arxiv.org/pdf/2306.08595v2.pdf","comment":"20 pages, 2 figures. The TensorKrowch GitHub repository is in\n  https://github.com/joserapa98/tensorkrowch and the TensorKrowch documentation\n  is in https://joserapa98.github.io/tensorkrowch. V2: Accepted version"},{"id":"http://arxiv.org/abs/2402.02425v2","updated":"2024-05-30T10:53:51Z","published":"2024-02-04T09:45:35Z","title":"DeepLag: Discovering Deep Lagrangian Dynamics for Intuitive Fluid\n  Prediction","summary":"  Accurately predicting the future fluid is vital to extensive areas such as\nmeteorology, oceanology, and aerodynamics. However, since the fluid is usually\nobserved from an Eulerian perspective, its moving and intricate dynamics are\nseriously obscured and confounded in static grids, bringing thorny challenges\nto the prediction. This paper introduces a new Lagrangian-Eulerian combined\nparadigm to tackle the tanglesome fluid dynamics. Instead of solely predicting\nthe future based on Eulerian observations, we propose DeepLag to discover\nhidden Lagrangian dynamics within the fluid by tracking the movements of\nadaptively sampled key particles. DeepLag utilizes the proposed where the\nLagrangian movement of the tracked particles is inferred from Eulerian\nobservations, and their accumulated Lagrangian dynamics information is\nincorporated into global Eulerian evolving features to guide future prediction\nrespectively. Tracking key particles not only provides a transparent and\ninterpretable clue for fluid dynamics but also makes our model free from\nmodeling complex correlations among massive grids for better efficiency.\nExperimentally, DeepLag excels in three challenging fluid prediction tasks\ncovering 2D and 3D, simulated and real-world fluids.\n","authors":["Qilong Ma","Haixu Wu","Lanxiang Xing","Shangchen Miao","Mingsheng Long"],"pdf_url":"https://arxiv.org/pdf/2402.02425v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.19933v1","updated":"2024-05-30T10:49:22Z","published":"2024-05-30T10:49:22Z","title":"Learning Latent Graph Structures and their Uncertainty","summary":"  Within a prediction task, Graph Neural Networks (GNNs) use relational\ninformation as an inductive bias to enhance the model's accuracy. As\ntask-relevant relations might be unknown, graph structure learning approaches\nhave been proposed to learn them while solving the downstream prediction task.\nIn this paper, we demonstrate that minimization of a point-prediction loss\nfunction, e.g., the mean absolute error, does not guarantee proper learning of\nthe latent relational information and its associated uncertainty. Conversely,\nwe prove that a suitable loss function on the stochastic model outputs\nsimultaneously grants (i) the unknown adjacency matrix latent distribution and\n(ii) optimal performance on the prediction task. Finally, we propose a\nsampling-based method that solves this joint learning task. Empirical results\nvalidate our theoretical claims and demonstrate the effectiveness of the\nproposed approach.\n","authors":["Alessandro Manenti","Daniele Zambon","Cesare Alippi"],"pdf_url":"https://arxiv.org/pdf/2405.19933v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.19931v1","updated":"2024-05-30T10:47:48Z","published":"2024-05-30T10:47:48Z","title":"Exploring Diffusion Models' Corruption Stage in Few-Shot Fine-tuning and\n  Mitigating with Bayesian Neural Networks","summary":"  Few-shot fine-tuning of Diffusion Models (DMs) is a key advancement,\nsignificantly reducing training costs and enabling personalized AI\napplications. However, we explore the training dynamics of DMs and observe an\nunanticipated phenomenon: during the training process, image fidelity initially\nimproves, then unexpectedly deteriorates with the emergence of noisy patterns,\nonly to recover later with severe overfitting. We term the stage with generated\nnoisy patterns as corruption stage. To understand this corruption stage, we\nbegin by theoretically modeling the one-shot fine-tuning scenario, and then\nextend this modeling to more general cases. Through this modeling, we identify\nthe primary cause of this corruption stage: a narrowed learning distribution\ninherent in the nature of few-shot fine-tuning. To tackle this, we apply\nBayesian Neural Networks (BNNs) on DMs with variational inference to implicitly\nbroaden the learned distribution, and present that the learning target of the\nBNNs can be naturally regarded as an expectation of the diffusion loss and a\nfurther regularization with the pretrained DMs. This approach is highly\ncompatible with current few-shot fine-tuning methods in DMs and does not\nintroduce any extra inference costs. Experimental results demonstrate that our\nmethod significantly mitigates corruption, and improves the fidelity, quality\nand diversity of the generated images in both object-driven and subject-driven\ngeneration tasks.\n","authors":["Xiaoyu Wu","Jiaru Zhang","Yang Hua","Bohan Lyu","Hao Wang","Tao Song","Haibing Guan"],"pdf_url":"https://arxiv.org/pdf/2405.19931v1.pdf","comment":"Preprint. Under review"},{"id":"http://arxiv.org/abs/2405.19928v1","updated":"2024-05-30T10:44:45Z","published":"2024-05-30T10:44:45Z","title":"BAN: Detecting Backdoors Activated by Adversarial Neuron Noise","summary":"  Backdoor attacks on deep learning represent a recent threat that has gained\nsignificant attention in the research community. Backdoor defenses are mainly\nbased on backdoor inversion, which has been shown to be generic,\nmodel-agnostic, and applicable to practical threat scenarios. State-of-the-art\nbackdoor inversion recovers a mask in the feature space to locate prominent\nbackdoor features, where benign and backdoor features can be disentangled.\nHowever, it suffers from high computational overhead, and we also find that it\noverly relies on prominent backdoor features that are highly distinguishable\nfrom benign features. To tackle these shortcomings, this paper improves\nbackdoor feature inversion for backdoor detection by incorporating extra neuron\nactivation information. In particular, we adversarially increase the loss of\nbackdoored models with respect to weights to activate the backdoor effect,\nbased on which we can easily differentiate backdoored and clean models.\nExperimental results demonstrate our defense, BAN, is 1.37$\\times$ (on\nCIFAR-10) and 5.11$\\times$ (on ImageNet200) more efficient with 9.99% higher\ndetect success rate than the state-of-the-art defense BTI-DBF. Our code and\ntrained models are publicly\navailable.\\url{https://anonymous.4open.science/r/ban-4B32}\n","authors":["Xiaoyun Xu","Zhuoran Liu","Stefanos Koffas","Shujian Yu","Stjepan Picek"],"pdf_url":"https://arxiv.org/pdf/2405.19928v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.19919v1","updated":"2024-05-30T10:30:44Z","published":"2024-05-30T10:30:44Z","title":"Unraveling the Impact of Heterophilic Structures on Graph\n  Positive-Unlabeled Learning","summary":"  While Positive-Unlabeled (PU) learning is vital in many real-world scenarios,\nits application to graph data still remains under-explored. We unveil that a\ncritical challenge for PU learning on graph lies on the edge heterophily, which\ndirectly violates the irreducibility assumption for Class-Prior Estimation\n(class prior is essential for building PU learning algorithms) and degenerates\nthe latent label inference on unlabeled nodes during classifier training. In\nresponse to this challenge, we introduce a new method, named Graph PU Learning\nwith Label Propagation Loss (GPL). Specifically, GPL considers learning from PU\nnodes along with an intermediate heterophily reduction, which helps mitigate\nthe negative impact of the heterophilic structure. We formulate this procedure\nas a bilevel optimization that reduces heterophily in the inner loop and\nefficiently learns a classifier in the outer loop. Extensive experiments across\na variety of datasets have shown that GPL significantly outperforms baseline\nmethods, confirming its effectiveness and superiority.\n","authors":["Yuhao Wu","Jiangchao Yao","Bo Han","Lina Yao","Tongliang Liu"],"pdf_url":"https://arxiv.org/pdf/2405.19919v1.pdf","comment":"ICML 2024"},{"id":"http://arxiv.org/abs/2405.19912v1","updated":"2024-05-30T10:23:16Z","published":"2024-05-30T10:23:16Z","title":"Robust Kernel Hypothesis Testing under Data Corruption","summary":"  We propose two general methods for constructing robust permutation tests\nunder data corruption. The proposed tests effectively control the\nnon-asymptotic type I error under data corruption, and we prove their\nconsistency in power under minimal conditions. This contributes to the\npractical deployment of hypothesis tests for real-world applications with\npotential adversarial attacks. One of our methods inherently ensures\ndifferential privacy, further broadening its applicability to private data\nanalysis. For the two-sample and independence settings, we show that our kernel\nrobust tests are minimax optimal, in the sense that they are guaranteed to be\nnon-asymptotically powerful against alternatives uniformly separated from the\nnull in the kernel MMD and HSIC metrics at some optimal rate (tight with\nmatching lower bound). Finally, we provide publicly available implementations\nand empirically illustrate the practicality of our proposed tests.\n","authors":["Antonin Schrab","Ilmun Kim"],"pdf_url":"https://arxiv.org/pdf/2405.19912v1.pdf","comment":"26 pages, 2 figures, 2 algorithms"},{"id":"http://arxiv.org/abs/2405.19909v1","updated":"2024-05-30T10:20:55Z","published":"2024-05-30T10:20:55Z","title":"Adaptive Advantage-Guided Policy Regularization for Offline\n  Reinforcement Learning","summary":"  In offline reinforcement learning, the challenge of out-of-distribution (OOD)\nis pronounced. To address this, existing methods often constrain the learned\npolicy through policy regularization. However, these methods often suffer from\nthe issue of unnecessary conservativeness, hampering policy improvement. This\noccurs due to the indiscriminate use of all actions from the behavior policy\nthat generates the offline dataset as constraints. The problem becomes\nparticularly noticeable when the quality of the dataset is suboptimal. Thus, we\npropose Adaptive Advantage-guided Policy Regularization (A2PR), obtaining\nhigh-advantage actions from an augmented behavior policy combined with VAE to\nguide the learned policy. A2PR can select high-advantage actions that differ\nfrom those present in the dataset, while still effectively maintaining\nconservatism from OOD actions. This is achieved by harnessing the VAE capacity\nto generate samples matching the distribution of the data points. We\ntheoretically prove that the improvement of the behavior policy is guaranteed.\nBesides, it effectively mitigates value overestimation with a bounded\nperformance gap. Empirically, we conduct a series of experiments on the D4RL\nbenchmark, where A2PR demonstrates state-of-the-art performance. Furthermore,\nexperimental results on additional suboptimal mixed datasets reveal that A2PR\nexhibits superior performance. Code is available at\nhttps://github.com/ltlhuuu/A2PR.\n","authors":["Tenglong Liu","Yang Li","Yixing Lan","Hao Gao","Wei Pan","Xin Xu"],"pdf_url":"https://arxiv.org/pdf/2405.19909v1.pdf","comment":"ICML 2024, 19 pages"},{"id":"http://arxiv.org/abs/2404.00618v2","updated":"2024-05-30T10:16:04Z","published":"2024-03-31T09:10:32Z","title":"A Multi-Branched Radial Basis Network Approach to Predicting Complex\n  Chaotic Behaviours","summary":"  In this study, we propose a multi branched network approach to predict the\ndynamics of a physics attractor characterized by intricate and chaotic\nbehavior. We introduce a unique neural network architecture comprised of Radial\nBasis Function (RBF) layers combined with an attention mechanism designed to\neffectively capture nonlinear inter-dependencies inherent in the attractor's\ntemporal evolution. Our results demonstrate successful prediction of the\nattractor's trajectory across 100 predictions made using a real-world dataset\nof 36,700 time-series observations encompassing approximately 28 minutes of\nactivity. To further illustrate the performance of our proposed technique, we\nprovide comprehensive visualizations depicting the attractor's original and\npredicted behaviors alongside quantitative measures comparing observed versus\nestimated outcomes. Overall, this work showcases the potential of advanced\nmachine learning algorithms in elucidating hidden structures in complex\nphysical systems while offering practical applications in various domains\nrequiring accurate short-term forecasting capabilities.\n","authors":["Aarush Sinha"],"pdf_url":"https://arxiv.org/pdf/2404.00618v2.pdf","comment":"9 pages, 6 figures"},{"id":"http://arxiv.org/abs/2402.02407v2","updated":"2024-05-30T10:13:13Z","published":"2024-02-04T08:57:42Z","title":"Defining Neural Network Architecture through Polytope Structures of\n  Dataset","summary":"  Current theoretical and empirical research in neural networks suggests that\ncomplex datasets require large network architectures for thorough\nclassification, yet the precise nature of this relationship remains unclear.\nThis paper tackles this issue by defining upper and lower bounds for neural\nnetwork widths, which are informed by the polytope structure of the dataset in\nquestion. We also delve into the application of these principles to simplicial\ncomplexes and specific manifold shapes, explaining how the requirement for\nnetwork width varies in accordance with the geometric complexity of the\ndataset. Moreover, we develop an algorithm to investigate a converse situation\nwhere the polytope structure of a dataset can be inferred from its\ncorresponding trained neural networks. Through our algorithm, it is established\nthat popular datasets such as MNIST, Fashion-MNIST, and CIFAR10 can be\nefficiently encapsulated using no more than two polytopes with a small number\nof faces.\n","authors":["Sangmin Lee","Abbas Mammadov","Jong Chul Ye"],"pdf_url":"https://arxiv.org/pdf/2402.02407v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.19902v1","updated":"2024-05-30T10:06:06Z","published":"2024-05-30T10:06:06Z","title":"Learning Discriminative Dynamics with Label Corruption for Noisy Label\n  Detection","summary":"  Label noise, commonly found in real-world datasets, has a detrimental impact\non a model's generalization. To effectively detect incorrectly labeled\ninstances, previous works have mostly relied on distinguishable training\nsignals, such as training loss, as indicators to differentiate between clean\nand noisy labels. However, they have limitations in that the training signals\nincompletely reveal the model's behavior and are not effectively generalized to\nvarious noise types, resulting in limited detection accuracy. In this paper, we\npropose DynaCor framework that distinguishes incorrectly labeled instances from\ncorrectly labeled ones based on the dynamics of the training signals. To cope\nwith the absence of supervision for clean and noisy labels, DynaCor first\nintroduces a label corruption strategy that augments the original dataset with\nintentionally corrupted labels, enabling indirect simulation of the model's\nbehavior on noisy labels. Then, DynaCor learns to identify clean and noisy\ninstances by inducing two clearly distinguishable clusters from the latent\nrepresentations of training dynamics. Our comprehensive experiments show that\nDynaCor outperforms the state-of-the-art competitors and shows strong\nrobustness to various noise types and noise rates.\n","authors":["Suyeon Kim","Dongha Lee","SeongKu Kang","Sukang Chae","Sanghwan Jang","Hwanjo Yu"],"pdf_url":"https://arxiv.org/pdf/2405.19902v1.pdf","comment":"Accepted to CVPR 2024"},{"id":"http://arxiv.org/abs/2405.19901v1","updated":"2024-05-30T10:02:53Z","published":"2024-05-30T10:02:53Z","title":"Urban Air Pollution Forecasting: a Machine Learning Approach leveraging\n  Satellite Observations and Meteorological Forecasts","summary":"  Air pollution poses a significant threat to public health and well-being,\nparticularly in urban areas. This study introduces a series of machine-learning\nmodels that integrate data from the Sentinel-5P satellite, meteorological\nconditions, and topological characteristics to forecast future levels of five\nmajor pollutants. The investigation delineates the process of data collection,\ndetailing the combination of diverse data sources utilized in the study.\nThrough experiments conducted in the Milan metropolitan area, the models\ndemonstrate their efficacy in predicting pollutant levels for the forthcoming\nday, achieving a percentage error of around 30%. The proposed models are\nadvantageous as they are independent of monitoring stations, facilitating their\nuse in areas without existing infrastructure. Additionally, we have released\nthe collected dataset to the public, aiming to stimulate further research in\nthis field. This research contributes to advancing our understanding of urban\nair quality dynamics and emphasizes the importance of amalgamating satellite,\nmeteorological, and topographical data to develop robust pollution forecasting\nmodels.\n","authors":["Giacomo Blanco","Luca Barco","Lorenzo Innocenti","Claudio Rossi"],"pdf_url":"https://arxiv.org/pdf/2405.19901v1.pdf","comment":"5 pages, 2 figures, submitted to IEEE MetroLivEnv 2024"},{"id":"http://arxiv.org/abs/2305.13067v2","updated":"2024-05-30T10:00:14Z","published":"2023-05-22T14:37:05Z","title":"Distilling Robustness into Natural Language Inference Models with\n  Domain-Targeted Augmentation","summary":"  Knowledge distillation optimises a smaller student model to behave similarly\nto a larger teacher model, retaining some of the performance benefits. While\nthis method can improve results on in-distribution examples, it does not\nnecessarily generalise to out-of-distribution (OOD) settings. We investigate\ntwo complementary methods for improving the robustness of the resulting student\nmodels on OOD domains. The first approach augments the distillation with\ngenerated unlabelled examples that match the target distribution. The second\nmethod upsamples data points among the training set that are similar to the\ntarget distribution. When applied on the task of natural language inference\n(NLI), our experiments on MNLI show that distillation with these modifications\noutperforms previous robustness solutions. We also find that these methods\nimprove performance on OOD domains even beyond the target domain.\n","authors":["Joe Stacey","Marek Rei"],"pdf_url":"https://arxiv.org/pdf/2305.13067v2.pdf","comment":"Accepted at ACL Findings 2024"},{"id":"http://arxiv.org/abs/2205.12961v2","updated":"2024-05-30T09:53:16Z","published":"2022-05-25T14:02:49Z","title":"Position: Tensor Networks are a Valuable Asset for Green AI","summary":"  For the first time, this position paper introduces a fundamental link between\ntensor networks (TNs) and Green AI, highlighting their synergistic potential to\nenhance both the inclusivity and sustainability of AI research. We argue that\nTNs are valuable for Green AI due to their strong mathematical backbone and\ninherent logarithmic compression potential. We undertake a comprehensive review\nof the ongoing discussions on Green AI, emphasizing the importance of\nsustainability and inclusivity in AI research to demonstrate the significance\nof establishing the link between Green AI and TNs. To support our position, we\nfirst provide a comprehensive overview of efficiency metrics proposed in Green\nAI literature and then evaluate examples of TNs in the fields of kernel\nmachines and deep learning using the proposed efficiency metrics. This position\npaper aims to incentivize meaningful, constructive discussions by bridging\nfundamental principles of Green AI and TNs. We advocate for researchers to\nseriously evaluate the integration of TNs into their research projects, and in\nalignment with the link established in this paper, we support prior calls\nencouraging researchers to treat Green AI principles as a research priority.\n","authors":["Eva Memmel","Clara Menzen","Jetze Schuurmans","Frederiek Wesel","Kim Batselier"],"pdf_url":"https://arxiv.org/pdf/2205.12961v2.pdf","comment":"This paper has been accepted for presentation at the International\n  Conference on Machine Learning (ICML) 2024 and will appear in the conference\n  proceedings"},{"id":"http://arxiv.org/abs/2405.19893v1","updated":"2024-05-30T09:50:38Z","published":"2024-05-30T09:50:38Z","title":"Similarity is Not All You Need: Endowing Retrieval Augmented Generation\n  with Multi Layered Thoughts","summary":"  In recent years, large language models (LLMs) have made remarkable\nachievements in various domains. However, the untimeliness and cost of\nknowledge updates coupled with hallucination issues of LLMs have curtailed\ntheir applications in knowledge intensive tasks, where retrieval augmented\ngeneration (RAG) can be of help. Nevertheless, existing retrieval augmented\nmodels typically use similarity as a bridge between queries and documents and\nfollow a retrieve then read procedure. In this work, we argue that similarity\nis not always the panacea and totally relying on similarity would sometimes\ndegrade the performance of retrieval augmented generation. To this end, we\npropose MetRag, a Multi layEred Thoughts enhanced Retrieval Augmented\nGeneration framework. To begin with, beyond existing similarity oriented\nthought, we embrace a small scale utility model that draws supervision from an\nLLM for utility oriented thought and further come up with a smarter model by\ncomprehensively combining the similarity and utility oriented thoughts.\nFurthermore, given the fact that the retrieved document set tends to be huge\nand using them in isolation makes it difficult to capture the commonalities and\ncharacteristics among them, we propose to make an LLM as a task adaptive\nsummarizer to endow retrieval augmented generation with compactness-oriented\nthought. Finally, with multi layered thoughts from the precedent stages, an LLM\nis called for knowledge augmented generation. Extensive experiments on\nknowledge-intensive tasks have demonstrated the superiority of MetRag.\n","authors":["Chunjing Gan","Dan Yang","Binbin Hu","Hanxiao Zhang","Siyuan Li","Ziqi Liu","Yue Shen","Lin Ju","Zhiqiang Zhang","Jinjie Gu","Lei Liang","Jun Zhou"],"pdf_url":"https://arxiv.org/pdf/2405.19893v1.pdf","comment":"12 pages"},{"id":"http://arxiv.org/abs/2402.02446v3","updated":"2024-05-30T09:49:47Z","published":"2024-02-04T10:59:52Z","title":"LQER: Low-Rank Quantization Error Reconstruction for LLMs","summary":"  Post-training quantization of Large Language Models (LLMs) is challenging. In\nthis work, we introduce Low-rank Quantization Error Reduction (LQER), which\ncombines quantization and low-rank approximation to recover the model\ncapability. LQER leverages an activation-induced scale matrix to drive the\nsingular value distribution of quantization error towards a desirable\ndistribution, which enables nearly-lossless W4A8 quantization on various LLMs\nand downstream tasks without the need for knowledge distillation, grid search,\nor gradient-base iterative optimization. Unlike existing methods, the\ncomputation pattern of LQER eliminates the need for specialized Scatter and\nGather processes to collect high-precision weights from irregular memory\nlocations. Our W4A8 LLMs achieve near-lossless performance on six popular\ndownstream tasks, while using 1.36$\\times$ fewer hardware resources than the\nleading state-of-the-art method. We open-source our framework at\nhttps://github.com/ChengZhang-98/lqer\n","authors":["Cheng Zhang","Jianyi Cheng","George A. Constantinides","Yiren Zhao"],"pdf_url":"https://arxiv.org/pdf/2402.02446v3.pdf","comment":"Accepted at ICML2024"},{"id":"http://arxiv.org/abs/2405.19889v1","updated":"2024-05-30T09:46:59Z","published":"2024-05-30T09:46:59Z","title":"Deep Joint Semantic Coding and Beamforming for Near-Space Airship-Borne\n  Massive MIMO Network","summary":"  Near-space airship-borne communication network is recognized to be an\nindispensable component of the future integrated ground-air-space network\nthanks to airships' advantage of long-term residency at stratospheric\naltitudes, but it urgently needs reliable and efficient Airship-to-X link. To\nimprove the transmission efficiency and capacity, this paper proposes to\nintegrate semantic communication with massive multiple-input multiple-output\n(MIMO) technology. Specifically, we propose a deep joint semantic coding and\nbeamforming (JSCBF) scheme for airship-based massive MIMO image transmission\nnetwork in space, in which semantics from both source and channel are fused to\njointly design the semantic coding and physical layer beamforming. First, we\ndesign two semantic extraction networks to extract semantics from image source\nand channel state information, respectively. Then, we propose a semantic fusion\nnetwork that can fuse these semantics into complex-valued semantic features for\nsubsequent physical-layer transmission. To efficiently transmit the fused\nsemantic features at the physical layer, we then propose the hybrid data and\nmodel-driven semantic-aware beamforming networks. At the receiver, a semantic\ndecoding network is designed to reconstruct the transmitted images. Finally, we\nperform end-to-end deep learning to jointly train all the modules, using the\nimage reconstruction quality at the receivers as a metric. The proposed deep\nJSCBF scheme fully combines the efficient source compressibility and robust\nerror correction capability of semantic communication with the high spectral\nefficiency of massive MIMO, achieving a significant performance improvement\nover existing approaches.\n","authors":["Minghui Wu","Zhen Gao","Zhaocheng Wang","Dusit Niyato","George K. Karagiannidis","Sheng Chen"],"pdf_url":"https://arxiv.org/pdf/2405.19889v1.pdf","comment":"Major Revision by IEEE JSAC"},{"id":"http://arxiv.org/abs/2405.19888v1","updated":"2024-05-30T09:46:36Z","published":"2024-05-30T09:46:36Z","title":"Parrot: Efficient Serving of LLM-based Applications with Semantic\n  Variable","summary":"  The rise of large language models (LLMs) has enabled LLM-based applications\n(a.k.a. AI agents or co-pilots), a new software paradigm that combines the\nstrength of LLM and conventional software. Diverse LLM applications from\ndifferent tenants could design complex workflows using multiple LLM requests to\naccomplish one task. However, they have to use the over-simplified\nrequest-level API provided by today's public LLM services, losing essential\napplication-level information. Public LLM services have to blindly optimize\nindividual LLM requests, leading to sub-optimal end-to-end performance of LLM\napplications.\n  This paper introduces Parrot, an LLM service system that focuses on the\nend-to-end experience of LLM-based applications. Parrot proposes Semantic\nVariable, a unified abstraction to expose application-level knowledge to public\nLLM services. A Semantic Variable annotates an input/output variable in the\nprompt of a request, and creates the data pipeline when connecting multiple LLM\nrequests, providing a natural way to program LLM applications. Exposing\nSemantic Variables to the public LLM service allows it to perform conventional\ndata flow analysis to uncover the correlation across multiple LLM requests.\nThis correlation opens a brand-new optimization space for the end-to-end\nperformance of LLM-based applications. Extensive evaluations demonstrate that\nParrot can achieve up to an order-of-magnitude improvement for popular and\npractical use cases of LLM applications.\n","authors":["Chaofan Lin","Zhenhua Han","Chengruidong Zhang","Yuqing Yang","Fan Yang","Chen Chen","Lili Qiu"],"pdf_url":"https://arxiv.org/pdf/2405.19888v1.pdf","comment":"To appear on USENIX OSDI 2024"},{"id":"http://arxiv.org/abs/2405.19886v1","updated":"2024-05-30T09:45:18Z","published":"2024-05-30T09:45:18Z","title":"Federated Learning with Multi-resolution Model Broadcast","summary":"  In federated learning, a server must periodically broadcast a model to the\nagents. We propose to use multi-resolution coding and modulation (also known as\nnon-uniform modulation) for this purpose. In the simplest instance, broadcast\ntransmission is used, whereby all agents are targeted with one and the same\ntransmission (typically without any particular favored beam direction), which\nis coded using multi-resolution coding/modulation. This enables high-SNR\nagents, with high path gains to the server, to receive a more accurate model\nthan the low-SNR agents do, without consuming more downlink resources. As one\nimplementation, we use transmission with a non-uniform 8-PSK constellation,\nwhere a high-SNR receiver (agent) can separate all 8 constellation points\n(hence receive 3 bits) whereas a low-SNR receiver can only separate 4 points\n(hence receive 2 bits). By encoding the least significant information in the\nthird bit, the high-SNR receivers can obtain the model with higher accuracy,\nwhile the low-SNR receiver can still obtain the model although with reduced\naccuracy, thereby facilitating at least some basic participation of the low-SNR\nreceiver. We show the effectiveness of our proposed scheme via experimentation\nusing federated learning with the MNIST data-set.\n","authors":["Henrik Rydén","Reza Moosavi","Erik G. Larsson"],"pdf_url":"https://arxiv.org/pdf/2405.19886v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.19885v1","updated":"2024-05-30T09:43:59Z","published":"2024-05-30T09:43:59Z","title":"Fourier Controller Networks for Real-Time Decision-Making in Embodied\n  Learning","summary":"  Reinforcement learning is able to obtain generalized low-level robot policies\non diverse robotics datasets in embodied learning scenarios, and Transformer\nhas been widely used to model time-varying features. However, it still suffers\nfrom the issues of low data efficiency and high inference latency. In this\npaper, we propose to investigate the task from a new perspective of the\nfrequency domain. We first observe that the energy density in the frequency\ndomain of a robot's trajectory is mainly concentrated in the low-frequency\npart. Then, we present the Fourier Controller Network (FCNet), a new network\nthat utilizes the Short-Time Fourier Transform (STFT) to extract and encode\ntime-varying features through frequency domain interpolation. We further\nachieve parallel training and efficient recurrent inference by using FFT and\nSliding DFT methods in the model architecture for real-time decision-making.\nComprehensive analyses in both simulated (e.g., D4RL) and real-world\nenvironments (e.g., robot locomotion) demonstrate FCNet's substantial\nefficiency and effectiveness over existing methods such as Transformer, e.g.,\nFCNet outperforms Transformer on multi-environmental robotics datasets of all\ntypes of sizes (from 1.9M to 120M). The project page and code can be found\nhttps://thkkk.github.io/fcnet.\n","authors":["Hengkai Tan","Songming Liu","Kai Ma","Chengyang Ying","Xingxing Zhang","Hang Su","Jun Zhu"],"pdf_url":"https://arxiv.org/pdf/2405.19885v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.19883v1","updated":"2024-05-30T09:42:54Z","published":"2024-05-30T09:42:54Z","title":"From Words to Actions: Unveiling the Theoretical Underpinnings of\n  LLM-Driven Autonomous Systems","summary":"  In this work, from a theoretical lens, we aim to understand why large\nlanguage model (LLM) empowered agents are able to solve decision-making\nproblems in the physical world. To this end, consider a hierarchical\nreinforcement learning (RL) model where the LLM Planner and the Actor perform\nhigh-level task planning and low-level execution, respectively. Under this\nmodel, the LLM Planner navigates a partially observable Markov decision process\n(POMDP) by iteratively generating language-based subgoals via prompting. Under\nproper assumptions on the pretraining data, we prove that the pretrained LLM\nPlanner effectively performs Bayesian aggregated imitation learning (BAIL)\nthrough in-context learning. Additionally, we highlight the necessity for\nexploration beyond the subgoals derived from BAIL by proving that naively\nexecuting the subgoals returned by LLM leads to a linear regret. As a remedy,\nwe introduce an $\\epsilon$-greedy exploration strategy to BAIL, which is proven\nto incur sublinear regret when the pretraining error is small. Finally, we\nextend our theoretical framework to include scenarios where the LLM Planner\nserves as a world model for inferring the transition model of the environment\nand to multi-agent settings, enabling coordination among multiple Actors.\n","authors":["Jianliang He","Siyu Chen","Fengzhuo Zhang","Zhuoran Yang"],"pdf_url":"https://arxiv.org/pdf/2405.19883v1.pdf","comment":"Accepted by ICML 2024"},{"id":"http://arxiv.org/abs/2405.06263v2","updated":"2024-05-30T09:40:02Z","published":"2024-05-10T06:28:42Z","title":"Learning Latent Dynamic Robust Representations for World Models","summary":"  Visual Model-Based Reinforcement Learning (MBRL) promises to encapsulate\nagent's knowledge about the underlying dynamics of the environment, enabling\nlearning a world model as a useful planner. However, top MBRL agents such as\nDreamer often struggle with visual pixel-based inputs in the presence of\nexogenous or irrelevant noise in the observation space, due to failure to\ncapture task-specific features while filtering out irrelevant spatio-temporal\ndetails. To tackle this problem, we apply a spatio-temporal masking strategy, a\nbisimulation principle, combined with latent reconstruction, to capture\nendogenous task-specific aspects of the environment for world models,\neffectively eliminating non-essential information. Joint training of\nrepresentations, dynamics, and policy often leads to instabilities. To further\naddress this issue, we develop a Hybrid Recurrent State-Space Model (HRSSM)\nstructure, enhancing state representation robustness for effective policy\nlearning. Our empirical evaluation demonstrates significant performance\nimprovements over existing methods in a range of visually complex control tasks\nsuch as Maniskill \\cite{gu2023maniskill2} with exogenous distractors from the\nMatterport environment. Our code is avaliable at\nhttps://github.com/bit1029public/HRSSM.\n","authors":["Ruixiang Sun","Hongyu Zang","Xin Li","Riashat Islam"],"pdf_url":"https://arxiv.org/pdf/2405.06263v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.08732v2","updated":"2024-05-30T09:37:30Z","published":"2023-10-12T21:39:16Z","title":"Provably Robust Cost-Sensitive Learning via Randomized Smoothing","summary":"  We study the problem of robust learning against adversarial perturbations\nunder cost-sensitive scenarios, where the potential harm of different types of\nmisclassifications is encoded in a cost matrix. Existing approaches are either\nempirical and cannot certify robustness or suffer from inherent scalability\nissues. In this work, we investigate whether randomized smoothing, a scalable\nframework for robustness certification, can be leveraged to certify and train\nfor cost-sensitive robustness. Built upon the notion of cost-sensitive\ncertified radius, we first illustrate how to adapt the standard certification\nalgorithm of randomized smoothing to produce tight robustness certificates for\nany binary cost matrix, and then develop a robust training method to promote\ncertified cost-sensitive robustness while maintaining the model's overall\naccuracy. Through extensive experiments on image benchmarks, we demonstrate the\nsuperiority of our proposed certification algorithm and training method under\nvarious cost-sensitive scenarios. Our implementation is available as open\nsource code at: https://github.com/TrustMLRG/CS-RS.\n","authors":["Yuan Xin","Michael Backes","Xiao Zhang"],"pdf_url":"https://arxiv.org/pdf/2310.08732v2.pdf","comment":"19 pages, 9 tables, 5 figures"},{"id":"http://arxiv.org/abs/2405.19878v1","updated":"2024-05-30T09:34:31Z","published":"2024-05-30T09:34:31Z","title":"Learning from Random Demonstrations: Offline Reinforcement Learning with\n  Importance-Sampled Diffusion Models","summary":"  Generative models such as diffusion have been employed as world models in\noffline reinforcement learning to generate synthetic data for more effective\nlearning. Existing work either generates diffusion models one-time prior to\ntraining or requires additional interaction data to update it. In this paper,\nwe propose a novel approach for offline reinforcement learning with closed-loop\npolicy evaluation and world-model adaptation. It iteratively leverages a guided\ndiffusion world model to directly evaluate the offline target policy with\nactions drawn from it, and then performs an importance-sampled world model\nupdate to adaptively align the world model with the updated policy. We analyzed\nthe performance of the proposed method and provided an upper bound on the\nreturn gap between our method and the real environment under an optimal policy.\nThe result sheds light on various factors affecting learning performance.\nEvaluations in the D4RL environment show significant improvement over\nstate-of-the-art baselines, especially when only random or medium-expertise\ndemonstrations are available -- thus requiring improved alignment between the\nworld model and offline policy evaluation.\n","authors":["Zeyu Fang","Tian Lan"],"pdf_url":"https://arxiv.org/pdf/2405.19878v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.19874v1","updated":"2024-05-30T09:28:56Z","published":"2024-05-30T09:28:56Z","title":"Is In-Context Learning Sufficient for Instruction Following in LLMs?","summary":"  In-context learning (ICL) allows LLMs to learn from examples without changing\ntheir weights, which is a particularly promising capability for long-context\nLLMs that can potentially learn from many examples. Recently, Lin et al. (2024)\nproposed URIAL, a method using only three in-context examples to align base\nLLMs, achieving non-trivial instruction following performance. In this work, we\nshow that, while effective, ICL alignment with URIAL still underperforms\ncompared to instruction fine-tuning on established benchmarks such as MT-Bench\nand AlpacaEval 2.0 (LC), especially with more capable base LMs. Unlike for\ntasks such as classification, translation, or summarization, adding more ICL\ndemonstrations for long-context LLMs does not systematically improve\ninstruction following performance. To address this limitation, we derive a\ngreedy selection approach for ICL examples that noticeably improves\nperformance, yet without bridging the gap to instruction fine-tuning. Finally,\nwe provide a series of ablation studies to better understand the reasons behind\nthe remaining gap, and we show how some aspects of ICL depart from the existing\nknowledge and are specific to the instruction tuning setting. Overall, our work\nadvances the understanding of ICL as an alignment technique. We provide our\ncode at https://github.com/tml-epfl/icl-alignment.\n","authors":["Hao Zhao","Maksym Andriushchenko","Francesco Croce","Nicolas Flammarion"],"pdf_url":"https://arxiv.org/pdf/2405.19874v1.pdf","comment":"Preprint. Code at https://github.com/tml-epfl/icl-alignment"},{"id":"http://arxiv.org/abs/2405.19870v1","updated":"2024-05-30T09:23:48Z","published":"2024-05-30T09:23:48Z","title":"On Vessel Location Forecasting and the Effect of Federated Learning","summary":"  The wide spread of Automatic Identification System (AIS) has motivated\nseveral maritime analytics operations. Vessel Location Forecasting (VLF) is one\nof the most critical operations for maritime awareness. However, accurate VLF\nis a challenging problem due to the complexity and dynamic nature of maritime\ntraffic conditions. Furthermore, as privacy concerns and restrictions have\ngrown, training data has become increasingly fragmented, resulting in dispersed\ndatabases of several isolated data silos among different organizations, which\nin turn decreases the quality of learning models. In this paper, we propose an\nefficient VLF solution based on LSTM neural networks, in two variants, namely\nNautilus and FedNautilus for the centralized and the federated learning\napproach, respectively. We also demonstrate the superiority of the centralized\napproach with respect to current state of the art and discuss the advantages\nand disadvantages of the federated against the centralized approach.\n","authors":["Andreas Tritsarolis","Nikos Pelekis","Konstantina Bereta","Dimitris Zissis","Yannis Theodoridis"],"pdf_url":"https://arxiv.org/pdf/2405.19870v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.03917v3","updated":"2024-05-30T09:15:06Z","published":"2024-02-06T11:35:02Z","title":"Elastic Feature Consolidation for Cold Start Exemplar-Free Incremental\n  Learning","summary":"  Exemplar-Free Class Incremental Learning (EFCIL) aims to learn from a\nsequence of tasks without having access to previous task data. In this paper,\nwe consider the challenging Cold Start scenario in which insufficient data is\navailable in the first task to learn a high-quality backbone. This is\nespecially challenging for EFCIL since it requires high plasticity, which\nresults in feature drift which is difficult to compensate for in the\nexemplar-free setting. To address this problem, we propose a simple and\neffective approach that consolidates feature representations by regularizing\ndrift in directions highly relevant to previous tasks and employs prototypes to\nreduce task-recency bias. Our method, called Elastic Feature Consolidation\n(EFC), exploits a tractable second-order approximation of feature drift based\non an Empirical Feature Matrix (EFM). The EFM induces a pseudo-metric in\nfeature space which we use to regularize feature drift in important directions\nand to update Gaussian prototypes used in a novel asymmetric cross entropy loss\nwhich effectively balances prototype rehearsal with data from new tasks.\nExperimental results on CIFAR-100, Tiny-ImageNet, ImageNet-Subset and\nImageNet-1K demonstrate that Elastic Feature Consolidation is better able to\nlearn new tasks by maintaining model plasticity and significantly outperform\nthe state-of-the-art.\n","authors":["Simone Magistri","Tomaso Trinci","Albin Soutif-Cormerais","Joost van de Weijer","Andrew D. Bagdanov"],"pdf_url":"https://arxiv.org/pdf/2402.03917v3.pdf","comment":"Accepted at Twelfth International Conference on Learning\n  Representations (ICLR 2024)"},{"id":"http://arxiv.org/abs/2405.19864v1","updated":"2024-05-30T09:14:01Z","published":"2024-05-30T09:14:01Z","title":"Out-of-distribution Reject Option Method for Dataset Shift Problem in\n  Early Disease Onset Prediction","summary":"  Machine learning is increasingly used to predict lifestyle-related disease\nonset using health and medical data. However, the prediction effectiveness is\nhindered by dataset shift, which involves discrepancies in data distribution\nbetween the training and testing datasets, misclassifying out-of-distribution\n(OOD) data. To diminish dataset shift effects, this paper proposes the\nout-of-distribution reject option for prediction (ODROP), which integrates OOD\ndetection models to preclude OOD data from the prediction phase. We\ninvestigated the efficacy of five OOD detection methods (variational\nautoencoder, neural network ensemble std, neural network ensemble epistemic,\nneural network energy, and neural network gaussian mixture based energy\nmeasurement) across two datasets, the Hirosaki and Wakayama health checkup\ndata, in the context of three disease onset prediction tasks: diabetes,\ndyslipidemia, and hypertension. To evaluate the ODROP method, we trained\ndisease onset prediction models and OOD detection models on Hirosaki data and\nused AUROC-rejection curve plots from Wakayama data. The variational\nautoencoder method showed superior stability and magnitude of improvement in\nArea Under the Receiver Operating Curve (AUROC) in five cases: AUROC in the\nWakayama data was improved from 0.80 to 0.90 at a 31.1% rejection rate for\ndiabetes onset and from 0.70 to 0.76 at a 34% rejection rate for dyslipidemia.\nWe categorized dataset shifts into two types using SHAP clustering - those that\nconsiderably affect predictions and those that do not. We expect that this\nclassification will help standardize measuring instruments. This study is the\nfirst to apply OOD detection to actual health and medical data, demonstrating\nits potential to substantially improve the accuracy and reliability of disease\nprediction models amidst dataset shift.\n","authors":["Taisei Tosaki","Eiichiro Uchino","Ryosuke Kojima","Yohei Mineharu","Mikio Arita","Nobuyuki Miyai","Yoshinori Tamada","Tatsuya Mikami","Koichi Murashita","Shigeyuki Nakaji","Yasushi Okuno"],"pdf_url":"https://arxiv.org/pdf/2405.19864v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.05337v2","updated":"2024-05-30T09:11:51Z","published":"2023-09-11T09:34:44Z","title":"Stochastic Gradient Descent-like relaxation is equivalent to Metropolis\n  dynamics in discrete optimization and inference problems","summary":"  Is Stochastic Gradient Descent (SGD) substantially different from Metropolis\nMonte Carlo dynamics? This is a fundamental question at the time of\nunderstanding the most used training algorithm in the field of Machine\nLearning, but it received no answer until now. Here we show that in discrete\noptimization and inference problems, the dynamics of an SGD-like algorithm\nresemble very closely that of Metropolis Monte Carlo with a properly chosen\ntemperature, which depends on the mini-batch size. This quantitative matching\nholds both at equilibrium and in the out-of-equilibrium regime, despite the two\nalgorithms having fundamental differences (e.g.\\ SGD does not satisfy detailed\nbalance). Such equivalence allows us to use results about performances and\nlimits of Monte Carlo algorithms to optimize the mini-batch size in the\nSGD-like algorithm and make it efficient at recovering the signal in hard\ninference problems.\n","authors":["Maria Chiara Angelini","Angelo Giorgio Cavaliere","Raffaele Marino","Federico Ricci-Tersenghi"],"pdf_url":"https://arxiv.org/pdf/2309.05337v2.pdf","comment":"19 pages, 9 figures"},{"id":"http://arxiv.org/abs/2309.16733v2","updated":"2024-05-30T09:02:53Z","published":"2023-09-27T19:22:19Z","title":"Resilience of Deep Learning applications: a systematic literature review\n  of analysis and hardening techniques","summary":"  Machine Learning (ML) is currently being exploited in numerous applications\nbeing one of the most effective Artificial Intelligence (AI) technologies, used\nin diverse fields, such as vision, autonomous systems, and alike. The trend\nmotivated a significant amount of contributions to the analysis and design of\nML applications against faults affecting the underlying hardware. The authors\ninvestigate the existing body of knowledge on Deep Learning (among ML\ntechniques) resilience against hardware faults systematically through a\nthoughtful review in which the strengths and weaknesses of this literature\nstream are presented clearly and then future avenues of research are set out.\nThe review is based on 220 scientific articles published between January 2019\nand March 2024. The authors adopt a classifying framework to interpret and\nhighlight research similarities and peculiarities, based on several parameters,\nstarting from the main scope of the work, the adopted fault and error models,\nto their reproducibility. This framework allows for a comparison of the\ndifferent solutions and the identification of possible synergies. Furthermore,\nsuggestions concerning the future direction of research are proposed in the\nform of open challenges to be addressed.\n","authors":["Cristiana Bolchini","Luca Cassano","Antonio Miele"],"pdf_url":"https://arxiv.org/pdf/2309.16733v2.pdf","comment":"Submitted to Elsevier Computer Science Review on May 9, 2024"},{"id":"http://arxiv.org/abs/2402.08871v2","updated":"2024-05-30T08:52:56Z","published":"2024-02-14T00:35:10Z","title":"Position: Topological Deep Learning is the New Frontier for Relational\n  Learning","summary":"  Topological deep learning (TDL) is a rapidly evolving field that uses\ntopological features to understand and design deep learning models. This paper\nposits that TDL is the new frontier for relational learning. TDL may complement\ngraph representation learning and geometric deep learning by incorporating\ntopological concepts, and can thus provide a natural choice for various machine\nlearning settings. To this end, this paper discusses open problems in TDL,\nranging from practical benefits to theoretical foundations. For each problem,\nit outlines potential solutions and future research opportunities. At the same\ntime, this paper serves as an invitation to the scientific community to\nactively participate in TDL research to unlock the potential of this emerging\nfield.\n","authors":["Theodore Papamarkou","Tolga Birdal","Michael Bronstein","Gunnar Carlsson","Justin Curry","Yue Gao","Mustafa Hajij","Roland Kwitt","Pietro Liò","Paolo Di Lorenzo","Vasileios Maroulas","Nina Miolane","Farzana Nasrin","Karthikeyan Natesan Ramamurthy","Bastian Rieck","Simone Scardapane","Michael T. Schaub","Petar Veličković","Bei Wang","Yusu Wang","Guo-Wei Wei","Ghada Zamzmi"],"pdf_url":"https://arxiv.org/pdf/2402.08871v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.19836v1","updated":"2024-05-30T08:45:45Z","published":"2024-05-30T08:45:45Z","title":"The Merit of River Network Topology for Neural Flood Forecasting","summary":"  Climate change exacerbates riverine floods, which occur with higher frequency\nand intensity than ever. The much-needed forecasting systems typically rely on\naccurate river discharge predictions. To this end, the SOTA data-driven\napproaches treat forecasting at spatially distributed gauge stations as\nisolated problems, even within the same river network. However, incorporating\nthe known topology of the river network into the prediction model has the\npotential to leverage the adjacency relationship between gauges. Thus, we model\nriver discharge for a network of gauging stations with GNNs and compare the\nforecasting performance achieved by different adjacency definitions. Our\nresults show that the model fails to benefit from the river network topology\ninformation, both on the entire network and small subgraphs. The learned edge\nweights correlate with neither of the static definitions and exhibit no regular\npattern. Furthermore, the GNNs struggle to predict sudden, narrow discharge\nspikes. Our work hints at a more general underlying phenomenon of neural\nprediction not always benefitting from graphical structure and may inspire a\nsystematic study of the conditions under which this happens.\n","authors":["Nikolas Kirschstein","Yixuan Sun"],"pdf_url":"https://arxiv.org/pdf/2405.19836v1.pdf","comment":"https://openreview.net/forum?id=QE6iC9s6vU"},{"id":"http://arxiv.org/abs/2403.11904v3","updated":"2024-05-30T08:37:45Z","published":"2024-03-18T16:04:55Z","title":"CICLe: Conformal In-Context Learning for Largescale Multi-Class Food\n  Risk Classification","summary":"  Contaminated or adulterated food poses a substantial risk to human health.\nGiven sets of labeled web texts for training, Machine Learning and Natural\nLanguage Processing can be applied to automatically detect such risks. We\npublish a dataset of 7,546 short texts describing public food recall\nannouncements. Each text is manually labeled, on two granularity levels (coarse\nand fine), for food products and hazards that the recall corresponds to. We\ndescribe the dataset and benchmark naive, traditional, and Transformer models.\nBased on our analysis, Logistic Regression based on a tf-idf representation\noutperforms RoBERTa and XLM-R on classes with low support. Finally, we discuss\ndifferent prompting strategies and present an LLM-in-the-loop framework, based\non Conformal Prediction, which boosts the performance of the base classifier\nwhile reducing energy consumption compared to normal prompting.\n","authors":["Korbinian Randl","John Pavlopoulos","Aron Henriksson","Tony Lindgren"],"pdf_url":"https://arxiv.org/pdf/2403.11904v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.19823v1","updated":"2024-05-30T08:31:18Z","published":"2024-05-30T08:31:18Z","title":"Joint Selective State Space Model and Detrending for Robust Time Series\n  Anomaly Detection","summary":"  Deep learning-based sequence models are extensively employed in Time Series\nAnomaly Detection (TSAD) tasks due to their effective sequential modeling\ncapabilities. However, the ability of TSAD is limited by two key challenges:\n(i) the ability to model long-range dependency and (ii) the generalization\nissue in the presence of non-stationary data. To tackle these challenges, an\nanomaly detector that leverages the selective state space model known for its\nproficiency in capturing long-term dependencies across various domains is\nproposed. Additionally, a multi-stage detrending mechanism is introduced to\nmitigate the prominent trend component in non-stationary data to address the\ngeneralization issue. Extensive experiments conducted on realworld public\ndatasets demonstrate that the proposed methods surpass all 12 compared baseline\nmethods.\n","authors":["Junqi Chen","Xu Tan","Sylwan Rahardja","Jiawei Yang","Susanto Rahardja"],"pdf_url":"https://arxiv.org/pdf/2405.19823v1.pdf","comment":"Submitted to IEEE Signal Processing Letters"},{"id":"http://arxiv.org/abs/2402.17257v3","updated":"2024-05-30T08:24:54Z","published":"2024-02-27T07:03:25Z","title":"RIME: Robust Preference-based Reinforcement Learning with Noisy\n  Preferences","summary":"  Preference-based Reinforcement Learning (PbRL) circumvents the need for\nreward engineering by harnessing human preferences as the reward signal.\nHowever, current PbRL methods excessively depend on high-quality feedback from\ndomain experts, which results in a lack of robustness. In this paper, we\npresent RIME, a robust PbRL algorithm for effective reward learning from noisy\npreferences. Our method utilizes a sample selection-based discriminator to\ndynamically filter out noise and ensure robust training. To counteract the\ncumulative error stemming from incorrect selection, we suggest a warm start for\nthe reward model, which additionally bridges the performance gap during the\ntransition from pre-training to online training in PbRL. Our experiments on\nrobotic manipulation and locomotion tasks demonstrate that RIME significantly\nenhances the robustness of the state-of-the-art PbRL method. Code is available\nat https://github.com/CJReinforce/RIME_ICML2024.\n","authors":["Jie Cheng","Gang Xiong","Xingyuan Dai","Qinghai Miao","Yisheng Lv","Fei-Yue Wang"],"pdf_url":"https://arxiv.org/pdf/2402.17257v3.pdf","comment":"Accepted by ICML2024"},{"id":"http://arxiv.org/abs/2405.19811v1","updated":"2024-05-30T08:20:34Z","published":"2024-05-30T08:20:34Z","title":"Approximate Global Convergence of Independent Learning in Multi-Agent\n  Systems","summary":"  Independent learning (IL), despite being a popular approach in practice to\nachieve scalability in large-scale multi-agent systems, usually lacks global\nconvergence guarantees. In this paper, we study two representative algorithms,\nindependent $Q$-learning and independent natural actor-critic, within\nvalue-based and policy-based frameworks, and provide the first finite-sample\nanalysis for approximate global convergence. The results imply a sample\ncomplexity of $\\tilde{\\mathcal{O}}(\\epsilon^{-2})$ up to an error term that\ncaptures the dependence among agents and characterizes the fundamental limit of\nIL in achieving global convergence. To establish the result, we develop a novel\napproach for analyzing IL by constructing a separable Markov decision process\n(MDP) for convergence analysis and then bounding the gap due to model\ndifference between the separable MDP and the original one. Moreover, we conduct\nnumerical experiments using a synthetic MDP and an electric vehicle charging\nexample to verify our theoretical findings and to demonstrate the practical\napplicability of IL.\n","authors":["Ruiyang Jin","Zaiwei Chen","Yiheng Lin","Jie Song","Adam Wierman"],"pdf_url":"https://arxiv.org/pdf/2405.19811v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.13047v2","updated":"2024-05-30T08:19:34Z","published":"2023-08-24T19:27:59Z","title":"Federated Causal Inference from Observational Data","summary":"  Decentralized data sources are prevalent in real-world applications, posing a\nformidable challenge for causal inference. These sources cannot be consolidated\ninto a single entity owing to privacy constraints. The presence of dissimilar\ndata distributions and missing values within them can potentially introduce\nbias to the causal estimands. In this article, we propose a framework to\nestimate causal effects from decentralized data sources. The proposed framework\navoid exchanging raw data among the sources, thus contributing towards\nprivacy-preserving causal learning. Three instances of the proposed framework\nare introduced to estimate causal effects across a wide range of diverse\nscenarios within a federated setting. (1) FedCI: a Bayesian framework based on\nGaussian processes for estimating causal effects from federated observational\ndata sources. It estimates the posterior distributions of the causal effects to\ncompute the higher-order statistics that capture the uncertainty. (2)\nCausalRFF: an adaptive transfer algorithm that learns the similarities among\nthe data sources by utilizing Random Fourier Features to disentangle the loss\nfunction into multiple components, each of which is associated with a data\nsource. It estimates the similarities among the sources through transfer\ncoefficients, and hence requiring no prior information about the similarity\nmeasures. (3) CausalFI: a new approach for federated causal inference from\nincomplete data, enabling the estimation of causal effects from multiple\ndecentralized and incomplete data sources. It accounts for the missing data\nunder the missing at random assumption, while also estimating higher-order\nstatistics of the causal estimands. The proposed federated framework and its\ninstances are an important step towards a privacy-preserving causal learning\nmodel.\n","authors":["Thanh Vinh Vo","Young lee","Tze-Yun Leong"],"pdf_url":"https://arxiv.org/pdf/2308.13047v2.pdf","comment":"Preprint. arXiv admin note: substantial text overlap with\n  arXiv:2301.00346"},{"id":"http://arxiv.org/abs/2405.19807v1","updated":"2024-05-30T08:17:00Z","published":"2024-05-30T08:17:00Z","title":"MetaCURL: Non-stationary Concave Utility Reinforcement Learning","summary":"  We explore online learning in episodic loop-free Markov decision processes on\nnon-stationary environments (changing losses and probability transitions). Our\nfocus is on the Concave Utility Reinforcement Learning problem (CURL), an\nextension of classical RL for handling convex performance criteria in\nstate-action distributions induced by agent policies. While various machine\nlearning problems can be written as CURL, its non-linearity invalidates\ntraditional Bellman equations. Despite recent solutions to classical CURL, none\naddress non-stationary MDPs. This paper introduces MetaCURL, the first CURL\nalgorithm for non-stationary MDPs. It employs a meta-algorithm running multiple\nblack-box algorithms instances over different intervals, aggregating outputs\nvia a sleeping expert framework. The key hurdle is partial information due to\nMDP uncertainty. Under partial information on the probability transitions\n(uncertainty and non-stationarity coming only from external noise, independent\nof agent state-action pairs), we achieve optimal dynamic regret without prior\nknowledge of MDP changes. Unlike approaches for RL, MetaCURL handles full\nadversarial losses, not just stochastic ones. We believe our approach for\nmanaging non-stationarity with experts can be of interest to the RL community.\n","authors":["Bianca Marin Moreno","Margaux Brégère","Pierre Gaillard","Nadia Oudjane"],"pdf_url":"https://arxiv.org/pdf/2405.19807v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.20291v1","updated":"2024-05-30T17:41:32Z","published":"2024-05-30T17:41:32Z","title":"Unveiling and Mitigating Backdoor Vulnerabilities based on Unlearning\n  Weight Changes and Backdoor Activeness","summary":"  The security threat of backdoor attacks is a central concern for deep neural\nnetworks (DNNs). Recently, without poisoned data, unlearning models with clean\ndata and then learning a pruning mask have contributed to backdoor defense.\nAdditionally, vanilla fine-tuning with those clean data can help recover the\nlost clean accuracy. However, the behavior of clean unlearning is still\nunder-explored, and vanilla fine-tuning unintentionally induces back the\nbackdoor effect. In this work, we first investigate model unlearning from the\nperspective of weight changes and gradient norms, and find two interesting\nobservations in the backdoored model: 1) the weight changes between poison and\nclean unlearning are positively correlated, making it possible for us to\nidentify the backdoored-related neurons without using poisoned data; 2) the\nneurons of the backdoored model are more active (i.e., larger changes in\ngradient norm) than those in the clean model, suggesting the need to suppress\nthe gradient norm during fine-tuning. Then, we propose an effective two-stage\ndefense method. In the first stage, an efficient Neuron Weight Change\n(NWC)-based Backdoor Reinitialization is proposed based on observation 1). In\nthe second stage, based on observation 2), we design an Activeness-Aware\nFine-Tuning to replace the vanilla fine-tuning. Extensive experiments,\ninvolving eight backdoor attacks on three benchmark datasets, demonstrate the\nsuperior performance of our proposed method compared to recent state-of-the-art\nbackdoor defense approaches.\n","authors":["Weilin Lin","Li Liu","Shaokui Wei","Jianze Li","Hui Xiong"],"pdf_url":"https://arxiv.org/pdf/2405.20291v1.pdf","comment":null}],"Multimedia":[{"id":"http://arxiv.org/abs/2405.20078v1","updated":"2024-05-30T14:08:09Z","published":"2024-05-30T14:08:09Z","title":"NeRF View Synthesis: Subjective Quality Assessment and Objective Metrics\n  Evaluation","summary":"  Neural radiance fields (NeRF) are a groundbreaking computer vision technology\nthat enables the generation of high-quality, immersive visual content from\nmultiple viewpoints. This capability holds significant advantages for\napplications such as virtual/augmented reality, 3D modelling and content\ncreation for the film and entertainment industry. However, the evaluation of\nNeRF methods poses several challenges, including a lack of comprehensive\ndatasets, reliable assessment methodologies, and objective quality metrics.\nThis paper addresses the problem of NeRF quality assessment thoroughly, by\nconducting a rigorous subjective quality assessment test that considers several\nscene classes and recently proposed NeRF view synthesis methods. Additionally,\nthe performance of a wide range of state-of-the-art conventional and\nlearning-based full-reference 2D image and video quality assessment metrics is\nevaluated against the subjective scores of the subjective study. The\nexperimental results are analyzed in depth, providing a comparative evaluation\nof several NeRF methods and objective quality metrics, across different classes\nof visual scenes, including real and synthetic content for front-face and\n360-degree camera trajectories.\n","authors":["Pedro Martin","Antonio Rodrigues","Joao Ascenso","Maria Paula Queluz"],"pdf_url":"https://arxiv.org/pdf/2405.20078v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.20032v1","updated":"2024-05-30T13:16:48Z","published":"2024-05-30T13:16:48Z","title":"Promptus: Can Prompts Streaming Replace Video Streaming with Stable\n  Diffusion","summary":"  With the exponential growth of video traffic, traditional video streaming\nsystems are approaching their limits in compression efficiency and\ncommunication capacity. To further reduce bitrate while maintaining quality, we\npropose Promptus, a disruptive novel system that streaming prompts instead of\nvideo content with Stable Diffusion, which converts video frames into a series\nof \"prompts\" for delivery. To ensure pixel alignment, a gradient descent-based\nprompt fitting framework is proposed. To achieve adaptive bitrate for prompts,\na low-rank decomposition-based bitrate control algorithm is introduced. For\ninter-frame compression of prompts, a temporal smoothing-based prompt\ninterpolation algorithm is proposed. Evaluations across various video domains\nand real network traces demonstrate Promptus can enhance the perceptual quality\nby 0.111 and 0.092 (in LPIPS) compared to VAE and H.265, respectively, and\ndecreases the ratio of severely distorted frames by 89.3% and 91.7%. Moreover,\nPromptus achieves real-time video generation from prompts at over 150 FPS. To\nthe best of our knowledge, Promptus is the first attempt to replace video\ncodecs with prompt inversion and the first to use prompt streaming instead of\nvideo streaming. Our work opens up a new paradigm for efficient video\ncommunication beyond the Shannon limit.\n","authors":["Jiangkai Wu","Liming Liu","Yunpeng Tan","Junlin Hao","Xinggong Zhang"],"pdf_url":"https://arxiv.org/pdf/2405.20032v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.19889v1","updated":"2024-05-30T09:46:59Z","published":"2024-05-30T09:46:59Z","title":"Deep Joint Semantic Coding and Beamforming for Near-Space Airship-Borne\n  Massive MIMO Network","summary":"  Near-space airship-borne communication network is recognized to be an\nindispensable component of the future integrated ground-air-space network\nthanks to airships' advantage of long-term residency at stratospheric\naltitudes, but it urgently needs reliable and efficient Airship-to-X link. To\nimprove the transmission efficiency and capacity, this paper proposes to\nintegrate semantic communication with massive multiple-input multiple-output\n(MIMO) technology. Specifically, we propose a deep joint semantic coding and\nbeamforming (JSCBF) scheme for airship-based massive MIMO image transmission\nnetwork in space, in which semantics from both source and channel are fused to\njointly design the semantic coding and physical layer beamforming. First, we\ndesign two semantic extraction networks to extract semantics from image source\nand channel state information, respectively. Then, we propose a semantic fusion\nnetwork that can fuse these semantics into complex-valued semantic features for\nsubsequent physical-layer transmission. To efficiently transmit the fused\nsemantic features at the physical layer, we then propose the hybrid data and\nmodel-driven semantic-aware beamforming networks. At the receiver, a semantic\ndecoding network is designed to reconstruct the transmitted images. Finally, we\nperform end-to-end deep learning to jointly train all the modules, using the\nimage reconstruction quality at the receivers as a metric. The proposed deep\nJSCBF scheme fully combines the efficient source compressibility and robust\nerror correction capability of semantic communication with the high spectral\nefficiency of massive MIMO, achieving a significant performance improvement\nover existing approaches.\n","authors":["Minghui Wu","Zhen Gao","Zhaocheng Wang","Dusit Niyato","George K. Karagiannidis","Sheng Chen"],"pdf_url":"https://arxiv.org/pdf/2405.19889v1.pdf","comment":"Major Revision by IEEE JSAC"},{"id":"http://arxiv.org/abs/2405.19802v1","updated":"2024-05-30T08:12:08Z","published":"2024-05-30T08:12:08Z","title":"Exploring the Robustness of Decision-Level Through Adversarial Attacks\n  on LLM-Based Embodied Models","summary":"  Embodied intelligence empowers agents with a profound sense of perception,\nenabling them to respond in a manner closely aligned with real-world\nsituations. Large Language Models (LLMs) delve into language instructions with\ndepth, serving a crucial role in generating plans for intricate tasks. Thus,\nLLM-based embodied models further enhance the agent's capacity to comprehend\nand process information. However, this amalgamation also ushers in new\nchallenges in the pursuit of heightened intelligence. Specifically, attackers\ncan manipulate LLMs to produce irrelevant or even malicious outputs by altering\ntheir prompts. Confronted with this challenge, we observe a notable absence of\nmulti-modal datasets essential for comprehensively evaluating the robustness of\nLLM-based embodied models. Consequently, we construct the Embodied Intelligent\nRobot Attack Dataset (EIRAD), tailored specifically for robustness evaluation.\nAdditionally, two attack strategies are devised, including untargeted attacks\nand targeted attacks, to effectively simulate a range of diverse attack\nscenarios. At the same time, during the attack process, to more accurately\nascertain whether our method is successful in attacking the LLM-based embodied\nmodel, we devise a new attack success evaluation method utilizing the BLIP2\nmodel. Recognizing the time and cost-intensive nature of the GCG algorithm in\nattacks, we devise a scheme for prompt suffix initialization based on various\ntarget tasks, thus expediting the convergence process. Experimental results\ndemonstrate that our method exhibits a superior attack success rate when\ntargeting LLM-based embodied models, indicating a lower level of decision-level\nrobustness in these models.\n","authors":["Shuyuan Liu","Jiawei Chen","Shouwei Ruan","Hang Su","Zhaoxia Yin"],"pdf_url":"https://arxiv.org/pdf/2405.19802v1.pdf","comment":null}]},"2024-05-31T00:00:00Z":{"Computation and Language":[{"id":"http://arxiv.org/abs/2405.21075v1","updated":"2024-05-31T17:59:47Z","published":"2024-05-31T17:59:47Z","title":"Video-MME: The First-Ever Comprehensive Evaluation Benchmark of\n  Multi-modal LLMs in Video Analysis","summary":"  In the quest for artificial general intelligence, Multi-modal Large Language\nModels (MLLMs) have emerged as a focal point in recent advancements. However,\nthe predominant focus remains on developing their capabilities in static image\nunderstanding. The potential of MLLMs in processing sequential visual data is\nstill insufficiently explored, highlighting the absence of a comprehensive,\nhigh-quality assessment of their performance. In this paper, we introduce\nVideo-MME, the first-ever full-spectrum, Multi-Modal Evaluation benchmark of\nMLLMs in Video analysis. Our work distinguishes from existing benchmarks\nthrough four key features: 1) Diversity in video types, spanning 6 primary\nvisual domains with 30 subfields to ensure broad scenario generalizability; 2)\nDuration in temporal dimension, encompassing both short-, medium-, and\nlong-term videos, ranging from 11 seconds to 1 hour, for robust contextual\ndynamics; 3) Breadth in data modalities, integrating multi-modal inputs besides\nvideo frames, including subtitles and audios, to unveil the all-round\ncapabilities of MLLMs; 4) Quality in annotations, utilizing rigorous manual\nlabeling by expert annotators to facilitate precise and reliable model\nassessment. 900 videos with a total of 256 hours are manually selected and\nannotated by repeatedly viewing all the video content, resulting in 2,700\nquestion-answer pairs. With Video-MME, we extensively evaluate various\nstate-of-the-art MLLMs, including GPT-4 series and Gemini 1.5 Pro, as well as\nopen-source image models like InternVL-Chat-V1.5 and video models like\nLLaVA-NeXT-Video. Our experiments reveal that Gemini 1.5 Pro is the\nbest-performing commercial model, significantly outperforming the open-source\nmodels. Our dataset along with these findings underscores the need for further\nimprovements in handling longer sequences and multi-modal data. Project Page:\nhttps://video-mme.github.io\n","authors":["Chaoyou Fu","Yuhan Dai","Yondong Luo","Lei Li","Shuhuai Ren","Renrui Zhang","Zihan Wang","Chenyu Zhou","Yunhang Shen","Mengdan Zhang","Peixian Chen","Yanwei Li","Shaohui Lin","Sirui Zhao","Ke Li","Tong Xu","Xiawu Zheng","Enhong Chen","Rongrong Ji","Xing Sun"],"pdf_url":"https://arxiv.org/pdf/2405.21075v1.pdf","comment":"Project Page: https://video-mme.github.io"},{"id":"http://arxiv.org/abs/2405.21070v1","updated":"2024-05-31T17:57:24Z","published":"2024-05-31T17:57:24Z","title":"Generalization Beyond Data Imbalance: A Controlled Study on CLIP for\n  Transferable Insights","summary":"  Severe data imbalance naturally exists among web-scale vision-language\ndatasets. Despite this, we find CLIP pre-trained thereupon exhibits notable\nrobustness to the data imbalance compared to supervised learning, and\ndemonstrates significant effectiveness in learning generalizable\nrepresentations. With an aim to investigate the reasons behind this finding, we\nconduct controlled experiments to study various underlying factors, and reveal\nthat CLIP's pretext task forms a dynamic classification problem wherein only a\nsubset of classes is present in training. This isolates the bias from dominant\nclasses and implicitly balances the learning signal. Furthermore, the\nrobustness and discriminability of CLIP improve with more descriptive language\nsupervision, larger data scale, and broader open-world concepts, which are\ninaccessible to supervised learning. Our study not only uncovers the mechanisms\nbehind CLIP's generalizability beyond data imbalance but also provides\ntransferable insights for the research community. The findings are validated in\nboth supervised and self-supervised learning, enabling models trained on\nimbalanced data to achieve CLIP-level performance on diverse recognition tasks.\nCode will be available at: https://github.com/CVMI-Lab/clip-beyond-tail.\n","authors":["Xin Wen","Bingchen Zhao","Yilun Chen","Jiangmiao Pang","Xiaojuan Qi"],"pdf_url":"https://arxiv.org/pdf/2405.21070v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.21068v1","updated":"2024-05-31T17:56:33Z","published":"2024-05-31T17:56:33Z","title":"Code Pretraining Improves Entity Tracking Abilities of Language Models","summary":"  Recent work has provided indirect evidence that pretraining language models\non code improves the ability of models to track state changes of discourse\nentities expressed in natural language. In this work, we systematically test\nthis claim by comparing pairs of language models on their entity tracking\nperformance. Critically, the pairs consist of base models and models trained on\ntop of these base models with additional code data. We extend this analysis to\nadditionally examine the effect of math training, another highly structured\ndata type, and alignment tuning, an important step for enhancing the usability\nof models. We find clear evidence that models additionally trained on large\namounts of code outperform the base models. On the other hand, we find no\nconsistent benefit of additional math training or alignment tuning across\nvarious model families.\n","authors":["Najoung Kim","Sebastian Schuster","Shubham Toshniwal"],"pdf_url":"https://arxiv.org/pdf/2405.21068v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.15938v3","updated":"2024-05-31T17:49:03Z","published":"2024-02-24T23:54:41Z","title":"Generalization or Memorization: Data Contamination and Trustworthy\n  Evaluation for Large Language Models","summary":"  Recent statements about the impressive capabilities of large language models\n(LLMs) are usually supported by evaluating on open-access benchmarks.\nConsidering the vast size and wide-ranging sources of LLMs' training data, it\ncould explicitly or implicitly include test data, leading to LLMs being more\nsusceptible to data contamination. However, due to the opacity of training\ndata, the black-box access of models, and the rapid growth of synthetic\ntraining data, detecting and mitigating data contamination for LLMs faces\nsignificant challenges. In this paper, we propose CDD, which stands for\nContamination Detection via output Distribution for LLMs. CDD necessitates only\nthe sampled texts to detect data contamination, by identifying the peakedness\nof LLM's output distribution. To mitigate the impact of data contamination in\nevaluation, we also present TED: Trustworthy Evaluation via output\nDistribution, based on the correction of LLM's output distribution. To\nfacilitate this study, we introduce two benchmarks, i.e., DetCon and ComiEval,\nfor data contamination detection and contamination mitigation evaluation tasks.\nExtensive experimental results show that CDD achieves the average relative\nimprovements of 21.8\\%-30.2\\% over other contamination detection approaches in\nterms of Accuracy, F1 Score, and AUC metrics, and can effectively detect\nimplicit contamination. TED substantially mitigates performance improvements up\nto 66.9\\% attributed to data contamination across various contamination setups.\nIn real-world applications, we reveal that ChatGPT exhibits a high potential to\nsuffer from data contamination on HumanEval benchmark.\n","authors":["Yihong Dong","Xue Jiang","Huanyu Liu","Zhi Jin","Bin Gu","Mengfei Yang","Ge Li"],"pdf_url":"https://arxiv.org/pdf/2402.15938v3.pdf","comment":"Accepted to ACL"},{"id":"http://arxiv.org/abs/2405.08295v2","updated":"2024-05-31T17:47:40Z","published":"2024-05-14T03:33:31Z","title":"SpeechVerse: A Large-scale Generalizable Audio Language Model","summary":"  Large language models (LLMs) have shown incredible proficiency in performing\ntasks that require semantic understanding of natural language instructions.\nRecently, many works have further expanded this capability to perceive\nmultimodal audio and text inputs, but their capabilities are often limited to\nspecific fine-tuned tasks such as automatic speech recognition and translation.\nWe therefore develop SpeechVerse, a robust multi-task training and curriculum\nlearning framework that combines pre-trained speech and text foundation models\nvia a small set of learnable parameters, while keeping the pre-trained models\nfrozen during training. The models are instruction finetuned using continuous\nlatent representations extracted from the speech foundation model to achieve\noptimal zero-shot performance on a diverse range of speech processing tasks\nusing natural language instructions. We perform extensive benchmarking that\nincludes comparing our model performance against traditional baselines across\nseveral datasets and tasks. Furthermore, we evaluate the model's capability for\ngeneralized instruction following by testing on out-of-domain datasets, novel\nprompts, and unseen tasks. Our empirical experiments reveal that our multi-task\nSpeechVerse model is even superior to conventional task-specific baselines on 9\nout of the 11 tasks.\n","authors":["Nilaksh Das","Saket Dingliwal","Srikanth Ronanki","Rohit Paturi","Zhaocheng Huang","Prashant Mathur","Jie Yuan","Dhanush Bekal","Xing Niu","Sai Muralidhar Jayanthi","Xilai Li","Karel Mundnich","Monica Sunkara","Sundararajan Srinivasan","Kyu J Han","Katrin Kirchhoff"],"pdf_url":"https://arxiv.org/pdf/2405.08295v2.pdf","comment":"Single Column, 13 page"},{"id":"http://arxiv.org/abs/2405.21047v1","updated":"2024-05-31T17:39:15Z","published":"2024-05-31T17:39:15Z","title":"Grammar-Aligned Decoding","summary":"  Large Language Models (LLMs) struggle with reliably generating highly\nstructured outputs, such as program code, mathematical formulas, or well-formed\nmarkup. Constrained decoding approaches mitigate this problem by greedily\nrestricting what tokens an LLM can output at each step to guarantee that the\noutput matches a given constraint. Specifically, in grammar-constrained\ndecoding (GCD), the LLM's output must follow a given grammar. In this paper we\ndemonstrate that GCD techniques (and in general constrained decoding\ntechniques) can distort the LLM's distribution, leading to outputs that are\ngrammatical but appear with likelihoods that are not proportional to the ones\ngiven by the LLM, and so ultimately are low-quality. We call the problem of\naligning sampling with a grammar constraint, grammar-aligned decoding (GAD),\nand propose adaptive sampling with approximate expected futures (ASAp), a\ndecoding algorithm that guarantees the output to be grammatical while provably\nproducing outputs that match the conditional probability of the LLM's\ndistribution conditioned on the given grammar constraint. Our algorithm uses\nprior sample outputs to soundly overapproximate the future grammaticality of\ndifferent output prefixes. Our evaluation on code generation and structured NLP\ntasks shows how ASAp often produces outputs with higher likelihood (according\nto the LLM's distribution) than existing GCD techniques, while still enforcing\nthe desired grammatical constraints.\n","authors":["Kanghee Park","Jiayu Wang","Taylor Berg-Kirkpatrick","Nadia Polikarpova","Loris D'Antoni"],"pdf_url":"https://arxiv.org/pdf/2405.21047v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.21046v1","updated":"2024-05-31T17:39:06Z","published":"2024-05-31T17:39:06Z","title":"Exploratory Preference Optimization: Harnessing Implicit\n  Q*-Approximation for Sample-Efficient RLHF","summary":"  Reinforcement learning from human feedback (RLHF) has emerged as a central\ntool for language model alignment. We consider online exploration in RLHF,\nwhich exploits interactive access to human or AI feedback by deliberately\nencouraging the model to produce diverse, maximally informative responses. By\nallowing RLHF to confidently stray from the pre-trained model, online\nexploration offers the possibility of novel, potentially super-human\ncapabilities, but its full potential as a paradigm for language model training\nhas yet to be realized, owing to computational and statistical bottlenecks in\ndirectly adapting existing reinforcement learning techniques. We propose a new\nalgorithm for online exploration in RLHF, Exploratory Preference Optimization\n(XPO), which is simple and practical -- a one-line change to (online) Direct\nPreference Optimization (DPO; Rafailov et al., 2023) -- yet enjoys the\nstrongest known provable guarantees and promising empirical performance. XPO\naugments the DPO objective with a novel and principled exploration bonus,\nempowering the algorithm to explore outside the support of the initial model\nand human feedback data. In theory, we show that XPO is provably\nsample-efficient and converges to a near-optimal language model policy under\nnatural exploration conditions, irrespective of whether the initial model has\ngood coverage. Our analysis, which builds on the observation that DPO\nimplicitly performs a form of $Q^{\\star}$-approximation (or, Bellman error\nminimization), combines previously disparate techniques from language modeling\nand theoretical reinforcement learning in a serendipitous fashion through the\nperspective of KL-regularized Markov decision processes. Empirically, we find\nthat XPO is more sample-efficient than non-exploratory DPO variants in a\npreliminary evaluation.\n","authors":["Tengyang Xie","Dylan J. Foster","Akshay Krishnamurthy","Corby Rosset","Ahmed Awadallah","Alexander Rakhlin"],"pdf_url":"https://arxiv.org/pdf/2405.21046v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.18239v2","updated":"2024-05-31T17:38:51Z","published":"2024-04-28T16:31:32Z","title":"SOUL: Unlocking the Power of Second-Order Optimization for LLM\n  Unlearning","summary":"  Large Language Models (LLMs) have highlighted the necessity of effective\nunlearning mechanisms to comply with data regulations and ethical AI practices.\nLLM unlearning aims at removing undesired data influences and associated model\ncapabilities without compromising utility out of the scope of unlearning. While\ninterest in studying LLM unlearning is growing,the impact of the optimizer\nchoice for LLM unlearning remains under-explored. In this work, we shed light\non the significance of optimizer selection in LLM unlearning for the first\ntime, establishing a clear connection between {second-order optimization} and\ninfluence unlearning (a classical approach using influence functions to update\nthe model for data influence removal). This insight propels us to develop a\nsecond-order unlearning framework, termed SOUL, built upon the second-order\nclipped stochastic optimization (Sophia)-based LLM training method. SOUL\nextends the static, one-shot model update using influence unlearning to a\ndynamic, iterative unlearning process. Our extensive experiments show that SOUL\nconsistently outperforms conventional first-order methods across various\nunlearning tasks, models, and metrics, suggesting the promise of second-order\noptimization in providing a scalable and easily implementable solution for LLM\nunlearning.\n","authors":["Jinghan Jia","Yihua Zhang","Yimeng Zhang","Jiancheng Liu","Bharat Runwal","James Diffenderfer","Bhavya Kailkhura","Sijia Liu"],"pdf_url":"https://arxiv.org/pdf/2404.18239v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.09615v3","updated":"2024-05-31T17:31:38Z","published":"2024-02-14T23:09:15Z","title":"API Pack: A Massive Multi-Programming Language Dataset for API Call\n  Generation","summary":"  We introduce API Pack, a massive multi-programming language dataset\ncontaining more than 1 million instruction-API call pairs to improve the API\ncall generation capabilities of large language models. By fine-tuning\nCodeLlama-13B on 20,000 Python instances from API Pack, we achieved around 10%\nand 5% higher accuracy compared to GPT-3.5 and GPT-4, respectively, in\ngenerating unseen API calls. Fine-tuning on API Pack enables cross-programming\nlanguage generalization by leveraging a large amount of data in one language\nand small amounts of data from other languages. Scaling the training data to 1\nmillion instances further improves the model's generalization to new APIs not\nencountered during training. We open-source the API Pack dataset, trained\nmodels, and associated source code at https://github.com/zguo0525/API-Pack to\nfacilitate further research.\n","authors":["Zhen Guo","Adriana Meza Soria","Wei Sun","Yikang Shen","Rameswar Panda"],"pdf_url":"https://arxiv.org/pdf/2402.09615v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.21040v1","updated":"2024-05-31T17:31:18Z","published":"2024-05-31T17:31:18Z","title":"Direct Alignment of Language Models via Quality-Aware Self-Refinement","summary":"  Reinforcement Learning from Human Feedback (RLHF) has been commonly used to\nalign the behaviors of Large Language Models (LLMs) with human preferences.\nRecently, a popular alternative is Direct Policy Optimization (DPO), which\nreplaces an LLM-based reward model with the policy itself, thus obviating the\nneed for extra memory and training time to learn the reward model. However, DPO\ndoes not consider the relative qualities of the positive and negative\nresponses, and can lead to sub-optimal training outcomes. To alleviate this\nproblem, we investigate the use of intrinsic knowledge within the on-the-fly\nfine-tuning LLM to obtain relative qualities and help to refine the loss\nfunction. Specifically, we leverage the knowledge of the LLM to design a\nrefinement function to estimate the quality of both the positive and negative\nresponses. We show that the constructed refinement function can help\nself-refine the loss function under mild assumptions. The refinement function\nis integrated into DPO and its variant Identity Policy Optimization (IPO).\nExperiments across various evaluators indicate that they can improve the\nperformance of the fine-tuned models over DPO and IPO.\n","authors":["Runsheng Yu","Yong Wang","Xiaoqi Jiao","Youzhi Zhang","James T. Kwok"],"pdf_url":"https://arxiv.org/pdf/2405.21040v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.11058v2","updated":"2024-05-31T17:30:13Z","published":"2024-02-16T20:14:47Z","title":"II-MMR: Identifying and Improving Multi-modal Multi-hop Reasoning in\n  Visual Question Answering","summary":"  Visual Question Answering (VQA) often involves diverse reasoning scenarios\nacross Vision and Language (V&L). Most prior VQA studies, however, have merely\nfocused on assessing the model's overall accuracy without evaluating it on\ndifferent reasoning cases. Furthermore, some recent works observe that\nconventional Chain-of-Thought (CoT) prompting fails to generate effective\nreasoning for VQA, especially for complex scenarios requiring multi-hop\nreasoning. In this paper, we propose II-MMR, a novel idea to identify and\nimprove multi-modal multi-hop reasoning in VQA. In specific, II-MMR takes a VQA\nquestion with an image and finds a reasoning path to reach its answer using two\nnovel language promptings: (i) answer prediction-guided CoT prompt, or (ii)\nknowledge triplet-guided prompt. II-MMR then analyzes this path to identify\ndifferent reasoning cases in current VQA benchmarks by estimating how many hops\nand what types (i.e., visual or beyond-visual) of reasoning are required to\nanswer the question. On popular benchmarks including GQA and A-OKVQA, II-MMR\nobserves that most of their VQA questions are easy to answer, simply demanding\n\"single-hop\" reasoning, whereas only a few questions require \"multi-hop\"\nreasoning. Moreover, while the recent V&L model struggles with such complex\nmulti-hop reasoning questions even using the traditional CoT method, II-MMR\nshows its effectiveness across all reasoning cases in both zero-shot and\nfine-tuning settings.\n","authors":["Jihyung Kil","Farideh Tavazoee","Dongyeop Kang","Joo-Kyung Kim"],"pdf_url":"https://arxiv.org/pdf/2402.11058v2.pdf","comment":"Accepted to ACL 2024 Findings"},{"id":"http://arxiv.org/abs/2405.21028v1","updated":"2024-05-31T17:16:38Z","published":"2024-05-31T17:16:38Z","title":"LACIE: Listener-Aware Finetuning for Confidence Calibration in Large\n  Language Models","summary":"  When answering questions, LLMs can convey not only an answer, but a level of\nconfidence about the answer being correct. This includes explicit confidence\nmarkers (e.g. giving a numeric score) as well as implicit markers, like an\nauthoritative tone or elaborating with additional knowledge. For LLMs to be\ntrustworthy knowledge sources, the confidence they convey should match their\nactual expertise; however, most current models tend towards overconfidence. To\ncalibrate both implicit and explicit confidence markers, we introduce a\npragmatic, listener-aware finetuning method (LACIE) that models the listener,\nconsidering not only whether an answer is right, but whether it will be\naccepted by a listener. We cast calibration as preference optimization,\ncreating data via a two-agent game, where a speaker model's outputs are judged\nby a simulated listener. We then finetune three LLMs (Mistral-7B, Llama3-8B,\nLlama3-70B) with LACIE, and show that the resulting models are better\ncalibrated w.r.t. a simulated listener. Crucially, these trends transfer to\nhuman listeners, helping them correctly predict model correctness: we conduct a\nhuman evaluation where annotators accept or reject an LLM's answers, finding\nthat training with LACIE results in 47% fewer incorrect answers being accepted\nwhile maintaining the same level of acceptance for correct answers.\nFurthermore, LACIE generalizes to another dataset, resulting in a large\nincrease in truthfulness on TruthfulQA when trained on TriviaQA. Our analysis\nindicates that LACIE leads to a better confidence separation between correct\nand incorrect examples. Qualitatively, we find that a LACIE-trained model\nhedges more and implicitly signals certainty when it is correct by using an\nauthoritative tone or including details. Finally, LACIE finetuning leads to an\nemergent increase in model abstention (e.g. saying \"I don't know\") for answers\nthat are likely wrong.\n","authors":["Elias Stengel-Eskin","Peter Hase","Mohit Bansal"],"pdf_url":"https://arxiv.org/pdf/2405.21028v1.pdf","comment":"17 pages. Code: https://github.com/esteng/pragmatic_calibration"},{"id":"http://arxiv.org/abs/2405.21022v1","updated":"2024-05-31T17:09:16Z","published":"2024-05-31T17:09:16Z","title":"You Only Scan Once: Efficient Multi-dimension Sequential Modeling with\n  LightNet","summary":"  Linear attention mechanisms have gained prominence in causal language models\ndue to their linear computational complexity and enhanced speed. However, the\ninherent decay mechanism in linear attention presents challenges when applied\nto multi-dimensional sequence modeling tasks, such as image processing and\nmulti-modal learning. In these scenarios, the utilization of sequential\nscanning to establish a global receptive field necessitates multiple scans for\nmulti-dimensional data, thereby leading to inefficiencies. This paper\nidentifies the inefficiency caused by a multiplicative linear recurrence and\nproposes an efficient alternative additive linear recurrence to avoid the\nissue, as it can handle multi-dimensional data within a single scan. We further\ndevelop an efficient multi-dimensional sequential modeling framework called\nLightNet based on the new recurrence. Moreover, we present two new\nmulti-dimensional linear relative positional encoding methods, MD-TPE and\nMD-LRPE to enhance the model's ability to discern positional information in\nmulti-dimensional scenarios. Our empirical evaluations across various tasks,\nincluding image classification, image generation, bidirectional language\nmodeling, and autoregressive language modeling, demonstrate the efficacy of\nLightNet, showcasing its potential as a versatile and efficient solution for\nmulti-dimensional sequential modeling.\n","authors":["Zhen Qin","Yuxin Mao","Xuyang Shen","Dong Li","Jing Zhang","Yuchao Dai","Yiran Zhong"],"pdf_url":"https://arxiv.org/pdf/2405.21022v1.pdf","comment":"Technical report. Yiran Zhong is the corresponding author. The code\n  is available at https://github.com/OpenNLPLab/LightNet"},{"id":"http://arxiv.org/abs/2405.21018v1","updated":"2024-05-31T17:07:15Z","published":"2024-05-31T17:07:15Z","title":"Improved Techniques for Optimization-Based Jailbreaking on Large\n  Language Models","summary":"  Large language models (LLMs) are being rapidly developed, and a key component\nof their widespread deployment is their safety-related alignment. Many\nred-teaming efforts aim to jailbreak LLMs, where among these efforts, the\nGreedy Coordinate Gradient (GCG) attack's success has led to a growing interest\nin the study of optimization-based jailbreaking techniques. Although GCG is a\nsignificant milestone, its attacking efficiency remains unsatisfactory. In this\npaper, we present several improved (empirical) techniques for\noptimization-based jailbreaks like GCG. We first observe that the single target\ntemplate of \"Sure\" largely limits the attacking performance of GCG; given this,\nwe propose to apply diverse target templates containing harmful self-suggestion\nand/or guidance to mislead LLMs. Besides, from the optimization aspects, we\npropose an automatic multi-coordinate updating strategy in GCG (i.e.,\nadaptively deciding how many tokens to replace in each step) to accelerate\nconvergence, as well as tricks like easy-to-hard initialisation. Then, we\ncombine these improved technologies to develop an efficient jailbreak method,\ndubbed $\\mathcal{I}$-GCG. In our experiments, we evaluate on a series of\nbenchmarks (such as NeurIPS 2023 Red Teaming Track). The results demonstrate\nthat our improved techniques can help GCG outperform state-of-the-art\njailbreaking attacks and achieve nearly 100% attack success rate. The code is\nreleased at https://github.com/jiaxiaojunQAQ/I-GCG.\n","authors":["Xiaojun Jia","Tianyu Pang","Chao Du","Yihao Huang","Jindong Gu","Yang Liu","Xiaochun Cao","Min Lin"],"pdf_url":"https://arxiv.org/pdf/2405.21018v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.13401v3","updated":"2024-05-31T16:59:17Z","published":"2024-05-22T07:21:32Z","title":"TrojanRAG: Retrieval-Augmented Generation Can Be Backdoor Driver in\n  Large Language Models","summary":"  Large language models (LLMs) have raised concerns about potential security\nthreats despite performing significantly in Natural Language Processing (NLP).\nBackdoor attacks initially verified that LLM is doing substantial harm at all\nstages, but the cost and robustness have been criticized. Attacking LLMs is\ninherently risky in security review, while prohibitively expensive. Besides,\nthe continuous iteration of LLMs will degrade the robustness of backdoors. In\nthis paper, we propose TrojanRAG, which employs a joint backdoor attack in the\nRetrieval-Augmented Generation, thereby manipulating LLMs in universal attack\nscenarios. Specifically, the adversary constructs elaborate target contexts and\ntrigger sets. Multiple pairs of backdoor shortcuts are orthogonally optimized\nby contrastive learning, thus constraining the triggering conditions to a\nparameter subspace to improve the matching. To improve the recall of the RAG\nfor the target contexts, we introduce a knowledge graph to construct structured\ndata to achieve hard matching at a fine-grained level. Moreover, we normalize\nthe backdoor scenarios in LLMs to analyze the real harm caused by backdoors\nfrom both attackers' and users' perspectives and further verify whether the\ncontext is a favorable tool for jailbreaking models. Extensive experimental\nresults on truthfulness, language understanding, and harmfulness show that\nTrojanRAG exhibits versatility threats while maintaining retrieval capabilities\non normal queries.\n","authors":["Pengzhou Cheng","Yidong Ding","Tianjie Ju","Zongru Wu","Wei Du","Ping Yi","Zhuosheng Zhang","Gongshen Liu"],"pdf_url":"https://arxiv.org/pdf/2405.13401v3.pdf","comment":"19 pages, 14 figures, 4 tables"},{"id":"http://arxiv.org/abs/2405.20999v1","updated":"2024-05-31T16:41:36Z","published":"2024-05-31T16:41:36Z","title":"Towards a Fluid computer","summary":"  In 1991, Moore [20] raised a question about whether hydrodynamics is capable\nof performing computations. Similarly, in 2016, Tao [25] asked whether a\nmechanical system, including a fluid flow, can simulate a universal Turing\nmachine. In this expository article, we review the construction in [8] of a\n\"Fluid computer\" in dimension 3 that combines techniques in symbolic dynamics\nwith the connection between steady Euler flows and contact geometry unveiled by\nEtnyre and Ghrist. In addition, we argue that the metric that renders the\nvector field Beltrami cannot be critical in the Chern-Hamilton sense [9]. We\nalso sketch the completely different construction for the Euclidean metric in\n$\\mathbb R^3$ as given in [7]. These results reveal the existence of\nundecidable fluid particle paths. We conclude the article with a list of open\nproblems.\n","authors":["Robert Cardona","Eva Miranda","Daniel Peralta-Salas"],"pdf_url":"https://arxiv.org/pdf/2405.20999v1.pdf","comment":"11 pages, 3 figures"},{"id":"http://arxiv.org/abs/2405.20994v1","updated":"2024-05-31T16:38:54Z","published":"2024-05-31T16:38:54Z","title":"CWRCzech: 100M Query-Document Czech Click Dataset and Its Application to\n  Web Relevance Ranking","summary":"  We present CWRCzech, Click Web Ranking dataset for Czech, a 100M\nquery-document Czech click dataset for relevance ranking with user behavior\ndata collected from search engine logs of Seznam.cz. To the best of our\nknowledge, CWRCzech is the largest click dataset with raw text published so\nfar. It provides document positions in the search results as well as\ninformation about user behavior: 27.6M clicked documents and 10.8M dwell times.\nIn addition, we also publish a manually annotated Czech test for the relevance\ntask, containing nearly 50k query-document pairs, each annotated by at least 2\nannotators. Finally, we analyze how the user behavior data improve relevance\nranking and show that models trained on data automatically harnessed at\nsufficient scale can surpass the performance of models trained on human\nannotated data. CWRCzech is published under an academic non-commercial license\nand is available to the research community at\nhttps://github.com/seznam/CWRCzech.\n","authors":["Josef Vonášek","Milan Straka","Rostislav Krč","Lenka Lasoňová","Ekaterina Egorova","Jana Straková","Jakub Náplava"],"pdf_url":"https://arxiv.org/pdf/2405.20994v1.pdf","comment":"Accepted to SIGIR 2024"},{"id":"http://arxiv.org/abs/2405.14622v3","updated":"2024-05-31T16:37:53Z","published":"2024-05-23T14:30:33Z","title":"Calibrated Self-Rewarding Vision Language Models","summary":"  Large Vision-Language Models (LVLMs) have made substantial progress by\nintegrating pre-trained large language models (LLMs) and vision models through\ninstruction tuning. Despite these advancements, LVLMs often exhibit the\nhallucination phenomenon, where generated text responses appear linguistically\nplausible but contradict the input image, indicating a misalignment between\nimage and text pairs. This misalignment arises because the model tends to\nprioritize textual information over visual input, even when both the language\nmodel and visual representations are of high quality. Existing methods leverage\nadditional models or human annotations to curate preference data and enhance\nmodality alignment through preference optimization. These approaches may not\neffectively reflect the target LVLM's preferences, making the curated\npreferences easily distinguishable. Our work addresses these challenges by\nproposing the Calibrated Self-Rewarding (CSR) approach, which enables the model\nto self-improve by iteratively generating candidate responses, evaluating the\nreward for each response, and curating preference data for fine-tuning. In the\nreward modeling, we employ a step-wise strategy and incorporate visual\nconstraints into the self-rewarding process to place greater emphasis on visual\ninput. Empirical results demonstrate that CSR enhances performance and reduces\nhallucinations across ten benchmarks and tasks, achieving substantial\nimprovements over existing methods by 7.62%. Our empirical results are further\nsupported by rigorous theoretical analysis, under mild assumptions, verifying\nthe effectiveness of introducing visual constraints into the self-rewarding\nparadigm. Additionally, CSR shows compatibility with different vision-language\nmodels and the ability to incrementally improve performance through iterative\nfine-tuning. Our data and code are available at\nhttps://github.com/YiyangZhou/CSR.\n","authors":["Yiyang Zhou","Zhiyuan Fan","Dongjie Cheng","Sihan Yang","Zhaorun Chen","Chenhang Cui","Xiyao Wang","Yun Li","Linjun Zhang","Huaxiu Yao"],"pdf_url":"https://arxiv.org/pdf/2405.14622v3.pdf","comment":"fix some typos and add acknowledgement section in V3"},{"id":"http://arxiv.org/abs/2310.02905v2","updated":"2024-05-31T16:27:53Z","published":"2023-10-02T02:01:16Z","title":"Use Your INSTINCT: INSTruction optimization for LLMs usIng Neural\n  bandits Coupled with Transformers","summary":"  Large language models (LLMs) have shown remarkable instruction-following\ncapabilities and achieved impressive performances in various applications.\nHowever, the performances of LLMs depend heavily on the instructions given to\nthem, which are typically manually tuned with substantial human efforts. Recent\nwork has used the query-efficient Bayesian optimization (BO) algorithm to\nautomatically optimize the instructions given to black-box LLMs. However, BO\nusually falls short when optimizing highly sophisticated (e.g.,\nhigh-dimensional) objective functions, such as the functions mapping an\ninstruction to the performance of an LLM. This is mainly due to the limited\nexpressive power of the Gaussian process (GP) which is used by BO as a\nsurrogate to model the objective function. Meanwhile, it has been repeatedly\nshown that neural networks (NNs), especially pre-trained transformers, possess\nstrong expressive power and can model highly complex functions. So, we adopt a\nneural bandit algorithm which replaces the GP in BO by an NN surrogate to\noptimize instructions for black-box LLMs. More importantly, the neural bandit\nalgorithm allows us to naturally couple the NN surrogate with the hidden\nrepresentation learned by a pre-trained transformer (i.e., an open-source LLM),\nwhich significantly boosts its performance. These motivate us to propose our\nINSTruction optimization usIng Neural bandits Coupled with Transformers\n(INSTINCT) algorithm. We perform instruction optimization for ChatGPT and use\nextensive experiments to show that INSTINCT consistently outperforms baselines\nin different tasks, e.g., various instruction induction tasks and the task of\nimproving zero-shot chain-of-thought instructions. Our code is available at\nhttps://github.com/xqlin98/INSTINCT.\n","authors":["Xiaoqiang Lin","Zhaoxuan Wu","Zhongxiang Dai","Wenyang Hu","Yao Shu","See-Kiong Ng","Patrick Jaillet","Bryan Kian Hsiang Low"],"pdf_url":"https://arxiv.org/pdf/2310.02905v2.pdf","comment":"Accepted to ICML 2024"},{"id":"http://arxiv.org/abs/2405.20974v1","updated":"2024-05-31T16:21:16Z","published":"2024-05-31T16:21:16Z","title":"SaySelf: Teaching LLMs to Express Confidence with Self-Reflective\n  Rationales","summary":"  Large language models (LLMs) often generate inaccurate or fabricated\ninformation and generally fail to indicate their confidence, which limits their\nbroader applications. Previous work elicits confidence from LLMs by direct or\nself-consistency prompting, or constructing specific datasets for supervised\nfinetuning. The prompting-based approaches have inferior performance, and the\ntraining-based approaches are limited to binary or inaccurate group-level\nconfidence estimates. In this work, we present the advanced SaySelf, a training\nframework that teaches LLMs to express more accurate fine-grained confidence\nestimates. In addition, beyond the confidence scores, SaySelf initiates the\nprocess of directing LLMs to produce self-reflective rationales that clearly\nidentify gaps in their parametric knowledge and explain their uncertainty. This\nis achieved by using an LLM to automatically summarize the uncertainties in\nspecific knowledge via natural language. The summarization is based on the\nanalysis of the inconsistency in multiple sampled reasoning chains, and the\nresulting data is utilized for supervised fine-tuning. Moreover, we utilize\nreinforcement learning with a meticulously crafted reward function to calibrate\nthe confidence estimates, motivating LLMs to deliver accurate, high-confidence\npredictions and to penalize overconfidence in erroneous outputs. Experimental\nresults in both in-distribution and out-of-distribution datasets demonstrate\nthe effectiveness of SaySelf in reducing the confidence calibration error and\nmaintaining the task performance. We show that the generated self-reflective\nrationales are reasonable and can further contribute to the calibration. The\ncode is made public at \\url{https://github.com/xu1868/SaySelf}.\n","authors":["Tianyang Xu","Shujin Wu","Shizhe Diao","Xiaoze Liu","Xingyao Wang","Yangyi Chen","Jing Gao"],"pdf_url":"https://arxiv.org/pdf/2405.20974v1.pdf","comment":"The code is available at \\url{https://github.com/xu1868/SaySelf}"},{"id":"http://arxiv.org/abs/2405.20973v1","updated":"2024-05-31T16:21:05Z","published":"2024-05-31T16:21:05Z","title":"LCQ: Low-Rank Codebook based Quantization for Large Language Models","summary":"  Large language models~(LLMs) have recently demonstrated promising performance\nin many tasks. However, the high storage and computational cost of LLMs has\nbecome a challenge for deploying LLMs. Weight quantization has been widely used\nfor model compression, which can reduce both storage and computational cost.\nMost existing weight quantization methods for LLMs use a rank-one codebook for\nquantization, which results in substantial accuracy loss when the compression\nratio is high. In this paper, we propose a novel weight quantization method,\ncalled low-rank codebook based quantization~(LCQ), for LLMs. LCQ adopts a\nlow-rank codebook, the rank of which can be larger than one, for quantization.\nExperiments show that LCQ can achieve better accuracy than existing methods\nwith a negligibly extra storage cost.\n","authors":["Wen-Pu Cai","Wu-Jun Li"],"pdf_url":"https://arxiv.org/pdf/2405.20973v1.pdf","comment":"10 pages, 5 figures"},{"id":"http://arxiv.org/abs/2405.20967v1","updated":"2024-05-31T16:14:06Z","published":"2024-05-31T16:14:06Z","title":"Superlatives in Context: Explicit and Implicit Domain Restrictions for\n  Superlative Frames","summary":"  Superlatives are used to single out elements with a maximal/minimal property.\nSemantically, superlatives perform a set comparison: something (or some things)\nhas the min/max property out of a set. As such, superlatives provide an ideal\nphenomenon for studying implicit phenomena and discourse restrictions. While\nthis comparison set is often not explicitly defined, its (implicit)\nrestrictions can be inferred from the discourse context the expression appears\nin. In this work we provide an extensive computational study on the semantics\nof superlatives. We propose a unified account of superlative semantics which\nallows us to derive a broad-coverage annotation schema. Using this unified\nschema we annotated a multi-domain dataset of superlatives and their semantic\ninterpretations. We specifically focus on interpreting implicit or ambiguous\nsuperlative expressions, by analyzing how the discourse context restricts the\nset of interpretations. In a set of experiments we then analyze how well models\nperform at variations of predicting superlative semantics, with and without\ncontext. We show that the fine-grained semantics of superlatives in context can\nbe challenging for contemporary models, including GPT-4.\n","authors":["Valentina Pyatkin","Bonnie Webber","Ido Dagan","Reut Tsarfaty"],"pdf_url":"https://arxiv.org/pdf/2405.20967v1.pdf","comment":"11 pages"},{"id":"http://arxiv.org/abs/2405.20962v1","updated":"2024-05-31T16:07:33Z","published":"2024-05-31T16:07:33Z","title":"Large Language Models are Zero-Shot Next Location Predictors","summary":"  Predicting the locations an individual will visit in the future is crucial\nfor solving many societal issues like disease diffusion and reduction of\npollution among many others. The models designed to tackle next-location\nprediction, however, require a significant amount of individual-level\ninformation to be trained effectively. Such data may be scarce or even\nunavailable in some geographic regions or peculiar scenarios (e.g., cold-start\nin recommendation systems). Moreover, the design of a next-location predictor\nable to generalize or geographically transfer knowledge is still an open\nresearch challenge. Recent advances in natural language processing have led to\na rapid diffusion of Large Language Models (LLMs) which have shown good\ngeneralization and reasoning capabilities. These insights, coupled with the\nrecent findings that LLMs are rich in geographical knowledge, allowed us to\nbelieve that these models can act as zero-shot next-location predictors. This\npaper evaluates the capabilities of many popular LLMs in this role,\nspecifically Llama, GPT-3.5 and Mistral 7B. After designing a proper prompt, we\ntested the models on three real-world mobility datasets. The results show that\nLLMs can obtain accuracies up to 32.4%, a significant relative improvement of\nover 600% when compared to sophisticated DL models specifically designed for\nhuman mobility. Moreover, we show that other LLMs are unable to perform the\ntask properly. To prevent positively biased results, we also propose a\nframework inspired by other studies to test data contamination. Finally, we\nexplored the possibility of using LLMs as text-based explainers for\nnext-location prediction showing that can effectively provide an explanation\nfor their decision. Notably, 7B models provide more generic, but still\nreliable, explanations compared to larger counterparts. Code:\ngithub.com/ssai-trento/LLM-zero-shot-NL\n","authors":["Ciro Beneduce","Bruno Lepri","Massimiliano Luca"],"pdf_url":"https://arxiv.org/pdf/2405.20962v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.09894v2","updated":"2024-05-31T16:00:05Z","published":"2024-02-15T11:39:11Z","title":"Not Just Novelty: A Longitudinal Study on Utility and Customization of\n  an AI Workflow","summary":"  Generative AI brings novel and impressive abilities to help people in\neveryday tasks. There are many AI workflows that solve real and complex\nproblems by chaining AI outputs together with human interaction. Although there\nis an undeniable lure of AI, it is uncertain how useful generative AI workflows\nare after the novelty wears off. Additionally, workflows built with generative\nAI have the potential to be easily customized to fit users' individual needs,\nbut do users take advantage of this? We conducted a three-week longitudinal\nstudy with 12 users to understand the familiarization and customization of\ngenerative AI tools for science communication. Our study revealed that there\nexists a familiarization phase, during which users were exploring the novel\ncapabilities of the workflow and discovering which aspects they found useful.\nAfter this phase, users understood the workflow and were able to anticipate the\noutputs. Surprisingly, after familiarization the perceived utility of the\nsystem was rated higher than before, indicating that the perceived utility of\nAI is not just a novelty effect. The increase in benefits mainly comes from\nend-users' ability to customize prompts, and thus potentially appropriate the\nsystem to their own needs. This points to a future where generative AI systems\ncan allow us to design for appropriation.\n","authors":["Tao Long","Katy Ilonka Gero","Lydia B. Chilton"],"pdf_url":"https://arxiv.org/pdf/2402.09894v2.pdf","comment":"22 pages, 16 figures. ACM Conference on Designing Interactive Systems\n  (DIS 2024)"},{"id":"http://arxiv.org/abs/2402.04513v2","updated":"2024-05-31T15:59:34Z","published":"2024-02-07T01:46:50Z","title":"Online Cascade Learning for Efficient Inference over Streams","summary":"  Large Language Models (LLMs) have a natural role in answering complex queries\nabout data streams, but the high computational cost of LLM inference makes them\ninfeasible in many such tasks. We propose online cascade learning, the first\napproach to address this challenge. The objective here is to learn a \"cascade\"\nof models, starting with lower-capacity models (such as logistic regression)\nand ending with a powerful LLM, along with a deferral policy that determines\nthe model to be used on a given input. We formulate the task of learning\ncascades online as an imitation-learning problem, where smaller models are\nupdated over time imitating the collected LLM demonstrations, and give a\nno-regret algorithm for the problem. Experimental results across four\nbenchmarks show that our method parallels LLMs in accuracy while cutting down\ninference costs by as much as 90% with strong robustness against input\ndistribution shifts, underscoring its efficacy and adaptability in stream\nprocessing.\n","authors":["Lunyiu Nie","Zhimin Ding","Erdong Hu","Christopher Jermaine","Swarat Chaudhuri"],"pdf_url":"https://arxiv.org/pdf/2402.04513v2.pdf","comment":"ICML 2024 Main Conference Paper"},{"id":"http://arxiv.org/abs/2402.08638v5","updated":"2024-05-31T15:57:58Z","published":"2024-02-13T18:04:53Z","title":"SemRel2024: A Collection of Semantic Textual Relatedness Datasets for 13\n  Languages","summary":"  Exploring and quantifying semantic relatedness is central to representing\nlanguage and holds significant implications across various NLP tasks. While\nearlier NLP research primarily focused on semantic similarity, often within the\nEnglish language context, we instead investigate the broader phenomenon of\nsemantic relatedness. In this paper, we present \\textit{SemRel}, a new semantic\nrelatedness dataset collection annotated by native speakers across 13\nlanguages: \\textit{Afrikaans, Algerian Arabic, Amharic, English, Hausa, Hindi,\nIndonesian, Kinyarwanda, Marathi, Moroccan Arabic, Modern Standard Arabic,\nSpanish,} and \\textit{Telugu}. These languages originate from five distinct\nlanguage families and are predominantly spoken in Africa and Asia -- regions\ncharacterised by a relatively limited availability of NLP resources. Each\ninstance in the SemRel datasets is a sentence pair associated with a score that\nrepresents the degree of semantic textual relatedness between the two\nsentences. The scores are obtained using a comparative annotation framework. We\ndescribe the data collection and annotation processes, challenges when building\nthe datasets, baseline experiments, and their impact and utility in NLP.\n","authors":["Nedjma Ousidhoum","Shamsuddeen Hassan Muhammad","Mohamed Abdalla","Idris Abdulmumin","Ibrahim Said Ahmad","Sanchit Ahuja","Alham Fikri Aji","Vladimir Araujo","Abinew Ali Ayele","Pavan Baswani","Meriem Beloucif","Chris Biemann","Sofia Bourhim","Christine De Kock","Genet Shanko Dekebo","Oumaima Hourrane","Gopichand Kanumolu","Lokesh Madasu","Samuel Rutunda","Manish Shrivastava","Thamar Solorio","Nirmal Surange","Hailegnaw Getaneh Tilaye","Krishnapriya Vishnubhotla","Genta Winata","Seid Muhie Yimam","Saif M. Mohammad"],"pdf_url":"https://arxiv.org/pdf/2402.08638v5.pdf","comment":"Accepted to the Findings of ACL 2024"},{"id":"http://arxiv.org/abs/2405.20956v1","updated":"2024-05-31T15:55:51Z","published":"2024-05-31T15:55:51Z","title":"A Robot Walks into a Bar: Can Language Models Serve asCreativity Support\n  Tools for Comedy? An Evaluation of LLMs' Humour Alignment with Comedians","summary":"  We interviewed twenty professional comedians who perform live shows in front\nof audiences and who use artificial intelligence in their artistic process as\npart of 3-hour workshops on ``AI x Comedy'' conducted at the Edinburgh Festival\nFringe in August 2023 and online. The workshop consisted of a comedy writing\nsession with large language models (LLMs), a human-computer interaction\nquestionnaire to assess the Creativity Support Index of AI as a writing tool,\nand a focus group interrogating the comedians' motivations for and processes of\nusing AI, as well as their ethical concerns about bias, censorship and\ncopyright. Participants noted that existing moderation strategies used in\nsafety filtering and instruction-tuned LLMs reinforced hegemonic viewpoints by\nerasing minority groups and their perspectives, and qualified this as a form of\ncensorship. At the same time, most participants felt the LLMs did not succeed\nas a creativity support tool, by producing bland and biased comedy tropes, akin\nto ``cruise ship comedy material from the 1950s, but a bit less racist''. Our\nwork extends scholarship about the subtle difference between, one the one hand,\nharmful speech, and on the other hand, ``offensive'' language as a practice of\nresistance, satire and ``punching up''. We also interrogate the global value\nalignment behind such language models, and discuss the importance of\ncommunity-based value alignment and data ownership to build AI tools that\nbetter suit artists' needs.\n","authors":["Piotr Wojciech Mirowski","Juliette Love","Kory W. Mathewson","Shakir Mohamed"],"pdf_url":"https://arxiv.org/pdf/2405.20956v1.pdf","comment":"15 pages, 1 figure, published at ACM FAccT 2024"},{"id":"http://arxiv.org/abs/2405.20947v1","updated":"2024-05-31T15:44:33Z","published":"2024-05-31T15:44:33Z","title":"OR-Bench: An Over-Refusal Benchmark for Large Language Models","summary":"  Large Language Models (LLMs) require careful safety alignment to prevent\nmalicious outputs. While significant research focuses on mitigating harmful\ncontent generation, the enhanced safety often come with the side effect of\nover-refusal, where the LLMs may reject innocuous prompts and become less\nhelpful. Although the issue of over-refusal has been empirically observed, a\nsystematic measurement is challenging due to the difficulty of crafting prompts\nthat appear harmful but are benign. This study proposes a novel method for\nautomatically generating large-scale sets of ``seemingly toxic prompts''\n(benign prompts likely rejected by LLMs). Leveraging this technique, we\nintroduce OR-Bench, the first large-scale over-refusal benchmark. OR-Bench\ncomprises 80,000 seemingly toxic prompts across 10 common rejection categories,\na subset of around 1,000 hard prompts that are challenging even for\nstate-of-the-art LLMs, and an additional 600 toxic prompts to prevent\nindiscriminate responses. We then conduct a comprehensive study to measure the\nover-refusal of 25 popular LLMs across 8 model families. Our datasets are\navailable at https://huggingface.co/datasets/bench-llm/OR-Bench and the\ncorresponding demo can be found at\nhttps://huggingface.co/spaces/bench-llm/or-bench. We hope this benchmark can\nhelp the community develop better safety aligned models.\n","authors":["Justin Cui","Wei-Lin Chiang","Ion Stoica","Cho-Jui Hsieh"],"pdf_url":"https://arxiv.org/pdf/2405.20947v1.pdf","comment":"version 1"},{"id":"http://arxiv.org/abs/2405.18669v2","updated":"2024-05-31T15:42:53Z","published":"2024-05-29T00:23:55Z","title":"Zipper: A Multi-Tower Decoder Architecture for Fusing Modalities","summary":"  Integrating multiple generative foundation models, especially those trained\non different modalities, into something greater than the sum of its parts poses\nsignificant challenges. Two key hurdles are the availability of aligned data\n(concepts that contain similar meaning but is expressed differently in\ndifferent modalities), and effectively leveraging unimodal representations in\ncross-domain generative tasks, without compromising their original unimodal\ncapabilities.\n  We propose Zipper, a multi-tower decoder architecture that addresses these\nconcerns by using cross-attention to flexibly compose multimodal generative\nmodels from independently pre-trained unimodal decoders. In our experiments\nfusing speech and text modalities, we show the proposed architecture performs\nvery competitively in scenarios with limited aligned text-speech data. We also\nshowcase the flexibility of our model to selectively maintain unimodal (e.g.,\ntext-to-text generation) generation performance by freezing the corresponding\nmodal tower (e.g. text). In cross-modal tasks such as automatic speech\nrecognition (ASR) where the output modality is text, we show that freezing the\ntext backbone results in negligible performance degradation. In cross-modal\ntasks such as text-to-speech generation (TTS) where the output modality is\nspeech, we show that using a pre-trained speech backbone results in superior\nperformance to the baseline.\n","authors":["Vicky Zayats","Peter Chen","Melissa Ferrari","Dirk Padfield"],"pdf_url":"https://arxiv.org/pdf/2405.18669v2.pdf","comment":"Under review at NeurIPS"},{"id":"http://arxiv.org/abs/2310.00835v3","updated":"2024-05-31T15:36:09Z","published":"2023-10-02T00:59:07Z","title":"TRAM: Benchmarking Temporal Reasoning for Large Language Models","summary":"  Reasoning about time is essential for understanding the nuances of events\ndescribed in natural language. Previous research on this topic has been limited\nin scope, characterized by a lack of standardized benchmarks that would allow\nfor consistent evaluations across different studies. In this paper, we\nintroduce TRAM, a temporal reasoning benchmark composed of ten datasets,\nencompassing various temporal aspects of events such as order, arithmetic,\nfrequency, and duration, designed to facilitate a comprehensive evaluation of\nthe TeR capabilities of large language models (LLMs). We evaluate popular LLMs\nlike GPT-4 and Llama2 in zero-shot and few-shot scenarios, and establish\nbaselines with BERT-based and domain-specific models. Our findings indicate\nthat the best-performing model lags significantly behind human performance. It\nis our aspiration that TRAM will spur further progress in enhancing the TeR\ncapabilities of LLMs.\n","authors":["Yuqing Wang","Yun Zhao"],"pdf_url":"https://arxiv.org/pdf/2310.00835v3.pdf","comment":"Findings of ACL 2024"},{"id":"http://arxiv.org/abs/2308.01399v2","updated":"2024-05-31T15:32:02Z","published":"2023-07-31T17:57:49Z","title":"Learning to Model the World with Language","summary":"  To interact with humans and act in the world, agents need to understand the\nrange of language that people use and relate it to the visual world. While\ncurrent agents can learn to execute simple language instructions, we aim to\nbuild agents that leverage diverse language -- language like \"this button turns\non the TV\" or \"I put the bowls away\" -- that conveys general knowledge,\ndescribes the state of the world, provides interactive feedback, and more. Our\nkey idea is that agents should interpret such diverse language as a signal that\nhelps them predict the future: what they will observe, how the world will\nbehave, and which situations will be rewarded. This perspective unifies\nlanguage understanding with future prediction as a powerful self-supervised\nlearning objective. We instantiate this in Dynalang, an agent that learns a\nmultimodal world model to predict future text and image representations, and\nlearns to act from imagined model rollouts. While current methods that learn\nlanguage-conditioned policies degrade in performance with more diverse types of\nlanguage, we show that Dynalang learns to leverage environment descriptions,\ngame rules, and instructions to excel on tasks ranging from game-playing to\nnavigating photorealistic home scans. Finally, we show that our method enables\nadditional capabilities due to learning a generative model: Dynalang can be\npretrained on text-only data, enabling learning from offline datasets, and\ngenerate language grounded in an environment.\n","authors":["Jessy Lin","Yuqing Du","Olivia Watkins","Danijar Hafner","Pieter Abbeel","Dan Klein","Anca Dragan"],"pdf_url":"https://arxiv.org/pdf/2308.01399v2.pdf","comment":"ICML 2024. Website: https://dynalang.github.io/"},{"id":"http://arxiv.org/abs/2405.20917v1","updated":"2024-05-31T15:21:53Z","published":"2024-05-31T15:21:53Z","title":"Learning to Estimate System Specifications in Linear Temporal Logic\n  using Transformers and Mamba","summary":"  Temporal logic is a framework for representing and reasoning about\npropositions that evolve over time. It is commonly used for specifying\nrequirements in various domains, including hardware and software systems, as\nwell as robotics. Specification mining or formula generation involves\nextracting temporal logic formulae from system traces and has numerous\napplications, such as detecting bugs and improving interpretability. Although\nthere has been a surge of deep learning-based methods for temporal logic\nsatisfiability checking in recent years, the specification mining literature\nhas been lagging behind in adopting deep learning methods despite their many\nadvantages, such as scalability. In this paper, we introduce autoregressive\nmodels that can generate linear temporal logic formulae from traces, towards\naddressing the specification mining problem. We propose multiple architectures\nfor this task: transformer encoder-decoder, decoder-only transformer, and\nMamba, which is an emerging alternative to transformer models. Additionally, we\ndevise a metric for quantifying the distinctiveness of the generated formulae\nand a straightforward algorithm to enforce the syntax constraints. Our\nexperiments show that the proposed architectures yield promising results,\ngenerating correct and distinct formulae at a fraction of the compute cost\nneeded for the combinatorial baseline.\n","authors":["İlker Işık","Ebru Aydin Gol","Ramazan Gokberk Cinbis"],"pdf_url":"https://arxiv.org/pdf/2405.20917v1.pdf","comment":"20 pages, 15 figures"},{"id":"http://arxiv.org/abs/2404.07611v2","updated":"2024-05-31T15:19:18Z","published":"2024-04-11T09:59:01Z","title":"NoticIA: A Clickbait Article Summarization Dataset in Spanish","summary":"  We present NoticIA, a dataset consisting of 850 Spanish news articles\nfeaturing prominent clickbait headlines, each paired with high-quality,\nsingle-sentence generative summarizations written by humans. This task demands\nadvanced text understanding and summarization abilities, challenging the\nmodels' capacity to infer and connect diverse pieces of information to meet the\nuser's informational needs generated by the clickbait headline. We evaluate the\nSpanish text comprehension capabilities of a wide range of state-of-the-art\nlarge language models. Additionally, we use the dataset to train\nClickbaitFighter, a task-specific model that achieves near-human performance in\nthis task.\n","authors":["Iker García-Ferrero","Begoña Altuna"],"pdf_url":"https://arxiv.org/pdf/2404.07611v2.pdf","comment":"Accepted in the journal Procesamiento del Lenguaje Natural"},{"id":"http://arxiv.org/abs/2405.20906v1","updated":"2024-05-31T15:17:47Z","published":"2024-05-31T15:17:47Z","title":"Enhancing Vision Models for Text-Heavy Content Understanding and\n  Interaction","summary":"  Interacting and understanding with text heavy visual content with multiple\nimages is a major challenge for traditional vision models. This paper is on\nenhancing vision models' capability to comprehend or understand and learn from\nimages containing a huge amount of textual information from the likes of\ntextbooks and research papers which contain multiple images like graphs, etc\nand tables in them with different types of axes and scales. The approach\ninvolves dataset preprocessing, fine tuning which is by using instructional\noriented data and evaluation. We also built a visual chat application\nintegrating CLIP for image encoding and a model from the Massive Text Embedding\nBenchmark which is developed to consider both textual and visual inputs. An\naccuracy of 96.71% was obtained. The aim of the project is to increase and also\nenhance the advance vision models' capabilities in understanding complex visual\ntextual data interconnected data, contributing to multimodal AI.\n","authors":["Adithya TG","Adithya SK","Abhinav R Bharadwaj","Abhiram HA","Dr. Surabhi Narayan"],"pdf_url":"https://arxiv.org/pdf/2405.20906v1.pdf","comment":"5 pages, 4 figures (including 1 graph)"},{"id":"http://arxiv.org/abs/2402.03962v3","updated":"2024-05-31T15:16:21Z","published":"2024-02-06T12:42:21Z","title":"Position: Stop Making Unscientific AGI Performance Claims","summary":"  Developments in the field of Artificial Intelligence (AI), and particularly\nlarge language models (LLMs), have created a 'perfect storm' for observing\n'sparks' of Artificial General Intelligence (AGI) that are spurious. Like\nsimpler models, LLMs distill meaningful representations in their latent\nembeddings that have been shown to correlate with external variables.\nNonetheless, the correlation of such representations has often been linked to\nhuman-like intelligence in the latter but not the former. We probe models of\nvarying complexity including random projections, matrix decompositions, deep\nautoencoders and transformers: all of them successfully distill information\nthat can be used to predict latent or external variables and yet none of them\nhave previously been linked to AGI. We argue and empirically demonstrate that\nthe finding of meaningful patterns in latent spaces of models cannot be seen as\nevidence in favor of AGI. Additionally, we review literature from the social\nsciences that shows that humans are prone to seek such patterns and\nanthropomorphize. We conclude that both the methodological setup and common\npublic image of AI are ideal for the misinterpretation that correlations\nbetween model representations and some variables of interest are 'caused' by\nthe model's understanding of underlying 'ground truth' relationships. We,\ntherefore, call for the academic community to exercise extra caution, and to be\nkeenly aware of principles of academic integrity, in interpreting and\ncommunicating about AI research outcomes.\n","authors":["Patrick Altmeyer","Andrew M. Demetriou","Antony Bartlett","Cynthia C. S. Liem"],"pdf_url":"https://arxiv.org/pdf/2402.03962v3.pdf","comment":"21 pages, 15 figures. Pre-print to be published at International\n  Conference on Machine Learning (ICML) 2024"},{"id":"http://arxiv.org/abs/2405.20902v1","updated":"2024-05-31T15:15:04Z","published":"2024-05-31T15:15:04Z","title":"Preemptive Answer \"Attacks\" on Chain-of-Thought Reasoning","summary":"  Large language models (LLMs) showcase impressive reasoning capabilities when\ncoupled with Chain-of-Thought (CoT) prompting. However, the robustness of this\napproach warrants further investigation. In this paper, we introduce a novel\nscenario termed preemptive answers, where the LLM obtains an answer before\nengaging in reasoning. This situation can arise inadvertently or induced by\nmalicious users by prompt injection attacks. Experiments reveal that preemptive\nanswers significantly impair the model's reasoning capability across various\nCoT methods and a broad spectrum of datasets. To bolster the robustness of\nreasoning, we propose two measures aimed at mitigating this issue to some\nextent.\n","authors":["Rongwu Xu","Zehan Qi","Wei Xu"],"pdf_url":"https://arxiv.org/pdf/2405.20902v1.pdf","comment":"Accepted to ACL'24 (Findings). Camera-ready version"},{"id":"http://arxiv.org/abs/2312.09085v5","updated":"2024-05-31T15:13:33Z","published":"2023-12-14T16:16:50Z","title":"The Earth is Flat because...: Investigating LLMs' Belief towards\n  Misinformation via Persuasive Conversation","summary":"  Large language models (LLMs) encapsulate vast amounts of knowledge but still\nremain vulnerable to external misinformation. Existing research mainly studied\nthis susceptibility behavior in a single-turn setting. However, belief can\nchange during a multi-turn conversation, especially a persuasive one.\nTherefore, in this study, we delve into LLMs' susceptibility to persuasive\nconversations, particularly on factual questions that they can answer\ncorrectly. We first curate the Farm (i.e., Fact to Misinform) dataset, which\ncontains factual questions paired with systematically generated persuasive\nmisinformation. Then, we develop a testing framework to track LLMs' belief\nchanges in a persuasive dialogue. Through extensive experiments, we find that\nLLMs' correct beliefs on factual knowledge can be easily manipulated by various\npersuasive strategies.\n","authors":["Rongwu Xu","Brian S. Lin","Shujian Yang","Tianqi Zhang","Weiyan Shi","Tianwei Zhang","Zhixuan Fang","Wei Xu","Han Qiu"],"pdf_url":"https://arxiv.org/pdf/2312.09085v5.pdf","comment":"Accepted to ACL'24 (Main). Camera-ready version"},{"id":"http://arxiv.org/abs/2405.20900v1","updated":"2024-05-31T15:12:33Z","published":"2024-05-31T15:12:33Z","title":"Large Language Models: A New Approach for Privacy Policy Analysis at\n  Scale","summary":"  The number and dynamic nature of web and mobile applications presents\nsignificant challenges for assessing their compliance with data protection\nlaws. In this context, symbolic and statistical Natural Language Processing\n(NLP) techniques have been employed for the automated analysis of these\nsystems' privacy policies. However, these techniques typically require\nlabor-intensive and potentially error-prone manually annotated datasets for\ntraining and validation. This research proposes the application of Large\nLanguage Models (LLMs) as an alternative for effectively and efficiently\nextracting privacy practices from privacy policies at scale. Particularly, we\nleverage well-known LLMs such as ChatGPT and Llama 2, and offer guidance on the\noptimal design of prompts, parameters, and models, incorporating advanced\nstrategies such as few-shot learning. We further illustrate its capability to\ndetect detailed and varied privacy practices accurately. Using several renowned\ndatasets in the domain as a benchmark, our evaluation validates its exceptional\nperformance, achieving an F1 score exceeding 93%. Besides, it does so with\nreduced costs, faster processing times, and fewer technical knowledge\nrequirements. Consequently, we advocate for LLM-based solutions as a sound\nalternative to traditional NLP techniques for the automated analysis of privacy\npolicies at scale.\n","authors":["David Rodriguez","Ian Yang","Jose M. Del Alamo","Norman Sadeh"],"pdf_url":"https://arxiv.org/pdf/2405.20900v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.20895v1","updated":"2024-05-31T15:04:15Z","published":"2024-05-31T15:04:15Z","title":"A comparison of correspondence analysis with PMI-based word embedding\n  methods","summary":"  Popular word embedding methods such as GloVe and Word2Vec are related to the\nfactorization of the pointwise mutual information (PMI) matrix. In this paper,\nwe link correspondence analysis (CA) to the factorization of the PMI matrix. CA\nis a dimensionality reduction method that uses singular value decomposition\n(SVD), and we show that CA is mathematically close to the weighted\nfactorization of the PMI matrix. In addition, we present variants of CA that\nturn out to be successful in the factorization of the word-context matrix, i.e.\nCA applied to a matrix where the entries undergo a square-root transformation\n(ROOT-CA) and a root-root transformation (ROOTROOT-CA). An empirical comparison\namong CA- and PMI-based methods shows that overall results of ROOT-CA and\nROOTROOT-CA are slightly better than those of the PMI-based methods.\n","authors":["Qianqian Qi","David J. Hessen","Peter G. M. van der Heijden"],"pdf_url":"https://arxiv.org/pdf/2405.20895v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.15032v2","updated":"2024-05-31T14:47:55Z","published":"2024-05-23T20:10:38Z","title":"Aya 23: Open Weight Releases to Further Multilingual Progress","summary":"  This technical report introduces Aya 23, a family of multilingual language\nmodels. Aya 23 builds on the recent release of the Aya model (\\\"Ust\\\"un et al.,\n2024), focusing on pairing a highly performant pre-trained model with the\nrecently released Aya collection (Singh et al., 2024). The result is a powerful\nmultilingual large language model serving 23 languages, expanding state-of-art\nlanguage modeling capabilities to approximately half of the world's population.\nThe Aya model covered 101 languages whereas Aya 23 is an experiment in depth vs\nbreadth, exploring the impact of allocating more capacity to fewer languages\nthat are included during pre-training. Aya 23 outperforms both previous\nmassively multilingual models like Aya 101 for the languages it covers, as well\nas widely used models like Gemma, Mistral and Mixtral on an extensive range of\ndiscriminative and generative tasks. We release the open weights for both the\n8B and 35B models as part of our continued commitment for expanding access to\nmultilingual progress.\n","authors":["Viraat Aryabumi","John Dang","Dwarak Talupuru","Saurabh Dash","David Cairuz","Hangyu Lin","Bharat Venkitesh","Madeline Smith","Jon Ander Campos","Yi Chern Tan","Kelly Marchisio","Max Bartolo","Sebastian Ruder","Acyr Locatelli","Julia Kreutzer","Nick Frosst","Aidan Gomez","Phil Blunsom","Marzieh Fadaee","Ahmet Üstün","Sara Hooker"],"pdf_url":"https://arxiv.org/pdf/2405.15032v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.20859v1","updated":"2024-05-31T14:43:31Z","published":"2024-05-31T14:43:31Z","title":"clembench-2024: A Challenging, Dynamic, Complementary, Multilingual\n  Benchmark and Underlying Flexible Framework for LLMs as Multi-Action Agents","summary":"  It has been established in recent work that Large Language Models (LLMs) can\nbe prompted to \"self-play\" conversational games that probe certain capabilities\n(general instruction following, strategic goal orientation, language\nunderstanding abilities), where the resulting interactive game play can be\nautomatically scored. In this paper, we take one of the proposed frameworks for\nsetting up such game-play environments, and further test its usefulness as an\nevaluation instrument, along a number of dimensions: We show that it can easily\nkeep up with new developments while avoiding data contamination, we show that\nthe tests implemented within it are not yet saturated (human performance is\nsubstantially higher than that of even the best models), and we show that it\nlends itself to investigating additional questions, such as the impact of the\nprompting language on performance. We believe that the approach forms a good\nbasis for making decisions on model choice for building applied interactive\nsystems, and perhaps ultimately setting up a closed-loop development\nenvironment of system and simulated evaluator.\n","authors":["Anne Beyer","Kranti Chalamalasetti","Sherzod Hakimov","Brielen Madureira","Philipp Sadler","David Schlangen"],"pdf_url":"https://arxiv.org/pdf/2405.20859v1.pdf","comment":"under review"},{"id":"http://arxiv.org/abs/2405.20852v1","updated":"2024-05-31T14:34:23Z","published":"2024-05-31T14:34:23Z","title":"Towards Spoken Language Understanding via Multi-level Multi-grained\n  Contrastive Learning","summary":"  Spoken language understanding (SLU) is a core task in task-oriented dialogue\nsystems, which aims at understanding the user's current goal through\nconstructing semantic frames. SLU usually consists of two subtasks, including\nintent detection and slot filling. Although there are some SLU frameworks joint\nmodeling the two subtasks and achieving high performance, most of them still\noverlook the inherent relationships between intents and slots and fail to\nachieve mutual guidance between the two subtasks. To solve the problem, we\npropose a multi-level multi-grained SLU framework MMCL to apply contrastive\nlearning at three levels, including utterance level, slot level, and word level\nto enable intent and slot to mutually guide each other. For the utterance\nlevel, our framework implements coarse granularity contrastive learning and\nfine granularity contrastive learning simultaneously. Besides, we also apply\nthe self-distillation method to improve the robustness of the model.\nExperimental results and further analysis demonstrate that our proposed model\nachieves new state-of-the-art results on two public multi-intent SLU datasets,\nobtaining a 2.6 overall accuracy improvement on the MixATIS dataset compared to\nprevious best models.\n","authors":["Xuxin Cheng","Wanshi Xu","Zhihong Zhu","Hongxiang Li","Yuexian Zou"],"pdf_url":"https://arxiv.org/pdf/2405.20852v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.20850v1","updated":"2024-05-31T14:33:07Z","published":"2024-05-31T14:33:07Z","title":"Improving Reward Models with Synthetic Critiques","summary":"  Reward models (RM) play a critical role in aligning language models through\nthe process of reinforcement learning from human feedback. RMs are trained to\npredict a score reflecting human preference, which requires significant time\nand cost for human annotation. Additionally, RMs tend to quickly overfit on\nsuperficial features in the training set, hindering their generalization\nperformance on unseen distributions. We propose a novel approach using\nsynthetic natural language critiques generated by large language models to\nprovide additional feedback, evaluating aspects such as instruction following,\ncorrectness, and style. This offers richer signals and more robust features for\nRMs to assess and score on. We demonstrate that high-quality critiques improve\nthe performance and data efficiency of RMs initialized from different\npretrained models. Conversely, we also show that low-quality critiques\nnegatively impact performance. Furthermore, incorporating critiques enhances\nthe interpretability and robustness of RM training.\n","authors":["Zihuiwen Ye","Fraser Greenlee-Scott","Max Bartolo","Phil Blunsom","Jon Ander Campos","Matthias Gallé"],"pdf_url":"https://arxiv.org/pdf/2405.20850v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.20846v1","updated":"2024-05-31T14:31:46Z","published":"2024-05-31T14:31:46Z","title":"Don't Buy it! Reassessing the Ad Understanding Abilities of Contrastive\n  Multimodal Models","summary":"  Image-based advertisements are complex multimodal stimuli that often contain\nunusual visual elements and figurative language. Previous research on automatic\nad understanding has reported impressive zero-shot accuracy of contrastive\nvision-and-language models (VLMs) on an ad-explanation retrieval task. Here, we\nexamine the original task setup and show that contrastive VLMs can solve it by\nexploiting grounding heuristics. To control for this confound, we introduce\nTRADE, a new evaluation test set with adversarial grounded explanations. While\nthese explanations look implausible to humans, we show that they \"fool\" four\ndifferent contrastive VLMs. Our findings highlight the need for an improved\noperationalisation of automatic ad understanding that truly evaluates VLMs'\nmultimodal reasoning abilities. We make our code and TRADE available at\nhttps://github.com/dmg-illc/trade .\n","authors":["A. Bavaresco","A. Testoni","R. Fernández"],"pdf_url":"https://arxiv.org/pdf/2405.20846v1.pdf","comment":"Accepted to the main conference ACL 2024"},{"id":"http://arxiv.org/abs/2204.09140v2","updated":"2024-05-31T14:28:40Z","published":"2022-04-19T21:55:18Z","title":"Multi-hop Question Answering","summary":"  The task of Question Answering (QA) has attracted significant research\ninterest for long. Its relevance to language understanding and knowledge\nretrieval tasks, along with the simple setting makes the task of QA crucial for\nstrong AI systems. Recent success on simple QA tasks has shifted the focus to\nmore complex settings. Among these, Multi-Hop QA (MHQA) is one of the most\nresearched tasks over the recent years. In broad terms, MHQA is the task of\nanswering natural language questions that involve extracting and combining\nmultiple pieces of information and doing multiple steps of reasoning. An\nexample of a multi-hop question would be \"The Argentine PGA Championship record\nholder has won how many tournaments worldwide?\". Answering the question would\nneed two pieces of information: \"Who is the record holder for Argentine PGA\nChampionship tournaments?\" and \"How many tournaments did [Answer of Sub Q1]\nwin?\". The ability to answer multi-hop questions and perform multi step\nreasoning can significantly improve the utility of NLP systems. Consequently,\nthe field has seen a surge with high quality datasets, models and evaluation\nstrategies. The notion of 'multiple hops' is somewhat abstract which results in\na large variety of tasks that require multi-hop reasoning. This leads to\ndifferent datasets and models that differ significantly from each other and\nmakes the field challenging to generalize and survey. We aim to provide a\ngeneral and formal definition of the MHQA task, and organize and summarize\nexisting MHQA frameworks. We also outline some best practices for building MHQA\ndatasets. This book provides a systematic and thorough introduction as well as\nthe structuring of the existing attempts to this highly interesting, yet quite\nchallenging task.\n","authors":["Vaibhav Mavi","Anubhav Jangra","Adam Jatowt"],"pdf_url":"https://arxiv.org/pdf/2204.09140v2.pdf","comment":"Published at Foundations and Trends in Information Retrieval"},{"id":"http://arxiv.org/abs/2405.20835v1","updated":"2024-05-31T14:24:33Z","published":"2024-05-31T14:24:33Z","title":"Outliers and Calibration Sets have Diminishing Effect on Quantization of\n  Modern LLMs","summary":"  Post-Training Quantization (PTQ) enhances the efficiency of Large Language\nModels (LLMs) by enabling faster operation and compatibility with more\naccessible hardware through reduced memory usage, at the cost of small\nperformance drops. We explore the role of calibration sets in PTQ, specifically\ntheir effect on hidden activations in various notable open-source LLMs.\nCalibration sets are crucial for evaluating activation magnitudes and\nidentifying outliers, which can distort the quantization range and negatively\nimpact performance. Our analysis reveals a marked contrast in quantization\neffectiveness across models. The older OPT model, which much of the\nquantization literature is based on, shows significant performance\ndeterioration and high susceptibility to outliers with varying calibration\nsets. In contrast, newer models like Llama-2 7B, Llama-3 8B, Command-R 35B, and\nMistral 7B demonstrate strong robustness, with Mistral 7B showing near-immunity\nto outliers and stable activations. These findings suggest a shift in PTQ\nstrategies might be needed. As advancements in pre-training methods reduce the\nrelevance of outliers, there is an emerging need to reassess the fundamentals\nof current quantization literature. The emphasis should pivot towards\noptimizing inference speed, rather than primarily focusing on outlier\npreservation, to align with the evolving characteristics of state-of-the-art\nLLMs.\n","authors":["Davide Paglieri","Saurabh Dash","Tim Rocktäschel","Jack Parker-Holder"],"pdf_url":"https://arxiv.org/pdf/2405.20835v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.20833v1","updated":"2024-05-31T14:23:30Z","published":"2024-05-31T14:23:30Z","title":"That's Optional: A Contemporary Exploration of \"that\" Omission in\n  English Subordinate Clauses","summary":"  The Uniform Information Density (UID) hypothesis posits that speakers\noptimize the communicative properties of their utterances by avoiding spikes in\ninformation, thereby maintaining a relatively uniform information profile over\ntime. This paper investigates the impact of UID principles on syntactic\nreduction, specifically focusing on the optional omission of the connector\n\"that\" in English subordinate clauses. Building upon previous research, we\nextend our investigation to a larger corpus of written English, utilize\ncontemporary large language models (LLMs) and extend the information-uniformity\nprinciples by the notion of entropy, to estimate the UID manifestations in the\nusecase of syntactic reduction choices.\n","authors":["Ella Rabinovich"],"pdf_url":"https://arxiv.org/pdf/2405.20833v1.pdf","comment":"ACL2024 (main conference), 8 pages"},{"id":"http://arxiv.org/abs/2405.20830v1","updated":"2024-05-31T14:21:04Z","published":"2024-05-31T14:21:04Z","title":"Self-Augmented Preference Optimization: Off-Policy Paradigms for\n  Language Model Alignment","summary":"  Traditional language model alignment methods, such as Direct Preference\nOptimization (DPO), are limited by their dependence on static, pre-collected\npaired preference data, which hampers their adaptability and practical\napplicability. To overcome this limitation, we introduce Self-Augmented\nPreference Optimization (SAPO), an effective and scalable training paradigm\nthat does not require existing paired data. Building on the self-play concept,\nwhich autonomously generates negative responses, we further incorporate an\noff-policy learning pipeline to enhance data exploration and exploitation.\nSpecifically, we employ an Exponential Moving Average (EMA) model in\nconjunction with a replay buffer to enable dynamic updates of response\nsegments, effectively integrating real-time feedback with insights from\nhistorical data. Our comprehensive evaluations of the LLaMA3-8B and Mistral-7B\nmodels across benchmarks, including the Open LLM Leaderboard, IFEval,\nAlpacaEval 2.0, and MT-Bench, demonstrate that SAPO matches or surpasses\nestablished offline contrastive baselines, such as DPO and Odds Ratio\nPreference Optimization, and outperforms offline self-play methods like SPIN.\nOur code is available at https://github.com/yinyueqin/SAPO\n","authors":["Yueqin Yin","Zhendong Wang","Yujia Xie","Weizhu Chen","Mingyuan Zhou"],"pdf_url":"https://arxiv.org/pdf/2405.20830v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.20818v1","updated":"2024-05-31T14:14:01Z","published":"2024-05-31T14:14:01Z","title":"An iterated learning model of language change that mixes supervised and\n  unsupervised learning","summary":"  The iterated learning model is an agent-based model of language change in\nwhich language is transmitted from a tutor to a pupil which itself becomes a\ntutor to a new pupil, and so on. Languages that are stable, expressive, and\ncompositional arise spontaneously as a consequence of a language transmission\nbottleneck. Previous models have implemented an agent's mapping from signals to\nmeanings using an artificial neural network decoder, but have relied on an\nunrealistic and computationally expensive process of obversion to implement the\nassociated encoder, mapping from meanings to signals. Here, a new model is\npresented in which both decoder and encoder are neural networks, trained\nseparately through supervised learning, and trained together through\nunsupervised learning in the form of an autoencoder. This avoids the\nsubstantial computational burden entailed in obversion and introduces a mixture\nof supervised and unsupervised learning as observed during human development.\n","authors":["Jack Bunyan","Seth Bullock","Conor Houghton"],"pdf_url":"https://arxiv.org/pdf/2405.20818v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.20805v1","updated":"2024-05-31T14:05:27Z","published":"2024-05-31T14:05:27Z","title":"Multilingual Text Style Transfer: Datasets & Models for Indian Languages","summary":"  Text style transfer (TST) involves altering the linguistic style of a text\nwhile preserving its core content. This paper focuses on sentiment transfer, a\nvital TST subtask (Mukherjee et al., 2022a), across a spectrum of Indian\nlanguages: Hindi, Magahi, Malayalam, Marathi, Punjabi, Odia, Telugu, and Urdu,\nexpanding upon previous work on English-Bangla sentiment transfer (Mukherjee et\nal., 2023). We introduce dedicated datasets of 1,000 positive and 1,000\nnegative style-parallel sentences for each of these eight languages. We then\nevaluate the performance of various benchmark models categorized into parallel,\nnon-parallel, cross-lingual, and shared learning approaches, including the\nLlama2 and GPT-3.5 large language models (LLMs). Our experiments highlight the\nsignificance of parallel data in TST and demonstrate the effectiveness of the\nMasked Style Filling (MSF) approach (Mukherjee et al., 2023) in non-parallel\ntechniques. Moreover, cross-lingual and joint multilingual learning methods\nshow promise, offering insights into selecting optimal models tailored to the\nspecific language and task requirements. To the best of our knowledge, this\nwork represents the first comprehensive exploration of the TST task as\nsentiment transfer across a diverse set of languages.\n","authors":["Sourabrata Mukherjee","Atul Kr. Ojha","Akanksha Bansal","Deepak Alok","John P. McCrae","Ondřej Dušek"],"pdf_url":"https://arxiv.org/pdf/2405.20805v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.16698v2","updated":"2024-05-31T14:03:10Z","published":"2024-04-25T15:59:16Z","title":"Cooperate or Collapse: Emergence of Sustainability Behaviors in a\n  Society of LLM Agents","summary":"  As AI systems pervade human life, ensuring that large language models (LLMs)\nmake safe decisions is a significant challenge. This paper introduces the\nGovernance of the Commons Simulation (GovSim), a generative simulation platform\ndesigned to study strategic interactions and cooperative decision-making in\nLLMs. Using GovSim, we investigate the dynamics of sustainable resource sharing\nin a society of AI agents. This environment allows us to study the influence of\nethical considerations, strategic planning, and negotiation skills on\ncooperative outcomes for AI agents. We develop an LLM-based agent architecture\ndesigned for these social dilemmas and test it with a variety of LLMs. We find\nthat all but the most powerful LLM agents fail to achieve a sustainable\nequilibrium in GovSim. Ablations reveal that successful multi-agent\ncommunication between agents is critical for achieving cooperation in these\ncases. Furthermore, our analyses show that the failure to achieve sustainable\ncooperation in most LLMs stems from their inability to formulate and analyze\nhypotheses about the long-term effects of their actions on the equilibrium of\nthe group. Finally, we show that agents that leverage\n``Universalization''-based reasoning, a theory of moral thinking, are able to\nachieve significantly greater sustainability. Taken together, GovSim enables us\nto study the mechanisms that underlie sustainable self-government with\nsignificant specificity and scale. We open source the full suite of our\nresearch results, including the simulation environment, agent prompts, and a\ncomprehensive web interface.\n","authors":["Giorgio Piatti","Zhijing Jin","Max Kleiman-Weiner","Bernhard Schölkopf","Mrinmaya Sachan","Rada Mihalcea"],"pdf_url":"https://arxiv.org/pdf/2404.16698v2.pdf","comment":"Revised version"},{"id":"http://arxiv.org/abs/2305.15805v3","updated":"2024-05-31T14:02:24Z","published":"2023-05-25T07:39:41Z","title":"Dynamic Context Pruning for Efficient and Interpretable Autoregressive\n  Transformers","summary":"  Autoregressive Transformers adopted in Large Language Models (LLMs) are hard\nto scale to long sequences. Despite several works trying to reduce their\ncomputational cost, most of LLMs still adopt attention layers between all pairs\nof tokens in the sequence, thus incurring a quadratic cost. In this study, we\npresent a novel approach that dynamically prunes contextual information while\npreserving the model's expressiveness, resulting in reduced memory and\ncomputational requirements during inference. Our method employs a learnable\nmechanism that determines which uninformative tokens can be dropped from the\ncontext at any point across the generation process. By doing so, our approach\nnot only addresses performance concerns but also enhances interpretability,\nproviding valuable insight into the model's decision-making process. Our\ntechnique can be applied to existing pre-trained models through a\nstraightforward fine-tuning process, and the pruning strength can be specified\nby a sparsity parameter. Notably, our empirical findings demonstrate that we\ncan effectively prune up to 80\\% of the context without significant performance\ndegradation on downstream tasks, offering a valuable tool for mitigating\ninference costs. Our reference implementation achieves up to $2\\times$ increase\nin inference throughput and even greater memory savings.\n","authors":["Sotiris Anagnostidis","Dario Pavllo","Luca Biggio","Lorenzo Noci","Aurelien Lucchi","Thomas Hofmann"],"pdf_url":"https://arxiv.org/pdf/2305.15805v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.20797v1","updated":"2024-05-31T13:59:18Z","published":"2024-05-31T13:59:18Z","title":"Ovis: Structural Embedding Alignment for Multimodal Large Language Model","summary":"  Current Multimodal Large Language Models (MLLMs) typically integrate a\npre-trained LLM with another pre-trained vision transformer through a\nconnector, such as an MLP, endowing the LLM with visual capabilities. However,\nthe misalignment between two embedding strategies in MLLMs -- the structural\ntextual embeddings based on an embedding look-up table and the continuous\nembeddings generated directly by the vision encoder -- makes challenges for a\nmore seamless fusion of visual and textual information. We propose Ovis, a\nnovel MLLM architecture designed to structurally align visual and textual\nembeddings. Ovis integrates an additional learnable visual embedding table into\nthe visual encoder's process. To capture rich visual semantics, each image\npatch indexes the visual embedding table multiple times, resulting in a final\nvisual embedding that is a probabilistic combination of the indexed embeddings.\nThis structural approach mirrors the method used for generating textual\nembeddings. Empirical evaluations on various multimodal benchmarks demonstrate\nthat Ovis outperforms open-source MLLMs of similar parameter scales and even\nsurpasses the proprietary model Qwen-VL-Plus overall. These results highlight\nthe potential of Ovis' structured visual representation for advancing MLLM\narchitectural design and promoting more effective multimodal learning. Both the\nsource code and the training dataset of Ovis will be made publicly available.\n","authors":["Shiyin Lu","Yang Li","Qing-Guo Chen","Zhao Xu","Weihua Luo","Kaifu Zhang","Han-Jia Ye"],"pdf_url":"https://arxiv.org/pdf/2405.20797v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.10144v2","updated":"2024-05-31T13:11:15Z","published":"2024-03-15T09:43:52Z","title":"NLP Verification: Towards a General Methodology for Certifying\n  Robustness","summary":"  Deep neural networks have exhibited substantial success in the field of\nNatural Language Processing and ensuring their safety and reliability is\ncrucial: there are safety critical contexts where such models must be robust to\nvariability or attack, and give guarantees over their output. Unlike Computer\nVision, NLP lacks a unified verification methodology and, despite recent\nadvancements in literature, they are often light on the pragmatical issues of\nNLP verification. In this paper, we attempt to distil and evaluate general\ncomponents of an NLP verification pipeline, that emerges from the progress in\nthe field to date. Our contributions are two-fold. Firstly, we give a general\n(i.e. algorithm-independent) characterisation of verifiable subspaces that\nresult from embedding sentences into continuous spaces. We identify, and give\nan effective method to deal with, the technical challenge of semantic\ngeneralisability of verified subspaces; and propose it as a standard metric in\nthe NLP verification pipelines (alongside with the standard metrics of model\naccuracy and model verifiability). Secondly, we propose a general methodology\nto analyse the effect of the embedding gap -- a problem that refers to the\ndiscrepancy between verification of geometric subspaces, and the semantic\nmeaning of sentences which the geometric subspaces are supposed to represent.\nIn extreme cases, poor choices in embedding of sentences may invalidate\nverification results. We propose a number of practical NLP methods that can\nhelp to quantify the effects of the embedding gap; and in particular we propose\nthe metric of falsifiability of semantic subspaces as another fundamental\nmetric to be reported as part of the NLP verification pipeline. We believe that\ntogether these general principles pave the way towards a more consolidated and\neffective development of this new domain.\n","authors":["Marco Casadio","Tanvi Dinkar","Ekaterina Komendantskaya","Luca Arnaboldi","Matthew L. Daggitt","Omri Isac","Guy Katz","Verena Rieser","Oliver Lemon"],"pdf_url":"https://arxiv.org/pdf/2403.10144v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.18870v2","updated":"2024-05-31T12:45:50Z","published":"2024-05-29T08:31:16Z","title":"LLMs achieve adult human performance on higher-order theory of mind\n  tasks","summary":"  This paper examines the extent to which large language models (LLMs) have\ndeveloped higher-order theory of mind (ToM); the human ability to reason about\nmultiple mental and emotional states in a recursive manner (e.g. I think that\nyou believe that she knows). This paper builds on prior work by introducing a\nhandwritten test suite -- Multi-Order Theory of Mind Q&A -- and using it to\ncompare the performance of five LLMs to a newly gathered adult human benchmark.\nWe find that GPT-4 and Flan-PaLM reach adult-level and near adult-level\nperformance on ToM tasks overall, and that GPT-4 exceeds adult performance on\n6th order inferences. Our results suggest that there is an interplay between\nmodel size and finetuning for the realisation of ToM abilities, and that the\nbest-performing LLMs have developed a generalised capacity for ToM. Given the\nrole that higher-order ToM plays in a wide range of cooperative and competitive\nhuman behaviours, these findings have significant implications for user-facing\nLLM applications.\n","authors":["Winnie Street","John Oliver Siy","Geoff Keeling","Adrien Baranes","Benjamin Barnett","Michael McKibben","Tatenda Kanyere","Alison Lentz","Blaise Aguera y Arcas","Robin I. M. Dunbar"],"pdf_url":"https://arxiv.org/pdf/2405.18870v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.07043v2","updated":"2024-05-31T12:27:52Z","published":"2024-02-10T21:06:34Z","title":"A Tale of Tails: Model Collapse as a Change of Scaling Laws","summary":"  As AI model size grows, neural scaling laws have become a crucial tool to\npredict the improvements of large models when increasing capacity and the size\nof original (human or natural) training data. Yet, the widespread use of\npopular models means that the ecosystem of online data and text will co-evolve\nto progressively contain increased amounts of synthesized data. In this paper\nwe ask: How will the scaling laws change in the inevitable regime where\nsynthetic data makes its way into the training corpus? Will future models,\nstill improve, or be doomed to degenerate up to total (model) collapse? We\ndevelop a theoretical framework of model collapse through the lens of scaling\nlaws. We discover a wide range of decay phenomena, analyzing loss of scaling,\nshifted scaling with number of generations, the ''un-learning\" of skills, and\ngrokking when mixing human and synthesized data. Our theory is validated by\nlarge-scale experiments with a transformer on an arithmetic task and text\ngeneration using the large language model Llama2.\n","authors":["Elvis Dohmatob","Yunzhen Feng","Pu Yang","Francois Charton","Julia Kempe"],"pdf_url":"https://arxiv.org/pdf/2402.07043v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.20755v1","updated":"2024-05-31T11:43:31Z","published":"2024-05-31T11:43:31Z","title":"Improving code-mixed hate detection by native sample mixing: A case\n  study for Hindi-English code-mixed scenario","summary":"  Hate detection has long been a challenging task for the NLP community. The\ntask becomes complex in a code-mixed environment because the models must\nunderstand the context and the hate expressed through language alteration.\nCompared to the monolingual setup, we see very less work on code-mixed hate as\nlarge-scale annotated hate corpora are unavailable to make the study. To\novercome this bottleneck, we propose using native language hate samples. We\nhypothesise that in the era of multilingual language models (MLMs), hate in\ncode-mixed settings can be detected by majorly relying on the native language\nsamples. Even though the NLP literature reports the effectiveness of MLMs on\nhate detection in many cross-lingual settings, their extensive evaluation in a\ncode-mixed scenario is yet to be done. This paper attempts to fill this gap\nthrough rigorous empirical experiments. We considered the Hindi-English\ncode-mixed setup as a case study as we have the linguistic expertise for the\nsame. Some of the interesting observations we got are: (i) adding native hate\nsamples in the code-mixed training set, even in small quantity, improved the\nperformance of MLMs for code-mixed hate detection, (ii) MLMs trained with\nnative samples alone observed to be detecting code-mixed hate to a large\nextent, (iii) The visualisation of attention scores revealed that, when native\nsamples were included in training, MLMs could better focus on the hate emitting\nwords in the code-mixed context, and (iv) finally, when hate is subjective or\nsarcastic, naively mixing native samples doesn't help much to detect code-mixed\nhate. We will release the data and code repository to reproduce the reported\nresults.\n","authors":["Debajyoti Mazumder","Aakash Kumar","Jasabanta Patro"],"pdf_url":"https://arxiv.org/pdf/2405.20755v1.pdf","comment":"Generated from XeLaTeX"},{"id":"http://arxiv.org/abs/2405.20708v1","updated":"2024-05-31T09:00:43Z","published":"2024-05-31T09:00:43Z","title":"FinGen: A Dataset for Argument Generation in Finance","summary":"  Thinking about the future is one of the important activities that people do\nin daily life. Futurists also pay a lot of effort into figuring out possible\nscenarios for the future. We argue that the exploration of this direction is\nstill in an early stage in the NLP research. To this end, we propose three\nargument generation tasks in the financial application scenario. Our\nexperimental results show these tasks are still big challenges for\nrepresentative generation models. Based on our empirical results, we further\npoint out several unresolved issues and challenges in this research direction.\n","authors":["Chung-Chi Chen","Hiroya Takamura","Ichiro Kobayashi","Yusuke Miyao"],"pdf_url":"https://arxiv.org/pdf/2405.20708v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.20703v1","updated":"2024-05-31T08:57:09Z","published":"2024-05-31T08:57:09Z","title":"It is Simple Sometimes: A Study On Improving Aspect-Based Sentiment\n  Analysis Performance","summary":"  Aspect-Based Sentiment Analysis (ABSA) involves extracting opinions from\ntextual data about specific entities and their corresponding aspects through\nvarious complementary subtasks. Several prior research has focused on\ndeveloping ad hoc designs of varying complexities for these subtasks. In this\npaper, we present a generative framework extensible to any ABSA subtask. We\nbuild upon the instruction tuned model proposed by Scaria et al. (2023), who\npresent an instruction-based model with task descriptions followed by\nin-context examples on ABSA subtasks. We propose PFInstruct, an extension to\nthis instruction learning paradigm by appending an NLP-related task prefix to\nthe task description. This simple approach leads to improved performance across\nall tested SemEval subtasks, surpassing previous state-of-the-art (SOTA) on the\nATE subtask (Rest14) by +3.28 F1-score, and on the AOOE subtask by an average\nof +5.43 F1-score across SemEval datasets. Furthermore, we explore the impact\nof the prefix-enhanced prompt quality on the ABSA subtasks and find that even a\nnoisy prefix enhances model performance compared to the baseline. Our method\nalso achieves competitive results on a biomedical domain dataset (ERSA).\n","authors":["Laura Cabello","Uchenna Akujuobi"],"pdf_url":"https://arxiv.org/pdf/2405.20703v1.pdf","comment":"Accepted to ACL Findings 2024"},{"id":"http://arxiv.org/abs/2405.19967v2","updated":"2024-05-31T08:54:24Z","published":"2024-05-30T11:46:42Z","title":"Improved Out-of-Scope Intent Classification with Dual Encoding and\n  Threshold-based Re-Classification","summary":"  Detecting out-of-scope user utterances is essential for task-oriented\ndialogues and intent classification. Current methodologies face difficulties\nwith the unpredictable distribution of outliers and often rely on assumptions\nabout data distributions. We present the Dual Encoder for Threshold-Based\nRe-Classification (DETER) to address these challenges. This end-to-end\nframework efficiently detects out-of-scope intents without requiring\nassumptions on data distributions or additional post-processing steps. The core\nof DETER utilizes dual text encoders, the Universal Sentence Encoder (USE) and\nthe Transformer-based Denoising AutoEncoder (TSDAE), to generate user utterance\nembeddings, which are classified through a branched neural architecture.\nFurther, DETER generates synthetic outliers using self-supervision and\nincorporates out-of-scope phrases from open-domain datasets. This approach\nensures a comprehensive training set for out-of-scope detection. Additionally,\na threshold-based re-classification mechanism refines the model's initial\npredictions. Evaluations on the CLINC-150, Stackoverflow, and Banking77\ndatasets demonstrate DETER's efficacy. Our model outperforms previous\nbenchmarks, increasing up to 13% and 5% in F1 score for known and unknown\nintents on CLINC-150 and Stackoverflow, and 16% for known and 24% % for unknown\nintents on Banking77. The source code has been released at\nhttps://github.com/Hossam-Mohammed-tech/Intent_Classification_OOS.\n","authors":["Hossam M. Zawbaa","Wael Rashwan","Sourav Dutta","Haytham Assem"],"pdf_url":"https://arxiv.org/pdf/2405.19967v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.20701v1","updated":"2024-05-31T08:53:59Z","published":"2024-05-31T08:53:59Z","title":"Unveiling the Lexical Sensitivity of LLMs: Combinatorial Optimization\n  for Prompt Enhancement","summary":"  Large language models (LLMs) demonstrate exceptional instruct-following\nability to complete various downstream tasks. Although this impressive ability\nmakes LLMs flexible task solvers, their performance in solving tasks also\nheavily relies on instructions. In this paper, we reveal that LLMs are\nover-sensitive to lexical variations in task instructions, even when the\nvariations are imperceptible to humans. By providing models with neighborhood\ninstructions, which are closely situated in the latent representation space and\ndiffer by only one semantically similar word, the performance on downstream\ntasks can be vastly different. Following this property, we propose a black-box\nCombinatorial Optimization framework for Prompt Lexical Enhancement (COPLE).\nCOPLE performs iterative lexical optimization according to the feedback from a\nbatch of proxy tasks, using a search strategy related to word influence.\nExperiments show that even widely-used human-crafted prompts for current\nbenchmarks suffer from the lexical sensitivity of models, and COPLE recovers\nthe declined model ability in both instruct-following and solving downstream\ntasks.\n","authors":["Pengwei Zhan","Zhen Xu","Qian Tan","Jie Song","Ru Xie"],"pdf_url":"https://arxiv.org/pdf/2405.20701v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.15854v2","updated":"2024-05-31T08:37:04Z","published":"2024-01-29T03:05:35Z","title":"LSTM-based Deep Neural Network With A Focus on Sentence Representation\n  for Sequential Sentence Classification in Medical Scientific Abstracts","summary":"  The Sequential Sentence Classification task within the domain of medical\nabstracts, termed as SSC, involves the categorization of sentences into\npre-defined headings based on their roles in conveying critical information in\nthe abstract. In the SSC task, sentences are sequentially related to each\nother. For this reason, the role of sentence embeddings is crucial for\ncapturing both the semantic information between words in the sentence and the\ncontextual relationship of sentences within the abstract, which then enhances\nthe SSC system performance. In this paper, we propose a LSTM-based deep\nlearning network with a focus on creating comprehensive sentence representation\nat the sentence level. To demonstrate the efficacy of the created sentence\nrepresentation, a system utilizing these sentence embeddings is also developed,\nwhich consists of a Convolutional-Recurrent neural network (C-RNN) at the\nabstract level and a multi-layer perception network (MLP) at the segment level.\nOur proposed system yields highly competitive results compared to\nstate-of-the-art systems and further enhances the F1 scores of the baseline by\n1.0%, 2.8%, and 2.6% on the benchmark datasets PudMed 200K RCT, PudMed 20K RCT\nand NICTA-PIBOSO, respectively. This indicates the significant impact of\nimproving sentence representation on boosting model performance.\n","authors":["Phat Lam","Lam Pham","Tin Nguyen","Hieu Tang","Michael Seidl","Medina Andresel","Alexander Schindler"],"pdf_url":"https://arxiv.org/pdf/2401.15854v2.pdf","comment":"Submitted to FedCSIS 2024"},{"id":"http://arxiv.org/abs/2405.20684v1","updated":"2024-05-31T08:26:47Z","published":"2024-05-31T08:26:47Z","title":"Joint Embeddings for Graph Instruction Tuning","summary":"  Large Language Models (LLMs) have achieved impressive performance in text\nunderstanding and have become an essential tool for building smart assistants.\nOriginally focusing on text, they have been enhanced with multimodal\ncapabilities in recent works that successfully built visual instruction\nfollowing assistants. As far as the graph modality goes, however, no such\nassistants have yet been developed. Graph structures are complex in that they\nrepresent relation between different features and are permutation invariant.\nMoreover, representing them in purely textual form does not always lead to good\nLLM performance even for finetuned models. As a result, there is a need to\ndevelop a new method to integrate graphs in LLMs for general graph\nunderstanding. This work explores the integration of the graph modality in LLM\nfor general graph instruction following tasks. It aims at producing a deep\nlearning model that enhances an underlying LLM with graph embeddings and trains\nit to understand them and to produce, given an instruction, an answer grounded\nin the graph representation. The approach performs significantly better than a\ngraph to text approach and remains consistent even for larger graphs.\n","authors":["Vlad Argatu","Aaron Haag","Oliver Lohse"],"pdf_url":"https://arxiv.org/pdf/2405.20684v1.pdf","comment":"Conference Preprint"},{"id":"http://arxiv.org/abs/2405.20680v1","updated":"2024-05-31T08:22:49Z","published":"2024-05-31T08:22:49Z","title":"Unraveling and Mitigating Retriever Inconsistencies in\n  Retrieval-Augmented Large Language Models","summary":"  Although Retrieval-Augmented Large Language Models (RALMs) demonstrate their\nsuperiority in terms of factuality, they do not consistently outperform the\noriginal retrieval-free Language Models (LMs). Our experiments reveal that this\nexample-level performance inconsistency exists not only between\nretrieval-augmented and retrieval-free LM but also among different retrievers.\nTo understand this phenomenon, we investigate the degeneration behavior of\nRALMs and theoretically decompose it into four categories. Further analysis\nbased on our decomposition reveals that the innate difference in knowledge\nsources and the unpredictable degeneration of the reader model contribute most\nto the inconsistency. Drawing from our analysis, we introduce Ensemble of\nRetrievers (EoR), a trainable framework that can adaptively retrieve from\ndifferent knowledge sources and effectively decrease unpredictable reader\nerrors. Our experiments on Open Domain Question Answering show that EoR\nsubstantially improves performance over the RALM with a single retriever by\nconsiderably reducing inconsistent behaviors.\n","authors":["Mingda Li","Xinyu Li","Yifan Chen","Wenfeng Xuan","Weinan Zhang"],"pdf_url":"https://arxiv.org/pdf/2405.20680v1.pdf","comment":"ACL 2024 (findings)"},{"id":"http://arxiv.org/abs/2405.20671v1","updated":"2024-05-31T08:13:35Z","published":"2024-05-31T08:13:35Z","title":"Position Coupling: Leveraging Task Structure for Improved Length\n  Generalization of Transformers","summary":"  Even for simple arithmetic tasks like integer addition, it is challenging for\nTransformers to generalize to longer sequences than those encountered during\ntraining. To tackle this problem, we propose position coupling, a simple yet\neffective method that directly embeds the structure of the tasks into the\npositional encoding of a (decoder-only) Transformer. Taking a departure from\nthe vanilla absolute position mechanism assigning unique position IDs to each\nof the tokens, we assign the same position IDs to two or more \"relevant\"\ntokens; for integer addition tasks, we regard digits of the same significance\nas in the same position. On the empirical side, we show that with the proposed\nposition coupling, a small (1-layer) Transformer trained on 1 to 30-digit\nadditions can generalize up to 200-digit additions (6.67x of the trained\nlength). On the theoretical side, we prove that a 1-layer Transformer with\ncoupled positions can solve the addition task involving exponentially many\ndigits, whereas any 1-layer Transformer without positional information cannot\nentirely solve it. We also demonstrate that position coupling can be applied to\nother algorithmic tasks such as addition with multiple summands, Nx2\nmultiplication, copy/reverse, and a two-dimensional task.\n","authors":["Hanseul Cho","Jaeyoung Cha","Pranjal Awasthi","Srinadh Bhojanapalli","Anupam Gupta","Chulhee Yun"],"pdf_url":"https://arxiv.org/pdf/2405.20671v1.pdf","comment":"73 pages, 20 figures, 90 tables"},{"id":"http://arxiv.org/abs/2405.19732v2","updated":"2024-05-31T08:13:34Z","published":"2024-05-30T06:24:14Z","title":"Two Optimizers Are Better Than One: LLM Catalyst for Enhancing\n  Gradient-Based Optimization","summary":"  Learning a skill generally relies on both practical experience by doer and\ninsightful high-level guidance by instructor. Will this strategy also work well\nfor solving complex non-convex optimization problems? Here, a common\ngradient-based optimizer acts like a disciplined doer, making locally optimal\nupdate at each step. Recent methods utilize large language models (LLMs) to\noptimize solutions for concrete problems by inferring from natural language\ninstructions, akin to a high-level instructor. In this paper, we show that\nthese two optimizers are complementary to each other, suggesting a\ncollaborative optimization approach. The gradient-based optimizer and LLM-based\noptimizer are combined in an interleaved manner. We instruct LLMs using task\ndescriptions and timely optimization trajectories recorded during\ngradient-based optimization. Inferred results from LLMs are used as restarting\npoints for the next stage of gradient optimization. By leveraging both the\nlocally rigorous gradient-based optimizer and the high-level deductive\nLLM-based optimizer, our combined optimization method consistently yields\nimprovements over competitive baseline prompt tuning methods. Our results\ndemonstrate the synergistic effect of conventional gradient-based optimization\nand the inference ability of LLMs. The code is released at\nhttps://github.com/guozix/LLM-catalyst.\n","authors":["Zixian Guo","Ming Liu","Zhilong Ji","Jinfeng Bai","Yiwen Guo","Wangmeng Zuo"],"pdf_url":"https://arxiv.org/pdf/2405.19732v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.01289v2","updated":"2024-05-31T08:07:45Z","published":"2024-03-02T19:01:40Z","title":"Greed is All You Need: An Evaluation of Tokenizer Inference Methods","summary":"  While subword tokenizers such as BPE and WordPiece are typically used to\nbuild vocabularies for NLP models, the method of decoding text into a sequence\nof tokens from these vocabularies is often left unspecified, or ill-suited to\nthe method in which they were constructed. We provide a controlled analysis of\nseven tokenizer inference methods across four different algorithms and three\nvocabulary sizes, performed on a novel intrinsic evaluation suite we curated\nfor English, combining measures rooted in morphology, cognition, and\ninformation theory. We show that for the most commonly used tokenizers, greedy\ninference performs surprisingly well; and that SaGe, a recently-introduced\ncontextually-informed tokenizer, outperforms all others on morphological\nalignment.\n","authors":["Omri Uzan","Craig W. Schmidt","Chris Tanner","Yuval Pinter"],"pdf_url":"https://arxiv.org/pdf/2403.01289v2.pdf","comment":"ACL 2024 (main)"},{"id":"http://arxiv.org/abs/2311.11745v2","updated":"2024-05-31T07:57:13Z","published":"2023-11-20T13:13:24Z","title":"ELF: Encoding Speaker-Specific Latent Speech Feature for Speech\n  Synthesis","summary":"  In this work, we propose a novel method for modeling numerous speakers, which\nenables expressing the overall characteristics of speakers in detail like a\ntrained multi-speaker model without additional training on the target speaker's\ndataset. Although various works with similar purposes have been actively\nstudied, their performance has not yet reached that of trained multi-speaker\nmodels due to their fundamental limitations. To overcome previous limitations,\nwe propose effective methods for feature learning and representing target\nspeakers' speech characteristics by discretizing the features and conditioning\nthem to a speech synthesis model. Our method obtained a significantly higher\nsimilarity mean opinion score (SMOS) in subjective similarity evaluation than\nseen speakers of a high-performance multi-speaker model, even with unseen\nspeakers. The proposed method also outperforms a zero-shot method by\nsignificant margins. Furthermore, our method shows remarkable performance in\ngenerating new artificial speakers. In addition, we demonstrate that the\nencoded latent features are sufficiently informative to reconstruct an original\nspeaker's speech completely. It implies that our method can be used as a\ngeneral methodology to encode and reconstruct speakers' characteristics in\nvarious tasks.\n","authors":["Jungil Kong","Junmo Lee","Jeongmin Kim","Beomjeong Kim","Jihoon Park","Dohee Kong","Changheon Lee","Sangjin Kim"],"pdf_url":"https://arxiv.org/pdf/2311.11745v2.pdf","comment":"ICML 2024"},{"id":"http://arxiv.org/abs/2310.18339v2","updated":"2024-05-31T07:56:08Z","published":"2023-10-21T17:18:09Z","title":"When MOE Meets LLMs: Parameter Efficient Fine-tuning for Multi-task\n  Medical Applications","summary":"  The recent surge in Large Language Models (LLMs) has garnered significant\nattention across numerous fields. Fine-tuning is often required to fit general\nLLMs for a specific domain, like the web-based healthcare system. However, two\nproblems arise during fine-tuning LLMs for medical applications. One is the\ntask variety problem, which involves distinct tasks in real-world medical\nscenarios. The variety often leads to sub-optimal fine-tuning for data\nimbalance and seesaw problems. Besides, the large amount of parameters in LLMs\nleads to huge time and computation consumption by fine-tuning. To address these\ntwo problems, we propose a novel parameter efficient fine-tuning framework for\nmulti-task medical applications, dubbed as MOELoRA. The designed framework aims\nto absorb both the benefits of mixture-of-expert (MOE) for multi-task learning\nand low-rank adaptation (LoRA) for parameter efficient fine-tuning. For\nunifying MOE and LoRA, we devise multiple experts as the trainable parameters,\nwhere each expert consists of a pair of low-rank matrices to retain the small\nsize of trainable parameters. Then, a task-motivated gate function for all\nMOELoRA layers is proposed, which can control the contributions of each expert\nand produce distinct parameters for various tasks. We conduct experiments on a\nmulti-task medical dataset, indicating MOELoRA outperforms the existing\nparameter efficient fine-tuning methods. The code is available online.\n","authors":["Qidong Liu","Xian Wu","Xiangyu Zhao","Yuanshao Zhu","Derong Xu","Feng Tian","Yefeng Zheng"],"pdf_url":"https://arxiv.org/pdf/2310.18339v2.pdf","comment":"accepted by SIGIR'24"},{"id":"http://arxiv.org/abs/2405.20657v1","updated":"2024-05-31T07:51:16Z","published":"2024-05-31T07:51:16Z","title":"DORY: Deliberative Prompt Recovery for LLM","summary":"  Prompt recovery in large language models (LLMs) is crucial for understanding\nhow LLMs work and addressing concerns regarding privacy, copyright, etc. The\ntrend towards inference-only APIs complicates this task by restricting access\nto essential outputs for recovery. To tackle this challenge, we extract\nprompt-related information from limited outputs and identify a strong(negative)\ncorrelation between output probability-based uncertainty and the success of\nprompt recovery. This finding led to the development of Deliberative PrOmpt\nRecoverY (DORY), our novel approach that leverages uncertainty to recover\nprompts accurately. DORY involves reconstructing drafts from outputs, refining\nthese with hints, and filtering out noise based on uncertainty. Our evaluation\nacross diverse LLMs and prompt benchmarks shows that DORY outperforms existing\nbaselines, improving performance by approximately 10.82% and establishing a new\nstate-of-the-art record in prompt recovery tasks. Significantly, DORY operates\nusing a single LLM without any external resources or model, offering a\ncost-effective, user-friendly prompt recovery solution.\n","authors":["Lirong Gao","Ru Peng","Yiming Zhang","Junbo Zhao"],"pdf_url":"https://arxiv.org/pdf/2405.20657v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.20654v1","updated":"2024-05-31T07:43:42Z","published":"2024-05-31T07:43:42Z","title":"Passage-specific Prompt Tuning for Passage Reranking in Question\n  Answering with Large Language Models","summary":"  Effective passage retrieval and reranking methods have been widely utilized\nto identify suitable candidates in open-domain question answering tasks, recent\nstudies have resorted to LLMs for reranking the retrieved passages by the\nlog-likelihood of the question conditioned on each passage. Although these\nmethods have demonstrated promising results, the performance is notably\nsensitive to the human-written prompt (or hard prompt), and fine-tuning LLMs\ncan be computationally intensive and time-consuming. Furthermore, this approach\nlimits the leverage of question-passage relevance pairs and passage-specific\nknowledge to enhance the ranking capabilities of LLMs. In this paper, we\npropose passage-specific prompt tuning for reranking in open-domain question\nanswering (PSPT): a parameter-efficient method that fine-tunes learnable\npassage-specific soft prompts, incorporating passage-specific knowledge from a\nlimited set of question-passage relevance pairs. The method involves ranking\nretrieved passages based on the log-likelihood of the model generating the\nquestion conditioned on each passage and the learned soft prompt. We conducted\nextensive experiments utilizing the Llama-2-chat-7B model across three publicly\navailable open-domain question answering datasets and the results demonstrate\nthe effectiveness of the proposed approach.\n","authors":["Xuyang Wu","Zhiyuan Peng","Sravanthi Rajanala","Hsin-Tai Wu","Yi Fang"],"pdf_url":"https://arxiv.org/pdf/2405.20654v1.pdf","comment":"Accepted at Gen-IR@SIGIR24"},{"id":"http://arxiv.org/abs/2403.03031v3","updated":"2024-05-31T07:42:44Z","published":"2024-03-05T15:08:16Z","title":"Learning to Use Tools via Cooperative and Interactive Agents","summary":"  Tool learning empowers large language models (LLMs) as agents to use external\ntools to extend their capability. Existing methods employ one single LLM-based\nagent to iteratively select and execute tools, thereafter incorporating the\nresult into the next action prediction. However, they still suffer from\npotential performance degradation when addressing complex tasks due to: (1) the\nlimitation of the inherent capability of a single LLM to perform diverse\nactions, and (2) the struggle to adaptively correct mistakes when the task\nfails. To mitigate these problems, we propose the ConAgents, a Cooperative and\ninteractive Agents framework, which modularizes the workflow of tool learning\ninto Grounding, Execution, and Observing agents. We also introduce an iterative\ncalibration (IterCali) method, enabling the agents to adapt themselves based on\nthe feedback from the tool environment. Experiments conducted on three datasets\ndemonstrate the superiority of our ConAgents (e.g., 6 point improvement over\nthe SOTA baseline). We further provide fine-granularity analysis for the\nefficiency and consistency of our framework.\n","authors":["Zhengliang Shi","Shen Gao","Xiuyi Chen","Lingyong Yan","Haibo Shi","Dawei Yin","Zhumin Chen","Pengjie Ren","Suzan Verberne","Zhaochun Ren"],"pdf_url":"https://arxiv.org/pdf/2403.03031v3.pdf","comment":"working in process, 20 pages"},{"id":"http://arxiv.org/abs/2405.20649v1","updated":"2024-05-31T07:30:34Z","published":"2024-05-31T07:30:34Z","title":"Reward-based Input Construction for Cross-document Relation Extraction","summary":"  Relation extraction (RE) is a fundamental task in natural language\nprocessing, aiming to identify relations between target entities in text. While\nmany RE methods are designed for a single sentence or document, cross-document\nRE has emerged to address relations across multiple long documents. Given the\nnature of long documents in cross-document RE, extracting document embeddings\nis challenging due to the length constraints of pre-trained language models.\nTherefore, we propose REward-based Input Construction (REIC), the first\nlearning-based sentence selector for cross-document RE. REIC extracts sentences\nbased on relational evidence, enabling the RE module to effectively infer\nrelations. Since supervision of evidence sentences is generally unavailable, we\ntrain REIC using reinforcement learning with RE prediction scores as rewards.\nExperimental results demonstrate the superiority of our method over heuristic\nmethods for different RE structures and backbones in cross-document RE. Our\ncode is publicly available at https://github.com/aailabkaist/REIC.\n","authors":["Byeonghu Na","Suhyeon Jo","Yeongmin Kim","Il-Chul Moon"],"pdf_url":"https://arxiv.org/pdf/2405.20649v1.pdf","comment":"Accepted at ACL 2024 main conference"},{"id":"http://arxiv.org/abs/2405.20648v1","updated":"2024-05-31T07:30:24Z","published":"2024-05-31T07:30:24Z","title":"Shotluck Holmes: A Family of Efficient Small-Scale Large Language Vision\n  Models For Video Captioning and Summarization","summary":"  Video is an increasingly prominent and information-dense medium, yet it poses\nsubstantial challenges for language models. A typical video consists of a\nsequence of shorter segments, or shots, that collectively form a coherent\nnarrative. Each shot is analogous to a word in a sentence where multiple data\nstreams of information (such as visual and auditory data) must be processed\nsimultaneously. Comprehension of the entire video requires not only\nunderstanding the visual-audio information of each shot but also requires that\nthe model links the ideas between each shot to generate a larger,\nall-encompassing story. Despite significant progress in the field, current\nworks often overlook videos' more granular shot-by-shot semantic information.\nIn this project, we propose a family of efficient large language vision models\n(LLVMs) to boost video summarization and captioning called Shotluck Holmes. By\nleveraging better pretraining and data collection strategies, we extend the\nabilities of existing small LLVMs from being able to understand a picture to\nbeing able to understand a sequence of frames. Specifically, we show that\nShotluck Holmes achieves better performance than state-of-the-art results on\nthe Shot2Story video captioning and summary task with significantly smaller and\nmore computationally efficient models.\n","authors":["Richard Luo","Austin Peng","Adithya Vasudev","Rishabh Jain"],"pdf_url":"https://arxiv.org/pdf/2405.20648v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.19831v2","updated":"2024-05-31T07:24:55Z","published":"2024-05-30T08:41:33Z","title":"Just Rewrite It Again: A Post-Processing Method for Enhanced Semantic\n  Similarity and Privacy Preservation of Differentially Private Rewritten Text","summary":"  The study of Differential Privacy (DP) in Natural Language Processing often\nviews the task of text privatization as a $\\textit{rewriting}$ task, in which\nsensitive input texts are rewritten to hide explicit or implicit private\ninformation. In order to evaluate the privacy-preserving capabilities of a DP\ntext rewriting mechanism, $\\textit{empirical privacy}$ tests are frequently\nemployed. In these tests, an adversary is modeled, who aims to infer sensitive\ninformation (e.g., gender) about the author behind a (privatized) text. Looking\nto improve the empirical protections provided by DP rewriting methods, we\npropose a simple post-processing method based on the goal of aligning rewritten\ntexts with their original counterparts, where DP rewritten texts are rewritten\n$\\textit{again}$. Our results show that such an approach not only produces\noutputs that are more semantically reminiscent of the original inputs, but also\ntexts which score on average better in empirical privacy evaluations.\nTherefore, our approach raises the bar for DP rewriting methods in their\nempirical privacy evaluations, providing an extra layer of protection against\nmalicious adversaries.\n","authors":["Stephen Meisenbacher","Florian Matthes"],"pdf_url":"https://arxiv.org/pdf/2405.19831v2.pdf","comment":"10 pages, 2 figures, 2 tables. Accepted to ARES 2024 (IWAPS)"},{"id":"http://arxiv.org/abs/2405.20646v1","updated":"2024-05-31T07:24:42Z","published":"2024-05-31T07:24:42Z","title":"Large Language Models Enhanced Sequential Recommendation for Long-tail\n  User and Item","summary":"  Sequential recommendation systems (SRS) serve the purpose of predicting\nusers' subsequent preferences based on their past interactions and have been\napplied across various domains such as e-commerce and social networking\nplatforms. However, practical SRS encounters challenges due to the fact that\nmost users engage with only a limited number of items, while the majority of\nitems are seldom consumed. These challenges, termed as the long-tail user and\nlong-tail item dilemmas, often create obstacles for traditional SRS methods.\nMitigating these challenges is crucial as they can significantly impact user\nsatisfaction and business profitability. While some research endeavors have\nalleviated these issues, they still grapple with issues such as seesaw or noise\nstemming from the scarcity of interactions. The emergence of large language\nmodels (LLMs) presents a promising avenue to address these challenges from a\nsemantic standpoint. In this study, we introduce the Large Language Models\nEnhancement framework for Sequential Recommendation (LLM-ESR), which leverages\nsemantic embeddings from LLMs to enhance SRS performance without increasing\ncomputational overhead. To combat the long-tail item challenge, we propose a\ndual-view modeling approach that fuses semantic information from LLMs with\ncollaborative signals from traditional SRS. To address the long-tail user\nchallenge, we introduce a retrieval augmented self-distillation technique to\nrefine user preference representations by incorporating richer interaction data\nfrom similar users. Through comprehensive experiments conducted on three\nauthentic datasets using three widely used SRS models, our proposed enhancement\nframework demonstrates superior performance compared to existing methodologies.\n","authors":["Qidong Liu","Xian Wu","Xiangyu Zhao","Yejing Wang","Zijian Zhang","Feng Tian","Yefeng Zheng"],"pdf_url":"https://arxiv.org/pdf/2405.20646v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.13923v2","updated":"2024-05-31T07:22:45Z","published":"2024-05-22T18:53:25Z","title":"Why Not Transform Chat Large Language Models to Non-English?","summary":"  The scarcity of non-English data limits the development of non-English large\nlanguage models (LLMs). Transforming English-centric LLMs to non-English has\nbeen identified as an effective and resource-efficient method. Previous works\nstart from base LLMs and perform knowledge distillation (KD) with data\ngenerated by stronger LLMs, e.g. GPT-4. Compared to base LLMs, chat LLMs are\nfurther optimized for advanced abilities, e.g. multi-turn conversation and\nhuman preference alignment, and thus more powerful in both helpfulness and\nsafety. However, transforming a chat LLM involves two critical issues: (1) How\ncan we effectively transfer advanced abilities without their supervised data?\n(2) How can we prevent the original knowledge from catastrophic forgetting\nduring transformation? We target these issues by introducing a simple framework\ncalled TransLLM. For the first issue, TransLLM divides the transfer problem\ninto some common sub-tasks with the translation chain-of-thought, which uses\nthe translation as the bridge between English and non-English step-by-step. We\nfurther enhance the performance of sub-tasks with publicly available data. For\nthe second issue, we propose a method comprising two synergistic components:\nlow-rank adaptation for training to maintain the original LLM parameters, and\nrecovery KD, which utilizes data generated by the chat LLM itself to recover\nthe original knowledge from the frozen parameters. In the experiments, we\ntransform the LLaMA-2-chat-7B to the Thai language. Our method, using only\nsingle-turn data, outperforms strong baselines and ChatGPT on multi-turn\nbenchmark MT-bench. Furthermore, our method, without safety data, rejects more\nharmful queries of safety benchmark AdvBench than both ChatGPT and GPT-4.\n","authors":["Xiang Geng","Ming Zhu","Jiahuan Li","Zhejian Lai","Wei Zou","Shuaijie She","Jiaxin Guo","Xiaofeng Zhao","Yinglu Li","Yuang Li","Chang Su","Yanqing Zhao","Xinglin Lyu","Min Zhang","Jiajun Chen","Hao Yang","Shujian Huang"],"pdf_url":"https://arxiv.org/pdf/2405.13923v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.10650v6","updated":"2024-05-31T07:22:42Z","published":"2024-05-17T09:25:30Z","title":"SPOR: A Comprehensive and Practical Evaluation Method for Compositional\n  Generalization in Data-to-Text Generation","summary":"  Compositional generalization is an important ability of language models and\nhas many different manifestations. For data-to-text generation, previous\nresearch on this ability is limited to a single manifestation called\nSystematicity and lacks consideration of large language models (LLMs), which\ncannot fully cover practical application scenarios. In this work, we propose\nSPOR, a comprehensive and practical evaluation method for compositional\ngeneralization in data-to-text generation. SPOR includes four aspects of\nmanifestations (Systematicity, Productivity, Order invariance, and Rule\nlearnability) and allows high-quality evaluation without additional manual\nannotations based on existing datasets. We demonstrate SPOR on two different\ndatasets and evaluate some existing language models including LLMs. We find\nthat the models are deficient in various aspects of the evaluation and need\nfurther improvement. Our work shows the necessity for comprehensive research on\ndifferent manifestations of compositional generalization in data-to-text\ngeneration and provides a framework for evaluation.\n","authors":["Ziyao Xu","Houfeng Wang"],"pdf_url":"https://arxiv.org/pdf/2405.10650v6.pdf","comment":"Accepted to ACL 2024 main conference"},{"id":"http://arxiv.org/abs/2401.00368v3","updated":"2024-05-31T07:22:01Z","published":"2023-12-31T02:13:18Z","title":"Improving Text Embeddings with Large Language Models","summary":"  In this paper, we introduce a novel and simple method for obtaining\nhigh-quality text embeddings using only synthetic data and less than 1k\ntraining steps. Unlike existing methods that often depend on multi-stage\nintermediate pre-training with billions of weakly-supervised text pairs,\nfollowed by fine-tuning with a few labeled datasets, our method does not\nrequire building complex training pipelines or relying on manually collected\ndatasets that are often constrained by task diversity and language coverage. We\nleverage proprietary LLMs to generate diverse synthetic data for hundreds of\nthousands of text embedding tasks across 93 languages. We then fine-tune\nopen-source decoder-only LLMs on the synthetic data using standard contrastive\nloss. Experiments demonstrate that our method achieves strong performance on\nhighly competitive text embedding benchmarks without using any labeled data.\nFurthermore, when fine-tuned with a mixture of synthetic and labeled data, our\nmodel sets new state-of-the-art results on the BEIR and MTEB benchmarks.\n","authors":["Liang Wang","Nan Yang","Xiaolong Huang","Linjun Yang","Rangan Majumder","Furu Wei"],"pdf_url":"https://arxiv.org/pdf/2401.00368v3.pdf","comment":"Accepted by ACL 2024"},{"id":"http://arxiv.org/abs/2405.20628v1","updated":"2024-05-31T05:40:56Z","published":"2024-05-31T05:40:56Z","title":"ToxVidLLM: A Multimodal LLM-based Framework for Toxicity Detection in\n  Code-Mixed Videos","summary":"  In an era of rapidly evolving internet technology, the surge in multimodal\ncontent, including videos, has expanded the horizons of online communication.\nHowever, the detection of toxic content in this diverse landscape, particularly\nin low-resource code-mixed languages, remains a critical challenge. While\nsubstantial research has addressed toxic content detection in textual data, the\nrealm of video content, especially in non-English languages, has been\nrelatively underexplored. This paper addresses this research gap by introducing\na benchmark dataset, the first of its kind, consisting of 931 videos with 4021\ncode-mixed Hindi-English utterances collected from YouTube. Each utterance\nwithin this dataset has been meticulously annotated for toxicity, severity, and\nsentiment labels. We have developed an advanced Multimodal Multitask framework\nbuilt for Toxicity detection in Video Content by leveraging Large Language\nModels (LLMs), crafted for the primary objective along with the additional\ntasks of conducting sentiment and severity analysis. ToxVidLLM incorporates\nthree key modules the Encoder module, Cross-Modal Synchronization module, and\nMultitask module crafting a generic multimodal LLM customized for intricate\nvideo classification tasks. Our experiments reveal that incorporating multiple\nmodalities from the videos substantially enhances the performance of toxic\ncontent detection by achieving an Accuracy and Weighted F1 score of 94.29% and\n94.35%, respectively.\n","authors":["Krishanu Maity","A. S. Poornash","Sriparna Saha","Pushpak Bhattacharyya"],"pdf_url":"https://arxiv.org/pdf/2405.20628v1.pdf","comment":"ACL Findings 2024"},{"id":"http://arxiv.org/abs/2405.20624v1","updated":"2024-05-31T05:22:07Z","published":"2024-05-31T05:22:07Z","title":"Leveraging Large Language Models for Entity Matching","summary":"  Entity matching (EM) is a critical task in data integration, aiming to\nidentify records across different datasets that refer to the same real-world\nentities. Traditional methods often rely on manually engineered features and\nrule-based systems, which struggle with diverse and unstructured data. The\nemergence of Large Language Models (LLMs) such as GPT-4 offers transformative\npotential for EM, leveraging their advanced semantic understanding and\ncontextual capabilities. This vision paper explores the application of LLMs to\nEM, discussing their advantages, challenges, and future research directions.\nAdditionally, we review related work on applying weak supervision and\nunsupervised approaches to EM, highlighting how LLMs can enhance these methods.\n","authors":["Qianyu Huang","Tongfang Zhao"],"pdf_url":"https://arxiv.org/pdf/2405.20624v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.20613v1","updated":"2024-05-31T04:05:09Z","published":"2024-05-31T04:05:09Z","title":"FineRadScore: A Radiology Report Line-by-Line Evaluation Technique\n  Generating Corrections with Severity Scores","summary":"  The current gold standard for evaluating generated chest x-ray (CXR) reports\nis through radiologist annotations. However, this process can be extremely\ntime-consuming and costly, especially when evaluating large numbers of reports.\nIn this work, we present FineRadScore, a Large Language Model (LLM)-based\nautomated evaluation metric for generated CXR reports. Given a candidate report\nand a ground-truth report, FineRadScore gives the minimum number of\nline-by-line corrections required to go from the candidate to the ground-truth\nreport. Additionally, FineRadScore provides an error severity rating with each\ncorrection and generates comments explaining why the correction was needed. We\ndemonstrate that FineRadScore's corrections and error severity scores align\nwith radiologist opinions. We also show that, when used to judge the quality of\nthe report as a whole, FineRadScore aligns with radiologists as well as current\nstate-of-the-art automated CXR evaluation metrics. Finally, we analyze\nFineRadScore's shortcomings to provide suggestions for future improvements.\n","authors":["Alyssa Huang","Oishi Banerjee","Kay Wu","Eduardo Pontes Reis","Pranav Rajpurkar"],"pdf_url":"https://arxiv.org/pdf/2405.20613v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.20612v1","updated":"2024-05-31T03:59:15Z","published":"2024-05-31T03:59:15Z","title":"UniBias: Unveiling and Mitigating LLM Bias through Internal Attention\n  and FFN Manipulation","summary":"  Large language models (LLMs) have demonstrated impressive capabilities in\nvarious tasks using the in-context learning (ICL) paradigm. However, their\neffectiveness is often compromised by inherent bias, leading to prompt\nbrittleness, i.e., sensitivity to design settings such as example selection,\norder, and prompt formatting. Previous studies have addressed LLM bias through\nexternal adjustment of model outputs, but the internal mechanisms that lead to\nsuch bias remain unexplored. Our work delves into these mechanisms,\nparticularly investigating how feedforward neural networks (FFNs) and attention\nheads result in the bias of LLMs. By Interpreting the contribution of\nindividual FFN vectors and attention heads, we identify the biased LLM\ncomponents that skew LLMs' prediction toward specific labels. To mitigate these\nbiases, we introduce UniBias, an inference-only method that effectively\nidentifies and eliminates biased FFN vectors and attention heads. Extensive\nexperiments across 12 NLP datasets demonstrate that UniBias significantly\nenhances ICL performance and alleviates prompt brittleness of LLMs.\n","authors":["Hanzhang Zhou","Zijian Feng","Zixiao Zhu","Junlang Qian","Kezhi Mao"],"pdf_url":"https://arxiv.org/pdf/2405.20612v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.20611v1","updated":"2024-05-31T03:57:19Z","published":"2024-05-31T03:57:19Z","title":"Bi-Directional Transformers vs. word2vec: Discovering Vulnerabilities in\n  Lifted Compiled Code","summary":"  Detecting vulnerabilities within compiled binaries is challenging due to lost\nhigh-level code structures and other factors such as architectural\ndependencies, compilers, and optimization options. To address these obstacles,\nthis research explores vulnerability detection by using natural language\nprocessing (NLP) embedding techniques with word2vec, BERT, and RoBERTa to learn\nsemantics from intermediate representation (LLVM) code. Long short-term memory\n(LSTM) neural networks were trained on embeddings from encoders created using\napproximately 118k LLVM functions from the Juliet dataset. This study is\npioneering in its comparison of word2vec models with multiple bidirectional\ntransformer (BERT, RoBERTa) embeddings built using LLVM code to train neural\nnetworks to detect vulnerabilities in compiled binaries. word2vec Continuous\nBag of Words (CBOW) models achieved 92.3% validation accuracy in detecting\nvulnerabilities, outperforming word2vec Skip-Gram, BERT, and RoBERTa. This\nsuggests that complex contextual NLP embeddings may not provide advantages over\nsimpler word2vec models for this task when a limited number (e.g. 118K) of data\nsamples are used to train the bidirectional transformer-based models. The\ncomparative results provide novel insights into selecting optimal embeddings\nfor learning compiler-independent semantic code representations to advance\nmachine learning detection of vulnerabilities in compiled binaries.\n","authors":["Gary A. McCully","John D. Hastings","Shengjie Xu","Adam Fortier"],"pdf_url":"https://arxiv.org/pdf/2405.20611v1.pdf","comment":"8 pages, 0 figures, IEEE 4th Cyber Awareness and Research Symposium\n  2024 (CARS'24)"},{"id":"http://arxiv.org/abs/2405.20608v1","updated":"2024-05-31T03:48:00Z","published":"2024-05-31T03:48:00Z","title":"Identifying while Learning for Document Event Causality Identification","summary":"  Event Causality Identification (ECI) aims to detect whether there exists a\ncausal relation between two events in a document. Existing studies adopt a kind\nof identifying after learning paradigm, where events' representations are first\nlearned and then used for the identification. Furthermore, they mainly focus on\nthe causality existence, but ignoring causal direction. In this paper, we take\ncare of the causal direction and propose a new identifying while learning mode\nfor the ECI task. We argue that a few causal relations can be easily identified\nwith high confidence, and the directionality and structure of these identified\ncausalities can be utilized to update events' representations for boosting next\nround of causality identification. To this end, this paper designs an\n*iterative learning and identifying framework*: In each iteration, we construct\nan event causality graph, on which events' causal structure representations are\nupdated for boosting causal identification. Experiments on two public datasets\nshow that our approach outperforms the state-of-the-art algorithms in both\nevaluations for causality existence identification and direction\nidentification.\n","authors":["Cheng Liu","Wei Xiang","Bang Wang"],"pdf_url":"https://arxiv.org/pdf/2405.20608v1.pdf","comment":"Accepted at ACL 2024"},{"id":"http://arxiv.org/abs/2402.16714v2","updated":"2024-05-31T03:34:57Z","published":"2024-02-26T16:31:28Z","title":"Quantum linear algebra is all you need for Transformer architectures","summary":"  Generative machine learning methods such as large-language models are\nrevolutionizing the creation of text and images. While these models are\npowerful they also harness a large amount of computational resources. The\ntransformer is a key component in large language models that aims to generate a\nsuitable completion of a given partial sequence. In this work, we investigate\ntransformer architectures under the lens of fault-tolerant quantum computing.\nThe input model is one where trained weight matrices are given as block\nencodings and we construct the query, key, and value matrices for the\ntransformer. We show how to prepare a block encoding of the self-attention\nmatrix, with a new subroutine for the row-wise application of the softmax\nfunction. In addition, we combine quantum subroutines to construct important\nbuilding blocks in the transformer, the residual connection and layer\nnormalization, and the feed-forward neural network. Our subroutines prepare an\namplitude encoding of the transformer output, which can be measured to obtain a\nprediction. Based on common open-source large-language models, we provide\ninsights into the behavior of important parameters determining the run time of\nthe quantum algorithm. We discuss the potential and challenges for obtaining a\nquantum advantage.\n","authors":["Naixu Guo","Zhan Yu","Matthew Choi","Aman Agrawal","Kouhei Nakaji","Alán Aspuru-Guzik","Patrick Rebentrost"],"pdf_url":"https://arxiv.org/pdf/2402.16714v2.pdf","comment":"31 pages, 4 figures, 2 tables, comments are welcome"},{"id":"http://arxiv.org/abs/2405.20602v1","updated":"2024-05-31T03:26:42Z","published":"2024-05-31T03:26:42Z","title":"Masked Language Modeling Becomes Conditional Density Estimation for\n  Tabular Data Synthesis","summary":"  In this paper, our goal is to generate synthetic data for heterogeneous\n(mixed-type) tabular datasets with high machine learning utility (MLu). Given\nthat the MLu performance relies on accurately approximating the conditional\ndistributions, we focus on devising a synthetic data generation method based on\nconditional distribution estimation. We propose a novel synthetic data\ngeneration method, MaCoDE, by redefining the multi-class classification task of\nMasked Language Modeling (MLM) as histogram-based non-parametric conditional\ndensity estimation. Our proposed method enables estimating conditional\ndensities across arbitrary combinations of target and conditional variables.\nFurthermore, we demonstrate that our proposed method bridges the theoretical\ngap between distributional learning and MLM. To validate the effectiveness of\nour proposed model, we conduct synthetic data generation experiments on 10\nreal-world datasets. Given the analogy between predicting masked input tokens\nin MLM and missing data imputation, we also evaluate the performance of\nmultiple imputations on incomplete datasets with various missing data\nmechanisms. Moreover, our proposed model offers the advantage of enabling\nadjustments to data privacy levels without requiring re-training.\n","authors":["Seunghwan An","Gyeongdong Woo","Jaesung Lim","ChangHyun Kim","Sungchul Hong","Jong-June Jeon"],"pdf_url":"https://arxiv.org/pdf/2405.20602v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.12146v3","updated":"2024-05-31T03:25:42Z","published":"2024-02-19T13:57:55Z","title":"Enabling Weak LLMs to Judge Response Reliability via Meta Ranking","summary":"  Despite the strong performance of large language models (LLMs) across a wide\nrange of tasks, they still have reliability issues. Previous studies indicate\nthat strong LLMs like GPT-4-turbo excel in evaluating the reliability of\nresponses from LLMs, but face efficiency and local deployment issues. Thus, to\nenable weak LLMs to effectively assess the reliability of LLM responses, we\npropose a novel cross-query-comparison-based method called $\\textit{Meta\nRanking}$ (MR). Unlike previous few-shot methods that solely based on\nin-context learning capabilities in LLMs, MR assesses reliability by pairwisely\nranking the target query-response pair with multiple reference query-response\npairs. We found that MR is highly effective in error detection for LLM\nresponses, where weak LLMs, such as Phi-2, could surpass strong baselines like\nGPT-3.5-turbo, requiring only five reference samples and significantly\nimproving efficiency. We further demonstrate that MR can enhance strong LLMs'\nperformance in two practical applications: model cascading and instruction\ntuning. In model cascading, we combine open- and closed-source LLMs to achieve\nperformance comparable to GPT-4-turbo with lower costs. In instruction tuning,\nwe use MR for iterative training data filtering, significantly reducing data\nprocessing time and enabling LLaMA-7B and Phi-2 to surpass Alpaca-13B with\nfewer training tokens. These results underscore the high potential of MR in\nboth efficiency and effectiveness.\n","authors":["Zijun Liu","Boqun Kou","Peng Li","Ming Yan","Ji Zhang","Fei Huang","Yang Liu"],"pdf_url":"https://arxiv.org/pdf/2402.12146v3.pdf","comment":"Preprint, under review. 28 pages"},{"id":"http://arxiv.org/abs/2311.03732v4","updated":"2024-05-31T03:07:23Z","published":"2023-11-07T05:22:11Z","title":"Learning to Learn for Few-shot Continual Active Learning","summary":"  Continual learning strives to ensure stability in solving previously seen\ntasks while demonstrating plasticity in a novel domain. Recent advances in\ncontinual learning are mostly confined to a supervised learning setting,\nespecially in NLP domain. In this work, we consider a few-shot continual active\nlearning setting where labeled data are inadequate, and unlabeled data are\nabundant but with a limited annotation budget. We exploit meta-learning and\npropose a method, called Meta-Continual Active Learning. This method\nsequentially queries the most informative examples from a pool of unlabeled\ndata for annotation to enhance task-specific performance and tackle continual\nlearning problems through meta-objective. Specifically, we employ meta-learning\nand experience replay to address inter-task confusion and catastrophic\nforgetting. We further incorporate textual augmentations to avoid memory\nover-fitting caused by experience replay and sample queries, thereby ensuring\ngeneralization. We conduct extensive experiments on benchmark text\nclassification datasets from diverse domains to validate the feasibility and\neffectiveness of meta-continual active learning. We also analyze the impact of\ndifferent active learning strategies on various meta continual learning models.\nThe experimental results demonstrate that introducing randomness into sample\nselection is the best default strategy for maintaining generalization in\nmeta-continual learning framework.\n","authors":["Stella Ho","Ming Liu","Shang Gao","Longxiang Gao"],"pdf_url":"https://arxiv.org/pdf/2311.03732v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.19670v2","updated":"2024-05-31T02:56:56Z","published":"2024-05-30T03:44:54Z","title":"One Token Can Help! Learning Scalable and Pluggable Virtual Tokens for\n  Retrieval-Augmented Large Language Models","summary":"  Retrieval-augmented generation (RAG) is a promising way to improve large\nlanguage models (LLMs) for generating more factual, accurate, and up-to-date\ncontent. Existing methods either optimize prompts to guide LLMs in leveraging\nretrieved information or directly fine-tune the LLMs to adapt to RAG scenarios.\nAlthough fine-tuning can yield better performance, it often compromises the\nLLMs' general generation capabilities by modifying their parameters. This\nlimitation poses challenges in practical applications, especially when LLMs are\nalready deployed, as parameter adjustments may affect their original\nfunctionality. To address this, we propose a novel method that involves\nlearning scalable and pluggable virtual tokens for RAG. By maintaining the\nLLMs' original parameters and fine-tuning only the embeddings of these\npluggable tokens, our approach not only enhances LLMs' performance but also\npreserves their general generation capacities. Furthermore, we design several\ntraining strategies to improve the scalability, flexibility, and\ngeneralizability of our method. Comprehensive experiments across nine\nquestion-answering tasks demonstrate the superiority of our approach.\n","authors":["Yutao Zhu","Zhaoheng Huang","Zhicheng Dou","Ji-Rong Wen"],"pdf_url":"https://arxiv.org/pdf/2405.19670v2.pdf","comment":"working in progress, repo: https://github.com/DaoD/SPRING/"},{"id":"http://arxiv.org/abs/2405.20588v1","updated":"2024-05-31T02:56:49Z","published":"2024-05-31T02:56:49Z","title":"DAFNet: Dynamic Auxiliary Fusion for Sequential Model Editing in Large\n  Language Models","summary":"  Recently, while large language models (LLMs) have demonstrated impressive\nresults, they still suffer from hallucination, i.e., the generation of false\ninformation. Model editing is the task of fixing factual mistakes in LLMs; yet,\nmost previous works treat it as a one-time task, paying little attention to\never-emerging mistakes generated by LLMs. We address the task of sequential\nmodel editing (SME) that aims to rectify mistakes continuously. A Dynamic\nAuxiliary Fusion Network (DAFNet) is designed to enhance the semantic\ninteraction among the factual knowledge within the entire sequence, preventing\ncatastrophic forgetting during the editing process of multiple knowledge\ntriples. Specifically, (1) for semantic fusion within a relation triple, we\naggregate the intra-editing attention flow into auto-regressive self-attention\nwith token-level granularity in LLMs. We further leverage multi-layer diagonal\ninter-editing attention flow to update the weighted representations of the\nentire sequence-level granularity. (2) Considering that auxiliary parameters\nare required to store the knowledge for sequential editing, we construct a new\ndataset named \\textbf{DAFSet}, fulfilling recent, popular, long-tail and robust\nproperties to enhance the generality of sequential editing. Experiments show\nDAFNet significantly outperforms strong baselines in single-turn and sequential\nediting. The usage of DAFSet also consistently improves the performance of\nother auxiliary network-based methods in various scenarios\n","authors":["Taolin Zhang","Qizhou Chen","Dongyang Li","Chengyu Wang","Xiaofeng He","Longtao Huang","Hui Xue","Jun Huang"],"pdf_url":"https://arxiv.org/pdf/2405.20588v1.pdf","comment":"ACL2024 findings"},{"id":"http://arxiv.org/abs/2405.20585v1","updated":"2024-05-31T02:53:22Z","published":"2024-05-31T02:53:22Z","title":"GAMedX: Generative AI-based Medical Entity Data Extractor Using Large\n  Language Models","summary":"  In the rapidly evolving field of healthcare and beyond, the integration of\ngenerative AI in Electronic Health Records (EHRs) represents a pivotal\nadvancement, addressing a critical gap in current information extraction\ntechniques. This paper introduces GAMedX, a Named Entity Recognition (NER)\napproach utilizing Large Language Models (LLMs) to efficiently extract entities\nfrom medical narratives and unstructured text generated throughout various\nphases of the patient hospital visit. By addressing the significant challenge\nof processing unstructured medical text, GAMedX leverages the capabilities of\ngenerative AI and LLMs for improved data extraction. Employing a unified\napproach, the methodology integrates open-source LLMs for NER, utilizing\nchained prompts and Pydantic schemas for structured output to navigate the\ncomplexities of specialized medical jargon. The findings reveal significant\nROUGE F1 score on one of the evaluation datasets with an accuracy of 98\\%. This\ninnovation enhances entity extraction, offering a scalable, cost-effective\nsolution for automated forms filling from unstructured data. As a result,\nGAMedX streamlines the processing of unstructured narratives, and sets a new\nstandard in NER applications, contributing significantly to theoretical and\npractical advancements beyond the medical technology sphere.\n","authors":["Mohammed-Khalil Ghali","Abdelrahman Farrag","Hajar Sakai","Hicham El Baz","Yu Jin","Sarah Lam"],"pdf_url":"https://arxiv.org/pdf/2405.20585v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.08846v2","updated":"2024-05-31T02:37:10Z","published":"2024-04-12T23:27:46Z","title":"Experimental Design for Active Transductive Inference in Large Language\n  Models","summary":"  One emergent ability of large language models (LLMs) is that query-specific\nexamples can be included in the prompt at inference time. In this work, we use\nactive learning for adaptive prompt design and call it Active In-context Prompt\nDesign (AIPD). We design the LLM prompt by adaptively choosing few-shot\nexamples from a training set to optimize performance on a test set. The\ntraining examples are initially unlabeled and we obtain the label of the most\ninformative ones, which maximally reduces uncertainty in the LLM prediction. We\npropose two algorithms, GO and SAL, which differ in how the few-shot examples\nare chosen. We analyze these algorithms in linear models: first GO and then use\nits equivalence with SAL. We experiment with many different tasks in small,\nmedium-sized, and large language models; and show that GO and SAL outperform\nother methods for choosing few-shot examples in the LLM prompt at inference\ntime.\n","authors":["Subhojyoti Mukherjee","Anusha Lalitha","Aniket Deshmukh","Ge Liu","Yifei Ma","Branislav Kveton"],"pdf_url":"https://arxiv.org/pdf/2404.08846v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.20582v1","updated":"2024-05-31T02:28:41Z","published":"2024-05-31T02:28:41Z","title":"The Point of View of a Sentiment: Towards Clinician Bias Detection in\n  Psychiatric Notes","summary":"  In psychiatry, negative patient descriptions and stigmatizing language can\ncontribute to healthcare disparities in two ways: (1) read by patients they can\nharm their trust and engagement with the medical center; (2) read by future\nproviders they may negatively influence the future perspective of a patient. By\nleveraging large language models, this work aims to identify the sentiment\nexpressed in psychiatric clinical notes based on the reader's point of view.\nExtracting sentences from the Mount Sinai Health System's large and diverse\nclinical notes, we used prompts and in-context learning to adapt three large\nlanguage models (GPT-3.5, Llama 2, Mistral) to classify the sentiment conveyed\nby the sentences according to the provider or non-provider point of view.\nResults showed that GPT-3.5 aligns best to provider point of view, whereas\nMistral aligns best to non-provider point of view.\n","authors":["Alissa A. Valentine","Lauren A. Lepow","Alexander W. Charney","Isotta Landi"],"pdf_url":"https://arxiv.org/pdf/2405.20582v1.pdf","comment":"Oral presentation at NAACL 2024 Queer in AI Workshop"},{"id":"http://arxiv.org/abs/2405.20574v1","updated":"2024-05-31T02:05:45Z","published":"2024-05-31T02:05:45Z","title":"Open Ko-LLM Leaderboard: Evaluating Large Language Models in Korean with\n  Ko-H5 Benchmark","summary":"  This paper introduces the Open Ko-LLM Leaderboard and the Ko-H5 Benchmark as\nvital tools for evaluating Large Language Models (LLMs) in Korean.\nIncorporating private test sets while mirroring the English Open LLM\nLeaderboard, we establish a robust evaluation framework that has been well\nintegrated in the Korean LLM community. We perform data leakage analysis that\nshows the benefit of private test sets along with a correlation study within\nthe Ko-H5 benchmark and temporal analyses of the Ko-H5 score. Moreover, we\npresent empirical support for the need to expand beyond set benchmarks. We hope\nthe Open Ko-LLM Leaderboard sets precedent for expanding LLM evaluation to\nfoster more linguistic diversity.\n","authors":["Chanjun Park","Hyeonwoo Kim","Dahyun Kim","Seonghwan Cho","Sanghoon Kim","Sukyung Lee","Yungi Kim","Hwalsuk Lee"],"pdf_url":"https://arxiv.org/pdf/2405.20574v1.pdf","comment":"Accepted at ACL 2024 Main"},{"id":"http://arxiv.org/abs/2405.20172v2","updated":"2024-05-31T01:59:20Z","published":"2024-05-30T15:44:27Z","title":"Iterative Feature Boosting for Explainable Speech Emotion Recognition","summary":"  In speech emotion recognition (SER), using predefined features without\nconsidering their practical importance may lead to high dimensional datasets,\nincluding redundant and irrelevant information. Consequently, high-dimensional\nlearning often results in decreasing model accuracy while increasing\ncomputational complexity. Our work underlines the importance of carefully\nconsidering and analyzing features in order to build efficient SER systems. We\npresent a new supervised SER method based on an efficient feature engineering\napproach. We pay particular attention to the explainability of results to\nevaluate feature relevance and refine feature sets. This is performed\niteratively through feature evaluation loop, using Shapley values to boost\nfeature selection and improve overall framework performance. Our approach\nallows thus to balance the benefits between model performance and transparency.\nThe proposed method outperforms human-level performance (HLP) and\nstate-of-the-art machine learning methods in emotion recognition on the TESS\ndataset.\n","authors":["Alaa Nfissi","Wassim Bouachir","Nizar Bouguila","Brian Mishara"],"pdf_url":"https://arxiv.org/pdf/2405.20172v2.pdf","comment":"Published in: 2023 International Conference on Machine Learning and\n  Applications (ICMLA)"},{"id":"http://arxiv.org/abs/2405.19325v2","updated":"2024-05-31T01:41:49Z","published":"2024-05-29T17:55:03Z","title":"Nearest Neighbor Speculative Decoding for LLM Generation and Attribution","summary":"  Large language models (LLMs) often hallucinate and lack the ability to\nprovide attribution for their generations. Semi-parametric LMs, such as kNN-LM,\napproach these limitations by refining the output of an LM for a given prompt\nusing its nearest neighbor matches in a non-parametric data store. However,\nthese models often exhibit slow inference speeds and produce non-fluent texts.\nIn this paper, we introduce Nearest Neighbor Speculative Decoding (NEST), a\nnovel semi-parametric language modeling approach that is capable of\nincorporating real-world text spans of arbitrary length into the LM generations\nand providing attribution to their sources. NEST performs token-level retrieval\nat each inference step to compute a semi-parametric mixture distribution and\nidentify promising span continuations in a corpus. It then uses an approximate\nspeculative decoding procedure that accepts a prefix of the retrieved span or\ngenerates a new token. NEST significantly enhances the generation quality and\nattribution rate of the base LM across a variety of knowledge-intensive tasks,\nsurpassing the conventional kNN-LM method and performing competitively with\nin-context retrieval augmentation. In addition, NEST substantially improves the\ngeneration speed, achieving a 1.8x speedup in inference time when applied to\nLlama-2-Chat 70B.\n","authors":["Minghan Li","Xilun Chen","Ari Holtzman","Beidi Chen","Jimmy Lin","Wen-tau Yih","Xi Victoria Lin"],"pdf_url":"https://arxiv.org/pdf/2405.19325v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.07989v2","updated":"2024-05-31T01:36:53Z","published":"2024-04-11T17:59:45Z","title":"Any2Point: Empowering Any-modality Large Models for Efficient 3D\n  Understanding","summary":"  Large foundation models have recently emerged as a prominent focus of\ninterest, attaining superior performance in widespread scenarios. Due to the\nscarcity of 3D data, many efforts have been made to adapt pre-trained\ntransformers from vision to 3D domains. However, such 2D-to-3D approaches are\nstill limited, due to the potential loss of spatial geometries and high\ncomputation cost. More importantly, their frameworks are mainly designed for 2D\nmodels, lacking a general any-to-3D paradigm. In this paper, we introduce\nAny2Point, a parameter-efficient method to empower any-modality large models\n(vision, language, audio) for 3D understanding. Given a frozen transformer from\nany source modality, we propose a 3D-to-any (1D or 2D) virtual projection\nstrategy that correlates the input 3D points to the original 1D or 2D positions\nwithin the source modality. This mechanism enables us to assign each 3D token\nwith a positional encoding paired with the pre-trained model, which avoids 3D\ngeometry loss caused by the true projection and better motivates the\ntransformer for 3D learning with 1D/2D positional priors. Then, within each\ntransformer block, we insert an any-to-3D guided adapter module for\nparameter-efficient fine-tuning. The adapter incorporates prior spatial\nknowledge from the source modality to guide the local feature aggregation of 3D\ntokens, compelling the semantic adaption of any-modality transformers. We\nconduct extensive experiments to showcase the effectiveness and efficiency of\nour method. Code and models are released at\nhttps://github.com/Ivan-Tang-3D/Any2Point.\n","authors":["Yiwen Tang","Ray Zhang","Jiaming Liu","Zoey Guo","Dong Wang","Zhigang Wang","Bin Zhao","Shanghang Zhang","Peng Gao","Hongsheng Li","Xuelong Li"],"pdf_url":"https://arxiv.org/pdf/2404.07989v2.pdf","comment":"Code and models are released at\n  https://github.com/Ivan-Tang-3D/Any2Point"},{"id":"http://arxiv.org/abs/2311.09510v3","updated":"2024-05-31T01:32:23Z","published":"2023-11-16T02:25:36Z","title":"Tailoring with Targeted Precision: Edit-Based Agents for Open-Domain\n  Procedure Customization","summary":"  How-to procedures, such as how to plant a garden, are now used by millions of\nusers, but sometimes need customizing to meet a user's specific needs, e.g.,\nplanting a garden without pesticides. Our goal is to measure and improve an\nLLM's ability to perform such customization. Our approach is to test several\nsimple multi-LLM-agent architectures for customization, as well as an\nend-to-end LLM, using a new evaluation set, called CustomPlans, of over 200\nWikiHow procedures each with a customization need. We find that a simple\narchitecture with two LLM agents used sequentially performs best, one that\nedits a generic how-to procedure and one that verifies its executability,\nsignificantly outperforming (10.5% absolute) an end-to-end prompted LLM. This\nsuggests that LLMs can be configured reasonably effectively for procedure\ncustomization. This also suggests that multi-agent editing architectures may be\nworth exploring further for other customization applications (e.g. coding,\ncreative writing) in the future.\n","authors":["Yash Kumar Lal","Li Zhang","Faeze Brahman","Bodhisattwa Prasad Majumder","Peter Clark","Niket Tandon"],"pdf_url":"https://arxiv.org/pdf/2311.09510v3.pdf","comment":"Camera ready version accepted to Findings of ACL 2024"},{"id":"http://arxiv.org/abs/2305.15255v4","updated":"2024-05-31T01:29:27Z","published":"2023-05-24T15:39:43Z","title":"Spoken Question Answering and Speech Continuation Using\n  Spectrogram-Powered LLM","summary":"  We present Spectron, a novel approach to adapting pre-trained large language\nmodels (LLMs) to perform spoken question answering (QA) and speech\ncontinuation. By endowing the LLM with a pre-trained speech encoder, our model\nbecomes able to take speech inputs and generate speech outputs. The entire\nsystem is trained end-to-end and operates directly on spectrograms, simplifying\nour architecture. Key to our approach is a training objective that jointly\nsupervises speech recognition, text continuation, and speech synthesis using\nonly paired speech-text pairs, enabling a `cross-modal' chain-of-thought within\na single decoding pass. Our method surpasses existing spoken language models in\nspeaker preservation and semantic coherence. Furthermore, the proposed model\nimproves upon direct initialization in retaining the knowledge of the original\nLLM as demonstrated through spoken QA datasets. We release our audio samples\n(https://michelleramanovich.github.io/spectron/spectron) and spoken QA dataset\n(https://github.com/google-research-datasets/LLAMA1-Test-Set).\n","authors":["Eliya Nachmani","Alon Levkovitch","Roy Hirsch","Julian Salazar","Chulayuth Asawaroengchai","Soroosh Mariooryad","Ehud Rivlin","RJ Skerry-Ryan","Michelle Tadmor Ramanovich"],"pdf_url":"https://arxiv.org/pdf/2305.15255v4.pdf","comment":"ICLR 2024 camera-ready"},{"id":"http://arxiv.org/abs/2405.19787v2","updated":"2024-05-31T01:23:41Z","published":"2024-05-30T07:54:07Z","title":"From Symbolic Tasks to Code Generation: Diversification Yields Better\n  Task Performers","summary":"  Instruction tuning -- tuning large language models on instruction-output\npairs -- is a promising technique for making models better adapted to the real\nworld. Yet, the key factors driving the model's capability to understand and\nfollow instructions not seen during training remain under-explored. Our\ninvestigation begins with a series of synthetic experiments within the\ntheoretical framework of a Turing-complete algorithm called Markov algorithm,\nwhich allows fine-grained control over the instruction-tuning data.\nGeneralization and robustness with respect to the training distribution emerge\nonce a diverse enough set of tasks is provided, even though very few examples\nare provided for each task. We extend these initial results to a real-world\napplication scenario of code generation and find that a more diverse\ninstruction set, extending beyond code-related tasks, improves the performance\nof code generation. Our observations suggest that a more diverse semantic space\nfor instruction-tuning sets greatly improves the model's ability to follow\ninstructions and perform tasks.\n","authors":["Dylan Zhang","Justin Wang","Francois Charton"],"pdf_url":"https://arxiv.org/pdf/2405.19787v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.04626v2","updated":"2024-05-31T00:12:59Z","published":"2024-03-07T16:11:43Z","title":"MedFLIP: Medical Vision-and-Language Self-supervised Fast Pre-Training\n  with Masked Autoencoder","summary":"  Within the domain of medical analysis, extensive research has explored the\npotential of mutual learning between Masked Autoencoders(MAEs) and multimodal\ndata. However, the impact of MAEs on intermodality remains a key challenge. We\nintroduce MedFLIP, a Fast Language-Image Pre-training method for Medical\nanalysis. We explore MAEs for zero-shot learning with crossed domains, which\nenhances the model's ability to learn from limited data, a common scenario in\nmedical diagnostics. We verify that masking an image does not affect\ninter-modal learning. Furthermore, we propose the SVD loss to enhance the\nrepresentation learning for characteristics of medical images, aiming to\nimprove classification accuracy by leveraging the structural intricacies of\nsuch data. Our theory posits that masking encourages semantic preservation,\nrobust feature extraction, regularization, domain adaptation, and invariance\nlearning. Lastly, we validate using language will improve the zero-shot\nperformance for the medical image analysis. MedFLIP's scaling of the masking\nprocess marks an advancement in the field, offering a pathway to rapid and\nprecise medical image analysis without the traditional computational\nbottlenecks. Through experiments and validation, MedFLIP demonstrates efficient\nperformance improvements, helps for future research and application in medical\ndiagnostics.\n","authors":["Lei Li","Tianfang Zhang","Xinglin Zhang","Jiaqi Liu","Bingqi Ma","Yan Luo","Tao Chen"],"pdf_url":"https://arxiv.org/pdf/2403.04626v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.17509v2","updated":"2024-05-31T00:09:56Z","published":"2024-02-27T13:49:12Z","title":"Extreme Miscalibration and the Illusion of Adversarial Robustness","summary":"  Deep learning-based Natural Language Processing (NLP) models are vulnerable\nto adversarial attacks, where small perturbations can cause a model to\nmisclassify. Adversarial Training (AT) is often used to increase model\nrobustness. However, we have discovered an intriguing phenomenon: deliberately\nor accidentally miscalibrating models masks gradients in a way that interferes\nwith adversarial attack search methods, giving rise to an apparent increase in\nrobustness. We show that this observed gain in robustness is an illusion of\nrobustness (IOR), and demonstrate how an adversary can perform various forms of\ntest-time temperature calibration to nullify the aforementioned interference\nand allow the adversarial attack to find adversarial examples. Hence, we urge\nthe NLP community to incorporate test-time temperature scaling into their\nrobustness evaluations to ensure that any observed gains are genuine. Finally,\nwe show how the temperature can be scaled during \\textit{training} to improve\ngenuine robustness.\n","authors":["Vyas Raina","Samson Tan","Volkan Cevher","Aditya Rawal","Sheng Zha","George Karypis"],"pdf_url":"https://arxiv.org/pdf/2402.17509v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.16367v2","updated":"2024-05-31T23:47:15Z","published":"2024-04-25T07:10:29Z","title":"Learning Syntax Without Planting Trees: Understanding When and Why\n  Transformers Generalize Hierarchically","summary":"  Transformers trained on natural language data have been shown to learn its\nhierarchical structure and generalize to sentences with unseen syntactic\nstructures without explicitly encoding any structural bias. In this work, we\ninvestigate sources of inductive bias in transformer models and their training\nthat could cause such generalization behavior to emerge. We extensively\nexperiment with transformer models trained on multiple synthetic datasets and\nwith different training objectives and show that while other objectives e.g.\nsequence-to-sequence modeling, prefix language modeling, often failed to lead\nto hierarchical generalization, models trained with the language modeling\nobjective consistently learned to generalize hierarchically. We then conduct\npruning experiments to study how transformers trained with the language\nmodeling objective encode hierarchical structure. When pruned, we find joint\nexistence of subnetworks within the model with different generalization\nbehaviors (subnetworks corresponding to hierarchical structure and linear\norder). Finally, we take a Bayesian perspective to further uncover\ntransformers' preference for hierarchical generalization: We establish a\ncorrelation between whether transformers generalize hierarchically on a dataset\nand whether the simplest explanation of that dataset is provided by a\nhierarchical grammar compared to regular grammars exhibiting linear\ngeneralization.\n","authors":["Kabir Ahuja","Vidhisha Balachandran","Madhur Panwar","Tianxing He","Noah A. Smith","Navin Goyal","Yulia Tsvetkov"],"pdf_url":"https://arxiv.org/pdf/2404.16367v2.pdf","comment":"Code now available: https://github.com/kabirahuja2431/transformers-hg"},{"id":"http://arxiv.org/abs/2310.12516v2","updated":"2024-05-31T23:46:24Z","published":"2023-10-19T06:37:32Z","title":"ReEval: Automatic Hallucination Evaluation for Retrieval-Augmented Large\n  Language Models via Transferable Adversarial Attacks","summary":"  Despite remarkable advancements in mitigating hallucinations in large\nlanguage models (LLMs) by retrieval augmentation, it remains challenging to\nmeasure the reliability of LLMs using static question-answering (QA) data.\nSpecifically, given the potential of data contamination (e.g., leading to\nmemorization), good static benchmark performance does not ensure that model can\nreliably use the provided evidence for responding, which is essential to avoid\nhallucination when the required knowledge is new or private. Inspired by\nadversarial machine learning, we investigate the feasibility of automatically\nperturbing existing static one for dynamic evaluation. Specifically, this paper\npresents ReEval, an LLM-based framework using prompt chaining to perturb the\noriginal evidence for generating new test cases for evaluating the LLMs'\nreliability in using new evidence for answering.\n  We implement ReEval using ChatGPT and evaluate the resulting variants of two\npopular open-domain QA datasets on a collection of LLMs under various prompting\nsettings. Our generated data is human-readable and useful to trigger\nhallucination in LLM. Accurate models on static data are observed to produce\nunsupported answers from the perturbed evidence, with pronounced accuracy drops\nacross LLMs including GPT-4. We find that our adversarial examples are\ntransferable across all considered LLMs. The examples generated by a small\nmodel can be used to evaluate a much larger model, making our approach\ncost-effective.\n","authors":["Xiaodong Yu","Hao Cheng","Xiaodong Liu","Dan Roth","Jianfeng Gao"],"pdf_url":"https://arxiv.org/pdf/2310.12516v2.pdf","comment":"NAACL 2024 Findings"},{"id":"http://arxiv.org/abs/2405.16969v3","updated":"2024-05-31T23:15:55Z","published":"2024-05-27T09:06:24Z","title":"The Multi-Range Theory of Translation Quality Measurement: MQM scoring\n  models and Statistical Quality Control","summary":"  The year 2024 marks the 10th anniversary of the Multidimensional Quality\nMetrics (MQM) framework for analytic translation quality evaluation. The MQM\nerror typology has been widely used by practitioners in the translation and\nlocalization industry and has served as the basis for many derivative projects.\nThe annual Conference on Machine Translation (WMT) shared tasks on both human\nand automatic translation quality evaluations used the MQM error typology.\n  The metric stands on two pillars: error typology and the scoring model. The\nscoring model calculates the quality score from annotation data, detailing how\nto convert error type and severity counts into numeric scores to determine if\nthe content meets specifications. Previously, only the raw scoring model had\nbeen published. This April, the MQM Council published the Linear Calibrated\nScoring Model, officially presented herein, along with the Non-Linear Scoring\nModel, which had not been published before.\n  This paper details the latest MQM developments and presents a universal\napproach to translation quality measurement across three sample size ranges. It\nalso explains why Statistical Quality Control should be used for very small\nsample sizes, starting from a single sentence.\n","authors":["Arle Lommel","Serge Gladkoff","Alan Melby","Sue Ellen Wright","Ingemar Strandvik","Katerina Gasova","Angelika Vaasa","Andy Benzo","Romina Marazzato Sparano","Monica Foresi","Johani Innis","Lifeng Han","Goran Nenadic"],"pdf_url":"https://arxiv.org/pdf/2405.16969v3.pdf","comment":"working paper, 20 pages"},{"id":"http://arxiv.org/abs/2308.06795v2","updated":"2024-05-31T22:41:54Z","published":"2023-08-13T15:44:39Z","title":"Robust Infidelity: When Faithfulness Measures on Masked Language Models\n  Are Misleading","summary":"  A common approach to quantifying neural text classifier interpretability is\nto calculate faithfulness metrics based on iteratively masking salient input\ntokens and measuring changes in the model prediction. We propose that this\nproperty is better described as \"sensitivity to iterative masking\", and\nhighlight pitfalls in using this measure for comparing text classifier\ninterpretability. We show that iterative masking produces large variation in\nfaithfulness scores between otherwise comparable Transformer encoder text\nclassifiers. We then demonstrate that iteratively masked samples produce\nembeddings outside the distribution seen during training, resulting in\nunpredictable behaviour. We further explore task-specific considerations that\nundermine principled comparison of interpretability using iterative masking,\nsuch as an underlying similarity to salience-based adversarial attacks. Our\nfindings give insight into how these behaviours affect neural text classifiers,\nand provide guidance on how sensitivity to iterative masking should be\ninterpreted.\n","authors":["Evan Crothers","Herna Viktor","Nathalie Japkowicz"],"pdf_url":"https://arxiv.org/pdf/2308.06795v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.17284v2","updated":"2024-05-31T21:30:44Z","published":"2024-05-27T15:47:46Z","title":"An NLP Crosswalk Between the Common Core State Standards and NAEP Item\n  Specifications","summary":"  Natural language processing (NLP) is rapidly developing for applications in\neducational assessment. In this paper, I describe an NLP-based procedure that\ncan be used to support subject matter experts in establishing a crosswalk\nbetween item specifications and content standards. This paper extends recent\nwork by proposing and demonstrating the use of multivariate similarity based on\nembedding vectors for sentences or texts. In particular, a hybrid regression\nprocedure is demonstrated for establishing the match of each content standard\nto multiple item specifications. The procedure is used to evaluate the match of\nthe Common Core State Standards (CCSS) for mathematics at grade 4 to the\ncorresponding item specifications for the 2026 National Assessment of\nEducational Progress (NAEP).\n","authors":["Gregory Camilli"],"pdf_url":"https://arxiv.org/pdf/2405.17284v2.pdf","comment":"Deleted repeated sections. Corrected proper nouns. Corrected type in\n  CCSS sentences"},{"id":"http://arxiv.org/abs/2310.17086v2","updated":"2024-05-31T20:37:54Z","published":"2023-10-26T01:08:47Z","title":"Transformers Learn Higher-Order Optimization Methods for In-Context\n  Learning: A Study with Linear Models","summary":"  Transformers excel at in-context learning (ICL) -- learning from\ndemonstrations without parameter updates -- but how they do so remains a\nmystery. Recent work suggests that Transformers may internally run Gradient\nDescent (GD), a first-order optimization method, to perform ICL. In this paper,\nwe instead demonstrate that Transformers learn to approximate higher-order\noptimization methods for ICL. For in-context linear regression, Transformers\nshare a similar convergence rate as Iterative Newton's Method; both are\nexponentially faster than GD. Empirically, predictions from successive\nTransformer layers closely match different iterations of Newton's Method\nlinearly, with each middle layer roughly computing 3 iterations; thus,\nTransformers and Newton's method converge at roughly the same rate. In\ncontrast, Gradient Descent converges exponentially more slowly. We also show\nthat Transformers can learn in-context on ill-conditioned data, a setting where\nGradient Descent struggles but Iterative Newton succeeds. Finally, to\ncorroborate our empirical findings, we prove that Transformers can implement\n$k$ iterations of Newton's method with $k + \\mathcal{O}(1)$ layers.\n","authors":["Deqing Fu","Tian-Qi Chen","Robin Jia","Vatsal Sharan"],"pdf_url":"https://arxiv.org/pdf/2310.17086v2.pdf","comment":null}],"Computer Vision and Pattern Recognition":[{"id":"http://arxiv.org/abs/2405.21075v1","updated":"2024-05-31T17:59:47Z","published":"2024-05-31T17:59:47Z","title":"Video-MME: The First-Ever Comprehensive Evaluation Benchmark of\n  Multi-modal LLMs in Video Analysis","summary":"  In the quest for artificial general intelligence, Multi-modal Large Language\nModels (MLLMs) have emerged as a focal point in recent advancements. However,\nthe predominant focus remains on developing their capabilities in static image\nunderstanding. The potential of MLLMs in processing sequential visual data is\nstill insufficiently explored, highlighting the absence of a comprehensive,\nhigh-quality assessment of their performance. In this paper, we introduce\nVideo-MME, the first-ever full-spectrum, Multi-Modal Evaluation benchmark of\nMLLMs in Video analysis. Our work distinguishes from existing benchmarks\nthrough four key features: 1) Diversity in video types, spanning 6 primary\nvisual domains with 30 subfields to ensure broad scenario generalizability; 2)\nDuration in temporal dimension, encompassing both short-, medium-, and\nlong-term videos, ranging from 11 seconds to 1 hour, for robust contextual\ndynamics; 3) Breadth in data modalities, integrating multi-modal inputs besides\nvideo frames, including subtitles and audios, to unveil the all-round\ncapabilities of MLLMs; 4) Quality in annotations, utilizing rigorous manual\nlabeling by expert annotators to facilitate precise and reliable model\nassessment. 900 videos with a total of 256 hours are manually selected and\nannotated by repeatedly viewing all the video content, resulting in 2,700\nquestion-answer pairs. With Video-MME, we extensively evaluate various\nstate-of-the-art MLLMs, including GPT-4 series and Gemini 1.5 Pro, as well as\nopen-source image models like InternVL-Chat-V1.5 and video models like\nLLaVA-NeXT-Video. Our experiments reveal that Gemini 1.5 Pro is the\nbest-performing commercial model, significantly outperforming the open-source\nmodels. Our dataset along with these findings underscores the need for further\nimprovements in handling longer sequences and multi-modal data. Project Page:\nhttps://video-mme.github.io\n","authors":["Chaoyou Fu","Yuhan Dai","Yondong Luo","Lei Li","Shuhuai Ren","Renrui Zhang","Zihan Wang","Chenyu Zhou","Yunhang Shen","Mengdan Zhang","Peixian Chen","Yanwei Li","Shaohui Lin","Sirui Zhao","Ke Li","Tong Xu","Xiawu Zheng","Enhong Chen","Rongrong Ji","Xing Sun"],"pdf_url":"https://arxiv.org/pdf/2405.21075v1.pdf","comment":"Project Page: https://video-mme.github.io"},{"id":"http://arxiv.org/abs/2405.21074v1","updated":"2024-05-31T17:59:12Z","published":"2024-05-31T17:59:12Z","title":"Latent Intrinsics Emerge from Training to Relight","summary":"  Image relighting is the task of showing what a scene from a source image\nwould look like if illuminated differently. Inverse graphics schemes recover an\nexplicit representation of geometry and a set of chosen intrinsics, then\nrelight with some form of renderer. However error control for inverse graphics\nis difficult, and inverse graphics methods can represent only the effects of\nthe chosen intrinsics. This paper describes a relighting method that is\nentirely data-driven, where intrinsics and lighting are each represented as\nlatent variables. Our approach produces SOTA relightings of real scenes, as\nmeasured by standard metrics. We show that albedo can be recovered from our\nlatent intrinsics without using any example albedos, and that the albedos\nrecovered are competitive with SOTA methods.\n","authors":["Xiao Zhang","William Gao","Seemandhar Jain","Michael Maire","David. A. Forsyth","Anand Bhattad"],"pdf_url":"https://arxiv.org/pdf/2405.21074v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.21070v1","updated":"2024-05-31T17:57:24Z","published":"2024-05-31T17:57:24Z","title":"Generalization Beyond Data Imbalance: A Controlled Study on CLIP for\n  Transferable Insights","summary":"  Severe data imbalance naturally exists among web-scale vision-language\ndatasets. Despite this, we find CLIP pre-trained thereupon exhibits notable\nrobustness to the data imbalance compared to supervised learning, and\ndemonstrates significant effectiveness in learning generalizable\nrepresentations. With an aim to investigate the reasons behind this finding, we\nconduct controlled experiments to study various underlying factors, and reveal\nthat CLIP's pretext task forms a dynamic classification problem wherein only a\nsubset of classes is present in training. This isolates the bias from dominant\nclasses and implicitly balances the learning signal. Furthermore, the\nrobustness and discriminability of CLIP improve with more descriptive language\nsupervision, larger data scale, and broader open-world concepts, which are\ninaccessible to supervised learning. Our study not only uncovers the mechanisms\nbehind CLIP's generalizability beyond data imbalance but also provides\ntransferable insights for the research community. The findings are validated in\nboth supervised and self-supervised learning, enabling models trained on\nimbalanced data to achieve CLIP-level performance on diverse recognition tasks.\nCode will be available at: https://github.com/CVMI-Lab/clip-beyond-tail.\n","authors":["Xin Wen","Bingchen Zhao","Yilun Chen","Jiangmiao Pang","Xiaojuan Qi"],"pdf_url":"https://arxiv.org/pdf/2405.21070v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.21066v1","updated":"2024-05-31T17:54:52Z","published":"2024-05-31T17:54:52Z","title":"Mixed Diffusion for 3D Indoor Scene Synthesis","summary":"  Realistic conditional 3D scene synthesis significantly enhances and\naccelerates the creation of virtual environments, which can also provide\nextensive training data for computer vision and robotics research among other\napplications. Diffusion models have shown great performance in related\napplications, e.g., making precise arrangements of unordered sets. However,\nthese models have not been fully explored in floor-conditioned scene synthesis\nproblems. We present MiDiffusion, a novel mixed discrete-continuous diffusion\nmodel architecture, designed to synthesize plausible 3D indoor scenes from\ngiven room types, floor plans, and potentially pre-existing objects. We\nrepresent a scene layout by a 2D floor plan and a set of objects, each defined\nby its category, location, size, and orientation. Our approach uniquely\nimplements structured corruption across the mixed discrete semantic and\ncontinuous geometric domains, resulting in a better conditioned problem for the\nreverse denoising step. We evaluate our approach on the 3D-FRONT dataset. Our\nexperimental results demonstrate that MiDiffusion substantially outperforms\nstate-of-the-art autoregressive and diffusion models in floor-conditioned 3D\nscene synthesis. In addition, our models can handle partial object constraints\nvia a corruption-and-masking strategy without task specific training. We show\nMiDiffusion maintains clear advantages over existing approaches in scene\ncompletion and furniture arrangement experiments.\n","authors":["Siyi Hu","Diego Martin Arroyo","Stephanie Debats","Fabian Manhardt","Luca Carlone","Federico Tombari"],"pdf_url":"https://arxiv.org/pdf/2405.21066v1.pdf","comment":"19 pages, 14 figures. Under review. Code to be released at:\n  https://github.com/MIT-SPARK/MiDiffusion"},{"id":"http://arxiv.org/abs/2405.21059v1","updated":"2024-05-31T17:49:51Z","published":"2024-05-31T17:49:51Z","title":"Unified Directly Denoising for Both Variance Preserving and Variance\n  Exploding Diffusion Models","summary":"  Previous work has demonstrated that, in the Variance Preserving (VP)\nscenario, the nascent Directly Denoising Diffusion Models (DDDM) can generate\nhigh-quality images in one step while achieving even better performance in\nmultistep sampling. However, the Pseudo-LPIPS loss used in DDDM leads to\nconcerns about the bias in assessment. Here, we propose a unified DDDM (uDDDM)\nframework that generates images in one-step/multiple steps for both Variance\nPreserving (VP) and Variance Exploding (VE) cases. We provide theoretical\nproofs of the existence and uniqueness of the model's solution paths, as well\nas the non-intersecting property of the sampling paths. Additionally, we\npropose an adaptive Pseudo-Huber loss function to balance the convergence to\nthe true solution and the stability of convergence process.Through a\ncomprehensive evaluation, we demonstrate that uDDDMs achieve FID scores\ncomparable to the best-performing methods available for CIFAR-10 in both VP and\nVE. Specifically, uDDDM achieves one-step generation on CIFAR10 with FID of\n2.63 and 2.53 for VE and VP respectively. By extending the sampling to 1000\nsteps, we further reduce FID score to 1.71 and 1.65 for VE and VP respectively,\nsetting state-of-the-art performance in both cases.\n","authors":["Jingjing Wang","Dan Zhang","Feng Luo"],"pdf_url":"https://arxiv.org/pdf/2405.21059v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.21056v1","updated":"2024-05-31T17:47:22Z","published":"2024-05-31T17:47:22Z","title":"An Organic Weed Control Prototype using Directed Energy and Deep\n  Learning","summary":"  Organic weed control is a vital to improve crop yield with a sustainable\napproach. In this work, a directed energy weed control robot prototype\nspecifically designed for organic farms is proposed. The robot uses a novel\ndistributed array robot (DAR) unit for weed treatment. Soybean and corn\ndatabases are built to train deep learning neural nets to perform weed\nrecognition. The initial deep learning neural nets show a high performance in\nclassifying crops. The robot uses a patented directed energy plant eradication\nrecipe that is completely organic and UV-C free, with no chemical damage or\nphysical disturbance to the soil. The deep learning can classify 8 common weed\nspecies in a soybean field under natural environment with up to 98% accuracy.\n","authors":["Deng Cao","Hongbo Zhang","Rajveer Dhillon"],"pdf_url":"https://arxiv.org/pdf/2405.21056v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.21050v1","updated":"2024-05-31T17:43:35Z","published":"2024-05-31T17:43:35Z","title":"Spectrum-Aware Parameter Efficient Fine-Tuning for Diffusion Models","summary":"  Adapting large-scale pre-trained generative models in a parameter-efficient\nmanner is gaining traction. Traditional methods like low rank adaptation\nachieve parameter efficiency by imposing constraints but may not be optimal for\ntasks requiring high representation capacity. We propose a novel spectrum-aware\nadaptation framework for generative models. Our method adjusts both singular\nvalues and their basis vectors of pretrained weights. Using the Kronecker\nproduct and efficient Stiefel optimizers, we achieve parameter-efficient\nadaptation of orthogonal matrices. We introduce Spectral Orthogonal\nDecomposition Adaptation (SODA), which balances computational efficiency and\nrepresentation capacity. Extensive evaluations on text-to-image diffusion\nmodels demonstrate SODA's effectiveness, offering a spectrum-aware alternative\nto existing fine-tuning methods.\n","authors":["Xinxi Zhang","Song Wen","Ligong Han","Felix Juefei-Xu","Akash Srivastava","Junzhou Huang","Hao Wang","Molei Tao","Dimitris N. Metaxas"],"pdf_url":"https://arxiv.org/pdf/2405.21050v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.21048v1","updated":"2024-05-31T17:41:11Z","published":"2024-05-31T17:41:11Z","title":"Kaleido Diffusion: Improving Conditional Diffusion Models with\n  Autoregressive Latent Modeling","summary":"  Diffusion models have emerged as a powerful tool for generating high-quality\nimages from textual descriptions. Despite their successes, these models often\nexhibit limited diversity in the sampled images, particularly when sampling\nwith a high classifier-free guidance weight. To address this issue, we present\nKaleido, a novel approach that enhances the diversity of samples by\nincorporating autoregressive latent priors. Kaleido integrates an\nautoregressive language model that encodes the original caption and generates\nlatent variables, serving as abstract and intermediary representations for\nguiding and facilitating the image generation process. In this paper, we\nexplore a variety of discrete latent representations, including textual\ndescriptions, detection bounding boxes, object blobs, and visual tokens. These\nrepresentations diversify and enrich the input conditions to the diffusion\nmodels, enabling more diverse outputs. Our experimental results demonstrate\nthat Kaleido effectively broadens the diversity of the generated image samples\nfrom a given textual description while maintaining high image quality.\nFurthermore, we show that Kaleido adheres closely to the guidance provided by\nthe generated latent variables, demonstrating its capability to effectively\ncontrol and direct the image generation process.\n","authors":["Jiatao Gu","Ying Shen","Shuangfei Zhai","Yizhe Zhang","Navdeep Jaitly","Joshua M. Susskind"],"pdf_url":"https://arxiv.org/pdf/2405.21048v1.pdf","comment":"22 pages, 14 figures"},{"id":"http://arxiv.org/abs/2402.11058v2","updated":"2024-05-31T17:30:13Z","published":"2024-02-16T20:14:47Z","title":"II-MMR: Identifying and Improving Multi-modal Multi-hop Reasoning in\n  Visual Question Answering","summary":"  Visual Question Answering (VQA) often involves diverse reasoning scenarios\nacross Vision and Language (V&L). Most prior VQA studies, however, have merely\nfocused on assessing the model's overall accuracy without evaluating it on\ndifferent reasoning cases. Furthermore, some recent works observe that\nconventional Chain-of-Thought (CoT) prompting fails to generate effective\nreasoning for VQA, especially for complex scenarios requiring multi-hop\nreasoning. In this paper, we propose II-MMR, a novel idea to identify and\nimprove multi-modal multi-hop reasoning in VQA. In specific, II-MMR takes a VQA\nquestion with an image and finds a reasoning path to reach its answer using two\nnovel language promptings: (i) answer prediction-guided CoT prompt, or (ii)\nknowledge triplet-guided prompt. II-MMR then analyzes this path to identify\ndifferent reasoning cases in current VQA benchmarks by estimating how many hops\nand what types (i.e., visual or beyond-visual) of reasoning are required to\nanswer the question. On popular benchmarks including GQA and A-OKVQA, II-MMR\nobserves that most of their VQA questions are easy to answer, simply demanding\n\"single-hop\" reasoning, whereas only a few questions require \"multi-hop\"\nreasoning. Moreover, while the recent V&L model struggles with such complex\nmulti-hop reasoning questions even using the traditional CoT method, II-MMR\nshows its effectiveness across all reasoning cases in both zero-shot and\nfine-tuning settings.\n","authors":["Jihyung Kil","Farideh Tavazoee","Dongyeop Kang","Joo-Kyung Kim"],"pdf_url":"https://arxiv.org/pdf/2402.11058v2.pdf","comment":"Accepted to ACL 2024 Findings"},{"id":"http://arxiv.org/abs/2405.21022v1","updated":"2024-05-31T17:09:16Z","published":"2024-05-31T17:09:16Z","title":"You Only Scan Once: Efficient Multi-dimension Sequential Modeling with\n  LightNet","summary":"  Linear attention mechanisms have gained prominence in causal language models\ndue to their linear computational complexity and enhanced speed. However, the\ninherent decay mechanism in linear attention presents challenges when applied\nto multi-dimensional sequence modeling tasks, such as image processing and\nmulti-modal learning. In these scenarios, the utilization of sequential\nscanning to establish a global receptive field necessitates multiple scans for\nmulti-dimensional data, thereby leading to inefficiencies. This paper\nidentifies the inefficiency caused by a multiplicative linear recurrence and\nproposes an efficient alternative additive linear recurrence to avoid the\nissue, as it can handle multi-dimensional data within a single scan. We further\ndevelop an efficient multi-dimensional sequential modeling framework called\nLightNet based on the new recurrence. Moreover, we present two new\nmulti-dimensional linear relative positional encoding methods, MD-TPE and\nMD-LRPE to enhance the model's ability to discern positional information in\nmulti-dimensional scenarios. Our empirical evaluations across various tasks,\nincluding image classification, image generation, bidirectional language\nmodeling, and autoregressive language modeling, demonstrate the efficacy of\nLightNet, showcasing its potential as a versatile and efficient solution for\nmulti-dimensional sequential modeling.\n","authors":["Zhen Qin","Yuxin Mao","Xuyang Shen","Dong Li","Jing Zhang","Yuchao Dai","Yiran Zhong"],"pdf_url":"https://arxiv.org/pdf/2405.21022v1.pdf","comment":"Technical report. Yiran Zhong is the corresponding author. The code\n  is available at https://github.com/OpenNLPLab/LightNet"},{"id":"http://arxiv.org/abs/2405.21016v1","updated":"2024-05-31T17:05:59Z","published":"2024-05-31T17:05:59Z","title":"MpoxSLDNet: A Novel CNN Model for Detecting Monkeypox Lesions and\n  Performance Comparison with Pre-trained Models","summary":"  Monkeypox virus (MPXV) is a zoonotic virus that poses a significant threat to\npublic health, particularly in remote parts of Central and West Africa. Early\ndetection of monkeypox lesions is crucial for effective treatment. However, due\nto its similarity with other skin diseases, monkeypox lesion detection is a\nchallenging task. To detect monkeypox, many researchers used various\ndeep-learning models such as MobileNetv2, VGG16, ResNet50, InceptionV3,\nDenseNet121, EfficientNetB3, MobileNetV2, and Xception. However, these models\noften require high storage space due to their large size. This study aims to\nimprove the existing challenges by introducing a CNN model named MpoxSLDNet\n(Monkeypox Skin Lesion Detector Network) to facilitate early detection and\ncategorization of Monkeypox lesions and Non-Monkeypox lesions in digital\nimages. Our model represents a significant advancement in the field of\nmonkeypox lesion detection by offering superior performance metrics, including\nprecision, recall, F1-score, accuracy, and AUC, compared to traditional\npre-trained models such as VGG16, ResNet50, and DenseNet121. The key novelty of\nour approach lies in MpoxSLDNet's ability to achieve high detection accuracy\nwhile requiring significantly less storage space than existing models. By\naddressing the challenge of high storage requirements, MpoxSLDNet presents a\npractical solution for early detection and categorization of monkeypox lesions\nin resource-constrained healthcare settings. In this study, we have used\n\"Monkeypox Skin Lesion Dataset\" comprising 1428 skin images of monkeypox\nlesions and 1764 skin images of Non-Monkeypox lesions. Dataset's limitations\ncould potentially impact the model's ability to generalize to unseen cases.\nHowever, the MpoxSLDNet model achieved a validation accuracy of 94.56%,\ncompared to 86.25%, 84.38%, and 67.19% for VGG16, DenseNet121, and ResNet50,\nrespectively.\n","authors":["Fatema Jannat Dihan","Saydul Akbar Murad","Abu Jafar Md Muzahid","K. M. Aslam Uddin","Mohammed J. F. Alenazi","Anupam Kumar Bairagi","Sujit Biswas"],"pdf_url":"https://arxiv.org/pdf/2405.21016v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.18418v2","updated":"2024-05-31T17:03:00Z","published":"2024-05-28T17:57:23Z","title":"Hierarchical World Models as Visual Whole-Body Humanoid Controllers","summary":"  Whole-body control for humanoids is challenging due to the high-dimensional\nnature of the problem, coupled with the inherent instability of a bipedal\nmorphology. Learning from visual observations further exacerbates this\ndifficulty. In this work, we explore highly data-driven approaches to visual\nwhole-body humanoid control based on reinforcement learning, without any\nsimplifying assumptions, reward design, or skill primitives. Specifically, we\npropose a hierarchical world model in which a high-level agent generates\ncommands based on visual observations for a low-level agent to execute, both of\nwhich are trained with rewards. Our approach produces highly performant control\npolicies in 8 tasks with a simulated 56-DoF humanoid, while synthesizing\nmotions that are broadly preferred by humans. Code and videos:\nhttps://nicklashansen.com/rlpuppeteer\n","authors":["Nicklas Hansen","Jyothir S V","Vlad Sobal","Yann LeCun","Xiaolong Wang","Hao Su"],"pdf_url":"https://arxiv.org/pdf/2405.18418v2.pdf","comment":"Code and videos at https://nicklashansen.com/rlpuppeteer"},{"id":"http://arxiv.org/abs/2405.00515v3","updated":"2024-05-31T16:55:20Z","published":"2024-05-01T13:51:39Z","title":"GAD-Generative Learning for HD Map-Free Autonomous Driving","summary":"  Deep-learning-based techniques have been widely adopted for autonomous\ndriving software stacks for mass production in recent years, focusing primarily\non perception modules, with some work extending this method to prediction\nmodules. However, the downstream planning and control modules are still\ndesigned with hefty handcrafted rules, dominated by optimization-based methods\nsuch as quadratic programming or model predictive control. This results in a\nperformance bottleneck for autonomous driving systems in that corner cases\nsimply cannot be solved by enumerating hand-crafted rules. We present a\ndeep-learning-based approach that brings prediction, decision, and planning\nmodules together with the attempt to overcome the rule-based methods'\ndeficiency in real-world applications of autonomous driving, especially for\nurban scenes. The DNN model we proposed is solely trained with 10 hours of\nhuman driver data, and it supports all mass-production ADAS features available\non the market to date. This method is deployed onto a Jiyue test car with no\nmodification to its factory-ready sensor set and compute platform. the\nfeasibility, usability, and commercial potential are demonstrated in this\narticle.\n","authors":["Weijian Sun","Yanbo Jia","Qi Zeng","Zihao Liu","Jiang Liao","Yue Li","Xianfeng Li"],"pdf_url":"https://arxiv.org/pdf/2405.00515v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.21013v1","updated":"2024-05-31T16:55:04Z","published":"2024-05-31T16:55:04Z","title":"StrucTexTv3: An Efficient Vision-Language Model for Text-rich Image\n  Perception, Comprehension, and Beyond","summary":"  Text-rich images have significant and extensive value, deeply integrated into\nvarious aspects of human life. Notably, both visual cues and linguistic symbols\nin text-rich images play crucial roles in information transmission but are\naccompanied by diverse challenges. Therefore, the efficient and effective\nunderstanding of text-rich images is a crucial litmus test for the capability\nof Vision-Language Models. We have crafted an efficient vision-language model,\nStrucTexTv3, tailored to tackle various intelligent tasks for text-rich images.\nThe significant design of StrucTexTv3 is presented in the following aspects:\nFirstly, we adopt a combination of an effective multi-scale reduced visual\ntransformer and a multi-granularity token sampler (MG-Sampler) as a visual\ntoken generator, successfully solving the challenges of high-resolution input\nand complex representation learning for text-rich images. Secondly, we enhance\nthe perception and comprehension abilities of StrucTexTv3 through instruction\nlearning, seamlessly integrating various text-oriented tasks into a unified\nframework. Thirdly, we have curated a comprehensive collection of high-quality\ntext-rich images, abbreviated as TIM-30M, encompassing diverse scenarios like\nincidental scenes, office documents, web pages, and screenshots, thereby\nimproving the robustness of our model. Our method achieved SOTA results in\ntext-rich image perception tasks, and significantly improved performance in\ncomprehension tasks. Among multimodal models with LLM decoder of approximately\n1.8B parameters, it stands out as a leader, which also makes the deployment of\nedge devices feasible. In summary, the StrucTexTv3 model, featuring efficient\nstructural design, outstanding performance, and broad adaptability, offers\nrobust support for diverse intelligent application tasks involving text-rich\nimages, thus exhibiting immense potential for widespread application.\n","authors":["Pengyuan Lyu","Yulin Li","Hao Zhou","Weihong Ma","Xingyu Wan","Qunyi Xie","Liang Wu","Chengquan Zhang","Kun Yao","Errui Ding","Jingdong Wang"],"pdf_url":"https://arxiv.org/pdf/2405.21013v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.14622v3","updated":"2024-05-31T16:37:53Z","published":"2024-05-23T14:30:33Z","title":"Calibrated Self-Rewarding Vision Language Models","summary":"  Large Vision-Language Models (LVLMs) have made substantial progress by\nintegrating pre-trained large language models (LLMs) and vision models through\ninstruction tuning. Despite these advancements, LVLMs often exhibit the\nhallucination phenomenon, where generated text responses appear linguistically\nplausible but contradict the input image, indicating a misalignment between\nimage and text pairs. This misalignment arises because the model tends to\nprioritize textual information over visual input, even when both the language\nmodel and visual representations are of high quality. Existing methods leverage\nadditional models or human annotations to curate preference data and enhance\nmodality alignment through preference optimization. These approaches may not\neffectively reflect the target LVLM's preferences, making the curated\npreferences easily distinguishable. Our work addresses these challenges by\nproposing the Calibrated Self-Rewarding (CSR) approach, which enables the model\nto self-improve by iteratively generating candidate responses, evaluating the\nreward for each response, and curating preference data for fine-tuning. In the\nreward modeling, we employ a step-wise strategy and incorporate visual\nconstraints into the self-rewarding process to place greater emphasis on visual\ninput. Empirical results demonstrate that CSR enhances performance and reduces\nhallucinations across ten benchmarks and tasks, achieving substantial\nimprovements over existing methods by 7.62%. Our empirical results are further\nsupported by rigorous theoretical analysis, under mild assumptions, verifying\nthe effectiveness of introducing visual constraints into the self-rewarding\nparadigm. Additionally, CSR shows compatibility with different vision-language\nmodels and the ability to incrementally improve performance through iterative\nfine-tuning. Our data and code are available at\nhttps://github.com/YiyangZhou/CSR.\n","authors":["Yiyang Zhou","Zhiyuan Fan","Dongjie Cheng","Sihan Yang","Zhaorun Chen","Chenhang Cui","Xiyao Wang","Yun Li","Linjun Zhang","Huaxiu Yao"],"pdf_url":"https://arxiv.org/pdf/2405.14622v3.pdf","comment":"fix some typos and add acknowledgement section in V3"},{"id":"http://arxiv.org/abs/2405.20991v1","updated":"2024-05-31T16:35:41Z","published":"2024-05-31T16:35:41Z","title":"Hard Cases Detection in Motion Prediction by Vision-Language Foundation\n  Models","summary":"  Addressing hard cases in autonomous driving, such as anomalous road users,\nextreme weather conditions, and complex traffic interactions, presents\nsignificant challenges. To ensure safety, it is crucial to detect and manage\nthese scenarios effectively for autonomous driving systems. However, the rarity\nand high-risk nature of these cases demand extensive, diverse datasets for\ntraining robust models. Vision-Language Foundation Models (VLMs) have shown\nremarkable zero-shot capabilities as being trained on extensive datasets. This\nwork explores the potential of VLMs in detecting hard cases in autonomous\ndriving. We demonstrate the capability of VLMs such as GPT-4v in detecting hard\ncases in traffic participant motion prediction on both agent and scenario\nlevels. We introduce a feasible pipeline where VLMs, fed with sequential image\nframes with designed prompts, effectively identify challenging agents or\nscenarios, which are verified by existing prediction models. Moreover, by\ntaking advantage of this detection of hard cases by VLMs, we further improve\nthe training efficiency of the existing motion prediction pipeline by\nperforming data selection for the training samples suggested by GPT. We show\nthe effectiveness and feasibility of our pipeline incorporating VLMs with\nstate-of-the-art methods on NuScenes datasets. The code is accessible at\nhttps://github.com/KTH-RPL/Detect_VLM.\n","authors":["Yi Yang","Qingwen Zhang","Kei Ikemura","Nazre Batool","John Folkesson"],"pdf_url":"https://arxiv.org/pdf/2405.20991v1.pdf","comment":"IEEE Intelligent Vehicles Symposium (IV) 2024"},{"id":"http://arxiv.org/abs/2405.20987v1","updated":"2024-05-31T16:33:20Z","published":"2024-05-31T16:33:20Z","title":"Early Stopping Criteria for Training Generative Adversarial Networks in\n  Biomedical Imaging","summary":"  Generative Adversarial Networks (GANs) have high computational costs to train\ntheir complex architectures. Throughout the training process, GANs' output is\nanalyzed qualitatively based on the loss and synthetic images' diversity and\nquality. Based on this qualitative analysis, training is manually halted once\nthe desired synthetic images are generated. By utilizing an early stopping\ncriterion, the computational cost and dependence on manual oversight can be\nreduced yet impacted by training problems such as mode collapse,\nnon-convergence, and instability. This is particularly prevalent in biomedical\nimagery, where training problems degrade the diversity and quality of synthetic\nimages, and the high computational cost associated with training makes complex\narchitectures increasingly inaccessible. This work proposes a novel early\nstopping criteria to quantitatively detect training problems, halt training,\nand reduce the computational costs associated with synthesizing biomedical\nimages. Firstly, the range of generator and discriminator loss values is\ninvestigated to assess whether mode collapse, non-convergence, and instability\noccur sequentially, concurrently, or interchangeably throughout the training of\nGANs. Secondly, utilizing these occurrences in conjunction with the Mean\nStructural Similarity Index (MS-SSIM) and Fr\\'echet Inception Distance (FID)\nscores of synthetic images forms the basis of the proposed early stopping\ncriteria. This work helps identify the occurrence of training problems in GANs\nusing low-resource computational cost and reduces training time to generate\ndiversified and high-quality synthetic images.\n","authors":["Muhammad Muneeb Saad","Mubashir Husain Rehmani","Ruairi O'Reilly"],"pdf_url":"https://arxiv.org/pdf/2405.20987v1.pdf","comment":"This paper is accepted at the 35th IEEE Irish Signals and Systems\n  Conference (ISSC 2024)"},{"id":"http://arxiv.org/abs/2405.20986v1","updated":"2024-05-31T16:32:46Z","published":"2024-05-31T16:32:46Z","title":"Uncertainty Quantification for Bird's Eye View Semantic Segmentation:\n  Methods and Benchmarks","summary":"  The fusion of raw features from multiple sensors on an autonomous vehicle to\ncreate a Bird's Eye View (BEV) representation is crucial for planning and\ncontrol systems. There is growing interest in using deep learning models for\nBEV semantic segmentation. Anticipating segmentation errors and improving the\nexplainability of DNNs is essential for autonomous driving, yet it is\nunder-studied. This paper introduces a benchmark for predictive uncertainty\nquantification in BEV segmentation. The benchmark assesses various approaches\nacross three popular datasets using two representative backbones and focuses on\nthe effectiveness of predicted uncertainty in identifying misclassified and\nout-of-distribution (OOD) pixels, as well as calibration. Empirical findings\nhighlight the challenges in uncertainty quantification. Our results find that\nevidential deep learning based approaches show the most promise by efficiently\nquantifying aleatoric and epistemic uncertainty. We propose the\nUncertainty-Focal-Cross-Entropy (UFCE) loss, designed for highly imbalanced\ndata, which consistently improves the segmentation quality and calibration.\nAdditionally, we introduce a vacuity-scaled regularization term that enhances\nthe model's focus on high uncertainty pixels, improving epistemic uncertainty\nquantification.\n","authors":["Linlin Yu","Bowen Yang","Tianhao Wang","Kangshuo Li","Feng Chen"],"pdf_url":"https://arxiv.org/pdf/2405.20986v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.20985v1","updated":"2024-05-31T16:31:38Z","published":"2024-05-31T16:31:38Z","title":"DeCo: Decoupling Token Compression from Semantic Abstraction in\n  Multimodal Large Language Models","summary":"  The visual projector, which bridges the vision and language modalities and\nfacilitates cross-modal alignment, serves as a crucial component in MLLMs.\nHowever, measuring the effectiveness of projectors in vision-language alignment\nremains under-explored, which currently can only be inferred from the\nperformance of MLLMs on downstream tasks. Motivated by the problem, this study\nexamines the projector module by interpreting the vision-language semantic flow\nwithin MLLMs. Specifically, we trace back the semantic relevance flow from\ngenerated language tokens to raw visual encoder patches and the intermediate\noutputs produced by projectors. Our findings reveal that compressive projectors\n(e.g., QFormer), abstract visual patches into a limited set of semantic\nconcepts, such as objects or attributes, resulting in a 'double abstraction'\nphenomenon. This involves a first visual semantic abstraction by the projector\nreferring to pre-defined query tokens, and a second extraction by the LLM based\non text instructions. The double abstraction is inefficient in training and\nwill result in cumulative vision semantics deficiency. To mitigate this issue,\nwe propose the key insight of 'Decouple Compression from Abstraction (DeCo),\nthat is compressing the visual token number at the patch level by projectors\nand allowing the LLM to handle visual semantic abstraction entirely.\nConsequently, we adopt a simple compressor, i.e., 2D Adaptive Pooling, to\ndownsample visual patches in a parameter-free manner. Empirical evaluation\ndemonstrates that DeCo surpasses traditional compressive projectors regarding\nboth performance and efficiency. It achieves performance gains of 0.9%, 7.1%,\nand 2.9% across the MLLM Benchmarks, Visual Localization, and Open-ended VQA\ntasks with fewer trainable parameters and faster convergence speed.\n","authors":["Linli Yao","Lei Li","Shuhuai Ren","Lean Wang","Yuanxin Liu","Xu Sun","Lu Hou"],"pdf_url":"https://arxiv.org/pdf/2405.20985v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.20981v1","updated":"2024-05-31T16:26:30Z","published":"2024-05-31T16:26:30Z","title":"Generative Adversarial Networks in Ultrasound Imaging: Extending Field\n  of View Beyond Conventional Limits","summary":"  Transthoracic Echocardiography (TTE) is a fundamental, non-invasive\ndiagnostic tool in cardiovascular medicine, enabling detailed visualization of\ncardiac structures crucial for diagnosing various heart conditions. Despite its\nwidespread use, TTE ultrasound imaging faces inherent limitations, notably the\ntrade-off between field of view (FoV) and resolution. This paper introduces a\nnovel application of conditional Generative Adversarial Networks (cGANs),\nspecifically designed to extend the FoV in TTE ultrasound imaging while\nmaintaining high resolution. Our proposed cGAN architecture, termed echoGAN,\ndemonstrates the capability to generate realistic anatomical structures through\noutpainting, effectively broadening the viewable area in medical imaging. This\nadvancement has the potential to enhance both automatic and manual ultrasound\nnavigation, offering a more comprehensive view that could significantly reduce\nthe learning curve associated with ultrasound imaging and aid in more accurate\ndiagnoses. The results confirm that echoGAN reliably reproduce detailed cardiac\nfeatures, thereby promising a significant step forward in the field of\nnon-invasive cardiac naviagation and diagnostics.\n","authors":["Matej Gazda","Samuel Kadoury","Jakub Gazda","Peter Drotar"],"pdf_url":"https://arxiv.org/pdf/2405.20981v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.20980v1","updated":"2024-05-31T16:26:08Z","published":"2024-05-31T16:26:08Z","title":"Neural Gaussian Scale-Space Fields","summary":"  Gaussian scale spaces are a cornerstone of signal representation and\nprocessing, with applications in filtering, multiscale analysis, anti-aliasing,\nand many more. However, obtaining such a scale space is costly and cumbersome,\nin particular for continuous representations such as neural fields. We present\nan efficient and lightweight method to learn the fully continuous, anisotropic\nGaussian scale space of an arbitrary signal. Based on Fourier feature\nmodulation and Lipschitz bounding, our approach is trained self-supervised,\ni.e., training does not require any manual filtering. Our neural Gaussian\nscale-space fields faithfully capture multiscale representations across a broad\nrange of modalities, and support a diverse set of applications. These include\nimages, geometry, light-stage data, texture anti-aliasing, and multiscale\noptimization.\n","authors":["Felix Mujkanovic","Ntumba Elie Nsampi","Christian Theobalt","Hans-Peter Seidel","Thomas Leimkühler"],"pdf_url":"https://arxiv.org/pdf/2405.20980v1.pdf","comment":"15 pages; SIGGRAPH 2024; project page at\n  https://neural-gaussian-scale-space-fields.mpi-inf.mpg.de"},{"id":"http://arxiv.org/abs/2405.20971v1","updated":"2024-05-31T16:18:46Z","published":"2024-05-31T16:18:46Z","title":"Amortizing intractable inference in diffusion models for vision,\n  language, and control","summary":"  Diffusion models have emerged as effective distribution estimators in vision,\nlanguage, and reinforcement learning, but their use as priors in downstream\ntasks poses an intractable posterior inference problem. This paper studies\namortized sampling of the posterior over data, $\\mathbf{x}\\sim p^{\\rm\npost}(\\mathbf{x})\\propto p(\\mathbf{x})r(\\mathbf{x})$, in a model that consists\nof a diffusion generative model prior $p(\\mathbf{x})$ and a black-box\nconstraint or likelihood function $r(\\mathbf{x})$. We state and prove the\nasymptotic correctness of a data-free learning objective, relative trajectory\nbalance, for training a diffusion model that samples from this posterior, a\nproblem that existing methods solve only approximately or in restricted cases.\nRelative trajectory balance arises from the generative flow network perspective\non diffusion models, which allows the use of deep reinforcement learning\ntechniques to improve mode coverage. Experiments illustrate the broad potential\nof unbiased inference of arbitrary posteriors under diffusion priors: in vision\n(classifier guidance), language (infilling under a discrete diffusion LLM), and\nmultimodal data (text-to-image generation). Beyond generative modeling, we\napply relative trajectory balance to the problem of continuous control with a\nscore-based behavior prior, achieving state-of-the-art results on benchmarks in\noffline reinforcement learning.\n","authors":["Siddarth Venkatraman","Moksh Jain","Luca Scimeca","Minsu Kim","Marcin Sendera","Mohsin Hasan","Luke Rowe","Sarthak Mittal","Pablo Lemos","Emmanuel Bengio","Alexandre Adam","Jarrid Rector-Brooks","Yoshua Bengio","Glen Berseth","Nikolay Malkin"],"pdf_url":"https://arxiv.org/pdf/2405.20971v1.pdf","comment":"Code: https://github.com/GFNOrg/diffusion-finetuning"},{"id":"http://arxiv.org/abs/2311.10879v3","updated":"2024-05-31T16:15:01Z","published":"2023-11-17T21:48:41Z","title":"Pre- to Post-Contrast Breast MRI Synthesis for Enhanced Tumour\n  Segmentation","summary":"  Despite its benefits for tumour detection and treatment, the administration\nof contrast agents in dynamic contrast-enhanced MRI (DCE-MRI) is associated\nwith a range of issues, including their invasiveness, bioaccumulation, and a\nrisk of nephrogenic systemic fibrosis. This study explores the feasibility of\nproducing synthetic contrast enhancements by translating pre-contrast\nT1-weighted fat-saturated breast MRI to their corresponding first DCE-MRI\nsequence leveraging the capabilities of a generative adversarial network (GAN).\nAdditionally, we introduce a Scaled Aggregate Measure (SAMe) designed for\nquantitatively evaluating the quality of synthetic data in a principled manner\nand serving as a basis for selecting the optimal generative model. We assess\nthe generated DCE-MRI data using quantitative image quality metrics and apply\nthem to the downstream task of 3D breast tumour segmentation. Our results\nhighlight the potential of post-contrast DCE-MRI synthesis in enhancing the\nrobustness of breast tumour segmentation models via data augmentation. Our code\nis available at https://github.com/RichardObi/pre_post_synthesis.\n","authors":["Richard Osuala","Smriti Joshi","Apostolia Tsirikoglou","Lidia Garrucho","Walter H. L. Pinaya","Oliver Diaz","Karim Lekadir"],"pdf_url":"https://arxiv.org/pdf/2311.10879v3.pdf","comment":"Accepted as oral presentation at SPIE Medical Imaging 2024 (Image\n  Processing)"},{"id":"http://arxiv.org/abs/2207.11860v5","updated":"2024-05-31T16:04:07Z","published":"2022-07-25T00:42:38Z","title":"Behind Every Domain There is a Shift: Adapting Distortion-aware Vision\n  Transformers for Panoramic Semantic Segmentation","summary":"  In this paper, we address panoramic semantic segmentation which is\nunder-explored due to two critical challenges: (1) image distortions and object\ndeformations on panoramas; (2) lack of semantic annotations in the 360{\\deg}\nimagery. To tackle these problems, first, we propose the upgraded Transformer\nfor Panoramic Semantic Segmentation, i.e., Trans4PASS+, equipped with\nDeformable Patch Embedding (DPE) and Deformable MLP (DMLPv2) modules for\nhandling object deformations and image distortions whenever (before or after\nadaptation) and wherever (shallow or deep levels). Second, we enhance the\nMutual Prototypical Adaptation (MPA) strategy via pseudo-label rectification\nfor unsupervised domain adaptive panoramic segmentation. Third, aside from\nPinhole-to-Panoramic (Pin2Pan) adaptation, we create a new dataset (SynPASS)\nwith 9,080 panoramic images, facilitating Synthetic-to-Real (Syn2Real)\nadaptation scheme in 360{\\deg} imagery. Extensive experiments are conducted,\nwhich cover indoor and outdoor scenarios, and each of them is investigated with\nPin2Pan and Syn2Real regimens. Trans4PASS+ achieves state-of-the-art\nperformances on four domain adaptive panoramic semantic segmentation\nbenchmarks. Code is available at https://github.com/jamycheung/Trans4PASS.\n","authors":["Jiaming Zhang","Kailun Yang","Hao Shi","Simon Reiß","Kunyu Peng","Chaoxiang Ma","Haodong Fu","Philip H. S. Torr","Kaiwei Wang","Rainer Stiefelhagen"],"pdf_url":"https://arxiv.org/pdf/2207.11860v5.pdf","comment":"Accepted to IEEE Transactions on Pattern Analysis and Machine\n  Intelligence (TPAMI). Extended version of CVPR 2022 paper arXiv:2203.01452.\n  Code is available at https://github.com/jamycheung/Trans4PASS"},{"id":"http://arxiv.org/abs/2307.16565v2","updated":"2024-05-31T16:00:18Z","published":"2023-07-31T10:55:15Z","title":"Towards Imbalanced Motion: Part-Decoupling Network for Video Portrait\n  Segmentation","summary":"  Video portrait segmentation (VPS), aiming at segmenting prominent foreground\nportraits from video frames, has received much attention in recent years.\nHowever, simplicity of existing VPS datasets leads to a limitation on extensive\nresearch of the task. In this work, we propose a new intricate large-scale\nMulti-scene Video Portrait Segmentation dataset MVPS consisting of 101 video\nclips in 7 scenario categories, in which 10,843 sampled frames are finely\nannotated at pixel level. The dataset has diverse scenes and complicated\nbackground environments, which is the most complex dataset in VPS to our best\nknowledge. Through the observation of a large number of videos with portraits\nduring dataset construction, we find that due to the joint structure of human\nbody, motion of portraits is part-associated, which leads that different parts\nare relatively independent in motion. That is, motion of different parts of the\nportraits is imbalanced. Towards this imbalance, an intuitive and reasonable\nidea is that different motion states in portraits can be better exploited by\ndecoupling the portraits into parts. To achieve this, we propose a\nPart-Decoupling Network (PDNet) for video portrait segmentation. Specifically,\nan Inter-frame Part-Discriminated Attention (IPDA) module is proposed which\nunsupervisedly segments portrait into parts and utilizes different\nattentiveness on discriminative features specified to each different part. In\nthis way, appropriate attention can be imposed to portrait parts with\nimbalanced motion to extract part-discriminated correlations, so that the\nportraits can be segmented more accurately. Experimental results demonstrate\nthat our method achieves leading performance with the comparison to\nstate-of-the-art methods.\n","authors":["Tianshu Yu","Changqun Xia","Jia Li"],"pdf_url":"https://arxiv.org/pdf/2307.16565v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.11265v2","updated":"2024-05-31T15:59:32Z","published":"2024-04-17T11:15:58Z","title":"The Victim and The Beneficiary: Exploiting a Poisoned Model to Train a\n  Clean Model on Poisoned Data","summary":"  Recently, backdoor attacks have posed a serious security threat to the\ntraining process of deep neural networks (DNNs). The attacked model behaves\nnormally on benign samples but outputs a specific result when the trigger is\npresent. However, compared with the rocketing progress of backdoor attacks,\nexisting defenses are difficult to deal with these threats effectively or\nrequire benign samples to work, which may be unavailable in real scenarios. In\nthis paper, we find that the poisoned samples and benign samples can be\ndistinguished with prediction entropy. This inspires us to propose a novel\ndual-network training framework: The Victim and The Beneficiary (V&B), which\nexploits a poisoned model to train a clean model without extra benign samples.\nFirstly, we sacrifice the Victim network to be a powerful poisoned sample\ndetector by training on suspicious samples. Secondly, we train the Beneficiary\nnetwork on the credible samples selected by the Victim to inhibit backdoor\ninjection. Thirdly, a semi-supervised suppression strategy is adopted for\nerasing potential backdoors and improving model performance. Furthermore, to\nbetter inhibit missed poisoned samples, we propose a strong data augmentation\nmethod, AttentionMix, which works well with our proposed V&B framework.\nExtensive experiments on two widely used datasets against 6 state-of-the-art\nattacks demonstrate that our framework is effective in preventing backdoor\ninjection and robust to various attacks while maintaining the performance on\nbenign samples. Our code is available at https://github.com/Zixuan-Zhu/VaB.\n","authors":["Zixuan Zhu","Rui Wang","Cong Zou","Lihua Jing"],"pdf_url":"https://arxiv.org/pdf/2404.11265v2.pdf","comment":"13 pages, 6 figures, published to ICCV"},{"id":"http://arxiv.org/abs/2405.19751v2","updated":"2024-05-31T15:48:05Z","published":"2024-05-30T06:56:11Z","title":"HQ-DiT: Efficient Diffusion Transformer with FP4 Hybrid Quantization","summary":"  Diffusion Transformers (DiTs) have recently gained substantial attention in\nboth industrial and academic fields for their superior visual generation\ncapabilities, outperforming traditional diffusion models that use U-Net.\nHowever,the enhanced performance of DiTs also comes with high parameter counts\nand implementation costs, seriously restricting their use on resource-limited\ndevices such as mobile phones. To address these challenges, we introduce the\nHybrid Floating-point Quantization for DiT(HQ-DiT), an efficient post-training\nquantization method that utilizes 4-bit floating-point (FP) precision on both\nweights and activations for DiT inference. Compared to fixed-point quantization\n(e.g., INT8), FP quantization, complemented by our proposed clipping range\nselection mechanism, naturally aligns with the data distribution within DiT,\nresulting in a minimal quantization error. Furthermore, HQ-DiT also implements\na universal identity mathematical transform to mitigate the serious\nquantization error caused by the outliers. The experimental results demonstrate\nthat DiT can achieve extremely low-precision quantization (i.e., 4 bits) with\nnegligible impact on performance. Our approach marks the first instance where\nboth weights and activations in DiTs are quantized to just 4 bits, with only a\n0.12 increase in sFID on ImageNet.\n","authors":["Wenxuan Liu","Sai Qian Zhang"],"pdf_url":"https://arxiv.org/pdf/2405.19751v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.20310v2","updated":"2024-05-31T15:27:52Z","published":"2024-05-30T17:52:52Z","title":"A Pixel Is Worth More Than One 3D Gaussians in Single-View 3D\n  Reconstruction","summary":"  Learning 3D scene representation from a single-view image is a long-standing\nfundamental problem in computer vision, with the inherent ambiguity in\npredicting contents unseen from the input view. Built on the recently proposed\n3D Gaussian Splatting (3DGS), the Splatter Image method has made promising\nprogress on fast single-image novel view synthesis via learning a single 3D\nGaussian for each pixel based on the U-Net feature map of an input image.\nHowever, it has limited expressive power to represent occluded components that\nare not observable in the input view. To address this problem, this paper\npresents a Hierarchical Splatter Image method in which a pixel is worth more\nthan one 3D Gaussians. Specifically, each pixel is represented by a parent 3D\nGaussian and a small number of child 3D Gaussians. Parent 3D Gaussians are\nlearned as done in the vanilla Splatter Image. Child 3D Gaussians are learned\nvia a lightweight Multi-Layer Perceptron (MLP) which takes as input the\nprojected image features of a parent 3D Gaussian and the embedding of a target\ncamera view. Both parent and child 3D Gaussians are learned end-to-end in a\nstage-wise way. The joint condition of input image features from eyes of the\nparent Gaussians and the target camera position facilitates learning to\nallocate child Gaussians to ``see the unseen'', recovering the occluded details\nthat are often missed by parent Gaussians.\n  In experiments, the proposed method is tested on the ShapeNet-SRN and CO3D\ndatasets with state-of-the-art performance obtained, especially showing\npromising capabilities of reconstructing occluded contents in the input view.\n","authors":["Jianghao Shen","Xue Nan","Tianfu Wu"],"pdf_url":"https://arxiv.org/pdf/2405.20310v2.pdf","comment":"preprint, under review"},{"id":"http://arxiv.org/abs/2402.05861v2","updated":"2024-05-31T15:22:58Z","published":"2024-02-08T17:50:22Z","title":"Memory Consolidation Enables Long-Context Video Understanding","summary":"  Most transformer-based video encoders are limited to short temporal contexts\ndue to their quadratic complexity. While various attempts have been made to\nextend this context, this has often come at the cost of both conceptual and\ncomputational complexity. We propose to instead re-purpose existing pre-trained\nvideo transformers by simply fine-tuning them to attend to memories derived\nnon-parametrically from past activations. By leveraging redundancy reduction,\nour memory-consolidated vision transformer (MC-ViT) effortlessly extends its\ncontext far into the past and exhibits excellent scaling behavior when learning\nfrom longer videos. In doing so, MC-ViT sets a new state-of-the-art in\nlong-context video understanding on EgoSchema, Perception Test, and Diving48,\noutperforming methods that benefit from orders of magnitude more parameters.\n","authors":["Ivana Balažević","Yuge Shi","Pinelopi Papalampidi","Rahma Chaabouni","Skanda Koppula","Olivier J. Hénaff"],"pdf_url":"https://arxiv.org/pdf/2402.05861v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.20915v1","updated":"2024-05-31T15:21:44Z","published":"2024-05-31T15:21:44Z","title":"Fast yet Safe: Early-Exiting with Risk Control","summary":"  Scaling machine learning models significantly improves their performance.\nHowever, such gains come at the cost of inference being slow and\nresource-intensive. Early-exit neural networks (EENNs) offer a promising\nsolution: they accelerate inference by allowing intermediate layers to exit and\nproduce a prediction early. Yet a fundamental issue with EENNs is how to\ndetermine when to exit without severely degrading performance. In other words,\nwhen is it 'safe' for an EENN to go 'fast'? To address this issue, we\ninvestigate how to adapt frameworks of risk control to EENNs. Risk control\noffers a distribution-free, post-hoc solution that tunes the EENN's exiting\nmechanism so that exits only occur when the output is of sufficient quality. We\nempirically validate our insights on a range of vision and language tasks,\ndemonstrating that risk control can produce substantial computational savings,\nall the while preserving user-specified performance goals.\n","authors":["Metod Jazbec","Alexander Timans","Tin Hadži Veljković","Kaspar Sakmann","Dan Zhang","Christian A. Naesseth","Eric Nalisnick"],"pdf_url":"https://arxiv.org/pdf/2405.20915v1.pdf","comment":"25 pages, 11 figures, 4 tables (incl. appendix)"},{"id":"http://arxiv.org/abs/2405.20910v1","updated":"2024-05-31T15:21:06Z","published":"2024-05-31T15:21:06Z","title":"Predicting ptychography probe positions using single-shot phase\n  retrieval neural network","summary":"  Ptychography is a powerful imaging technique that is used in a variety of\nfields, including materials science, biology, and nanotechnology. However, the\naccuracy of the reconstructed ptychography image is highly dependent on the\naccuracy of the recorded probe positions which often contain errors. These\nerrors are typically corrected jointly with phase retrieval through numerical\noptimization approaches. When the error accumulates along the scan path or when\nthe error magnitude is large, these approaches may not converge with\nsatisfactory result. We propose a fundamentally new approach for ptychography\nprobe position prediction for data with large position errors, where a neural\nnetwork is used to make single-shot phase retrieval on individual diffraction\npatterns, yielding the object image at each scan point. The pairwise offsets\namong these images are then found using a robust image registration method, and\nthe results are combined to yield the complete scan path by constructing and\nsolving a linear equation. We show that our method can achieve good position\nprediction accuracy for data with large and accumulating errors on the order of\n$10^2$ pixels, a magnitude that often makes optimization-based algorithms fail\nto converge. For ptychography instruments without sophisticated position\ncontrol equipment such as interferometers, our method is of significant\npractical potential.\n","authors":["Ming Du","Tao Zhou","Junjing Deng","Daniel J. Ching","Steven Henke","Mathew J. Cherukara"],"pdf_url":"https://arxiv.org/pdf/2405.20910v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.20906v1","updated":"2024-05-31T15:17:47Z","published":"2024-05-31T15:17:47Z","title":"Enhancing Vision Models for Text-Heavy Content Understanding and\n  Interaction","summary":"  Interacting and understanding with text heavy visual content with multiple\nimages is a major challenge for traditional vision models. This paper is on\nenhancing vision models' capability to comprehend or understand and learn from\nimages containing a huge amount of textual information from the likes of\ntextbooks and research papers which contain multiple images like graphs, etc\nand tables in them with different types of axes and scales. The approach\ninvolves dataset preprocessing, fine tuning which is by using instructional\noriented data and evaluation. We also built a visual chat application\nintegrating CLIP for image encoding and a model from the Massive Text Embedding\nBenchmark which is developed to consider both textual and visual inputs. An\naccuracy of 96.71% was obtained. The aim of the project is to increase and also\nenhance the advance vision models' capabilities in understanding complex visual\ntextual data interconnected data, contributing to multimodal AI.\n","authors":["Adithya TG","Adithya SK","Abhinav R Bharadwaj","Abhiram HA","Dr. Surabhi Narayan"],"pdf_url":"https://arxiv.org/pdf/2405.20906v1.pdf","comment":"5 pages, 4 figures (including 1 graph)"},{"id":"http://arxiv.org/abs/2405.07801v3","updated":"2024-05-31T15:11:51Z","published":"2024-05-13T14:44:22Z","title":"Deep Learning-Based Object Pose Estimation: A Comprehensive Survey","summary":"  Object pose estimation is a fundamental computer vision problem with broad\napplications in augmented reality and robotics. Over the past decade, deep\nlearning models, due to their superior accuracy and robustness, have\nincreasingly supplanted conventional algorithms reliant on engineered point\npair features. Nevertheless, several challenges persist in contemporary\nmethods, including their dependency on labeled training data, model\ncompactness, robustness under challenging conditions, and their ability to\ngeneralize to novel unseen objects. A recent survey discussing the progress\nmade on different aspects of this area, outstanding challenges, and promising\nfuture directions, is missing. To fill this gap, we discuss the recent advances\nin deep learning-based object pose estimation, covering all three formulations\nof the problem, \\emph{i.e.}, instance-level, category-level, and unseen object\npose estimation. Our survey also covers multiple input data modalities,\ndegrees-of-freedom of output poses, object properties, and downstream tasks,\nproviding the readers with a holistic understanding of this field.\nAdditionally, it discusses training paradigms of different domains, inference\nmodes, application areas, evaluation metrics, and benchmark datasets, as well\nas reports the performance of current state-of-the-art methods on these\nbenchmarks, thereby facilitating the readers in selecting the most suitable\nmethod for their application. Finally, the survey identifies key challenges,\nreviews the prevailing trends along with their pros and cons, and identifies\npromising directions for future research. We also keep tracing the latest works\nat https://github.com/CNJianLiu/Awesome-Object-Pose-Estimation.\n","authors":["Jian Liu","Wei Sun","Hui Yang","Zhiwen Zeng","Chongpei Liu","Jin Zheng","Xingyu Liu","Hossein Rahmani","Nicu Sebe","Ajmal Mian"],"pdf_url":"https://arxiv.org/pdf/2405.07801v3.pdf","comment":"27 pages, 7 figures"},{"id":"http://arxiv.org/abs/2212.00394v3","updated":"2024-05-31T15:08:21Z","published":"2022-12-01T09:42:55Z","title":"From CNNs to Shift-Invariant Twin Models Based on Complex Wavelets","summary":"  We propose a novel method to increase shift invariance and prediction\naccuracy in convolutional neural networks. Specifically, we replace the\nfirst-layer combination \"real-valued convolutions + max pooling\" (RMax) by\n\"complex-valued convolutions + modulus\" (CMod), which is stable to\ntranslations, or shifts. To justify our approach, we claim that CMod and RMax\nproduce comparable outputs when the convolution kernel is band-pass and\noriented (Gabor-like filter). In this context, CMod can therefore be considered\nas a stable alternative to RMax. To enforce this property, we constrain the\nconvolution kernels to adopt such a Gabor-like structure. The corresponding\narchitecture is called mathematical twin, because it employs a well-defined\nmathematical operator to mimic the behavior of the original, freely-trained\nmodel. Our approach achieves superior accuracy on ImageNet and CIFAR-10\nclassification tasks, compared to prior methods based on low-pass filtering.\nArguably, our approach's emphasis on retaining high-frequency details\ncontributes to a better balance between shift invariance and information\npreservation, resulting in improved performance. Furthermore, it has a lower\ncomputational cost and memory footprint than concurrent work, making it a\npromising solution for practical implementation.\n","authors":["Hubert Leterme","Kévin Polisano","Valérie Perrier","Karteek Alahari"],"pdf_url":"https://arxiv.org/pdf/2212.00394v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.00110v3","updated":"2024-05-31T15:07:31Z","published":"2023-11-30T18:19:47Z","title":"CLIP-QDA: An Explainable Concept Bottleneck Model","summary":"  In this paper, we introduce an explainable algorithm designed from a\nmulti-modal foundation model, that performs fast and explainable image\nclassification. Drawing inspiration from CLIP-based Concept Bottleneck Models\n(CBMs), our method creates a latent space where each neuron is linked to a\nspecific word. Observing that this latent space can be modeled with simple\ndistributions, we use a Mixture of Gaussians (MoG) formalism to enhance the\ninterpretability of this latent space. Then, we introduce CLIP-QDA, a\nclassifier that only uses statistical values to infer labels from the concepts.\nIn addition, this formalism allows for both local and global explanations.\nThese explanations come from the inner design of our architecture, our work is\npart of a new family of greybox models, combining performances of opaque\nfoundation models and the interpretability of transparent models. Our empirical\nfindings show that in instances where the MoG assumption holds, CLIP-QDA\nachieves similar accuracy with state-of-the-art methods CBMs. Our explanations\ncompete with existing XAI methods while being faster to compute.\n","authors":["Rémi Kazmierczak","Eloïse Berthier","Goran Frehse","Gianni Franchi"],"pdf_url":"https://arxiv.org/pdf/2312.00110v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.20892v1","updated":"2024-05-31T15:03:35Z","published":"2024-05-31T15:03:35Z","title":"MALT: Multi-scale Action Learning Transformer for Online Action\n  Detection","summary":"  Online action detection (OAD) aims to identify ongoing actions from streaming\nvideo in real-time, without access to future frames. Since these actions\nmanifest at varying scales of granularity, ranging from coarse to fine,\nprojecting an entire set of action frames to a single latent encoding may\nresult in a lack of local information, necessitating the acquisition of action\nfeatures across multiple scales. In this paper, we propose a multi-scale action\nlearning transformer (MALT), which includes a novel recurrent decoder (used for\nfeature fusion) that includes fewer parameters and can be trained more\nefficiently. A hierarchical encoder with multiple encoding branches is further\nproposed to capture multi-scale action features. The output from the preceding\nbranch is then incrementally input to the subsequent branch as part of a\ncross-attention calculation. In this way, output features transition from\ncoarse to fine as the branches deepen. We also introduce an explicit frame\nscoring mechanism employing sparse attention, which filters irrelevant frames\nmore efficiently, without requiring an additional network. The proposed method\nachieved state-of-the-art performance on two benchmark datasets (THUMOS'14 and\nTVSeries), outperforming all existing models used for comparison, with an mAP\nof 0.2% for THUMOS'14 and an mcAP of 0.1% for TVseries.\n","authors":["Zhipeng Yang","Ruoyu Wang","Yang Tan","Liping Xie"],"pdf_url":"https://arxiv.org/pdf/2405.20892v1.pdf","comment":"8 pages, 3 figures"},{"id":"http://arxiv.org/abs/2405.20881v1","updated":"2024-05-31T14:55:31Z","published":"2024-05-31T14:55:31Z","title":"S4Fusion: Saliency-aware Selective State Space Model for Infrared\n  Visible Image Fusion","summary":"  As one of the tasks in Image Fusion, Infrared and Visible Image Fusion aims\nto integrate complementary information captured by sensors of different\nmodalities into a single image. The Selective State Space Model (SSSM), known\nfor its ability to capture long-range dependencies, has demonstrated its\npotential in the field of computer vision. However, in image fusion, current\nmethods underestimate the potential of SSSM in capturing the global spatial\ninformation of both modalities. This limitation prevents the simultaneous\nconsideration of the global spatial information from both modalities during\ninteraction, leading to a lack of comprehensive perception of salient targets.\nConsequently, the fusion results tend to bias towards one modality instead of\nadaptively preserving salient targets. To address this issue, we propose the\nSaliency-aware Selective State Space Fusion Model (S4Fusion). In our S4Fusion,\nthe designed Cross-Modal Spatial Awareness Module (CMSA) can simultaneously\nfocus on global spatial information from both modalities while facilitating\ntheir interaction, thereby comprehensively capturing complementary information.\nAdditionally, S4Fusion leverages a pre-trained network to perceive uncertainty\nin the fused images. By minimizing this uncertainty, S4Fusion adaptively\nhighlights salient targets from both images. Extensive experiments demonstrate\nthat our approach produces high-quality images and enhances performance in\ndownstream tasks.\n","authors":["Haolong Ma","Hui Li","Chunyang Cheng","Gaoang Wang","Xiaoning Song","Xiaojun Wu"],"pdf_url":"https://arxiv.org/pdf/2405.20881v1.pdf","comment":"NurIPS, Under review"},{"id":"http://arxiv.org/abs/2405.20876v1","updated":"2024-05-31T14:52:49Z","published":"2024-05-31T14:52:49Z","title":"Investigating Calibration and Corruption Robustness of Post-hoc Pruned\n  Perception CNNs: An Image Classification Benchmark Study","summary":"  Convolutional Neural Networks (CNNs) have achieved state-of-the-art\nperformance in many computer vision tasks. However, high computational and\nstorage demands hinder their deployment into resource-constrained environments,\nsuch as embedded devices. Model pruning helps to meet these restrictions by\nreducing the model size, while maintaining superior performance. Meanwhile,\nsafety-critical applications pose more than just resource and performance\nconstraints. In particular, predictions must not be overly confident, i.e.,\nprovide properly calibrated uncertainty estimations (proper uncertainty\ncalibration), and CNNs must be robust against corruptions like naturally\noccurring input perturbations (natural corruption robustness). This work\ninvestigates the important trade-off between uncertainty calibration, natural\ncorruption robustness, and performance for current state-of-research post-hoc\nCNN pruning techniques in the context of image classification tasks. Our study\nreveals that post-hoc pruning substantially improves the model's uncertainty\ncalibration, performance, and natural corruption robustness, sparking hope for\nsafe and robust embedded CNNs.Furthermore, uncertainty calibration and natural\ncorruption robustness are not mutually exclusive targets under pruning, as\nevidenced by the improved safety aspects obtained by post-hoc unstructured\npruning with increasing compression.\n","authors":["Pallavi Mitra","Gesina Schwalbe","Nadja Klein"],"pdf_url":"https://arxiv.org/pdf/2405.20876v1.pdf","comment":"11 pages, 3 figures"},{"id":"http://arxiv.org/abs/2310.18953v2","updated":"2024-05-31T14:51:58Z","published":"2023-10-29T09:54:03Z","title":"TIC-TAC: A Framework for Improved Covariance Estimation in Deep\n  Heteroscedastic Regression","summary":"  Deep heteroscedastic regression involves jointly optimizing the mean and\ncovariance of the predicted distribution using the negative log-likelihood.\nHowever, recent works show that this may result in sub-optimal convergence due\nto the challenges associated with covariance estimation. While the literature\naddresses this by proposing alternate formulations to mitigate the impact of\nthe predicted covariance, we focus on improving the predicted covariance\nitself. We study two questions: (1) Does the predicted covariance truly capture\nthe randomness of the predicted mean? (2) In the absence of supervision, how\ncan we quantify the accuracy of covariance estimation? We address (1) with a\nTaylor Induced Covariance (TIC), which captures the randomness of the predicted\nmean by incorporating its gradient and curvature through the second order\nTaylor polynomial. Furthermore, we tackle (2) by introducing a Task Agnostic\nCorrelations (TAC) metric, which combines the notion of correlations and\nabsolute error to evaluate the covariance. We evaluate TIC-TAC across multiple\nexperiments spanning synthetic and real-world datasets. Our results show that\nnot only does TIC accurately learn the covariance, it additionally facilitates\nan improved convergence of the negative log-likelihood. Our code is available\nat https://github.com/vita-epfl/TIC-TAC\n","authors":["Megh Shukla","Mathieu Salzmann","Alexandre Alahi"],"pdf_url":"https://arxiv.org/pdf/2310.18953v2.pdf","comment":"ICML 2024. Please feel free to provide feedback!"},{"id":"http://arxiv.org/abs/2405.20868v1","updated":"2024-05-31T14:47:27Z","published":"2024-05-31T14:47:27Z","title":"Responsible AI for Earth Observation","summary":"  The convergence of artificial intelligence (AI) and Earth observation (EO)\ntechnologies has brought geoscience and remote sensing into an era of\nunparalleled capabilities. AI's transformative impact on data analysis,\nparticularly derived from EO platforms, holds great promise in addressing\nglobal challenges such as environmental monitoring, disaster response and\nclimate change analysis. However, the rapid integration of AI necessitates a\ncareful examination of the responsible dimensions inherent in its application\nwithin these domains. In this paper, we represent a pioneering effort to\nsystematically define the intersection of AI and EO, with a central focus on\nresponsible AI practices. Specifically, we identify several critical components\nguiding this exploration from both academia and industry perspectives within\nthe EO field: AI and EO for social good, mitigating unfair biases, AI security\nin EO, geo-privacy and privacy-preserving measures, as well as maintaining\nscientific excellence, open data, and guiding AI usage based on ethical\nprinciples. Furthermore, the paper explores potential opportunities and\nemerging trends, providing valuable insights for future research endeavors.\n","authors":["Pedram Ghamisi","Weikang Yu","Andrea Marinoni","Caroline M. Gevaert","Claudio Persello","Sivasakthy Selvakumaran","Manuela Girotto","Benjamin P. Horton","Philippe Rufin","Patrick Hostert","Fabio Pacifici","Peter M. Atkinson"],"pdf_url":"https://arxiv.org/pdf/2405.20868v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.20867v1","updated":"2024-05-31T14:47:20Z","published":"2024-05-31T14:47:20Z","title":"Automatic Channel Pruning for Multi-Head Attention","summary":"  Despite the strong performance of Transformers, their quadratic computation\ncomplexity presents challenges in applying them to vision tasks. Automatic\npruning is one of effective methods for reducing computation complexity without\nheuristic approaches. However, directly applying it to multi-head attention is\nnot straightforward due to channel misalignment. In this paper, we propose an\nautomatic channel pruning method to take into account the multi-head attention\nmechanism. First, we incorporate channel similarity-based weights into the\npruning indicator to preserve more informative channels in each head. Then, we\nadjust pruning indicator to enforce removal of channels in equal proportions\nacross all heads, preventing the channel misalignment. We also add a reweight\nmodule to compensate for information loss resulting from channel removal, and\nan effective initialization step for pruning indicator based on difference of\nattention between original structure and each channel. Our proposed method can\nbe used to not only original attention, but also linear attention, which is\nmore efficient as linear complexity with respect to the number of tokens. On\nImageNet-1K, applying our pruning method to the FLattenTransformer, which\nincludes both attention mechanisms, shows outperformed accuracy for several\nMACs compared with previous state-of-the-art efficient models and pruned\nmethods. Code will be available soon.\n","authors":["Eunho Lee","Youngbae Hwang"],"pdf_url":"https://arxiv.org/pdf/2405.20867v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.17389v3","updated":"2024-05-31T14:38:08Z","published":"2023-11-29T06:42:12Z","title":"360Loc: A Dataset and Benchmark for Omnidirectional Visual Localization\n  with Cross-device Queries","summary":"  Portable 360$^\\circ$ cameras are becoming a cheap and efficient tool to\nestablish large visual databases. By capturing omnidirectional views of a\nscene, these cameras could expedite building environment models that are\nessential for visual localization. However, such an advantage is often\noverlooked due to the lack of valuable datasets. This paper introduces a new\nbenchmark dataset, 360Loc, composed of 360$^\\circ$ images with ground truth\nposes for visual localization. We present a practical implementation of\n360$^\\circ$ mapping combining 360$^\\circ$ images with lidar data to generate\nthe ground truth 6DoF poses. 360Loc is the first dataset and benchmark that\nexplores the challenge of cross-device visual positioning, involving\n360$^\\circ$ reference frames, and query frames from pinhole, ultra-wide FoV\nfisheye, and 360$^\\circ$ cameras. We propose a virtual camera approach to\ngenerate lower-FoV query frames from 360$^\\circ$ images, which ensures a fair\ncomparison of performance among different query types in visual localization\ntasks. We also extend this virtual camera approach to feature matching-based\nand pose regression-based methods to alleviate the performance loss caused by\nthe cross-device domain gap, and evaluate its effectiveness against\nstate-of-the-art baselines. We demonstrate that omnidirectional visual\nlocalization is more robust in challenging large-scale scenes with symmetries\nand repetitive structures. These results provide new insights into 360-camera\nmapping and omnidirectional visual localization with cross-device queries.\n","authors":["Huajian Huang","Changkun Liu","Yipeng Zhu","Hui Cheng","Tristan Braud","Sai-Kit Yeung"],"pdf_url":"https://arxiv.org/pdf/2311.17389v3.pdf","comment":"CVPR 2024. Project Page: https://huajianup.github.io/research/360Loc/"},{"id":"http://arxiv.org/abs/2405.20853v1","updated":"2024-05-31T14:35:35Z","published":"2024-05-31T14:35:35Z","title":"MeshXL: Neural Coordinate Field for Generative 3D Foundation Models","summary":"  The polygon mesh representation of 3D data exhibits great flexibility, fast\nrendering speed, and storage efficiency, which is widely preferred in various\napplications. However, given its unstructured graph representation, the direct\ngeneration of high-fidelity 3D meshes is challenging. Fortunately, with a\npre-defined ordering strategy, 3D meshes can be represented as sequences, and\nthe generation process can be seamlessly treated as an auto-regressive problem.\nIn this paper, we validate the Neural Coordinate Field (NeurCF), an explicit\ncoordinate representation with implicit neural embeddings, is a\nsimple-yet-effective representation for large-scale sequential mesh modeling.\nAfter that, we present MeshXL, a family of generative pre-trained\nauto-regressive models, which addresses the process of 3D mesh generation with\nmodern large language model approaches. Extensive experiments show that MeshXL\nis able to generate high-quality 3D meshes, and can also serve as foundation\nmodels for various down-stream applications.\n","authors":["Sijin Chen","Xin Chen","Anqi Pang","Xianfang Zeng","Wei Cheng","Yijun Fu","Fukun Yin","Yanru Wang","Zhibin Wang","Chi Zhang","Jingyi Yu","Gang Yu","Bin Fu","Tao Chen"],"pdf_url":"https://arxiv.org/pdf/2405.20853v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.20851v1","updated":"2024-05-31T14:33:13Z","published":"2024-05-31T14:33:13Z","title":"MegActor: Harness the Power of Raw Video for Vivid Portrait Animation","summary":"  Despite raw driving videos contain richer information on facial expressions\nthan intermediate representations such as landmarks in the field of portrait\nanimation, they are seldom the subject of research. This is due to two\nchallenges inherent in portrait animation driven with raw videos: 1)\nsignificant identity leakage; 2) Irrelevant background and facial details such\nas wrinkles degrade performance. To harnesses the power of the raw videos for\nvivid portrait animation, we proposed a pioneering conditional diffusion model\nnamed as MegActor. First, we introduced a synthetic data generation framework\nfor creating videos with consistent motion and expressions but inconsistent IDs\nto mitigate the issue of ID leakage. Second, we segmented the foreground and\nbackground of the reference image and employed CLIP to encode the background\ndetails. This encoded information is then integrated into the network via a\ntext embedding module, thereby ensuring the stability of the background.\nFinally, we further style transfer the appearance of the reference image to the\ndriving video to eliminate the influence of facial details in the driving\nvideos. Our final model was trained solely on public datasets, achieving\nresults comparable to commercial models. We hope this will help the open-source\ncommunity.The code is available at\nhttps://github.com/megvii-research/MegFaceAnimate.\n","authors":["Shurong Yang","Huadong Li","Juhao Wu","Minhao Jing","Linze Li","Renhe Ji","Jiajun Liang","Haoqiang Fan"],"pdf_url":"https://arxiv.org/pdf/2405.20851v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.00766v4","updated":"2024-05-31T14:29:13Z","published":"2024-01-01T14:14:35Z","title":"Exposure Bracketing is All You Need for Unifying Image Restoration and\n  Enhancement Tasks","summary":"  It is highly desired but challenging to acquire high-quality photos with\nclear content in low-light environments. Although multi-image processing\nmethods (using burst, dual-exposure, or multi-exposure images) have made\nsignificant progress in addressing this issue, they typically focus on specific\nrestoration or enhancement problems, and do not fully explore the potential of\nutilizing multiple images. Motivated by the fact that multi-exposure images are\ncomplementary in denoising, deblurring, high dynamic range imaging, and\nsuper-resolution, we propose to utilize exposure bracketing photography to\nunify image restoration and enhancement tasks in this work. Due to the\ndifficulty in collecting real-world pairs, we suggest a solution that first\npre-trains the model with synthetic paired data and then adapts it to\nreal-world unlabeled images. In particular, a temporally modulated recurrent\nnetwork (TMRNet) and self-supervised adaptation method are proposed. Moreover,\nwe construct a data simulation pipeline to synthesize pairs and collect\nreal-world images from 200 nighttime scenarios. Experiments on both datasets\nshow that our method performs favorably against the state-of-the-art\nmulti-image processing ones. The dataset, code, and pre-trained models are\navailable at https://github.com/cszhilu1998/BracketIRE.\n","authors":["Zhilu Zhang","Shuohao Zhang","Renlong Wu","Zifei Yan","Wangmeng Zuo"],"pdf_url":"https://arxiv.org/pdf/2401.00766v4.pdf","comment":"21 pages"},{"id":"http://arxiv.org/abs/2405.20838v1","updated":"2024-05-31T14:25:45Z","published":"2024-05-31T14:25:45Z","title":"einspace: Searching for Neural Architectures from Fundamental Operations","summary":"  Neural architecture search (NAS) finds high performing networks for a given\ntask. Yet the results of NAS are fairly prosaic; they did not e.g. create a\nshift from convolutional structures to transformers. This is not least because\nthe search spaces in NAS often aren't diverse enough to include such\ntransformations a priori. Instead, for NAS to provide greater potential for\nfundamental design shifts, we need a novel expressive search space design which\nis built from more fundamental operations. To this end, we introduce einspace,\na search space based on a parameterised probabilistic context-free grammar. Our\nspace is versatile, supporting architectures of various sizes and complexities,\nwhile also containing diverse network operations which allow it to model\nconvolutions, attention components and more. It contains many existing\ncompetitive architectures, and provides flexibility for discovering new ones.\nUsing this search space, we perform experiments to find novel architectures as\nwell as improvements on existing ones on the diverse Unseen NAS datasets. We\nshow that competitive architectures can be obtained by searching from scratch,\nand we consistently find large improvements when initialising the search with\nstrong baselines. We believe that this work is an important advancement towards\na transformative NAS paradigm where search space expressivity and strategic\nsearch initialisation play key roles.\n","authors":["Linus Ericsson","Miguel Espinosa","Chenhongyi Yang","Antreas Antoniou","Amos Storkey","Shay B. Cohen","Steven McDonagh","Elliot J. Crowley"],"pdf_url":"https://arxiv.org/pdf/2405.20838v1.pdf","comment":"Project page at https://linusericsson.github.io/einspace/"},{"id":"http://arxiv.org/abs/2405.20834v1","updated":"2024-05-31T14:23:49Z","published":"2024-05-31T14:23:49Z","title":"Retrieval Meets Reasoning: Even High-school Textbook Knowledge Benefits\n  Multimodal Reasoning","summary":"  Large language models equipped with retrieval-augmented generation (RAG)\nrepresent a burgeoning field aimed at enhancing answering capabilities by\nleveraging external knowledge bases. Although the application of RAG with\nlanguage-only models has been extensively explored, its adaptation into\nmultimodal vision-language models remains nascent. Going beyond mere answer\ngeneration, the primary goal of multimodal RAG is to cultivate the models'\nability to reason in response to relevant queries. To this end, we introduce a\nnovel multimodal RAG framework named RMR (Retrieval Meets Reasoning). The RMR\nframework employs a bi-modal retrieval module to identify the most relevant\nquestion-answer pairs, which then serve as scaffolds for the multimodal\nreasoning process. This training-free approach not only encourages the model to\nengage deeply with the reasoning processes inherent in the retrieved content\nbut also facilitates the generation of answers that are precise and richly\ninterpretable. Surprisingly, utilizing solely the ScienceQA dataset, collected\nfrom elementary and high school science curricula, RMR significantly boosts the\nperformance of various vision-language models across a spectrum of benchmark\ndatasets, including A-OKVQA, MMBench, and SEED. These outcomes highlight the\nsubstantial potential of our multimodal retrieval and reasoning mechanism to\nimprove the reasoning capabilities of vision-language models.\n","authors":["Cheng Tan","Jingxuan Wei","Linzhuang Sun","Zhangyang Gao","Siyuan Li","Bihui Yu","Ruifeng Guo","Stan Z. Li"],"pdf_url":"https://arxiv.org/pdf/2405.20834v1.pdf","comment":"Under review"},{"id":"http://arxiv.org/abs/2404.07217v2","updated":"2024-05-31T14:23:09Z","published":"2024-02-23T10:08:45Z","title":"Attention-aware Semantic Communications for Collaborative Inference","summary":"  We propose a communication-efficient collaborative inference framework in the\ndomain of edge inference, focusing on the efficient use of vision transformer\n(ViT) models. The partitioning strategy of conventional collaborative inference\nfails to reduce communication cost because of the inherent architecture of ViTs\nmaintaining consistent layer dimensions across the entire transformer encoder.\nTherefore, instead of employing the partitioning strategy, our framework\nutilizes a lightweight ViT model on the edge device, with the server deploying\na complicated ViT model. To enhance communication efficiency and achieve the\nclassification accuracy of the server model, we propose two strategies: 1)\nattention-aware patch selection and 2) entropy-aware image transmission.\nAttention-aware patch selection leverages the attention scores generated by the\nedge device's transformer encoder to identify and select the image patches\ncritical for classification. This strategy enables the edge device to transmit\nonly the essential patches to the server, significantly improving communication\nefficiency. Entropy-aware image transmission uses min-entropy as a metric to\naccurately determine whether to depend on the lightweight model on the edge\ndevice or to request the inference from the server model. In our framework, the\nlightweight ViT model on the edge device acts as a semantic encoder,\nefficiently identifying and selecting the crucial image information required\nfor the classification task. Our experiments demonstrate that the proposed\ncollaborative inference framework can reduce communication overhead by 68% with\nonly a minimal loss in accuracy compared to the server model on the ImageNet\ndataset.\n","authors":["Jiwoong Im","Nayoung Kwon","Taewoo Park","Jiheon Woo","Jaeho Lee","Yongjune Kim"],"pdf_url":"https://arxiv.org/pdf/2404.07217v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.20829v1","updated":"2024-05-31T14:21:00Z","published":"2024-05-31T14:21:00Z","title":"Rethinking Open-World Semi-Supervised Learning: Distribution Mismatch\n  and Inductive Inference","summary":"  Open-world semi-supervised learning (OWSSL) extends conventional\nsemi-supervised learning to open-world scenarios by taking account of novel\ncategories in unlabeled datasets. Despite the recent advancements in OWSSL, the\nsuccess often relies on the assumptions that 1) labeled and unlabeled datasets\nshare the same balanced class prior distribution, which does not generally hold\nin real-world applications, and 2) unlabeled training datasets are utilized for\nevaluation, where such transductive inference might not adequately address\nchallenges in the wild. In this paper, we aim to generalize OWSSL by addressing\nthem. Our work suggests that practical OWSSL may require different training\nsettings, evaluation methods, and learning strategies compared to those\nprevalent in the existing literature.\n","authors":["Seongheon Park","Hyuk Kwon","Kwanghoon Sohn","Kibok Lee"],"pdf_url":"https://arxiv.org/pdf/2405.20829v1.pdf","comment":"CVPR Workshop on Computer Vision in the Wild (CVinW), 2024"},{"id":"http://arxiv.org/abs/2405.20810v1","updated":"2024-05-31T14:07:39Z","published":"2024-05-31T14:07:39Z","title":"Context-aware Difference Distilling for Multi-change Captioning","summary":"  Multi-change captioning aims to describe complex and coupled changes within\nan image pair in natural language. Compared with single-change captioning, this\ntask requires the model to have higher-level cognition ability to reason an\narbitrary number of changes. In this paper, we propose a novel context-aware\ndifference distilling (CARD) network to capture all genuine changes for\nyielding sentences. Given an image pair, CARD first decouples context features\nthat aggregate all similar/dissimilar semantics, termed common/difference\ncontext features. Then, the consistency and independence constraints are\ndesigned to guarantee the alignment/discrepancy of common/difference context\nfeatures. Further, the common context features guide the model to mine locally\nunchanged features, which are subtracted from the pair to distill locally\ndifference features. Next, the difference context features augment the locally\ndifference features to ensure that all changes are distilled. In this way, we\nobtain an omni-representation of all changes, which is translated into\nlinguistic sentences by a transformer decoder. Extensive experiments on three\npublic datasets show CARD performs favourably against state-of-the-art\nmethods.The code is available at https://github.com/tuyunbin/CARD.\n","authors":["Yunbin Tu","Liang Li","Li Su","Zheng-Jun Zha","Chenggang Yan","Qingming Huang"],"pdf_url":"https://arxiv.org/pdf/2405.20810v1.pdf","comment":"Accepted by ACL 2024 main conference (long paper)"},{"id":"http://arxiv.org/abs/2402.12550v2","updated":"2024-05-31T14:04:05Z","published":"2024-02-19T21:20:22Z","title":"Multilinear Mixture of Experts: Scalable Expert Specialization through\n  Factorization","summary":"  The Mixture of Experts (MoE) paradigm provides a powerful way to decompose\ndense layers into smaller, modular computations often more amenable to human\ninterpretation, debugging, and editability. However, a major challenge lies in\nthe computational cost of scaling the number of experts high enough to achieve\nfine-grained specialization. In this paper, we propose the Multilinear Mixture\nof Experts ($\\mu$MoE) layer to address this, focusing on vision models.\n$\\mu$MoE layers enable scalable expert specialization by performing an implicit\ncomputation on prohibitively large weight tensors entirely in factorized form.\nConsequently, $\\mu$MoEs (1) avoid the restrictively high inference-time costs\nof 'soft' MoEs, yet (2) do not inherit the training issues of the popular\n'sparse' MoEs' discrete (non-differentiable) expert routing. We present both\nqualitative and quantitative evidence that scaling $\\mu$MoE layers when\nfine-tuning foundation models for vision tasks leads to more specialized\nexperts at the class-level, further enabling manual bias correction in CelebA\nattribute classification. Finally, we show qualitative results demonstrating\nthe expert specialism achieved when pre-training large GPT2 and MLP-Mixer\nmodels with parameter-matched $\\mu$MoE blocks at every layer, maintaining\ncomparable accuracy. Our code is available at:\nhttps://github.com/james-oldfield/muMoE.\n","authors":["James Oldfield","Markos Georgopoulos","Grigorios G. Chrysos","Christos Tzelepis","Yannis Panagakis","Mihalis A. Nicolaou","Jiankang Deng","Ioannis Patras"],"pdf_url":"https://arxiv.org/pdf/2402.12550v2.pdf","comment":"Github: https://github.com/james-oldfield/muMoE. Project page:\n  https://james-oldfield.github.io/muMoE/"},{"id":"http://arxiv.org/abs/2405.18839v2","updated":"2024-05-31T14:03:07Z","published":"2024-05-29T07:40:31Z","title":"MEGA: Masked Generative Autoencoder for Human Mesh Recovery","summary":"  Human Mesh Recovery (HMR) from a single RGB image is a highly ambiguous\nproblem, as similar 2D projections can correspond to multiple 3D\ninterpretations. Nevertheless, most HMR methods overlook this ambiguity and\nmake a single prediction without accounting for the associated uncertainty. A\nfew approaches generate a distribution of human meshes, enabling the sampling\nof multiple predictions; however, none of them is competitive with the latest\nsingle-output model when making a single prediction. This work proposes a new\napproach based on masked generative modeling. By tokenizing the human pose and\nshape, we formulate the HMR task as generating a sequence of discrete tokens\nconditioned on an input image. We introduce MEGA, a MaskEd Generative\nAutoencoder trained to recover human meshes from images and partial human mesh\ntoken sequences. Given an image, our flexible generation scheme allows us to\npredict a single human mesh in deterministic mode or to generate multiple human\nmeshes in stochastic mode. MEGA enables us to propose multiple outputs and to\nevaluate the uncertainty of the predictions. Experiments on in-the-wild\nbenchmarks show that MEGA achieves state-of-the-art performance in\ndeterministic and stochastic modes, outperforming single-output and\nmulti-output approaches.\n","authors":["Guénolé Fiche","Simon Leglaive","Xavier Alameda-Pineda","Francesc Moreno-Noguer"],"pdf_url":"https://arxiv.org/pdf/2405.18839v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.20797v1","updated":"2024-05-31T13:59:18Z","published":"2024-05-31T13:59:18Z","title":"Ovis: Structural Embedding Alignment for Multimodal Large Language Model","summary":"  Current Multimodal Large Language Models (MLLMs) typically integrate a\npre-trained LLM with another pre-trained vision transformer through a\nconnector, such as an MLP, endowing the LLM with visual capabilities. However,\nthe misalignment between two embedding strategies in MLLMs -- the structural\ntextual embeddings based on an embedding look-up table and the continuous\nembeddings generated directly by the vision encoder -- makes challenges for a\nmore seamless fusion of visual and textual information. We propose Ovis, a\nnovel MLLM architecture designed to structurally align visual and textual\nembeddings. Ovis integrates an additional learnable visual embedding table into\nthe visual encoder's process. To capture rich visual semantics, each image\npatch indexes the visual embedding table multiple times, resulting in a final\nvisual embedding that is a probabilistic combination of the indexed embeddings.\nThis structural approach mirrors the method used for generating textual\nembeddings. Empirical evaluations on various multimodal benchmarks demonstrate\nthat Ovis outperforms open-source MLLMs of similar parameter scales and even\nsurpasses the proprietary model Qwen-VL-Plus overall. These results highlight\nthe potential of Ovis' structured visual representation for advancing MLLM\narchitectural design and promoting more effective multimodal learning. Both the\nsource code and the training dataset of Ovis will be made publicly available.\n","authors":["Shiyin Lu","Yang Li","Qing-Guo Chen","Zhao Xu","Weihua Luo","Kaifu Zhang","Han-Jia Ye"],"pdf_url":"https://arxiv.org/pdf/2405.20797v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.20795v1","updated":"2024-05-31T13:56:55Z","published":"2024-05-31T13:56:55Z","title":"InsightSee: Advancing Multi-agent Vision-Language Models for Enhanced\n  Visual Understanding","summary":"  Accurate visual understanding is imperative for advancing autonomous systems\nand intelligent robots. Despite the powerful capabilities of vision-language\nmodels (VLMs) in processing complex visual scenes, precisely recognizing\nobscured or ambiguously presented visual elements remains challenging. To\ntackle such issues, this paper proposes InsightSee, a multi-agent framework to\nenhance VLMs' interpretative capabilities in handling complex visual\nunderstanding scenarios. The framework comprises a description agent, two\nreasoning agents, and a decision agent, which are integrated to refine the\nprocess of visual information interpretation. The design of these agents and\nthe mechanisms by which they can be enhanced in visual information processing\nare presented. Experimental results demonstrate that the InsightSee framework\nnot only boosts performance on specific visual tasks but also retains the\noriginal models' strength. The proposed framework outperforms state-of-the-art\nalgorithms in 6 out of 9 benchmark tests, with a substantial advancement in\nmultimodal understanding.\n","authors":["Huaxiang Zhang","Yaojia Mu","Guo-Niu Zhu","Zhongxue Gan"],"pdf_url":"https://arxiv.org/pdf/2405.20795v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.19473v5","updated":"2024-05-31T13:56:39Z","published":"2024-02-29T18:59:01Z","title":"Retrieval-Augmented Generation for AI-Generated Content: A Survey","summary":"  Advancements in model algorithms, the growth of foundational models, and\naccess to high-quality datasets have propelled the evolution of Artificial\nIntelligence Generated Content (AIGC). Despite its notable successes, AIGC\nstill faces hurdles such as updating knowledge, handling long-tail data,\nmitigating data leakage, and managing high training and inference costs.\nRetrieval-Augmented Generation (RAG) has recently emerged as a paradigm to\naddress such challenges. In particular, RAG introduces the information\nretrieval process, which enhances the generation process by retrieving relevant\nobjects from available data stores, leading to higher accuracy and better\nrobustness. In this paper, we comprehensively review existing efforts that\nintegrate RAG technique into AIGC scenarios. We first classify RAG foundations\naccording to how the retriever augments the generator, distilling the\nfundamental abstractions of the augmentation methodologies for various\nretrievers and generators. This unified perspective encompasses all RAG\nscenarios, illuminating advancements and pivotal technologies that help with\npotential future progress. We also summarize additional enhancements methods\nfor RAG, facilitating effective engineering and implementation of RAG systems.\nThen from another view, we survey on practical applications of RAG across\ndifferent modalities and tasks, offering valuable references for researchers\nand practitioners. Furthermore, we introduce the benchmarks for RAG, discuss\nthe limitations of current RAG systems, and suggest potential directions for\nfuture research. Github: https://github.com/PKU-DAIR/RAG-Survey.\n","authors":["Penghao Zhao","Hailin Zhang","Qinhan Yu","Zhengren Wang","Yunteng Geng","Fangcheng Fu","Ling Yang","Wentao Zhang","Jie Jiang","Bin Cui"],"pdf_url":"https://arxiv.org/pdf/2402.19473v5.pdf","comment":"Citing 334 papers, 21 pages, 1 table, 12 figures. Project:\n  https://github.com/PKU-DAIR/RAG-Survey"},{"id":"http://arxiv.org/abs/2405.20791v1","updated":"2024-05-31T13:48:54Z","published":"2024-05-31T13:48:54Z","title":"GS-Phong: Meta-Learned 3D Gaussians for Relightable Novel View Synthesis","summary":"  Decoupling the illumination in 3D scenes is crucial for novel view synthesis\nand relighting. In this paper, we propose a novel method for representing a\nscene illuminated by a point light using a set of relightable 3D Gaussian\npoints. Inspired by the Blinn-Phong model, our approach decomposes the scene\ninto ambient, diffuse, and specular components, enabling the synthesis of\nrealistic lighting effects. To facilitate the decomposition of geometric\ninformation independent of lighting conditions, we introduce a novel bilevel\noptimization-based meta-learning framework. The fundamental idea is to view the\nrendering tasks under various lighting positions as a multi-task learning\nproblem, which our meta-learning approach effectively addresses by generalizing\nthe learned Gaussian geometries not only across different viewpoints but also\nacross diverse light positions. Experimental results demonstrate the\neffectiveness of our approach in terms of training efficiency and rendering\nquality compared to existing methods for free-viewpoint relighting.\n","authors":["Yumeng He","Yunbo Wang","Xiaokang Yang"],"pdf_url":"https://arxiv.org/pdf/2405.20791v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.15356v2","updated":"2024-05-31T12:53:36Z","published":"2023-11-26T17:17:28Z","title":"Having Second Thoughts? Let's hear it","summary":"  Deep learning models loosely mimic bottom-up signal pathways from low-order\nsensory areas to high-order cognitive areas. After training, DL models can\noutperform humans on some domain-specific tasks, but their decision-making\nprocess has been known to be easily disrupted. Since the human brain consists\nof multiple functional areas highly connected to one another and relies on\nintricate interplays between bottom-up and top-down (from high-order to\nlow-order areas) processing, we hypothesize that incorporating top-down signal\nprocessing may make DL models more robust. To address this hypothesis, we\npropose a certification process mimicking selective attention and test if it\ncould make DL models more robust. Our empirical evaluations suggest that this\nnewly proposed certification can improve DL models' accuracy and help us build\nsafety measures to alleviate their vulnerabilities with both artificial and\nnatural adversarial examples.\n","authors":["Jung H. Lee","Sujith Vijayan"],"pdf_url":"https://arxiv.org/pdf/2311.15356v2.pdf","comment":"10 pages, 6 figures, 3 table and Append/Supplementary materials.\n  Section 3 has been substantially revised"},{"id":"http://arxiv.org/abs/2405.20764v1","updated":"2024-05-31T12:35:06Z","published":"2024-05-31T12:35:06Z","title":"CoMoFusion: Fast and High-quality Fusion of Infrared and Visible Image\n  with Consistency Model","summary":"  Generative models are widely utilized to model the distribution of fused\nimages in the field of infrared and visible image fusion. However, current\ngenerative models based fusion methods often suffer from unstable training and\nslow inference speed. To tackle this problem, a novel fusion method based on\nconsistency model is proposed, termed as CoMoFusion, which can generate the\nhigh-quality images and achieve fast image inference speed. In specific, the\nconsistency model is used to construct multi-modal joint features in the latent\nspace with the forward and reverse process. Then, the infrared and visible\nfeatures extracted by the trained consistency model are fed into fusion module\nto generate the final fused image. In order to enhance the texture and salient\ninformation of fused images, a novel loss based on pixel value selection is\nalso designed. Extensive experiments on public datasets illustrate that our\nmethod obtains the SOTA fusion performance compared with the existing fusion\nmethods.\n","authors":["Zhiming Meng","Hui Li","Zeyang Zhang","Zhongwei Shen","Yunlong Yu","Xiaoning Song","Xiaojun Wu"],"pdf_url":"https://arxiv.org/pdf/2405.20764v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.20759v1","updated":"2024-05-31T12:20:02Z","published":"2024-05-31T12:20:02Z","title":"Information Theoretic Text-to-Image Alignment","summary":"  Diffusion models for Text-to-Image (T2I) conditional generation have seen\ntremendous success recently. Despite their success, accurately capturing user\nintentions with these models still requires a laborious trial and error\nprocess. This challenge is commonly identified as a model alignment problem, an\nissue that has attracted considerable attention by the research community.\nInstead of relying on fine-grained linguistic analyses of prompts, human\nannotation, or auxiliary vision-language models to steer image generation, in\nthis work we present a novel method that relies on an information-theoretic\nalignment measure. In a nutshell, our method uses self-supervised fine-tuning\nand relies on point-wise mutual information between prompts and images to\ndefine a synthetic training set to induce model alignment. Our comparative\nanalysis shows that our method is on-par or superior to the state-of-the-art,\nyet requires nothing but a pre-trained denoising network to estimate MI and a\nlightweight fine-tuning strategy.\n","authors":["Chao Wang","Giulio Franzese","Alessandro Finamore","Massimo Gallo","Pietro Michiardi"],"pdf_url":"https://arxiv.org/pdf/2405.20759v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.19996v2","updated":"2024-05-31T11:39:33Z","published":"2024-05-30T12:32:35Z","title":"DP-IQA: Utilizing Diffusion Prior for Blind Image Quality Assessment in\n  the Wild","summary":"  Image quality assessment (IQA) plays a critical role in selecting\nhigh-quality images and guiding compression and enhancement methods in a series\nof applications. The blind IQA, which assesses the quality of in-the-wild\nimages containing complex authentic distortions without reference images, poses\ngreater challenges. Existing methods are limited to modeling a uniform\ndistribution with local patches and are bothered by the gap between low and\nhigh-level visions (caused by widely adopted pre-trained classification\nnetworks). In this paper, we propose a novel IQA method called diffusion\npriors-based IQA (DP-IQA), which leverages the prior knowledge from the\npre-trained diffusion model with its excellent powers to bridge semantic gaps\nin the perception of the visual quality of images. Specifically, we use\npre-trained stable diffusion as the backbone, extract multi-level features from\nthe denoising U-Net during the upsampling process at a specified timestep, and\ndecode them to estimate the image quality score. The text and image adapters\nare adopted to mitigate the domain gap for downstream tasks and correct the\ninformation loss caused by the variational autoencoder bottleneck. Finally, we\ndistill the knowledge in the above model into a CNN-based student model,\nsignificantly reducing the parameter to enhance applicability, with the student\nmodel performing similarly or even better than the teacher model surprisingly.\nExperimental results demonstrate that our DP-IQA achieves state-of-the-art\nresults on various in-the-wild datasets with better generalization capability,\nwhich shows the superiority of our method in global modeling and utilizing the\nhierarchical feature clues of diffusion for evaluating image quality.\n","authors":["Honghao Fu","Yufei Wang","Wenhan Yang","Bihan Wen"],"pdf_url":"https://arxiv.org/pdf/2405.19996v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.20750v1","updated":"2024-05-31T11:14:12Z","published":"2024-05-31T11:14:12Z","title":"Diffusion Models Are Innate One-Step Generators","summary":"  Diffusion Models (DMs) have achieved great success in image generation and\nother fields. By fine sampling through the trajectory defined by the SDE/ODE\nsolver based on a well-trained score model, DMs can generate remarkable\nhigh-quality results. However, this precise sampling often requires multiple\nsteps and is computationally demanding. To address this problem, instance-based\ndistillation methods have been proposed to distill a one-step generator from a\nDM by having a simpler student model mimic a more complex teacher model. Yet,\nour research reveals an inherent limitations in these methods: the teacher\nmodel, with more steps and more parameters, occupies different local minima\ncompared to the student model, leading to suboptimal performance when the\nstudent model attempts to replicate the teacher. To avoid this problem, we\nintroduce a novel distributional distillation method, which uses an exclusive\ndistributional loss. This method exceeds state-of-the-art (SOTA) results while\nrequiring significantly fewer training images. Additionally, we show that DMs'\nlayers are activated differently at different time steps, leading to an\ninherent capability to generate images in a single step. Freezing most of the\nconvolutional layers in a DM during distributional distillation leads to\nfurther performance improvements. Our method achieves the SOTA results on\nCIFAR-10 (FID 1.54), AFHQv2 64x64 (FID 1.23), FFHQ 64x64 (FID 0.85) and\nImageNet 64x64 (FID 1.16) with great efficiency. Most of those results are\nobtained with only 5 million training images within 6 hours on 8 A100 GPUs.\nThis breakthrough not only enhances the understanding of efficient image\ngeneration models but also offers a scalable framework for advancing the state\nof the art in various applications.\n","authors":["Bowen Zheng","Tianming Yang"],"pdf_url":"https://arxiv.org/pdf/2405.20750v1.pdf","comment":"9 pages, 4 figures and 4 tables on the main contents"},{"id":"http://arxiv.org/abs/2405.14200v2","updated":"2024-05-31T11:09:59Z","published":"2024-05-23T05:58:10Z","title":"Awesome Multi-modal Object Tracking","summary":"  Multi-modal object tracking (MMOT) is an emerging field that combines data\nfrom various modalities, \\eg vision (RGB), depth, thermal infrared, event,\nlanguage and audio, to estimate the state of an arbitrary object in a video\nsequence. It is of great significance for many applications such as autonomous\ndriving and intelligent surveillance. In recent years, MMOT has received more\nand more attention. However, existing MMOT algorithms mainly focus on two\nmodalities (\\eg RGB+depth, RGB+thermal infrared, and RGB+language). To leverage\nmore modalities, some recent efforts have been made to learn a unified visual\nobject tracking model for any modality. Additionally, some large-scale\nmulti-modal tracking benchmarks have been established by simultaneously\nproviding more than two modalities, such as vision-language-audio (\\eg\nWebUAV-3M) and vision-depth-language (\\eg UniMod1K). To track the latest\nprogress in MMOT, we conduct a comprehensive investigation in this report.\nSpecifically, we first divide existing MMOT tasks into five main categories,\n\\ie RGBL tracking, RGBE tracking, RGBD tracking, RGBT tracking, and\nmiscellaneous (RGB+X), where X can be any modality, such as language, depth,\nand event. Then, we analyze and summarize each MMOT task, focusing on widely\nused datasets and mainstream tracking algorithms based on their technical\nparadigms (\\eg self-supervised learning, prompt learning, knowledge\ndistillation, generative models, and state space models). Finally, we maintain\na continuously updated paper list for MMOT at\nhttps://github.com/983632847/Awesome-Multimodal-Object-Tracking.\n","authors":["Chunhui Zhang","Li Liu","Hao Wen","Xi Zhou","Yanfeng Wang"],"pdf_url":"https://arxiv.org/pdf/2405.14200v2.pdf","comment":"A continuously updated project to track the latest progress in\n  multi-modal object tracking"},{"id":"http://arxiv.org/abs/2405.20330v2","updated":"2024-05-31T10:52:56Z","published":"2024-05-30T17:59:02Z","title":"4DHands: Reconstructing Interactive Hands in 4D with Transformers","summary":"  In this paper, we introduce 4DHands, a robust approach to recovering\ninteractive hand meshes and their relative movement from monocular inputs. Our\napproach addresses two major limitations of previous methods: lacking a unified\nsolution for handling various hand image inputs and neglecting the positional\nrelationship of two hands within images. To overcome these challenges, we\ndevelop a transformer-based architecture with novel tokenization and feature\nfusion strategies. Specifically, we propose a Relation-aware Two-Hand\nTokenization (RAT) method to embed positional relation information into the\nhand tokens. In this way, our network can handle both single-hand and two-hand\ninputs and explicitly leverage relative hand positions, facilitating the\nreconstruction of intricate hand interactions in real-world scenarios. As such\ntokenization indicates the relative relationship of two hands, it also supports\nmore effective feature fusion. To this end, we further develop a\nSpatio-temporal Interaction Reasoning (SIR) module to fuse hand tokens in 4D\nwith attention and decode them into 3D hand meshes and relative temporal\nmovements. The efficacy of our approach is validated on several benchmark\ndatasets. The results on in-the-wild videos and real-world scenarios\ndemonstrate the superior performances of our approach for interactive hand\nreconstruction. More video results can be found on the project page:\nhttps://4dhands.github.io.\n","authors":["Dixuan Lin","Yuxiang Zhang","Mengcheng Li","Yebin Liu","Wei Jing","Qi Yan","Qianying Wang","Hongwen Zhang"],"pdf_url":"https://arxiv.org/pdf/2405.20330v2.pdf","comment":"More demo videos can be seen at our project page:\n  https://4dhands.github.io"},{"id":"http://arxiv.org/abs/2405.20743v1","updated":"2024-05-31T10:13:17Z","published":"2024-05-31T10:13:17Z","title":"Trajectory Forecasting through Low-Rank Adaptation of Discrete Latent\n  Codes","summary":"  Trajectory forecasting is crucial for video surveillance analytics, as it\nenables the anticipation of future movements for a set of agents, e.g.\nbasketball players engaged in intricate interactions with long-term intentions.\nDeep generative models offer a natural learning approach for trajectory\nforecasting, yet they encounter difficulties in achieving an optimal balance\nbetween sampling fidelity and diversity. We address this challenge by\nleveraging Vector Quantized Variational Autoencoders (VQ-VAEs), which utilize a\ndiscrete latent space to tackle the issue of posterior collapse. Specifically,\nwe introduce an instance-based codebook that allows tailored latent\nrepresentations for each example. In a nutshell, the rows of the codebook are\ndynamically adjusted to reflect contextual information (i.e., past motion\npatterns extracted from the observed trajectories). In this way, the\ndiscretization process gains flexibility, leading to improved reconstructions.\nNotably, instance-level dynamics are injected into the codebook through\nlow-rank updates, which restrict the customization of the codebook to a lower\ndimension space. The resulting discrete space serves as the basis of the\nsubsequent step, which regards the training of a diffusion-based predictive\nmodel. We show that such a two-fold framework, augmented with instance-level\ndiscretization, leads to accurate and diverse forecasts, yielding\nstate-of-the-art performance on three established benchmarks.\n","authors":["Riccardo Benaglia","Angelo Porrello","Pietro Buzzega","Simone Calderara","Rita Cucchiara"],"pdf_url":"https://arxiv.org/pdf/2405.20743v1.pdf","comment":"15 pages, 3 figures, 5 tables"},{"id":"http://arxiv.org/abs/2405.20735v1","updated":"2024-05-31T09:59:11Z","published":"2024-05-31T09:59:11Z","title":"Language Augmentation in CLIP for Improved Anatomy Detection on\n  Multi-modal Medical Images","summary":"  Vision-language models have emerged as a powerful tool for previously\nchallenging multi-modal classification problem in the medical domain. This\ndevelopment has led to the exploration of automated image description\ngeneration for multi-modal clinical scans, particularly for radiology report\ngeneration. Existing research has focused on clinical descriptions for specific\nmodalities or body regions, leaving a gap for a model providing entire-body\nmulti-modal descriptions. In this paper, we address this gap by automating the\ngeneration of standardized body station(s) and list of organ(s) across the\nwhole body in multi-modal MR and CT radiological images. Leveraging the\nversatility of the Contrastive Language-Image Pre-training (CLIP), we refine\nand augment the existing approach through multiple experiments, including\nbaseline model fine-tuning, adding station(s) as a superset for better\ncorrelation between organs, along with image and language augmentations. Our\nproposed approach demonstrates 47.6% performance improvement over baseline\nPubMedCLIP.\n","authors":["Mansi Kakkar","Dattesh Shanbhag","Chandan Aladahalli","Gurunath Reddy M"],"pdf_url":"https://arxiv.org/pdf/2405.20735v1.pdf","comment":"$\\copyright$ 2024 IEEE. Accepted in 46th Annual International\n  Conference of the IEEE Engineering in Medicine and Biology Society (EMBC)\n  2024"},{"id":"http://arxiv.org/abs/2405.20729v1","updated":"2024-05-31T09:37:39Z","published":"2024-05-31T09:37:39Z","title":"Extreme Point Supervised Instance Segmentation","summary":"  This paper introduces a novel approach to learning instance segmentation\nusing extreme points, i.e., the topmost, leftmost, bottommost, and rightmost\npoints, of each object. These points are readily available in the modern\nbounding box annotation process while offering strong clues for precise\nsegmentation, and thus allows to improve performance at the same annotation\ncost with box-supervised methods. Our work considers extreme points as a part\nof the true instance mask and propagates them to identify potential foreground\nand background points, which are all together used for training a pseudo label\ngenerator. Then pseudo labels given by the generator are in turn used for\nsupervised learning of our final model. On three public benchmarks, our method\nsignificantly outperforms existing box-supervised methods, further narrowing\nthe gap with its fully supervised counterpart. In particular, our model\ngenerates high-quality masks when a target object is separated into multiple\nparts, where previous box-supervised methods often fail.\n","authors":["Hyeonjun Lee","Sehyun Hwang","Suha Kwak"],"pdf_url":"https://arxiv.org/pdf/2405.20729v1.pdf","comment":"CVPR 2024 Accepted"},{"id":"http://arxiv.org/abs/2405.20091v2","updated":"2024-05-31T09:35:36Z","published":"2024-05-30T14:27:40Z","title":"Visual Attention Analysis in Online Learning","summary":"  In this paper, we present an approach in the Multimodal Learning Analytics\nfield. Within this approach, we have developed a tool to visualize and analyze\neye movement data collected during learning sessions in online courses. The\ntool is named VAAD (an acronym for Visual Attention Analysis Dashboard). These\neye movement data have been gathered using an eye-tracker and subsequently\nprocessed and visualized for interpretation. The purpose of the tool is to\nconduct a descriptive analysis of the data by facilitating its visualization,\nenabling the identification of differences and learning patterns among various\nlearner populations. Additionally, it integrates a predictive module capable of\nanticipating learner activities during a learning session. Consequently, VAAD\nholds the potential to offer valuable insights into online learning behaviors\nfrom both descriptive and predictive perspectives.\n","authors":["Miriam Navarro","Álvaro Becerra","Roberto Daza","Ruth Cobos","Aythami Morales","Julian Fierrez"],"pdf_url":"https://arxiv.org/pdf/2405.20091v2.pdf","comment":"Accepted in CEDI 2024 (VII Congreso Espa\\~nol de Inform\\'atica), A\n  Coru\\~na, Spain"},{"id":"http://arxiv.org/abs/2405.20725v1","updated":"2024-05-31T09:29:43Z","published":"2024-05-31T09:29:43Z","title":"GI-NAS: Boosting Gradient Inversion Attacks through Adaptive Neural\n  Architecture Search","summary":"  Gradient Inversion Attacks invert the transmitted gradients in Federated\nLearning (FL) systems to reconstruct the sensitive data of local clients and\nhave raised considerable privacy concerns. A majority of gradient inversion\nmethods rely heavily on explicit prior knowledge (e.g., a well pre-trained\ngenerative model), which is often unavailable in realistic scenarios. To\nalleviate this issue, researchers have proposed to leverage the implicit prior\nknowledge of an over-parameterized network. However, they only utilize a fixed\nneural architecture for all the attack settings. This would hinder the adaptive\nuse of implicit architectural priors and consequently limit the\ngeneralizability. In this paper, we further exploit such implicit prior\nknowledge by proposing Gradient Inversion via Neural Architecture Search\n(GI-NAS), which adaptively searches the network and captures the implicit\npriors behind neural architectures. Extensive experiments verify that our\nproposed GI-NAS can achieve superior attack performance compared to\nstate-of-the-art gradient inversion methods, even under more practical settings\nwith high-resolution images, large-sized batches, and advanced defense\nstrategies.\n","authors":["Wenbo Yu","Hao Fang","Bin Chen","Xiaohang Sui","Chuan Chen","Hao Wu","Shu-Tao Xia","Ke Xu"],"pdf_url":"https://arxiv.org/pdf/2405.20725v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.20721v1","updated":"2024-05-31T09:23:39Z","published":"2024-05-31T09:23:39Z","title":"ContextGS: Compact 3D Gaussian Splatting with Anchor Level Context Model","summary":"  Recently, 3D Gaussian Splatting (3DGS) has become a promising framework for\nnovel view synthesis, offering fast rendering speeds and high fidelity.\nHowever, the large number of Gaussians and their associated attributes require\neffective compression techniques. Existing methods primarily compress neural\nGaussians individually and independently, i.e., coding all the neural Gaussians\nat the same time, with little design for their interactions and spatial\ndependence. Inspired by the effectiveness of the context model in image\ncompression, we propose the first autoregressive model at the anchor level for\n3DGS compression in this work. We divide anchors into different levels and the\nanchors that are not coded yet can be predicted based on the already coded ones\nin all the coarser levels, leading to more accurate modeling and higher coding\nefficiency. To further improve the efficiency of entropy coding, e.g., to code\nthe coarsest level with no already coded anchors, we propose to introduce a\nlow-dimensional quantized feature as the hyperprior for each anchor, which can\nbe effectively compressed. Our work pioneers the context model in the anchor\nlevel for 3DGS representation, yielding an impressive size reduction of over\n100 times compared to vanilla 3DGS and 15 times compared to the most recent\nstate-of-the-art work Scaffold-GS, while achieving comparable or even higher\nrendering quality.\n","authors":["Yufei Wang","Zhihao Li","Lanqing Guo","Wenhan Yang","Alex C. Kot","Bihan Wen"],"pdf_url":"https://arxiv.org/pdf/2405.20721v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.20720v1","updated":"2024-05-31T09:23:25Z","published":"2024-05-31T09:23:25Z","title":"Power of Cooperative Supervision: Multiple Teachers Framework for\n  Enhanced 3D Semi-Supervised Object Detection","summary":"  To ensure safe urban driving for autonomous platforms, it is crucial not only\nto develop high-performance object detection techniques but also to establish a\ndiverse and representative dataset that captures various urban environments and\nobject characteristics. To address these two issues, we have constructed a\nmulti-class 3D LiDAR dataset reflecting diverse urban environments and object\ncharacteristics, and developed a robust 3D semi-supervised object detection\n(SSOD) based on a multiple teachers framework. This SSOD framework categorizes\nsimilar classes and assigns specialized teachers to each category. Through\ncollaborative supervision among these category-specialized teachers, the\nstudent network becomes increasingly proficient, leading to a highly effective\nobject detector. We propose a simple yet effective augmentation technique,\nPie-based Point Compensating Augmentation (PieAug), to enable the teacher\nnetwork to generate high-quality pseudo-labels. Extensive experiments on the\nWOD, KITTI, and our datasets validate the effectiveness of our proposed method\nand the quality of our dataset. Experimental results demonstrate that our\napproach consistently outperforms existing state-of-the-art 3D semi-supervised\nobject detection methods across all datasets. We plan to release our\nmulti-class LiDAR dataset and the source code available on our Github\nrepository in the near future.\n","authors":["Jin-Hee Lee","Jae-Keun Lee","Je-Seok Kim","Soon Kwon"],"pdf_url":"https://arxiv.org/pdf/2405.20720v1.pdf","comment":"under review"},{"id":"http://arxiv.org/abs/2405.20719v1","updated":"2024-05-31T09:20:33Z","published":"2024-05-31T09:20:33Z","title":"Climate Variable Downscaling with Conditional Normalizing Flows","summary":"  Predictions of global climate models typically operate on coarse spatial\nscales due to the large computational costs of climate simulations. This has\nled to a considerable interest in methods for statistical downscaling, a\nsimilar process to super-resolution in the computer vision context, to provide\nmore local and regional climate information. In this work, we apply conditional\nnormalizing flows to the task of climate variable downscaling. We showcase its\nsuccessful performance on an ERA5 water content dataset for different\nupsampling factors. Additionally, we show that the method allows us to assess\nthe predictive uncertainty in terms of standard deviation from the fitted\nconditional distribution mean.\n","authors":["Christina Winkler","Paula Harder","David Rolnick"],"pdf_url":"https://arxiv.org/pdf/2405.20719v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.20717v1","updated":"2024-05-31T09:14:36Z","published":"2024-05-31T09:14:36Z","title":"Cyclic image generation using chaotic dynamics","summary":"  Successive image generation using cyclic transformations is demonstrated by\nextending the CycleGAN model to transform images among three different\ncategories. Repeated application of the trained generators produces sequences\nof images that transition among the different categories. The generated image\nsequences occupy a more limited region of the image space compared with the\noriginal training dataset. Quantitative evaluation using precision and recall\nmetrics indicates that the generated images have high quality but reduced\ndiversity relative to the training dataset. Such successive generation\nprocesses are characterized as chaotic dynamics in terms of dynamical system\ntheory. Positive Lyapunov exponents estimated from the generated trajectories\nconfirm the presence of chaotic dynamics, with the Lyapunov dimension of the\nattractor found to be comparable to the intrinsic dimension of the training\ndata manifold. The results suggest that chaotic dynamics in the image space\ndefined by the deep generative model contribute to the diversity of the\ngenerated images, constituting a novel approach for multi-class image\ngeneration. This model can be interpreted as an extension of classical\nassociative memory to perform hetero-association among image categories.\n","authors":["Takaya Tanaka","Yutaka Yamaguti"],"pdf_url":"https://arxiv.org/pdf/2405.20717v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.20711v1","updated":"2024-05-31T09:07:15Z","published":"2024-05-31T09:07:15Z","title":"Revisiting Mutual Information Maximization for Generalized Category\n  Discovery","summary":"  Generalized category discovery presents a challenge in a realistic scenario,\nwhich requires the model's generalization ability to recognize unlabeled\nsamples from known and unknown categories. This paper revisits the challenge of\ngeneralized category discovery through the lens of information maximization\n(InfoMax) with a probabilistic parametric classifier. Our findings reveal that\nensuring independence between known and unknown classes while concurrently\nassuming a uniform probability distribution across all classes, yields an\nenlarged margin among known and unknown classes that promotes the model's\nperformance. To achieve the aforementioned independence, we propose a novel\nInfoMax-based method, Regularized Parametric InfoMax (RPIM), which adopts\npseudo labels to supervise unlabeled samples during InfoMax, while proposing a\nregularization to ensure the quality of the pseudo labels. Additionally, we\nintroduce novel semantic-bias transformation to refine the features from the\npre-trained model instead of direct fine-tuning to rescue the computational\ncosts. Extensive experiments on six benchmark datasets validate the\neffectiveness of our method. RPIM significantly improves the performance\nregarding unknown classes, surpassing the state-of-the-art method by an average\nmargin of 3.5%.\n","authors":["Zhaorui Tan","Chengrui Zhang","Xi Yang","Jie Sun","Kaizhu Huang"],"pdf_url":"https://arxiv.org/pdf/2405.20711v1.pdf","comment":"Preprint version"},{"id":"http://arxiv.org/abs/2405.11129v2","updated":"2024-05-31T08:56:29Z","published":"2024-05-18T00:47:29Z","title":"MotionGS : Compact Gaussian Splatting SLAM by Motion Filter","summary":"  With their high-fidelity scene representation capability, the attention of\nSLAM field is deeply attracted by the Neural Radiation Field (NeRF) and 3D\nGaussian Splatting (3DGS). Recently, there has been a surge in NeRF-based SLAM,\nwhile 3DGS-based SLAM is sparse. A novel 3DGS-based SLAM approach with a fusion\nof deep visual feature, dual keyframe selection and 3DGS is presented in this\npaper. Compared with the existing methods, the proposed tracking is achieved by\nfeature extraction and motion filter on each frame. The joint optimization of\nposes and 3D Gaussians runs through the entire mapping process. Additionally,\nthe coarse-to-fine pose estimation and compact Gaussian scene representation\nare implemented by dual keyframe selection and novel loss functions.\nExperimental results demonstrate that the proposed algorithm not only\noutperforms the existing methods in tracking and mapping, but also has less\nmemory usage.\n","authors":["Xinli Guo","Weidong Zhang","Ruonan Liu","Peng Han","Hongtian Chen"],"pdf_url":"https://arxiv.org/pdf/2405.11129v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2103.03636v2","updated":"2024-05-31T08:50:18Z","published":"2021-03-05T12:44:22Z","title":"CoDeGAN: Contrastive Disentanglement for Generative Adversarial Network","summary":"  Disentanglement, a critical concern in interpretable machine learning, has\nalso garnered significant attention from the computer vision community. Many\nexisting GAN-based class disentanglement (unsupervised) approaches, such as\nInfoGAN and its variants, primarily aim to maximize the mutual information (MI)\nbetween the generated image and its latent codes. However, this focus may lead\nto a tendency for the network to generate highly similar images when presented\nwith the same latent class factor, potentially resulting in mode collapse or\nmode dropping. To alleviate this problem, we propose \\texttt{CoDeGAN}\n(Contrastive Disentanglement for Generative Adversarial Networks), where we\nrelax similarity constraints for disentanglement from the image domain to the\nfeature domain. This modification not only enhances the stability of GAN\ntraining but also improves their disentangling capabilities. Moreover, we\nintegrate self-supervised pre-training into CoDeGAN to learn semantic\nrepresentations, significantly facilitating unsupervised disentanglement.\nExtensive experimental results demonstrate the superiority of our method over\nstate-of-the-art approaches across multiple benchmarks. The code is available\nat https://github.com/learninginvision/CoDeGAN.\n","authors":["Jiangwei Zhao","Zejia Liu","Xiaohan Guo","Lili Pan"],"pdf_url":"https://arxiv.org/pdf/2103.03636v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.08291v3","updated":"2024-05-31T08:48:46Z","published":"2023-12-13T17:08:38Z","title":"VQ-HPS: Human Pose and Shape Estimation in a Vector-Quantized Latent\n  Space","summary":"  Previous works on Human Pose and Shape Estimation (HPSE) from RGB images can\nbe broadly categorized into two main groups: parametric and non-parametric\napproaches. Parametric techniques leverage a low-dimensional statistical body\nmodel for realistic results, whereas recent non-parametric methods achieve\nhigher precision by directly regressing the 3D coordinates of the human body\nmesh. This work introduces a novel paradigm to address the HPSE problem,\ninvolving a low-dimensional discrete latent representation of the human mesh\nand framing HPSE as a classification task. Instead of predicting body model\nparameters or 3D vertex coordinates, we focus on predicting the proposed\ndiscrete latent representation, which can be decoded into a registered human\nmesh. This innovative paradigm offers two key advantages. Firstly, predicting a\nlow-dimensional discrete representation confines our predictions to the space\nof anthropomorphic poses and shapes even when little training data is\navailable. Secondly, by framing the problem as a classification task, we can\nharness the discriminative power inherent in neural networks. The proposed\nmodel, VQ-HPS, predicts the discrete latent representation of the mesh. The\nexperimental results demonstrate that VQ-HPS outperforms the current\nstate-of-the-art non-parametric approaches while yielding results as realistic\nas those produced by parametric methods when trained with little data. VQ-HPS\nalso shows promising results when training on large-scale datasets,\nhighlighting the significant potential of the classification approach for HPSE.\nSee the project page at https://g-fiche.github.io/research-pages/vqhps/\n","authors":["Guénolé Fiche","Simon Leglaive","Xavier Alameda-Pineda","Antonio Agudo","Francesc Moreno-Noguer"],"pdf_url":"https://arxiv.org/pdf/2312.08291v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.20693v1","updated":"2024-05-31T08:39:02Z","published":"2024-05-31T08:39:02Z","title":"R$^2$-Gaussian: Rectifying Radiative Gaussian Splatting for Tomographic\n  Reconstruction","summary":"  3D Gaussian splatting (3DGS) has shown promising results in image rendering\nand surface reconstruction. However, its potential in volumetric reconstruction\ntasks, such as X-ray computed tomography, remains under-explored. This paper\nintroduces R2-Gaussian, the first 3DGS-based framework for sparse-view\ntomographic reconstruction. By carefully deriving X-ray rasterization\nfunctions, we discover a previously unknown integration bias in the standard\n3DGS formulation, which hampers accurate volume retrieval. To address this\nissue, we propose a novel rectification technique via refactoring the\nprojection from 3D to 2D Gaussians. Our new method presents three key\ninnovations: (1) introducing tailored Gaussian kernels, (2) extending\nrasterization to X-ray imaging, and (3) developing a CUDA-based differentiable\nvoxelizer. Extensive experiments demonstrate that our method outperforms\nstate-of-the-art approaches by 0.93 dB in PSNR and 0.014 in SSIM. Crucially, it\ndelivers high-quality results in 3 minutes, which is 12x faster than NeRF-based\nmethods and on par with traditional algorithms. The superior performance and\nrapid convergence of our method highlight its practical value.\n","authors":["Ruyi Zha","Tao Jun Lin","Yuanhao Cai","Jiwen Cao","Yanhao Zhang","Hongdong Li"],"pdf_url":"https://arxiv.org/pdf/2405.20693v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.20687v1","updated":"2024-05-31T08:31:26Z","published":"2024-05-31T08:31:26Z","title":"Conditioning GAN Without Training Dataset","summary":"  Deep learning algorithms have a large number of trainable parameters often\nwith sizes of hundreds of thousands or more. Training this algorithm requires a\nlarge amount of training data and generating a sufficiently large dataset for\nthese algorithms is costly\\cite{noguchi2019image}.\n  GANs are generative neural networks that use two deep learning networks that\nare competing with each other. The networks are generator and discriminator\nnetworks. The generator tries to generate realistic images which resemble the\nactual training dataset by approximating the training data distribution and the\ndiscriminator is trained to classify images as real or\nfake(generated)\\cite{goodfellow2016nips}. Training these GAN algorithms also\nrequires a large amount of training dataset\\cite{noguchi2019image}.\n  In this study, the aim is to address the question, \"Given an unconditioned\npretrained generator network and a pretrained classifier, is it feasible to\ndevelop a conditioned generator without relying on any training dataset?\"\n  The paper begins with a general introduction to the problem. The subsequent\nsections are structured as follows: Section 2 provides background information\non the problem. Section 3 reviews relevant literature on the topic. Section 4\noutlines the methodology employed in this study. Section 5 presents the\nexperimental results. Section 6 discusses the findings and proposes potential\nfuture research directions. Finally, Section 7 offers concluding remarks.\n  The implementation can be accessed\n\\href{https://github.com/kidist-amde/BigGAN-PyTorch}{here}.\n","authors":["Kidist Amde Mekonnen"],"pdf_url":"https://arxiv.org/pdf/2405.20687v1.pdf","comment":"5 pages, 2 figures, Part of my MSc project course, School Project\n  Course 2022"},{"id":"http://arxiv.org/abs/2405.20685v1","updated":"2024-05-31T08:26:53Z","published":"2024-05-31T08:26:53Z","title":"Enhancing Counterfactual Image Generation Using Mahalanobis Distance\n  with Distribution Preferences in Feature Space","summary":"  In the realm of Artificial Intelligence (AI), the importance of Explainable\nArtificial Intelligence (XAI) is increasingly recognized, particularly as AI\nmodels become more integral to our lives. One notable single-instance XAI\napproach is counterfactual explanation, which aids users in comprehending a\nmodel's decisions and offers guidance on altering these decisions. Specifically\nin the context of image classification models, effective image counterfactual\nexplanations can significantly enhance user understanding. This paper\nintroduces a novel method for computing feature importance within the feature\nspace of a black-box model. By employing information fusion techniques, our\nmethod maximizes the use of data to address feature counterfactual explanations\nin the feature space. Subsequently, we utilize an image generation model to\ntransform these feature counterfactual explanations into image counterfactual\nexplanations. Our experiments demonstrate that the counterfactual explanations\ngenerated by our method closely resemble the original images in both pixel and\nfeature spaces. Additionally, our method outperforms established baselines,\nachieving impressive experimental results.\n","authors":["Yukai Zhang","Ao Xu","Zihao Li","Tieru Wu"],"pdf_url":"https://arxiv.org/pdf/2405.20685v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.16539v3","updated":"2024-05-31T08:24:43Z","published":"2024-03-25T08:31:14Z","title":"Data-Efficient 3D Visual Grounding via Order-Aware Referring","summary":"  3D visual grounding aims to identify the target object within a 3D point\ncloud scene referred to by a natural language description. Previous works\nusually require significant data relating to point color and their descriptions\nto exploit the corresponding complicated verbo-visual relations. In our work,\nwe introduce Vigor, a novel Data-Efficient 3D Visual Grounding framework via\nOrder-aware Referring. Vigor leverages LLM to produce a desirable referential\norder from the input description for 3D visual grounding. With the proposed\nstacked object-referring blocks, the predicted anchor objects in the above\norder allow one to locate the target object progressively without supervision\non the identities of anchor objects or exact relations between anchor/target\nobjects. In addition, we present an order-aware warm-up training strategy,\nwhich augments referential orders for pre-training the visual grounding\nframework. This allows us to better capture the complex verbo-visual relations\nand benefit the desirable data-efficient learning scheme. Experimental results\non the NR3D and ScanRefer datasets demonstrate our superiority in low-resource\nscenarios. In particular, Vigor surpasses current state-of-the-art frameworks\nby 9.3% and 7.6% grounding accuracy under 1% data and 10% data settings on the\nNR3D dataset, respectively.\n","authors":["Tung-Yu Wu","Sheng-Yu Huang","Yu-Chiang Frank Wang"],"pdf_url":"https://arxiv.org/pdf/2403.16539v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.20675v1","updated":"2024-05-31T08:19:44Z","published":"2024-05-31T08:19:44Z","title":"Adv-KD: Adversarial Knowledge Distillation for Faster Diffusion Sampling","summary":"  Diffusion Probabilistic Models (DPMs) have emerged as a powerful class of\ndeep generative models, achieving remarkable performance in image synthesis\ntasks. However, these models face challenges in terms of widespread adoption\ndue to their reliance on sequential denoising steps during sample generation.\nThis dependence leads to substantial computational requirements, making them\nunsuitable for resource-constrained or real-time processing systems. To address\nthese challenges, we propose a novel method that integrates denoising phases\ndirectly into the model's architecture, thereby reducing the need for\nresource-intensive computations. Our approach combines diffusion models with\ngenerative adversarial networks (GANs) through knowledge distillation, enabling\nmore efficient training and evaluation. By utilizing a pre-trained diffusion\nmodel as a teacher model, we train a student model through adversarial\nlearning, employing layerwise transformations for denoising and submodules for\npredicting the teacher model's output at various points in time. This\nintegration significantly reduces the number of parameters and denoising steps\nrequired, leading to improved sampling speed at test time. We validate our\nmethod with extensive experiments, demonstrating comparable performance with\nreduced computational requirements compared to existing approaches. By enabling\nthe deployment of diffusion models on resource-constrained devices, our\nresearch mitigates their computational burden and paves the way for wider\naccessibility and practical use across the research community and end-users.\n  Our code is publicly available at https://github.com/kidist-amde/Adv-KD\n","authors":["Kidist Amde Mekonnen","Nicola Dall'Asen","Paolo Rota"],"pdf_url":"https://arxiv.org/pdf/2405.20675v1.pdf","comment":"7 pages, 11 figures, ELLIS Doctoral Symposium 2023 in Helsinki,\n  Finland"},{"id":"http://arxiv.org/abs/2402.17502v2","updated":"2024-05-31T08:19:08Z","published":"2024-02-27T13:41:32Z","title":"FedLPPA: Learning Personalized Prompt and Aggregation for Federated\n  Weakly-supervised Medical Image Segmentation","summary":"  Federated learning (FL) effectively mitigates the data silo challenge brought\nabout by policies and privacy concerns, implicitly harnessing more data for\ndeep model training. However, traditional centralized FL models grapple with\ndiverse multi-center data, especially in the face of significant data\nheterogeneity, notably in medical contexts. In the realm of medical image\nsegmentation, the growing imperative to curtail annotation costs has amplified\nthe importance of weakly-supervised techniques which utilize sparse annotations\nsuch as points, scribbles, etc. A pragmatic FL paradigm shall accommodate\ndiverse annotation formats across different sites, which research topic remains\nunder-investigated. In such context, we propose a novel personalized FL\nframework with learnable prompt and aggregation (FedLPPA) to uniformly leverage\nheterogeneous weak supervision for medical image segmentation. In FedLPPA, a\nlearnable universal knowledge prompt is maintained, complemented by multiple\nlearnable personalized data distribution prompts and prompts representing the\nsupervision sparsity. Integrated with sample features through a dual-attention\nmechanism, those prompts empower each local task decoder to adeptly adjust to\nboth the local distribution and the supervision form. Concurrently, a\ndual-decoder strategy, predicated on prompt similarity, is introduced for\nenhancing the generation of pseudo-labels in weakly-supervised learning,\nalleviating overfitting and noise accumulation inherent to local data, while an\nadaptable aggregation method is employed to customize the task decoder on a\nparameter-wise basis. Extensive experiments on four distinct medical image\nsegmentation tasks involving different modalities underscore the superiority of\nFedLPPA, with its efficacy closely parallels that of fully supervised\ncentralized training. Our code and data will be available.\n","authors":["Li Lin","Yixiang Liu","Jiewei Wu","Pujin Cheng","Zhiyuan Cai","Kenneth K. Y. Wong","Xiaoying Tang"],"pdf_url":"https://arxiv.org/pdf/2402.17502v2.pdf","comment":"12 pages, 10 figures"},{"id":"http://arxiv.org/abs/2405.20674v1","updated":"2024-05-31T08:18:39Z","published":"2024-05-31T08:18:39Z","title":"4Diffusion: Multi-view Video Diffusion Model for 4D Generation","summary":"  Current 4D generation methods have achieved noteworthy efficacy with the aid\nof advanced diffusion generative models. However, these methods lack multi-view\nspatial-temporal modeling and encounter challenges in integrating diverse prior\nknowledge from multiple diffusion models, resulting in inconsistent temporal\nappearance and flickers. In this paper, we propose a novel 4D generation\npipeline, namely 4Diffusion aimed at generating spatial-temporally consistent\n4D content from a monocular video. We first design a unified diffusion model\ntailored for multi-view video generation by incorporating a learnable motion\nmodule into a frozen 3D-aware diffusion model to capture multi-view\nspatial-temporal correlations. After training on a curated dataset, our\ndiffusion model acquires reasonable temporal consistency and inherently\npreserves the generalizability and spatial consistency of the 3D-aware\ndiffusion model. Subsequently, we propose 4D-aware Score Distillation Sampling\nloss, which is based on our multi-view video diffusion model, to optimize 4D\nrepresentation parameterized by dynamic NeRF. This aims to eliminate\ndiscrepancies arising from multiple diffusion models, allowing for generating\nspatial-temporally consistent 4D content. Moreover, we devise an anchor loss to\nenhance the appearance details and facilitate the learning of dynamic NeRF.\nExtensive qualitative and quantitative experiments demonstrate that our method\nachieves superior performance compared to previous methods.\n","authors":["Haiyu Zhang","Xinyuan Chen","Yaohui Wang","Xihui Liu","Yunhong Wang","Yu Qiao"],"pdf_url":"https://arxiv.org/pdf/2405.20674v1.pdf","comment":"Project Page: https://aejion.github.io/4diffusion/"},{"id":"http://arxiv.org/abs/2405.20672v1","updated":"2024-05-31T08:14:44Z","published":"2024-05-31T08:14:44Z","title":"Investigating and unmasking feature-level vulnerabilities of CNNs to\n  adversarial perturbations","summary":"  This study explores the impact of adversarial perturbations on Convolutional\nNeural Networks (CNNs) with the aim of enhancing the understanding of their\nunderlying mechanisms. Despite numerous defense methods proposed in the\nliterature, there is still an incomplete understanding of this phenomenon.\nInstead of treating the entire model as vulnerable, we propose that specific\nfeature maps learned during training contribute to the overall vulnerability.\nTo investigate how the hidden representations learned by a CNN affect its\nvulnerability, we introduce the Adversarial Intervention framework. Experiments\nwere conducted on models trained on three well-known computer vision datasets,\nsubjecting them to attacks of different nature. Our focus centers on the\neffects that adversarial perturbations to a model's initial layer have on the\noverall behavior of the model. Empirical results revealed compelling insights:\na) perturbing selected channel combinations in shallow layers causes\nsignificant disruptions; b) the channel combinations most responsible for the\ndisruptions are common among different types of attacks; c) despite shared\nvulnerable combinations of channels, different attacks affect hidden\nrepresentations with varying magnitudes; d) there exists a positive correlation\nbetween a kernel's magnitude and its vulnerability. In conclusion, this work\nintroduces a novel framework to study the vulnerability of a CNN model to\nadversarial perturbations, revealing insights that contribute to a deeper\nunderstanding of the phenomenon. The identified properties pave the way for the\ndevelopment of efficient ad-hoc defense mechanisms in future applications.\n","authors":["Davide Coppola","Hwee Kuan Lee"],"pdf_url":"https://arxiv.org/pdf/2405.20672v1.pdf","comment":"22 pages, 15 figures (including appendix)"},{"id":"http://arxiv.org/abs/2405.19732v2","updated":"2024-05-31T08:13:34Z","published":"2024-05-30T06:24:14Z","title":"Two Optimizers Are Better Than One: LLM Catalyst for Enhancing\n  Gradient-Based Optimization","summary":"  Learning a skill generally relies on both practical experience by doer and\ninsightful high-level guidance by instructor. Will this strategy also work well\nfor solving complex non-convex optimization problems? Here, a common\ngradient-based optimizer acts like a disciplined doer, making locally optimal\nupdate at each step. Recent methods utilize large language models (LLMs) to\noptimize solutions for concrete problems by inferring from natural language\ninstructions, akin to a high-level instructor. In this paper, we show that\nthese two optimizers are complementary to each other, suggesting a\ncollaborative optimization approach. The gradient-based optimizer and LLM-based\noptimizer are combined in an interleaved manner. We instruct LLMs using task\ndescriptions and timely optimization trajectories recorded during\ngradient-based optimization. Inferred results from LLMs are used as restarting\npoints for the next stage of gradient optimization. By leveraging both the\nlocally rigorous gradient-based optimizer and the high-level deductive\nLLM-based optimizer, our combined optimization method consistently yields\nimprovements over competitive baseline prompt tuning methods. Our results\ndemonstrate the synergistic effect of conventional gradient-based optimization\nand the inference ability of LLMs. The code is released at\nhttps://github.com/guozix/LLM-catalyst.\n","authors":["Zixian Guo","Ming Liu","Zhilong Ji","Jinfeng Bai","Yiwen Guo","Wangmeng Zuo"],"pdf_url":"https://arxiv.org/pdf/2405.19732v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.20669v1","updated":"2024-05-31T08:11:25Z","published":"2024-05-31T08:11:25Z","title":"Fourier123: One Image to High-Quality 3D Object Generation with Hybrid\n  Fourier Score Distillation","summary":"  Single image-to-3D generation is pivotal for crafting controllable 3D assets.\nGiven its underconstrained nature, we leverage geometric priors from a 3D novel\nview generation diffusion model and appearance priors from a 2D image\ngeneration method to guide the optimization process. We note that a disparity\nexists between the training datasets of 2D and 3D diffusion models, leading to\ntheir outputs showing marked differences in appearance. Specifically, 2D models\ntend to deliver more detailed visuals, whereas 3D models produce consistent yet\nover-smooth results across different views. Hence, we optimize a set of 3D\nGaussians using 3D priors in spatial domain to ensure geometric consistency,\nwhile exploiting 2D priors in the frequency domain through Fourier transform\nfor higher visual quality. This 2D-3D hybrid Fourier Score Distillation\nobjective function (dubbed hy-FSD), can be integrated into existing 3D\ngeneration methods, yielding significant performance improvements. With this\ntechnique, we further develop an image-to-3D generation pipeline to create\nhigh-quality 3D objects within one minute, named Fourier123. Extensive\nexperiments demonstrate that Fourier123 excels in efficient generation with\nrapid convergence speed and visual-friendly generation results.\n","authors":["Shuzhou Yang","Yu Wang","Haijie Li","Jiarui Meng","Xiandong Meng","Jian Zhang"],"pdf_url":"https://arxiv.org/pdf/2405.20669v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.20067v2","updated":"2024-05-31T08:11:24Z","published":"2024-05-30T13:56:58Z","title":"N-Dimensional Gaussians for Fitting of High Dimensional Functions","summary":"  In the wake of many new ML-inspired approaches for reconstructing and\nrepresenting high-quality 3D content, recent hybrid and explicitly learned\nrepresentations exhibit promising performance and quality characteristics.\nHowever, their scaling to higher dimensions is challenging, e.g. when\naccounting for dynamic content with respect to additional parameters such as\nmaterial properties, illumination, or time. In this paper, we tackle these\nchallenges for an explicit representations based on Gaussian mixture models.\nWith our solutions, we arrive at efficient fitting of compact N-dimensional\nGaussian mixtures and enable efficient evaluation at render time: For fast\nfitting and evaluation, we introduce a high-dimensional culling scheme that\nefficiently bounds N-D Gaussians, inspired by Locality Sensitive Hashing. For\nadaptive refinement yet compact representation, we introduce a loss-adaptive\ndensity control scheme that incrementally guides the use of additional capacity\ntowards missing details. With these tools we can for the first time represent\ncomplex appearance that depends on many input dimensions beyond position or\nviewing angle within a compact, explicit representation optimized in minutes\nand rendered in milliseconds.\n","authors":["Stavros Diolatzis","Tobias Zirr","Alexandr Kuznetsov","Georgios Kopanas","Anton Kaplanyan"],"pdf_url":"https://arxiv.org/pdf/2405.20067v2.pdf","comment":"https://www.sdiolatz.info/ndg-fitting/"},{"id":"http://arxiv.org/abs/2405.15465v2","updated":"2024-05-31T08:08:23Z","published":"2024-05-24T11:40:22Z","title":"Scale-Invariant Feature Disentanglement via Adversarial Learning for\n  UAV-based Object Detection","summary":"  Detecting objects from Unmanned Aerial Vehicles (UAV) is often hindered by a\nlarge number of small objects, resulting in low detection accuracy. To address\nthis issue, mainstream approaches typically utilize multi-stage inferences.\nDespite their remarkable detecting accuracies, real-time efficiency is\nsacrificed, making them less practical to handle real applications. To this\nend, we propose to improve the single-stage inference accuracy through learning\nscale-invariant features. Specifically, a Scale-Invariant Feature Disentangling\nmodule is designed to disentangle scale-related and scale-invariant features.\nThen an Adversarial Feature Learning scheme is employed to enhance\ndisentanglement. Finally, scale-invariant features are leveraged for robust\nUAV-based object detection. Furthermore, we construct a multi-modal UAV object\ndetection dataset, State-Air, which incorporates annotated UAV state\nparameters. We apply our approach to three state-of-the-art lightweight\ndetection frameworks on three benchmark datasets, including State-Air.\nExtensive experiments demonstrate that our approach can effectively improve\nmodel accuracy. Our code and dataset are provided in Supplementary Materials\nand will be publicly available once the paper is accepted.\n","authors":["Fan Liu","Liang Yao","Chuanyi Zhang","Ting Wu","Xinlei Zhang","Xiruo Jiang","Jun Zhou"],"pdf_url":"https://arxiv.org/pdf/2405.15465v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.20666v1","updated":"2024-05-31T08:06:05Z","published":"2024-05-31T08:06:05Z","title":"MASA: Motion-aware Masked Autoencoder with Semantic Alignment for Sign\n  Language Recognition","summary":"  Sign language recognition (SLR) has long been plagued by insufficient model\nrepresentation capabilities. Although current pre-training approaches have\nalleviated this dilemma to some extent and yielded promising performance by\nemploying various pretext tasks on sign pose data, these methods still suffer\nfrom two primary limitations: 1) Explicit motion information is usually\ndisregarded in previous pretext tasks, leading to partial information loss and\nlimited representation capability. 2) Previous methods focus on the local\ncontext of a sign pose sequence, without incorporating the guidance of the\nglobal meaning of lexical signs. To this end, we propose a Motion-Aware masked\nautoencoder with Semantic Alignment (MASA) that integrates rich motion cues and\nglobal semantic information in a self-supervised learning paradigm for SLR. Our\nframework contains two crucial components, i.e., a motion-aware masked\nautoencoder (MA) and a momentum semantic alignment module (SA). Specifically,\nin MA, we introduce an autoencoder architecture with a motion-aware masked\nstrategy to reconstruct motion residuals of masked frames, thereby explicitly\nexploring dynamic motion cues among sign pose sequences. Moreover, in SA, we\nembed our framework with global semantic awareness by aligning the embeddings\nof different augmented samples from the input sequence in the shared latent\nspace. In this way, our framework can simultaneously learn local motion cues\nand global semantic features for comprehensive sign language representation.\nFurthermore, we conduct extensive experiments to validate the effectiveness of\nour method, achieving new state-of-the-art performance on four public\nbenchmarks.\n","authors":["Weichao Zhao","Hezhen Hu","Wengang Zhou","Yunyao Mao","Min Wang","Houqiang Li"],"pdf_url":"https://arxiv.org/pdf/2405.20666v1.pdf","comment":"Accepted by TCSVT 2024"},{"id":"http://arxiv.org/abs/2405.19092v3","updated":"2024-05-31T07:56:37Z","published":"2024-05-29T13:54:12Z","title":"Benchmarking and Improving Detail Image Caption","summary":"  Image captioning has long been regarded as a fundamental task in visual\nunderstanding. Recently, however, few large vision-language model (LVLM)\nresearch discusses model's image captioning performance because of the outdated\nshort-caption benchmarks and unreliable evaluation metrics. In this work, we\npropose to benchmark detail image caption task by curating high-quality\nevaluation datasets annotated by human experts, GPT-4V and Gemini-1.5-Pro. We\nalso design a more reliable caption evaluation metric called CAPTURE (CAPtion\nevaluation by exTracting and coUpling coRE information). CAPTURE extracts\nvisual elements, e.g., objects, attributes and relations from captions, and\nthen matches these elements through three stages, achieving the highest\nconsistency with expert judgements over other rule-based or model-based caption\nmetrics. The proposed benchmark and metric provide reliable evaluation for\nLVLM's detailed image captioning ability. Guided by this evaluation, we further\nexplore to unleash LVLM's detail caption capabilities by synthesizing\nhigh-quality data through a five-stage data construction pipeline. Our pipeline\nonly uses a given LVLM itself and other open-source tools, without any human or\nGPT-4V annotation in the loop. Experiments show that the proposed data\nconstruction strategy significantly improves model-generated detail caption\ndata quality for LVLMs with leading performance, and the data quality can be\nfurther improved in a self-looping paradigm. All code and dataset will be\npublicly available at https://github.com/foundation-multimodal-models/CAPTURE.\n","authors":["Hongyuan Dong","Jiawen Li","Bohong Wu","Jiacong Wang","Yuan Zhang","Haoyuan Guo"],"pdf_url":"https://arxiv.org/pdf/2405.19092v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.19620v2","updated":"2024-05-31T07:40:55Z","published":"2024-05-30T02:13:56Z","title":"SparseDrive: End-to-End Autonomous Driving via Sparse Scene\n  Representation","summary":"  The well-established modular autonomous driving system is decoupled into\ndifferent standalone tasks, e.g. perception, prediction and planning, suffering\nfrom information loss and error accumulation across modules. In contrast,\nend-to-end paradigms unify multi-tasks into a fully differentiable framework,\nallowing for optimization in a planning-oriented spirit. Despite the great\npotential of end-to-end paradigms, both the performance and efficiency of\nexisting methods are not satisfactory, particularly in terms of planning\nsafety. We attribute this to the computationally expensive BEV (bird's eye\nview) features and the straightforward design for prediction and planning. To\nthis end, we explore the sparse representation and review the task design for\nend-to-end autonomous driving, proposing a new paradigm named SparseDrive.\nConcretely, SparseDrive consists of a symmetric sparse perception module and a\nparallel motion planner. The sparse perception module unifies detection,\ntracking and online mapping with a symmetric model architecture, learning a\nfully sparse representation of the driving scene. For motion prediction and\nplanning, we review the great similarity between these two tasks, leading to a\nparallel design for motion planner. Based on this parallel design, which models\nplanning as a multi-modal problem, we propose a hierarchical planning selection\nstrategy , which incorporates a collision-aware rescore module, to select a\nrational and safe trajectory as the final planning output. With such effective\ndesigns, SparseDrive surpasses previous state-of-the-arts by a large margin in\nperformance of all tasks, while achieving much higher training and inference\nefficiency. Code will be avaliable at https://github.com/swc-17/SparseDrive for\nfacilitating future research.\n","authors":["Wenchao Sun","Xuewu Lin","Yining Shi","Chuang Zhang","Haoran Wu","Sifa Zheng"],"pdf_url":"https://arxiv.org/pdf/2405.19620v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.20081v2","updated":"2024-05-31T07:40:04Z","published":"2024-05-30T14:11:27Z","title":"NoiseBoost: Alleviating Hallucination with Noise Perturbation for\n  Multimodal Large Language Models","summary":"  Multimodal large language models (MLLMs) contribute a powerful mechanism to\nunderstanding visual information building on large language models. However,\nMLLMs are notorious for suffering from hallucinations, especially when\ngenerating lengthy, detailed descriptions for images. Our analysis reveals that\nhallucinations stem from the inherent summarization mechanism of large language\nmodels, leading to excessive dependence on linguistic tokens while neglecting\nvision information. In this paper, we propose NoiseBoost, a broadly applicable\nand simple method for alleviating hallucinations for MLLMs through the\nintegration of noise feature perturbations. Noise perturbation acts as a\nregularizer, facilitating a balanced distribution of attention weights among\nvisual and linguistic tokens. Despite its simplicity, NoiseBoost consistently\nenhances the performance of MLLMs across common training strategies, including\nsupervised fine-tuning and reinforcement learning. Further, NoiseBoost\npioneerly enables semi-supervised learning for MLLMs, unleashing the power of\nunlabeled data. Comprehensive experiments demonstrate that NoiseBoost improves\ndense caption accuracy by 8.1% with human evaluation and achieves comparable\nresults with 50% of the data by mining unlabeled data. Code and models are\navailable at https://kaiwu5.github.io/noiseboost.\n","authors":["Kai Wu","Boyuan Jiang","Zhengkai Jiang","Qingdong He","Donghao Luo","Shengzhi Wang","Qingwen Liu","Chengjie Wang"],"pdf_url":"https://arxiv.org/pdf/2405.20081v2.pdf","comment":"14 pages, 5 figures with supplementary material"},{"id":"http://arxiv.org/abs/2405.20650v1","updated":"2024-05-31T07:32:31Z","published":"2024-05-31T07:32:31Z","title":"GenMix: Combining Generative and Mixture Data Augmentation for Medical\n  Image Classification","summary":"  In this paper, we propose a novel data augmentation technique called GenMix,\nwhich combines generative and mixture approaches to leverage the strengths of\nboth methods. While generative models excel at creating new data patterns, they\nface challenges such as mode collapse in GANs and difficulties in training\ndiffusion models, especially with limited medical imaging data. On the other\nhand, mixture models enhance class boundary regions but tend to favor the major\nclass in scenarios with class imbalance. To address these limitations, GenMix\nintegrates both approaches to complement each other. GenMix operates in two\nstages: (1) training a generative model to produce synthetic images, and (2)\nperforming mixup between synthetic and real data. This process improves the\nquality and diversity of synthetic data while simultaneously benefiting from\nthe new pattern learning of generative models and the boundary enhancement of\nmixture models. We validate the effectiveness of our method on the task of\nclassifying focal liver lesions (FLLs) in CT images. Our results demonstrate\nthat GenMix enhances the performance of various generative models, including\nDCGAN, StyleGAN, Textual Inversion, and Diffusion Models. Notably, the proposed\nmethod with Textual Inversion outperforms other methods without fine-tuning\ndiffusion model on the FLL dataset.\n","authors":["Hansang Lee","Haeil Lee","Helen Hong"],"pdf_url":"https://arxiv.org/pdf/2405.20650v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.20648v1","updated":"2024-05-31T07:30:24Z","published":"2024-05-31T07:30:24Z","title":"Shotluck Holmes: A Family of Efficient Small-Scale Large Language Vision\n  Models For Video Captioning and Summarization","summary":"  Video is an increasingly prominent and information-dense medium, yet it poses\nsubstantial challenges for language models. A typical video consists of a\nsequence of shorter segments, or shots, that collectively form a coherent\nnarrative. Each shot is analogous to a word in a sentence where multiple data\nstreams of information (such as visual and auditory data) must be processed\nsimultaneously. Comprehension of the entire video requires not only\nunderstanding the visual-audio information of each shot but also requires that\nthe model links the ideas between each shot to generate a larger,\nall-encompassing story. Despite significant progress in the field, current\nworks often overlook videos' more granular shot-by-shot semantic information.\nIn this project, we propose a family of efficient large language vision models\n(LLVMs) to boost video summarization and captioning called Shotluck Holmes. By\nleveraging better pretraining and data collection strategies, we extend the\nabilities of existing small LLVMs from being able to understand a picture to\nbeing able to understand a sequence of frames. Specifically, we show that\nShotluck Holmes achieves better performance than state-of-the-art results on\nthe Shot2Story video captioning and summary task with significantly smaller and\nmore computationally efficient models.\n","authors":["Richard Luo","Austin Peng","Adithya Vasudev","Rishabh Jain"],"pdf_url":"https://arxiv.org/pdf/2405.20648v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.08970v3","updated":"2024-05-31T07:29:20Z","published":"2023-06-15T09:05:36Z","title":"An Efficient and Multi-private Key Secure Aggregation for Federated\n  Learning","summary":"  With the emergence of privacy leaks in federated learning, secure aggregation\nprotocols that mainly adopt either homomorphic encryption or threshold secret\nsharing have been widely developed for federated learning to protect the\nprivacy of the local training data of each client. However, these existing\nprotocols suffer from many shortcomings, such as the dependence on a trusted\nthird party, the vulnerability to clients being corrupted, low efficiency, the\ntrade-off between security and fault tolerance, etc. To solve these\ndisadvantages, we propose an efficient and multi-private key secure aggregation\nscheme for federated learning. Specifically, we skillfully modify the variant\nElGamal encryption technique to achieve homomorphic addition operation, which\nhas two important advantages: 1) The server and each client can freely select\npublic and private keys without introducing a trust third party and 2) Compared\nto the variant ElGamal encryption, the plaintext space is relatively large,\nwhich is more suitable for the deep model. Besides, for the high dimensional\ndeep model parameter, we introduce a super-increasing sequence to compress\nmulti-dimensional data into 1-D, which can greatly reduce encryption and\ndecryption times as well as communication for ciphertext transmission. Detailed\nsecurity analyses show that our proposed scheme achieves the semantic security\nof both individual local gradients and the aggregated result while achieving\noptimal robustness in tolerating both client collusion and dropped clients.\nExtensive simulations demonstrate that the accuracy of our scheme is almost the\nsame as the non-private approach, while the efficiency of our scheme is much\nbetter than the state-of-the-art homomorphic encryption-based secure\naggregation schemes. More importantly, the efficiency advantages of our scheme\nwill become increasingly prominent as the number of model parameters increases.\n","authors":["Xue Yang","Zifeng Liu","Xiaohu Tang","Rongxing Lu","Bo Liu"],"pdf_url":"https://arxiv.org/pdf/2306.08970v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.11190v2","updated":"2024-05-31T07:24:55Z","published":"2024-05-18T06:03:42Z","title":"ReasonPix2Pix: Instruction Reasoning Dataset for Advanced Image Editing","summary":"  Instruction-based image editing focuses on equipping a generative model with\nthe capacity to adhere to human-written instructions for editing images.\nCurrent approaches typically comprehend explicit and specific instructions.\nHowever, they often exhibit a deficiency in executing active reasoning\ncapacities required to comprehend instructions that are implicit or\ninsufficiently defined. To enhance active reasoning capabilities and impart\nintelligence to the editing model, we introduce ReasonPix2Pix, a comprehensive\nreasoning-attentive instruction editing dataset. The dataset is characterized\nby 1) reasoning instruction, 2) more realistic images from fine-grained\ncategories, and 3) increased variances between input and edited images. When\nfine-tuned with our dataset under supervised conditions, the model demonstrates\nsuperior performance in instructional editing tasks, independent of whether the\ntasks require reasoning or not. The code will be available at\nhttps://github.com/Jin-Ying/ReasonPix2Pix.\n","authors":["Ying Jin","Pengyang Ling","Xiaoyi Dong","Pan Zhang","Jiaqi Wang","Dahua Lin"],"pdf_url":"https://arxiv.org/pdf/2405.11190v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.20643v1","updated":"2024-05-31T07:07:54Z","published":"2024-05-31T07:07:54Z","title":"Learning Gaze-aware Compositional GAN","summary":"  Gaze-annotated facial data is crucial for training deep neural networks\n(DNNs) for gaze estimation. However, obtaining these data is labor-intensive\nand requires specialized equipment due to the challenge of accurately\nannotating the gaze direction of a subject. In this work, we present a\ngenerative framework to create annotated gaze data by leveraging the benefits\nof labeled and unlabeled data sources. We propose a Gaze-aware Compositional\nGAN that learns to generate annotated facial images from a limited labeled\ndataset. Then we transfer this model to an unlabeled data domain to take\nadvantage of the diversity it provides. Experiments demonstrate our approach's\neffectiveness in generating within-domain image augmentations in the ETH-XGaze\ndataset and cross-domain augmentations in the CelebAMask-HQ dataset domain for\ngaze estimation DNN training. We also show additional applications of our work,\nwhich include facial image editing and gaze redirection.\n","authors":["Nerea Aranjuelo","Siyu Huang","Ignacio Arganda-Carreras","Luis Unzueta","Oihana Otaegui","Hanspeter Pfister","Donglai Wei"],"pdf_url":"https://arxiv.org/pdf/2405.20643v1.pdf","comment":"Accepted by ETRA 2024 as Full paper, and as journal paper in\n  Proceedings of the ACM on Computer Graphics and Interactive Techniques"},{"id":"http://arxiv.org/abs/2405.20299v2","updated":"2024-05-31T06:56:51Z","published":"2024-05-30T17:46:23Z","title":"Scaling White-Box Transformers for Vision","summary":"  CRATE, a white-box transformer architecture designed to learn compressed and\nsparse representations, offers an intriguing alternative to standard vision\ntransformers (ViTs) due to its inherent mathematical interpretability. Despite\nextensive investigations into the scaling behaviors of language and vision\ntransformers, the scalability of CRATE remains an open question which this\npaper aims to address. Specifically, we propose CRATE-$\\alpha$, featuring\nstrategic yet minimal modifications to the sparse coding block in the CRATE\narchitecture design, and a light training recipe designed to improve the\nscalability of CRATE. Through extensive experiments, we demonstrate that\nCRATE-$\\alpha$ can effectively scale with larger model sizes and datasets. For\nexample, our CRATE-$\\alpha$-B substantially outperforms the prior best CRATE-B\nmodel accuracy on ImageNet classification by 3.7%, achieving an accuracy of\n83.2%. Meanwhile, when scaling further, our CRATE-$\\alpha$-L obtains an\nImageNet classification accuracy of 85.1%. More notably, these model\nperformance improvements are achieved while preserving, and potentially even\nenhancing the interpretability of learned CRATE models, as we demonstrate\nthrough showing that the learned token representations of increasingly larger\ntrained CRATE-$\\alpha$ models yield increasingly higher-quality unsupervised\nobject segmentation of images. The project page is\nhttps://rayjryang.github.io/CRATE-alpha/.\n","authors":["Jinrui Yang","Xianhang Li","Druv Pai","Yuyin Zhou","Yi Ma","Yaodong Yu","Cihang Xie"],"pdf_url":"https://arxiv.org/pdf/2405.20299v2.pdf","comment":"project page: https://rayjryang.github.io/CRATE-alpha/"},{"id":"http://arxiv.org/abs/2404.18426v2","updated":"2024-05-31T06:54:44Z","published":"2024-04-29T04:56:52Z","title":"Efficient Meta-Learning Enabled Lightweight Multiscale Few-Shot Object\n  Detection in Remote Sensing Images","summary":"  Presently, the task of few-shot object detection (FSOD) in remote sensing\nimages (RSIs) has become a focal point of attention. Numerous few-shot\ndetectors, particularly those based on two-stage detectors, face challenges\nwhen dealing with the multiscale complexities inherent in RSIs. Moreover, these\ndetectors present impractical characteristics in real-world applications,\nmainly due to their unwieldy model parameters when handling large amount of\ndata. In contrast, we recognize the advantages of one-stage detectors,\nincluding high detection speed and a global receptive field. Consequently, we\nchoose the YOLOv7 one-stage detector as a baseline and subject it to a novel\nmeta-learning training framework. This transformation allows the detector to\nadeptly address FSOD tasks while capitalizing on its inherent advantage of\nlightweight. Additionally, we thoroughly investigate the samples generated by\nthe meta-learning strategy and introduce a novel meta-sampling approach to\nretain samples produced by our designed meta-detection head. Coupled with our\ndevised meta-cross loss, we deliberately utilize \"negative samples\" that are\noften overlooked to extract valuable knowledge from them. This approach serves\nto enhance detection accuracy and efficiently refine the overall meta-learning\nstrategy. To validate the effectiveness of our proposed detector, we conducted\nperformance comparisons with current state-of-the-art detectors using the DIOR\nand NWPU VHR-10.v2 datasets, yielding satisfactory results.\n","authors":["Wenbin Guan","Zijiu Yang","Xiaohong Wu","Liqiong Chen","Feng Huang","Xiaohai He","Honggang Chen"],"pdf_url":"https://arxiv.org/pdf/2404.18426v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.20633v1","updated":"2024-05-31T05:49:37Z","published":"2024-05-31T05:49:37Z","title":"Action-OOD: An End-to-End Skeleton-Based Model for Robust\n  Out-of-Distribution Human Action Detection","summary":"  Human action recognition is a crucial task in computer vision systems.\nHowever, in real-world scenarios, human actions often fall outside the\ndistribution of training data, requiring a model to both recognize\nin-distribution (ID) actions and reject out-of-distribution (OOD) ones. Despite\nits importance, there has been limited research on OOD detection in human\nactions. Existing works on OOD detection mainly focus on image data with RGB\nstructure, and many methods are post-hoc in nature. While these methods are\nconvenient and computationally efficient, they often lack sufficient accuracy\nand fail to consider the presence of OOD samples. To address these challenges,\nwe propose a novel end-to-end skeleton-based model called Action-OOD,\nspecifically designed for OOD human action detection. Unlike some existing\napproaches that may require prior knowledge of existing OOD data distribution,\nour model solely utilizes in-distribution (ID) data during the training stage,\neffectively mitigating the overconfidence issue prevalent in OOD detection. We\nintroduce an attention-based feature fusion block, which enhances the model's\ncapability to recognize unknown classes while preserving classification\naccuracy for known classes. Further, we present a novel energy-based loss\nfunction and successfully integrate it with the traditional cross-entropy loss\nto maximize the separation of data distributions between ID and OOD. Through\nextensive experiments conducted on NTU-RGB+D 60, NTU-RGB+D 120, and\nKinetics-400 datasets, we demonstrate the superior performance of our proposed\napproach compared to state-of-the-art methods. Our findings underscore the\neffectiveness of classic OOD detection techniques in the context of\nskeleton-based action recognition tasks, offering promising avenues for future\nresearch in this field. Code will be available at:\nhttps://github.com/YilliaJing/Action-OOD.git.\n","authors":["Jing Xu","Anqi Zhu","Jingyu Lin","Qiuhong Ke","Cunjian Chen"],"pdf_url":"https://arxiv.org/pdf/2405.20633v1.pdf","comment":"Under consideration at Computer Vision and Image Understanding"},{"id":"http://arxiv.org/abs/2405.20628v1","updated":"2024-05-31T05:40:56Z","published":"2024-05-31T05:40:56Z","title":"ToxVidLLM: A Multimodal LLM-based Framework for Toxicity Detection in\n  Code-Mixed Videos","summary":"  In an era of rapidly evolving internet technology, the surge in multimodal\ncontent, including videos, has expanded the horizons of online communication.\nHowever, the detection of toxic content in this diverse landscape, particularly\nin low-resource code-mixed languages, remains a critical challenge. While\nsubstantial research has addressed toxic content detection in textual data, the\nrealm of video content, especially in non-English languages, has been\nrelatively underexplored. This paper addresses this research gap by introducing\na benchmark dataset, the first of its kind, consisting of 931 videos with 4021\ncode-mixed Hindi-English utterances collected from YouTube. Each utterance\nwithin this dataset has been meticulously annotated for toxicity, severity, and\nsentiment labels. We have developed an advanced Multimodal Multitask framework\nbuilt for Toxicity detection in Video Content by leveraging Large Language\nModels (LLMs), crafted for the primary objective along with the additional\ntasks of conducting sentiment and severity analysis. ToxVidLLM incorporates\nthree key modules the Encoder module, Cross-Modal Synchronization module, and\nMultitask module crafting a generic multimodal LLM customized for intricate\nvideo classification tasks. Our experiments reveal that incorporating multiple\nmodalities from the videos substantially enhances the performance of toxic\ncontent detection by achieving an Accuracy and Weighted F1 score of 94.29% and\n94.35%, respectively.\n","authors":["Krishanu Maity","A. S. Poornash","Sriparna Saha","Pushpak Bhattacharyya"],"pdf_url":"https://arxiv.org/pdf/2405.20628v1.pdf","comment":"ACL Findings 2024"},{"id":"http://arxiv.org/abs/2405.19917v2","updated":"2024-05-31T05:29:13Z","published":"2024-05-30T10:30:07Z","title":"Multimodal Cross-Domain Few-Shot Learning for Egocentric Action\n  Recognition","summary":"  We address a novel cross-domain few-shot learning task (CD-FSL) with\nmultimodal input and unlabeled target data for egocentric action recognition.\nThis paper simultaneously tackles two critical challenges associated with\negocentric action recognition in CD-FSL settings: (1) the extreme domain gap in\negocentric videos (\\eg, daily life vs. industrial domain) and (2) the\ncomputational cost for real-world applications. We propose MM-CDFSL, a\ndomain-adaptive and computationally efficient approach designed to enhance\nadaptability to the target domain and improve inference speed. To address the\nfirst challenge, we propose the incorporation of multimodal distillation into\nthe student RGB model using teacher models. Each teacher model is trained\nindependently on source and target data for its respective modality. Leveraging\nonly unlabeled target data during multimodal distillation enhances the student\nmodel's adaptability to the target domain. We further introduce ensemble masked\ninference, a technique that reduces the number of input tokens through masking.\nIn this approach, ensemble prediction mitigates the performance degradation\ncaused by masking, effectively addressing the second issue. Our approach\noutperformed the state-of-the-art CD-FSL approaches with a substantial margin\non multiple egocentric datasets, improving by an average of 6.12/6.10 points\nfor 1-shot/5-shot settings while achieving $2.2$ times faster inference speed.\nProject page: https://masashi-hatano.github.io/MM-CDFSL/\n","authors":["Masashi Hatano","Ryo Hachiuma","Ryo Fujii","Hideo Saito"],"pdf_url":"https://arxiv.org/pdf/2405.19917v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.13540v2","updated":"2024-05-31T05:15:40Z","published":"2024-05-22T11:20:32Z","title":"Directly Denoising Diffusion Models","summary":"  In this paper, we present the Directly Denoising Diffusion Model (DDDM): a\nsimple and generic approach for generating realistic images with few-step\nsampling, while multistep sampling is still preserved for better performance.\nDDDMs require no delicately designed samplers nor distillation on pre-trained\ndistillation models. DDDMs train the diffusion model conditioned on an\nestimated target that was generated from previous training iterations of its\nown. To generate images, samples generated from the previous time step are also\ntaken into consideration, guiding the generation process iteratively. We\nfurther propose Pseudo-LPIPS, a novel metric loss that is more robust to\nvarious values of hyperparameter. Despite its simplicity, the proposed approach\ncan achieve strong performance in benchmark datasets. Our model achieves FID\nscores of 2.57 and 2.33 on CIFAR-10 in one-step and two-step sampling\nrespectively, surpassing those obtained from GANs and distillation-based\nmodels. By extending the sampling to 1000 steps, we further reduce FID score to\n1.79, aligning with state-of-the-art methods in the literature. For ImageNet\n64x64, our approach stands as a competitive contender against leading models.\n","authors":["Dan Zhang","Jingjing Wang","Feng Luo"],"pdf_url":"https://arxiv.org/pdf/2405.13540v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.20319v2","updated":"2024-05-31T04:09:41Z","published":"2024-05-30T17:55:46Z","title":"ParSEL: Parameterized Shape Editing with Language","summary":"  The ability to edit 3D assets from natural language presents a compelling\nparadigm to aid in the democratization of 3D content creation. However, while\nnatural language is often effective at communicating general intent, it is\npoorly suited for specifying precise manipulation. To address this gap, we\nintroduce ParSEL, a system that enables controllable editing of high-quality 3D\nassets from natural language. Given a segmented 3D mesh and an editing request,\nParSEL produces a parameterized editing program. Adjusting the program\nparameters allows users to explore shape variations with a precise control over\nthe magnitudes of edits. To infer editing programs which align with an input\nedit request, we leverage the abilities of large-language models (LLMs).\nHowever, while we find that LLMs excel at identifying initial edit operations,\nthey often fail to infer complete editing programs, and produce outputs that\nviolate shape semantics. To overcome this issue, we introduce Analytical Edit\nPropagation (AEP), an algorithm which extends a seed edit with additional\noperations until a complete editing program has been formed. Unlike prior\nmethods, AEP searches for analytical editing operations compatible with a range\nof possible user edits through the integration of computer algebra systems for\ngeometric analysis. Experimentally we demonstrate ParSEL's effectiveness in\nenabling controllable editing of 3D objects through natural language requests\nover alternative system designs.\n","authors":["Aditya Ganeshan","Ryan Y. Huang","Xianghao Xu","R. Kenny Jones","Daniel Ritchie"],"pdf_url":"https://arxiv.org/pdf/2405.20319v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.20614v1","updated":"2024-05-31T04:06:11Z","published":"2024-05-31T04:06:11Z","title":"EPIDetect: Video-based convulsive seizure detection in chronic epilepsy\n  mouse model for anti-epilepsy drug screening","summary":"  In the preclinical translational studies, drug candidates with remarkable\nanti-epileptic efficacy demonstrate long-term suppression of spontaneous\nrecurrent seizures (SRSs), particularly convulsive seizures (CSs), in mouse\nmodels of chronic epilepsy. However, the current methods for monitoring CSs\nhave limitations in terms of invasiveness, specific laboratory settings, high\ncost, and complex operation, which hinder drug screening efforts. In this\nstudy, a camera-based system for automated detection of CSs in chronically\nepileptic mice is first established to screen potential anti-epilepsy drugs.\n","authors":["Junming Ren","Zhoujian Xiao","Yujia Zhang","Yujie Yang","Ling He","Ezra Yoon","Stephen Temitayo Bello","Xi Chen","Dapeng Wu","Micky Tortorella","Jufang He"],"pdf_url":"https://arxiv.org/pdf/2405.20614v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.20610v1","updated":"2024-05-31T03:54:59Z","published":"2024-05-31T03:54:59Z","title":"Revisiting and Maximizing Temporal Knowledge in Semi-supervised Semantic\n  Segmentation","summary":"  In semi-supervised semantic segmentation, the Mean Teacher- and\nco-training-based approaches are employed to mitigate confirmation bias and\ncoupling problems. However, despite their high performance, these approaches\nfrequently involve complex training pipelines and a substantial computational\nburden, limiting the scalability and compatibility of these methods. In this\npaper, we propose a PrevMatch framework that effectively mitigates the\naforementioned limitations by maximizing the utilization of the temporal\nknowledge obtained during the training process. The PrevMatch framework relies\non two core strategies: (1) we reconsider the use of temporal knowledge and\nthus directly utilize previous models obtained during training to generate\nadditional pseudo-label guidance, referred to as previous guidance. (2) we\ndesign a highly randomized ensemble strategy to maximize the effectiveness of\nthe previous guidance. Experimental results on four benchmark semantic\nsegmentation datasets confirm that the proposed method consistently outperforms\nexisting methods across various evaluation protocols. In particular, with\nDeepLabV3+ and ResNet-101 network settings, PrevMatch outperforms the existing\nstate-of-the-art method, Diverse Co-training, by +1.6 mIoU on Pascal VOC with\nonly 92 annotated images, while achieving 2.4 times faster training.\nFurthermore, the results indicate that PrevMatch induces stable optimization,\nparticularly in benefiting classes that exhibit poor performance. Code is\navailable at https://github.com/wooseok-shin/PrevMatch\n","authors":["Wooseok Shin","Hyun Joon Park","Jin Sob Kim","Sung Won Han"],"pdf_url":"https://arxiv.org/pdf/2405.20610v1.pdf","comment":"14 pages, 5 figures, submitted to IEEE TPAMI. This work has been\n  submitted to the IEEE for possible publication. Copyright may be transferred\n  without notice, after which this version may no longer be accessible"},{"id":"http://arxiv.org/abs/2405.20607v1","updated":"2024-05-31T03:47:44Z","published":"2024-05-31T03:47:44Z","title":"Textual Inversion and Self-supervised Refinement for Radiology Report\n  Generation","summary":"  Existing mainstream approaches follow the encoder-decoder paradigm for\ngenerating radiology reports. They focus on improving the network structure of\nencoders and decoders, which leads to two shortcomings: overlooking the\nmodality gap and ignoring report content constraints. In this paper, we\nproposed Textual Inversion and Self-supervised Refinement (TISR) to address the\nabove two issues. Specifically, textual inversion can project text and image\ninto the same space by representing images as pseudo words to eliminate the\ncross-modeling gap. Subsequently, self-supervised refinement refines these\npseudo words through contrastive loss computation between images and texts,\nenhancing the fidelity of generated reports to images. Notably, TISR is\northogonal to most existing methods, plug-and-play. We conduct experiments on\ntwo widely-used public datasets and achieve significant improvements on various\nbaselines, which demonstrates the effectiveness and generalization of TISR. The\ncode will be available soon.\n","authors":["Yuanjiang Luo","Hongxiang Li","Xuan Wu","Meng Cao","Xiaoshuang Huang","Zhihong Zhu","Peixi Liao","Hu Chen","Yi Zhang"],"pdf_url":"https://arxiv.org/pdf/2405.20607v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.20606v1","updated":"2024-05-31T03:40:15Z","published":"2024-05-31T03:40:15Z","title":"Vision-Language Meets the Skeleton: Progressively Distillation with\n  Cross-Modal Knowledge for 3D Action Representation Learning","summary":"  Supervised and self-supervised learning are two main training paradigms for\nskeleton-based human action recognition. However, the former one-hot\nclassification requires labor-intensive predefined action categories\nannotations, while the latter involves skeleton transformations (e.g.,\ncropping) in the pretext tasks that may impair the skeleton structure. To\naddress these challenges, we introduce a novel skeleton-based training\nframework (C$^2$VL) based on Cross-modal Contrastive learning that uses the\nprogressive distillation to learn task-agnostic human skeleton action\nrepresentation from the Vision-Language knowledge prompts. Specifically, we\nestablish the vision-language action concept space through vision-language\nknowledge prompts generated by pre-trained large multimodal models (LMMs),\nwhich enrich the fine-grained details that the skeleton action space lacks.\nMoreover, we propose the intra-modal self-similarity and inter-modal\ncross-consistency softened targets in the cross-modal contrastive process to\nprogressively control and guide the degree of pulling vision-language knowledge\nprompts and corresponding skeletons closer. These soft instance discrimination\nand self-knowledge distillation strategies contribute to the learning of better\nskeleton-based action representations from the noisy skeleton-vision-language\npairs. During the inference phase, our method requires only the skeleton data\nas the input for action recognition and no longer for vision-language prompts.\nExtensive experiments show that our method achieves state-of-the-art results on\nNTU RGB+D 60, NTU RGB+D 120, and PKU-MMD datasets. The code will be available\nin the future.\n","authors":["Yang Chen","Tian He","Junfeng Fu","Ling Wang","Jingcai Guo","Hong Cheng"],"pdf_url":"https://arxiv.org/pdf/2405.20606v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.20605v1","updated":"2024-05-31T03:39:26Z","published":"2024-05-31T03:39:26Z","title":"Searching for internal symbols underlying deep learning","summary":"  Deep learning (DL) enables deep neural networks (DNNs) to automatically learn\ncomplex tasks or rules from given examples without instructions or guiding\nprinciples. As we do not engineer DNNs' functions, it is extremely difficult to\ndiagnose their decisions, and multiple lines of studies proposed to explain\nprinciples of DNNs/DL operations. Notably, one line of studies suggests that\nDNNs may learn concepts, the high level features recognizable to humans. Thus,\nwe hypothesized that DNNs develop abstract codes, not necessarily recognizable\nto humans, which can be used to augment DNNs' decision-making. To address this\nhypothesis, we combined foundation segmentation models and unsupervised\nlearning to extract internal codes and identify potential use of abstract codes\nto make DL's decision-making more reliable and safer.\n","authors":["Jung H. Lee","Sujith Vijayan"],"pdf_url":"https://arxiv.org/pdf/2405.20605v1.pdf","comment":"10 pages, 7 figures, 3 tables and Appendix"},{"id":"http://arxiv.org/abs/2405.20596v1","updated":"2024-05-31T03:13:45Z","published":"2024-05-31T03:13:45Z","title":"Generalized Semi-Supervised Learning via Self-Supervised Feature\n  Adaptation","summary":"  Traditional semi-supervised learning (SSL) assumes that the feature\ndistributions of labeled and unlabeled data are consistent which rarely holds\nin realistic scenarios. In this paper, we propose a novel SSL setting, where\nunlabeled samples are drawn from a mixed distribution that deviates from the\nfeature distribution of labeled samples. Under this setting, previous SSL\nmethods tend to predict wrong pseudo-labels with the model fitted on labeled\ndata, resulting in noise accumulation. To tackle this issue, we propose\nSelf-Supervised Feature Adaptation (SSFA), a generic framework for improving\nSSL performance when labeled and unlabeled data come from different\ndistributions. SSFA decouples the prediction of pseudo-labels from the current\nmodel to improve the quality of pseudo-labels. Particularly, SSFA incorporates\na self-supervised task into the SSL framework and uses it to adapt the feature\nextractor of the model to the unlabeled data. In this way, the extracted\nfeatures better fit the distribution of unlabeled data, thereby generating\nhigh-quality pseudo-labels. Extensive experiments show that our proposed SSFA\nis applicable to various pseudo-label-based SSL learners and significantly\nimproves performance in labeled, unlabeled, and even unseen distributions.\n","authors":["Jiachen Liang","Ruibing Hou","Hong Chang","Bingpeng Ma","Shiguang Shan","Xilin Chen"],"pdf_url":"https://arxiv.org/pdf/2405.20596v1.pdf","comment":"10 pages; Accepted by NeurIPS 2023"},{"id":"http://arxiv.org/abs/2405.20584v1","updated":"2024-05-31T02:45:31Z","published":"2024-05-31T02:45:31Z","title":"Disrupting Diffusion: Token-Level Attention Erasure Attack against\n  Diffusion-based Customization","summary":"  With the development of diffusion-based customization methods like\nDreamBooth, individuals now have access to train the models that can generate\ntheir personalized images. Despite the convenience, malicious users have\nmisused these techniques to create fake images, thereby triggering a privacy\nsecurity crisis. In light of this, proactive adversarial attacks are proposed\nto protect users against customization. The adversarial examples are trained to\ndistort the customization model's outputs and thus block the misuse. In this\npaper, we propose DisDiff (Disrupting Diffusion), a novel adversarial attack\nmethod to disrupt the diffusion model outputs. We first delve into the\nintrinsic image-text relationships, well-known as cross-attention, and\nempirically find that the subject-identifier token plays an important role in\nguiding image generation. Thus, we propose the Cross-Attention Erasure module\nto explicitly \"erase\" the indicated attention maps and disrupt the text\nguidance. Besides,we analyze the influence of the sampling process of the\ndiffusion model on Projected Gradient Descent (PGD) attack and introduce a\nnovel Merit Sampling Scheduler to adaptively modulate the perturbation updating\namplitude in a step-aware manner. Our DisDiff outperforms the state-of-the-art\nmethods by 12.75% of FDFR scores and 7.25% of ISM scores across two facial\nbenchmarks and two commonly used prompts on average.\n","authors":["Yisu Liu","Jinyang An","Wanqian Zhang","Dayan Wu","Jingzi Gu","Zheng Lin","Weiping Wang"],"pdf_url":"https://arxiv.org/pdf/2405.20584v1.pdf","comment":"Under review"},{"id":"http://arxiv.org/abs/2402.06497v3","updated":"2024-05-31T01:58:44Z","published":"2024-02-09T16:08:16Z","title":"Iris-SAM: Iris Segmentation Using a Foundation Model","summary":"  Iris segmentation is a critical component of an iris biometric system and it\ninvolves extracting the annular iris region from an ocular image. In this work,\nwe develop a pixel-level iris segmentation model from a foundational model,\nviz., Segment Anything Model (SAM), that has been successfully used for\nsegmenting arbitrary objects. The primary contribution of this work lies in the\nintegration of different loss functions during the fine-tuning of SAM on ocular\nimages. In particular, the importance of Focal Loss is borne out in the\nfine-tuning process since it strategically addresses the class imbalance\nproblem (i.e., iris versus non-iris pixels). Experiments on ND-IRIS-0405,\nCASIA-Iris-Interval-v3, and IIT-Delhi-Iris datasets convey the efficacy of the\ntrained model for the task of iris segmentation. For instance, on the\nND-IRIS-0405 dataset, an average segmentation accuracy of 99.58% was achieved,\ncompared to the best baseline performance of 89.75%.\n","authors":["Parisa Farmanifard","Arun Ross"],"pdf_url":"https://arxiv.org/pdf/2402.06497v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2304.07248v4","updated":"2024-05-31T01:37:03Z","published":"2023-04-14T16:53:06Z","title":"The University of California San Francisco Brain Metastases Stereotactic\n  Radiosurgery (UCSF-BMSR) MRI Dataset","summary":"  The University of California San Francisco Brain Metastases Stereotactic\nRadiosurgery (UCSF-BMSR) dataset is a public, clinical, multimodal brain MRI\ndataset consisting of 560 brain MRIs from 412 patients with expert annotations\nof 5136 brain metastases. Data consists of registered and skull stripped T1\npost-contrast, T1 pre-contrast, FLAIR and subtraction (T1 pre-contrast - T1\npost-contrast) images and voxelwise segmentations of enhancing brain metastases\nin NifTI format. The dataset also includes patient demographics, surgical\nstatus and primary cancer types. The UCSF-BSMR has been made publicly available\nin the hopes that researchers will use these data to push the boundaries of AI\napplications for brain metastases. The dataset is freely available for\nnon-commercial use at https://imagingdatasets.ucsf.edu/dataset/1\n","authors":["Jeffrey D. Rudie","Rachit Saluja","David A. Weiss","Pierre Nedelec","Evan Calabrese","John B. Colby","Benjamin Laguna","John Mongan","Steve Braunstein","Christopher P. Hess","Andreas M. Rauschecker","Leo P. Sugrue","Javier E. Villanueva-Meyer"],"pdf_url":"https://arxiv.org/pdf/2304.07248v4.pdf","comment":"15 pages, 2 tables, 2 figures"},{"id":"http://arxiv.org/abs/2404.07989v2","updated":"2024-05-31T01:36:53Z","published":"2024-04-11T17:59:45Z","title":"Any2Point: Empowering Any-modality Large Models for Efficient 3D\n  Understanding","summary":"  Large foundation models have recently emerged as a prominent focus of\ninterest, attaining superior performance in widespread scenarios. Due to the\nscarcity of 3D data, many efforts have been made to adapt pre-trained\ntransformers from vision to 3D domains. However, such 2D-to-3D approaches are\nstill limited, due to the potential loss of spatial geometries and high\ncomputation cost. More importantly, their frameworks are mainly designed for 2D\nmodels, lacking a general any-to-3D paradigm. In this paper, we introduce\nAny2Point, a parameter-efficient method to empower any-modality large models\n(vision, language, audio) for 3D understanding. Given a frozen transformer from\nany source modality, we propose a 3D-to-any (1D or 2D) virtual projection\nstrategy that correlates the input 3D points to the original 1D or 2D positions\nwithin the source modality. This mechanism enables us to assign each 3D token\nwith a positional encoding paired with the pre-trained model, which avoids 3D\ngeometry loss caused by the true projection and better motivates the\ntransformer for 3D learning with 1D/2D positional priors. Then, within each\ntransformer block, we insert an any-to-3D guided adapter module for\nparameter-efficient fine-tuning. The adapter incorporates prior spatial\nknowledge from the source modality to guide the local feature aggregation of 3D\ntokens, compelling the semantic adaption of any-modality transformers. We\nconduct extensive experiments to showcase the effectiveness and efficiency of\nour method. Code and models are released at\nhttps://github.com/Ivan-Tang-3D/Any2Point.\n","authors":["Yiwen Tang","Ray Zhang","Jiaming Liu","Zoey Guo","Dong Wang","Zhigang Wang","Bin Zhao","Shanghang Zhang","Peng Gao","Hongsheng Li","Xuelong Li"],"pdf_url":"https://arxiv.org/pdf/2404.07989v2.pdf","comment":"Code and models are released at\n  https://github.com/Ivan-Tang-3D/Any2Point"},{"id":"http://arxiv.org/abs/2405.20247v2","updated":"2024-05-31T01:33:45Z","published":"2024-05-30T16:58:34Z","title":"KerasCV and KerasNLP: Vision and Language Power-Ups","summary":"  We present the Keras domain packages KerasCV and KerasNLP, extensions of the\nKeras API for Computer Vision and Natural Language Processing workflows,\ncapable of running on either JAX, TensorFlow, or PyTorch. These domain packages\nare designed to enable fast experimentation, with a focus on ease-of-use and\nperformance. We adopt a modular, layered design: at the library's lowest level\nof abstraction, we provide building blocks for creating models and data\npreprocessing pipelines, and at the library's highest level of abstraction, we\nprovide pretrained ``task\" models for popular architectures such as Stable\nDiffusion, YOLOv8, GPT2, BERT, Mistral, CLIP, Gemma, T5, etc. Task models have\nbuilt-in preprocessing, pretrained weights, and can be fine-tuned on raw\ninputs. To enable efficient training, we support XLA compilation for all\nmodels, and run all preprocessing via a compiled graph of TensorFlow operations\nusing the tf.data API. The libraries are fully open-source (Apache 2.0 license)\nand available on GitHub.\n","authors":["Matthew Watson","Divyashree Shivakumar Sreepathihalli","Francois Chollet","Martin Gorner","Kiranbir Sodhia","Ramesh Sampath","Tirth Patel","Haifeng Jin","Neel Kovelamudi","Gabriel Rasskin","Samaneh Saadat","Luke Wood","Chen Qian","Jonathan Bischof","Ian Stenbit","Abheesht Sharma","Anshuman Mishra"],"pdf_url":"https://arxiv.org/pdf/2405.20247v2.pdf","comment":"Submitted to Journal of Machine Learning Open Source Software"},{"id":"http://arxiv.org/abs/2401.03922v3","updated":"2024-05-31T01:10:42Z","published":"2024-01-08T14:33:57Z","title":"SNeurodCNN: Structure-focused Neurodegeneration Convolutional Neural\n  Network for Modelling and Classification of Alzheimer's Disease","summary":"  Alzheimer's disease (AD), the predominant form of dementia, is a growing\nglobal challenge, emphasizing the urgent need for accurate and early diagnosis.\nCurrent clinical diagnoses rely on radiologist expert interpretation, which is\nprone to human error. Deep learning has thus far shown promise for early AD\ndiagnosis. However, existing methods often overlook focal structural atrophy\ncritical for enhanced understanding of the cerebral cortex neurodegeneration.\nThis paper proposes a deep learning framework that includes a novel\nstructure-focused neurodegeneration CNN architecture named SNeurodCNN and an\nimage brightness enhancement preprocessor using gamma correction. The\nSNeurodCNN architecture takes as input the focal structural atrophy features\nresulting from segmentation of brain structures captured through magnetic\nresonance imaging (MRI). As a result, the architecture considers only necessary\nCNN components, which comprises of two downsampling convolutional blocks and\ntwo fully connected layers, for achieving the desired classification task, and\nutilises regularisation techniques to regularise learnable parameters.\nLeveraging mid-sagittal and para-sagittal brain image viewpoints from the\nAlzheimer's Disease Neuroimaging Initiative (ADNI) dataset, our framework\ndemonstrated exceptional performance. The para-sagittal viewpoint achieved\n97.8% accuracy, 97.0% specificity, and 98.5% sensitivity, while the\nmid-sagittal viewpoint offered deeper insights with 98.1% accuracy, 97.2%\nspecificity, and 99.0% sensitivity. Model analysis revealed the ability of\nSNeurodCNN to capture the structural dynamics of mild cognitive impairment\n(MCI) and AD in the frontal lobe, occipital lobe, cerebellum, temporal, and\nparietal lobe, suggesting its potential as a brain structural change\ndigi-biomarker for early AD diagnosis. This work can be reproduced using code\nwe made available on GitHub.\n","authors":["Simisola Odimayo","Chollette C. Olisah","Khadija Mohammed"],"pdf_url":"https://arxiv.org/pdf/2401.03922v3.pdf","comment":"36 Pages, 10 figures, 4 tables"},{"id":"http://arxiv.org/abs/2405.20559v1","updated":"2024-05-31T00:57:58Z","published":"2024-05-31T00:57:58Z","title":"Universal evaluation and design of imaging systems using information\n  estimation","summary":"  Information theory, which describes the transmission of signals in the\npresence of noise, has enabled the development of reliable communication\nsystems that underlie the modern world. Imaging systems can also be viewed as a\nform of communication, in which information about the object is \"transmitted\"\nthrough images. However, the application of information theory to imaging\nsystems has been limited by the challenges of accounting for their physical\nconstraints. Here, we introduce a framework that addresses these limitations by\nmodeling the probabilistic relationship between objects and their measurements.\nUsing this framework, we develop a method to estimate information using only a\ndataset of noisy measurements, without making any assumptions about the image\nformation process. We demonstrate that these estimates comprehensively quantify\nmeasurement quality across a diverse range of imaging systems and applications.\nFurthermore, we introduce Information-Driven Encoder Analysis Learning (IDEAL),\na technique to optimize the design of imaging hardware for maximum information\ncapture. This work provides new insights into the fundamental performance\nlimits of imaging systems and offers powerful new tools for their analysis and\ndesign.\n","authors":["Henry Pinkard","Leyla Kabuli","Eric Markley","Tiffany Chien","Jiantao Jiao","Laura Waller"],"pdf_url":"https://arxiv.org/pdf/2405.20559v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.11386v2","updated":"2024-05-31T00:48:18Z","published":"2024-05-18T20:22:22Z","title":"Liver Fat Quantification Network with Body Shape","summary":"  It is critically important to detect the content of liver fat as it is\nrelated to cardiac complications and cardiovascular disease mortality. However,\nexisting methods are either associated with high cost and/or medical\ncomplications (e.g., liver biopsy, imaging technology) or only roughly estimate\nthe grades of steatosis. In this paper, we propose a deep neural network to\nestimate the percentage of liver fat using only body shapes. The proposed is\ncomposed of a flexible baseline network and a lightweight Attention module. The\nattention module is trained to generate discriminative and diverse features\nwhich significant improve the performance. In order to validate the method, we\nperform extensive tests on the public medical dataset. The results verify that\nour proposed method yields state-of-the-art performance with Root mean squared\nerror (RMSE) of 5.26% and R-Squared value over 0.8. It offers an accurate and\nmore accessible assessment of hepatic steatosis.\n","authors":["Qiyue Wang","Wu Xue","Xiaoke Zhang","Fang Jin","James Hahn"],"pdf_url":"https://arxiv.org/pdf/2405.11386v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.19687v2","updated":"2024-05-31T00:35:31Z","published":"2024-05-30T04:57:54Z","title":"Autonomous Driving with Spiking Neural Networks","summary":"  Autonomous driving demands an integrated approach that encompasses\nperception, prediction, and planning, all while operating under strict energy\nconstraints to enhance scalability and environmental sustainability. We present\nSpiking Autonomous Driving (SAD), the first unified Spiking Neural Network\n(SNN) to address the energy challenges faced by autonomous driving systems\nthrough its event-driven and energy-efficient nature. SAD is trained end-to-end\nand consists of three main modules: perception, which processes inputs from\nmulti-view cameras to construct a spatiotemporal bird's eye view; prediction,\nwhich utilizes a novel dual-pathway with spiking neurons to forecast future\nstates; and planning, which generates safe trajectories considering predicted\noccupancy, traffic rules, and ride comfort. Evaluated on the nuScenes dataset,\nSAD achieves competitive performance in perception, prediction, and planning\ntasks, while drawing upon the energy efficiency of SNNs. This work highlights\nthe potential of neuromorphic computing to be applied to energy-efficient\nautonomous driving, a critical step toward sustainable and safety-critical\nautomotive technology. Our code is available at\n\\url{https://github.com/ridgerchu/SAD}.\n","authors":["Rui-Jie Zhu","Ziqing Wang","Leilani Gilpin","Jason K. Eshraghian"],"pdf_url":"https://arxiv.org/pdf/2405.19687v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.04626v2","updated":"2024-05-31T00:12:59Z","published":"2024-03-07T16:11:43Z","title":"MedFLIP: Medical Vision-and-Language Self-supervised Fast Pre-Training\n  with Masked Autoencoder","summary":"  Within the domain of medical analysis, extensive research has explored the\npotential of mutual learning between Masked Autoencoders(MAEs) and multimodal\ndata. However, the impact of MAEs on intermodality remains a key challenge. We\nintroduce MedFLIP, a Fast Language-Image Pre-training method for Medical\nanalysis. We explore MAEs for zero-shot learning with crossed domains, which\nenhances the model's ability to learn from limited data, a common scenario in\nmedical diagnostics. We verify that masking an image does not affect\ninter-modal learning. Furthermore, we propose the SVD loss to enhance the\nrepresentation learning for characteristics of medical images, aiming to\nimprove classification accuracy by leveraging the structural intricacies of\nsuch data. Our theory posits that masking encourages semantic preservation,\nrobust feature extraction, regularization, domain adaptation, and invariance\nlearning. Lastly, we validate using language will improve the zero-shot\nperformance for the medical image analysis. MedFLIP's scaling of the masking\nprocess marks an advancement in the field, offering a pathway to rapid and\nprecise medical image analysis without the traditional computational\nbottlenecks. Through experiments and validation, MedFLIP demonstrates efficient\nperformance improvements, helps for future research and application in medical\ndiagnostics.\n","authors":["Lei Li","Tianfang Zhang","Xinglin Zhang","Jiaqi Liu","Bingqi Ma","Yan Luo","Tao Chen"],"pdf_url":"https://arxiv.org/pdf/2403.04626v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.10373v3","updated":"2024-05-31T23:45:57Z","published":"2023-08-20T21:47:54Z","title":"HoSNN: Adversarially-Robust Homeostatic Spiking Neural Networks with\n  Adaptive Firing Thresholds","summary":"  While spiking neural networks (SNNs) offer a promising neurally-inspired\nmodel of computation, they are vulnerable to adversarial attacks. We present\nthe first study that draws inspiration from neural homeostasis to design a\nthreshold-adapting leaky integrate-and-fire (TA-LIF) neuron model and utilize\nTA-LIF neurons to construct the adversarially robust homeostatic SNNs (HoSNNs)\nfor improved robustness. The TA-LIF model incorporates a self-stabilizing\ndynamic thresholding mechanism, offering a local feedback control solution to\nthe minimization of each neuron's membrane potential error caused by\nadversarial disturbance. Theoretical analysis demonstrates favorable dynamic\nproperties of TA-LIF neurons in terms of the bounded-input bounded-output\nstability and suppressed time growth of membrane potential error, underscoring\ntheir superior robustness compared with the standard LIF neurons. When trained\nwith weak FGSM attacks (attack budget = 2/255) and tested with much stronger\nPGD attacks (attack budget = 8/255), our HoSNNs significantly improve model\naccuracy on several datasets: from 30.54% to 74.91% on FashionMNIST, from 0.44%\nto 35.06% on SVHN, from 0.56% to 42.63% on CIFAR10, from 0.04% to 16.66% on\nCIFAR100, over the conventional LIF-based SNNs.\n","authors":["Hejia Geng","Peng Li"],"pdf_url":"https://arxiv.org/pdf/2308.10373v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.16589v2","updated":"2024-05-31T21:26:39Z","published":"2023-11-28T08:15:27Z","title":"HD Maps are Lane Detection Generalizers: A Novel Generative Framework\n  for Single-Source Domain Generalization","summary":"  Lane detection is a vital task for vehicles to navigate and localize their\nposition on the road. To ensure reliable driving, lane detection models must\nhave robust generalization performance in various road environments. However,\ndespite the advanced performance in the trained domain, their generalization\nperformance still falls short of expectations due to the domain discrepancy. To\nbridge this gap, we propose a novel generative framework using HD Maps for\nSingle-Source Domain Generalization (SSDG) in lane detection. We first generate\nnumerous front-view images from lane markings of HD Maps. Next, we\nstrategically select a core subset among the generated images using (i) lane\nstructure and (ii) road surrounding criteria to maximize their diversity. In\nthe end, utilizing this core set, we train lane detection models to boost their\ngeneralization performance. We validate that our generative framework from HD\nMaps outperforms the Domain Adaptation model MLDA with +3.01%p accuracy\nimprovement, even though we do not access the target domain images.\n","authors":["Daeun Lee","Minhyeok Heo","Jiwon Kim"],"pdf_url":"https://arxiv.org/pdf/2311.16589v2.pdf","comment":"Accepted by CVPR Data-Driven Autonomous Driving Simulation Workshop,\n  2024"}],"Information Retrieval":[{"id":"http://arxiv.org/abs/2405.20994v1","updated":"2024-05-31T16:38:54Z","published":"2024-05-31T16:38:54Z","title":"CWRCzech: 100M Query-Document Czech Click Dataset and Its Application to\n  Web Relevance Ranking","summary":"  We present CWRCzech, Click Web Ranking dataset for Czech, a 100M\nquery-document Czech click dataset for relevance ranking with user behavior\ndata collected from search engine logs of Seznam.cz. To the best of our\nknowledge, CWRCzech is the largest click dataset with raw text published so\nfar. It provides document positions in the search results as well as\ninformation about user behavior: 27.6M clicked documents and 10.8M dwell times.\nIn addition, we also publish a manually annotated Czech test for the relevance\ntask, containing nearly 50k query-document pairs, each annotated by at least 2\nannotators. Finally, we analyze how the user behavior data improve relevance\nranking and show that models trained on data automatically harnessed at\nsufficient scale can surpass the performance of models trained on human\nannotated data. CWRCzech is published under an academic non-commercial license\nand is available to the research community at\nhttps://github.com/seznam/CWRCzech.\n","authors":["Josef Vonášek","Milan Straka","Rostislav Krč","Lenka Lasoňová","Ekaterina Egorova","Jana Straková","Jakub Náplava"],"pdf_url":"https://arxiv.org/pdf/2405.20994v1.pdf","comment":"Accepted to SIGIR 2024"},{"id":"http://arxiv.org/abs/2405.20878v1","updated":"2024-05-31T14:53:12Z","published":"2024-05-31T14:53:12Z","title":"SelfGNN: Self-Supervised Graph Neural Networks for Sequential\n  Recommendation","summary":"  Sequential recommendation effectively addresses information overload by\nmodeling users' temporal and sequential interaction patterns. To overcome the\nlimitations of supervision signals, recent approaches have adopted\nself-supervised learning techniques in recommender systems. However, there are\nstill two critical challenges that remain unsolved. Firstly, existing\nsequential models primarily focus on long-term modeling of individual\ninteraction sequences, overlooking the valuable short-term collaborative\nrelationships among the behaviors of different users. Secondly, real-world data\noften contain noise, particularly in users' short-term behaviors, which can\narise from temporary intents or misclicks. Such noise negatively impacts the\naccuracy of both graph and sequence models, further complicating the modeling\nprocess. To address these challenges, we propose a novel framework called\nSelf-Supervised Graph Neural Network (SelfGNN) for sequential recommendation.\nThe SelfGNN framework encodes short-term graphs based on time intervals and\nutilizes Graph Neural Networks (GNNs) to learn short-term collaborative\nrelationships. It captures long-term user and item representations at multiple\ngranularity levels through interval fusion and dynamic behavior modeling.\nImportantly, our personalized self-augmented learning structure enhances model\nrobustness by mitigating noise in short-term graphs based on long-term user\ninterests and personal stability. Extensive experiments conducted on four\nreal-world datasets demonstrate that SelfGNN outperforms various\nstate-of-the-art baselines. Our model implementation codes are available at\nhttps://github.com/HKUDS/SelfGNN.\n","authors":["Yuxi Liu","Lianghao Xia","Chao Huang"],"pdf_url":"https://arxiv.org/pdf/2405.20878v1.pdf","comment":"Accepted by SIGIR'24"},{"id":"http://arxiv.org/abs/2204.09140v2","updated":"2024-05-31T14:28:40Z","published":"2022-04-19T21:55:18Z","title":"Multi-hop Question Answering","summary":"  The task of Question Answering (QA) has attracted significant research\ninterest for long. Its relevance to language understanding and knowledge\nretrieval tasks, along with the simple setting makes the task of QA crucial for\nstrong AI systems. Recent success on simple QA tasks has shifted the focus to\nmore complex settings. Among these, Multi-Hop QA (MHQA) is one of the most\nresearched tasks over the recent years. In broad terms, MHQA is the task of\nanswering natural language questions that involve extracting and combining\nmultiple pieces of information and doing multiple steps of reasoning. An\nexample of a multi-hop question would be \"The Argentine PGA Championship record\nholder has won how many tournaments worldwide?\". Answering the question would\nneed two pieces of information: \"Who is the record holder for Argentine PGA\nChampionship tournaments?\" and \"How many tournaments did [Answer of Sub Q1]\nwin?\". The ability to answer multi-hop questions and perform multi step\nreasoning can significantly improve the utility of NLP systems. Consequently,\nthe field has seen a surge with high quality datasets, models and evaluation\nstrategies. The notion of 'multiple hops' is somewhat abstract which results in\na large variety of tasks that require multi-hop reasoning. This leads to\ndifferent datasets and models that differ significantly from each other and\nmakes the field challenging to generalize and survey. We aim to provide a\ngeneral and formal definition of the MHQA task, and organize and summarize\nexisting MHQA frameworks. We also outline some best practices for building MHQA\ndatasets. This book provides a systematic and thorough introduction as well as\nthe structuring of the existing attempts to this highly interesting, yet quite\nchallenging task.\n","authors":["Vaibhav Mavi","Anubhav Jangra","Adam Jatowt"],"pdf_url":"https://arxiv.org/pdf/2204.09140v2.pdf","comment":"Published at Foundations and Trends in Information Retrieval"},{"id":"http://arxiv.org/abs/2405.20718v1","updated":"2024-05-31T09:14:48Z","published":"2024-05-31T09:14:48Z","title":"Popularity-Aware Alignment and Contrast for Mitigating Popularity Bias","summary":"  Collaborative Filtering (CF) typically suffers from the significant challenge\nof popularity bias due to the uneven distribution of items in real-world\ndatasets. This bias leads to a significant accuracy gap between popular and\nunpopular items. It not only hinders accurate user preference understanding but\nalso exacerbates the Matthew effect in recommendation systems. To alleviate\npopularity bias, existing efforts focus on emphasizing unpopular items or\nseparating the correlation between item representations and their popularity.\nDespite the effectiveness, existing works still face two persistent challenges:\n(1) how to extract common supervision signals from popular items to improve the\nunpopular item representations, and (2) how to alleviate the representation\nseparation caused by popularity bias. In this work, we conduct an empirical\nanalysis of popularity bias and propose Popularity-Aware Alignment and Contrast\n(PAAC) to address two challenges. Specifically, we use the common supervisory\nsignals modeled in popular item representations and propose a novel\npopularity-aware supervised alignment module to learn unpopular item\nrepresentations. Additionally, we suggest re-weighting the contrastive learning\nloss to mitigate the representation separation from a popularity-centric\nperspective. Finally, we validate the effectiveness and rationale of PAAC in\nmitigating popularity bias through extensive experiments on three real-world\ndatasets. Our code is available at\nhttps://github.com/miaomiao-cai2/KDD2024-PAAC.\n","authors":["Miaomiao Cai","Lei Chen","Yifan Wang","Haoyue Bai","Peijie Sun","Le Wu","Min Zhang","Meng Wang"],"pdf_url":"https://arxiv.org/pdf/2405.20718v1.pdf","comment":"Accepted by KDD 2024"},{"id":"http://arxiv.org/abs/2405.20710v1","updated":"2024-05-31T09:07:03Z","published":"2024-05-31T09:07:03Z","title":"Information Maximization via Variational Autoencoders for Cross-Domain\n  Recommendation","summary":"  Cross-Domain Sequential Recommendation (CDSR) methods aim to address the data\nsparsity and cold-start problems present in Single-Domain Sequential\nRecommendation (SDSR). Existing CDSR methods typically rely on overlapping\nusers, designing complex cross-domain modules to capture users' latent\ninterests that can propagate across different domains. However, their\npropagated informative information is limited to the overlapping users and the\nusers who have rich historical behavior records. As a result, these methods\noften underperform in real-world scenarios, where most users are\nnon-overlapping (cold-start) and long-tailed. In this research, we introduce a\nnew CDSR framework named Information Maximization Variational Autoencoder\n(\\textbf{\\texttt{IM-VAE}}). Here, we suggest using a Pseudo-Sequence Generator\nto enhance the user's interaction history input for downstream fine-grained\nCDSR models to alleviate the cold-start issues. We also propose a Generative\nRecommendation Framework combined with three regularizers inspired by the\nmutual information maximization (MIM) theory \\cite{mcgill1954multivariate} to\ncapture the semantic differences between a user's interests shared across\ndomains and those specific to certain domains, as well as address the\ninformational gap between a user's actual interaction sequences and the\npseudo-sequences generated. To the best of our knowledge, this paper is the\nfirst CDSR work that considers the information disentanglement and denoising of\npseudo-sequences in the open-world recommendation scenario. Empirical\nexperiments illustrate that \\texttt{IM-VAE} outperforms the state-of-the-art\napproaches on two real-world cross-domain datasets on all sorts of users,\nincluding cold-start and tailed users, demonstrating the effectiveness of\n\\texttt{IM-VAE} in open-world recommendation.\n","authors":["Xuying Ning","Wujiang Xu","Xiaolei Liu","Mingming Ha","Qiongxu Ma","Youru Li","Linxun Chen","Yongfeng Zhang"],"pdf_url":"https://arxiv.org/pdf/2405.20710v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.16969v2","updated":"2024-05-31T08:37:12Z","published":"2024-01-30T12:50:38Z","title":"Taxonomy of Mathematical Plagiarism","summary":"  Plagiarism is a pressing concern, even more so with the availability of large\nlanguage models. Existing plagiarism detection systems reliably find copied and\nmoderately reworded text but fail for idea plagiarism, especially in\nmathematical science, which heavily uses formal mathematical notation. We make\ntwo contributions. First, we establish a taxonomy of mathematical content reuse\nby annotating potentially plagiarised 122 scientific document pairs. Second, we\nanalyze the best-performing approaches to detect plagiarism and mathematical\ncontent similarity on the newly established taxonomy. We found that the\nbest-performing methods for plagiarism and math content similarity achieve an\noverall detection score (PlagDet) of 0.06 and 0.16, respectively. The\nbest-performing methods failed to detect most cases from all seven newly\nestablished math similarity types. Outlined contributions will benefit research\nin plagiarism detection systems, recommender systems, question-answering\nsystems, and search engines. We make our experiment's code and annotated\ndataset available to the community:\nhttps://github.com/gipplab/Taxonomy-of-Mathematical-Plagiarism\n","authors":["Ankit Satpute","Andre Greiner-Petter","Noah Gießing","Isabel Beckenbach","Moritz Schubotz","Olaf Teschke","Akiko Aizawa","Bela Gipp"],"pdf_url":"https://arxiv.org/pdf/2401.16969v2.pdf","comment":"46th European Conference on Information Retrieval (ECIR)"},{"id":"http://arxiv.org/abs/2405.20654v1","updated":"2024-05-31T07:43:42Z","published":"2024-05-31T07:43:42Z","title":"Passage-specific Prompt Tuning for Passage Reranking in Question\n  Answering with Large Language Models","summary":"  Effective passage retrieval and reranking methods have been widely utilized\nto identify suitable candidates in open-domain question answering tasks, recent\nstudies have resorted to LLMs for reranking the retrieved passages by the\nlog-likelihood of the question conditioned on each passage. Although these\nmethods have demonstrated promising results, the performance is notably\nsensitive to the human-written prompt (or hard prompt), and fine-tuning LLMs\ncan be computationally intensive and time-consuming. Furthermore, this approach\nlimits the leverage of question-passage relevance pairs and passage-specific\nknowledge to enhance the ranking capabilities of LLMs. In this paper, we\npropose passage-specific prompt tuning for reranking in open-domain question\nanswering (PSPT): a parameter-efficient method that fine-tunes learnable\npassage-specific soft prompts, incorporating passage-specific knowledge from a\nlimited set of question-passage relevance pairs. The method involves ranking\nretrieved passages based on the log-likelihood of the model generating the\nquestion conditioned on each passage and the learned soft prompt. We conducted\nextensive experiments utilizing the Llama-2-chat-7B model across three publicly\navailable open-domain question answering datasets and the results demonstrate\nthe effectiveness of the proposed approach.\n","authors":["Xuyang Wu","Zhiyuan Peng","Sravanthi Rajanala","Hsin-Tai Wu","Yi Fang"],"pdf_url":"https://arxiv.org/pdf/2405.20654v1.pdf","comment":"Accepted at Gen-IR@SIGIR24"},{"id":"http://arxiv.org/abs/2405.20646v1","updated":"2024-05-31T07:24:42Z","published":"2024-05-31T07:24:42Z","title":"Large Language Models Enhanced Sequential Recommendation for Long-tail\n  User and Item","summary":"  Sequential recommendation systems (SRS) serve the purpose of predicting\nusers' subsequent preferences based on their past interactions and have been\napplied across various domains such as e-commerce and social networking\nplatforms. However, practical SRS encounters challenges due to the fact that\nmost users engage with only a limited number of items, while the majority of\nitems are seldom consumed. These challenges, termed as the long-tail user and\nlong-tail item dilemmas, often create obstacles for traditional SRS methods.\nMitigating these challenges is crucial as they can significantly impact user\nsatisfaction and business profitability. While some research endeavors have\nalleviated these issues, they still grapple with issues such as seesaw or noise\nstemming from the scarcity of interactions. The emergence of large language\nmodels (LLMs) presents a promising avenue to address these challenges from a\nsemantic standpoint. In this study, we introduce the Large Language Models\nEnhancement framework for Sequential Recommendation (LLM-ESR), which leverages\nsemantic embeddings from LLMs to enhance SRS performance without increasing\ncomputational overhead. To combat the long-tail item challenge, we propose a\ndual-view modeling approach that fuses semantic information from LLMs with\ncollaborative signals from traditional SRS. To address the long-tail user\nchallenge, we introduce a retrieval augmented self-distillation technique to\nrefine user preference representations by incorporating richer interaction data\nfrom similar users. Through comprehensive experiments conducted on three\nauthentic datasets using three widely used SRS models, our proposed enhancement\nframework demonstrates superior performance compared to existing methodologies.\n","authors":["Qidong Liu","Xian Wu","Xiangyu Zhao","Yejing Wang","Zijian Zhang","Feng Tian","Yefeng Zheng"],"pdf_url":"https://arxiv.org/pdf/2405.20646v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.00368v3","updated":"2024-05-31T07:22:01Z","published":"2023-12-31T02:13:18Z","title":"Improving Text Embeddings with Large Language Models","summary":"  In this paper, we introduce a novel and simple method for obtaining\nhigh-quality text embeddings using only synthetic data and less than 1k\ntraining steps. Unlike existing methods that often depend on multi-stage\nintermediate pre-training with billions of weakly-supervised text pairs,\nfollowed by fine-tuning with a few labeled datasets, our method does not\nrequire building complex training pipelines or relying on manually collected\ndatasets that are often constrained by task diversity and language coverage. We\nleverage proprietary LLMs to generate diverse synthetic data for hundreds of\nthousands of text embedding tasks across 93 languages. We then fine-tune\nopen-source decoder-only LLMs on the synthetic data using standard contrastive\nloss. Experiments demonstrate that our method achieves strong performance on\nhighly competitive text embedding benchmarks without using any labeled data.\nFurthermore, when fine-tuned with a mixture of synthetic and labeled data, our\nmodel sets new state-of-the-art results on the BEIR and MTEB benchmarks.\n","authors":["Liang Wang","Nan Yang","Xiaolong Huang","Linjun Yang","Rangan Majumder","Furu Wei"],"pdf_url":"https://arxiv.org/pdf/2401.00368v3.pdf","comment":"Accepted by ACL 2024"},{"id":"http://arxiv.org/abs/2405.20626v1","updated":"2024-05-31T05:31:00Z","published":"2024-05-31T05:31:00Z","title":"Causal Distillation for Alleviating Performance Heterogeneity in\n  Recommender Systems","summary":"  Recommendation performance usually exhibits a long-tail distribution over\nusers -- a small portion of head users enjoy much more accurate recommendation\nservices than the others. We reveal two sources of this performance\nheterogeneity problem: the uneven distribution of historical interactions (a\nnatural source); and the biased training of recommender models (a model\nsource). As addressing this problem cannot sacrifice the overall performance, a\nwise choice is to eliminate the model bias while maintaining the natural\nheterogeneity. The key to debiased training lies in eliminating the effect of\nconfounders that influence both the user's historical behaviors and the next\nbehavior. The emerging causal recommendation methods achieve this by modeling\nthe causal effect between user behaviors, however potentially neglect\nunobserved confounders (\\eg, friend suggestions) that are hard to measure in\npractice. To address unobserved confounders, we resort to the front-door\nadjustment (FDA) in causal theory and propose a causal multi-teacher\ndistillation framework (CausalD). FDA requires proper mediators in order to\nestimate the causal effects of historical behaviors on the next behavior. To\nachieve this, we equip CausalD with multiple heterogeneous recommendation\nmodels to model the mediator distribution. Then, the causal effect estimated by\nFDA is the expectation of recommendation prediction over the mediator\ndistribution and the prior distribution of historical behaviors, which is\ntechnically achieved by multi-teacher ensemble. To pursue efficient inference,\nCausalD further distills multiple teachers into one student model to directly\ninfer the causal effect for making recommendations.\n","authors":["Shengyu Zhang","Ziqi Jiang","Jiangchao Yao","Fuli Feng","Kun Kuang","Zhou Zhao","Shuo Li","Hongxia Yang","Tat-Seng Chua","Fei Wu"],"pdf_url":"https://arxiv.org/pdf/2405.20626v1.pdf","comment":"TKDE 2023"},{"id":"http://arxiv.org/abs/2405.20565v1","updated":"2024-05-31T01:07:37Z","published":"2024-05-31T01:07:37Z","title":"Knowledge Enhanced Multi-intent Transformer Network for Recommendation","summary":"  Incorporating Knowledge Graphs into Recommendation has attracted growing\nattention in industry, due to the great potential of KG in providing abundant\nsupplementary information and interpretability for the underlying models.\nHowever, simply integrating KG into recommendation usually brings in negative\nfeedback in industry, due to the ignorance of the following two factors: i)\nusers' multiple intents, which involve diverse nodes in KG. For example, in\ne-commerce scenarios, users may exhibit preferences for specific styles,\nbrands, or colors. ii) knowledge noise, which is a prevalent issue in Knowledge\nEnhanced Recommendation (KGR) and even more severe in industry scenarios. The\nirrelevant knowledge properties of items may result in inferior model\nperformance compared to approaches that do not incorporate knowledge. To tackle\nthese challenges, we propose a novel approach named Knowledge Enhanced\nMulti-intent Transformer Network for Recommendation (KGTN), comprising two\nprimary modules: Global Intents Modeling with Graph Transformer, and Knowledge\nContrastive Denoising under Intents. Specifically, Global Intents with Graph\nTransformer focuses on capturing learnable user intents, by incorporating\nglobal signals from user-item-relation-entity interactions with a graph\ntransformer, meanwhile learning intent-aware user/item representations.\nKnowledge Contrastive Denoising under Intents is dedicated to learning precise\nand robust representations. It leverages intent-aware representations to sample\nrelevant knowledge, and proposes a local-global contrastive mechanism to\nenhance noise-irrelevant representation learning. Extensive experiments\nconducted on benchmark datasets show the superior performance of our proposed\nmethod over the state-of-the-arts. And online A/B testing results on Alibaba\nlarge-scale industrial recommendation platform also indicate the real-scenario\neffectiveness of KGTN.\n","authors":["Ding Zou","Wei Wei","Feida Zhu","Chuanyu Xu","Tao Zhang","Chengfu Huo"],"pdf_url":"https://arxiv.org/pdf/2405.20565v1.pdf","comment":"Accept By The Web Conf 2024 (WWW 2024) Industry Track. arXiv admin\n  note: text overlap with arXiv:2204.08807"},{"id":"http://arxiv.org/abs/2406.00231v1","updated":"2024-05-31T23:29:42Z","published":"2024-05-31T23:29:42Z","title":"LLM-RankFusion: Mitigating Intrinsic Inconsistency in LLM-based Ranking","summary":"  Ranking passages by prompting a large language model (LLM) can achieve\npromising performance in modern information retrieval (IR) systems. A common\napproach is to sort the ranking list by prompting LLMs for pairwise comparison.\nHowever, sorting-based methods require consistent comparisons to correctly sort\nthe passages, which we show that LLMs often violate. We identify two kinds of\nintrinsic inconsistency in LLM-based pairwise comparisons: order inconsistency\nwhich leads to conflicting results when switching the passage order, and\ntransitive inconsistency which leads to non-transitive triads among all\npreference pairs. In this paper, we propose LLM-RankFusion, an LLM-based\nranking framework that mitigates these inconsistencies and produces a robust\nranking list. LLM-RankFusion mitigates order inconsistency using in-context\nlearning (ICL) to demonstrate order-agnostic comparisons and calibration to\nestimate the underlying preference probability between two passages. We then\naddress transitive inconsistency by aggregating the ranking results from\nmultiple rankers. In our experiments, we empirically show that LLM-RankFusion\ncan significantly reduce inconsistent pairwise comparison results, and improve\nthe ranking quality by making the final ranking list more robust.\n","authors":["Yifan Zeng","Ojas Tendolkar","Raymond Baartmans","Qingyun Wu","Huazheng Wang","Lizhong Chen"],"pdf_url":"https://arxiv.org/pdf/2406.00231v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.00198v1","updated":"2024-05-31T21:19:41Z","published":"2024-05-31T21:19:41Z","title":"ImplicitSLIM and How it Improves Embedding-based Collaborative Filtering","summary":"  We present ImplicitSLIM, a novel unsupervised learning approach for sparse\nhigh-dimensional data, with applications to collaborative filtering. Sparse\nlinear methods (SLIM) and their variations show outstanding performance, but\nthey are memory-intensive and hard to scale. ImplicitSLIM improves\nembedding-based models by extracting embeddings from SLIM-like models in a\ncomputationally cheap and memory-efficient way, without explicit learning of\nheavy SLIM-like models. We show that ImplicitSLIM improves performance and\nspeeds up convergence for both state of the art and classical collaborative\nfiltering methods. The source code for ImplicitSLIM, related models, and\napplications is available at https://github.com/ilya-shenbin/ImplicitSLIM.\n","authors":["Ilya Shenbin","Sergey Nikolenko"],"pdf_url":"https://arxiv.org/pdf/2406.00198v1.pdf","comment":"Published as a conference paper at ICLR 2024; authors' version"}],"Machine Learning":[{"id":"http://arxiv.org/abs/2402.07131v2","updated":"2024-05-31T17:59:36Z","published":"2024-02-11T08:59:02Z","title":"Resampling methods for Private Statistical Inference","summary":"  We consider the task of constructing confidence intervals with differential\nprivacy. We propose two private variants of the non-parametric bootstrap, which\nprivately compute the median of the results of multiple \"little\" bootstraps run\non partitions of the data and give asymptotic bounds on the coverage error of\nthe resulting confidence intervals. For a fixed differential privacy parameter\n$\\epsilon$, our methods enjoy the same error rates as that of the non-private\nbootstrap to within logarithmic factors in the sample size $n$. We empirically\nvalidate the performance of our methods for mean estimation, median estimation,\nand logistic regression with both real and synthetic data. Our methods achieve\nsimilar coverage accuracy to existing methods (and non-private baselines) while\nproviding notably shorter ($\\gtrsim 10$ times) confidence intervals than\nprevious approaches.\n","authors":["Karan Chadha","John Duchi","Rohith Kuditipudi"],"pdf_url":"https://arxiv.org/pdf/2402.07131v2.pdf","comment":"45 pages"},{"id":"http://arxiv.org/abs/2405.21070v1","updated":"2024-05-31T17:57:24Z","published":"2024-05-31T17:57:24Z","title":"Generalization Beyond Data Imbalance: A Controlled Study on CLIP for\n  Transferable Insights","summary":"  Severe data imbalance naturally exists among web-scale vision-language\ndatasets. Despite this, we find CLIP pre-trained thereupon exhibits notable\nrobustness to the data imbalance compared to supervised learning, and\ndemonstrates significant effectiveness in learning generalizable\nrepresentations. With an aim to investigate the reasons behind this finding, we\nconduct controlled experiments to study various underlying factors, and reveal\nthat CLIP's pretext task forms a dynamic classification problem wherein only a\nsubset of classes is present in training. This isolates the bias from dominant\nclasses and implicitly balances the learning signal. Furthermore, the\nrobustness and discriminability of CLIP improve with more descriptive language\nsupervision, larger data scale, and broader open-world concepts, which are\ninaccessible to supervised learning. Our study not only uncovers the mechanisms\nbehind CLIP's generalizability beyond data imbalance but also provides\ntransferable insights for the research community. The findings are validated in\nboth supervised and self-supervised learning, enabling models trained on\nimbalanced data to achieve CLIP-level performance on diverse recognition tasks.\nCode will be available at: https://github.com/CVMI-Lab/clip-beyond-tail.\n","authors":["Xin Wen","Bingchen Zhao","Yilun Chen","Jiangmiao Pang","Xiaojuan Qi"],"pdf_url":"https://arxiv.org/pdf/2405.21070v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.00752v2","updated":"2024-05-31T17:55:27Z","published":"2023-12-01T18:01:34Z","title":"Mamba: Linear-Time Sequence Modeling with Selective State Spaces","summary":"  Foundation models, now powering most of the exciting applications in deep\nlearning, are almost universally based on the Transformer architecture and its\ncore attention module. Many subquadratic-time architectures such as linear\nattention, gated convolution and recurrent models, and structured state space\nmodels (SSMs) have been developed to address Transformers' computational\ninefficiency on long sequences, but they have not performed as well as\nattention on important modalities such as language. We identify that a key\nweakness of such models is their inability to perform content-based reasoning,\nand make several improvements. First, simply letting the SSM parameters be\nfunctions of the input addresses their weakness with discrete modalities,\nallowing the model to selectively propagate or forget information along the\nsequence length dimension depending on the current token. Second, even though\nthis change prevents the use of efficient convolutions, we design a\nhardware-aware parallel algorithm in recurrent mode. We integrate these\nselective SSMs into a simplified end-to-end neural network architecture without\nattention or even MLP blocks (Mamba). Mamba enjoys fast inference (5$\\times$\nhigher throughput than Transformers) and linear scaling in sequence length, and\nits performance improves on real data up to million-length sequences. As a\ngeneral sequence model backbone, Mamba achieves state-of-the-art performance\nacross several modalities such as language, audio, and genomics. On language\nmodeling, our Mamba-3B model outperforms Transformers of the same size and\nmatches Transformers twice its size, both in pretraining and downstream\nevaluation.\n","authors":["Albert Gu","Tri Dao"],"pdf_url":"https://arxiv.org/pdf/2312.00752v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.21064v1","updated":"2024-05-31T17:53:00Z","published":"2024-05-31T17:53:00Z","title":"Recurrent neural networks: vanishing and exploding gradients are not the\n  end of the story","summary":"  Recurrent neural networks (RNNs) notoriously struggle to learn long-term\nmemories, primarily due to vanishing and exploding gradients. The recent\nsuccess of state-space models (SSMs), a subclass of RNNs, to overcome such\ndifficulties challenges our theoretical understanding. In this paper, we delve\ninto the optimization challenges of RNNs and discover that, as the memory of a\nnetwork increases, changes in its parameters result in increasingly large\noutput variations, making gradient-based learning highly sensitive, even\nwithout exploding gradients. Our analysis further reveals the importance of the\nelement-wise recurrence design pattern combined with careful parametrizations\nin mitigating this effect. This feature is present in SSMs, as well as in other\narchitectures, such as LSTMs. Overall, our insights provide a new explanation\nfor some of the difficulties in gradient-based learning of RNNs and why some\narchitectures perform better than others.\n","authors":["Nicolas Zucchet","Antonio Orvieto"],"pdf_url":"https://arxiv.org/pdf/2405.21064v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.21063v1","updated":"2024-05-31T17:51:07Z","published":"2024-05-31T17:51:07Z","title":"Neural Network Verification with Branch-and-Bound for General\n  Nonlinearities","summary":"  Branch-and-bound (BaB) is among the most effective methods for neural network\n(NN) verification. However, existing works on BaB have mostly focused on NNs\nwith piecewise linear activations, especially ReLU networks. In this paper, we\ndevelop a general framework, named GenBaB, to conduct BaB for general\nnonlinearities in general computational graphs based on linear bound\npropagation. To decide which neuron to branch, we design a new branching\nheuristic which leverages linear bounds as shortcuts to efficiently estimate\nthe potential improvement after branching. To decide nontrivial branching\npoints for general nonlinear functions, we propose to optimize branching points\noffline, which can be efficiently leveraged during verification with a lookup\ntable. We demonstrate the effectiveness of our GenBaB on verifying a wide range\nof NNs, including networks with activation functions such as Sigmoid, Tanh,\nSine and GeLU, as well as networks involving multi-dimensional nonlinear\noperations such as multiplications in LSTMs and Vision Transformers. Our\nframework also allows the verification of general nonlinear computation graphs\nand enables verification applications beyond simple neural networks,\nparticularly for AC Optimal Power Flow (ACOPF). GenBaB is part of the latest\n$\\alpha,\\!\\beta$-CROWN, the winner of the 4th International Verification of\nNeural Networks Competition (VNN-COMP 2023).\n","authors":["Zhouxing Shi","Qirui Jin","Zico Kolter","Suman Jana","Cho-Jui Hsieh","Huan Zhang"],"pdf_url":"https://arxiv.org/pdf/2405.21063v1.pdf","comment":"Preprint"},{"id":"http://arxiv.org/abs/2405.21061v1","updated":"2024-05-31T17:50:27Z","published":"2024-05-31T17:50:27Z","title":"Graph External Attention Enhanced Transformer","summary":"  The Transformer architecture has recently gained considerable attention in\nthe field of graph representation learning, as it naturally overcomes several\nlimitations of Graph Neural Networks (GNNs) with customized attention\nmechanisms or positional and structural encodings. Despite making some\nprogress, existing works tend to overlook external information of graphs,\nspecifically the correlation between graphs. Intuitively, graphs with similar\nstructures should have similar representations. Therefore, we propose Graph\nExternal Attention (GEA) -- a novel attention mechanism that leverages multiple\nexternal node/edge key-value units to capture inter-graph correlations\nimplicitly. On this basis, we design an effective architecture called Graph\nExternal Attention Enhanced Transformer (GEAET), which integrates local\nstructure and global interaction information for more comprehensive graph\nrepresentations. Extensive experiments on benchmark datasets demonstrate that\nGEAET achieves state-of-the-art empirical performance. The source code is\navailable for reproducibility at: https://github.com/icm1018/GEAET.\n","authors":["Jianqing Liang","Min Chen","Jiye Liang"],"pdf_url":"https://arxiv.org/pdf/2405.21061v1.pdf","comment":"In Proceedings of ICML 2024"},{"id":"http://arxiv.org/abs/2405.21060v1","updated":"2024-05-31T17:50:01Z","published":"2024-05-31T17:50:01Z","title":"Transformers are SSMs: Generalized Models and Efficient Algorithms\n  Through Structured State Space Duality","summary":"  While Transformers have been the main architecture behind deep learning's\nsuccess in language modeling, state-space models (SSMs) such as Mamba have\nrecently been shown to match or outperform Transformers at small to medium\nscale. We show that these families of models are actually quite closely\nrelated, and develop a rich framework of theoretical connections between SSMs\nand variants of attention, connected through various decompositions of a\nwell-studied class of structured semiseparable matrices. Our state space\nduality (SSD) framework allows us to design a new architecture (Mamba-2) whose\ncore layer is an a refinement of Mamba's selective SSM that is 2-8X faster,\nwhile continuing to be competitive with Transformers on language modeling.\n","authors":["Tri Dao","Albert Gu"],"pdf_url":"https://arxiv.org/pdf/2405.21060v1.pdf","comment":"ICML 2024"},{"id":"http://arxiv.org/abs/2402.15938v3","updated":"2024-05-31T17:49:03Z","published":"2024-02-24T23:54:41Z","title":"Generalization or Memorization: Data Contamination and Trustworthy\n  Evaluation for Large Language Models","summary":"  Recent statements about the impressive capabilities of large language models\n(LLMs) are usually supported by evaluating on open-access benchmarks.\nConsidering the vast size and wide-ranging sources of LLMs' training data, it\ncould explicitly or implicitly include test data, leading to LLMs being more\nsusceptible to data contamination. However, due to the opacity of training\ndata, the black-box access of models, and the rapid growth of synthetic\ntraining data, detecting and mitigating data contamination for LLMs faces\nsignificant challenges. In this paper, we propose CDD, which stands for\nContamination Detection via output Distribution for LLMs. CDD necessitates only\nthe sampled texts to detect data contamination, by identifying the peakedness\nof LLM's output distribution. To mitigate the impact of data contamination in\nevaluation, we also present TED: Trustworthy Evaluation via output\nDistribution, based on the correction of LLM's output distribution. To\nfacilitate this study, we introduce two benchmarks, i.e., DetCon and ComiEval,\nfor data contamination detection and contamination mitigation evaluation tasks.\nExtensive experimental results show that CDD achieves the average relative\nimprovements of 21.8\\%-30.2\\% over other contamination detection approaches in\nterms of Accuracy, F1 Score, and AUC metrics, and can effectively detect\nimplicit contamination. TED substantially mitigates performance improvements up\nto 66.9\\% attributed to data contamination across various contamination setups.\nIn real-world applications, we reveal that ChatGPT exhibits a high potential to\nsuffer from data contamination on HumanEval benchmark.\n","authors":["Yihong Dong","Xue Jiang","Huanyu Liu","Zhi Jin","Bin Gu","Mengfei Yang","Ge Li"],"pdf_url":"https://arxiv.org/pdf/2402.15938v3.pdf","comment":"Accepted to ACL"},{"id":"http://arxiv.org/abs/2405.17697v2","updated":"2024-05-31T17:47:52Z","published":"2024-05-27T23:04:37Z","title":"P4: Towards private, personalized, and Peer-to-Peer learning","summary":"  Personalized learning is a proposed approach to address the problem of data\nheterogeneity in collaborative machine learning. In a decentralized setting,\nthe two main challenges of personalization are client clustering and data\nprivacy. In this paper, we address these challenges by developing P4\n(Personalized Private Peer-to-Peer) a method that ensures that each client\nreceives a personalized model while maintaining differential privacy guarantee\nof each client's local dataset during and after the training. Our approach\nincludes the design of a lightweight algorithm to identify similar clients and\ngroup them in a private, peer-to-peer (P2P) manner. Once grouped, we develop\ndifferentially-private knowledge distillation for clients to co-train with\nminimal impact on accuracy. We evaluate our proposed method on three benchmark\ndatasets (FEMNIST or Federated EMNIST, CIFAR-10 and CIFAR-100) and two\ndifferent neural network architectures (Linear and CNN-based networks) across a\nrange of privacy parameters. The results demonstrate the potential of P4, as it\noutperforms the state-of-the-art of differential private P2P by up to 40\npercent in terms of accuracy. We also show the practicality of P4 by\nimplementing it on resource constrained devices, and validating that it has\nminimal overhead, e.g., about 7 seconds to run collaborative training between\ntwo clients.\n","authors":["Mohammad Mahdi Maheri","Sandra Siby","Sina Abdollahi","Anastasia Borovykh","Hamed Haddadi"],"pdf_url":"https://arxiv.org/pdf/2405.17697v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.04240v2","updated":"2024-05-31T17:43:54Z","published":"2024-04-05T17:41:52Z","title":"Dynamic Conditional Optimal Transport through Simulation-Free Flows","summary":"  We study the geometry of conditional optimal transport (COT) and prove a\ndynamical formulation which generalizes the Benamou-Brenier Theorem. Equipped\nwith these tools, we propose a simulation-free flow-based method for\nconditional generative modeling. Our method couples an arbitrary source\ndistribution to a specified target distribution through a triangular COT plan,\nand a conditional generative model is obtained by approximating the geodesic\npath of measures induced by this COT plan. Our theory and methods are\napplicable in infinite-dimensional settings, making them well suited for a wide\nclass of Bayesian inverse problems. Empirically, we demonstrate that our method\nis competitive on several challenging conditional generation tasks, including\nan infinite-dimensional inverse problem.\n","authors":["Gavin Kerrigan","Giosue Migliorini","Padhraic Smyth"],"pdf_url":"https://arxiv.org/pdf/2404.04240v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.21050v1","updated":"2024-05-31T17:43:35Z","published":"2024-05-31T17:43:35Z","title":"Spectrum-Aware Parameter Efficient Fine-Tuning for Diffusion Models","summary":"  Adapting large-scale pre-trained generative models in a parameter-efficient\nmanner is gaining traction. Traditional methods like low rank adaptation\nachieve parameter efficiency by imposing constraints but may not be optimal for\ntasks requiring high representation capacity. We propose a novel spectrum-aware\nadaptation framework for generative models. Our method adjusts both singular\nvalues and their basis vectors of pretrained weights. Using the Kronecker\nproduct and efficient Stiefel optimizers, we achieve parameter-efficient\nadaptation of orthogonal matrices. We introduce Spectral Orthogonal\nDecomposition Adaptation (SODA), which balances computational efficiency and\nrepresentation capacity. Extensive evaluations on text-to-image diffusion\nmodels demonstrate SODA's effectiveness, offering a spectrum-aware alternative\nto existing fine-tuning methods.\n","authors":["Xinxi Zhang","Song Wen","Ligong Han","Felix Juefei-Xu","Akash Srivastava","Junzhou Huang","Hao Wang","Molei Tao","Dimitris N. Metaxas"],"pdf_url":"https://arxiv.org/pdf/2405.21050v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.21047v1","updated":"2024-05-31T17:39:15Z","published":"2024-05-31T17:39:15Z","title":"Grammar-Aligned Decoding","summary":"  Large Language Models (LLMs) struggle with reliably generating highly\nstructured outputs, such as program code, mathematical formulas, or well-formed\nmarkup. Constrained decoding approaches mitigate this problem by greedily\nrestricting what tokens an LLM can output at each step to guarantee that the\noutput matches a given constraint. Specifically, in grammar-constrained\ndecoding (GCD), the LLM's output must follow a given grammar. In this paper we\ndemonstrate that GCD techniques (and in general constrained decoding\ntechniques) can distort the LLM's distribution, leading to outputs that are\ngrammatical but appear with likelihoods that are not proportional to the ones\ngiven by the LLM, and so ultimately are low-quality. We call the problem of\naligning sampling with a grammar constraint, grammar-aligned decoding (GAD),\nand propose adaptive sampling with approximate expected futures (ASAp), a\ndecoding algorithm that guarantees the output to be grammatical while provably\nproducing outputs that match the conditional probability of the LLM's\ndistribution conditioned on the given grammar constraint. Our algorithm uses\nprior sample outputs to soundly overapproximate the future grammaticality of\ndifferent output prefixes. Our evaluation on code generation and structured NLP\ntasks shows how ASAp often produces outputs with higher likelihood (according\nto the LLM's distribution) than existing GCD techniques, while still enforcing\nthe desired grammatical constraints.\n","authors":["Kanghee Park","Jiayu Wang","Taylor Berg-Kirkpatrick","Nadia Polikarpova","Loris D'Antoni"],"pdf_url":"https://arxiv.org/pdf/2405.21047v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.21046v1","updated":"2024-05-31T17:39:06Z","published":"2024-05-31T17:39:06Z","title":"Exploratory Preference Optimization: Harnessing Implicit\n  Q*-Approximation for Sample-Efficient RLHF","summary":"  Reinforcement learning from human feedback (RLHF) has emerged as a central\ntool for language model alignment. We consider online exploration in RLHF,\nwhich exploits interactive access to human or AI feedback by deliberately\nencouraging the model to produce diverse, maximally informative responses. By\nallowing RLHF to confidently stray from the pre-trained model, online\nexploration offers the possibility of novel, potentially super-human\ncapabilities, but its full potential as a paradigm for language model training\nhas yet to be realized, owing to computational and statistical bottlenecks in\ndirectly adapting existing reinforcement learning techniques. We propose a new\nalgorithm for online exploration in RLHF, Exploratory Preference Optimization\n(XPO), which is simple and practical -- a one-line change to (online) Direct\nPreference Optimization (DPO; Rafailov et al., 2023) -- yet enjoys the\nstrongest known provable guarantees and promising empirical performance. XPO\naugments the DPO objective with a novel and principled exploration bonus,\nempowering the algorithm to explore outside the support of the initial model\nand human feedback data. In theory, we show that XPO is provably\nsample-efficient and converges to a near-optimal language model policy under\nnatural exploration conditions, irrespective of whether the initial model has\ngood coverage. Our analysis, which builds on the observation that DPO\nimplicitly performs a form of $Q^{\\star}$-approximation (or, Bellman error\nminimization), combines previously disparate techniques from language modeling\nand theoretical reinforcement learning in a serendipitous fashion through the\nperspective of KL-regularized Markov decision processes. Empirically, we find\nthat XPO is more sample-efficient than non-exploratory DPO variants in a\npreliminary evaluation.\n","authors":["Tengyang Xie","Dylan J. Foster","Akshay Krishnamurthy","Corby Rosset","Ahmed Awadallah","Alexander Rakhlin"],"pdf_url":"https://arxiv.org/pdf/2405.21046v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.18239v2","updated":"2024-05-31T17:38:51Z","published":"2024-04-28T16:31:32Z","title":"SOUL: Unlocking the Power of Second-Order Optimization for LLM\n  Unlearning","summary":"  Large Language Models (LLMs) have highlighted the necessity of effective\nunlearning mechanisms to comply with data regulations and ethical AI practices.\nLLM unlearning aims at removing undesired data influences and associated model\ncapabilities without compromising utility out of the scope of unlearning. While\ninterest in studying LLM unlearning is growing,the impact of the optimizer\nchoice for LLM unlearning remains under-explored. In this work, we shed light\non the significance of optimizer selection in LLM unlearning for the first\ntime, establishing a clear connection between {second-order optimization} and\ninfluence unlearning (a classical approach using influence functions to update\nthe model for data influence removal). This insight propels us to develop a\nsecond-order unlearning framework, termed SOUL, built upon the second-order\nclipped stochastic optimization (Sophia)-based LLM training method. SOUL\nextends the static, one-shot model update using influence unlearning to a\ndynamic, iterative unlearning process. Our extensive experiments show that SOUL\nconsistently outperforms conventional first-order methods across various\nunlearning tasks, models, and metrics, suggesting the promise of second-order\noptimization in providing a scalable and easily implementable solution for LLM\nunlearning.\n","authors":["Jinghan Jia","Yihua Zhang","Yimeng Zhang","Jiancheng Liu","Bharat Runwal","James Diffenderfer","Bhavya Kailkhura","Sijia Liu"],"pdf_url":"https://arxiv.org/pdf/2404.18239v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.21045v1","updated":"2024-05-31T17:38:49Z","published":"2024-05-31T17:38:49Z","title":"An Attention-Based Multi-Context Convolutional Encoder-Decoder Neural\n  Network for Work Zone Traffic Impact Prediction","summary":"  Work zone is one of the major causes of non-recurrent traffic congestion and\nroad incidents. Despite the significance of its impact, studies on predicting\nthe traffic impact of work zones remain scarce. In this paper, we propose a\ndata integration pipeline that enhances the utilization of work zone and\ntraffic data from diversified platforms, and introduce a novel deep learning\nmodel to predict the traffic speed and incident likelihood during planned work\nzone events. The proposed model transforms traffic patterns into 2D space-time\nimages for both model input and output and employs an attention-based\nmulti-context convolutional encoder-decoder architecture to capture the\nspatial-temporal dependencies between work zone events and traffic variations.\nTrained and validated on four years of archived work zone traffic data from\nMaryland, USA, the model demonstrates superior performance over baseline models\nin predicting traffic speed, incident likelihood, and inferred traffic\nattributes such as queue length and congestion timings (i.e., start time and\nduration). Specifically, the proposed model outperforms the baseline models by\nreducing the prediction error of traffic speed by 5% to 34%, queue length by\n11% to 29%, congestion timing by 6% to 17%, and increasing the accuracy of\nincident predictions by 5% to 7%. Consequently, this model offers substantial\npromise for enhancing the planning and traffic management of work zones.\n","authors":["Qinhua Jiang","Xishun Liao","Yaofa Gong","Jiaqi Ma"],"pdf_url":"https://arxiv.org/pdf/2405.21045v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.21043v1","updated":"2024-05-31T17:36:16Z","published":"2024-05-31T17:36:16Z","title":"Target Networks and Over-parameterization Stabilize Off-policy\n  Bootstrapping with Function Approximation","summary":"  We prove that the combination of a target network and over-parameterized\nlinear function approximation establishes a weaker convergence condition for\nbootstrapped value estimation in certain cases, even with off-policy data. Our\ncondition is naturally satisfied for expected updates over the entire\nstate-action space or learning with a batch of complete trajectories from\nepisodic Markov decision processes. Notably, using only a target network or an\nover-parameterized model does not provide such a convergence guarantee.\nAdditionally, we extend our results to learning with truncated trajectories,\nshowing that convergence is achievable for all tasks with minor modifications,\nakin to value truncation for the final states in trajectories. Our primary\nresult focuses on temporal difference estimation for prediction, providing\nhigh-probability value estimation error bounds and empirical analysis on\nBaird's counterexample and a Four-room task. Furthermore, we explore the\ncontrol setting, demonstrating that similar convergence conditions apply to\nQ-learning.\n","authors":["Fengdi Che","Chenjun Xiao","Jincheng Mei","Bo Dai","Ramki Gummadi","Oscar A Ramirez","Christopher K Harris","A. Rupam Mahmood","Dale Schuurmans"],"pdf_url":"https://arxiv.org/pdf/2405.21043v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.21042v1","updated":"2024-05-31T17:33:07Z","published":"2024-05-31T17:33:07Z","title":"Comparing information content of representation spaces for\n  disentanglement with VAE ensembles","summary":"  Disentanglement is the endeavour to use machine learning to divide\ninformation about a dataset into meaningful fragments. In practice these\nfragments are representation (sub)spaces, often the set of channels in the\nlatent space of a variational autoencoder (VAE). Assessments of disentanglement\npredominantly employ metrics that are coarse-grained at the model level, but\nthis approach can obscure much about the process of information fragmentation.\nHere we propose to study the learned channels in aggregate, as the fragments of\ninformation learned by an ensemble of repeat training runs. Additionally, we\ndepart from prior work where measures of similarity between individual\nsubspaces neglected the nature of data embeddings as probability distributions.\nInstead, we view representation subspaces as communication channels that\nperform a soft clustering of the data; consequently, we generalize two classic\ninformation-theoretic measures of similarity between clustering assignments to\ncompare representation spaces. We develop a lightweight method of estimation\nbased on fingerprinting representation subspaces by their ability to\ndistinguish dataset samples, allowing us to identify, analyze, and leverage\nmeaningful structure in ensembles of VAEs trained on synthetic and natural\ndatasets. Using this fully unsupervised pipeline we identify \"hotspots\" in the\nspace of information fragments: groups of nearly identical representation\nsubspaces that appear repeatedly in an ensemble of VAEs, particularly as\nregularization is increased. Finally, we leverage the proposed methodology to\nachieve ensemble learning with VAEs, boosting the information content of a set\nof weak learners -- a capability not possible with previous methods of\nassessing channel similarity.\n","authors":["Kieran A. Murphy","Sam Dillavou","Dani S. Bassett"],"pdf_url":"https://arxiv.org/pdf/2405.21042v1.pdf","comment":"Code:\n  https://github.com/murphyka/representation-space-info-comparison"},{"id":"http://arxiv.org/abs/2402.09615v3","updated":"2024-05-31T17:31:38Z","published":"2024-02-14T23:09:15Z","title":"API Pack: A Massive Multi-Programming Language Dataset for API Call\n  Generation","summary":"  We introduce API Pack, a massive multi-programming language dataset\ncontaining more than 1 million instruction-API call pairs to improve the API\ncall generation capabilities of large language models. By fine-tuning\nCodeLlama-13B on 20,000 Python instances from API Pack, we achieved around 10%\nand 5% higher accuracy compared to GPT-3.5 and GPT-4, respectively, in\ngenerating unseen API calls. Fine-tuning on API Pack enables cross-programming\nlanguage generalization by leveraging a large amount of data in one language\nand small amounts of data from other languages. Scaling the training data to 1\nmillion instances further improves the model's generalization to new APIs not\nencountered during training. We open-source the API Pack dataset, trained\nmodels, and associated source code at https://github.com/zguo0525/API-Pack to\nfacilitate further research.\n","authors":["Zhen Guo","Adriana Meza Soria","Wei Sun","Yikang Shen","Rameswar Panda"],"pdf_url":"https://arxiv.org/pdf/2402.09615v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.21036v1","updated":"2024-05-31T17:29:39Z","published":"2024-05-31T17:29:39Z","title":"A-PETE: Adaptive Prototype Explanations of Tree Ensembles","summary":"  The need for interpreting machine learning models is addressed through\nprototype explanations within the context of tree ensembles. An algorithm named\nAdaptive Prototype Explanations of Tree Ensembles (A-PETE) is proposed to\nautomatise the selection of prototypes for these classifiers. Its unique\ncharacteristics is using a specialised distance measure and a modified k-medoid\napproach. Experiments demonstrated its competitive predictive accuracy with\nrespect to earlier explanation algorithms. It also provides a a sufficient\nnumber of prototypes for the purpose of interpreting the random forest\nclassifier.\n","authors":["Jacek Karolczak","Jerzy Stefanowski"],"pdf_url":"https://arxiv.org/pdf/2405.21036v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.08097v2","updated":"2024-05-31T17:20:29Z","published":"2024-02-12T22:34:53Z","title":"An Accelerated Gradient Method for Convex Smooth Simple Bilevel\n  Optimization","summary":"  In this paper, we focus on simple bilevel optimization problems, where we\nminimize a convex smooth objective function over the optimal solution set of\nanother convex smooth constrained optimization problem. We present a novel\nbilevel optimization method that locally approximates the solution set of the\nlower-level problem using a cutting plane approach and employs an accelerated\ngradient-based update to reduce the upper-level objective function over the\napproximated solution set. We measure the performance of our method in terms of\nsuboptimality and infeasibility errors and provide non-asymptotic convergence\nguarantees for both error criteria. Specifically, when the feasible set is\ncompact, we show that our method requires at most\n$\\mathcal{O}(\\max\\{1/\\sqrt{\\epsilon_{f}}, 1/\\epsilon_g\\})$ iterations to find a\nsolution that is $\\epsilon_f$-suboptimal and $\\epsilon_g$-infeasible. Moreover,\nunder the additional assumption that the lower-level objective satisfies the\n$r$-th H\\\"olderian error bound, we show that our method achieves an iteration\ncomplexity of\n$\\mathcal{O}(\\max\\{\\epsilon_{f}^{-\\frac{2r-1}{2r}},\\epsilon_{g}^{-\\frac{2r-1}{2r}}\\})$,\nwhich matches the optimal complexity of single-level convex constrained\noptimization when $r=1$.\n","authors":["Jincheng Cao","Ruichen Jiang","Erfan Yazdandoost Hamedani","Aryan Mokhtari"],"pdf_url":"https://arxiv.org/pdf/2402.08097v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.19961v2","updated":"2024-05-31T17:18:35Z","published":"2024-05-30T11:32:42Z","title":"Collective Variable Free Transition Path Sampling with Generative Flow\n  Network","summary":"  Understanding transition paths between meta-stable states in molecular\nsystems is fundamental for material design and drug discovery. However,\nsampling these paths via molecular dynamics simulations is computationally\nprohibitive due to the high-energy barriers between the meta-stable states.\nRecent machine learning approaches are often restricted to simple systems or\nrely on collective variables (CVs) extracted from expensive domain knowledge.\nIn this work, we propose to leverage generative flow networks (GFlowNets) to\nsample transition paths without relying on CVs. We reformulate the problem as\namortized energy-based sampling over molecular trajectories and train a bias\npotential by minimizing the squared log-ratio between the target distribution\nand the generator, derived from the flow matching objective of GFlowNets. Our\nevaluation on three proteins (Alanine Dipeptide, Polyproline, and Chignolin)\ndemonstrates that our approach, called TPS-GFN, generates more realistic and\ndiverse transition paths than the previous CV-free machine learning approach.\n","authors":["Kiyoung Seong","Seonghyun Park","Seonghwan Kim","Woo Youn Kim","Sungsoo Ahn"],"pdf_url":"https://arxiv.org/pdf/2405.19961v2.pdf","comment":"9 pages, 5 figures, 2 tables"},{"id":"http://arxiv.org/abs/2405.21027v1","updated":"2024-05-31T17:16:29Z","published":"2024-05-31T17:16:29Z","title":"Fusion-PSRO: Nash Policy Fusion for Policy Space Response Oracles","summary":"  For solving zero-sum games involving non-transitivity, a common approach is\nto maintain population policies to approximate the Nash Equilibrium (NE).\nPrevious research has shown that the Policy Space Response Oracle (PSRO) is an\neffective multi-agent reinforcement learning framework for these games.\nHowever, repeatedly training new policies from scratch to approximate the Best\nResponse (BR) to opponents' mixed policies at each iteration is inefficient and\ncostly. While some PSRO methods initialize a new BR policy by inheriting from\npast BR policies, this approach limits the exploration of new policies,\nespecially against challenging opponents.To address this issue, we propose\nFusion-PSRO, which uses model fusion to initialize the policy for better\napproximation to BR. With Top-k probabilities from NE, we select high-quality\nbase policies and fuse them into a new BR policy through model averaging. This\napproach allows the initialized policy to incorporate multiple expert policies,\nmaking it easier to handle difficult opponents compared to inheriting or\ninitializing from scratch. Additionally, our method only modifies the policy\ninitialization, enabling its application to nearly all PSRO variants without\nadditional training overhead.Our experiments with non-transitive matrix games,\nLeduc poker, and the more complex Liars Dice demonstrate that Fusion-PSRO\nenhances the performance of nearly all PSRO variants, achieving lower\nexploitability.\n","authors":["Jiesong Lian","Yucong Huang","Mingzhi Wang","Chengdong Ma","Yixue Hao","Ying Wen","Yaodong Yang"],"pdf_url":"https://arxiv.org/pdf/2405.21027v1.pdf","comment":"20 pages, 5 figures"},{"id":"http://arxiv.org/abs/2405.21021v1","updated":"2024-05-31T17:09:07Z","published":"2024-05-31T17:09:07Z","title":"Beyond Conventional Parametric Modeling: Data-Driven Framework for\n  Estimation and Prediction of Time Activity Curves in Dynamic PET Imaging","summary":"  Dynamic Positron Emission Tomography (dPET) imaging and Time-Activity Curve\n(TAC) analyses are essential for understanding and quantifying the\nbiodistribution of radiopharmaceuticals over time and space. Traditional\ncompartmental modeling, while foundational, commonly struggles to fully capture\nthe complexities of biological systems, including non-linear dynamics and\nvariability. This study introduces an innovative data-driven neural\nnetwork-based framework, inspired by Reaction Diffusion systems, designed to\naddress these limitations. Our approach, which adaptively fits TACs from dPET,\nenables the direct calibration of diffusion coefficients and reaction terms\nfrom observed data, offering significant improvements in predictive accuracy\nand robustness over traditional methods, especially in complex biological\nscenarios. By more accurately modeling the spatio-temporal dynamics of\nradiopharmaceuticals, our method advances modeling of pharmacokinetic and\npharmacodynamic processes, enabling new possibilities in quantitative nuclear\nmedicine.\n","authors":["Niloufar Zakariaei","Arman Rahmim","Eldad Haber"],"pdf_url":"https://arxiv.org/pdf/2405.21021v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.21018v1","updated":"2024-05-31T17:07:15Z","published":"2024-05-31T17:07:15Z","title":"Improved Techniques for Optimization-Based Jailbreaking on Large\n  Language Models","summary":"  Large language models (LLMs) are being rapidly developed, and a key component\nof their widespread deployment is their safety-related alignment. Many\nred-teaming efforts aim to jailbreak LLMs, where among these efforts, the\nGreedy Coordinate Gradient (GCG) attack's success has led to a growing interest\nin the study of optimization-based jailbreaking techniques. Although GCG is a\nsignificant milestone, its attacking efficiency remains unsatisfactory. In this\npaper, we present several improved (empirical) techniques for\noptimization-based jailbreaks like GCG. We first observe that the single target\ntemplate of \"Sure\" largely limits the attacking performance of GCG; given this,\nwe propose to apply diverse target templates containing harmful self-suggestion\nand/or guidance to mislead LLMs. Besides, from the optimization aspects, we\npropose an automatic multi-coordinate updating strategy in GCG (i.e.,\nadaptively deciding how many tokens to replace in each step) to accelerate\nconvergence, as well as tricks like easy-to-hard initialisation. Then, we\ncombine these improved technologies to develop an efficient jailbreak method,\ndubbed $\\mathcal{I}$-GCG. In our experiments, we evaluate on a series of\nbenchmarks (such as NeurIPS 2023 Red Teaming Track). The results demonstrate\nthat our improved techniques can help GCG outperform state-of-the-art\njailbreaking attacks and achieve nearly 100% attack success rate. The code is\nreleased at https://github.com/jiaxiaojunQAQ/I-GCG.\n","authors":["Xiaojun Jia","Tianyu Pang","Chao Du","Yihao Huang","Jindong Gu","Yang Liu","Xiaochun Cao","Min Lin"],"pdf_url":"https://arxiv.org/pdf/2405.21018v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2205.00825v4","updated":"2024-05-31T17:07:04Z","published":"2022-04-27T05:03:45Z","title":"Stochastic Online Fisher Markets: Static Pricing Limits and Adaptive\n  Enhancements","summary":"  Fisher markets are one of the most fundamental models for resource\nallocation. However, the problem of computing equilibrium prices in Fisher\nmarkets typically relies on complete knowledge of users' budgets and utility\nfunctions and requires transactions to happen in a static market where all\nusers are present simultaneously. Motivated by these practical considerations,\nwe study an online variant of Fisher markets, wherein users with privately\nknown utility and budget parameters, drawn i.i.d. from a distribution, arrive\nsequentially. In this setting, we first study the limitations of static pricing\nalgorithms, which set uniform prices for all users, along two performance\nmetrics: (i) regret, i.e., the optimality gap in the objective of the\nEisenberg-Gale program between an online algorithm and an oracle with complete\ninformation, and (ii) capacity violations, i.e., the over-consumption of goods\nrelative to their capacities. Given the limitations of static pricing, we\ndesign adaptive posted-pricing algorithms, one with knowledge of the\ndistribution of users' budget and utility parameters and another that adjusts\nprices solely based on past observations of user consumption, i.e., revealed\npreference feedback, with improved performance guarantees. Finally, we present\nnumerical experiments to compare our revealed preference algorithm's\nperformance to several benchmarks.\n","authors":["Devansh Jalota","Yinyu Ye"],"pdf_url":"https://arxiv.org/pdf/2205.00825v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.18418v2","updated":"2024-05-31T17:03:00Z","published":"2024-05-28T17:57:23Z","title":"Hierarchical World Models as Visual Whole-Body Humanoid Controllers","summary":"  Whole-body control for humanoids is challenging due to the high-dimensional\nnature of the problem, coupled with the inherent instability of a bipedal\nmorphology. Learning from visual observations further exacerbates this\ndifficulty. In this work, we explore highly data-driven approaches to visual\nwhole-body humanoid control based on reinforcement learning, without any\nsimplifying assumptions, reward design, or skill primitives. Specifically, we\npropose a hierarchical world model in which a high-level agent generates\ncommands based on visual observations for a low-level agent to execute, both of\nwhich are trained with rewards. Our approach produces highly performant control\npolicies in 8 tasks with a simulated 56-DoF humanoid, while synthesizing\nmotions that are broadly preferred by humans. Code and videos:\nhttps://nicklashansen.com/rlpuppeteer\n","authors":["Nicklas Hansen","Jyothir S V","Vlad Sobal","Yann LeCun","Xiaolong Wang","Hao Su"],"pdf_url":"https://arxiv.org/pdf/2405.18418v2.pdf","comment":"Code and videos at https://nicklashansen.com/rlpuppeteer"},{"id":"http://arxiv.org/abs/2305.09938v4","updated":"2024-05-31T17:02:37Z","published":"2023-05-17T03:52:40Z","title":"Mastering Long-Tail Complexity on Graphs: Characterization, Learning,\n  and Generalization","summary":"  In the context of long-tail classification on graphs, the vast majority of\nexisting work primarily revolves around the development of model debiasing\nstrategies, intending to mitigate class imbalances and enhance the overall\nperformance. Despite the notable success, there is very limited literature that\nprovides a theoretical tool for characterizing the behaviors of long-tail\nclasses in graphs and gaining insight into generalization performance in\nreal-world scenarios. To bridge this gap, we propose a generalization bound for\nlong-tail classification on graphs by formulating the problem in the fashion of\nmulti-task learning, i.e., each task corresponds to the prediction of one\nparticular class. Our theoretical results show that the generalization\nperformance of long-tail classification is dominated by the overall loss range\nand the task complexity. Building upon the theoretical findings, we propose a\nnovel generic framework HierTail for long-tail classification on graphs. In\nparticular, we start with a hierarchical task grouping module that allows us to\nassign related tasks into hypertasks and thus control the complexity of the\ntask space; then, we further design a balanced contrastive learning module to\nadaptively balance the gradients of both head and tail classes to control the\nloss range across all tasks in a unified fashion. Extensive experiments\ndemonstrate the effectiveness of HierTail in characterizing long-tail classes\non real graphs, which achieves up to 12.9% improvement over the leading\nbaseline method in accuracy.\n","authors":["Haohui Wang","Baoyu Jing","Kaize Ding","Yada Zhu","Wei Cheng","Si Zhang","Yonghui Fan","Liqing Zhang","Dawei Zhou"],"pdf_url":"https://arxiv.org/pdf/2305.09938v4.pdf","comment":"Accepted at KDD 2024"},{"id":"http://arxiv.org/abs/2405.21012v1","updated":"2024-05-31T16:52:51Z","published":"2024-05-31T16:52:51Z","title":"G-Transformer for Conditional Average Potential Outcome Estimation over\n  Time","summary":"  Estimating potential outcomes for treatments over time based on observational\ndata is important for personalized decision-making in medicine. Yet, existing\nneural methods for this task suffer from either (a) bias or (b) large variance.\nIn order to address both limitations, we introduce the G-transformer (GT). Our\nGT is a novel, neural end-to-end model designed for unbiased, low-variance\nestimation of conditional average potential outcomes (CAPOs) over time.\nSpecifically, our GT is the first neural model to perform regression-based\niterative G-computation for CAPOs in the time-varying setting. We evaluate the\neffectiveness of our GT across various experiments. In sum, this work\nrepresents a significant step towards personalized decision-making from\nelectronic health records.\n","authors":["Konstantin Hess","Dennis Frauen","Valentyn Melnychuk","Stefan Feuerriegel"],"pdf_url":"https://arxiv.org/pdf/2405.21012v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.21003v1","updated":"2024-05-31T16:44:40Z","published":"2024-05-31T16:44:40Z","title":"Explaining Predictions by Characteristic Rules","summary":"  Characteristic rules have been advocated for their ability to improve\ninterpretability over discriminative rules within the area of rule learning.\nHowever, the former type of rule has not yet been used by techniques for\nexplaining predictions. A novel explanation technique, called CEGA\n(Characteristic Explanatory General Association rules), is proposed, which\nemploys association rule mining to aggregate multiple explanations generated by\nany standard local explanation technique into a set of characteristic rules. An\nempirical investigation is presented, in which CEGA is compared to two\nstate-of-the-art methods, Anchors and GLocalX, for producing local and\naggregated explanations in the form of discriminative rules. The results\nsuggest that the proposed approach provides a better trade-off between fidelity\nand complexity compared to the two state-of-the-art approaches; CEGA and\nAnchors significantly outperform GLocalX with respect to fidelity, while CEGA\nand GLocalX significantly outperform Anchors with respect to the number of\ngenerated rules. The effect of changing the format of the explanations of CEGA\nto discriminative rules and using LIME and SHAP as local explanation techniques\ninstead of Anchors are also investigated. The results show that the\ncharacteristic explanatory rules still compete favorably with rules in the\nstandard discriminative format. The results also indicate that using CEGA in\ncombination with either SHAP or Anchors consistently leads to a higher fidelity\ncompared to using LIME as the local explanation technique.\n","authors":["Amr Alkhatib","Henrik Boström","Michalis Vazirgiannis"],"pdf_url":"https://arxiv.org/pdf/2405.21003v1.pdf","comment":"Machine Learning and Knowledge Discovery in Databases. ECML PKDD 2022"},{"id":"http://arxiv.org/abs/2212.07946v3","updated":"2024-05-31T16:40:03Z","published":"2022-12-15T16:28:06Z","title":"Active Inference and Reinforcement Learning: A unified inference on\n  continuous state and action spaces under partial observability","summary":"  Reinforcement learning (RL) has garnered significant attention for developing\ndecision-making agents that aim to maximize rewards, specified by an external\nsupervisor, within fully observable environments. However, many real-world\nproblems involve partial observations, formulated as partially observable\nMarkov decision processes (POMDPs). Previous studies have tackled RL in POMDPs\nby either incorporating the memory of past actions and observations or by\ninferring the true state of the environment from observed data. However,\naggregating observed data over time becomes impractical in continuous spaces.\nMoreover, inference-based RL approaches often require many samples to perform\nwell, as they focus solely on reward maximization and neglect uncertainty in\nthe inferred state. Active inference (AIF) is a framework formulated in POMDPs\nand directs agents to select actions by minimizing a function called expected\nfree energy (EFE). This supplies reward-maximizing (exploitative) behaviour, as\nin RL, with information-seeking (exploratory) behaviour. Despite this\nexploratory behaviour of AIF, its usage is limited to discrete spaces due to\nthe computational challenges associated with EFE. In this paper, we propose a\nunified principle that establishes a theoretical connection between AIF and RL,\nenabling seamless integration of these two approaches and overcoming their\naforementioned limitations in continuous space POMDP settings. We substantiate\nour findings with theoretical analysis, providing novel perspectives for\nutilizing AIF in the design of artificial agents. Experimental results\ndemonstrate the superior learning capabilities of our method in solving\ncontinuous space partially observable tasks. Notably, our approach harnesses\ninformation-seeking exploration, enabling it to effectively solve reward-free\nproblems and rendering explicit task reward design by an external supervisor\noptional.\n","authors":["Parvin Malekzadeh","Konstantinos N. Plataniotis"],"pdf_url":"https://arxiv.org/pdf/2212.07946v3.pdf","comment":"90 pages including appendices"},{"id":"http://arxiv.org/abs/2405.20993v1","updated":"2024-05-31T16:38:35Z","published":"2024-05-31T16:38:35Z","title":"Information limits and Thouless-Anderson-Palmer equations for spiked\n  matrix models with structured noise","summary":"  We consider a prototypical problem of Bayesian inference for a structured\nspiked model: a low-rank signal is corrupted by additive noise. While both\ninformation-theoretic and algorithmic limits are well understood when the noise\nis i.i.d. Gaussian, the more realistic case of structured noise still proves to\nbe challenging. To capture the structure while maintaining mathematical\ntractability, a line of work has focused on rotationally invariant noise.\nHowever, existing studies either provide sub-optimal algorithms or they are\nlimited to a special class of noise ensembles. In this paper, we establish the\nfirst characterization of the information-theoretic limits for a noise matrix\ndrawn from a general trace ensemble. These limits are then achieved by an\nefficient algorithm inspired by the theory of adaptive Thouless-Anderson-Palmer\n(TAP) equations. Our approach leverages tools from statistical physics (replica\nmethod) and random matrix theory (generalized spherical integrals), and it\nunveils the equivalence between the rotationally invariant model and a\nsurrogate Gaussian model.\n","authors":["Jean Barbier","Francesco Camilli","Marco Mondelli","Yizhou Xu"],"pdf_url":"https://arxiv.org/pdf/2405.20993v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.14622v3","updated":"2024-05-31T16:37:53Z","published":"2024-05-23T14:30:33Z","title":"Calibrated Self-Rewarding Vision Language Models","summary":"  Large Vision-Language Models (LVLMs) have made substantial progress by\nintegrating pre-trained large language models (LLMs) and vision models through\ninstruction tuning. Despite these advancements, LVLMs often exhibit the\nhallucination phenomenon, where generated text responses appear linguistically\nplausible but contradict the input image, indicating a misalignment between\nimage and text pairs. This misalignment arises because the model tends to\nprioritize textual information over visual input, even when both the language\nmodel and visual representations are of high quality. Existing methods leverage\nadditional models or human annotations to curate preference data and enhance\nmodality alignment through preference optimization. These approaches may not\neffectively reflect the target LVLM's preferences, making the curated\npreferences easily distinguishable. Our work addresses these challenges by\nproposing the Calibrated Self-Rewarding (CSR) approach, which enables the model\nto self-improve by iteratively generating candidate responses, evaluating the\nreward for each response, and curating preference data for fine-tuning. In the\nreward modeling, we employ a step-wise strategy and incorporate visual\nconstraints into the self-rewarding process to place greater emphasis on visual\ninput. Empirical results demonstrate that CSR enhances performance and reduces\nhallucinations across ten benchmarks and tasks, achieving substantial\nimprovements over existing methods by 7.62%. Our empirical results are further\nsupported by rigorous theoretical analysis, under mild assumptions, verifying\nthe effectiveness of introducing visual constraints into the self-rewarding\nparadigm. Additionally, CSR shows compatibility with different vision-language\nmodels and the ability to incrementally improve performance through iterative\nfine-tuning. Our data and code are available at\nhttps://github.com/YiyangZhou/CSR.\n","authors":["Yiyang Zhou","Zhiyuan Fan","Dongjie Cheng","Sihan Yang","Zhaorun Chen","Chenhang Cui","Xiyao Wang","Yun Li","Linjun Zhang","Huaxiu Yao"],"pdf_url":"https://arxiv.org/pdf/2405.14622v3.pdf","comment":"fix some typos and add acknowledgement section in V3"},{"id":"http://arxiv.org/abs/2405.20991v1","updated":"2024-05-31T16:35:41Z","published":"2024-05-31T16:35:41Z","title":"Hard Cases Detection in Motion Prediction by Vision-Language Foundation\n  Models","summary":"  Addressing hard cases in autonomous driving, such as anomalous road users,\nextreme weather conditions, and complex traffic interactions, presents\nsignificant challenges. To ensure safety, it is crucial to detect and manage\nthese scenarios effectively for autonomous driving systems. However, the rarity\nand high-risk nature of these cases demand extensive, diverse datasets for\ntraining robust models. Vision-Language Foundation Models (VLMs) have shown\nremarkable zero-shot capabilities as being trained on extensive datasets. This\nwork explores the potential of VLMs in detecting hard cases in autonomous\ndriving. We demonstrate the capability of VLMs such as GPT-4v in detecting hard\ncases in traffic participant motion prediction on both agent and scenario\nlevels. We introduce a feasible pipeline where VLMs, fed with sequential image\nframes with designed prompts, effectively identify challenging agents or\nscenarios, which are verified by existing prediction models. Moreover, by\ntaking advantage of this detection of hard cases by VLMs, we further improve\nthe training efficiency of the existing motion prediction pipeline by\nperforming data selection for the training samples suggested by GPT. We show\nthe effectiveness and feasibility of our pipeline incorporating VLMs with\nstate-of-the-art methods on NuScenes datasets. The code is accessible at\nhttps://github.com/KTH-RPL/Detect_VLM.\n","authors":["Yi Yang","Qingwen Zhang","Kei Ikemura","Nazre Batool","John Folkesson"],"pdf_url":"https://arxiv.org/pdf/2405.20991v1.pdf","comment":"IEEE Intelligent Vehicles Symposium (IV) 2024"},{"id":"http://arxiv.org/abs/2405.20990v1","updated":"2024-05-31T16:35:29Z","published":"2024-05-31T16:35:29Z","title":"Locking Machine Learning Models into Hardware","summary":"  Modern Machine Learning models are expensive IP and business competitiveness\noften depends on keeping this IP confidential. This in turn restricts how these\nmodels are deployed -- for example it is unclear how to deploy a model\non-device without inevitably leaking the underlying model. At the same time,\nconfidential computing technologies such as Multi-Party Computation or\nHomomorphic encryption remain impractical for wide adoption. In this paper we\ntake a different approach and investigate feasibility of ML-specific mechanisms\nthat deter unauthorized model use by restricting the model to only be usable on\nspecific hardware, making adoption on unauthorized hardware inconvenient. That\nway, even if IP is compromised, it cannot be trivially used without specialised\nhardware or major model adjustment. In a sense, we seek to enable cheap locking\nof machine learning models into specific hardware. We demonstrate that locking\nmechanisms are feasible by either targeting efficiency of model\nrepresentations, such making models incompatible with quantisation, or tie the\nmodel's operation on specific characteristics of hardware, such as number of\ncycles for arithmetic operations. We demonstrate that locking comes with\nnegligible work and latency overheads, while significantly restricting\nusability of the resultant model on unauthorized hardware.\n","authors":["Eleanor Clifford","Adhithya Saravanan","Harry Langford","Cheng Zhang","Yiren Zhao","Robert Mullins","Ilia Shumailov","Jamie Hayes"],"pdf_url":"https://arxiv.org/pdf/2405.20990v1.pdf","comment":"10 pages, 2 figures of main text; 14 pages, 16 figures of appendices"},{"id":"http://arxiv.org/abs/2405.20988v1","updated":"2024-05-31T16:34:11Z","published":"2024-05-31T16:34:11Z","title":"Communication-Efficient Distributed Deep Learning via Federated Dynamic\n  Averaging","summary":"  Driven by the ever-growing volume and decentralized nature of data, coupled\nwith the escalating size of modern models, distributed deep learning (DDL) has\nbeen entrenched as the preferred paradigm for training. However, frequent\nsynchronization of DL models, encompassing millions to many billions of\nparameters, creates a communication bottleneck, severely hindering scalability.\nWorse yet, DDL algorithms typically waste valuable bandwidth, and make\nthemselves less practical in bandwidth-constrained federated settings, by\nrelying on overly simplistic, periodic, and rigid synchronization schedules. To\naddress these shortcomings, we propose Federated Dynamic Averaging (FDA), a\ncommunication-efficient DDL strategy that dynamically triggers synchronization\nbased on the value of the model variance. Through extensive experiments across\na wide range of learning tasks we demonstrate that FDA reduces communication\ncost by orders of magnitude, compared to both traditional and cutting-edge\ncommunication-efficient algorithms. Remarkably, FDA achieves this without\nsacrificing convergence speed - in stark contrast to the trade-offs encountered\nin the field. Additionally, we show that FDA maintains robust performance\nacross diverse data heterogeneity settings.\n","authors":["Michail Theologitis","Georgios Frangias","Georgios Anestis","Vasilis Samoladas","Antonios Deligiannakis"],"pdf_url":"https://arxiv.org/pdf/2405.20988v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.20987v1","updated":"2024-05-31T16:33:20Z","published":"2024-05-31T16:33:20Z","title":"Early Stopping Criteria for Training Generative Adversarial Networks in\n  Biomedical Imaging","summary":"  Generative Adversarial Networks (GANs) have high computational costs to train\ntheir complex architectures. Throughout the training process, GANs' output is\nanalyzed qualitatively based on the loss and synthetic images' diversity and\nquality. Based on this qualitative analysis, training is manually halted once\nthe desired synthetic images are generated. By utilizing an early stopping\ncriterion, the computational cost and dependence on manual oversight can be\nreduced yet impacted by training problems such as mode collapse,\nnon-convergence, and instability. This is particularly prevalent in biomedical\nimagery, where training problems degrade the diversity and quality of synthetic\nimages, and the high computational cost associated with training makes complex\narchitectures increasingly inaccessible. This work proposes a novel early\nstopping criteria to quantitatively detect training problems, halt training,\nand reduce the computational costs associated with synthesizing biomedical\nimages. Firstly, the range of generator and discriminator loss values is\ninvestigated to assess whether mode collapse, non-convergence, and instability\noccur sequentially, concurrently, or interchangeably throughout the training of\nGANs. Secondly, utilizing these occurrences in conjunction with the Mean\nStructural Similarity Index (MS-SSIM) and Fr\\'echet Inception Distance (FID)\nscores of synthetic images forms the basis of the proposed early stopping\ncriteria. This work helps identify the occurrence of training problems in GANs\nusing low-resource computational cost and reduces training time to generate\ndiversified and high-quality synthetic images.\n","authors":["Muhammad Muneeb Saad","Mubashir Husain Rehmani","Ruairi O'Reilly"],"pdf_url":"https://arxiv.org/pdf/2405.20987v1.pdf","comment":"This paper is accepted at the 35th IEEE Irish Signals and Systems\n  Conference (ISSC 2024)"},{"id":"http://arxiv.org/abs/2405.20986v1","updated":"2024-05-31T16:32:46Z","published":"2024-05-31T16:32:46Z","title":"Uncertainty Quantification for Bird's Eye View Semantic Segmentation:\n  Methods and Benchmarks","summary":"  The fusion of raw features from multiple sensors on an autonomous vehicle to\ncreate a Bird's Eye View (BEV) representation is crucial for planning and\ncontrol systems. There is growing interest in using deep learning models for\nBEV semantic segmentation. Anticipating segmentation errors and improving the\nexplainability of DNNs is essential for autonomous driving, yet it is\nunder-studied. This paper introduces a benchmark for predictive uncertainty\nquantification in BEV segmentation. The benchmark assesses various approaches\nacross three popular datasets using two representative backbones and focuses on\nthe effectiveness of predicted uncertainty in identifying misclassified and\nout-of-distribution (OOD) pixels, as well as calibration. Empirical findings\nhighlight the challenges in uncertainty quantification. Our results find that\nevidential deep learning based approaches show the most promise by efficiently\nquantifying aleatoric and epistemic uncertainty. We propose the\nUncertainty-Focal-Cross-Entropy (UFCE) loss, designed for highly imbalanced\ndata, which consistently improves the segmentation quality and calibration.\nAdditionally, we introduce a vacuity-scaled regularization term that enhances\nthe model's focus on high uncertainty pixels, improving epistemic uncertainty\nquantification.\n","authors":["Linlin Yu","Bowen Yang","Tianhao Wang","Kangshuo Li","Feng Chen"],"pdf_url":"https://arxiv.org/pdf/2405.20986v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.20984v1","updated":"2024-05-31T16:31:07Z","published":"2024-05-31T16:31:07Z","title":"Bayesian Design Principles for Offline-to-Online Reinforcement Learning","summary":"  Offline reinforcement learning (RL) is crucial for real-world applications\nwhere exploration can be costly or unsafe. However, offline learned policies\nare often suboptimal, and further online fine-tuning is required. In this\npaper, we tackle the fundamental dilemma of offline-to-online fine-tuning: if\nthe agent remains pessimistic, it may fail to learn a better policy, while if\nit becomes optimistic directly, performance may suffer from a sudden drop. We\nshow that Bayesian design principles are crucial in solving such a dilemma.\nInstead of adopting optimistic or pessimistic policies, the agent should act in\na way that matches its belief in optimal policies.\n  Such a probability-matching agent can avoid a sudden performance drop while\nstill being guaranteed to find the optimal policy. Based on our theoretical\nfindings, we introduce a novel algorithm that outperforms existing methods on\nvarious benchmarks, demonstrating the efficacy of our approach. Overall, the\nproposed approach provides a new perspective on offline-to-online RL that has\nthe potential to enable more effective learning from offline data.\n","authors":["Hao Hu","Yiqin Yang","Jianing Ye","Chengjie Wu","Ziqing Mai","Yujing Hu","Tangjie Lv","Changjie Fan","Qianchuan Zhao","Chongjie Zhang"],"pdf_url":"https://arxiv.org/pdf/2405.20984v1.pdf","comment":"Forty-first International Conference on Machine Learning (ICML), 2024"},{"id":"http://arxiv.org/abs/2402.15259v3","updated":"2024-05-31T16:28:10Z","published":"2024-02-23T11:04:33Z","title":"Open Ad Hoc Teamwork with Cooperative Game Theory","summary":"  Ad hoc teamwork poses a challenging problem, requiring the design of an agent\nto collaborate with teammates without prior coordination or joint training.\nOpen ad hoc teamwork (OAHT) further complicates this challenge by considering\nenvironments with a changing number of teammates, referred to as open teams.\nOne promising solution in practice to this problem is leveraging the\ngeneralizability of graph neural networks to handle an unrestricted number of\nagents, named graph-based policy learning (GPL). However, its joint Q-value\nrepresentation over a coordination graph lacks convincing explanations. In this\npaper, we establish a new theory to understand the joint Q-value representation\nfor OAHT, from the perspective of cooperative game theory, and validate its\nlearning paradigm. Building on our theory, we propose a novel algorithm named\nCIAO, compatible with GPL framework, with additional provable implementation\ntricks that can facilitate learning. The demos of experimental results are\navailable on https://sites.google.com/view/ciao2024, and the code of\nexperiments is published on https://github.com/hsvgbkhgbv/CIAO.\n","authors":["Jianhong Wang","Yang Li","Yuan Zhang","Wei Pan","Samuel Kaski"],"pdf_url":"https://arxiv.org/pdf/2402.15259v3.pdf","comment":"Published at ICML 2024, 29 pages"},{"id":"http://arxiv.org/abs/2310.02905v2","updated":"2024-05-31T16:27:53Z","published":"2023-10-02T02:01:16Z","title":"Use Your INSTINCT: INSTruction optimization for LLMs usIng Neural\n  bandits Coupled with Transformers","summary":"  Large language models (LLMs) have shown remarkable instruction-following\ncapabilities and achieved impressive performances in various applications.\nHowever, the performances of LLMs depend heavily on the instructions given to\nthem, which are typically manually tuned with substantial human efforts. Recent\nwork has used the query-efficient Bayesian optimization (BO) algorithm to\nautomatically optimize the instructions given to black-box LLMs. However, BO\nusually falls short when optimizing highly sophisticated (e.g.,\nhigh-dimensional) objective functions, such as the functions mapping an\ninstruction to the performance of an LLM. This is mainly due to the limited\nexpressive power of the Gaussian process (GP) which is used by BO as a\nsurrogate to model the objective function. Meanwhile, it has been repeatedly\nshown that neural networks (NNs), especially pre-trained transformers, possess\nstrong expressive power and can model highly complex functions. So, we adopt a\nneural bandit algorithm which replaces the GP in BO by an NN surrogate to\noptimize instructions for black-box LLMs. More importantly, the neural bandit\nalgorithm allows us to naturally couple the NN surrogate with the hidden\nrepresentation learned by a pre-trained transformer (i.e., an open-source LLM),\nwhich significantly boosts its performance. These motivate us to propose our\nINSTruction optimization usIng Neural bandits Coupled with Transformers\n(INSTINCT) algorithm. We perform instruction optimization for ChatGPT and use\nextensive experiments to show that INSTINCT consistently outperforms baselines\nin different tasks, e.g., various instruction induction tasks and the task of\nimproving zero-shot chain-of-thought instructions. Our code is available at\nhttps://github.com/xqlin98/INSTINCT.\n","authors":["Xiaoqiang Lin","Zhaoxuan Wu","Zhongxiang Dai","Wenyang Hu","Yao Shu","See-Kiong Ng","Patrick Jaillet","Bryan Kian Hsiang Low"],"pdf_url":"https://arxiv.org/pdf/2310.02905v2.pdf","comment":"Accepted to ICML 2024"},{"id":"http://arxiv.org/abs/2405.20980v1","updated":"2024-05-31T16:26:08Z","published":"2024-05-31T16:26:08Z","title":"Neural Gaussian Scale-Space Fields","summary":"  Gaussian scale spaces are a cornerstone of signal representation and\nprocessing, with applications in filtering, multiscale analysis, anti-aliasing,\nand many more. However, obtaining such a scale space is costly and cumbersome,\nin particular for continuous representations such as neural fields. We present\nan efficient and lightweight method to learn the fully continuous, anisotropic\nGaussian scale space of an arbitrary signal. Based on Fourier feature\nmodulation and Lipschitz bounding, our approach is trained self-supervised,\ni.e., training does not require any manual filtering. Our neural Gaussian\nscale-space fields faithfully capture multiscale representations across a broad\nrange of modalities, and support a diverse set of applications. These include\nimages, geometry, light-stage data, texture anti-aliasing, and multiscale\noptimization.\n","authors":["Felix Mujkanovic","Ntumba Elie Nsampi","Christian Theobalt","Hans-Peter Seidel","Thomas Leimkühler"],"pdf_url":"https://arxiv.org/pdf/2405.20980v1.pdf","comment":"15 pages; SIGGRAPH 2024; project page at\n  https://neural-gaussian-scale-space-fields.mpi-inf.mpg.de"},{"id":"http://arxiv.org/abs/2405.20975v1","updated":"2024-05-31T16:21:55Z","published":"2024-05-31T16:21:55Z","title":"ACE: A Model Poisoning Attack on Contribution Evaluation Methods in\n  Federated Learning","summary":"  In Federated Learning (FL), a set of clients collaboratively train a machine\nlearning model (called global model) without sharing their local training data.\nThe local training data of clients is typically non-i.i.d. and heterogeneous,\nresulting in varying contributions from individual clients to the final\nperformance of the global model. In response, many contribution evaluation\nmethods were proposed, where the server could evaluate the contribution made by\neach client and incentivize the high-contributing clients to sustain their\nlong-term participation in FL. Existing studies mainly focus on developing new\nmetrics or algorithms to better measure the contribution of each client.\nHowever, the security of contribution evaluation methods of FL operating in\nadversarial environments is largely unexplored. In this paper, we propose the\nfirst model poisoning attack on contribution evaluation methods in FL, termed\nACE. Specifically, we show that any malicious client utilizing ACE could\nmanipulate the parameters of its local model such that it is evaluated to have\na high contribution by the server, even when its local training data is indeed\nof low quality. We perform both theoretical analysis and empirical evaluations\nof ACE. Theoretically, we show our design of ACE can effectively boost the\nmalicious client's perceived contribution when the server employs the\nwidely-used cosine distance metric to measure contribution. Empirically, our\nresults show ACE effectively and efficiently deceive five state-of-the-art\ncontribution evaluation methods. In addition, ACE preserves the accuracy of the\nfinal global models on testing inputs. We also explore six countermeasures to\ndefend ACE. Our results show they are inadequate to thwart ACE, highlighting\nthe urgent need for new defenses to safeguard the contribution evaluation\nmethods in FL.\n","authors":["Zhangchen Xu","Fengqing Jiang","Luyao Niu","Jinyuan Jia","Bo Li","Radha Poovendran"],"pdf_url":"https://arxiv.org/pdf/2405.20975v1.pdf","comment":"To appear in the 33rd USENIX Security Symposium, 2024"},{"id":"http://arxiv.org/abs/2405.20974v1","updated":"2024-05-31T16:21:16Z","published":"2024-05-31T16:21:16Z","title":"SaySelf: Teaching LLMs to Express Confidence with Self-Reflective\n  Rationales","summary":"  Large language models (LLMs) often generate inaccurate or fabricated\ninformation and generally fail to indicate their confidence, which limits their\nbroader applications. Previous work elicits confidence from LLMs by direct or\nself-consistency prompting, or constructing specific datasets for supervised\nfinetuning. The prompting-based approaches have inferior performance, and the\ntraining-based approaches are limited to binary or inaccurate group-level\nconfidence estimates. In this work, we present the advanced SaySelf, a training\nframework that teaches LLMs to express more accurate fine-grained confidence\nestimates. In addition, beyond the confidence scores, SaySelf initiates the\nprocess of directing LLMs to produce self-reflective rationales that clearly\nidentify gaps in their parametric knowledge and explain their uncertainty. This\nis achieved by using an LLM to automatically summarize the uncertainties in\nspecific knowledge via natural language. The summarization is based on the\nanalysis of the inconsistency in multiple sampled reasoning chains, and the\nresulting data is utilized for supervised fine-tuning. Moreover, we utilize\nreinforcement learning with a meticulously crafted reward function to calibrate\nthe confidence estimates, motivating LLMs to deliver accurate, high-confidence\npredictions and to penalize overconfidence in erroneous outputs. Experimental\nresults in both in-distribution and out-of-distribution datasets demonstrate\nthe effectiveness of SaySelf in reducing the confidence calibration error and\nmaintaining the task performance. We show that the generated self-reflective\nrationales are reasonable and can further contribute to the calibration. The\ncode is made public at \\url{https://github.com/xu1868/SaySelf}.\n","authors":["Tianyang Xu","Shujin Wu","Shizhe Diao","Xiaoze Liu","Xingyao Wang","Yangyi Chen","Jing Gao"],"pdf_url":"https://arxiv.org/pdf/2405.20974v1.pdf","comment":"The code is available at \\url{https://github.com/xu1868/SaySelf}"},{"id":"http://arxiv.org/abs/2405.20973v1","updated":"2024-05-31T16:21:05Z","published":"2024-05-31T16:21:05Z","title":"LCQ: Low-Rank Codebook based Quantization for Large Language Models","summary":"  Large language models~(LLMs) have recently demonstrated promising performance\nin many tasks. However, the high storage and computational cost of LLMs has\nbecome a challenge for deploying LLMs. Weight quantization has been widely used\nfor model compression, which can reduce both storage and computational cost.\nMost existing weight quantization methods for LLMs use a rank-one codebook for\nquantization, which results in substantial accuracy loss when the compression\nratio is high. In this paper, we propose a novel weight quantization method,\ncalled low-rank codebook based quantization~(LCQ), for LLMs. LCQ adopts a\nlow-rank codebook, the rank of which can be larger than one, for quantization.\nExperiments show that LCQ can achieve better accuracy than existing methods\nwith a negligibly extra storage cost.\n","authors":["Wen-Pu Cai","Wu-Jun Li"],"pdf_url":"https://arxiv.org/pdf/2405.20973v1.pdf","comment":"10 pages, 5 figures"},{"id":"http://arxiv.org/abs/2405.20971v1","updated":"2024-05-31T16:18:46Z","published":"2024-05-31T16:18:46Z","title":"Amortizing intractable inference in diffusion models for vision,\n  language, and control","summary":"  Diffusion models have emerged as effective distribution estimators in vision,\nlanguage, and reinforcement learning, but their use as priors in downstream\ntasks poses an intractable posterior inference problem. This paper studies\namortized sampling of the posterior over data, $\\mathbf{x}\\sim p^{\\rm\npost}(\\mathbf{x})\\propto p(\\mathbf{x})r(\\mathbf{x})$, in a model that consists\nof a diffusion generative model prior $p(\\mathbf{x})$ and a black-box\nconstraint or likelihood function $r(\\mathbf{x})$. We state and prove the\nasymptotic correctness of a data-free learning objective, relative trajectory\nbalance, for training a diffusion model that samples from this posterior, a\nproblem that existing methods solve only approximately or in restricted cases.\nRelative trajectory balance arises from the generative flow network perspective\non diffusion models, which allows the use of deep reinforcement learning\ntechniques to improve mode coverage. Experiments illustrate the broad potential\nof unbiased inference of arbitrary posteriors under diffusion priors: in vision\n(classifier guidance), language (infilling under a discrete diffusion LLM), and\nmultimodal data (text-to-image generation). Beyond generative modeling, we\napply relative trajectory balance to the problem of continuous control with a\nscore-based behavior prior, achieving state-of-the-art results on benchmarks in\noffline reinforcement learning.\n","authors":["Siddarth Venkatraman","Moksh Jain","Luca Scimeca","Minsu Kim","Marcin Sendera","Mohsin Hasan","Luke Rowe","Sarthak Mittal","Pablo Lemos","Emmanuel Bengio","Alexandre Adam","Jarrid Rector-Brooks","Yoshua Bengio","Glen Berseth","Nikolay Malkin"],"pdf_url":"https://arxiv.org/pdf/2405.20971v1.pdf","comment":"Code: https://github.com/GFNOrg/diffusion-finetuning"},{"id":"http://arxiv.org/abs/2405.20970v1","updated":"2024-05-31T16:18:06Z","published":"2024-05-31T16:18:06Z","title":"PUAL: A Classifier on Trifurcate Positive-Unlabeled Data","summary":"  Positive-unlabeled (PU) learning aims to train a classifier using the data\ncontaining only labeled-positive instances and unlabeled instances. However,\nexisting PU learning methods are generally hard to achieve satisfactory\nperformance on trifurcate data, where the positive instances distribute on both\nsides of the negative instances. To address this issue, firstly we propose a PU\nclassifier with asymmetric loss (PUAL), by introducing a structure of\nasymmetric loss on positive instances into the objective function of the global\nand local learning classifier. Then we develop a kernel-based algorithm to\nenable PUAL to obtain non-linear decision boundary. We show that, through\nexperiments on both simulated and real-world datasets, PUAL can achieve\nsatisfactory classification on trifurcate data.\n","authors":["Xiaoke Wang","Xiaochen Yang","Rui Zhu","Jing-Hao Xue"],"pdf_url":"https://arxiv.org/pdf/2405.20970v1.pdf","comment":"24 pages, 6 figures"},{"id":"http://arxiv.org/abs/2311.10879v3","updated":"2024-05-31T16:15:01Z","published":"2023-11-17T21:48:41Z","title":"Pre- to Post-Contrast Breast MRI Synthesis for Enhanced Tumour\n  Segmentation","summary":"  Despite its benefits for tumour detection and treatment, the administration\nof contrast agents in dynamic contrast-enhanced MRI (DCE-MRI) is associated\nwith a range of issues, including their invasiveness, bioaccumulation, and a\nrisk of nephrogenic systemic fibrosis. This study explores the feasibility of\nproducing synthetic contrast enhancements by translating pre-contrast\nT1-weighted fat-saturated breast MRI to their corresponding first DCE-MRI\nsequence leveraging the capabilities of a generative adversarial network (GAN).\nAdditionally, we introduce a Scaled Aggregate Measure (SAMe) designed for\nquantitatively evaluating the quality of synthetic data in a principled manner\nand serving as a basis for selecting the optimal generative model. We assess\nthe generated DCE-MRI data using quantitative image quality metrics and apply\nthem to the downstream task of 3D breast tumour segmentation. Our results\nhighlight the potential of post-contrast DCE-MRI synthesis in enhancing the\nrobustness of breast tumour segmentation models via data augmentation. Our code\nis available at https://github.com/RichardObi/pre_post_synthesis.\n","authors":["Richard Osuala","Smriti Joshi","Apostolia Tsirikoglou","Lidia Garrucho","Walter H. L. Pinaya","Oliver Diaz","Karim Lekadir"],"pdf_url":"https://arxiv.org/pdf/2311.10879v3.pdf","comment":"Accepted as oral presentation at SPIE Medical Imaging 2024 (Image\n  Processing)"},{"id":"http://arxiv.org/abs/2310.00154v2","updated":"2024-05-31T16:11:27Z","published":"2023-09-29T21:23:27Z","title":"Primal Dual Continual Learning: Balancing Stability and Plasticity\n  through Adaptive Memory Allocation","summary":"  Continual learning is inherently a constrained learning problem. The goal is\nto learn a predictor under a no-forgetting requirement. Although several prior\nstudies formulate it as such, they do not solve the constrained problem\nexplicitly. In this work, we show that it is both possible and beneficial to\nundertake the constrained optimization problem directly. To do this, we\nleverage recent results in constrained learning through Lagrangian duality. We\nfocus on memory-based methods, where a small subset of samples from previous\ntasks can be stored in a replay buffer. In this setting, we analyze two\nversions of the continual learning problem: a coarse approach with constraints\nat the task level and a fine approach with constraints at the sample level. We\nshow that dual variables indicate the sensitivity of the optimal value of the\ncontinual learning problem with respect to constraint perturbations. We then\nleverage this result to partition the buffer in the coarse approach, allocating\nmore resources to harder tasks, and to populate the buffer in the fine\napproach, including only impactful samples. We derive a deviation bound on dual\nvariables as sensitivity indicators, and empirically corroborate this result in\ndiverse continual learning benchmarks. We also discuss the limitations of these\nmethods with respect to the amount of memory available and the expressiveness\nof the parametrization.\n","authors":["Juan Elenter","Navid NaderiAlizadeh","Tara Javidi","Alejandro Ribeiro"],"pdf_url":"https://arxiv.org/pdf/2310.00154v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.01371v2","updated":"2024-05-31T16:05:37Z","published":"2024-03-03T02:19:49Z","title":"eXponential FAmily Dynamical Systems (XFADS): Large-scale nonlinear\n  Gaussian state-space modeling","summary":"  State-space graphical models and the variational autoencoder framework\nprovide a principled apparatus for learning dynamical systems from data.\nState-of-the-art probabilistic approaches are often able to scale to large\nproblems at the cost of flexibility of the variational posterior or\nexpressivity of the dynamics model. However, those consolidations can be\ndetrimental if the ultimate goal is to learn a generative model capable of\nexplaining the spatiotemporal structure of the data and making accurate\nforecasts. We introduce a low-rank structured variational autoencoding\nframework for nonlinear Gaussian state-space graphical models capable of\ncapturing dense covariance structures that are important for learning dynamical\nsystems with predictive capabilities. Our inference algorithm exploits the\ncovariance structures that arise naturally from sample based approximate\nGaussian message passing and low-rank amortized posterior updates --\neffectively performing approximate variational smoothing with time complexity\nscaling linearly in the state dimensionality. In comparisons with other deep\nstate-space model architectures our approach consistently demonstrates the\nability to learn a more predictive generative model. Furthermore, when applied\nto neural physiological recordings, our approach is able to learn a dynamical\nsystem capable of forecasting population spiking and behavioral correlates from\na small portion of single trials.\n","authors":["Matthew Dowling","Yuan Zhao","Il Memming Park"],"pdf_url":"https://arxiv.org/pdf/2403.01371v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.04513v2","updated":"2024-05-31T15:59:34Z","published":"2024-02-07T01:46:50Z","title":"Online Cascade Learning for Efficient Inference over Streams","summary":"  Large Language Models (LLMs) have a natural role in answering complex queries\nabout data streams, but the high computational cost of LLM inference makes them\ninfeasible in many such tasks. We propose online cascade learning, the first\napproach to address this challenge. The objective here is to learn a \"cascade\"\nof models, starting with lower-capacity models (such as logistic regression)\nand ending with a powerful LLM, along with a deferral policy that determines\nthe model to be used on a given input. We formulate the task of learning\ncascades online as an imitation-learning problem, where smaller models are\nupdated over time imitating the collected LLM demonstrations, and give a\nno-regret algorithm for the problem. Experimental results across four\nbenchmarks show that our method parallels LLMs in accuracy while cutting down\ninference costs by as much as 90% with strong robustness against input\ndistribution shifts, underscoring its efficacy and adaptability in stream\nprocessing.\n","authors":["Lunyiu Nie","Zhimin Ding","Erdong Hu","Christopher Jermaine","Swarat Chaudhuri"],"pdf_url":"https://arxiv.org/pdf/2402.04513v2.pdf","comment":"ICML 2024 Main Conference Paper"},{"id":"http://arxiv.org/abs/2405.20954v1","updated":"2024-05-31T15:54:01Z","published":"2024-05-31T15:54:01Z","title":"Aligning Multiclass Neural Network Classifier Criterion with Task\n  Performance via $F_β$-Score","summary":"  Multiclass neural network classifiers are typically trained using\ncross-entropy loss. Following training, the performance of this same neural\nnetwork is evaluated using an application-specific metric based on the\nmulticlass confusion matrix, such as the Macro $F_\\beta$-Score. It is\nquestionable whether the use of cross-entropy will yield a classifier that\naligns with the intended application-specific performance criteria,\nparticularly in scenarios where there is a need to emphasize one aspect of\nclassifier performance. For example, if greater precision is preferred over\nrecall, the $\\beta$ value in the $F_\\beta$ evaluation metric can be adjusted\naccordingly, but the cross-entropy objective remains unaware of this preference\nduring training. We propose a method that addresses this training-evaluation\ngap for multiclass neural network classifiers such that users can train these\nmodels informed by the desired final $F_\\beta$-Score. Following prior work in\nbinary classification, we utilize the concepts of the soft-set confusion\nmatrices and a piecewise-linear approximation of the Heaviside step function.\nOur method extends the $2 \\times 2$ binary soft-set confusion matrix to a\nmulticlass $d \\times d$ confusion matrix and proposes dynamic adaptation of the\nthreshold value $\\tau$, which parameterizes the piecewise-linear Heaviside\napproximation during run-time. We present a theoretical analysis that shows\nthat our method can be used to optimize for a soft-set based approximation of\nMacro-$F_\\beta$ that is a consistent estimator of Macro-$F_\\beta$, and our\nextensive experiments show the practical effectiveness of our approach.\n","authors":["Nathan Tsoi","Deyuan Li","Taesoo Daniel Lee","Marynel Vázquez"],"pdf_url":"https://arxiv.org/pdf/2405.20954v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.18669v2","updated":"2024-05-31T15:42:53Z","published":"2024-05-29T00:23:55Z","title":"Zipper: A Multi-Tower Decoder Architecture for Fusing Modalities","summary":"  Integrating multiple generative foundation models, especially those trained\non different modalities, into something greater than the sum of its parts poses\nsignificant challenges. Two key hurdles are the availability of aligned data\n(concepts that contain similar meaning but is expressed differently in\ndifferent modalities), and effectively leveraging unimodal representations in\ncross-domain generative tasks, without compromising their original unimodal\ncapabilities.\n  We propose Zipper, a multi-tower decoder architecture that addresses these\nconcerns by using cross-attention to flexibly compose multimodal generative\nmodels from independently pre-trained unimodal decoders. In our experiments\nfusing speech and text modalities, we show the proposed architecture performs\nvery competitively in scenarios with limited aligned text-speech data. We also\nshowcase the flexibility of our model to selectively maintain unimodal (e.g.,\ntext-to-text generation) generation performance by freezing the corresponding\nmodal tower (e.g. text). In cross-modal tasks such as automatic speech\nrecognition (ASR) where the output modality is text, we show that freezing the\ntext backbone results in negligible performance degradation. In cross-modal\ntasks such as text-to-speech generation (TTS) where the output modality is\nspeech, we show that using a pre-trained speech backbone results in superior\nperformance to the baseline.\n","authors":["Vicky Zayats","Peter Chen","Melissa Ferrari","Dirk Padfield"],"pdf_url":"https://arxiv.org/pdf/2405.18669v2.pdf","comment":"Under review at NeurIPS"},{"id":"http://arxiv.org/abs/2405.20935v1","updated":"2024-05-31T15:34:13Z","published":"2024-05-31T15:34:13Z","title":"Effective Interplay between Sparsity and Quantization: From Theory to\n  Practice","summary":"  The increasing size of deep neural networks necessitates effective model\ncompression to improve computational efficiency and reduce their memory\nfootprint. Sparsity and quantization are two prominent compression methods that\nhave individually demonstrated significant reduction in computational and\nmemory footprints while preserving model accuracy. While effective, the\ninterplay between these two methods remains an open question. In this paper, we\ninvestigate the interaction between these two methods and assess whether their\ncombination impacts final model accuracy. We mathematically prove that applying\nsparsity before quantization is the optimal sequence for these operations,\nminimizing error in computation. Our empirical studies across a wide range of\nmodels, including OPT and Llama model families (125M-8B) and ViT corroborate\nthese theoretical findings. In addition, through rigorous analysis, we\ndemonstrate that sparsity and quantization are not orthogonal; their\ninteraction can significantly harm model accuracy, with quantization error\nplaying a dominant role in this degradation. Our findings extend to the\nefficient deployment of large models in resource-limited compute platforms and\nreduce serving cost, offering insights into best practices for applying these\ncompression methods to maximize efficacy without compromising accuracy.\n","authors":["Simla Burcu Harma","Ayan Chakraborty","Elizaveta Kostenok","Danila Mishin","Dongho Ha","Babak Falsafi","Martin Jaggi","Ming Liu","Yunho Oh","Suvinay Subramanian","Amir Yazdanbakhsh"],"pdf_url":"https://arxiv.org/pdf/2405.20935v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.20933v1","updated":"2024-05-31T15:32:43Z","published":"2024-05-31T15:32:43Z","title":"Concentration Bounds for Optimized Certainty Equivalent Risk Estimation","summary":"  We consider the problem of estimating the Optimized Certainty Equivalent\n(OCE) risk from independent and identically distributed (i.i.d.) samples. For\nthe classic sample average approximation (SAA) of OCE, we derive mean-squared\nerror as well as concentration bounds (assuming sub-Gaussianity). Further, we\nanalyze an efficient stochastic approximation-based OCE estimator, and derive\nfinite sample bounds for the same. To show the applicability of our bounds, we\nconsider a risk-aware bandit problem, with OCE as the risk. For this problem,\nwe derive bound on the probability of mis-identification. Finally, we conduct\nnumerical experiments to validate the theoretical findings.\n","authors":["Ayon Ghosh","L. A. Prashanth","Krishna Jagannathan"],"pdf_url":"https://arxiv.org/pdf/2405.20933v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.01399v2","updated":"2024-05-31T15:32:02Z","published":"2023-07-31T17:57:49Z","title":"Learning to Model the World with Language","summary":"  To interact with humans and act in the world, agents need to understand the\nrange of language that people use and relate it to the visual world. While\ncurrent agents can learn to execute simple language instructions, we aim to\nbuild agents that leverage diverse language -- language like \"this button turns\non the TV\" or \"I put the bowls away\" -- that conveys general knowledge,\ndescribes the state of the world, provides interactive feedback, and more. Our\nkey idea is that agents should interpret such diverse language as a signal that\nhelps them predict the future: what they will observe, how the world will\nbehave, and which situations will be rewarded. This perspective unifies\nlanguage understanding with future prediction as a powerful self-supervised\nlearning objective. We instantiate this in Dynalang, an agent that learns a\nmultimodal world model to predict future text and image representations, and\nlearns to act from imagined model rollouts. While current methods that learn\nlanguage-conditioned policies degrade in performance with more diverse types of\nlanguage, we show that Dynalang learns to leverage environment descriptions,\ngame rules, and instructions to excel on tasks ranging from game-playing to\nnavigating photorealistic home scans. Finally, we show that our method enables\nadditional capabilities due to learning a generative model: Dynalang can be\npretrained on text-only data, enabling learning from offline datasets, and\ngenerate language grounded in an environment.\n","authors":["Jessy Lin","Yuqing Du","Olivia Watkins","Danijar Hafner","Pieter Abbeel","Dan Klein","Anca Dragan"],"pdf_url":"https://arxiv.org/pdf/2308.01399v2.pdf","comment":"ICML 2024. Website: https://dynalang.github.io/"},{"id":"http://arxiv.org/abs/2403.03938v2","updated":"2024-05-31T15:31:16Z","published":"2024-03-06T18:47:32Z","title":"GUIDE: Guidance-based Incremental Learning with Diffusion Models","summary":"  We introduce GUIDE, a novel continual learning approach that directs\ndiffusion models to rehearse samples at risk of being forgotten. Existing\ngenerative strategies combat catastrophic forgetting by randomly sampling\nrehearsal examples from a generative model. Such an approach contradicts\nbuffer-based approaches where sampling strategy plays an important role. We\npropose to bridge this gap by incorporating classifier guidance into the\ndiffusion process to produce rehearsal examples specifically targeting\ninformation forgotten by a continuously trained model. This approach enables\nthe generation of samples from preceding task distributions, which are more\nlikely to be misclassified in the context of recently encountered classes. Our\nexperimental results show that GUIDE significantly reduces catastrophic\nforgetting, outperforming conventional random sampling approaches and\nsurpassing recent state-of-the-art methods in continual learning with\ngenerative replay.\n","authors":["Bartosz Cywiński","Kamil Deja","Tomasz Trzciński","Bartłomiej Twardowski","Łukasz Kuciński"],"pdf_url":"https://arxiv.org/pdf/2403.03938v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.20917v1","updated":"2024-05-31T15:21:53Z","published":"2024-05-31T15:21:53Z","title":"Learning to Estimate System Specifications in Linear Temporal Logic\n  using Transformers and Mamba","summary":"  Temporal logic is a framework for representing and reasoning about\npropositions that evolve over time. It is commonly used for specifying\nrequirements in various domains, including hardware and software systems, as\nwell as robotics. Specification mining or formula generation involves\nextracting temporal logic formulae from system traces and has numerous\napplications, such as detecting bugs and improving interpretability. Although\nthere has been a surge of deep learning-based methods for temporal logic\nsatisfiability checking in recent years, the specification mining literature\nhas been lagging behind in adopting deep learning methods despite their many\nadvantages, such as scalability. In this paper, we introduce autoregressive\nmodels that can generate linear temporal logic formulae from traces, towards\naddressing the specification mining problem. We propose multiple architectures\nfor this task: transformer encoder-decoder, decoder-only transformer, and\nMamba, which is an emerging alternative to transformer models. Additionally, we\ndevise a metric for quantifying the distinctiveness of the generated formulae\nand a straightforward algorithm to enforce the syntax constraints. Our\nexperiments show that the proposed architectures yield promising results,\ngenerating correct and distinct formulae at a fraction of the compute cost\nneeded for the combinatorial baseline.\n","authors":["İlker Işık","Ebru Aydin Gol","Ramazan Gokberk Cinbis"],"pdf_url":"https://arxiv.org/pdf/2405.20917v1.pdf","comment":"20 pages, 15 figures"},{"id":"http://arxiv.org/abs/2405.20915v1","updated":"2024-05-31T15:21:44Z","published":"2024-05-31T15:21:44Z","title":"Fast yet Safe: Early-Exiting with Risk Control","summary":"  Scaling machine learning models significantly improves their performance.\nHowever, such gains come at the cost of inference being slow and\nresource-intensive. Early-exit neural networks (EENNs) offer a promising\nsolution: they accelerate inference by allowing intermediate layers to exit and\nproduce a prediction early. Yet a fundamental issue with EENNs is how to\ndetermine when to exit without severely degrading performance. In other words,\nwhen is it 'safe' for an EENN to go 'fast'? To address this issue, we\ninvestigate how to adapt frameworks of risk control to EENNs. Risk control\noffers a distribution-free, post-hoc solution that tunes the EENN's exiting\nmechanism so that exits only occur when the output is of sufficient quality. We\nempirically validate our insights on a range of vision and language tasks,\ndemonstrating that risk control can produce substantial computational savings,\nall the while preserving user-specified performance goals.\n","authors":["Metod Jazbec","Alexander Timans","Tin Hadži Veljković","Kaspar Sakmann","Dan Zhang","Christian A. Naesseth","Eric Nalisnick"],"pdf_url":"https://arxiv.org/pdf/2405.20915v1.pdf","comment":"25 pages, 11 figures, 4 tables (incl. appendix)"},{"id":"http://arxiv.org/abs/2405.20905v1","updated":"2024-05-31T15:16:48Z","published":"2024-05-31T15:16:48Z","title":"VENI, VINDy, VICI: a variational reduced-order modeling framework with\n  uncertainty quantification","summary":"  The simulation of many complex phenomena in engineering and science requires\nsolving expensive, high-dimensional systems of partial differential equations\n(PDEs). To circumvent this, reduced-order models (ROMs) have been developed to\nspeed up computations. However, when governing equations are unknown or\npartially known, typically ROMs lack interpretability and reliability of the\npredicted solutions.\n  In this work we present a data-driven, non-intrusive framework for building\nROMs where the latent variables and dynamics are identified in an interpretable\nmanner and uncertainty is quantified. Starting from a limited amount of\nhigh-dimensional, noisy data the proposed framework constructs an efficient ROM\nby leveraging variational autoencoders for dimensionality reduction along with\na newly introduced, variational version of sparse identification of nonlinear\ndynamics (SINDy), which we refer to as Variational Identification of Nonlinear\nDynamics (VINDy).\n  In detail, the method consists of Variational Encoding of Noisy Inputs (VENI)\nto identify the distribution of reduced coordinates. Simultaneously, we learn\nthe distribution of the coefficients of a pre-determined set of candidate\nfunctions by VINDy. Once trained offline, the identified model can be queried\nfor new parameter instances and new initial conditions to compute the\ncorresponding full-time solutions. The probabilistic setup enables uncertainty\nquantification as the online testing consists of Variational Inference\nnaturally providing Certainty Intervals (VICI). In this work we showcase the\neffectiveness of the newly proposed VINDy method in identifying interpretable\nand accurate dynamical system for the R\\\"ossler system with different noise\nintensities and sources. Then the performance of the overall method - named\nVENI, VINDy, VICI - is tested on PDE benchmarks including structural mechanics\nand fluid dynamics.\n","authors":["Paolo Conti","Jonas Kneifl","Andrea Manzoni","Attilio Frangi","Jörg Fehr","Steven L. Brunton","J. Nathan Kutz"],"pdf_url":"https://arxiv.org/pdf/2405.20905v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.07856v3","updated":"2024-05-31T15:14:58Z","published":"2023-06-13T15:35:01Z","title":"Bayesian Program Learning by Decompiling Amortized Knowledge","summary":"  DreamCoder is an inductive program synthesis system that, whilst solving\nproblems, learns to simplify search in an iterative wake-sleep procedure. The\ncost of search is amortized by training a neural search policy, reducing search\nbreadth and effectively \"compiling\" useful information to compose program\nsolutions across tasks. Additionally, a library of program components is learnt\nto compress and express discovered solutions in fewer components, reducing\nsearch depth. We present a novel approach for library learning that directly\nleverages the neural search policy, effectively \"decompiling\" its amortized\nknowledge to extract relevant program components. This provides stronger\namortized inference: the amortized knowledge learnt to reduce search breadth is\nnow also used to reduce search depth. We integrate our approach with DreamCoder\nand demonstrate faster domain proficiency with improved generalization on a\nrange of domains, particularly when fewer example solutions are available.\n","authors":["Alessandro B. Palmarini","Christopher G. Lucas","N. Siddharth"],"pdf_url":"https://arxiv.org/pdf/2306.07856v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.05300v3","updated":"2024-05-31T15:14:43Z","published":"2024-03-08T13:29:46Z","title":"Unity by Diversity: Improved Representation Learning in Multimodal VAEs","summary":"  Variational Autoencoders for multimodal data hold promise for many tasks in\ndata analysis, such as representation learning, conditional generation, and\nimputation. Current architectures either share the encoder output, decoder\ninput, or both across modalities to learn a shared representation. Such\narchitectures impose hard constraints on the model. In this work, we show that\na better latent representation can be obtained by replacing these hard\nconstraints with a soft constraint. We propose a new mixture-of-experts prior,\nsoftly guiding each modality's latent representation towards a shared aggregate\nposterior. This approach results in a superior latent representation and allows\neach encoding to preserve information better from its uncompressed original\nfeatures. In extensive experiments on multiple benchmark datasets and two\nchallenging real-world datasets, we show improved learned latent\nrepresentations and imputation of missing data modalities compared to existing\nmethods.\n","authors":["Thomas M. Sutter","Yang Meng","Andrea Agostini","Daphné Chopard","Norbert Fortin","Julia E. Vogt","Bahbak Shahbaba","Stephan Mandt"],"pdf_url":"https://arxiv.org/pdf/2403.05300v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2301.06650v2","updated":"2024-05-31T15:05:40Z","published":"2023-01-17T01:12:44Z","title":"Enhancing Deep Traffic Forecasting Models with Dynamic Regression","summary":"  Deep learning models for traffic forecasting often assume the residual is\nindependent and isotropic across time and space. This assumption simplifies\nloss functions such as mean absolute error, but real-world residual processes\noften exhibit significant autocorrelation and structured spatiotemporal\ncorrelation. This paper introduces a dynamic regression (DR) framework to\nenhance existing spatiotemporal traffic forecasting models by incorporating\nstructured learning for the residual process. We assume the residual of the\nbase model (i.e., a well-developed traffic forecasting model) follows a\nmatrix-variate seasonal autoregressive (AR) model, which is seamlessly\nintegrated into the training process through the redesign of the loss function.\nImportantly, the parameters of the DR framework are jointly optimized alongside\nthe base model. We evaluate the effectiveness of the proposed framework on\nstate-of-the-art (SOTA) deep traffic forecasting models using both speed and\nflow datasets, demonstrating improved performance and providing interpretable\nAR coefficients and spatiotemporal covariance matrices.\n","authors":["Vincent Zhihao Zheng","Seongjin Choi","Lijun Sun"],"pdf_url":"https://arxiv.org/pdf/2301.06650v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.16785v2","updated":"2024-05-31T15:03:11Z","published":"2024-02-26T18:00:29Z","title":"CARTE: Pretraining and Transfer for Tabular Learning","summary":"  Pretrained deep-learning models are the go-to solution for images or text.\nHowever, for tabular data the standard is still to train tree-based models.\nIndeed, transfer learning on tables hits the challenge of data integration:\nfinding correspondences, correspondences in the entries (entity matching) where\ndifferent words may denote the same entity, correspondences across columns\n(schema matching), which may come in different orders, names... We propose a\nneural architecture that does not need such correspondences. As a result, we\ncan pretrain it on background data that has not been matched. The architecture\n-- CARTE for Context Aware Representation of Table Entries -- uses a graph\nrepresentation of tabular (or relational) data to process tables with different\ncolumns, string embedding of entries and columns names to model an open\nvocabulary, and a graph-attentional network to contextualize entries with\ncolumn names and neighboring entries. An extensive benchmark shows that CARTE\nfacilitates learning, outperforming a solid set of baselines including the best\ntree-based models. CARTE also enables joint learning across tables with\nunmatched columns, enhancing a small table with bigger ones. CARTE opens the\ndoor to large pretrained models for tabular data.\n","authors":["Myung Jun Kim","Léo Grinsztajn","Gaël Varoquaux"],"pdf_url":"https://arxiv.org/pdf/2402.16785v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.10780v2","updated":"2024-05-31T15:00:36Z","published":"2024-05-13T21:37:50Z","title":"Intelligent and Miniaturized Neural Interfaces: An Emerging Era in\n  Neurotechnology","summary":"  Integrating smart algorithms on neural devices presents significant\nopportunities for various brain disorders. In this paper, we review the latest\nadvancements in the development of three categories of intelligent neural\nprostheses featuring embedded signal processing on the implantable or wearable\ndevice. These include: 1) Neural interfaces for closed-loop symptom tracking\nand responsive stimulation; 2) Neural interfaces for emerging network-related\nconditions, such as psychiatric disorders; and 3) Intelligent BMI SoCs for\nmovement recovery following paralysis.\n","authors":["Mahsa Shoaran","Uisub Shin","MohammadAli Shaeri"],"pdf_url":"https://arxiv.org/pdf/2405.10780v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.00664v5","updated":"2024-05-31T14:58:20Z","published":"2023-05-01T05:26:33Z","title":"EvoluNet: Advancing Dynamic Non-IID Transfer Learning on Graphs","summary":"  Non-IID transfer learning on graphs is crucial in many high-stakes domains.\nThe majority of existing works assume stationary distribution for both source\nand target domains. However, real-world graphs are intrinsically dynamic,\npresenting challenges in terms of domain evolution and dynamic discrepancy\nbetween source and target domains. To bridge the gap, we shift the problem to\nthe dynamic setting and pose the question: given the label-rich source graphs\nand the label-scarce target graphs both observed in previous T timestamps, how\ncan we effectively characterize the evolving domain discrepancy and optimize\nthe generalization performance of the target domain at the incoming T+1\ntimestamp? To answer it, we propose a generalization bound for dynamic non-IID\ntransfer learning on graphs, which implies the generalization performance is\ndominated by domain evolution and domain discrepancy between source and target\ngraphs. Inspired by the theoretical results, we introduce a novel generic\nframework named EvoluNet. It leverages a transformer-based temporal encoding\nmodule to model temporal information of the evolving domains and then uses a\ndynamic domain unification module to efficiently learn domain-invariant\nrepresentations across the source and target domains. Finally, EvoluNet\noutperforms the state-of-the-art models by up to 12.1%, demonstrating its\neffectiveness in transferring knowledge from dynamic source graphs to dynamic\ntarget graphs.\n","authors":["Haohui Wang","Yuzhen Mao","Yujun Yan","Yaoqing Yang","Jianhui Sun","Kevin Choi","Balaji Veeramani","Alison Hu","Edward Bowen","Tyler Cody","Dawei Zhou"],"pdf_url":"https://arxiv.org/pdf/2305.00664v5.pdf","comment":"Accepted at ICML 2024"},{"id":"http://arxiv.org/abs/2405.20882v1","updated":"2024-05-31T14:55:38Z","published":"2024-05-31T14:55:38Z","title":"Sheaf HyperNetworks for Personalized Federated Learning","summary":"  Graph hypernetworks (GHNs), constructed by combining graph neural networks\n(GNNs) with hypernetworks (HNs), leverage relational data across various\ndomains such as neural architecture search, molecular property prediction and\nfederated learning. Despite GNNs and HNs being individually successful, we show\nthat GHNs present problems compromising their performance, such as\nover-smoothing and heterophily. Moreover, we cannot apply GHNs directly to\npersonalized federated learning (PFL) scenarios, where a priori client relation\ngraph may be absent, private, or inaccessible. To mitigate these limitations in\nthe context of PFL, we propose a novel class of HNs, sheaf hypernetworks\n(SHNs), which combine cellular sheaf theory with HNs to improve parameter\nsharing for PFL. We thoroughly evaluate SHNs across diverse PFL tasks,\nincluding multi-class classification, traffic and weather forecasting.\nAdditionally, we provide a methodology for constructing client relation graphs\nin scenarios where such graphs are unavailable. We show that SHNs consistently\noutperform existing PFL solutions in complex non-IID scenarios. While the\nbaselines' performance fluctuates depending on the task, SHNs show improvements\nof up to 2.7% in accuracy and 5.3% in lower mean squared error over the\nbest-performing baseline.\n","authors":["Bao Nguyen","Lorenzo Sani","Xinchi Qiu","Pietro Liò","Nicholas D. Lane"],"pdf_url":"https://arxiv.org/pdf/2405.20882v1.pdf","comment":"25 pages, 12 figures, 7 tables, pre-print under review"},{"id":"http://arxiv.org/abs/2405.20879v1","updated":"2024-05-31T14:54:51Z","published":"2024-05-31T14:54:51Z","title":"Flow matching achieves minimax optimal convergence","summary":"  Flow matching (FM) has gained significant attention as a simulation-free\ngenerative model. Unlike diffusion models, which are based on stochastic\ndifferential equations, FM employs a simpler approach by solving an ordinary\ndifferential equation with an initial condition from a normal distribution,\nthus streamlining the sample generation process. This paper discusses the\nconvergence properties of FM in terms of the $p$-Wasserstein distance, a\nmeasure of distributional discrepancy. We establish that FM can achieve the\nminmax optimal convergence rate for $1 \\leq p \\leq 2$, presenting the first\ntheoretical evidence that FM can reach convergence rates comparable to those of\ndiffusion models. Our analysis extends existing frameworks by examining a\nbroader class of mean and variance functions for the vector fields and\nidentifies specific conditions necessary to attain these optimal rates.\n","authors":["Kenji Fukumizu","Taiji Suzuki","Noboru Isobe","Kazusato Oko","Masanori Koyama"],"pdf_url":"https://arxiv.org/pdf/2405.20879v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.20877v1","updated":"2024-05-31T14:52:58Z","published":"2024-05-31T14:52:58Z","title":"Waveform Design for Over-the-Air Computing","summary":"  In response to the increasing number of devices anticipated in\nnext-generation networks, a shift toward over-the-air (OTA) computing has been\nproposed. Leveraging the superposition of multiple access channels, OTA\ncomputing enables efficient resource management by supporting simultaneous\nuncoded transmission in the time and the frequency domain. Thus, to advance the\nintegration of OTA computing, our study presents a theoretical analysis\naddressing practical issues encountered in current digital communication\ntransceivers, such as time sampling error and intersymbol interference (ISI).\nTo this end, we examine the theoretical mean squared error (MSE) for OTA\ntransmission under time sampling error and ISI, while also exploring methods\nfor minimizing the MSE in the OTA transmission. Utilizing alternating\noptimization, we also derive optimal power policies for both the devices and\nthe base station. Additionally, we propose a novel deep neural network\n(DNN)-based approach to design waveforms enhancing OTA transmission performance\nunder time sampling error and ISI. To ensure fair comparison with existing\nwaveforms like the raised cosine (RC) and the better-than-raised-cosine (BRTC),\nwe incorporate a custom loss function integrating energy and bandwidth\nconstraints, along with practical design considerations such as waveform\nsymmetry. Simulation results validate our theoretical analysis and demonstrate\nperformance gains of the designed pulse over RC and BTRC waveforms. To\nfacilitate testing of our results without necessitating the DNN structure\nrecreation, we provide curve fitting parameters for select DNN-based waveforms\nas well.\n","authors":["Nikos G. Evgenidis","Nikos A. Mitsiou","Sotiris A. Tegos","Panagiotis D. Diamantoulakis","Panagiotis Sarigiannidis","Ioannis T. Rekanos","George K. Karagiannidis"],"pdf_url":"https://arxiv.org/pdf/2405.20877v1.pdf","comment":"14 pages"},{"id":"http://arxiv.org/abs/2310.18953v2","updated":"2024-05-31T14:51:58Z","published":"2023-10-29T09:54:03Z","title":"TIC-TAC: A Framework for Improved Covariance Estimation in Deep\n  Heteroscedastic Regression","summary":"  Deep heteroscedastic regression involves jointly optimizing the mean and\ncovariance of the predicted distribution using the negative log-likelihood.\nHowever, recent works show that this may result in sub-optimal convergence due\nto the challenges associated with covariance estimation. While the literature\naddresses this by proposing alternate formulations to mitigate the impact of\nthe predicted covariance, we focus on improving the predicted covariance\nitself. We study two questions: (1) Does the predicted covariance truly capture\nthe randomness of the predicted mean? (2) In the absence of supervision, how\ncan we quantify the accuracy of covariance estimation? We address (1) with a\nTaylor Induced Covariance (TIC), which captures the randomness of the predicted\nmean by incorporating its gradient and curvature through the second order\nTaylor polynomial. Furthermore, we tackle (2) by introducing a Task Agnostic\nCorrelations (TAC) metric, which combines the notion of correlations and\nabsolute error to evaluate the covariance. We evaluate TIC-TAC across multiple\nexperiments spanning synthetic and real-world datasets. Our results show that\nnot only does TIC accurately learn the covariance, it additionally facilitates\nan improved convergence of the negative log-likelihood. Our code is available\nat https://github.com/vita-epfl/TIC-TAC\n","authors":["Megh Shukla","Mathieu Salzmann","Alexandre Alahi"],"pdf_url":"https://arxiv.org/pdf/2310.18953v2.pdf","comment":"ICML 2024. Please feel free to provide feedback!"},{"id":"http://arxiv.org/abs/2402.01000v3","updated":"2024-05-31T14:49:11Z","published":"2024-02-01T20:27:19Z","title":"Multivariate Probabilistic Time Series Forecasting with Correlated\n  Errors","summary":"  Accurately modeling the correlation structure of errors is essential for\nreliable uncertainty quantification in probabilistic time series forecasting.\nRecent deep learning models for multivariate time series have developed\nefficient parameterizations for time-varying contemporaneous covariance, but\nthey often assume temporal independence of errors for simplicity. However,\nreal-world data frequently exhibit significant error autocorrelation and\ncross-lag correlation due to factors such as missing covariates. In this paper,\nwe present a plug-and-play method that learns the covariance structure of\nerrors over multiple steps for autoregressive models with Gaussian-distributed\nerrors. To achieve scalable inference and computational efficiency, we model\nthe contemporaneous covariance using a low-rank-plus-diagonal parameterization\nand characterize cross-covariance through a group of independent latent\ntemporal processes. The learned covariance matrix can be used to calibrate\npredictions based on observed residuals. We evaluate our method on\nprobabilistic models built on RNN and Transformer architectures, and the\nresults confirm the effectiveness of our approach in enhancing predictive\naccuracy and uncertainty quantification without significantly increasing the\nparameter size.\n","authors":["Vincent Zhihao Zheng","Lijun Sun"],"pdf_url":"https://arxiv.org/pdf/2402.01000v3.pdf","comment":"This paper extends the work presented in arXiv:2305.17028 to a\n  multivariate setting"},{"id":"http://arxiv.org/abs/2211.10737v4","updated":"2024-05-31T14:47:25Z","published":"2022-11-19T16:17:11Z","title":"Accuracy Booster: Enabling 4-bit Fixed-point Arithmetic for DNN Training","summary":"  The unprecedented demand for computing resources to train DNN models has led\nto a search for minimal numerical encoding. Recent state-of-the-art (SOTA)\nproposals advocate for multi-level scaled narrow bitwidth numerical formats. In\nthis paper, we show that single-level scaling is sufficient to maintain\ntraining accuracy while maximizing arithmetic density. We identify a previously\nproposed single-level scaled format for 8-bit training, Hybrid Block Floating\nPoint (HBFP), as the optimal candidate to minimize. We perform a full-scale\nexploration of the HBFP design space using mathematical tools to study the\ninterplay among various parameters and identify opportunities for even smaller\nencodings across layers and epochs. Based on our findings, we propose Accuracy\nBooster, a mixed-mantissa HBFP technique that uses 4-bit mantissas for over 99%\nof all arithmetic operations in training and 6-bit mantissas only in the last\nepoch and first/last layers. We show Accuracy Booster enables increasing\narithmetic density over all other SOTA formats by at least 2.3x while achieving\nstate-of-the-art accuracies in 4-bit training.\n","authors":["Simla Burcu Harma","Ayan Chakraborty","Nicholas Sperry","Babak Falsafi","Martin Jaggi","Yunho Oh"],"pdf_url":"https://arxiv.org/pdf/2211.10737v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.20860v1","updated":"2024-05-31T14:44:05Z","published":"2024-05-31T14:44:05Z","title":"Enhancing Efficiency of Safe Reinforcement Learning via Sample\n  Manipulation","summary":"  Safe reinforcement learning (RL) is crucial for deploying RL agents in\nreal-world applications, as it aims to maximize long-term rewards while\nsatisfying safety constraints. However, safe RL often suffers from sample\ninefficiency, requiring extensive interactions with the environment to learn a\nsafe policy. We propose Efficient Safe Policy Optimization (ESPO), a novel\napproach that enhances the efficiency of safe RL through sample manipulation.\nESPO employs an optimization framework with three modes: maximizing rewards,\nminimizing costs, and balancing the trade-off between the two. By dynamically\nadjusting the sampling process based on the observed conflict between reward\nand safety gradients, ESPO theoretically guarantees convergence, optimization\nstability, and improved sample complexity bounds. Experiments on the\nSafety-MuJoCo and Omnisafe benchmarks demonstrate that ESPO significantly\noutperforms existing primal-based and primal-dual-based baselines in terms of\nreward maximization and constraint satisfaction. Moreover, ESPO achieves\nsubstantial gains in sample efficiency, requiring 25--29% fewer samples than\nbaselines, and reduces training time by 21--38%.\n","authors":["Shangding Gu","Laixi Shi","Yuhao Ding","Alois Knoll","Costas Spanos","Adam Wierman","Ming Jin"],"pdf_url":"https://arxiv.org/pdf/2405.20860v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.20042v2","updated":"2024-05-31T14:42:52Z","published":"2024-05-30T13:23:02Z","title":"CycleFormer : TSP Solver Based on Language Modeling","summary":"  We propose a new transformer model for the Traveling Salesman Problem (TSP)\ncalled CycleFormer. We identified distinctive characteristics that need to be\nconsidered when applying a conventional transformer model to TSP and aimed to\nfully incorporate these elements into the TSP-specific transformer. Unlike the\ntoken sets in typical language models, which are limited and static, the token\n(node) set in TSP is unlimited and dynamic. To exploit this fact to the\nfullest, we equated the encoder output with the decoder linear layer and\ndirectly connected the context vector of the encoder to the decoder encoding.\nAdditionally, we added a positional encoding to the encoder tokens that\nreflects the two-dimensional nature of TSP, and devised a circular positional\nencoding for the decoder tokens that considers the cyclic properties of a tour.\nBy incorporating these ideas, CycleFormer outperforms state-of-the-art (SOTA)\ntransformer models for TSP from TSP-50 to TSP-500. Notably, on TSP-500, the\noptimality gap was reduced by approximately 2.8 times, from 3.09% to 1.10%,\ncompared to the existing SOTA. The code will be made available at\nhttps://github.com/Giventicket/CycleFormer.\n","authors":["Jieun Yook","Junpyo Seo","Joon Huh","Han Joon Byun","Byung-ro Mooon"],"pdf_url":"https://arxiv.org/pdf/2405.20042v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2103.05621v4","updated":"2024-05-31T14:35:18Z","published":"2021-03-09T18:46:01Z","title":"The Common Intuition to Transfer Learning Can Win or Lose: Case Studies\n  for Linear Regression","summary":"  We study a fundamental transfer learning process from source to target linear\nregression tasks, including overparameterized settings where there are more\nlearned parameters than data samples. The target task learning is addressed by\nusing its training data together with the parameters previously computed for\nthe source task. We define a transfer learning approach to the target task as a\nlinear regression optimization with a regularization on the distance between\nthe to-be-learned target parameters and the already-learned source parameters.\nWe analytically characterize the generalization performance of our transfer\nlearning approach and demonstrate its ability to resolve the peak in\ngeneralization errors in double descent phenomena of the minimum L2-norm\nsolution to linear regression. Moreover, we show that for sufficiently related\ntasks, the optimally tuned transfer learning approach can outperform the\noptimally tuned ridge regression method, even when the true parameter vector\nconforms to an isotropic Gaussian prior distribution. Namely, we demonstrate\nthat transfer learning can beat the minimum mean square error (MMSE) solution\nof the independent target task. Our results emphasize the ability of transfer\nlearning to extend the solution space to the target task and, by that, to have\nan improved MMSE solution. We formulate the linear MMSE solution to our\ntransfer learning setting and point out its key differences from the common\ndesign philosophy to transfer learning.\n","authors":["Yehuda Dar","Daniel LeJeune","Richard G. Baraniuk"],"pdf_url":"https://arxiv.org/pdf/2103.05621v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.20848v1","updated":"2024-05-31T14:32:31Z","published":"2024-05-31T14:32:31Z","title":"SLIM: a Scalable Light-weight Root Cause Analysis for Imbalanced Data in\n  Microservice","summary":"  The newly deployed service -- one kind of change service, could lead to a new\ntype of minority fault. Existing state-of-the-art methods for fault\nlocalization rarely consider the imbalanced fault classification in change\nservice. This paper proposes a novel method that utilizes decision rule sets to\ndeal with highly imbalanced data by optimizing the F1 score subject to\ncardinality constraints. The proposed method greedily generates the rule with\nmaximal marginal gain and uses an efficient minorize-maximization (MM) approach\nto select rules iteratively, maximizing a non-monotone submodular lower bound.\nCompared with existing fault localization algorithms, our algorithm can adapt\nto the imbalanced fault scenario of change service, and provide interpretable\nfault causes which are easy to understand and verify. Our method can also be\ndeployed in the online training setting, with only about 15% training overhead\ncompared to the current SOTA methods. Empirical studies showcase that our\nalgorithm outperforms existing fault localization algorithms in both accuracy\nand model interpretability.\n","authors":["Rui Ren","Jingbang Yang","Linxiao Yang","Xinyue Gu","Liang Sun"],"pdf_url":"https://arxiv.org/pdf/2405.20848v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.00846v2","updated":"2024-05-31T14:26:47Z","published":"2024-05-01T20:21:44Z","title":"Gameplay Filters: Safe Robot Walking through Adversarial Imagination","summary":"  Ensuring the safe operation of legged robots in uncertain, novel environments\nis crucial to their widespread adoption. Despite recent advances in safety\nfilters that can keep arbitrary task-driven policies from incurring safety\nfailures, existing solutions for legged robot locomotion still rely on\nsimplified dynamics and may fail when the robot is perturbed away from\npredefined stable gaits. This paper presents a general approach that leverages\noffline game-theoretic reinforcement learning to synthesize a highly robust\nsafety filter for high-order nonlinear dynamics. This gameplay filter then\nmaintains runtime safety by continually simulating adversarial futures and\nprecluding task-driven actions that would cause it to lose future games (and\nthereby violate safety). Validated on a 36-dimensional quadruped robot\nlocomotion task, the gameplay safety filter exhibits inherent robustness to the\nsim-to-real gap without manual tuning or heuristic designs. Physical\nexperiments demonstrate the effectiveness of the gameplay safety filter under\nperturbations, such as tugging and unmodeled irregular terrains, while\nsimulation studies shed light on how to trade off computation and\nconservativeness without compromising safety.\n","authors":["Duy P. Nguyen","Kai-Chieh Hsu","Wenhao Yu","Jie Tan","Jaime F. Fisac"],"pdf_url":"https://arxiv.org/pdf/2405.00846v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.16069v2","updated":"2024-05-31T14:25:58Z","published":"2024-05-25T05:40:16Z","title":"IncomeSCM: From tabular data set to time-series simulator and causal\n  estimation benchmark","summary":"  Evaluating observational estimators of causal effects demands information\nthat is rarely available: unconfounded interventions and outcomes from the\npopulation of interest, created either by randomization or adjustment. As a\nresult, it is customary to fall back on simulators when creating benchmark\ntasks. Simulators offer great control but are often too simplistic to make\nchallenging tasks, either because they are hand-designed and lack the nuances\nof real-world data, or because they are fit to observational data without\nstructural constraints. In this work, we propose a general, repeatable strategy\nfor turning observational data into sequential structural causal models and\nchallenging estimation tasks by following two simple principles: 1) fitting\nreal-world data where possible, and 2) creating complexity by composing simple,\nhand-designed mechanisms. We implement these ideas in a highly configurable\nsoftware package and apply it to the well-known Adult income data set to\nconstruct the \\tt IncomeSCM simulator. From this, we devise multiple estimation\ntasks and sample data sets to compare established estimators of causal effects.\nThe tasks present a suitable challenge, with effect estimates varying greatly\nin quality between methods, despite similar performance in the modeling of\nfactual outcomes, highlighting the need for dedicated causal estimators and\nmodel selection criteria.\n","authors":["Fredrik D. Johansson"],"pdf_url":"https://arxiv.org/pdf/2405.16069v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.20838v1","updated":"2024-05-31T14:25:45Z","published":"2024-05-31T14:25:45Z","title":"einspace: Searching for Neural Architectures from Fundamental Operations","summary":"  Neural architecture search (NAS) finds high performing networks for a given\ntask. Yet the results of NAS are fairly prosaic; they did not e.g. create a\nshift from convolutional structures to transformers. This is not least because\nthe search spaces in NAS often aren't diverse enough to include such\ntransformations a priori. Instead, for NAS to provide greater potential for\nfundamental design shifts, we need a novel expressive search space design which\nis built from more fundamental operations. To this end, we introduce einspace,\na search space based on a parameterised probabilistic context-free grammar. Our\nspace is versatile, supporting architectures of various sizes and complexities,\nwhile also containing diverse network operations which allow it to model\nconvolutions, attention components and more. It contains many existing\ncompetitive architectures, and provides flexibility for discovering new ones.\nUsing this search space, we perform experiments to find novel architectures as\nwell as improvements on existing ones on the diverse Unseen NAS datasets. We\nshow that competitive architectures can be obtained by searching from scratch,\nand we consistently find large improvements when initialising the search with\nstrong baselines. We believe that this work is an important advancement towards\na transformative NAS paradigm where search space expressivity and strategic\nsearch initialisation play key roles.\n","authors":["Linus Ericsson","Miguel Espinosa","Chenhongyi Yang","Antreas Antoniou","Amos Storkey","Shay B. Cohen","Steven McDonagh","Elliot J. Crowley"],"pdf_url":"https://arxiv.org/pdf/2405.20838v1.pdf","comment":"Project page at https://linusericsson.github.io/einspace/"},{"id":"http://arxiv.org/abs/2405.20836v1","updated":"2024-05-31T14:24:39Z","published":"2024-05-31T14:24:39Z","title":"Solving partial differential equations with sampled neural networks","summary":"  Approximation of solutions to partial differential equations (PDE) is an\nimportant problem in computational science and engineering. Using neural\nnetworks as an ansatz for the solution has proven a challenge in terms of\ntraining time and approximation accuracy. In this contribution, we discuss how\nsampling the hidden weights and biases of the ansatz network from data-agnostic\nand data-dependent probability distributions allows us to progress on both\nchallenges. In most examples, the random sampling schemes outperform iterative,\ngradient-based optimization of physics-informed neural networks regarding\ntraining time and accuracy by several orders of magnitude. For time-dependent\nPDE, we construct neural basis functions only in the spatial domain and then\nsolve the associated ordinary differential equation with classical methods from\nscientific computing over a long time horizon. This alleviates one of the\ngreatest challenges for neural PDE solvers because it does not require us to\nparameterize the solution in time. For second-order elliptic PDE in Barron\nspaces, we prove the existence of sampled networks with $L^2$ convergence to\nthe solution. We demonstrate our approach on several time-dependent and static\nPDEs. We also illustrate how sampled networks can effectively solve inverse\nproblems in this setting. Benefits compared to common numerical schemes include\nspectral convergence and mesh-free construction of basis functions.\n","authors":["Chinmay Datar","Taniya Kapoor","Abhishek Chandra","Qing Sun","Iryna Burak","Erik Lien Bolager","Anna Veselovska","Massimo Fornasier","Felix Dietrich"],"pdf_url":"https://arxiv.org/pdf/2405.20836v1.pdf","comment":"16 pages, 15 figures"},{"id":"http://arxiv.org/abs/2405.20835v1","updated":"2024-05-31T14:24:33Z","published":"2024-05-31T14:24:33Z","title":"Outliers and Calibration Sets have Diminishing Effect on Quantization of\n  Modern LLMs","summary":"  Post-Training Quantization (PTQ) enhances the efficiency of Large Language\nModels (LLMs) by enabling faster operation and compatibility with more\naccessible hardware through reduced memory usage, at the cost of small\nperformance drops. We explore the role of calibration sets in PTQ, specifically\ntheir effect on hidden activations in various notable open-source LLMs.\nCalibration sets are crucial for evaluating activation magnitudes and\nidentifying outliers, which can distort the quantization range and negatively\nimpact performance. Our analysis reveals a marked contrast in quantization\neffectiveness across models. The older OPT model, which much of the\nquantization literature is based on, shows significant performance\ndeterioration and high susceptibility to outliers with varying calibration\nsets. In contrast, newer models like Llama-2 7B, Llama-3 8B, Command-R 35B, and\nMistral 7B demonstrate strong robustness, with Mistral 7B showing near-immunity\nto outliers and stable activations. These findings suggest a shift in PTQ\nstrategies might be needed. As advancements in pre-training methods reduce the\nrelevance of outliers, there is an emerging need to reassess the fundamentals\nof current quantization literature. The emphasis should pivot towards\noptimizing inference speed, rather than primarily focusing on outlier\npreservation, to align with the evolving characteristics of state-of-the-art\nLLMs.\n","authors":["Davide Paglieri","Saurabh Dash","Tim Rocktäschel","Jack Parker-Holder"],"pdf_url":"https://arxiv.org/pdf/2405.20835v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.07217v2","updated":"2024-05-31T14:23:09Z","published":"2024-02-23T10:08:45Z","title":"Attention-aware Semantic Communications for Collaborative Inference","summary":"  We propose a communication-efficient collaborative inference framework in the\ndomain of edge inference, focusing on the efficient use of vision transformer\n(ViT) models. The partitioning strategy of conventional collaborative inference\nfails to reduce communication cost because of the inherent architecture of ViTs\nmaintaining consistent layer dimensions across the entire transformer encoder.\nTherefore, instead of employing the partitioning strategy, our framework\nutilizes a lightweight ViT model on the edge device, with the server deploying\na complicated ViT model. To enhance communication efficiency and achieve the\nclassification accuracy of the server model, we propose two strategies: 1)\nattention-aware patch selection and 2) entropy-aware image transmission.\nAttention-aware patch selection leverages the attention scores generated by the\nedge device's transformer encoder to identify and select the image patches\ncritical for classification. This strategy enables the edge device to transmit\nonly the essential patches to the server, significantly improving communication\nefficiency. Entropy-aware image transmission uses min-entropy as a metric to\naccurately determine whether to depend on the lightweight model on the edge\ndevice or to request the inference from the server model. In our framework, the\nlightweight ViT model on the edge device acts as a semantic encoder,\nefficiently identifying and selecting the crucial image information required\nfor the classification task. Our experiments demonstrate that the proposed\ncollaborative inference framework can reduce communication overhead by 68% with\nonly a minimal loss in accuracy compared to the server model on the ImageNet\ndataset.\n","authors":["Jiwoong Im","Nayoung Kwon","Taewoo Park","Jiheon Woo","Jaeho Lee","Yongjune Kim"],"pdf_url":"https://arxiv.org/pdf/2404.07217v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.20830v1","updated":"2024-05-31T14:21:04Z","published":"2024-05-31T14:21:04Z","title":"Self-Augmented Preference Optimization: Off-Policy Paradigms for\n  Language Model Alignment","summary":"  Traditional language model alignment methods, such as Direct Preference\nOptimization (DPO), are limited by their dependence on static, pre-collected\npaired preference data, which hampers their adaptability and practical\napplicability. To overcome this limitation, we introduce Self-Augmented\nPreference Optimization (SAPO), an effective and scalable training paradigm\nthat does not require existing paired data. Building on the self-play concept,\nwhich autonomously generates negative responses, we further incorporate an\noff-policy learning pipeline to enhance data exploration and exploitation.\nSpecifically, we employ an Exponential Moving Average (EMA) model in\nconjunction with a replay buffer to enable dynamic updates of response\nsegments, effectively integrating real-time feedback with insights from\nhistorical data. Our comprehensive evaluations of the LLaMA3-8B and Mistral-7B\nmodels across benchmarks, including the Open LLM Leaderboard, IFEval,\nAlpacaEval 2.0, and MT-Bench, demonstrate that SAPO matches or surpasses\nestablished offline contrastive baselines, such as DPO and Odds Ratio\nPreference Optimization, and outperforms offline self-play methods like SPIN.\nOur code is available at https://github.com/yinyueqin/SAPO\n","authors":["Yueqin Yin","Zhendong Wang","Yujia Xie","Weizhu Chen","Mingyuan Zhou"],"pdf_url":"https://arxiv.org/pdf/2405.20830v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.20829v1","updated":"2024-05-31T14:21:00Z","published":"2024-05-31T14:21:00Z","title":"Rethinking Open-World Semi-Supervised Learning: Distribution Mismatch\n  and Inductive Inference","summary":"  Open-world semi-supervised learning (OWSSL) extends conventional\nsemi-supervised learning to open-world scenarios by taking account of novel\ncategories in unlabeled datasets. Despite the recent advancements in OWSSL, the\nsuccess often relies on the assumptions that 1) labeled and unlabeled datasets\nshare the same balanced class prior distribution, which does not generally hold\nin real-world applications, and 2) unlabeled training datasets are utilized for\nevaluation, where such transductive inference might not adequately address\nchallenges in the wild. In this paper, we aim to generalize OWSSL by addressing\nthem. Our work suggests that practical OWSSL may require different training\nsettings, evaluation methods, and learning strategies compared to those\nprevalent in the existing literature.\n","authors":["Seongheon Park","Hyuk Kwon","Kwanghoon Sohn","Kibok Lee"],"pdf_url":"https://arxiv.org/pdf/2405.20829v1.pdf","comment":"CVPR Workshop on Computer Vision in the Wild (CVinW), 2024"},{"id":"http://arxiv.org/abs/2312.10045v2","updated":"2024-05-31T14:19:03Z","published":"2023-12-01T11:27:08Z","title":"Interpretable Knowledge Tracing via Response Influence-based\n  Counterfactual Reasoning","summary":"  Knowledge tracing (KT) plays a crucial role in computer-aided education and\nintelligent tutoring systems, aiming to assess students' knowledge proficiency\nby predicting their future performance on new questions based on their past\nresponse records. While existing deep learning knowledge tracing (DLKT) methods\nhave significantly improved prediction accuracy and achieved state-of-the-art\nresults, they often suffer from a lack of interpretability. To address this\nlimitation, current approaches have explored incorporating psychological\ninfluences to achieve more explainable predictions, but they tend to overlook\nthe potential influences of historical responses. In fact, understanding how\nmodels make predictions based on response influences can enhance the\ntransparency and trustworthiness of the knowledge tracing process, presenting\nan opportunity for a new paradigm of interpretable KT. However, measuring\nunobservable response influences is challenging. In this paper, we resort to\ncounterfactual reasoning that intervenes in each response to answer\n\\textit{what if a student had answered a question incorrectly that he/she\nactually answered correctly, and vice versa}. Based on this, we propose RCKT, a\nnovel response influence-based counterfactual knowledge tracing framework. RCKT\ngenerates response influences by comparing prediction outcomes from factual\nsequences and constructed counterfactual sequences after interventions.\nAdditionally, we introduce maximization and inference techniques to leverage\naccumulated influences from different past responses, further improving the\nmodel's performance and credibility. Extensive experimental results demonstrate\nthat our RCKT method outperforms state-of-the-art knowledge tracing methods on\nfour datasets against six baselines, and provides credible interpretations of\nresponse influences.\n","authors":["Jiajun Cui","Minghe Yu","Bo Jiang","Aimin Zhou","Jianyong Wang","Wei Zhang"],"pdf_url":"https://arxiv.org/pdf/2312.10045v2.pdf","comment":"ICDE'24 (fixing a few typos). Source code at\n  https://github.com/JJCui96/RCKT. Keywords: knowledge tracing, interpretable\n  machine learning, counterfactual reasoning, artificial intelligence for\n  education"},{"id":"http://arxiv.org/abs/2405.20825v1","updated":"2024-05-31T14:18:37Z","published":"2024-05-31T14:18:37Z","title":"Analysis of clinical, dosimetric and radiomic features for predicting\n  local failure after stereotactic radiotherapy of brain metastases in\n  malignant melanoma","summary":"  Background: The aim of this study was to investigate the role of clinical,\ndosimetric and pretherapeutic magnetic resonance imaging (MRI) features for\nlesion-specific outcome prediction of stereotactic radiotherapy (SRT) in\npatients with brain metastases from malignant melanoma (MBM).\n  Methods: In this multicenter, retrospective analysis, we reviewed 517 MBM\nfrom 130 patients treated with SRT (single fraction or hypofractionated). For\neach gross tumor volume (GTV) 1576 radiomic features (RF) were calculated (788\neach for the GTV and for a 3 mm margin around the GTV). Clinical parameters,\nradiation dose and RF from pretherapeutic contrast-enhanced T1-weighted MRI\nfrom different institutions were evaluated with a feature processing and\nelimination pipeline in a nested cross-validation scheme.\n  Results: Seventy-two (72) of 517 lesions (13.9%) showed a local failure (LF)\nafter SRT. The processing pipeline showed clinical, dosimetric and radiomic\nfeatures providing information for LF prediction. The most prominent ones were\nthe correlation of the gray level co-occurrence matrix of the margin (hazard\nratio (HR): 0.37, confidence interval (CI): 0.23-0.58) and systemic therapy\nbefore SRT (HR: 0.55, CI: 0.42-0.70). The majority of RF associated with LF was\ncalculated in the margin around the GTV.\n  Conclusions: Pretherapeutic MRI based RF connected with lesion-specific\noutcome after SRT could be identified, despite multicentric data and minor\ndifferences in imaging protocols. Image data analysis of the surrounding\nmetastatic environment may provide therapy-relevant information with the\npotential to further individualize radiotherapy strategies.\n","authors":["Nanna E. Hartong","Ilias Sachpazidis","Oliver Blanck","Lucas Etzel","Jan C. Peeken","Stephanie E. Combs","Horst Urbach","Maxim Zaitsev","Dimos Baltas","Ilinca Popp","Anca-Ligia Grosu","Tobias Fechter"],"pdf_url":"https://arxiv.org/pdf/2405.20825v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2212.14041v4","updated":"2024-05-31T14:18:31Z","published":"2022-12-02T16:34:56Z","title":"Deciphering RNA Secondary Structure Prediction: A Probabilistic K-Rook\n  Matching Perspective","summary":"  The secondary structure of ribonucleic acid (RNA) is more stable and\naccessible in the cell than its tertiary structure, making it essential for\nfunctional prediction. Although deep learning has shown promising results in\nthis field, current methods suffer from poor generalization and high\ncomplexity. In this work, we reformulate the RNA secondary structure prediction\nas a K-Rook problem, thereby simplifying the prediction process into\nprobabilistic matching within a finite solution space. Building on this\ninnovative perspective, we introduce RFold, a simple yet effective method that\nlearns to predict the most matching K-Rook solution from the given sequence.\nRFold employs a bi-dimensional optimization strategy that decomposes the\nprobabilistic matching problem into row-wise and column-wise components to\nreduce the matching complexity, simplifying the solving process while\nguaranteeing the validity of the output. Extensive experiments demonstrate that\nRFold achieves competitive performance and about eight times faster inference\nefficiency than the state-of-the-art approaches. The code and Colab demo are\navailable in\n\\href{http://github.com/A4Bio/RFold}{http://github.com/A4Bio/RFold}.\n","authors":["Cheng Tan","Zhangyang Gao","Hanqun Cao","Xingran Chen","Ge Wang","Lirong Wu","Jun Xia","Jiangbin Zheng","Stan Z. Li"],"pdf_url":"https://arxiv.org/pdf/2212.14041v4.pdf","comment":"Accepted by ICML 2024"},{"id":"http://arxiv.org/abs/2405.20824v1","updated":"2024-05-31T14:16:52Z","published":"2024-05-31T14:16:52Z","title":"Online Convex Optimisation: The Optimal Switching Regret for all\n  Segmentations Simultaneously","summary":"  We consider the classic problem of online convex optimisation. Whereas the\nnotion of static regret is relevant for stationary problems, the notion of\nswitching regret is more appropriate for non-stationary problems. A switching\nregret is defined relative to any segmentation of the trial sequence, and is\nequal to the sum of the static regrets of each segment. In this paper we show\nthat, perhaps surprisingly, we can achieve the asymptotically optimal switching\nregret on every possible segmentation simultaneously. Our algorithm for doing\nso is very efficient: having a space and per-trial time complexity that is\nlogarithmic in the time-horizon. Our algorithm also obtains novel bounds on its\ndynamic regret: being adaptive to variations in the rate of change of the\ncomparator sequence.\n","authors":["Stephen Pasteris","Chris Hicks","Vasilios Mavroudis","Mark Herbster"],"pdf_url":"https://arxiv.org/pdf/2405.20824v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.20821v1","updated":"2024-05-31T14:15:44Z","published":"2024-05-31T14:15:44Z","title":"Pursuing Overall Welfare in Federated Learning through Sequential\n  Decision Making","summary":"  In traditional federated learning, a single global model cannot perform\nequally well for all clients. Therefore, the need to achieve the client-level\nfairness in federated system has been emphasized, which can be realized by\nmodifying the static aggregation scheme for updating the global model to an\nadaptive one, in response to the local signals of the participating clients.\nOur work reveals that existing fairness-aware aggregation strategies can be\nunified into an online convex optimization framework, in other words, a central\nserver's sequential decision making process. To enhance the decision making\ncapability, we propose simple and intuitive improvements for suboptimal designs\nwithin existing methods, presenting AAggFF. Considering practical requirements,\nwe further subdivide our method tailored for the cross-device and the\ncross-silo settings, respectively. Theoretical analyses guarantee sublinear\nregret upper bounds for both settings: $\\mathcal{O}(\\sqrt{T \\log{K}})$ for the\ncross-device setting, and $\\mathcal{O}(K \\log{T})$ for the cross-silo setting,\nwith $K$ clients and $T$ federation rounds. Extensive experiments demonstrate\nthat the federated system equipped with AAggFF achieves better degree of\nclient-level fairness than existing methods in both practical settings. Code is\navailable at https://github.com/vaseline555/AAggFF\n","authors":["Seok-Ju Hahn","Gi-Soo Kim","Junghye Lee"],"pdf_url":"https://arxiv.org/pdf/2405.20821v1.pdf","comment":"Accepted at ICML 2024"},{"id":"http://arxiv.org/abs/2405.20808v1","updated":"2024-05-31T14:07:33Z","published":"2024-05-31T14:07:33Z","title":"Optimally Improving Cooperative Learning in a Social Setting","summary":"  We consider a cooperative learning scenario where a collection of networked\nagents with individually owned classifiers dynamically update their\npredictions, for the same classification task, through communication or\nobservations of each other's predictions. Clearly if highly influential\nvertices use erroneous classifiers, there will be a negative effect on the\naccuracy of all the agents in the network. We ask the following question: how\ncan we optimally fix the prediction of a few classifiers so as maximize the\noverall accuracy in the entire network. To this end we consider an aggregate\nand an egalitarian objective function. We show a polynomial time algorithm for\noptimizing the aggregate objective function, and show that optimizing the\negalitarian objective function is NP-hard. Furthermore, we develop\napproximation algorithms for the egalitarian improvement. The performance of\nall of our algorithms are guaranteed by mathematical analysis and backed by\nexperiments on synthetic and real data.\n","authors":["Shahrzad Haddadan","Cheng Xin","Jie Gao"],"pdf_url":"https://arxiv.org/pdf/2405.20808v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.17810v2","updated":"2024-05-31T14:07:00Z","published":"2024-02-27T12:43:09Z","title":"BioT5+: Towards Generalized Biological Understanding with IUPAC\n  Integration and Multi-task Tuning","summary":"  Recent research trends in computational biology have increasingly focused on\nintegrating text and bio-entity modeling, especially in the context of\nmolecules and proteins. However, previous efforts like BioT5 faced challenges\nin generalizing across diverse tasks and lacked a nuanced understanding of\nmolecular structures, particularly in their textual representations (e.g.,\nIUPAC). This paper introduces BioT5+, an extension of the BioT5 framework,\ntailored to enhance biological research and drug discovery. BioT5+ incorporates\nseveral novel features: integration of IUPAC names for molecular understanding,\ninclusion of extensive bio-text and molecule data from sources like bioRxiv and\nPubChem, the multi-task instruction tuning for generality across tasks, and a\nnumerical tokenization technique for improved processing of numerical data.\nThese enhancements allow BioT5+ to bridge the gap between molecular\nrepresentations and their textual descriptions, providing a more holistic\nunderstanding of biological entities, and largely improving the grounded\nreasoning of bio-text and bio-sequences. The model is pre-trained and\nfine-tuned with a large number of experiments, including \\emph{3 types of\nproblems (classification, regression, generation), 15 kinds of tasks, and 21\ntotal benchmark datasets}, demonstrating the remarkable performance and\nstate-of-the-art results in most cases. BioT5+ stands out for its ability to\ncapture intricate relationships in biological data, thereby contributing\nsignificantly to bioinformatics and computational biology. Our code is\navailable at \\url{https://github.com/QizhiPei/BioT5}.\n","authors":["Qizhi Pei","Lijun Wu","Kaiyuan Gao","Xiaozhuan Liang","Yin Fang","Jinhua Zhu","Shufang Xie","Tao Qin","Rui Yan"],"pdf_url":"https://arxiv.org/pdf/2402.17810v2.pdf","comment":"Accepted by ACL 2024 (Findings)"},{"id":"http://arxiv.org/abs/2402.12550v2","updated":"2024-05-31T14:04:05Z","published":"2024-02-19T21:20:22Z","title":"Multilinear Mixture of Experts: Scalable Expert Specialization through\n  Factorization","summary":"  The Mixture of Experts (MoE) paradigm provides a powerful way to decompose\ndense layers into smaller, modular computations often more amenable to human\ninterpretation, debugging, and editability. However, a major challenge lies in\nthe computational cost of scaling the number of experts high enough to achieve\nfine-grained specialization. In this paper, we propose the Multilinear Mixture\nof Experts ($\\mu$MoE) layer to address this, focusing on vision models.\n$\\mu$MoE layers enable scalable expert specialization by performing an implicit\ncomputation on prohibitively large weight tensors entirely in factorized form.\nConsequently, $\\mu$MoEs (1) avoid the restrictively high inference-time costs\nof 'soft' MoEs, yet (2) do not inherit the training issues of the popular\n'sparse' MoEs' discrete (non-differentiable) expert routing. We present both\nqualitative and quantitative evidence that scaling $\\mu$MoE layers when\nfine-tuning foundation models for vision tasks leads to more specialized\nexperts at the class-level, further enabling manual bias correction in CelebA\nattribute classification. Finally, we show qualitative results demonstrating\nthe expert specialism achieved when pre-training large GPT2 and MLP-Mixer\nmodels with parameter-matched $\\mu$MoE blocks at every layer, maintaining\ncomparable accuracy. Our code is available at:\nhttps://github.com/james-oldfield/muMoE.\n","authors":["James Oldfield","Markos Georgopoulos","Grigorios G. Chrysos","Christos Tzelepis","Yannis Panagakis","Mihalis A. Nicolaou","Jiankang Deng","Ioannis Patras"],"pdf_url":"https://arxiv.org/pdf/2402.12550v2.pdf","comment":"Github: https://github.com/james-oldfield/muMoE. Project page:\n  https://james-oldfield.github.io/muMoE/"},{"id":"http://arxiv.org/abs/2305.15805v3","updated":"2024-05-31T14:02:24Z","published":"2023-05-25T07:39:41Z","title":"Dynamic Context Pruning for Efficient and Interpretable Autoregressive\n  Transformers","summary":"  Autoregressive Transformers adopted in Large Language Models (LLMs) are hard\nto scale to long sequences. Despite several works trying to reduce their\ncomputational cost, most of LLMs still adopt attention layers between all pairs\nof tokens in the sequence, thus incurring a quadratic cost. In this study, we\npresent a novel approach that dynamically prunes contextual information while\npreserving the model's expressiveness, resulting in reduced memory and\ncomputational requirements during inference. Our method employs a learnable\nmechanism that determines which uninformative tokens can be dropped from the\ncontext at any point across the generation process. By doing so, our approach\nnot only addresses performance concerns but also enhances interpretability,\nproviding valuable insight into the model's decision-making process. Our\ntechnique can be applied to existing pre-trained models through a\nstraightforward fine-tuning process, and the pruning strength can be specified\nby a sparsity parameter. Notably, our empirical findings demonstrate that we\ncan effectively prune up to 80\\% of the context without significant performance\ndegradation on downstream tasks, offering a valuable tool for mitigating\ninference costs. Our reference implementation achieves up to $2\\times$ increase\nin inference throughput and even greater memory savings.\n","authors":["Sotiris Anagnostidis","Dario Pavllo","Luca Biggio","Lorenzo Noci","Aurelien Lucchi","Thomas Hofmann"],"pdf_url":"https://arxiv.org/pdf/2305.15805v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.15154v2","updated":"2024-05-31T14:01:32Z","published":"2024-05-24T02:13:46Z","title":"Online Prompt Pricing based on Combinatorial Multi-Armed Bandit and\n  Hierarchical Stackelberg Game","summary":"  Generation models have shown promising performance in various tasks, making\ntrading around machine learning models possible. In this paper, we aim at a\nnovel prompt trading scenario, prompt bundle trading (PBT) system, and propose\nan online pricing mechanism. Based on the combinatorial multi-armed bandit\n(CMAB) and three-stage hierarchical Stackelburg (HS) game, our pricing\nmechanism considers the profits of the consumer, platform, and seller,\nsimultaneously achieving the profit satisfaction of these three participants.\nWe break down the pricing issue into two steps, namely unknown category\nselection and incentive strategy optimization. The former step is to select a\nset of categories with the highest qualities, and the latter is to derive the\noptimal strategy for each participant based on the chosen categories. Unlike\nthe existing fixed pricing mode, the PBT pricing mechanism we propose is more\nflexible and diverse, which is more in accord with the transaction needs of\nreal-world scenarios. We test our method on a simulated text-to-image dataset.\nThe experimental results demonstrate the effectiveness of our algorithm, which\nprovides a feasible price-setting standard for the prompt marketplaces.\n","authors":["Meiling Li","Hongrun Ren","Haixu Xiong","Zhenxing Qian","Xinpeng Zhang"],"pdf_url":"https://arxiv.org/pdf/2405.15154v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.20800v1","updated":"2024-05-31T14:01:12Z","published":"2024-05-31T14:01:12Z","title":"Shape Constraints in Symbolic Regression using Penalized Least Squares","summary":"  We study the addition of shape constraints and their consideration during the\nparameter estimation step of symbolic regression (SR). Shape constraints serve\nas a means to introduce prior knowledge about the shape of the otherwise\nunknown model function into SR. Unlike previous works that have explored shape\nconstraints in SR, we propose minimizing shape constraint violations during\nparameter estimation using gradient-based numerical optimization.\n  We test three algorithm variants to evaluate their performance in identifying\nthree symbolic expressions from a synthetically generated data set. This paper\nexamines two benchmark scenarios: one with varying noise levels and another\nwith reduced amounts of training data. The results indicate that incorporating\nshape constraints into the expression search is particularly beneficial when\ndata is scarce. Compared to using shape constraints only in the selection\nprocess, our approach of minimizing violations during parameter estimation\nshows a statistically significant benefit in some of our test cases, without\nbeing significantly worse in any instance.\n","authors":["Viktor Martinek","Julia Reuter","Ophelia Frotscher","Sanaz Mostaghim","Markus Richter","Roland Herzog"],"pdf_url":"https://arxiv.org/pdf/2405.20800v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.20799v1","updated":"2024-05-31T14:00:44Z","published":"2024-05-31T14:00:44Z","title":"Rough Transformers: Lightweight Continuous-Time Sequence Modelling with\n  Path Signatures","summary":"  Time-series data in real-world settings typically exhibit long-range\ndependencies and are observed at non-uniform intervals. In these settings,\ntraditional sequence-based recurrent models struggle. To overcome this,\nresearchers often replace recurrent architectures with Neural ODE-based models\nto account for irregularly sampled data and use Transformer-based architectures\nto account for long-range dependencies. Despite the success of these two\napproaches, both incur very high computational costs for input sequences of\neven moderate length. To address this challenge, we introduce the Rough\nTransformer, a variation of the Transformer model that operates on\ncontinuous-time representations of input sequences and incurs significantly\nlower computational costs. In particular, we propose \\textit{multi-view\nsignature attention}, which uses path signatures to augment vanilla attention\nand to capture both local and global (multi-scale) dependencies in the input\ndata, while remaining robust to changes in the sequence length and sampling\nfrequency and yielding improved spatial processing. We find that, on a variety\nof time-series-related tasks, Rough Transformers consistently outperform their\nvanilla attention counterparts while obtaining the representational benefits of\nNeural ODE-based models, all at a fraction of the computational time and memory\nresources.\n","authors":["Fernando Moreno-Pino","Álvaro Arroyo","Harrison Waldon","Xiaowen Dong","Álvaro Cartea"],"pdf_url":"https://arxiv.org/pdf/2405.20799v1.pdf","comment":"Preprint. Under review. arXiv admin note: text overlap with\n  arXiv:2403.10288"},{"id":"http://arxiv.org/abs/2402.09838v2","updated":"2024-05-31T13:59:44Z","published":"2024-02-15T10:00:13Z","title":"Performative Reinforcement Learning in Gradually Shifting Environments","summary":"  When Reinforcement Learning (RL) agents are deployed in practice, they might\nimpact their environment and change its dynamics. We propose a new framework to\nmodel this phenomenon, where the current environment depends on the deployed\npolicy as well as its previous dynamics. This is a generalization of\nPerformative RL (PRL) [Mandal et al., 2023]. Unlike PRL, our framework allows\nto model scenarios where the environment gradually adjusts to a deployed\npolicy. We adapt two algorithms from the performative prediction literature to\nour setting and propose a novel algorithm called Mixed Delayed Repeated\nRetraining (MDRR). We provide conditions under which these algorithms converge\nand compare them using three metrics: number of retrainings, approximation\nguarantee, and number of samples per deployment. MDRR is the first algorithm in\nthis setting which combines samples from multiple deployments in its training.\nThis makes MDRR particularly suitable for scenarios where the environment's\nresponse strongly depends on its previous dynamics, which are common in\npractice. We experimentally compare the algorithms using a simulation-based\ntestbed and our results show that MDRR converges significantly faster than\nprevious approaches.\n","authors":["Ben Rank","Stelios Triantafyllou","Debmalya Mandal","Goran Radanovic"],"pdf_url":"https://arxiv.org/pdf/2402.09838v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.20797v1","updated":"2024-05-31T13:59:18Z","published":"2024-05-31T13:59:18Z","title":"Ovis: Structural Embedding Alignment for Multimodal Large Language Model","summary":"  Current Multimodal Large Language Models (MLLMs) typically integrate a\npre-trained LLM with another pre-trained vision transformer through a\nconnector, such as an MLP, endowing the LLM with visual capabilities. However,\nthe misalignment between two embedding strategies in MLLMs -- the structural\ntextual embeddings based on an embedding look-up table and the continuous\nembeddings generated directly by the vision encoder -- makes challenges for a\nmore seamless fusion of visual and textual information. We propose Ovis, a\nnovel MLLM architecture designed to structurally align visual and textual\nembeddings. Ovis integrates an additional learnable visual embedding table into\nthe visual encoder's process. To capture rich visual semantics, each image\npatch indexes the visual embedding table multiple times, resulting in a final\nvisual embedding that is a probabilistic combination of the indexed embeddings.\nThis structural approach mirrors the method used for generating textual\nembeddings. Empirical evaluations on various multimodal benchmarks demonstrate\nthat Ovis outperforms open-source MLLMs of similar parameter scales and even\nsurpasses the proprietary model Qwen-VL-Plus overall. These results highlight\nthe potential of Ovis' structured visual representation for advancing MLLM\narchitectural design and promoting more effective multimodal learning. Both the\nsource code and the training dataset of Ovis will be made publicly available.\n","authors":["Shiyin Lu","Yang Li","Qing-Guo Chen","Zhao Xu","Weihua Luo","Kaifu Zhang","Han-Jia Ye"],"pdf_url":"https://arxiv.org/pdf/2405.20797v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.20794v1","updated":"2024-05-31T13:54:25Z","published":"2024-05-31T13:54:25Z","title":"Model Interpretation and Explainability: Towards Creating Transparency\n  in Prediction Models","summary":"  Explainable AI (XAI) has a counterpart in analytical modeling which we refer\nto as model explainability. We tackle the issue of model explainability in the\ncontext of prediction models. We analyze a dataset of loans from a credit card\ncompany and apply three stages: execute and compare four different prediction\nmethods, apply the best known explainability techniques in the current\nliterature to the model training sets to identify feature importance (FI)\n(static case), and finally to cross-check whether the FI set holds up under\nwhat if prediction scenarios for continuous and categorical variables (dynamic\ncase). We found inconsistency in FI identification between the static and\ndynamic cases. We summarize the state of the art in model explainability and\nsuggest further research to advance the field.\n","authors":["Donald Kridel","Jacob Dineen","Daniel Dolk","David Castillo"],"pdf_url":"https://arxiv.org/pdf/2405.20794v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.13397v3","updated":"2024-05-31T13:53:26Z","published":"2023-10-20T10:12:06Z","title":"Equivariant Deep Weight Space Alignment","summary":"  Permutation symmetries of deep networks make basic operations like model\nmerging and similarity estimation challenging. In many cases, aligning the\nweights of the networks, i.e., finding optimal permutations between their\nweights, is necessary. Unfortunately, weight alignment is an NP-hard problem.\nPrior research has mainly focused on solving relaxed versions of the alignment\nproblem, leading to either time-consuming methods or sub-optimal solutions. To\naccelerate the alignment process and improve its quality, we propose a novel\nframework aimed at learning to solve the weight alignment problem, which we\nname Deep-Align. To that end, we first prove that weight alignment adheres to\ntwo fundamental symmetries and then, propose a deep architecture that respects\nthese symmetries. Notably, our framework does not require any labeled data. We\nprovide a theoretical analysis of our approach and evaluate Deep-Align on\nseveral types of network architectures and learning setups. Our experimental\nresults indicate that a feed-forward pass with Deep-Align produces better or\nequivalent alignments compared to those produced by current optimization\nalgorithms. Additionally, our alignments can be used as an effective\ninitialization for other methods, leading to improved solutions with a\nsignificant speedup in convergence.\n","authors":["Aviv Navon","Aviv Shamsian","Ethan Fetaya","Gal Chechik","Nadav Dym","Haggai Maron"],"pdf_url":"https://arxiv.org/pdf/2310.13397v3.pdf","comment":"ICML 2024"},{"id":"http://arxiv.org/abs/2405.20791v1","updated":"2024-05-31T13:48:54Z","published":"2024-05-31T13:48:54Z","title":"GS-Phong: Meta-Learned 3D Gaussians for Relightable Novel View Synthesis","summary":"  Decoupling the illumination in 3D scenes is crucial for novel view synthesis\nand relighting. In this paper, we propose a novel method for representing a\nscene illuminated by a point light using a set of relightable 3D Gaussian\npoints. Inspired by the Blinn-Phong model, our approach decomposes the scene\ninto ambient, diffuse, and specular components, enabling the synthesis of\nrealistic lighting effects. To facilitate the decomposition of geometric\ninformation independent of lighting conditions, we introduce a novel bilevel\noptimization-based meta-learning framework. The fundamental idea is to view the\nrendering tasks under various lighting positions as a multi-task learning\nproblem, which our meta-learning approach effectively addresses by generalizing\nthe learned Gaussian geometries not only across different viewpoints but also\nacross diverse light positions. Experimental results demonstrate the\neffectiveness of our approach in terms of training efficiency and rendering\nquality compared to existing methods for free-viewpoint relighting.\n","authors":["Yumeng He","Yunbo Wang","Xiaokang Yang"],"pdf_url":"https://arxiv.org/pdf/2405.20791v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.20790v1","updated":"2024-05-31T13:45:52Z","published":"2024-05-31T13:45:52Z","title":"Intersectional Unfairness Discovery","summary":"  AI systems have been shown to produce unfair results for certain subgroups of\npopulation, highlighting the need to understand bias on certain sensitive\nattributes. Current research often falls short, primarily focusing on the\nsubgroups characterized by a single sensitive attribute, while neglecting the\nnature of intersectional fairness of multiple sensitive attributes. This paper\nfocuses on its one fundamental aspect by discovering diverse high-bias\nsubgroups under intersectional sensitive attributes. Specifically, we propose a\nBias-Guided Generative Network (BGGN). By treating each bias value as a reward,\nBGGN efficiently generates high-bias intersectional sensitive attributes.\nExperiments on real-world text and image datasets demonstrate a diverse and\nefficient discovery of BGGN. To further evaluate the generated unseen but\npossible unfair intersectional sensitive attributes, we formulate them as\nprompts and use modern generative AI to produce new texts and images. The\nresults of frequently generating biased data provides new insights of\ndiscovering potential unfairness in popular modern generative AI systems.\nWarning: This paper contains generative examples that are offensive in nature.\n","authors":["Gezheng Xu","Qi Chen","Charles Ling","Boyu Wang","Changjian Shui"],"pdf_url":"https://arxiv.org/pdf/2405.20790v1.pdf","comment":"ICML-2024 Camera-ready"},{"id":"http://arxiv.org/abs/2405.20772v1","updated":"2024-05-31T13:28:37Z","published":"2024-05-31T13:28:37Z","title":"Reinforcement Learning for Sociohydrology","summary":"  In this study, we discuss how reinforcement learning (RL) provides an\neffective and efficient framework for solving sociohydrology problems. The\nefficacy of RL for these types of problems is evident because of its ability to\nupdate policies in an iterative manner - something that is also foundational to\nsociohydrology, where we are interested in representing the co-evolution of\nhuman-water interactions. We present a simple case study to demonstrate the\nimplementation of RL in a problem of runoff reduction through management\ndecisions related to changes in land-use land-cover (LULC). We then discuss the\nbenefits of RL for these types of problems and share our perspectives on the\nfuture research directions in this area.\n","authors":["Tirthankar Roy","Shivendra Srivastava","Beichen Zhang"],"pdf_url":"https://arxiv.org/pdf/2405.20772v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.10144v2","updated":"2024-05-31T13:11:15Z","published":"2024-03-15T09:43:52Z","title":"NLP Verification: Towards a General Methodology for Certifying\n  Robustness","summary":"  Deep neural networks have exhibited substantial success in the field of\nNatural Language Processing and ensuring their safety and reliability is\ncrucial: there are safety critical contexts where such models must be robust to\nvariability or attack, and give guarantees over their output. Unlike Computer\nVision, NLP lacks a unified verification methodology and, despite recent\nadvancements in literature, they are often light on the pragmatical issues of\nNLP verification. In this paper, we attempt to distil and evaluate general\ncomponents of an NLP verification pipeline, that emerges from the progress in\nthe field to date. Our contributions are two-fold. Firstly, we give a general\n(i.e. algorithm-independent) characterisation of verifiable subspaces that\nresult from embedding sentences into continuous spaces. We identify, and give\nan effective method to deal with, the technical challenge of semantic\ngeneralisability of verified subspaces; and propose it as a standard metric in\nthe NLP verification pipelines (alongside with the standard metrics of model\naccuracy and model verifiability). Secondly, we propose a general methodology\nto analyse the effect of the embedding gap -- a problem that refers to the\ndiscrepancy between verification of geometric subspaces, and the semantic\nmeaning of sentences which the geometric subspaces are supposed to represent.\nIn extreme cases, poor choices in embedding of sentences may invalidate\nverification results. We propose a number of practical NLP methods that can\nhelp to quantify the effects of the embedding gap; and in particular we propose\nthe metric of falsifiability of semantic subspaces as another fundamental\nmetric to be reported as part of the NLP verification pipeline. We believe that\ntogether these general principles pave the way towards a more consolidated and\neffective development of this new domain.\n","authors":["Marco Casadio","Tanvi Dinkar","Ekaterina Komendantskaya","Luca Arnaboldi","Matthew L. Daggitt","Omri Isac","Guy Katz","Verena Rieser","Oliver Lemon"],"pdf_url":"https://arxiv.org/pdf/2403.10144v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.20763v1","updated":"2024-05-31T12:32:34Z","published":"2024-05-31T12:32:34Z","title":"Improving Generalization and Convergence by Enhancing Implicit\n  Regularization","summary":"  In this work, we propose an Implicit Regularization Enhancement (IRE)\nframework to accelerate the discovery of flat solutions in deep learning,\nthereby improving generalization and convergence. Specifically, IRE decouples\nthe dynamics of flat and sharp directions, which boosts the sharpness reduction\nalong flat directions while maintaining the training stability in sharp\ndirections. We show that IRE can be practically incorporated with {\\em generic\nbase optimizers} without introducing significant computational overload.\nExperiments show that IRE consistently improves the generalization performance\nfor image classification tasks across a variety of benchmark datasets\n(CIFAR-10/100, ImageNet) and models (ResNets and ViTs). Surprisingly, IRE also\nachieves a $2\\times$ {\\em speed-up} compared to AdamW in the pre-training of\nLlama models (of sizes ranging from 60M to 229M) on datasets including\nWikitext-103, Minipile, and Openwebtext. Moreover, we provide theoretical\nguarantees, showing that IRE can substantially accelerate the convergence\ntowards flat minima in Sharpness-aware Minimization (SAM).\n","authors":["Mingze Wang","Haotian He","Jinbo Wang","Zilin Wang","Guanhua Huang","Feiyu Xiong","Zhiyu Li","Weinan E","Lei Wu"],"pdf_url":"https://arxiv.org/pdf/2405.20763v1.pdf","comment":"35 pages"},{"id":"http://arxiv.org/abs/2402.07043v2","updated":"2024-05-31T12:27:52Z","published":"2024-02-10T21:06:34Z","title":"A Tale of Tails: Model Collapse as a Change of Scaling Laws","summary":"  As AI model size grows, neural scaling laws have become a crucial tool to\npredict the improvements of large models when increasing capacity and the size\nof original (human or natural) training data. Yet, the widespread use of\npopular models means that the ecosystem of online data and text will co-evolve\nto progressively contain increased amounts of synthesized data. In this paper\nwe ask: How will the scaling laws change in the inevitable regime where\nsynthetic data makes its way into the training corpus? Will future models,\nstill improve, or be doomed to degenerate up to total (model) collapse? We\ndevelop a theoretical framework of model collapse through the lens of scaling\nlaws. We discover a wide range of decay phenomena, analyzing loss of scaling,\nshifted scaling with number of generations, the ''un-learning\" of skills, and\ngrokking when mixing human and synthesized data. Our theory is validated by\nlarge-scale experiments with a transformer on an arithmetic task and text\ngeneration using the large language model Llama2.\n","authors":["Elvis Dohmatob","Yunzhen Feng","Pu Yang","Francois Charton","Julia Kempe"],"pdf_url":"https://arxiv.org/pdf/2402.07043v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.20761v1","updated":"2024-05-31T12:27:38Z","published":"2024-05-31T12:27:38Z","title":"Share Your Secrets for Privacy! Confidential Forecasting with Vertical\n  Federated Learning","summary":"  Vertical federated learning (VFL) is a promising area for time series\nforecasting in industrial applications, such as predictive maintenance and\nmachine control. Critical challenges to address in manufacturing include data\nprivacy and over-fitting on small and noisy datasets during both training and\ninference. Additionally, to increase industry adaptability, such forecasting\nmodels must scale well with the number of parties while ensuring strong\nconvergence and low-tuning complexity. We address those challenges and propose\n'Secret-shared Time Series Forecasting with VFL' (STV), a novel framework that\nexhibits the following key features: i) a privacy-preserving algorithm for\nforecasting with SARIMAX and autoregressive trees on vertically partitioned\ndata; ii) serverless forecasting using secret sharing and multi-party\ncomputation; iii) novel N-party algorithms for matrix multiplication and\ninverse operations for direct parameter optimization, giving strong convergence\nwith minimal hyperparameter tuning complexity. We conduct evaluations on six\nrepresentative datasets from public and industry-specific contexts. Our results\ndemonstrate that STV's forecasting accuracy is comparable to those of\ncentralized approaches. They also show that our direct optimization can\noutperform centralized methods, which include state-of-the-art diffusion models\nand long-short-term memory, by 23.81% on forecasting accuracy. We also conduct\na scalability analysis by examining the communication costs of direct and\niterative optimization to navigate the choice between the two. Code and\nappendix are available: https://github.com/adis98/STV\n","authors":["Aditya Shankar","Lydia Y. Chen","Jérémie Decouchant","Dimitra Gkorou","Rihan Hai"],"pdf_url":"https://arxiv.org/pdf/2405.20761v1.pdf","comment":"Submitted to the 27TH EUROPEAN CONFERENCE ON ARTIFICIAL INTELLIGENCE\n  (ECAI 2024)"},{"id":"http://arxiv.org/abs/2309.16476v2","updated":"2024-05-31T12:25:31Z","published":"2023-09-28T14:39:50Z","title":"High-dimensional robust regression under heavy-tailed data: Asymptotics\n  and Universality","summary":"  We investigate the high-dimensional properties of robust regression\nestimators in the presence of heavy-tailed contamination of both the covariates\nand response functions. In particular, we provide a sharp asymptotic\ncharacterisation of M-estimators trained on a family of elliptical covariate\nand noise data distributions including cases where second and higher moments do\nnot exist. We show that, despite being consistent, the Huber loss with\noptimally tuned location parameter $\\delta$ is suboptimal in the\nhigh-dimensional regime in the presence of heavy-tailed noise, highlighting the\nnecessity of further regularisation to achieve optimal performance. This result\nalso uncovers the existence of a transition in $\\delta$ as a function of the\nsample complexity and contamination. Moreover, we derive the decay rates for\nthe excess risk of ridge regression. We show that, while it is both optimal and\nuniversal for covariate distributions with finite second moment, its decay rate\ncan be considerably faster when the covariates' second moment does not exist.\nFinally, we show that our formulas readily generalise to a richer family of\nmodels and data distributions, such as generalised linear estimation with\narbitrary convex regularisation trained on mixture models.\n","authors":["Urte Adomaityte","Leonardo Defilippis","Bruno Loureiro","Gabriele Sicuro"],"pdf_url":"https://arxiv.org/pdf/2309.16476v2.pdf","comment":"13 pages + Supplementary information"},{"id":"http://arxiv.org/abs/2405.20759v1","updated":"2024-05-31T12:20:02Z","published":"2024-05-31T12:20:02Z","title":"Information Theoretic Text-to-Image Alignment","summary":"  Diffusion models for Text-to-Image (T2I) conditional generation have seen\ntremendous success recently. Despite their success, accurately capturing user\nintentions with these models still requires a laborious trial and error\nprocess. This challenge is commonly identified as a model alignment problem, an\nissue that has attracted considerable attention by the research community.\nInstead of relying on fine-grained linguistic analyses of prompts, human\nannotation, or auxiliary vision-language models to steer image generation, in\nthis work we present a novel method that relies on an information-theoretic\nalignment measure. In a nutshell, our method uses self-supervised fine-tuning\nand relies on point-wise mutual information between prompts and images to\ndefine a synthetic training set to induce model alignment. Our comparative\nanalysis shows that our method is on-par or superior to the state-of-the-art,\nyet requires nothing but a pre-trained denoising network to estimate MI and a\nlightweight fine-tuning strategy.\n","authors":["Chao Wang","Giulio Franzese","Alessandro Finamore","Massimo Gallo","Pietro Michiardi"],"pdf_url":"https://arxiv.org/pdf/2405.20759v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.16056v3","updated":"2024-05-31T11:44:39Z","published":"2024-05-25T04:51:41Z","title":"FedSheafHN: Personalized Federated Learning on Graph-structured Data","summary":"  Personalized subgraph Federated Learning (FL) is a task that customizes Graph\nNeural Networks (GNNs) to individual client needs, accommodating diverse data\ndistributions. However, applying hypernetworks in FL, while aiming to\nfacilitate model personalization, often encounters challenges due to inadequate\nrepresentation of client-specific characteristics. To overcome these\nlimitations, we propose a model called FedSheafHN, using enhanced collaboration\ngraph embedding and efficient personalized model parameter generation.\nSpecifically, our model embeds each client's local subgraph into a\nserver-constructed collaboration graph. We utilize sheaf diffusion in the\ncollaboration graph to learn client representations. Our model improves the\nintegration and interpretation of complex client characteristics. Furthermore,\nour model ensures the generation of personalized models through advanced\nhypernetworks optimized for parallel operations across clients. Empirical\nevaluations demonstrate that FedSheafHN outperforms existing methods in most\nscenarios, in terms of client model performance on various graph-structured\ndatasets. It also has fast model convergence and effective new clients\ngeneralization.\n","authors":["Wenfei Liang","Yanan Zhao","Rui She","Yiming Li","Wee Peng Tay"],"pdf_url":"https://arxiv.org/pdf/2405.16056v3.pdf","comment":"This paper was submitted to ICML 2024 in Feb 2024. You can find a\n  record\n  here:https://github.com/CarrieWFF/ICML-2024-submission-recording/blob/main/Screenshot%20of%20FedSheafHN%20submission%20to%20ICML%202024.png"},{"id":"http://arxiv.org/abs/2405.19542v2","updated":"2024-05-31T11:31:12Z","published":"2024-05-29T22:04:40Z","title":"Anatomical Region Recognition and Real-time Bone Tracking Methods by\n  Dynamically Decoding A-Mode Ultrasound Signals","summary":"  Accurate bone tracking is crucial for kinematic analysis in orthopedic\nsurgery and prosthetic robotics. Traditional methods (e.g., skin markers) are\nsubject to soft tissue artifacts, and the bone pins used in surgery introduce\nthe risk of additional trauma and infection. For electromyography (EMG), its\ninability to directly measure joint angles requires complex algorithms for\nkinematic estimation. To address these issues, A-mode ultrasound-based tracking\nhas been proposed as a non-invasive and safe alternative. However, this\napproach suffers from limited accuracy in peak detection when processing\nreceived ultrasound signals. To build a precise and real-time bone tracking\napproach, this paper introduces a deep learning-based method for anatomical\nregion recognition and bone tracking using A-mode ultrasound signals,\nspecifically focused on the knee joint. The algorithm is capable of\nsimultaneously performing bone tracking and identifying the anatomical region\nwhere the A-mode ultrasound transducer is placed. It contains the fully\nconnection between all encoding and decoding layers of the cascaded U-Nets to\nfocus only on the signal region that is most likely to have the bone peak, thus\npinpointing the exact location of the peak and classifying the anatomical\nregion of the signal. The experiment showed a 97% accuracy in the\nclassification of the anatomical regions and a precision of around 0.5$\\pm$1mm\nunder dynamic tracking conditions for various anatomical areas surrounding the\nknee joint. In general, this approach shows great potential beyond the\ntraditional method, in terms of the accuracy achieved and the recognition of\nthe anatomical region where the ultrasound has been attached as an additional\nfunctionality.\n","authors":["Bangyu Lan","Stefano Stramigioli","Kenan Niu"],"pdf_url":"https://arxiv.org/pdf/2405.19542v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.02617v3","updated":"2024-05-31T11:30:31Z","published":"2023-06-05T06:31:14Z","title":"Permutation Decision Trees","summary":"  Decision Tree is a well understood Machine Learning model that is based on\nminimizing impurities in the internal nodes. The most common impurity measures\nare Shannon entropy and Gini impurity. These impurity measures are insensitive\nto the order of training data and hence the final tree obtained is invariant to\nany permutation of the data. This is a limitation in terms of modeling when\nthere are temporal order dependencies between data instances. In this research,\nwe propose the adoption of Effort-To-Compress (ETC) - a complexity measure, for\nthe first time, as an alternative impurity measure. Unlike Shannon entropy and\nGini impurity, structural impurity based on ETC is able to capture order\ndependencies in the data, thus obtaining potentially different decision trees\nfor different permutations of the same data instances, a concept we term as\nPermutation Decision Trees (PDT). We then introduce the notion of Permutation\nBagging achieved using permutation decision trees without the need for random\nfeature selection and sub-sampling. We conduct a performance comparison between\nPermutation Decision Trees and classical decision trees across various\nreal-world datasets, including Appendicitis, Breast Cancer Wisconsin, Diabetes\nPima Indian, Ionosphere, Iris, Sonar, and Wine. Our findings reveal that PDT\ndemonstrates comparable performance to classical decision trees across most\ndatasets. Remarkably, in certain instances, PDT even slightly surpasses the\nperformance of classical decision trees. In comparing Permutation Bagging with\nRandom Forest, we attain comparable performance to Random Forest models\nconsisting of 50 to 1000 trees, using merely 21 trees. This highlights the\nefficiency and effectiveness of Permutation Bagging in achieving comparable\nperformance outcomes with significantly fewer trees.\n","authors":["Harikrishnan N B","Arham Jain","Nithin Nagaraj"],"pdf_url":"https://arxiv.org/pdf/2306.02617v3.pdf","comment":"15 pages, 8 figures"},{"id":"http://arxiv.org/abs/2311.01906v2","updated":"2024-05-31T11:14:16Z","published":"2023-11-03T13:30:52Z","title":"Simplifying Transformer Blocks","summary":"  A simple design recipe for deep Transformers is to compose identical building\nblocks. But standard transformer blocks are far from simple, interweaving\nattention and MLP sub-blocks with skip connections & normalisation layers in\nprecise arrangements. This complexity leads to brittle architectures, where\nseemingly minor changes can significantly reduce training speed, or render\nmodels untrainable.\n  In this work, we ask to what extent the standard transformer block can be\nsimplified? Combining signal propagation theory and empirical observations, we\nmotivate modifications that allow many block components to be removed with no\nloss of training speed, including skip connections, projection or value\nparameters, sequential sub-blocks and normalisation layers. In experiments on\nboth autoregressive decoder-only and BERT encoder-only models, our simplified\ntransformers emulate the per-update training speed and performance of standard\ntransformers, while enjoying 15% faster training throughput, and using 15%\nfewer parameters.\n","authors":["Bobby He","Thomas Hofmann"],"pdf_url":"https://arxiv.org/pdf/2311.01906v2.pdf","comment":"ICLR 2024"},{"id":"http://arxiv.org/abs/2405.20748v1","updated":"2024-05-31T10:30:14Z","published":"2024-05-31T10:30:14Z","title":"OpenTensor: Reproducing Faster Matrix Multiplication Discovering\n  Algorithms","summary":"  OpenTensor is a reproduction of AlphaTensor, which discovered a new algorithm\nthat outperforms the state-of-the-art methods for matrix multiplication by Deep\nReinforcement Learning (DRL). While AlphaTensor provides a promising framework\nfor solving scientific problems, it is really hard to reproduce due to the\nmassive tricks and lack of source codes. In this paper, we clean up the\nalgorithm pipeline, clarify the technical details, and make some improvements\nto the training process. Computational results show that OpenTensor can\nsuccessfully find efficient matrix multiplication algorithms.\n","authors":["Yiwen Sun","Wenye Li"],"pdf_url":"https://arxiv.org/pdf/2405.20748v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.20743v1","updated":"2024-05-31T10:13:17Z","published":"2024-05-31T10:13:17Z","title":"Trajectory Forecasting through Low-Rank Adaptation of Discrete Latent\n  Codes","summary":"  Trajectory forecasting is crucial for video surveillance analytics, as it\nenables the anticipation of future movements for a set of agents, e.g.\nbasketball players engaged in intricate interactions with long-term intentions.\nDeep generative models offer a natural learning approach for trajectory\nforecasting, yet they encounter difficulties in achieving an optimal balance\nbetween sampling fidelity and diversity. We address this challenge by\nleveraging Vector Quantized Variational Autoencoders (VQ-VAEs), which utilize a\ndiscrete latent space to tackle the issue of posterior collapse. Specifically,\nwe introduce an instance-based codebook that allows tailored latent\nrepresentations for each example. In a nutshell, the rows of the codebook are\ndynamically adjusted to reflect contextual information (i.e., past motion\npatterns extracted from the observed trajectories). In this way, the\ndiscretization process gains flexibility, leading to improved reconstructions.\nNotably, instance-level dynamics are injected into the codebook through\nlow-rank updates, which restrict the customization of the codebook to a lower\ndimension space. The resulting discrete space serves as the basis of the\nsubsequent step, which regards the training of a diffusion-based predictive\nmodel. We show that such a two-fold framework, augmented with instance-level\ndiscretization, leads to accurate and diverse forecasts, yielding\nstate-of-the-art performance on three established benchmarks.\n","authors":["Riccardo Benaglia","Angelo Porrello","Pietro Buzzega","Simone Calderara","Rita Cucchiara"],"pdf_url":"https://arxiv.org/pdf/2405.20743v1.pdf","comment":"15 pages, 3 figures, 5 tables"},{"id":"http://arxiv.org/abs/2405.20738v1","updated":"2024-05-31T10:07:24Z","published":"2024-05-31T10:07:24Z","title":"Federated Random Forest for Partially Overlapping Clinical Data","summary":"  In the healthcare sector, a consciousness surrounding data privacy and\ncorresponding data protection regulations, as well as heterogeneous and\nnon-harmonized data, pose huge challenges to large-scale data analysis.\nMoreover, clinical data often involves partially overlapping features, as some\nobservations may be missing due to various reasons, such as differences in\nprocedures, diagnostic tests, or other recorded patient history information\nacross hospitals or institutes. To address the challenges posed by partially\noverlapping features and incomplete data in clinical datasets, a comprehensive\napproach is required. Particularly in the domain of medical data, promising\noutcomes are achieved by federated random forests whenever features align.\nHowever, for most standard algorithms, like random forest, it is essential that\nall data sets have identical parameters. Therefore, in this work the concept of\nfederated random forest is adapted to a setting with partially overlapping\nfeatures. Moreover, our research assesses the effectiveness of the newly\ndeveloped federated random forest models for partially overlapping clinical\ndata. For aggregating the federated, globally optimized model, only features\navailable locally at each site can be used. We tackled two issues in\nfederation: (i) the quantity of involved parties, (ii) the varying overlap of\nfeatures. This evaluation was conducted across three clinical datasets. The\nfederated random forest model even in cases where only a subset of features\noverlaps consistently demonstrates superior performance compared to its local\ncounterpart. This holds true across various scenarios, including datasets with\nimbalanced classes. Consequently, federated random forests for partially\noverlapped data offer a promising solution to transcend barriers in\ncollaborative research and corporate cooperation.\n","authors":["Youngjun Park","Cord Eric Schmidt","Benedikt Marcel Batton","Anne-Christin Hauschild"],"pdf_url":"https://arxiv.org/pdf/2405.20738v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.06958v3","updated":"2024-05-31T09:58:08Z","published":"2023-11-12T20:52:14Z","title":"Towards Climate Variable Prediction with Conditioned Spatio-Temporal\n  Normalizing Flows","summary":"  This study investigates how conditional normalizing flows can be applied to\nremote sensing data products in climate science for spatio-temporal prediction.\nThe method is chosen due to its desired properties such as exact likelihood\ncomputation, predictive uncertainty estimation and efficient inference and\nsampling which facilitates faster exploration of climate scenarios.\nExperimental findings reveal that the conditioned spatio-temporal flow\nsurpasses both deterministic and stochastic baselines in prolonged rollout\nscenarios. It exhibits stable extrapolation beyond the training time horizon\nfor extended rollout durations. These findings contribute valuable insights to\nthe field of spatio-temporal modeling, with potential applications spanning\ndiverse scientific disciplines.\n","authors":["Christina Winkler","David Rolnick"],"pdf_url":"https://arxiv.org/pdf/2311.06958v3.pdf","comment":"5 pages"},{"id":"http://arxiv.org/abs/2405.20731v1","updated":"2024-05-31T09:39:41Z","published":"2024-05-31T09:39:41Z","title":"Maximum Temperature Prediction Using Remote Sensing Data Via\n  Convolutional Neural Network","summary":"  Urban heat islands, defined as specific zones exhibiting substantially higher\ntemperatures than their immediate environs, pose significant threats to\nenvironmental sustainability and public health. This study introduces a novel\nmachine-learning model that amalgamates data from the Sentinel-3 satellite,\nmeteorological predictions, and additional remote sensing inputs. The primary\naim is to generate detailed spatiotemporal maps that forecast the peak\ntemperatures within a 24-hour period in Turin. Experimental results validate\nthe model's proficiency in predicting temperature patterns, achieving a Mean\nAbsolute Error (MAE) of 2.09 degrees Celsius for the year 2023 at a resolution\nof 20 meters per pixel, thereby enriching our knowledge of urban climatic\nbehavior. This investigation enhances the understanding of urban microclimates,\nemphasizing the importance of cross-disciplinary data integration, and laying\nthe groundwork for informed policy-making aimed at alleviating the negative\nimpacts of extreme urban temperatures.\n","authors":["Lorenzo Innocenti","Giacomo Blanco","Luca Barco","Claudio Rossi"],"pdf_url":"https://arxiv.org/pdf/2405.20731v1.pdf","comment":"4 pages, submitted to IEEE MetroLivEnv 2024 conference"},{"id":"http://arxiv.org/abs/2405.20091v2","updated":"2024-05-31T09:35:36Z","published":"2024-05-30T14:27:40Z","title":"Visual Attention Analysis in Online Learning","summary":"  In this paper, we present an approach in the Multimodal Learning Analytics\nfield. Within this approach, we have developed a tool to visualize and analyze\neye movement data collected during learning sessions in online courses. The\ntool is named VAAD (an acronym for Visual Attention Analysis Dashboard). These\neye movement data have been gathered using an eye-tracker and subsequently\nprocessed and visualized for interpretation. The purpose of the tool is to\nconduct a descriptive analysis of the data by facilitating its visualization,\nenabling the identification of differences and learning patterns among various\nlearner populations. Additionally, it integrates a predictive module capable of\nanticipating learner activities during a learning session. Consequently, VAAD\nholds the potential to offer valuable insights into online learning behaviors\nfrom both descriptive and predictive perspectives.\n","authors":["Miriam Navarro","Álvaro Becerra","Roberto Daza","Ruth Cobos","Aythami Morales","Julian Fierrez"],"pdf_url":"https://arxiv.org/pdf/2405.20091v2.pdf","comment":"Accepted in CEDI 2024 (VII Congreso Espa\\~nol de Inform\\'atica), A\n  Coru\\~na, Spain"},{"id":"http://arxiv.org/abs/2405.20724v1","updated":"2024-05-31T09:26:26Z","published":"2024-05-31T09:26:26Z","title":"Learning on Large Graphs using Intersecting Communities","summary":"  Message Passing Neural Networks (MPNNs) are a staple of graph machine\nlearning. MPNNs iteratively update each node's representation in an input graph\nby aggregating messages from the node's neighbors, which necessitates a memory\ncomplexity of the order of the number of graph edges. This complexity might\nquickly become prohibitive for large graphs provided they are not very sparse.\nIn this paper, we propose a novel approach to alleviate this problem by\napproximating the input graph as an intersecting community graph (ICG) -- a\ncombination of intersecting cliques. The key insight is that the number of\ncommunities required to approximate a graph does not depend on the graph size.\nWe develop a new constructive version of the Weak Graph Regularity Lemma to\nefficiently construct an approximating ICG for any input graph. We then devise\nan efficient graph learning algorithm operating directly on ICG in linear\nmemory and time with respect to the number of nodes (rather than edges). This\noffers a new and fundamentally different pipeline for learning on very large\nnon-sparse graphs, whose applicability is demonstrated empirically on node\nclassification tasks and spatio-temporal data processing.\n","authors":["Ben Finkelshtein","İsmail İlkan Ceylan","Michael Bronstein","Ron Levie"],"pdf_url":"https://arxiv.org/pdf/2405.20724v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.10843v2","updated":"2024-05-31T09:20:24Z","published":"2023-06-19T10:45:10Z","title":"Female mosquito detection by means of AI techniques inside release\n  containers in the context of a Sterile Insect Technique program","summary":"  The Sterile Insect Technique (SIT) is a biological pest control technique\nbased on the release into the environment of sterile males of the insect\nspecies whose population is to be controlled. The entire SIT process involves\nmass-rearing within a biofactory, sorting of the specimens by sex,\nsterilization, and subsequent release of the sterile males into the\nenvironment. The reason for avoiding the release of female specimens is\nbecause, unlike males, females bite, with the subsequent risk of disease\ntransmission. In the case of Aedes mosquito biofactories for SIT, the key point\nof the whole process is sex separation. This process is nowadays performed by a\ncombination of mechanical devices and AI-based vision systems. However, there\nis still a possibility of false negatives, so a last stage of verification is\nnecessary before releasing them into the environment. It is known that the\nsound produced by the flapping of adult male mosquitoes is different from that\nproduced by females, so this feature can be used to detect the presence of\nfemales in containers prior to environmental release. This paper presents a\nstudy for the detection of females in Aedes mosquito release vessels for SIT\nprograms. The containers used consist of PVC a tubular design of 8.8cm diameter\nand 12.5cm height. The containers were placed in an experimental setup that\nallowed the recording of the sound of mosquito flight inside of them. Each\ncontainer was filled with 250 specimens considering the cases of (i) only male\nmosquitoes, (ii) only female mosquitoes, and (iii) 75% males and 25% females.\nCase (i) was used for training and testing, whereas cases (ii) and (iii) were\nused only for testing. Two algorithms were implemented for the detection of\nfemale mosquitoes: an unsupervised outlier detection algorithm (iForest) and a\none-class SVM trained with male-only recordings.\n","authors":["Javier Naranjo-Alcazar","Jordi Grau-Haro","David Almenar","Pedro Zuccarello"],"pdf_url":"https://arxiv.org/pdf/2306.10843v2.pdf","comment":"Accepted EUSIPCO 2024"},{"id":"http://arxiv.org/abs/2405.20717v1","updated":"2024-05-31T09:14:36Z","published":"2024-05-31T09:14:36Z","title":"Cyclic image generation using chaotic dynamics","summary":"  Successive image generation using cyclic transformations is demonstrated by\nextending the CycleGAN model to transform images among three different\ncategories. Repeated application of the trained generators produces sequences\nof images that transition among the different categories. The generated image\nsequences occupy a more limited region of the image space compared with the\noriginal training dataset. Quantitative evaluation using precision and recall\nmetrics indicates that the generated images have high quality but reduced\ndiversity relative to the training dataset. Such successive generation\nprocesses are characterized as chaotic dynamics in terms of dynamical system\ntheory. Positive Lyapunov exponents estimated from the generated trajectories\nconfirm the presence of chaotic dynamics, with the Lyapunov dimension of the\nattractor found to be comparable to the intrinsic dimension of the training\ndata manifold. The results suggest that chaotic dynamics in the image space\ndefined by the deep generative model contribute to the diversity of the\ngenerated images, constituting a novel approach for multi-class image\ngeneration. This model can be interpreted as an extension of classical\nassociative memory to perform hetero-association among image categories.\n","authors":["Takaya Tanaka","Yutaka Yamaguti"],"pdf_url":"https://arxiv.org/pdf/2405.20717v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.19967v2","updated":"2024-05-31T08:54:24Z","published":"2024-05-30T11:46:42Z","title":"Improved Out-of-Scope Intent Classification with Dual Encoding and\n  Threshold-based Re-Classification","summary":"  Detecting out-of-scope user utterances is essential for task-oriented\ndialogues and intent classification. Current methodologies face difficulties\nwith the unpredictable distribution of outliers and often rely on assumptions\nabout data distributions. We present the Dual Encoder for Threshold-Based\nRe-Classification (DETER) to address these challenges. This end-to-end\nframework efficiently detects out-of-scope intents without requiring\nassumptions on data distributions or additional post-processing steps. The core\nof DETER utilizes dual text encoders, the Universal Sentence Encoder (USE) and\nthe Transformer-based Denoising AutoEncoder (TSDAE), to generate user utterance\nembeddings, which are classified through a branched neural architecture.\nFurther, DETER generates synthetic outliers using self-supervision and\nincorporates out-of-scope phrases from open-domain datasets. This approach\nensures a comprehensive training set for out-of-scope detection. Additionally,\na threshold-based re-classification mechanism refines the model's initial\npredictions. Evaluations on the CLINC-150, Stackoverflow, and Banking77\ndatasets demonstrate DETER's efficacy. Our model outperforms previous\nbenchmarks, increasing up to 13% and 5% in F1 score for known and unknown\nintents on CLINC-150 and Stackoverflow, and 16% for known and 24% % for unknown\nintents on Banking77. The source code has been released at\nhttps://github.com/Hossam-Mohammed-tech/Intent_Classification_OOS.\n","authors":["Hossam M. Zawbaa","Wael Rashwan","Sourav Dutta","Haytham Assem"],"pdf_url":"https://arxiv.org/pdf/2405.19967v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.20052v2","updated":"2024-05-31T08:50:25Z","published":"2024-05-30T13:38:28Z","title":"Hardware-Efficient EMG Decoding for Next-Generation Hand Prostheses","summary":"  Advancements in neural engineering have enabled the development of Robotic\nProsthetic Hands (RPHs) aimed at restoring hand functionality. Current\ncommercial RPHs offer limited control through basic on/off commands. Recent\nprogresses in machine learning enable finger movement decoding with higher\ndegrees of freedom, yet the high computational complexity of such models limits\ntheir application in portable devices. Future RPH designs must balance\nportability, low power consumption, and high decoding accuracy to be practical\nfor individuals with disabilities. To this end, we introduce a novel\nattractor-based neural network to realize on-chip movement decoding for\nnext-generation portable RPHs. The proposed architecture comprises an encoder,\nan attention layer, an attractor network, and a refinement regressor. We tested\nour model on four healthy subjects and achieved a decoding accuracy of 80.3%.\nOur proposed model is over 120 and 50 times more compact compared to\nstate-of-the-art LSTM and CNN models, respectively, with comparable (or\nsuperior) decoding accuracy. Therefore, it exhibits minimal hardware complexity\nand can be effectively integrated as a System-on-Chip.\n","authors":["Mohammad Kalbasi","MohammadAli Shaeri","Vincent Alexandre Mendez","Solaiman Shokur","Silvestro Micera","Mahsa Shoaran"],"pdf_url":"https://arxiv.org/pdf/2405.20052v2.pdf","comment":"\\{copyright} 2024 IEEE. Personal use of this material is permitted.\n  Permission from IEEE must be obtained for all other uses, in any current or\n  future media, including reprinting/republishing this material for advertising\n  or promotional purposes, creating new collective works, for resale or\n  redistribution to servers or lists, or reuse of any copyrighted component of\n  this work in other works"},{"id":"http://arxiv.org/abs/2103.03636v2","updated":"2024-05-31T08:50:18Z","published":"2021-03-05T12:44:22Z","title":"CoDeGAN: Contrastive Disentanglement for Generative Adversarial Network","summary":"  Disentanglement, a critical concern in interpretable machine learning, has\nalso garnered significant attention from the computer vision community. Many\nexisting GAN-based class disentanglement (unsupervised) approaches, such as\nInfoGAN and its variants, primarily aim to maximize the mutual information (MI)\nbetween the generated image and its latent codes. However, this focus may lead\nto a tendency for the network to generate highly similar images when presented\nwith the same latent class factor, potentially resulting in mode collapse or\nmode dropping. To alleviate this problem, we propose \\texttt{CoDeGAN}\n(Contrastive Disentanglement for Generative Adversarial Networks), where we\nrelax similarity constraints for disentanglement from the image domain to the\nfeature domain. This modification not only enhances the stability of GAN\ntraining but also improves their disentangling capabilities. Moreover, we\nintegrate self-supervised pre-training into CoDeGAN to learn semantic\nrepresentations, significantly facilitating unsupervised disentanglement.\nExtensive experimental results demonstrate the superiority of our method over\nstate-of-the-art approaches across multiple benchmarks. The code is available\nat https://github.com/learninginvision/CoDeGAN.\n","authors":["Jiangwei Zhao","Zejia Liu","Xiaohan Guo","Lili Pan"],"pdf_url":"https://arxiv.org/pdf/2103.03636v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.09636v2","updated":"2024-05-31T08:50:05Z","published":"2024-04-15T10:12:33Z","title":"All-in-one simulation-based inference","summary":"  Amortized Bayesian inference trains neural networks to solve stochastic\ninference problems using model simulations, thereby making it possible to\nrapidly perform Bayesian inference for any newly observed data. However,\ncurrent simulation-based amortized inference methods are simulation-hungry and\ninflexible: They require the specification of a fixed parametric prior,\nsimulator, and inference tasks ahead of time. Here, we present a new amortized\ninference method -- the Simformer -- which overcomes these limitations. By\ntraining a probabilistic diffusion model with transformer architectures, the\nSimformer outperforms current state-of-the-art amortized inference approaches\non benchmark tasks and is substantially more flexible: It can be applied to\nmodels with function-valued parameters, it can handle inference scenarios with\nmissing or unstructured data, and it can sample arbitrary conditionals of the\njoint distribution of parameters and data, including both posterior and\nlikelihood. We showcase the performance and flexibility of the Simformer on\nsimulators from ecology, epidemiology, and neuroscience, and demonstrate that\nit opens up new possibilities and application domains for amortized Bayesian\ninference on simulation-based models.\n","authors":["Manuel Gloeckler","Michael Deistler","Christian Weilbach","Frank Wood","Jakob H. Macke"],"pdf_url":"https://arxiv.org/pdf/2404.09636v2.pdf","comment":"To be published in the proceedings of the 41st International\n  Conference on Machine Learning (ICML 2024), Vienna, Austria. PMLR 235, 2024"},{"id":"http://arxiv.org/abs/2306.08595v3","updated":"2024-05-31T08:39:03Z","published":"2023-06-14T15:55:19Z","title":"TensorKrowch: Smooth integration of tensor networks in machine learning","summary":"  Tensor networks are factorizations of high-dimensional tensors into networks\nof smaller tensors. They have applications in physics and mathematics, and\nrecently have been proposed as promising machine learning architectures. To\nease the integration of tensor networks in machine learning pipelines, we\nintroduce TensorKrowch, an open source Python library built on top of PyTorch.\nProviding a user-friendly interface, TensorKrowch allows users to construct any\ntensor network, train it, and integrate it as a layer in more intricate deep\nlearning models. In this paper, we describe the main functionality and basic\nusage of TensorKrowch, and provide technical details on its building blocks and\nthe optimizations performed to achieve efficient operation.\n","authors":["José Ramón Pareja Monturiol","David Pérez-García","Alejandro Pozas-Kerstjens"],"pdf_url":"https://arxiv.org/pdf/2306.08595v3.pdf","comment":"20 pages, 2 figures. The TensorKrowch GitHub repository is in\n  https://github.com/joserapa98/tensorkrowch and the TensorKrowch documentation\n  is in https://joserapa98.github.io/tensorkrowch. V3: Accepted version,\n  corrected acknowledgments"},{"id":"http://arxiv.org/abs/2405.20692v1","updated":"2024-05-31T08:38:25Z","published":"2024-05-31T08:38:25Z","title":"In-Context Decision Transformer: Reinforcement Learning via Hierarchical\n  Chain-of-Thought","summary":"  In-context learning is a promising approach for offline reinforcement\nlearning (RL) to handle online tasks, which can be achieved by providing task\nprompts. Recent works demonstrated that in-context RL could emerge with\nself-improvement in a trial-and-error manner when treating RL tasks as an\nacross-episodic sequential prediction problem. Despite the self-improvement not\nrequiring gradient updates, current works still suffer from high computational\ncosts when the across-episodic sequence increases with task horizons. To this\nend, we propose an In-context Decision Transformer (IDT) to achieve\nself-improvement in a high-level trial-and-error manner. Specifically, IDT is\ninspired by the efficient hierarchical structure of human decision-making and\nthus reconstructs the sequence to consist of high-level decisions instead of\nlow-level actions that interact with environments. As one high-level decision\ncan guide multi-step low-level actions, IDT naturally avoids excessively long\nsequences and solves online tasks more efficiently. Experimental results show\nthat IDT achieves state-of-the-art in long-horizon tasks over current\nin-context RL methods. In particular, the online evaluation time of our IDT is\n\\textbf{36$\\times$} times faster than baselines in the D4RL benchmark and\n\\textbf{27$\\times$} times faster in the Grid World benchmark.\n","authors":["Sili Huang","Jifeng Hu","Hechang Chen","Lichao Sun","Bo Yang"],"pdf_url":"https://arxiv.org/pdf/2405.20692v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.20690v1","updated":"2024-05-31T08:35:56Z","published":"2024-05-31T08:35:56Z","title":"Unleashing the Potential of Diffusion Models for Incomplete Data\n  Imputation","summary":"  This paper introduces DiffPuter, an iterative method for missing data\nimputation that leverages the Expectation-Maximization (EM) algorithm and\nDiffusion Models. By treating missing data as hidden variables that can be\nupdated during model training, we frame the missing data imputation task as an\nEM problem. During the M-step, DiffPuter employs a diffusion model to learn the\njoint distribution of both the observed and currently estimated missing data.\nIn the E-step, DiffPuter re-estimates the missing data based on the conditional\nprobability given the observed data, utilizing the diffusion model learned in\nthe M-step. Starting with an initial imputation, DiffPuter alternates between\nthe M-step and E-step until convergence. Through this iterative process,\nDiffPuter progressively refines the complete data distribution, yielding\nincreasingly accurate estimations of the missing data. Our theoretical analysis\ndemonstrates that the unconditional training and conditional sampling processes\nof the diffusion model align precisely with the objectives of the M-step and\nE-step, respectively. Empirical evaluations across 10 diverse datasets and\ncomparisons with 16 different imputation methods highlight DiffPuter's superior\nperformance. Notably, DiffPuter achieves an average improvement of 8.10% in MAE\nand 5.64% in RMSE compared to the most competitive existing method.\n","authors":["Hengrui Zhang","Liancheng Fang","Philip S. Yu"],"pdf_url":"https://arxiv.org/pdf/2405.20690v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.20687v1","updated":"2024-05-31T08:31:26Z","published":"2024-05-31T08:31:26Z","title":"Conditioning GAN Without Training Dataset","summary":"  Deep learning algorithms have a large number of trainable parameters often\nwith sizes of hundreds of thousands or more. Training this algorithm requires a\nlarge amount of training data and generating a sufficiently large dataset for\nthese algorithms is costly\\cite{noguchi2019image}.\n  GANs are generative neural networks that use two deep learning networks that\nare competing with each other. The networks are generator and discriminator\nnetworks. The generator tries to generate realistic images which resemble the\nactual training dataset by approximating the training data distribution and the\ndiscriminator is trained to classify images as real or\nfake(generated)\\cite{goodfellow2016nips}. Training these GAN algorithms also\nrequires a large amount of training dataset\\cite{noguchi2019image}.\n  In this study, the aim is to address the question, \"Given an unconditioned\npretrained generator network and a pretrained classifier, is it feasible to\ndevelop a conditioned generator without relying on any training dataset?\"\n  The paper begins with a general introduction to the problem. The subsequent\nsections are structured as follows: Section 2 provides background information\non the problem. Section 3 reviews relevant literature on the topic. Section 4\noutlines the methodology employed in this study. Section 5 presents the\nexperimental results. Section 6 discusses the findings and proposes potential\nfuture research directions. Finally, Section 7 offers concluding remarks.\n  The implementation can be accessed\n\\href{https://github.com/kidist-amde/BigGAN-PyTorch}{here}.\n","authors":["Kidist Amde Mekonnen"],"pdf_url":"https://arxiv.org/pdf/2405.20687v1.pdf","comment":"5 pages, 2 figures, Part of my MSc project course, School Project\n  Course 2022"},{"id":"http://arxiv.org/abs/2405.19383v2","updated":"2024-05-31T08:29:26Z","published":"2024-05-29T08:48:52Z","title":"Network Analytics for Anti-Money Laundering -- A Systematic Literature\n  Review and Experimental Evaluation","summary":"  Money laundering presents a pervasive challenge, burdening society by\nfinancing illegal activities. To more effectively combat and detect money\nlaundering, the use of network information is increasingly being explored,\nexploiting that money laundering necessarily involves interconnected parties.\nThis has lead to a surge in literature on network analytics (NA) for anti-money\nlaundering (AML). The literature, however, is fragmented and a comprehensive\noverview of existing work is missing. This results in limited understanding of\nthe methods that may be applied and their comparative detection power.\nTherefore, this paper presents an extensive and systematic review of the\nliterature. We identify and analyse 97 papers in the Web of Science and Scopus\ndatabases, resulting in a taxonomy of approaches following the fraud analytics\nframework of Bockel-Rickermann et al.. Moreover, this paper presents a\ncomprehensive experimental framework to evaluate and compare the performance of\nprominent NA methods in a uniform setup. The framework is applied on the\npublicly available Elliptic data set and implements manual feature engineering,\nrandom walk-based methods, and deep learning GNNs. We conclude from the results\nthat network analytics increases the predictive power of the AML model with\ngraph neural networks giving the best results. An open source implementation of\nthe experimental framework is provided to facilitate researchers and\npractitioners to extend upon these results and experiment on proprietary data.\nAs such, we aim to promote a standardised approach towards the analysis and\nevaluation of network analytics for AML.\n","authors":["Bruno Deprez","Toon Vanderschueren","Bart Baesens","Tim Verdonck","Wouter Verbeke"],"pdf_url":"https://arxiv.org/pdf/2405.19383v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.20685v1","updated":"2024-05-31T08:26:53Z","published":"2024-05-31T08:26:53Z","title":"Enhancing Counterfactual Image Generation Using Mahalanobis Distance\n  with Distribution Preferences in Feature Space","summary":"  In the realm of Artificial Intelligence (AI), the importance of Explainable\nArtificial Intelligence (XAI) is increasingly recognized, particularly as AI\nmodels become more integral to our lives. One notable single-instance XAI\napproach is counterfactual explanation, which aids users in comprehending a\nmodel's decisions and offers guidance on altering these decisions. Specifically\nin the context of image classification models, effective image counterfactual\nexplanations can significantly enhance user understanding. This paper\nintroduces a novel method for computing feature importance within the feature\nspace of a black-box model. By employing information fusion techniques, our\nmethod maximizes the use of data to address feature counterfactual explanations\nin the feature space. Subsequently, we utilize an image generation model to\ntransform these feature counterfactual explanations into image counterfactual\nexplanations. Our experiments demonstrate that the counterfactual explanations\ngenerated by our method closely resemble the original images in both pixel and\nfeature spaces. Additionally, our method outperforms established baselines,\nachieving impressive experimental results.\n","authors":["Yukai Zhang","Ao Xu","Zihao Li","Tieru Wu"],"pdf_url":"https://arxiv.org/pdf/2405.20685v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.04928v2","updated":"2024-05-31T08:23:42Z","published":"2024-01-10T04:55:24Z","title":"Relaxed Contrastive Learning for Federated Learning","summary":"  We propose a novel contrastive learning framework to effectively address the\nchallenges of data heterogeneity in federated learning. We first analyze the\ninconsistency of gradient updates across clients during local training and\nestablish its dependence on the distribution of feature representations,\nleading to the derivation of the supervised contrastive learning (SCL)\nobjective to mitigate local deviations. In addition, we show that a na\\\"ive\nadoption of SCL in federated learning leads to representation collapse,\nresulting in slow convergence and limited performance gains. To address this\nissue, we introduce a relaxed contrastive learning loss that imposes a\ndivergence penalty on excessively similar sample pairs within each class. This\nstrategy prevents collapsed representations and enhances feature\ntransferability, facilitating collaborative training and leading to significant\nperformance improvements. Our framework outperforms all existing federated\nlearning approaches by huge margins on the standard benchmarks through\nextensive experimental results.\n","authors":["Seonguk Seo","Jinkyu Kim","Geeho Kim","Bohyung Han"],"pdf_url":"https://arxiv.org/pdf/2401.04928v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.20678v1","updated":"2024-05-31T08:21:11Z","published":"2024-05-31T08:21:11Z","title":"No-Regret Learning for Fair Multi-Agent Social Welfare Optimization","summary":"  We consider the problem of online multi-agent Nash social welfare (NSW)\nmaximization. While previous works of Hossain et al. [2021], Jones et al.\n[2023] study similar problems in stochastic multi-agent multi-armed bandits and\nshow that $\\sqrt{T}$-regret is possible after $T$ rounds, their fairness\nmeasure is the product of all agents' rewards, instead of their NSW (that is,\ntheir geometric mean). Given the fundamental role of NSW in the fairness\nliterature, it is more than natural to ask whether no-regret fair learning with\nNSW as the objective is possible. In this work, we provide a complete answer to\nthis question in various settings. Specifically, in stochastic $N$-agent\n$K$-armed bandits, we develop an algorithm with\n$\\widetilde{\\mathcal{O}}\\left(K^{\\frac{2}{N}}T^{\\frac{N-1}{N}}\\right)$ regret\nand prove that the dependence on $T$ is tight, making it a sharp contrast to\nthe $\\sqrt{T}$-regret bounds of Hossain et al. [2021], Jones et al. [2023]. We\nthen consider a more challenging version of the problem with adversarial\nrewards. Somewhat surprisingly, despite NSW being a concave function, we prove\nthat no algorithm can achieve sublinear regret. To circumvent such negative\nresults, we further consider a setting with full-information feedback and\ndesign two algorithms with $\\sqrt{T}$-regret: the first one has no dependence\non $N$ at all and is applicable to not just NSW but a broad class of welfare\nfunctions, while the second one has better dependence on $K$ and is preferable\nwhen $N$ is small. Finally, we also show that logarithmic regret is possible\nwhenever there exists one agent who is indifferent about different arms.\n","authors":["Mengxiao Zhang","Ramiro Deo-Campo Vuong","Haipeng Luo"],"pdf_url":"https://arxiv.org/pdf/2405.20678v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.20677v1","updated":"2024-05-31T08:21:09Z","published":"2024-05-31T08:21:09Z","title":"Provably Efficient Interactive-Grounded Learning with Personalized\n  Reward","summary":"  Interactive-Grounded Learning (IGL) [Xie et al., 2021] is a powerful\nframework in which a learner aims at maximizing unobservable rewards through\ninteracting with an environment and observing reward-dependent feedback on the\ntaken actions. To deal with personalized rewards that are ubiquitous in\napplications such as recommendation systems, Maghakian et al. [2022] study a\nversion of IGL with context-dependent feedback, but their algorithm does not\ncome with theoretical guarantees. In this work, we consider the same problem\nand provide the first provably efficient algorithms with sublinear regret under\nrealizability. Our analysis reveals that the step-function estimator of prior\nwork can deviate uncontrollably due to finite-sample effects. Our solution is a\nnovel Lipschitz reward estimator which underestimates the true reward and\nenjoys favorable generalization performances. Building on this estimator, we\npropose two algorithms, one based on explore-then-exploit and the other based\non inverse-gap weighting. We apply IGL to learning from image feedback and\nlearning from text feedback, which are reward-free settings that arise in\npractice. Experimental results showcase the importance of using our Lipschitz\nreward estimator and the overall effectiveness of our algorithms.\n","authors":["Mengxiao Zhang","Yuheng Zhang","Haipeng Luo","Paul Mineiro"],"pdf_url":"https://arxiv.org/pdf/2405.20677v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.20675v1","updated":"2024-05-31T08:19:44Z","published":"2024-05-31T08:19:44Z","title":"Adv-KD: Adversarial Knowledge Distillation for Faster Diffusion Sampling","summary":"  Diffusion Probabilistic Models (DPMs) have emerged as a powerful class of\ndeep generative models, achieving remarkable performance in image synthesis\ntasks. However, these models face challenges in terms of widespread adoption\ndue to their reliance on sequential denoising steps during sample generation.\nThis dependence leads to substantial computational requirements, making them\nunsuitable for resource-constrained or real-time processing systems. To address\nthese challenges, we propose a novel method that integrates denoising phases\ndirectly into the model's architecture, thereby reducing the need for\nresource-intensive computations. Our approach combines diffusion models with\ngenerative adversarial networks (GANs) through knowledge distillation, enabling\nmore efficient training and evaluation. By utilizing a pre-trained diffusion\nmodel as a teacher model, we train a student model through adversarial\nlearning, employing layerwise transformations for denoising and submodules for\npredicting the teacher model's output at various points in time. This\nintegration significantly reduces the number of parameters and denoising steps\nrequired, leading to improved sampling speed at test time. We validate our\nmethod with extensive experiments, demonstrating comparable performance with\nreduced computational requirements compared to existing approaches. By enabling\nthe deployment of diffusion models on resource-constrained devices, our\nresearch mitigates their computational burden and paves the way for wider\naccessibility and practical use across the research community and end-users.\n  Our code is publicly available at https://github.com/kidist-amde/Adv-KD\n","authors":["Kidist Amde Mekonnen","Nicola Dall'Asen","Paolo Rota"],"pdf_url":"https://arxiv.org/pdf/2405.20675v1.pdf","comment":"7 pages, 11 figures, ELLIS Doctoral Symposium 2023 in Helsinki,\n  Finland"},{"id":"http://arxiv.org/abs/2402.09050v2","updated":"2024-05-31T08:15:06Z","published":"2024-02-14T09:46:53Z","title":"End-to-End Training Induces Information Bottleneck through Layer-Role\n  Differentiation: A Comparative Analysis with Layer-wise Training","summary":"  End-to-end (E2E) training, optimizing the entire model through error\nbackpropagation, fundamentally supports the advancements of deep learning.\nDespite its high performance, E2E training faces the problems of memory\nconsumption, parallel computing, and discrepancy with the functionalities of\nthe actual brain. Various alternative methods have been proposed to overcome\nthese difficulties; however, no one can yet match the performance of E2E\ntraining, thereby falling short in practicality. Furthermore, there is no deep\nunderstanding regarding differences in the trained model properties beyond the\nperformance gap. In this paper, we reconsider why E2E training demonstrates a\nsuperior performance through a comparison with layer-wise training, a non-E2E\nmethod that locally sets errors. On the basis of the observation that E2E\ntraining has an advantage in propagating input information, we analyze the\ninformation plane dynamics of intermediate representations based on the\nHilbert-Schmidt independence criterion (HSIC). The results of our normalized\nHSIC value analysis reveal the E2E training ability to exhibit different\ninformation dynamics across layers, in addition to efficient information\npropagation. Furthermore, we show that this layer-role differentiation leads to\nthe final representation following the information bottleneck principle. It\nsuggests the need to consider the cooperative interactions between layers, not\njust the final layer when analyzing the information bottleneck of deep\nlearning.\n","authors":["Keitaro Sakamoto","Issei Sato"],"pdf_url":"https://arxiv.org/pdf/2402.09050v2.pdf","comment":"TMLR2024"},{"id":"http://arxiv.org/abs/2402.14991v2","updated":"2024-05-31T08:14:11Z","published":"2024-02-22T22:03:16Z","title":"Quantum Theory and Application of Contextual Optimal Transport","summary":"  Optimal Transport (OT) has fueled machine learning (ML) across many domains.\nWhen paired data measurements $(\\boldsymbol{\\mu}, \\boldsymbol{\\nu})$ are\ncoupled to covariates, a challenging conditional distribution learning setting\narises. Existing approaches for learning a $\\textit{global}$ transport map\nparameterized through a potentially unseen context utilize Neural OT and\nlargely rely on Brenier's theorem. Here, we propose a first-of-its-kind quantum\ncomputing formulation for amortized optimization of contextualized\ntransportation plans. We exploit a direct link between doubly stochastic\nmatrices and unitary operators thus unravelling a natural connection between OT\nand quantum computation. We verify our method (QontOT) on synthetic and real\ndata by predicting variations in cell type distributions conditioned on drug\ndosage. Importantly we conduct a 24-qubit hardware experiment on a task\nchallenging for classical computers and report a performance that cannot be\nmatched with our classical neural OT approach. In sum, this is a first step\ntoward learning to predict contextualized transportation plans through quantum\ncomputing.\n","authors":["Nicola Mariella","Albert Akhriev","Francesco Tacchino","Christa Zoufal","Juan Carlos Gonzalez-Espitia","Benedek Harsanyi","Eugene Koskin","Ivano Tavernelli","Stefan Woerner","Marianna Rapsomaniki","Sergiy Zhuk","Jannis Born"],"pdf_url":"https://arxiv.org/pdf/2402.14991v2.pdf","comment":"ICML 2024"},{"id":"http://arxiv.org/abs/2405.20671v1","updated":"2024-05-31T08:13:35Z","published":"2024-05-31T08:13:35Z","title":"Position Coupling: Leveraging Task Structure for Improved Length\n  Generalization of Transformers","summary":"  Even for simple arithmetic tasks like integer addition, it is challenging for\nTransformers to generalize to longer sequences than those encountered during\ntraining. To tackle this problem, we propose position coupling, a simple yet\neffective method that directly embeds the structure of the tasks into the\npositional encoding of a (decoder-only) Transformer. Taking a departure from\nthe vanilla absolute position mechanism assigning unique position IDs to each\nof the tokens, we assign the same position IDs to two or more \"relevant\"\ntokens; for integer addition tasks, we regard digits of the same significance\nas in the same position. On the empirical side, we show that with the proposed\nposition coupling, a small (1-layer) Transformer trained on 1 to 30-digit\nadditions can generalize up to 200-digit additions (6.67x of the trained\nlength). On the theoretical side, we prove that a 1-layer Transformer with\ncoupled positions can solve the addition task involving exponentially many\ndigits, whereas any 1-layer Transformer without positional information cannot\nentirely solve it. We also demonstrate that position coupling can be applied to\nother algorithmic tasks such as addition with multiple summands, Nx2\nmultiplication, copy/reverse, and a two-dimensional task.\n","authors":["Hanseul Cho","Jaeyoung Cha","Pranjal Awasthi","Srinadh Bhojanapalli","Anupam Gupta","Chulhee Yun"],"pdf_url":"https://arxiv.org/pdf/2405.20671v1.pdf","comment":"73 pages, 20 figures, 90 tables"},{"id":"http://arxiv.org/abs/2405.19732v2","updated":"2024-05-31T08:13:34Z","published":"2024-05-30T06:24:14Z","title":"Two Optimizers Are Better Than One: LLM Catalyst for Enhancing\n  Gradient-Based Optimization","summary":"  Learning a skill generally relies on both practical experience by doer and\ninsightful high-level guidance by instructor. Will this strategy also work well\nfor solving complex non-convex optimization problems? Here, a common\ngradient-based optimizer acts like a disciplined doer, making locally optimal\nupdate at each step. Recent methods utilize large language models (LLMs) to\noptimize solutions for concrete problems by inferring from natural language\ninstructions, akin to a high-level instructor. In this paper, we show that\nthese two optimizers are complementary to each other, suggesting a\ncollaborative optimization approach. The gradient-based optimizer and LLM-based\noptimizer are combined in an interleaved manner. We instruct LLMs using task\ndescriptions and timely optimization trajectories recorded during\ngradient-based optimization. Inferred results from LLMs are used as restarting\npoints for the next stage of gradient optimization. By leveraging both the\nlocally rigorous gradient-based optimizer and the high-level deductive\nLLM-based optimizer, our combined optimization method consistently yields\nimprovements over competitive baseline prompt tuning methods. Our results\ndemonstrate the synergistic effect of conventional gradient-based optimization\nand the inference ability of LLMs. The code is released at\nhttps://github.com/guozix/LLM-catalyst.\n","authors":["Zixian Guo","Ming Liu","Zhilong Ji","Jinfeng Bai","Yiwen Guo","Wangmeng Zuo"],"pdf_url":"https://arxiv.org/pdf/2405.19732v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2002.01605v2","updated":"2024-05-31T08:11:57Z","published":"2020-02-05T02:06:56Z","title":"Exploratory Machine Learning with Unknown Unknowns","summary":"  In conventional supervised learning, a training dataset is given with\nground-truth labels from a known label set, and the learned model will classify\nunseen instances to known labels. This paper studies a new problem setting in\nwhich there are unknown classes in the training data misperceived as other\nlabels, and thus their existence appears unknown from the given supervision. We\nattribute the unknown unknowns to the fact that the training dataset is badly\nadvised by the incompletely perceived label space due to the insufficient\nfeature information. To this end, we propose the exploratory machine learning,\nwhich examines and investigates training data by actively augmenting the\nfeature space to discover potentially hidden classes. Our method consists of\nthree ingredients including rejection model, feature exploration, and model\ncascade. We provide theoretical analysis to justify its superiority, and\nvalidate the effectiveness on both synthetic and real datasets.\n","authors":["Peng Zhao","Jia-Wei Shan","Yu-Jie Zhang","Zhi-Hua Zhou"],"pdf_url":"https://arxiv.org/pdf/2002.01605v2.pdf","comment":"published at Artificial Intelligence, preliminary conference version\n  published at AAAI'21"},{"id":"http://arxiv.org/abs/2405.20668v1","updated":"2024-05-31T08:09:36Z","published":"2024-05-31T08:09:36Z","title":"Improving Paratope and Epitope Prediction by Multi-Modal Contrastive\n  Learning and Interaction Informativeness Estimation","summary":"  Accurately predicting antibody-antigen binding residues, i.e., paratopes and\nepitopes, is crucial in antibody design. However, existing methods solely focus\non uni-modal data (either sequence or structure), disregarding the\ncomplementary information present in multi-modal data, and most methods predict\nparatopes and epitopes separately, overlooking their specific spatial\ninteractions. In this paper, we propose a novel Multi-modal contrastive\nlearning and Interaction informativeness estimation-based method for Paratope\nand Epitope prediction, named MIPE, by using both sequence and structure data\nof antibodies and antigens. MIPE implements a multi-modal contrastive learning\nstrategy, which maximizes representations of binding and non-binding residues\nwithin each modality and meanwhile aligns uni-modal representations towards\neffective modal representations. To exploit the spatial interaction\ninformation, MIPE also incorporates an interaction informativeness estimation\nthat computes the estimated interaction matrices between antibodies and\nantigens, thereby approximating them to the actual ones. Extensive experiments\ndemonstrate the superiority of our method compared to baselines. Additionally,\nthe ablation studies and visualizations demonstrate the superiority of MIPE\nowing to the better representations acquired through multi-modal contrastive\nlearning and the interaction patterns comprehended by the interaction\ninformativeness estimation.\n","authors":["Zhiwei Wang","Yongkang Wang","Wen Zhang"],"pdf_url":"https://arxiv.org/pdf/2405.20668v1.pdf","comment":"This paper is accepted by IJCAI 2024"},{"id":"http://arxiv.org/abs/2405.20664v1","updated":"2024-05-31T08:03:52Z","published":"2024-05-31T08:03:52Z","title":"Weak Robust Compatibility Between Learning Algorithms and Counterfactual\n  Explanation Generation Algorithms","summary":"  Counterfactual explanation generation is a powerful method for Explainable\nArtificial Intelligence. It can help users understand why machine learning\nmodels make specific decisions, and how to change those decisions. Evaluating\nthe robustness of counterfactual explanation algorithms is therefore crucial.\nPrevious literature has widely studied the robustness based on the perturbation\nof input instances. However, the robustness defined from the perspective of\nperturbed instances is sometimes biased, because this definition ignores the\nimpact of learning algorithms on robustness. In this paper, we propose a more\nreasonable definition, Weak Robust Compatibility, based on the perspective of\nexplanation strength. In practice, we propose WRC-Test to help us generate more\nrobust counterfactuals. Meanwhile, we designed experiments to verify the\neffectiveness of WRC-Test. Theoretically, we introduce the concepts of PAC\nlearning theory and define the concept of PAC WRC-Approximability. Based on\nreasonable assumptions, we establish oracle inequalities about weak robustness,\nwhich gives a sufficient condition for PAC WRC-Approximability.\n","authors":["Ao Xu","Tieru Wu"],"pdf_url":"https://arxiv.org/pdf/2405.20664v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.14578v2","updated":"2024-05-31T08:01:56Z","published":"2024-05-23T13:52:36Z","title":"Surge Phenomenon in Optimal Learning Rate and Batch Size Scaling","summary":"  In current deep learning tasks, Adam style optimizers such as Adam, Adagrad,\nRMSProp, Adafactor, and Lion have been widely used as alternatives to SGD style\noptimizers. These optimizers typically update model parameters using the sign\nof gradients, resulting in more stable convergence curves. The learning rate\nand the batch size are the most critical hyperparameters for optimizers, which\nrequire careful tuning to enable effective convergence. Previous research has\nshown that the optimal learning rate increases linearly or follows similar\nrules with batch size for SGD style optimizers. However, this conclusion is not\napplicable to Adam style optimizers. In this paper, we elucidate the connection\nbetween optimal learning rates and batch sizes for Adam style optimizers\nthrough both theoretical analysis and extensive experiments. First, we raise\nthe scaling law between batch sizes and optimal learning rates in the sign of\ngradient case, in which we prove that the optimal learning rate first rises and\nthen falls as the batch size increases. Moreover, the peak value of the surge\nwill gradually move toward the larger batch size as training progresses.\nSecond, we conducted experiments on various CV and NLP tasks and verified the\ncorrectness of the scaling law.\n","authors":["Shuaipeng Li","Penghao Zhao","Hailin Zhang","Xingwu Sun","Hao Wu","Dian Jiao","Weiyan Wang","Chengjun Liu","Zheng Fang","Jinbao Xue","Yangyu Tao","Bin Cui","Di Wang"],"pdf_url":"https://arxiv.org/pdf/2405.14578v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.19059v2","updated":"2024-05-31T07:45:53Z","published":"2024-05-29T13:00:10Z","title":"Robust Entropy Search for Safe Efficient Bayesian Optimization","summary":"  The practical use of Bayesian Optimization (BO) in engineering applications\nimposes special requirements: high sampling efficiency on the one hand and\nfinding a robust solution on the other hand. We address the case of adversarial\nrobustness, where all parameters are controllable during the optimization\nprocess, but a subset of them is uncontrollable or even adversely perturbed at\nthe time of application. To this end, we develop an efficient information-based\nacquisition function that we call Robust Entropy Search (RES). We empirically\ndemonstrate its benefits in experiments on synthetic and real-life data. The\nresults showthat RES reliably finds robust optima, outperforming\nstate-of-the-art algorithms.\n","authors":["Dorina Weichert","Alexander Kister","Sebastian Houben","Patrick Link","Gunar Ernis"],"pdf_url":"https://arxiv.org/pdf/2405.19059v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2110.10927v4","updated":"2024-05-31T07:39:50Z","published":"2021-10-21T06:49:10Z","title":"SecureBoost+ : A High Performance Gradient Boosting Tree Framework for\n  Large Scale Vertical Federated Learning","summary":"  Gradient boosting decision tree (GBDT) is a widely used ensemble algorithm in\nthe industry. Its vertical federated learning version, SecureBoost, is one of\nthe most popular algorithms used in cross-silo privacy-preserving modeling. As\nthe area of privacy computation thrives in recent years, demands for\nlarge-scale and high-performance federated learning have grown dramatically in\nreal-world applications. In this paper, to fulfill these requirements, we\npropose SecureBoost+ that is both novel and improved from the prior work\nSecureBoost. SecureBoost+ integrates several ciphertext calculation\noptimizations and engineering optimizations. The experimental results\ndemonstrate that Secureboost+ has significant performance improvements on large\nand high dimensional data sets compared to SecureBoost. It makes effective and\nefficient large-scale vertical federated learning possible.\n","authors":["Weijing Chen","Guoqiang Ma","Tao Fan","Yan Kang","Qian Xu","Qiang Yang"],"pdf_url":"https://arxiv.org/pdf/2110.10927v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.20652v1","updated":"2024-05-31T07:39:22Z","published":"2024-05-31T07:39:22Z","title":"Sign is Not a Remedy: Multiset-to-Multiset Message Passing for Learning\n  on Heterophilic Graphs","summary":"  Graph Neural Networks (GNNs) have gained significant attention as a powerful\nmodeling and inference method, especially for homophilic graph-structured data.\nTo empower GNNs in heterophilic graphs, where adjacent nodes exhibit dissimilar\nlabels or features, Signed Message Passing (SMP) has been widely adopted.\nHowever, there is a lack of theoretical and empirical analysis regarding the\nlimitations of SMP. In this work, we unveil some potential pitfalls of SMP and\ntheir remedies. We first identify two limitations of SMP: undesirable\nrepresentation update for multi-hop neighbors and vulnerability against\noversmoothing issues. To overcome these challenges, we propose a novel message\npassing function called Multiset to Multiset GNN(M2M-GNN). Our theoretical\nanalyses and extensive experiments demonstrate that M2M-GNN effectively\nalleviates the aforementioned limitations of SMP, yielding superior performance\nin comparison\n","authors":["Langzhang Liang","Sunwoo Kim","Kijung Shin","Zenglin Xu","Shirui Pan","Yuan Qi"],"pdf_url":"https://arxiv.org/pdf/2405.20652v1.pdf","comment":"Published as a conference paper at ICML 2024"},{"id":"http://arxiv.org/abs/2405.20649v1","updated":"2024-05-31T07:30:34Z","published":"2024-05-31T07:30:34Z","title":"Reward-based Input Construction for Cross-document Relation Extraction","summary":"  Relation extraction (RE) is a fundamental task in natural language\nprocessing, aiming to identify relations between target entities in text. While\nmany RE methods are designed for a single sentence or document, cross-document\nRE has emerged to address relations across multiple long documents. Given the\nnature of long documents in cross-document RE, extracting document embeddings\nis challenging due to the length constraints of pre-trained language models.\nTherefore, we propose REward-based Input Construction (REIC), the first\nlearning-based sentence selector for cross-document RE. REIC extracts sentences\nbased on relational evidence, enabling the RE module to effectively infer\nrelations. Since supervision of evidence sentences is generally unavailable, we\ntrain REIC using reinforcement learning with RE prediction scores as rewards.\nExperimental results demonstrate the superiority of our method over heuristic\nmethods for different RE structures and backbones in cross-document RE. Our\ncode is publicly available at https://github.com/aailabkaist/REIC.\n","authors":["Byeonghu Na","Suhyeon Jo","Yeongmin Kim","Il-Chul Moon"],"pdf_url":"https://arxiv.org/pdf/2405.20649v1.pdf","comment":"Accepted at ACL 2024 main conference"},{"id":"http://arxiv.org/abs/2405.20648v1","updated":"2024-05-31T07:30:24Z","published":"2024-05-31T07:30:24Z","title":"Shotluck Holmes: A Family of Efficient Small-Scale Large Language Vision\n  Models For Video Captioning and Summarization","summary":"  Video is an increasingly prominent and information-dense medium, yet it poses\nsubstantial challenges for language models. A typical video consists of a\nsequence of shorter segments, or shots, that collectively form a coherent\nnarrative. Each shot is analogous to a word in a sentence where multiple data\nstreams of information (such as visual and auditory data) must be processed\nsimultaneously. Comprehension of the entire video requires not only\nunderstanding the visual-audio information of each shot but also requires that\nthe model links the ideas between each shot to generate a larger,\nall-encompassing story. Despite significant progress in the field, current\nworks often overlook videos' more granular shot-by-shot semantic information.\nIn this project, we propose a family of efficient large language vision models\n(LLVMs) to boost video summarization and captioning called Shotluck Holmes. By\nleveraging better pretraining and data collection strategies, we extend the\nabilities of existing small LLVMs from being able to understand a picture to\nbeing able to understand a sequence of frames. Specifically, we show that\nShotluck Holmes achieves better performance than state-of-the-art results on\nthe Shot2Story video captioning and summary task with significantly smaller and\nmore computationally efficient models.\n","authors":["Richard Luo","Austin Peng","Adithya Vasudev","Rishabh Jain"],"pdf_url":"https://arxiv.org/pdf/2405.20648v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.08970v3","updated":"2024-05-31T07:29:20Z","published":"2023-06-15T09:05:36Z","title":"An Efficient and Multi-private Key Secure Aggregation for Federated\n  Learning","summary":"  With the emergence of privacy leaks in federated learning, secure aggregation\nprotocols that mainly adopt either homomorphic encryption or threshold secret\nsharing have been widely developed for federated learning to protect the\nprivacy of the local training data of each client. However, these existing\nprotocols suffer from many shortcomings, such as the dependence on a trusted\nthird party, the vulnerability to clients being corrupted, low efficiency, the\ntrade-off between security and fault tolerance, etc. To solve these\ndisadvantages, we propose an efficient and multi-private key secure aggregation\nscheme for federated learning. Specifically, we skillfully modify the variant\nElGamal encryption technique to achieve homomorphic addition operation, which\nhas two important advantages: 1) The server and each client can freely select\npublic and private keys without introducing a trust third party and 2) Compared\nto the variant ElGamal encryption, the plaintext space is relatively large,\nwhich is more suitable for the deep model. Besides, for the high dimensional\ndeep model parameter, we introduce a super-increasing sequence to compress\nmulti-dimensional data into 1-D, which can greatly reduce encryption and\ndecryption times as well as communication for ciphertext transmission. Detailed\nsecurity analyses show that our proposed scheme achieves the semantic security\nof both individual local gradients and the aggregated result while achieving\noptimal robustness in tolerating both client collusion and dropped clients.\nExtensive simulations demonstrate that the accuracy of our scheme is almost the\nsame as the non-private approach, while the efficiency of our scheme is much\nbetter than the state-of-the-art homomorphic encryption-based secure\naggregation schemes. More importantly, the efficiency advantages of our scheme\nwill become increasingly prominent as the number of model parameters increases.\n","authors":["Xue Yang","Zifeng Liu","Xiaohu Tang","Rongxing Lu","Bo Liu"],"pdf_url":"https://arxiv.org/pdf/2306.08970v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.12995v3","updated":"2024-05-31T07:28:40Z","published":"2024-03-05T13:35:41Z","title":"ESM All-Atom: Multi-scale Protein Language Model for Unified Molecular\n  Modeling","summary":"  Protein language models have demonstrated significant potential in the field\nof protein engineering. However, current protein language models primarily\noperate at the residue scale, which limits their ability to provide information\nat the atom level. This limitation prevents us from fully exploiting the\ncapabilities of protein language models for applications involving both\nproteins and small molecules. In this paper, we propose ESM-AA (ESM All-Atom),\na novel approach that enables atom-scale and residue-scale unified molecular\nmodeling. ESM-AA achieves this by pre-training on multi-scale code-switch\nprotein sequences and utilizing a multi-scale position encoding to capture\nrelationships among residues and atoms. Experimental results indicate that\nESM-AA surpasses previous methods in protein-molecule tasks, demonstrating the\nfull utilization of protein language models. Further investigations reveal that\nthrough unified molecular modeling, ESM-AA not only gains molecular knowledge\nbut also retains its understanding of proteins. The source codes of ESM-AA are\npublicly released at https://github.com/zhengkangjie/ESM-AA.\n","authors":["Kangjie Zheng","Siyu Long","Tianyu Lu","Junwei Yang","Xinyu Dai","Ming Zhang","Zaiqing Nie","Wei-Ying Ma","Hao Zhou"],"pdf_url":"https://arxiv.org/pdf/2403.12995v3.pdf","comment":"ICML2024 camera-ready, update some experimental results, add github\n  url"}],"Multimedia":[{"id":"http://arxiv.org/abs/2405.20078v2","updated":"2024-05-31T16:49:19Z","published":"2024-05-30T14:08:09Z","title":"NeRF View Synthesis: Subjective Quality Assessment and Objective Metrics\n  Evaluation","summary":"  Neural radiance fields (NeRF) are a groundbreaking computer vision technology\nthat enables the generation of high-quality, immersive visual content from\nmultiple viewpoints. This capability holds significant advantages for\napplications such as virtual/augmented reality, 3D modelling and content\ncreation for the film and entertainment industry. However, the evaluation of\nNeRF methods poses several challenges, including a lack of comprehensive\ndatasets, reliable assessment methodologies, and objective quality metrics.\nThis paper addresses the problem of NeRF quality assessment thoroughly, by\nconducting a rigorous subjective quality assessment test that considers several\nscene classes and recently proposed NeRF view synthesis methods. Additionally,\nthe performance of a wide range of state-of-the-art conventional and\nlearning-based full-reference 2D image and video quality assessment metrics is\nevaluated against the subjective scores of the subjective study. The\nexperimental results are analyzed in depth, providing a comparative evaluation\nof several NeRF methods and objective quality metrics, across different classes\nof visual scenes, including real and synthetic content for front-face and\n360-degree camera trajectories.\n","authors":["Pedro Martin","Antonio Rodrigues","Joao Ascenso","Maria Paula Queluz"],"pdf_url":"https://arxiv.org/pdf/2405.20078v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.20687v1","updated":"2024-05-31T08:31:26Z","published":"2024-05-31T08:31:26Z","title":"Conditioning GAN Without Training Dataset","summary":"  Deep learning algorithms have a large number of trainable parameters often\nwith sizes of hundreds of thousands or more. Training this algorithm requires a\nlarge amount of training data and generating a sufficiently large dataset for\nthese algorithms is costly\\cite{noguchi2019image}.\n  GANs are generative neural networks that use two deep learning networks that\nare competing with each other. The networks are generator and discriminator\nnetworks. The generator tries to generate realistic images which resemble the\nactual training dataset by approximating the training data distribution and the\ndiscriminator is trained to classify images as real or\nfake(generated)\\cite{goodfellow2016nips}. Training these GAN algorithms also\nrequires a large amount of training dataset\\cite{noguchi2019image}.\n  In this study, the aim is to address the question, \"Given an unconditioned\npretrained generator network and a pretrained classifier, is it feasible to\ndevelop a conditioned generator without relying on any training dataset?\"\n  The paper begins with a general introduction to the problem. The subsequent\nsections are structured as follows: Section 2 provides background information\non the problem. Section 3 reviews relevant literature on the topic. Section 4\noutlines the methodology employed in this study. Section 5 presents the\nexperimental results. Section 6 discusses the findings and proposes potential\nfuture research directions. Finally, Section 7 offers concluding remarks.\n  The implementation can be accessed\n\\href{https://github.com/kidist-amde/BigGAN-PyTorch}{here}.\n","authors":["Kidist Amde Mekonnen"],"pdf_url":"https://arxiv.org/pdf/2405.20687v1.pdf","comment":"5 pages, 2 figures, Part of my MSc project course, School Project\n  Course 2022"},{"id":"http://arxiv.org/abs/2405.20675v1","updated":"2024-05-31T08:19:44Z","published":"2024-05-31T08:19:44Z","title":"Adv-KD: Adversarial Knowledge Distillation for Faster Diffusion Sampling","summary":"  Diffusion Probabilistic Models (DPMs) have emerged as a powerful class of\ndeep generative models, achieving remarkable performance in image synthesis\ntasks. However, these models face challenges in terms of widespread adoption\ndue to their reliance on sequential denoising steps during sample generation.\nThis dependence leads to substantial computational requirements, making them\nunsuitable for resource-constrained or real-time processing systems. To address\nthese challenges, we propose a novel method that integrates denoising phases\ndirectly into the model's architecture, thereby reducing the need for\nresource-intensive computations. Our approach combines diffusion models with\ngenerative adversarial networks (GANs) through knowledge distillation, enabling\nmore efficient training and evaluation. By utilizing a pre-trained diffusion\nmodel as a teacher model, we train a student model through adversarial\nlearning, employing layerwise transformations for denoising and submodules for\npredicting the teacher model's output at various points in time. This\nintegration significantly reduces the number of parameters and denoising steps\nrequired, leading to improved sampling speed at test time. We validate our\nmethod with extensive experiments, demonstrating comparable performance with\nreduced computational requirements compared to existing approaches. By enabling\nthe deployment of diffusion models on resource-constrained devices, our\nresearch mitigates their computational burden and paves the way for wider\naccessibility and practical use across the research community and end-users.\n  Our code is publicly available at https://github.com/kidist-amde/Adv-KD\n","authors":["Kidist Amde Mekonnen","Nicola Dall'Asen","Paolo Rota"],"pdf_url":"https://arxiv.org/pdf/2405.20675v1.pdf","comment":"7 pages, 11 figures, ELLIS Doctoral Symposium 2023 in Helsinki,\n  Finland"},{"id":"http://arxiv.org/abs/2405.20606v1","updated":"2024-05-31T03:40:15Z","published":"2024-05-31T03:40:15Z","title":"Vision-Language Meets the Skeleton: Progressively Distillation with\n  Cross-Modal Knowledge for 3D Action Representation Learning","summary":"  Supervised and self-supervised learning are two main training paradigms for\nskeleton-based human action recognition. However, the former one-hot\nclassification requires labor-intensive predefined action categories\nannotations, while the latter involves skeleton transformations (e.g.,\ncropping) in the pretext tasks that may impair the skeleton structure. To\naddress these challenges, we introduce a novel skeleton-based training\nframework (C$^2$VL) based on Cross-modal Contrastive learning that uses the\nprogressive distillation to learn task-agnostic human skeleton action\nrepresentation from the Vision-Language knowledge prompts. Specifically, we\nestablish the vision-language action concept space through vision-language\nknowledge prompts generated by pre-trained large multimodal models (LMMs),\nwhich enrich the fine-grained details that the skeleton action space lacks.\nMoreover, we propose the intra-modal self-similarity and inter-modal\ncross-consistency softened targets in the cross-modal contrastive process to\nprogressively control and guide the degree of pulling vision-language knowledge\nprompts and corresponding skeletons closer. These soft instance discrimination\nand self-knowledge distillation strategies contribute to the learning of better\nskeleton-based action representations from the noisy skeleton-vision-language\npairs. During the inference phase, our method requires only the skeleton data\nas the input for action recognition and no longer for vision-language prompts.\nExtensive experiments show that our method achieves state-of-the-art results on\nNTU RGB+D 60, NTU RGB+D 120, and PKU-MMD datasets. The code will be available\nin the future.\n","authors":["Yang Chen","Tian He","Junfeng Fu","Ling Wang","Jingcai Guo","Hong Cheng"],"pdf_url":"https://arxiv.org/pdf/2405.20606v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.00135v1","updated":"2024-05-31T18:55:10Z","published":"2024-05-31T18:55:10Z","title":"Advancing Ear Biometrics: Enhancing Accuracy and Robustness through Deep\n  Learning","summary":"  Biometric identification is a reliable method to verify individuals based on\ntheir unique physical or behavioral traits, offering a secure alternative to\ntraditional methods like passwords or PINs. This study focuses on ear biometric\nidentification, exploiting its distinctive features for enhanced accuracy,\nreliability, and usability. While past studies typically investigate face\nrecognition and fingerprint analysis, our research demonstrates the\neffectiveness of ear biometrics in overcoming limitations such as variations in\nfacial expressions and lighting conditions. We utilized two datasets: AMI (700\nimages from 100 individuals) and EarNV1.0 (28,412 images from 164 individuals).\nTo improve the accuracy and robustness of our ear biometric identification\nsystem, we applied various techniques including data preprocessing and\naugmentation. Our models achieved a testing accuracy of 99.35% on the AMI\nDataset and 98.1% on the EarNV1.0 dataset, showcasing the effectiveness of our\napproach in precisely identifying individuals based on ear biometric\ncharacteristics.\n","authors":["Youssef Mohamed","Zeyad Youssef","Ahmed Heakl","Ahmed Zaky"],"pdf_url":"https://arxiv.org/pdf/2406.00135v1.pdf","comment":"6 pages, 8 figures, 3 tables, International IEEE Conference on the\n  Intelligent Methods, Systems, and Applications"},{"id":"http://arxiv.org/abs/2406.00093v1","updated":"2024-05-31T17:59:56Z","published":"2024-05-31T17:59:56Z","title":"Bootstrap3D: Improving 3D Content Creation with Synthetic Data","summary":"  Recent years have witnessed remarkable progress in multi-view diffusion\nmodels for 3D content creation. However, there remains a significant gap in\nimage quality and prompt-following ability compared to 2D diffusion models. A\ncritical bottleneck is the scarcity of high-quality 3D assets with detailed\ncaptions. To address this challenge, we propose Bootstrap3D, a novel framework\nthat automatically generates an arbitrary quantity of multi-view images to\nassist in training multi-view diffusion models. Specifically, we introduce a\ndata generation pipeline that employs (1) 2D and video diffusion models to\ngenerate multi-view images based on constructed text prompts, and (2) our\nfine-tuned 3D-aware MV-LLaVA for filtering high-quality data and rewriting\ninaccurate captions. Leveraging this pipeline, we have generated 1 million\nhigh-quality synthetic multi-view images with dense descriptive captions to\naddress the shortage of high-quality 3D data. Furthermore, we present a\nTraining Timestep Reschedule (TTR) strategy that leverages the denoising\nprocess to learn multi-view consistency while maintaining the original 2D\ndiffusion prior. Extensive experiments demonstrate that Bootstrap3D can\ngenerate high-quality multi-view images with superior aesthetic quality,\nimage-text alignment, and maintained view consistency.\n","authors":["Zeyi Sun","Tong Wu","Pan Zhang","Yuhang Zang","Xiaoyi Dong","Yuanjun Xiong","Dahua Lin","Jiaqi Wang"],"pdf_url":"https://arxiv.org/pdf/2406.00093v1.pdf","comment":"Project Page: https://sunzey.github.io/Bootstrap3D/"}]},"2024-06-03T00:00:00Z":{"Computation and Language":[{"id":"http://arxiv.org/abs/2404.14387v2","updated":"2024-06-03T17:47:30Z","published":"2024-04-22T17:43:23Z","title":"A Survey on Self-Evolution of Large Language Models","summary":"  Large language models (LLMs) have significantly advanced in various fields\nand intelligent agent applications. However, current LLMs that learn from human\nor external model supervision are costly and may face performance ceilings as\ntask complexity and diversity increase. To address this issue, self-evolution\napproaches that enable LLM to autonomously acquire, refine, and learn from\nexperiences generated by the model itself are rapidly growing. This new\ntraining paradigm inspired by the human experiential learning process offers\nthe potential to scale LLMs towards superintelligence. In this work, we present\na comprehensive survey of self-evolution approaches in LLMs. We first propose a\nconceptual framework for self-evolution and outline the evolving process as\niterative cycles composed of four phases: experience acquisition, experience\nrefinement, updating, and evaluation. Second, we categorize the evolution\nobjectives of LLMs and LLM-based agents; then, we summarize the literature and\nprovide taxonomy and insights for each module. Lastly, we pinpoint existing\nchallenges and propose future directions to improve self-evolution frameworks,\nequipping researchers with critical insights to fast-track the development of\nself-evolving LLMs. Our corresponding GitHub repository is available at\nhttps://github.com/AlibabaResearch/DAMO-ConvAI/tree/main/Awesome-Self-Evolution-of-LLM\n","authors":["Zhengwei Tao","Ting-En Lin","Xiancai Chen","Hangyu Li","Yuchuan Wu","Yongbin Li","Zhi Jin","Fei Huang","Dacheng Tao","Jingren Zhou"],"pdf_url":"https://arxiv.org/pdf/2404.14387v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.09808v3","updated":"2024-06-03T17:43:23Z","published":"2023-11-16T11:32:47Z","title":"PixT3: Pixel-based Table-To-Text Generation","summary":"  Table-to-text generation involves generating appropriate textual descriptions\ngiven structured tabular data. It has attracted increasing attention in recent\nyears thanks to the popularity of neural network models and the availability of\nlarge-scale datasets. A common feature across existing methods is their\ntreatment of the input as a string, i.e., by employing linearization techniques\nthat do not always preserve information in the table, are verbose, and lack\nspace efficiency. We propose to rethink data-to-text generation as a visual\nrecognition task, removing the need for rendering the input in a string format.\nWe present PixT3, a multimodal table-to-text model that overcomes the\nchallenges of linearization and input size limitations encountered by existing\nmodels. PixT3 is trained with a new self-supervised learning objective to\nreinforce table structure awareness and is applicable to open-ended and\ncontrolled generation settings. Experiments on the ToTTo and Logic2Text\nbenchmarks show that PixT3 is competitive and, in some settings, superior to\ngenerators that operate solely on text.\n","authors":["Iñigo Alonso","Eneko Agirre","Mirella Lapata"],"pdf_url":"https://arxiv.org/pdf/2311.09808v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.17505v3","updated":"2024-06-03T17:35:04Z","published":"2024-01-30T23:46:35Z","title":"Arrows of Time for Large Language Models","summary":"  We study the probabilistic modeling performed by Autoregressive Large\nLanguage Models (LLMs) through the angle of time directionality, addressing a\nquestion first raised in (Shannon, 1951). For large enough models, we\nempirically find a time asymmetry in their ability to learn natural language: a\ndifference in the average log-perplexity when trying to predict the next token\nversus when trying to predict the previous one. This difference is at the same\ntime subtle and very consistent across various modalities (language, model\nsize, training time, ...). Theoretically, this is surprising: from an\ninformation-theoretic point of view, there should be no such difference. We\nprovide a theoretical framework to explain how such an asymmetry can appear\nfrom sparsity and computational complexity considerations, and outline a number\nof perspectives opened by our results.\n","authors":["Vassilis Papadopoulos","Jérémie Wenger","Clément Hongler"],"pdf_url":"https://arxiv.org/pdf/2401.17505v3.pdf","comment":"Re-arranged and updated figures. Added experiments. 12 figures, 20\n  pages"},{"id":"http://arxiv.org/abs/2403.17846v2","updated":"2024-06-03T17:12:25Z","published":"2024-03-26T16:36:43Z","title":"Hierarchical Open-Vocabulary 3D Scene Graphs for Language-Grounded Robot\n  Navigation","summary":"  Recent open-vocabulary robot mapping methods enrich dense geometric maps with\npre-trained visual-language features. While these maps allow for the prediction\nof point-wise saliency maps when queried for a certain language concept,\nlarge-scale environments and abstract queries beyond the object level still\npose a considerable hurdle, ultimately limiting language-grounded robotic\nnavigation. In this work, we present HOV-SG, a hierarchical open-vocabulary 3D\nscene graph mapping approach for language-grounded robot navigation. Leveraging\nopen-vocabulary vision foundation models, we first obtain state-of-the-art\nopen-vocabulary segment-level maps in 3D and subsequently construct a 3D scene\ngraph hierarchy consisting of floor, room, and object concepts, each enriched\nwith open-vocabulary features. Our approach is able to represent multi-story\nbuildings and allows robotic traversal of those using a cross-floor Voronoi\ngraph. HOV-SG is evaluated on three distinct datasets and surpasses previous\nbaselines in open-vocabulary semantic accuracy on the object, room, and floor\nlevel while producing a 75% reduction in representation size compared to dense\nopen-vocabulary maps. In order to prove the efficacy and generalization\ncapabilities of HOV-SG, we showcase successful long-horizon\nlanguage-conditioned robot navigation within real-world multi-storage\nenvironments. We provide code and trial video data at http://hovsg.github.io/.\n","authors":["Abdelrhman Werby","Chenguang Huang","Martin Büchner","Abhinav Valada","Wolfram Burgard"],"pdf_url":"https://arxiv.org/pdf/2403.17846v2.pdf","comment":"Code and video are available at http://hovsg.github.io/"},{"id":"http://arxiv.org/abs/2401.04854v3","updated":"2024-06-03T17:01:06Z","published":"2024-01-10T00:05:45Z","title":"Are Language Models More Like Libraries or Like Librarians?\n  Bibliotechnism, the Novel Reference Problem, and the Attitudes of LLMs","summary":"  Are LLMs cultural technologies like photocopiers or printing presses, which\ntransmit information but cannot create new content? A challenge for this idea,\nwhich we call bibliotechnism, is that LLMs generate novel text. We begin with a\ndefense of bibliotechnism, showing how even novel text may inherit its meaning\nfrom original human-generated text. We then argue that bibliotechnism faces an\nindependent challenge from examples in which LLMs generate novel reference,\nusing new names to refer to new entities. Such examples could be explained if\nLLMs were not cultural technologies but had beliefs, desires, and intentions.\nAccording to interpretationism in the philosophy of mind, a system has such\nattitudes if and only if its behavior is well explained by the hypothesis that\nit does. Interpretationists may hold that LLMs have attitudes, and thus have a\nsimple solution to the novel reference problem. We emphasize, however, that\ninterpretationism is compatible with very simple creatures having attitudes and\ndiffers sharply from views that presuppose these attitudes require\nconsciousness, sentience, or intelligence (topics about which we make no\nclaims).\n","authors":["Harvey Lederman","Kyle Mahowald"],"pdf_url":"https://arxiv.org/pdf/2401.04854v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.14867v2","updated":"2024-06-03T16:59:20Z","published":"2023-12-22T17:45:19Z","title":"VIEScore: Towards Explainable Metrics for Conditional Image Synthesis\n  Evaluation","summary":"  In the rapidly advancing field of conditional image generation research,\nchallenges such as limited explainability lie in effectively evaluating the\nperformance and capabilities of various models. This paper introduces VIEScore,\na Visual Instruction-guided Explainable metric for evaluating any conditional\nimage generation tasks. VIEScore leverages general knowledge from Multimodal\nLarge Language Models (MLLMs) as the backbone and does not require training or\nfine-tuning. We evaluate VIEScore on seven prominent tasks in conditional image\ntasks and found: (1) VIEScore (GPT4-o) achieves a high Spearman correlation of\n0.4 with human evaluations, while the human-to-human correlation is 0.45. (2)\nVIEScore (with open-source MLLM) is significantly weaker than GPT-4o and GPT-4v\nin evaluating synthetic images. (3) VIEScore achieves a correlation on par with\nhuman ratings in the generation tasks but struggles in editing tasks. With\nthese results, we believe VIEScore shows its great potential to replace human\njudges in evaluating image synthesis tasks.\n","authors":["Max Ku","Dongfu Jiang","Cong Wei","Xiang Yue","Wenhu Chen"],"pdf_url":"https://arxiv.org/pdf/2312.14867v2.pdf","comment":"Accepted to ACL2024 main"},{"id":"http://arxiv.org/abs/2403.01748v3","updated":"2024-06-03T16:58:04Z","published":"2024-03-04T05:55:01Z","title":"NeuSpeech: Decode Neural signal as Speech","summary":"  Decoding language from brain dynamics is an important open direction in the\nrealm of brain-computer interface (BCI), especially considering the rapid\ngrowth of large language models. Compared to invasive-based signals which\nrequire electrode implantation surgery, non-invasive neural signals (e.g. EEG,\nMEG) have attracted increasing attention considering their safety and\ngenerality. However, the exploration is not adequate in three aspects: 1)\nprevious methods mainly focus on EEG but none of the previous works address\nthis problem on MEG with better signal quality; 2) prior works have\npredominantly used $``teacher-forcing\"$ during generative decoding, which is\nimpractical; 3) prior works are mostly $``BART-based\"$ not fully\nauto-regressive, which performs better in other sequence tasks. In this paper,\nwe explore the brain-to-text translation of MEG signals in a speech-decoding\nformation. Here we are the first to investigate a cross-attention-based\n``whisper\" model for generating text directly from MEG signals without teacher\nforcing. Our model achieves impressive BLEU-1 scores of 60.30 and 52.89 without\npretraining $\\&$ teacher-forcing on two major datasets ($\\textit{GWilliams}$\nand $\\textit{Schoffelen}$). This paper conducts a comprehensive review to\nunderstand how speech decoding formation performs on the neural decoding tasks,\nincluding pretraining initialization, training $\\&$ evaluation set splitting,\naugmentation, and scaling law. Code is available at\nhttps://github.com/NeuSpeech/NeuSpeech1$.\n","authors":["Yiqian Yang","Yiqun Duan","Qiang Zhang","Hyejeong Jo","Jinni Zhou","Won Hee Lee","Renjing Xu","Hui Xiong"],"pdf_url":"https://arxiv.org/pdf/2403.01748v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.08277v5","updated":"2024-06-03T16:48:59Z","published":"2024-02-13T08:12:48Z","title":"Towards Faithful and Robust LLM Specialists for Evidence-Based\n  Question-Answering","summary":"  Advances towards more faithful and traceable answers of Large Language Models\n(LLMs) are crucial for various research and practical endeavors. One avenue in\nreaching this goal is basing the answers on reliable sources. However, this\nEvidence-Based QA has proven to work insufficiently with LLMs in terms of\nciting the correct sources (source quality) and truthfully representing the\ninformation within sources (answer attributability). In this work, we\nsystematically investigate how to robustly fine-tune LLMs for better source\nquality and answer attributability. Specifically, we introduce a data\ngeneration pipeline with automated data quality filters, which can synthesize\ndiversified high-quality training and testing data at scale. We further\nintroduce four test sets to benchmark the robustness of fine-tuned specialist\nmodels. Extensive evaluation shows that fine-tuning on synthetic data improves\nperformance on both in- and out-of-distribution. Furthermore, we show that data\nquality, which can be drastically improved by proposed quality filters, matters\nmore than quantity in improving Evidence-Based QA.\n","authors":["Tobias Schimanski","Jingwei Ni","Mathias Kraus","Elliott Ash","Markus Leippold"],"pdf_url":"https://arxiv.org/pdf/2402.08277v5.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.14555v4","updated":"2024-06-03T16:43:16Z","published":"2024-05-23T13:35:34Z","title":"Subtle Biases Need Subtler Measures: Dual Metrics for Evaluating\n  Representative and Affinity Bias in Large Language Models","summary":"  Research on Large Language Models (LLMs) has often neglected subtle biases\nthat, although less apparent, can significantly influence the models' outputs\ntoward particular social narratives. This study addresses two such biases\nwithin LLMs: representative bias, which denotes a tendency of LLMs to generate\noutputs that mirror the experiences of certain identity groups, and affinity\nbias, reflecting the models' evaluative preferences for specific narratives or\nviewpoints. We introduce two novel metrics to measure these biases: the\nRepresentative Bias Score (RBS) and the Affinity Bias Score (ABS), and present\nthe Creativity-Oriented Generation Suite (CoGS), a collection of open-ended\ntasks such as short story writing and poetry composition, designed with\ncustomized rubrics to detect these subtle biases. Our analysis uncovers marked\nrepresentative biases in prominent LLMs, with a preference for identities\nassociated with being white, straight, and men. Furthermore, our investigation\nof affinity bias reveals distinctive evaluative patterns within each model,\nakin to `bias fingerprints'. This trend is also seen in human evaluators,\nhighlighting a complex interplay between human and machine bias perceptions.\n","authors":["Abhishek Kumar","Sarfaroz Yunusov","Ali Emami"],"pdf_url":"https://arxiv.org/pdf/2405.14555v4.pdf","comment":"9 pages (excluding references), accepted to ACL 2024 Main Conference"},{"id":"http://arxiv.org/abs/2405.16277v3","updated":"2024-06-03T16:42:55Z","published":"2024-05-25T15:28:22Z","title":"Picturing Ambiguity: A Visual Twist on the Winograd Schema Challenge","summary":"  Large Language Models (LLMs) have demonstrated remarkable success in tasks\nlike the Winograd Schema Challenge (WSC), showcasing advanced textual\ncommon-sense reasoning. However, applying this reasoning to multimodal domains,\nwhere understanding text and images together is essential, remains a\nsubstantial challenge. To address this, we introduce WinoVis, a novel dataset\nspecifically designed to probe text-to-image models on pronoun disambiguation\nwithin multimodal contexts. Utilizing GPT-4 for prompt generation and Diffusion\nAttentive Attribution Maps (DAAM) for heatmap analysis, we propose a novel\nevaluation framework that isolates the models' ability in pronoun\ndisambiguation from other visual processing challenges. Evaluation of\nsuccessive model versions reveals that, despite incremental advancements,\nStable Diffusion 2.0 achieves a precision of 56.7% on WinoVis, only marginally\nsurpassing random guessing. Further error analysis identifies important areas\nfor future research aimed at advancing text-to-image models in their ability to\ninterpret and interact with the complex visual world.\n","authors":["Brendan Park","Madeline Janecek","Naser Ezzati-Jivan","Yifeng Li","Ali Emami"],"pdf_url":"https://arxiv.org/pdf/2405.16277v3.pdf","comment":"9 pages (excluding references), accepted to ACL 2024 Main Conference"},{"id":"http://arxiv.org/abs/2405.16282v3","updated":"2024-06-03T16:41:53Z","published":"2024-05-25T15:42:04Z","title":"Confidence Under the Hood: An Investigation into the\n  Confidence-Probability Alignment in Large Language Models","summary":"  As the use of Large Language Models (LLMs) becomes more widespread,\nunderstanding their self-evaluation of confidence in generated responses\nbecomes increasingly important as it is integral to the reliability of the\noutput of these models. We introduce the concept of Confidence-Probability\nAlignment, that connects an LLM's internal confidence, quantified by token\nprobabilities, to the confidence conveyed in the model's response when\nexplicitly asked about its certainty. Using various datasets and prompting\ntechniques that encourage model introspection, we probe the alignment between\nmodels' internal and expressed confidence. These techniques encompass using\nstructured evaluation scales to rate confidence, including answer options when\nprompting, and eliciting the model's confidence level for outputs it does not\nrecognize as its own. Notably, among the models analyzed, OpenAI's GPT-4 showed\nthe strongest confidence-probability alignment, with an average Spearman's\n$\\hat{\\rho}$ of 0.42, across a wide range of tasks. Our work contributes to the\nongoing efforts to facilitate risk assessment in the application of LLMs and to\nfurther our understanding of model trustworthiness.\n","authors":["Abhishek Kumar","Robert Morabito","Sanzhar Umbet","Jad Kabbara","Ali Emami"],"pdf_url":"https://arxiv.org/pdf/2405.16282v3.pdf","comment":"9 pages (excluding references), accepted to ACL 2024 Main Conference"},{"id":"http://arxiv.org/abs/2307.02863v5","updated":"2024-06-03T16:32:02Z","published":"2023-07-06T09:03:10Z","title":"ValiTex -- a unified validation framework for computational text-based\n  measures of social constructs","summary":"  Guidance on how to validate computational text-based measures of social\nconstructs is fragmented. While researchers generally acknowledge the\nimportance of validating text-based measures, they often lack a shared\nvocabulary and a unified framework to do so. This paper introduces ValiText, a\nnew validation framework designed to assist scholars in validly measuring\nsocial constructs in textual data. The framework is built on a conceptual\nfoundation of validity in the social sciences, strengthened by an empirical\nreview of validation practices in the social sciences and consultations with\nexperts. Ultimately, ValiText prescribes researchers to demonstrate three types\nof validation evidence: substantive evidence (outlining the theoretical\nunderpinning of the measure), structural evidence (examining the properties of\nthe text model and its output) and external evidence (testing for how the\nmeasure relates to independent information). The framework is further\nsupplemented by a checklist of validation steps, offering practical guidance in\nthe form of documentation sheets that guide researchers in the validation\nprocess.\n","authors":["Lukas Birkenmaier","Claudia Wagner","Clemens Lechner"],"pdf_url":"https://arxiv.org/pdf/2307.02863v5.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.06270v3","updated":"2024-06-03T16:23:28Z","published":"2024-05-10T06:52:44Z","title":"XAI4LLM. Let Machine Learning Models and LLMs Collaborate for Enhanced\n  In-Context Learning in Healthcare","summary":"  The integration of Large Language Models (LLMs) into healthcare diagnostics\noffers a promising avenue for clinical decision-making. This study outlines the\ndevelopment of a novel method for zero-shot/few-shot in-context learning (ICL)\nby integrating medical domain knowledge using a multi-layered structured\nprompt. We also explore the efficacy of two communication styles between the\nuser and LLMs: the Numerical Conversational (NC) style, which processes data\nincrementally, and the Natural Language Single-Turn (NL-ST) style, which\nemploys long narrative prompts.\n  Our study systematically evaluates the diagnostic accuracy and risk factors,\nincluding gender bias and false negative rates, using a dataset of 920 patient\nrecords in various few-shot scenarios. Results indicate that traditional\nclinical machine learning (ML) models generally outperform LLMs in zero-shot\nand few-shot settings. However, the performance gap narrows significantly when\nemploying few-shot examples alongside effective explainable AI (XAI) methods as\nsources of domain knowledge. Moreover, with sufficient time and an increased\nnumber of examples, the conversational style (NC) nearly matches the\nperformance of ML models. Most notably, LLMs demonstrate comparable or superior\ncost-sensitive accuracy relative to ML models.\n  This research confirms that, with appropriate domain knowledge and tailored\ncommunication strategies, LLMs can significantly enhance diagnostic processes.\nThe findings highlight the importance of optimizing the number of training\nexamples and communication styles to improve accuracy and reduce biases in LLM\napplications.\n","authors":["Fatemeh Nazary","Yashar Deldjoo","Tommaso Di Noia","Eugenio di Sciascio"],"pdf_url":"https://arxiv.org/pdf/2405.06270v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.19334v2","updated":"2024-06-03T16:19:03Z","published":"2024-02-29T16:37:08Z","title":"Here's a Free Lunch: Sanitizing Backdoored Models with Model Merge","summary":"  The democratization of pre-trained language models through open-source\ninitiatives has rapidly advanced innovation and expanded access to cutting-edge\ntechnologies. However, this openness also brings significant security risks,\nincluding backdoor attacks, where hidden malicious behaviors are triggered by\nspecific inputs, compromising natural language processing (NLP) system\nintegrity and reliability. This paper suggests that merging a backdoored model\nwith other homogeneous models can significantly remediate backdoor\nvulnerabilities even if such models are not entirely secure. In our\nexperiments, we verify our hypothesis on various models (BERT-Base,\nRoBERTa-Large, Llama2-7B, and Mistral-7B) and datasets (SST-2, OLID, AG News,\nand QNLI). Compared to multiple advanced defensive approaches, our method\noffers an effective and efficient inference-stage defense against backdoor\nattacks on classification and instruction-tuned tasks without additional\nresources or specific knowledge. Our approach consistently outperforms recent\nadvanced baselines, leading to an average of about 75% reduction in the attack\nsuccess rate. Since model merging has been an established approach for\nimproving model performance, the extra advantage it provides regarding defense\ncan be seen as a cost-free bonus.\n","authors":["Ansh Arora","Xuanli He","Maximilian Mozes","Srinibas Swain","Mark Dras","Qiongkai Xu"],"pdf_url":"https://arxiv.org/pdf/2402.19334v2.pdf","comment":"accepted to ACL2024 (Findings)"},{"id":"http://arxiv.org/abs/2405.19701v2","updated":"2024-06-03T15:59:34Z","published":"2024-05-30T05:26:57Z","title":"Significance of Chain of Thought in Gender Bias Mitigation for\n  English-Dravidian Machine Translation","summary":"  Gender bias in machine translation (MT) sys- tems poses a significant\nchallenge to achieving accurate and inclusive translations. This paper examines\ngender bias in machine translation systems for languages such as Telugu and\nKan- nada from the Dravidian family, analyzing how gender inflections affect\ntranslation accuracy and neutrality using Google Translate and Chat- GPT. It\nfinds that while plural forms can reduce bias, individual-centric sentences\noften main- tain the bias due to historical stereotypes. The study evaluates\nthe Chain of Thought process- ing, noting significant bias mitigation from 80%\nto 4% in Telugu and from 40% to 0% in Kan- nada. It also compares Telugu and\nKannada translations, emphasizing the need for language specific strategies to\naddress these challenges and suggesting directions for future research to\nenhance fairness in both data preparation and prompts during inference.\n","authors":["Lavanya Prahallad","Radhika Mamidi"],"pdf_url":"https://arxiv.org/pdf/2405.19701v2.pdf","comment":"6 pages"},{"id":"http://arxiv.org/abs/2405.19266v2","updated":"2024-06-03T15:27:10Z","published":"2024-05-29T16:59:38Z","title":"PediatricsGPT: Large Language Models as Chinese Medical Assistants for\n  Pediatric Applications","summary":"  Developing intelligent pediatric consultation systems offers promising\nprospects for improving diagnostic efficiency, especially in China, where\nhealthcare resources are scarce. Despite recent advances in Large Language\nModels (LLMs) for Chinese medicine, their performance is sub-optimal in\npediatric applications due to inadequate instruction data and vulnerable\ntraining procedures. To address the above issues, this paper builds PedCorpus,\na high-quality dataset of over 300,000 multi-task instructions from pediatric\ntextbooks, guidelines, and knowledge graph resources to fulfil diverse\ndiagnostic demands. Upon well-designed PedCorpus, we propose PediatricsGPT, the\nfirst Chinese pediatric LLM assistant built on a systematic and robust training\npipeline. In the continuous pre-training phase, we introduce a hybrid\ninstruction pre-training mechanism to mitigate the internal-injected knowledge\ninconsistency of LLMs for medical domain adaptation. Immediately, the\nfull-parameter Supervised Fine-Tuning (SFT) is utilized to incorporate the\ngeneral medical knowledge schema into the models. After that, we devise a\ndirect following preference optimization to enhance the generation of\npediatrician-like humanistic responses. In the parameter-efficient secondary\nSFT phase, a mixture of universal-specific experts strategy is presented to\nresolve the competency conflict between medical generalist and pediatric\nexpertise mastery. Extensive results based on the metrics, GPT-4, and doctor\nevaluations on distinct doctor downstream tasks show that PediatricsGPT\nconsistently outperforms previous Chinese medical LLMs. Our model and dataset\nwill be open-source for community development.\n","authors":["Dingkang Yang","Jinjie Wei","Dongling Xiao","Shunli Wang","Tong Wu","Gang Li","Mingcheng Li","Shuaibing Wang","Jiawei Chen","Yue Jiang","Qingyao Xu","Ke Li","Peng Zhai","Lihua Zhang"],"pdf_url":"https://arxiv.org/pdf/2405.19266v2.pdf","comment":"A Technical Report on a Chinese Medical Large Language Model"},{"id":"http://arxiv.org/abs/2405.20962v2","updated":"2024-06-03T15:10:53Z","published":"2024-05-31T16:07:33Z","title":"Large Language Models are Zero-Shot Next Location Predictors","summary":"  Predicting the locations an individual will visit in the future is crucial\nfor solving many societal issues like disease diffusion and reduction of\npollution among many others. The models designed to tackle next-location\nprediction, however, require a significant amount of individual-level\ninformation to be trained effectively. Such data may be scarce or even\nunavailable in some geographic regions or peculiar scenarios (e.g., cold-start\nin recommendation systems). Moreover, the design of a next-location predictor\nable to generalize or geographically transfer knowledge is still an open\nresearch challenge. Recent advances in natural language processing have led to\na rapid diffusion of Large Language Models (LLMs) which have shown good\ngeneralization and reasoning capabilities. These insights, coupled with the\nrecent findings that LLMs are rich in geographical knowledge, allowed us to\nbelieve that these models can act as zero-shot next-location predictors. This\npaper evaluates the capabilities of many popular LLMs in this role,\nspecifically Llama, GPT-3.5 and Mistral 7B. After designing a proper prompt, we\ntested the models on three real-world mobility datasets. The results show that\nLLMs can obtain accuracies up to 32.4%, a significant relative improvement of\nover 600% when compared to sophisticated DL models specifically designed for\nhuman mobility. Moreover, we show that other LLMs are unable to perform the\ntask properly. To prevent positively biased results, we also propose a\nframework inspired by other studies to test data contamination. Finally, we\nexplored the possibility of using LLMs as text-based explainers for\nnext-location prediction showing that can effectively provide an explanation\nfor their decision. Notably, 7B models provide more generic, but still\nreliable, explanations compared to larger counterparts. Code:\ngithub.com/ssai-trento/LLM-zero-shot-NL\n","authors":["Ciro Beneduce","Bruno Lepri","Massimiliano Luca"],"pdf_url":"https://arxiv.org/pdf/2405.20962v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.03407v2","updated":"2024-06-03T15:00:47Z","published":"2024-03-06T02:23:32Z","title":"Human vs. Machine: Behavioral Differences Between Expert Humans and\n  Language Models in Wargame Simulations","summary":"  To some, the advent of artificial intelligence (AI) promises better\ndecision-making and increased military effectiveness while reducing the\ninfluence of human error and emotions. However, there is still debate about how\nAI systems, especially large language models (LLMs), behave compared to humans\nin high-stakes military decision-making scenarios with the potential for\nincreased risks towards escalation and unnecessary conflicts. To test this\npotential and scrutinize the use of LLMs for such purposes, we use a new\nwargame experiment with 107 national security experts designed to look at\ncrisis escalation in a fictional US-China scenario and compare human players to\nLLM-simulated responses in separate simulations. Wargames have a long history\nin the development of military strategy and the response of nations to threats\nor attacks. Here, we show a considerable high-level agreement in the LLM and\nhuman responses and significant quantitative and qualitative differences in\nindividual actions and strategic tendencies. These differences depend on\nintrinsic biases in LLMs regarding the appropriate level of violence following\nstrategic instructions, the choice of LLM, and whether the LLMs are tasked to\ndecide for a team of players directly or first to simulate dialog between\nplayers. When simulating the dialog, the discussions lack quality and maintain\na farcical harmony. The LLM simulations cannot account for human player\ncharacteristics, showing no significant difference even for extreme traits,\nsuch as \"pacifist\" or \"aggressive sociopath\". Our results motivate policymakers\nto be cautious before granting autonomy or following AI-based strategy\nrecommendations.\n","authors":["Max Lamparth","Anthony Corso","Jacob Ganz","Oriana Skylar Mastro","Jacquelyn Schneider","Harold Trinkunas"],"pdf_url":"https://arxiv.org/pdf/2403.03407v2.pdf","comment":"Updated with new plot and more details"},{"id":"http://arxiv.org/abs/2306.06427v3","updated":"2024-06-03T14:59:11Z","published":"2023-06-10T12:42:36Z","title":"Boosting Language Models Reasoning with Chain-of-Knowledge Prompting","summary":"  Recently, Chain-of-Thought (CoT) prompting has delivered success on complex\nreasoning tasks, which aims at designing a simple prompt like ``Let's think\nstep by step'' or multiple in-context exemplars with well-designed rationales\nto elicit Large Language Models (LLMs) to generate intermediate reasoning\nsteps. However, the generated rationales often come with mistakes, making\nunfactual and unfaithful reasoning chains. To mitigate this brittleness, we\npropose a novel Chain-of-Knowledge (CoK) prompting, where we aim at eliciting\nLLMs to generate explicit pieces of knowledge evidence in the form of structure\ntriple. This is inspired by our human behaviors, i.e., we can draw a mind map\nor knowledge map as the reasoning evidence in the brain before answering a\ncomplex question. Benefiting from CoK, we additionally introduce a\nF^2-Verification method to estimate the reliability of the reasoning chains in\nterms of factuality and faithfulness. For the unreliable response, the wrong\nevidence can be indicated to prompt the LLM to rethink. Extensive experiments\ndemonstrate that our method can further improve the performance of commonsense,\nfactual, symbolic, and arithmetic reasoning tasks.\n","authors":["Jianing Wang","Qiushi Sun","Xiang Li","Ming Gao"],"pdf_url":"https://arxiv.org/pdf/2306.06427v3.pdf","comment":"ACL 2024"},{"id":"http://arxiv.org/abs/2403.15447v2","updated":"2024-06-03T14:49:00Z","published":"2024-03-18T01:38:19Z","title":"Decoding Compressed Trust: Scrutinizing the Trustworthiness of Efficient\n  LLMs Under Compression","summary":"  Compressing high-capability Large Language Models (LLMs) has emerged as a\nfavored strategy for resource-efficient inferences. While state-of-the-art\n(SoTA) compression methods boast impressive advancements in preserving benign\ntask performance, the potential risks of compression in terms of safety and\ntrustworthiness have been largely neglected. This study conducts the first,\nthorough evaluation of three (3) leading LLMs using five (5) SoTA compression\ntechniques across eight (8) trustworthiness dimensions. Our experiments\nhighlight the intricate interplay between compression and trustworthiness,\nrevealing some interesting patterns. We find that quantization is currently a\nmore effective approach than pruning in achieving efficiency and\ntrustworthiness simultaneously. For instance, a 4-bit quantized model retains\nthe trustworthiness of its original counterpart, but model pruning\nsignificantly degrades trustworthiness, even at 50% sparsity. Moreover,\nemploying quantization within a moderate bit range could unexpectedly improve\ncertain trustworthiness dimensions such as ethics and fairness. Conversely,\nextreme quantization to very low bit levels (3 bits) tends to reduce\ntrustworthiness significantly. This increased risk cannot be uncovered by\nlooking at benign performance alone, in turn, mandating comprehensive\ntrustworthiness evaluation in practice. These findings culminate in practical\nrecommendations for simultaneously achieving high utility, efficiency, and\ntrustworthiness in LLMs. Code and models are available at\nhttps://decoding-comp-trust.github.io.\n","authors":["Junyuan Hong","Jinhao Duan","Chenhui Zhang","Zhangheng Li","Chulin Xie","Kelsey Lieberman","James Diffenderfer","Brian Bartoldson","Ajay Jaiswal","Kaidi Xu","Bhavya Kailkhura","Dan Hendrycks","Dawn Song","Zhangyang Wang","Bo Li"],"pdf_url":"https://arxiv.org/pdf/2403.15447v2.pdf","comment":"Accepted to ICML'24"},{"id":"http://arxiv.org/abs/2402.02416v3","updated":"2024-06-03T14:33:45Z","published":"2024-02-04T09:24:51Z","title":"Aligner: Efficient Alignment by Learning to Correct","summary":"  With the rapid development of large language models (LLMs) and ever-evolving\npractical requirements, finding an efficient and effective alignment method has\nnever been more critical. However, the tension between the complexity of\ncurrent alignment methods and the need for rapid iteration in deployment\nscenarios necessitates the development of a model-agnostic alignment approach\nthat can operate under these constraints. In this paper, we introduce Aligner,\na novel and simple alignment paradigm that learns the correctional residuals\nbetween preferred and dispreferred answers using a small model. Designed as a\nmodel-agnostic, plug-and-play module, Aligner can be directly applied to\nvarious open-source and API-based models with only one-off training, making it\nsuitable for rapid iteration. Notably, Aligner can be applied to any powerful,\nlarge-scale upstream models. Moreover, it can even iteratively bootstrap the\nupstream models using corrected responses as synthetic human preference data,\nbreaking through the model's performance ceiling. Our experiments demonstrate\nperformance improvements by deploying the same Aligner model across 11\ndifferent LLMs, evaluated on the 3H dimensions (helpfulness, harmlessness, and\nhonesty). Specifically, Aligner-7B has achieved an average improvement of\n68.9\\% in helpfulness and 23.8\\% in harmlessness across the tested LLMs while\nalso effectively reducing hallucination. In the Alpaca-Eval leaderboard,\nstacking Aligner-2B on GPT-4 Turbo improved its LC Win Rate from 55.0\\% to\n58.3\\%, surpassing GPT-4 Omni's 57.5\\% Win Rate (community report).\n","authors":["Jiaming Ji","Boyuan Chen","Hantao Lou","Donghai Hong","Borong Zhang","Xuehai Pan","Juntao Dai","Tianyi Qiu","Yaodong Yang"],"pdf_url":"https://arxiv.org/pdf/2402.02416v3.pdf","comment":"29 pages"},{"id":"http://arxiv.org/abs/2402.08567v2","updated":"2024-06-03T14:15:03Z","published":"2024-02-13T16:06:17Z","title":"Agent Smith: A Single Image Can Jailbreak One Million Multimodal LLM\n  Agents Exponentially Fast","summary":"  A multimodal large language model (MLLM) agent can receive instructions,\ncapture images, retrieve histories from memory, and decide which tools to use.\nNonetheless, red-teaming efforts have revealed that adversarial images/prompts\ncan jailbreak an MLLM and cause unaligned behaviors. In this work, we report an\neven more severe safety issue in multi-agent environments, referred to as\ninfectious jailbreak. It entails the adversary simply jailbreaking a single\nagent, and without any further intervention from the adversary, (almost) all\nagents will become infected exponentially fast and exhibit harmful behaviors.\nTo validate the feasibility of infectious jailbreak, we simulate multi-agent\nenvironments containing up to one million LLaVA-1.5 agents, and employ\nrandomized pair-wise chat as a proof-of-concept instantiation for multi-agent\ninteraction. Our results show that feeding an (infectious) adversarial image\ninto the memory of any randomly chosen agent is sufficient to achieve\ninfectious jailbreak. Finally, we derive a simple principle for determining\nwhether a defense mechanism can provably restrain the spread of infectious\njailbreak, but how to design a practical defense that meets this principle\nremains an open question to investigate. Our project page is available at\nhttps://sail-sg.github.io/Agent-Smith/.\n","authors":["Xiangming Gu","Xiaosen Zheng","Tianyu Pang","Chao Du","Qian Liu","Ye Wang","Jing Jiang","Min Lin"],"pdf_url":"https://arxiv.org/pdf/2402.08567v2.pdf","comment":"ICML 2024"},{"id":"http://arxiv.org/abs/2402.14856v2","updated":"2024-06-03T13:53:01Z","published":"2024-02-20T12:58:14Z","title":"Comparing Inferential Strategies of Humans and Large Language Models in\n  Deductive Reasoning","summary":"  Deductive reasoning plays a pivotal role in the formulation of sound and\ncohesive arguments. It allows individuals to draw conclusions that logically\nfollow, given the truth value of the information provided. Recent progress in\nthe domain of large language models (LLMs) has showcased their capability in\nexecuting deductive reasoning tasks. Nonetheless, a significant portion of\nresearch primarily assesses the accuracy of LLMs in solving such tasks, often\noverlooking a deeper analysis of their reasoning behavior. In this study, we\ndraw upon principles from cognitive psychology to examine inferential\nstrategies employed by LLMs, through a detailed evaluation of their responses\nto propositional logic problems. Our findings indicate that LLMs display\nreasoning patterns akin to those observed in humans, including strategies like\n$\\textit{supposition following}$ or $\\textit{chain construction}$. Moreover,\nour research demonstrates that the architecture and scale of the model\nsignificantly affect its preferred method of reasoning, with more advanced\nmodels tending to adopt strategies more frequently than less sophisticated\nones. Importantly, we assert that a model's accuracy, that is the correctness\nof its final conclusion, does not necessarily reflect the validity of its\nreasoning process. This distinction underscores the necessity for more nuanced\nevaluation procedures in the field.\n","authors":["Philipp Mondorf","Barbara Plank"],"pdf_url":"https://arxiv.org/pdf/2402.14856v2.pdf","comment":"ACL 2024 main, 31 pages, 19 figures"},{"id":"http://arxiv.org/abs/2312.10302v4","updated":"2024-06-03T13:46:16Z","published":"2023-12-16T03:33:12Z","title":"One-Shot Learning as Instruction Data Prospector for Large Language\n  Models","summary":"  Contemporary practices in instruction tuning often hinge on enlarging data\nscaling without a clear strategy for ensuring data quality, inadvertently\nintroducing noise that may compromise model performance. To address this\nchallenge, we introduce \\textsc{Nuggets}, a novel and efficient methodology\nthat leverages one-shot learning to discern and select high-quality instruction\ndata from extensive datasets. \\textsc{Nuggets} assesses the potential of\nindividual instruction examples to act as effective one-shot learning\ninstances, thereby identifying those that can significantly improve performance\nacross diverse tasks. \\textsc{Nuggets} utilizes a scoring system based on the\nimpact of candidate examples on the perplexity of a diverse anchor set,\nfacilitating the selection of the most advantageous data for instruction\ntuning. Through comprehensive evaluations on two benchmarks, including MT-Bench\nand Alpaca-Eval, we show that instruction tuning with the top 1\\% of examples\ncurated by \\textsc{Nuggets} substantially outperforms conventional methods\nemploying the entire dataset.\n","authors":["Yunshui Li","Binyuan Hui","Xiaobo Xia","Jiaxi Yang","Min Yang","Lei Zhang","Shuzheng Si","Ling-Hao Chen","Junhao Liu","Tongliang Liu","Fei Huang","Yongbin Li"],"pdf_url":"https://arxiv.org/pdf/2312.10302v4.pdf","comment":"ACL 2024"},{"id":"http://arxiv.org/abs/2309.08637v5","updated":"2024-06-03T13:39:40Z","published":"2023-09-14T15:34:01Z","title":"TextBind: Multi-turn Interleaved Multimodal Instruction-following in the\n  Wild","summary":"  Large language models with instruction-following abilities have\nrevolutionized the field of artificial intelligence. These models show\nexceptional generalizability to tackle various real-world tasks through their\nnatural language interfaces. However, their performance heavily relies on\nhigh-quality exemplar data, which is often difficult to obtain. This challenge\nis further exacerbated when it comes to multimodal instruction following. We\nintroduce TextBind, an almost annotation-free framework for empowering larger\nlanguage models with the multi-turn interleaved multimodal\ninstruction-following capabilities. Our approach requires only image-caption\npairs and generates multi-turn multimodal instruction-response conversations\nfrom a language model. To accommodate interleaved image-text inputs and\noutputs, we devise MIM, a language model-centric architecture that seamlessly\nintegrates image encoder and decoder models. We release our dataset, model, and\ndemo to foster future research in the area of multimodal instruction following.\n","authors":["Huayang Li","Siheng Li","Deng Cai","Longyue Wang","Lemao Liu","Taro Watanabe","Yujiu Yang","Shuming Shi"],"pdf_url":"https://arxiv.org/pdf/2309.08637v5.pdf","comment":"Findings of ACL 2024"},{"id":"http://arxiv.org/abs/2401.10695v2","updated":"2024-06-03T13:32:45Z","published":"2024-01-19T14:00:19Z","title":"LangBridge: Multilingual Reasoning Without Multilingual Supervision","summary":"  We introduce LangBridge, a zero-shot approach to adapt language models for\nmultilingual reasoning tasks without multilingual supervision. LangBridge\noperates by bridging two models, each specialized in different aspects: (1) one\nspecialized in understanding multiple languages (e.g., mT5 encoder) and (2) one\nspecialized in reasoning (e.g., MetaMath). LangBridge connects the two models\nby introducing minimal trainable parameters between them. Despite utilizing\nonly English data for training, LangBridge considerably enhances the\nperformance of language models on low-resource languages across mathematical\nreasoning, code completion, logical reasoning, and commonsense reasoning. Our\nanalysis suggests that the efficacy of LangBridge stems from the\nlanguage-agnostic characteristics of multilingual representations. We publicly\nrelease our code and models.\n","authors":["Dongkeun Yoon","Joel Jang","Sungdong Kim","Seungone Kim","Sheikh Shafayat","Minjoon Seo"],"pdf_url":"https://arxiv.org/pdf/2401.10695v2.pdf","comment":"ACL 2024 Main"},{"id":"http://arxiv.org/abs/2402.02805v2","updated":"2024-06-03T13:07:06Z","published":"2024-02-05T08:26:33Z","title":"Graph-enhanced Large Language Models in Asynchronous Plan Reasoning","summary":"  Planning is a fundamental property of human intelligence. Reasoning about\nasynchronous plans is challenging since it requires sequential and parallel\nplanning to optimize time costs. Can large language models (LLMs) succeed at\nthis task? Here, we present the first large-scale study investigating this\nquestion. We find that a representative set of closed and open-source LLMs,\nincluding GPT-4 and LLaMA-2, behave poorly when not supplied with illustrations\nabout the task-solving process in our benchmark AsyncHow. We propose a novel\ntechnique called Plan Like a Graph (PLaG) that combines graphs with natural\nlanguage prompts and achieves state-of-the-art results. We show that although\nPLaG can boost model performance, LLMs still suffer from drastic degradation\nwhen task complexity increases, highlighting the limits of utilizing LLMs for\nsimulating digital devices. We see our study as an exciting step towards using\nLLMs as efficient autonomous agents. Our code and data are available at\nhttps://github.com/fangru-lin/graph-llm-asynchow-plan.\n","authors":["Fangru Lin","Emanuele La Malfa","Valentin Hofmann","Elle Michelle Yang","Anthony Cohn","Janet B. Pierrehumbert"],"pdf_url":"https://arxiv.org/pdf/2402.02805v2.pdf","comment":"Accepted at ICML-2024"},{"id":"http://arxiv.org/abs/2303.06458v3","updated":"2024-06-03T12:47:12Z","published":"2023-03-11T17:14:33Z","title":"ZeroNLG: Aligning and Autoencoding Domains for Zero-Shot Multimodal and\n  Multilingual Natural Language Generation","summary":"  Natural Language Generation (NLG) accepts input data in the form of images,\nvideos, or text and generates corresponding natural language text as output.\nExisting NLG methods mainly adopt a supervised approach and rely heavily on\ncoupled data-to-text pairs. However, for many targeted scenarios and for\nnon-English languages, sufficient quantities of labeled data are often not\navailable. To relax the dependency on labeled data of downstream tasks, we\npropose an intuitive and effective zero-shot learning framework, ZeroNLG, which\ncan deal with multiple NLG tasks, including image-to-text (image captioning),\nvideo-to-text (video captioning), and text-to-text (neural machine\ntranslation), across English, Chinese, German, and French within a unified\nframework. ZeroNLG does not require any labeled downstream pairs for training.\nDuring training, ZeroNLG (i) projects different domains (across modalities and\nlanguages) to corresponding coordinates in a shared common latent space; (ii)\nbridges different domains by aligning their corresponding coordinates in this\nspace; and (iii) builds an unsupervised multilingual auto-encoder to learn to\ngenerate text by reconstructing the input text given its coordinate in shared\nlatent space. Consequently, during inference, based on the data-to-text\npipeline, ZeroNLG can generate target sentences across different languages\ngiven the coordinate of input data in the common space. Within this unified\nframework, given visual (imaging or video) data as input, ZeroNLG can perform\nzero-shot visual captioning; given textual sentences as input, ZeroNLG can\nperform zero-shot machine translation. We present the results of extensive\nexperiments on twelve NLG tasks, showing that, without using any labeled\ndownstream pairs for training, ZeroNLG generates high-quality and believable\noutputs and significantly outperforms existing zero-shot methods.\n","authors":["Bang Yang","Fenglin Liu","Yuexian Zou","Xian Wu","Yaowei Wang","David A. Clifton"],"pdf_url":"https://arxiv.org/pdf/2303.06458v3.pdf","comment":"Accepted by TPAMI (Our code and data are available at\n  https://github.com/yangbang18/ZeroNLG)"},{"id":"http://arxiv.org/abs/2405.11143v2","updated":"2024-06-03T12:19:18Z","published":"2024-05-20T01:04:40Z","title":"OpenRLHF: An Easy-to-use, Scalable and High-performance RLHF Framework","summary":"  As large language models (LLMs) continue to grow by scaling laws,\nreinforcement learning from human feedback (RLHF) has gained significant\nattention due to its outstanding performance. However, unlike pretraining or\nfine-tuning a single model, scaling reinforcement learning from human feedback\n(RLHF) for training large language models poses coordination challenges across\nfour models. We present OpenRLHF, an open-source framework enabling efficient\nRLHF scaling. Unlike existing RLHF frameworks that co-locate four models on the\nsame GPUs, OpenRLHF re-designs scheduling for the models beyond 70B parameters\nusing Ray, vLLM, and DeepSpeed, leveraging improved resource utilization and\ndiverse training approaches. Integrating seamlessly with Hugging Face, OpenRLHF\nprovides an out-of-the-box solution with optimized algorithms and launch\nscripts, which ensures user-friendliness. OpenRLHF implements RLHF, DPO,\nrejection sampling, and other alignment techniques. Empowering state-of-the-art\nLLM development, OpenRLHF's code is available at\nhttps://github.com/OpenLLMAI/OpenRLHF.\n","authors":["Jian Hu","Xibin Wu","Weixun Wang"," Xianyu","Dehao Zhang","Yu Cao"],"pdf_url":"https://arxiv.org/pdf/2405.11143v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2304.11082v6","updated":"2024-06-03T12:19:16Z","published":"2023-04-19T17:50:09Z","title":"Fundamental Limitations of Alignment in Large Language Models","summary":"  An important aspect in developing language models that interact with humans\nis aligning their behavior to be useful and unharmful for their human users.\nThis is usually achieved by tuning the model in a way that enhances desired\nbehaviors and inhibits undesired ones, a process referred to as alignment. In\nthis paper, we propose a theoretical approach called Behavior Expectation\nBounds (BEB) which allows us to formally investigate several inherent\ncharacteristics and limitations of alignment in large language models.\nImportantly, we prove that within the limits of this framework, for any\nbehavior that has a finite probability of being exhibited by the model, there\nexist prompts that can trigger the model into outputting this behavior, with\nprobability that increases with the length of the prompt. This implies that any\nalignment process that attenuates an undesired behavior but does not remove it\naltogether, is not safe against adversarial prompting attacks. Furthermore, our\nframework hints at the mechanism by which leading alignment approaches such as\nreinforcement learning from human feedback make the LLM prone to being prompted\ninto the undesired behaviors. This theoretical result is being experimentally\ndemonstrated in large scale by the so called contemporary \"chatGPT jailbreaks\",\nwhere adversarial users trick the LLM into breaking its alignment guardrails by\ntriggering it into acting as a malicious persona. Our results expose\nfundamental limitations in alignment of LLMs and bring to the forefront the\nneed to devise reliable mechanisms for ensuring AI safety.\n","authors":["Yotam Wolf","Noam Wies","Oshri Avnery","Yoav Levine","Amnon Shashua"],"pdf_url":"https://arxiv.org/pdf/2304.11082v6.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.07105v3","updated":"2024-06-03T12:14:34Z","published":"2024-01-13T16:09:49Z","title":"Graph Language Models","summary":"  While Language Models (LMs) are the workhorses of NLP, their interplay with\nstructured knowledge graphs (KGs) is still actively researched. Current methods\nfor encoding such graphs typically either (i) linearize them for embedding with\nLMs -- which underutilize structural information, or (ii) use Graph Neural\nNetworks (GNNs) to preserve the graph structure -- but GNNs cannot represent\ntext features as well as pretrained LMs. In our work we introduce a novel LM\ntype, the Graph Language Model (GLM), that integrates the strengths of both\napproaches and mitigates their weaknesses. The GLM parameters are initialized\nfrom a pretrained LM to enhance understanding of individual graph concepts and\ntriplets. Simultaneously, we design the GLM's architecture to incorporate graph\nbiases, thereby promoting effective knowledge distribution within the graph.\nThis enables GLMs to process graphs, texts, and interleaved inputs of both.\nEmpirical evaluations on relation classification tasks show that GLM embeddings\nsurpass both LM- and GNN-based baselines in supervised and zero-shot setting,\ndemonstrating their versatility.\n","authors":["Moritz Plenz","Anette Frank"],"pdf_url":"https://arxiv.org/pdf/2401.07105v3.pdf","comment":"Accepted at ACL 2024. 9 pages, 10 figures, 9 tables"},{"id":"http://arxiv.org/abs/2404.04232v2","updated":"2024-06-03T12:08:20Z","published":"2024-04-05T17:26:22Z","title":"Benchmarking and Improving Compositional Generalization of Multi-aspect\n  Controllable Text Generation","summary":"  Compositional generalization, representing the model's ability to generate\ntext with new attribute combinations obtained by recombining single attributes\nfrom the training data, is a crucial property for multi-aspect controllable\ntext generation (MCTG) methods. Nonetheless, a comprehensive compositional\ngeneralization evaluation benchmark of MCTG is still lacking. We propose\nCompMCTG, a benchmark encompassing diverse multi-aspect labeled datasets and a\ncrafted three-dimensional evaluation protocol, to holistically evaluate the\ncompositional generalization of MCTG approaches. We observe that existing MCTG\nworks generally confront a noticeable performance drop in compositional\ntesting. To mitigate this issue, we introduce Meta-MCTG, a training framework\nincorporating meta-learning, where we enable models to learn how to generalize\nby simulating compositional generalization scenarios in the training phase. We\ndemonstrate the effectiveness of Meta-MCTG through achieving obvious\nimprovement (by at most 3.64%) for compositional testing performance in 94.4%\ncases.\n","authors":["Tianqi Zhong","Zhaoyi Li","Quan Wang","Linqi Song","Ying Wei","Defu Lian","Zhendong Mao"],"pdf_url":"https://arxiv.org/pdf/2404.04232v2.pdf","comment":"Accepted to ACL 2024 (Main); 32 pages"},{"id":"http://arxiv.org/abs/2403.06833v2","updated":"2024-06-03T12:04:50Z","published":"2024-03-11T15:48:56Z","title":"Can LLMs Separate Instructions From Data? And What Do We Even Mean By\n  That?","summary":"  Instruction-tuned Large Language Models (LLMs) show impressive results in\nnumerous practical applications, but they lack essential safety features that\nare common in other areas of computer science, particularly an explicit\nseparation of instructions and data. This makes them vulnerable to\nmanipulations such as indirect prompt injections and generally unsuitable for\nsafety-critical tasks. Surprisingly, there is currently no established\ndefinition or benchmark to quantify this phenomenon. In this work, we close\nthis gap by introducing a formal measure for instruction-data separation and an\nempirical variant that is calculable from a model's outputs. We also present a\nnew dataset, SEP, that allows estimating the measure for real-world models. Our\nresults on various LLMs show that the problem of instruction-data separation is\nreal: all models fail to achieve high separation, and canonical mitigation\ntechniques, such as prompt engineering and fine-tuning, either fail to\nsubstantially improve separation or reduce model utility. The source code and\nSEP dataset are openly accessible at\nhttps://github.com/egozverev/Shold-It-Be-Executed-Or-Processed.\n","authors":["Egor Zverev","Sahar Abdelnabi","Soroush Tabesh","Mario Fritz","Christoph H. Lampert"],"pdf_url":"https://arxiv.org/pdf/2403.06833v2.pdf","comment":"GitHub:\n  https://github.com/egozverev/Shold-It-Be-Executed-Or-Processed. 10 pages main\n  text, 30 pages in total"},{"id":"http://arxiv.org/abs/2404.08817v2","updated":"2024-06-03T11:56:38Z","published":"2024-04-12T21:28:18Z","title":"Revisiting Code Similarity Evaluation with Abstract Syntax Tree Edit\n  Distance","summary":"  This paper revisits recent code similarity evaluation metrics, particularly\nfocusing on the application of Abstract Syntax Tree (AST) editing distance in\ndiverse programming languages. In particular, we explore the usefulness of\nthese metrics and compare them to traditional sequence similarity metrics. Our\nexperiments showcase the effectiveness of AST editing distance in capturing\nintricate code structures, revealing a high correlation with established\nmetrics. Furthermore, we explore the strengths and weaknesses of AST editing\ndistance and prompt-based GPT similarity scores in comparison to BLEU score,\nexecution match, and Jaccard Similarity. We propose, optimize, and publish an\nadaptable metric that demonstrates effectiveness across all tested languages,\nrepresenting an enhanced version of Tree Similarity of Edit Distance (TSED).\n","authors":["Yewei Song","Cedric Lothritz","Daniel Tang","Tegawendé F. Bissyandé","Jacques Klein"],"pdf_url":"https://arxiv.org/pdf/2404.08817v2.pdf","comment":"ACL 2024 Main"},{"id":"http://arxiv.org/abs/2311.08045v4","updated":"2024-06-03T11:34:05Z","published":"2023-11-14T10:10:31Z","title":"Adversarial Preference Optimization: Enhancing Your Alignment via RM-LLM\n  Game","summary":"  Human preference alignment is essential to improve the interaction quality of\nlarge language models (LLMs). Existing alignment methods depend on manually\nannotated preference data to guide the LLM optimization directions. However,\ncontinuously updating LLMs for alignment raises a distribution gap between\nmodel-generated samples and human-annotated responses, hindering training\neffectiveness. To mitigate this issue, previous methods require additional\npreference annotation on newly generated samples to adapt to the shifted\ndistribution, which consumes a large amount of annotation resources. Targeting\nmore efficient human preference optimization, we propose an Adversarial\nPreference Optimization (APO) framework, in which the LLM and the reward model\nupdate alternatively via a min-max game. Through adversarial training, the\nreward model can adapt to the shifted generation distribution of the LLM\nwithout any additional annotation. With comprehensive experiments, we find the\nproposed adversarial training framework further enhances existing alignment\nbaselines in terms of LLM helpfulness and harmlessness. The code is at\nhttps://github.com/Linear95/APO.\n","authors":["Pengyu Cheng","Yifan Yang","Jian Li","Yong Dai","Tianhao Hu","Peixin Cao","Nan Du","Xiaolong Li"],"pdf_url":"https://arxiv.org/pdf/2311.08045v4.pdf","comment":"Accepted by ACL2024 findings"},{"id":"http://arxiv.org/abs/2401.17167v3","updated":"2024-06-03T11:28:29Z","published":"2024-01-30T16:52:56Z","title":"Planning, Creation, Usage: Benchmarking LLMs for Comprehensive Tool\n  Utilization in Real-World Complex Scenarios","summary":"  The recent trend of using Large Language Models (LLMs) as tool agents in\nreal-world applications underscores the necessity for comprehensive evaluations\nof their capabilities, particularly in complex scenarios involving planning,\ncreating, and using tools. However, existing benchmarks typically focus on\nsimple synthesized queries that do not reflect real-world complexity, thereby\noffering limited perspectives in evaluating tool utilization. To address this\nissue, we present UltraTool, a novel benchmark designed to improve and evaluate\nLLMs' ability in tool utilization within real-world scenarios. UltraTool\nfocuses on the entire process of using tools - from planning and creating to\napplying them in complex tasks. It emphasizes real-world complexities,\ndemanding accurate, multi-step planning for effective problem-solving. A key\nfeature of UltraTool is its independent evaluation of planning with natural\nlanguage, which happens before tool usage and simplifies the task solving by\nmapping out the intermediate steps. Thus, unlike previous work, it eliminates\nthe restriction of pre-defined toolset. Through extensive experiments on\nvarious LLMs, we offer novel insights into the evaluation of capabilities of\nLLMs in tool utilization, thereby contributing a fresh perspective to this\nrapidly evolving field. The benchmark is publicly available at\nhttps://github.com/JoeYing1019/UltraTool.\n","authors":["Shijue Huang","Wanjun Zhong","Jianqiao Lu","Qi Zhu","Jiahui Gao","Weiwen Liu","Yutai Hou","Xingshan Zeng","Yasheng Wang","Lifeng Shang","Xin Jiang","Ruifeng Xu","Qun Liu"],"pdf_url":"https://arxiv.org/pdf/2401.17167v3.pdf","comment":"Accepted by ACL2024 Findings"},{"id":"http://arxiv.org/abs/2401.15641v2","updated":"2024-06-03T11:11:13Z","published":"2024-01-28T12:33:14Z","title":"PRE: A Peer Review Based Large Language Model Evaluator","summary":"  The impressive performance of large language models (LLMs) has attracted\nconsiderable attention from the academic and industrial communities. Besides\nhow to construct and train LLMs, how to effectively evaluate and compare the\ncapacity of LLMs has also been well recognized as an important yet difficult\nproblem. Existing paradigms rely on either human annotators or model-based\nevaluators to evaluate the performance of LLMs on different tasks. However,\nthese paradigms often suffer from high cost, low generalizability, and\ninherited biases in practice, which make them incapable of supporting the\nsustainable development of LLMs in long term. In order to address these issues,\ninspired by the peer review systems widely used in academic publication\nprocess, we propose a novel framework that can automatically evaluate LLMs\nthrough a peer-review process. Specifically, for the evaluation of a specific\ntask, we first construct a small qualification exam to select \"reviewers\" from\na couple of powerful LLMs. Then, to actually evaluate the \"submissions\" written\nby different candidate LLMs, i.e., the evaluatees, we use the reviewer LLMs to\nrate or compare the submissions. The final ranking of evaluatee LLMs is\ngenerated based on the results provided by all reviewers. We conducted\nextensive experiments on text summarization tasks with eleven LLMs including\nGPT-4. The results demonstrate the existence of biasness when evaluating using\na single LLM. Also, our PRE model outperforms all the baselines, illustrating\nthe effectiveness of the peer review mechanism.\n","authors":["Zhumin Chu","Qingyao Ai","Yiteng Tu","Haitao Li","Yiqun Liu"],"pdf_url":"https://arxiv.org/pdf/2401.15641v2.pdf","comment":"11 pages"},{"id":"http://arxiv.org/abs/2402.06044v3","updated":"2024-06-03T10:48:16Z","published":"2024-02-08T20:35:06Z","title":"OpenToM: A Comprehensive Benchmark for Evaluating Theory-of-Mind\n  Reasoning Capabilities of Large Language Models","summary":"  Neural Theory-of-Mind (N-ToM), machine's ability to understand and keep track\nof the mental states of others, is pivotal in developing socially intelligent\nagents. However, prevalent N-ToM benchmarks have several shortcomings,\nincluding the presence of ambiguous and artificial narratives, absence of\npersonality traits and preferences, a lack of questions addressing characters'\npsychological mental states, and limited diversity in the questions posed. In\nresponse to these issues, we construct OpenToM, a new benchmark for assessing\nN-ToM with (1) longer and clearer narrative stories, (2) characters with\nexplicit personality traits, (3) actions that are triggered by character\nintentions, and (4) questions designed to challenge LLMs' capabilities of\nmodeling characters' mental states of both the physical and psychological\nworld. Using OpenToM, we reveal that state-of-the-art LLMs thrive at modeling\ncertain aspects of mental states in the physical world but fall short when\ntracking characters' mental states in the psychological world.\n","authors":["Hainiu Xu","Runcong Zhao","Lixing Zhu","Jinhua Du","Yulan He"],"pdf_url":"https://arxiv.org/pdf/2402.06044v3.pdf","comment":"ACL 2024"},{"id":"http://arxiv.org/abs/2404.10306v4","updated":"2024-06-03T10:42:36Z","published":"2024-04-16T06:27:39Z","title":"Balancing Speciality and Versatility: a Coarse to Fine Framework for\n  Supervised Fine-tuning Large Language Model","summary":"  Aligned Large Language Models (LLMs) showcase remarkable versatility, capable\nof handling diverse real-world tasks. Meanwhile, aligned LLMs are also expected\nto exhibit speciality, excelling in specific applications. However, fine-tuning\nwith extra data, a common practice to gain speciality, often leads to\ncatastrophic forgetting (CF) of previously acquired versatility, hindering the\nmodel's performance across diverse tasks. In response to this challenge, we\npropose CoFiTune, a coarse to fine framework in an attempt to strike the\nbalance between speciality and versatility. At the coarse-grained level, an\nempirical tree-search algorithm is utilized to pinpoint and update specific\nmodules that are crucial for speciality, while keeping other parameters frozen;\nat the fine-grained level, a soft-masking mechanism regulates the update to the\nLLMs, mitigating the CF issue without harming speciality. In an overall\nevaluation of both speciality and versatility, CoFiTune consistently\noutperforms baseline methods across diverse tasks and model scales. Compared to\nthe full-parameter SFT, CoFiTune leads to about 14% versatility improvement and\nmarginal speciality loss on a 13B model. Lastly, based on further analysis, we\nprovide a speculative insight into the information forwarding process in LLMs,\nwhich helps explain the effectiveness of the proposed method. The code is\navailable at https://github.com/rattlesnakey/CoFiTune.\n","authors":["Hengyuan Zhang","Yanru Wu","Dawei Li","Sak Yang","Rui Zhao","Yong Jiang","Fei Tan"],"pdf_url":"https://arxiv.org/pdf/2404.10306v4.pdf","comment":"43 pages, 10 figures, accepted by ACL 2024 Findings"},{"id":"http://arxiv.org/abs/2402.09259v2","updated":"2024-06-03T10:30:00Z","published":"2024-02-14T15:45:56Z","title":"SyntaxShap: Syntax-aware Explainability Method for Text Generation","summary":"  To harness the power of large language models in safety-critical domains, we\nneed to ensure the explainability of their predictions. However, despite the\nsignificant attention to model interpretability, there remains an unexplored\ndomain in explaining sequence-to-sequence tasks using methods tailored for\ntextual data. This paper introduces SyntaxShap, a local, model-agnostic\nexplainability method for text generation that takes into consideration the\nsyntax in the text data. The presented work extends Shapley values to account\nfor parsing-based syntactic dependencies. Taking a game theoric approach,\nSyntaxShap only considers coalitions constraint by the dependency tree. We\nadopt a model-based evaluation to compare SyntaxShap and its weighted form to\nstate-of-the-art explainability methods adapted to text generation tasks, using\ndiverse metrics including faithfulness, coherency, and semantic alignment of\nthe explanations to the model. We show that our syntax-aware method produces\nexplanations that help build more faithful and coherent explanations for\npredictions by autoregressive models. Confronted with the misalignment of human\nand AI model reasoning, this paper also highlights the need for cautious\nevaluation strategies in explainable AI.\n","authors":["Kenza Amara","Rita Sevastjanova","Mennatallah El-Assady"],"pdf_url":"https://arxiv.org/pdf/2402.09259v2.pdf","comment":"Accepted to ACL 2024"},{"id":"http://arxiv.org/abs/2402.09631v3","updated":"2024-06-03T10:24:22Z","published":"2024-02-15T00:20:30Z","title":"Representation Surgery: Theory and Practice of Affine Steering","summary":"  Language models often exhibit undesirable behavior, e.g., generating toxic or\ngender-biased text. In the case of neural language models, an encoding of the\nundesirable behavior is often present in the model's representations. Thus, one\nnatural (and common) approach to prevent the model from exhibiting undesirable\nbehavior is to steer the model's representations in a manner that reduces the\nprobability of it generating undesirable text. This paper investigates the\nformal and empirical properties of steering functions, i.e., transformation of\nthe neural language model's representations that alter its behavior. First, we\nderive two optimal, in the least-squares sense, affine steering functions under\ndifferent constraints. Our theory provides justification for existing\napproaches and offers a novel, improved steering approach. Second, we offer a\nseries of experiments that demonstrate the empirical effectiveness of the\nmethods in mitigating bias and reducing toxic generation.\n","authors":["Shashwat Singh","Shauli Ravfogel","Jonathan Herzig","Roee Aharoni","Ryan Cotterell","Ponnurangam Kumaraguru"],"pdf_url":"https://arxiv.org/pdf/2402.09631v3.pdf","comment":"Accepted in ICML 2024"},{"id":"http://arxiv.org/abs/2312.11075v4","updated":"2024-06-03T10:00:13Z","published":"2023-12-18T10:16:37Z","title":"Split and Rephrase with Large Language Models","summary":"  The Split and Rephrase (SPRP) task, which consists in splitting complex\nsentences into a sequence of shorter grammatical sentences, while preserving\nthe original meaning, can facilitate the processing of complex texts for humans\nand machines alike. It is also a valuable testbed to evaluate natural language\nprocessing models, as it requires modelling complex grammatical aspects. In\nthis work, we evaluate large language models on the task, showing that they can\nprovide large improvements over the state of the art on the main metrics,\nalthough still lagging in terms of splitting compliance. Results from two human\nevaluations further support the conclusions drawn from automated metric\nresults. We provide a comprehensive study that includes prompting variants,\ndomain shift, fine-tuned pretrained language models of varying parameter size\nand training data volumes, contrasted with both zero-shot and few-shot\napproaches on instruction-tuned language models. Although the latter were\nmarkedly outperformed by fine-tuned models, they may constitute a reasonable\noff-the-shelf alternative. Our results provide a fine-grained analysis of the\npotential and limitations of large language models for SPRP, with significant\nimprovements achievable using relatively small amounts of training data and\nmodel parameters overall, and remaining limitations for all models on the task.\n","authors":["David Ponce","Thierry Etchegoyhen","Jesús Calleja Pérez","Harritxu Gete"],"pdf_url":"https://arxiv.org/pdf/2312.11075v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.04251v2","updated":"2024-06-03T09:33:14Z","published":"2024-02-06T18:59:30Z","title":"Linear-time Minimum Bayes Risk Decoding with Reference Aggregation","summary":"  Minimum Bayes Risk (MBR) decoding is a text generation technique that has\nbeen shown to improve the quality of machine translations, but is expensive,\neven if a sampling-based approximation is used. Besides requiring a large\nnumber of sampled sequences, it requires the pairwise calculation of a utility\nmetric, which has quadratic complexity. In this paper, we propose to\napproximate pairwise metric scores with scores calculated against aggregated\nreference representations. This changes the complexity of utility estimation\nfrom $O(n^2)$ to $O(n)$, while empirically preserving most of the quality gains\nof MBR decoding. We release our source code at https://github.com/ZurichNLP/mbr\n","authors":["Jannis Vamvas","Rico Sennrich"],"pdf_url":"https://arxiv.org/pdf/2402.04251v2.pdf","comment":"ACL 2024"},{"id":"http://arxiv.org/abs/2404.06395v3","updated":"2024-06-03T08:54:38Z","published":"2024-04-09T15:36:50Z","title":"MiniCPM: Unveiling the Potential of Small Language Models with Scalable\n  Training Strategies","summary":"  The burgeoning interest in developing Large Language Models (LLMs) with up to\ntrillion parameters has been met with concerns regarding resource efficiency\nand practical expense, particularly given the immense cost of experimentation.\nThis scenario underscores the importance of exploring the potential of Small\nLanguage Models (SLMs) as a resource-efficient alternative. In this context, we\nintroduce MiniCPM, specifically the 1.2B and 2.4B non-embedding parameter\nvariants, not only excel in their respective categories but also demonstrate\ncapabilities on par with 7B-13B LLMs. While focusing on SLMs, our approach\nexhibits scalability in both model and data dimensions for future LLM research.\nRegarding model scaling, we employ extensive model wind tunnel experiments for\nstable and optimal scaling. For data scaling, we introduce a\nWarmup-Stable-Decay (WSD) learning rate scheduler (LRS), conducive to\ncontinuous training and domain adaptation. We present an in-depth analysis of\nthe intriguing training dynamics that occurred in the WSD LRS. With WSD LRS, we\nare now able to efficiently study data-model scaling law without extensive\nretraining experiments on both axes of model and data, from which we derive the\nmuch higher compute optimal data-model ratio than Chinchilla Optimal.\nAdditionally, we introduce MiniCPM family, including MiniCPM-DPO, MiniCPM-MoE\nand MiniCPM-128K, whose excellent performance further cementing MiniCPM's\nfoundation in diverse SLM applications. MiniCPM models are available publicly\nat https://github.com/OpenBMB/MiniCPM .\n","authors":["Shengding Hu","Yuge Tu","Xu Han","Chaoqun He","Ganqu Cui","Xiang Long","Zhi Zheng","Yewei Fang","Yuxiang Huang","Weilin Zhao","Xinrong Zhang","Zheng Leng Thai","Kaihuo Zhang","Chongyi Wang","Yuan Yao","Chenyang Zhao","Jie Zhou","Jie Cai","Zhongwu Zhai","Ning Ding","Chao Jia","Guoyang Zeng","Dahai Li","Zhiyuan Liu","Maosong Sun"],"pdf_url":"https://arxiv.org/pdf/2404.06395v3.pdf","comment":"revise according to peer review"},{"id":"http://arxiv.org/abs/2311.09189v2","updated":"2024-06-03T08:37:10Z","published":"2023-11-15T18:32:27Z","title":"PsyEval: A Suite of Mental Health Related Tasks for Evaluating Large\n  Language Models","summary":"  Evaluating Large Language Models (LLMs) in the mental health domain poses\ndistinct challenged from other domains, given the subtle and highly subjective\nnature of symptoms that exhibit significant variability among individuals. This\npaper presents PsyEval, the first comprehensive suite of mental health-related\ntasks for evaluating LLMs. PsyEval encompasses five sub-tasks that evaluate\nthree critical dimensions of mental health. This comprehensive framework is\ndesigned to thoroughly assess the unique challenges and intricacies of mental\nhealth-related tasks, making PsyEval a highly specialized and valuable tool for\nevaluating LLM performance in this domain. We evaluate twelve advanced LLMs\nusing PsyEval. Experiment results not only demonstrate significant room for\nimprovement in current LLMs concerning mental health but also unveil potential\ndirections for future model optimization.\n","authors":["Haoan Jin","Siyuan Chen","Dilawaier Dilixiati","Yewei Jiang","Mengyue Wu","Kenny Q. Zhu"],"pdf_url":"https://arxiv.org/pdf/2311.09189v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.18741v2","updated":"2024-06-03T08:35:07Z","published":"2024-05-29T04:04:05Z","title":"Genshin: General Shield for Natural Language Processing with Large\n  Language Models","summary":"  Large language models (LLMs) like ChatGPT, Gemini, or LLaMA have been\ntrending recently, demonstrating considerable advancement and generalizability\npower in countless domains. However, LLMs create an even bigger black box\nexacerbating opacity, with interpretability limited to few approaches. The\nuncertainty and opacity embedded in LLMs' nature restrict their application in\nhigh-stakes domains like financial fraud, phishing, etc. Current approaches\nmainly rely on traditional textual classification with posterior interpretable\nalgorithms, suffering from attackers who may create versatile adversarial\nsamples to break the system's defense, forcing users to make trade-offs between\nefficiency and robustness. To address this issue, we propose a novel cascading\nframework called Genshin (General Shield for Natural Language Processing with\nLarge Language Models), utilizing LLMs as defensive one-time plug-ins. Unlike\nmost applications of LLMs that try to transform text into something new or\nstructural, Genshin uses LLMs to recover text to its original state. Genshin\naims to combine the generalizability of the LLM, the discrimination of the\nmedian model, and the interpretability of the simple model. Our experiments on\nthe task of sentimental analysis and spam detection have shown fatal flaws of\nthe current median models and exhilarating results on LLMs' recovery ability,\ndemonstrating that Genshin is both effective and efficient. In our ablation\nstudy, we unearth several intriguing observations. Utilizing the LLM defender,\na tool derived from the 4th paradigm, we have reproduced BERT's 15% optimal\nmask rate results in the 3rd paradigm of NLP. Additionally, when employing the\nLLM as a potential adversarial tool, attackers are capable of executing\neffective attacks that are nearly semantically lossless.\n","authors":["Xiao Peng","Tao Liu","Ying Wang"],"pdf_url":"https://arxiv.org/pdf/2405.18741v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.19799v2","updated":"2024-06-03T08:13:10Z","published":"2024-05-30T08:10:50Z","title":"Unsupervised Mutual Learning of Dialogue Discourse Parsing and Topic\n  Segmentation","summary":"  The advancement of large language models (LLMs) has propelled the development\nof dialogue systems. Unlike the popular ChatGPT-like assistant model, which\nonly satisfies the user's preferences, task-oriented dialogue systems have also\nfaced new requirements and challenges in the broader business field. They are\nexpected to provide correct responses at each dialogue turn, at the same time,\nachieve the overall goal defined by the task. By understanding rhetorical\nstructures and topic structures via topic segmentation and discourse parsing, a\ndialogue system may do a better planning to achieve both objectives. However,\nwhile both structures belong to discourse structure in linguistics, rhetorical\nstructure and topic structure are mostly modeled separately or with one\nassisting the other in the prior work. The interaction between these two\nstructures has not been considered for joint modeling and mutual learning.\nFurthermore, unsupervised learning techniques to achieve the above are not well\nexplored. To fill this gap, we propose an unsupervised mutual learning\nframework of two structures leveraging the global and local connections between\nthem. We extend the topic modeling between non-adjacent discourse units to\nensure global structural relevance with rhetorical structures. We also\nincorporate rhetorical structures into the topic structure through a graph\nneural network model to ensure local coherence consistency. Finally, we utilize\nthe similarity between the two fused structures for mutual learning. The\nexperimental results demonstrate that our methods outperform all strong\nbaselines on two dialogue rhetorical datasets (STAC and Molweni), as well as\ndialogue topic datasets (Doc2Dial and TIAGE). We provide our code at\nhttps://github.com/Jeff-Sue/URT.\n","authors":["Jiahui Xu","Feng Jiang","Anningzhe Gao","Haizhou Li"],"pdf_url":"https://arxiv.org/pdf/2405.19799v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.03161v3","updated":"2024-06-03T08:09:09Z","published":"2024-02-05T16:30:49Z","title":"Video-LaVIT: Unified Video-Language Pre-training with Decoupled\n  Visual-Motional Tokenization","summary":"  In light of recent advances in multimodal Large Language Models (LLMs), there\nis increasing attention to scaling them from image-text data to more\ninformative real-world videos. Compared to static images, video poses unique\nchallenges for effective large-scale pre-training due to the modeling of its\nspatiotemporal dynamics. In this paper, we address such limitations in\nvideo-language pre-training with an efficient video decomposition that\nrepresents each video as keyframes and temporal motions. These are then adapted\nto an LLM using well-designed tokenizers that discretize visual and temporal\ninformation as a few tokens, thus enabling unified generative pre-training of\nvideos, images, and text. At inference, the generated tokens from the LLM are\ncarefully recovered to the original continuous pixel space to create various\nvideo content. Our proposed framework is both capable of comprehending and\ngenerating image and video content, as demonstrated by its competitive\nperformance across 13 multimodal benchmarks in image and video understanding\nand generation. Our code and models are available at\nhttps://video-lavit.github.io.\n","authors":["Yang Jin","Zhicheng Sun","Kun Xu","Kun Xu","Liwei Chen","Hao Jiang","Quzhe Huang","Chengru Song","Yuliang Liu","Di Zhang","Yang Song","Kun Gai","Yadong Mu"],"pdf_url":"https://arxiv.org/pdf/2402.03161v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.04854v3","updated":"2024-06-03T07:48:19Z","published":"2024-02-07T13:54:06Z","title":"Hierarchical Tree-structured Knowledge Graph For Academic Insight Survey","summary":"  Research surveys have always posed a challenge for beginner researchers who\nlack of research training. These researchers struggle to understand the\ndirections within their research topic, and the discovery of new research\nfindings within a short time. One way to provide intuitive assistance to\nbeginner researchers is by offering relevant knowledge graphs(KG) and\nrecommending related academic papers. However, existing navigation knowledge\ngraphs primarily rely on keywords in the research field and often fail to\npresent the logical hierarchy among multiple related papers clearly. Moreover,\nmost recommendation systems for academic papers simply rely on high text\nsimilarity, which can leave researchers confused as to why a particular article\nis being recommended. They may lack of grasp important information about the\ninsight connection between \"Issue resolved\" and \"Issue finding\" that they hope\nto obtain. To address these issues, this study aims to support research insight\nsurveys for beginner researchers by establishing a hierarchical tree-structured\nknowledge graph that reflects the inheritance insight of research topics and\nthe relevance insight among the academic papers.\n","authors":["Jinghong Li","Huy Phan","Wen Gu","Koichi Ota","Shinobu Hasegawa"],"pdf_url":"https://arxiv.org/pdf/2402.04854v3.pdf","comment":"This paper will be submitted to 'The 18TH International Conference on\n  INnovations in Intelligent SysTems and Applications (INISTA 2024)'"},{"id":"http://arxiv.org/abs/2311.01775v2","updated":"2024-06-03T07:42:15Z","published":"2023-11-03T08:20:48Z","title":"UP4LS: User Profile Constructed by Multiple Attributes for Enhancing\n  Linguistic Steganalysis","summary":"  Linguistic steganalysis (LS) tasks aim to detect whether a text contains\nsecret information. Existing LS methods focus on the deep-learning model design\nand they achieve excellent results in ideal data. However, they overlook the\nunique user characteristics, leading to weak performance in social networks.\nAnd a few stegos here that further complicate detection. We propose the UP4LS,\na framework with the User Profile for enhancing LS in realistic scenarios.\nThree kinds of user attributes like writing habits are explored to build the\nprofile. For each attribute, the specific feature extraction module is\ndesigned. The extracted features are mapped to high-dimensional user features\nvia the deep-learning model of the method to be improved. The content feature\nis extracted by the language model. Then user and content features are\nintegrated. Existing methods can improve LS results by adding the UP4LS\nframework without changing their deep-learning models. Experiments show that\nUP4LS can significantly enhance the performance of LS-task baselines in\nrealistic scenarios, with the overall Acc increased by 25%, F1 increased by\n51%, and SOTA results. The improvement is especially pronounced in fewer\nstegos. Additionally, UP4LS also sets the stage for the related-task SOTA\nmethods to efficient LS.\n","authors":["Yihao Wang","Ruiqi Song","Lingxiao Li","Yifan Tang","Ru Zhang","Jianyi Liu"],"pdf_url":"https://arxiv.org/pdf/2311.01775v2.pdf","comment":"15 pages, 7 figures, 14 tables"},{"id":"http://arxiv.org/abs/2402.02801v2","updated":"2024-06-03T07:35:25Z","published":"2024-02-05T08:19:56Z","title":"KS-Lottery: Finding Certified Lottery Tickets for Multilingual Language\n  Models","summary":"  The lottery ticket hypothesis posits the existence of ``winning tickets''\nwithin a randomly initialized neural network. Do winning tickets exist for LLMs\nin fine-tuning scenarios? How can we find such winning tickets? In this paper,\nwe propose KS-Lottery, a method to identify a small subset of LLM parameters\nhighly effective in multilingual fine-tuning. Our key idea is to use\nKolmogorov-Smirnov Test to analyze the distribution shift of parameters before\nand after fine-tuning. We further theoretically prove that KS-Lottery can find\nthe certified winning tickets in the embedding layer, fine-tuning on the found\nparameters is guaranteed to perform as well as full fine-tuning. Comparing\nKS-Lottery with other parameter-efficient tuning algorithms on translation\ntasks, the experimental results show that KS-Lottery finds a much smaller set\nof parameters for fine-tuning while achieving the comparable performance as\nfull fine-tuning LLM. Surprisingly, we find that fine-tuning 18 tokens'\nembedding of LLaMA suffices to reach the fine-tuning translation\nperformance~\\footnote{https://github.com/CONE-MT/KS-Lottery.}.\n","authors":["Fei Yuan","Chang Ma","Shuai Yuan","Qiushi Sun","Lei Li"],"pdf_url":"https://arxiv.org/pdf/2402.02801v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.14652v2","updated":"2024-06-03T07:34:05Z","published":"2024-02-22T16:04:03Z","title":"Cleaner Pretraining Corpus Curation with Neural Web Scraping","summary":"  The web contains large-scale, diverse, and abundant information to satisfy\nthe information-seeking needs of humans. Through meticulous data collection,\npreprocessing, and curation, webpages can be used as a fundamental data\nresource for language model pretraining. However, when confronted with the\nprogressively revolutionized and intricate nature of webpages,\nrule-based/feature-based web scrapers are becoming increasingly inadequate.\nThis paper presents a simple, fast, and effective Neural web Scraper\n(NeuScraper) to help extract primary and clean text contents from webpages.\nExperimental results show that NeuScraper surpasses the baseline scrapers by\nachieving more than a 20% improvement, demonstrating its potential in\nextracting higher-quality data to facilitate the language model pretraining.\nAll of the code is available at https://github.com/OpenMatch/NeuScraper.\n","authors":["Zhipeng Xu","Zhenghao Liu","Yukun Yan","Zhiyuan Liu","Ge Yu","Chenyan Xiong"],"pdf_url":"https://arxiv.org/pdf/2402.14652v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.09353v5","updated":"2024-06-03T07:27:15Z","published":"2024-02-14T17:59:34Z","title":"DoRA: Weight-Decomposed Low-Rank Adaptation","summary":"  Among the widely used parameter-efficient fine-tuning (PEFT) methods, LoRA\nand its variants have gained considerable popularity because of avoiding\nadditional inference costs. However, there still often exists an accuracy gap\nbetween these methods and full fine-tuning (FT). In this work, we first\nintroduce a novel weight decomposition analysis to investigate the inherent\ndifferences between FT and LoRA. Aiming to resemble the learning capacity of FT\nfrom the findings, we propose Weight-Decomposed Low-Rank Adaptation (DoRA).\nDoRA decomposes the pre-trained weight into two components, magnitude and\ndirection, for fine-tuning, specifically employing LoRA for directional updates\nto efficiently minimize the number of trainable parameters. By employing \\ours,\nwe enhance both the learning capacity and training stability of LoRA while\navoiding any additional inference overhead. \\ours~consistently outperforms LoRA\non fine-tuning LLaMA, LLaVA, and VL-BART on various downstream tasks, such as\ncommonsense reasoning, visual instruction tuning, and image/video-text\nunderstanding. Code is available at https://github.com/NVlabs/DoRA.\n","authors":["Shih-Yang Liu","Chien-Yi Wang","Hongxu Yin","Pavlo Molchanov","Yu-Chiang Frank Wang","Kwang-Ting Cheng","Min-Hung Chen"],"pdf_url":"https://arxiv.org/pdf/2402.09353v5.pdf","comment":"Code available at https://github.com/NVlabs/DoRA"},{"id":"http://arxiv.org/abs/2401.04679v7","updated":"2024-06-03T06:59:31Z","published":"2024-01-09T17:09:01Z","title":"RoSA: Accurate Parameter-Efficient Fine-Tuning via Robust Adaptation","summary":"  We investigate parameter-efficient fine-tuning (PEFT) methods that can\nprovide good accuracy under limited computational and memory budgets in the\ncontext of large language models (LLMs). We present a new PEFT method called\nRobust Adaptation (RoSA) inspired by robust principal component analysis that\njointly trains $\\textit{low-rank}$ and $\\textit{highly-sparse}$ components on\ntop of a set of fixed pretrained weights to efficiently approximate the\nperformance of a full-fine-tuning (FFT) solution. Across a series of\nchallenging generative tasks such as grade-school math and SQL query\ngeneration, which require fine-tuning for good performance, we show that RoSA\noutperforms LoRA, pure sparse fine-tuning, and alternative hybrid methods at\nthe same parameter budget, and can even recover the performance of FFT on some\ntasks. We provide system support for RoSA to complement the training algorithm,\nspecifically in the form of sparse GPU kernels which enable memory- and\ncomputationally-efficient training, and show that it is also compatible with\nlow-precision base weights, resulting in the first joint representation\ncombining quantization, low-rank and sparse approximations. Our code is\navailable at https://github.com/IST-DASLab/RoSA.\n","authors":["Mahdi Nikdan","Soroush Tabesh","Elvir Crnčević","Dan Alistarh"],"pdf_url":"https://arxiv.org/pdf/2401.04679v7.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.18018v4","updated":"2024-06-03T06:52:58Z","published":"2024-01-31T17:28:24Z","title":"On Prompt-Driven Safeguarding for Large Language Models","summary":"  Prepending model inputs with safety prompts is a common practice for\nsafeguarding large language models (LLMs) against queries with harmful intents.\nHowever, the underlying working mechanisms of safety prompts have not been\nunraveled yet, restricting the possibility of automatically optimizing them to\nimprove LLM safety. In this work, we investigate how LLMs' behavior (i.e.,\ncomplying with or refusing user queries) is affected by safety prompts from the\nperspective of model representation. We find that in the representation space,\nthe input queries are typically moved by safety prompts in a \"higher-refusal\"\ndirection, in which models become more prone to refusing to provide assistance,\neven when the queries are harmless. On the other hand, LLMs are naturally\ncapable of distinguishing harmful and harmless queries without safety prompts.\nInspired by these findings, we propose a method for safety prompt optimization,\nnamely DRO (Directed Representation Optimization). Treating a safety prompt as\ncontinuous, trainable embeddings, DRO learns to move the queries'\nrepresentations along or opposite the refusal direction, depending on their\nharmfulness. Experiments with eight LLMs on out-of-domain and jailbreak\nbenchmarks demonstrate that DRO remarkably improves the safeguarding\nperformance of human-crafted safety prompts, without compromising the models'\ngeneral performance.\n","authors":["Chujie Zheng","Fan Yin","Hao Zhou","Fandong Meng","Jie Zhou","Kai-Wei Chang","Minlie Huang","Nanyun Peng"],"pdf_url":"https://arxiv.org/pdf/2401.18018v4.pdf","comment":"ICML 2024"},{"id":"http://arxiv.org/abs/2401.04514v2","updated":"2024-06-03T06:50:26Z","published":"2024-01-09T12:12:50Z","title":"Rewriting the Code: A Simple Method for Large Language Model Augmented\n  Code Search","summary":"  In code search, the Generation-Augmented Retrieval (GAR) framework, which\ngenerates exemplar code snippets to augment queries, has emerged as a promising\nstrategy to address the principal challenge of modality misalignment between\ncode snippets and natural language queries, particularly with the demonstrated\ncode generation capabilities of Large Language Models (LLMs). Nevertheless, our\npreliminary investigations indicate that the improvements conferred by such an\nLLM-augmented framework are somewhat constrained. This limitation could\npotentially be ascribed to the fact that the generated codes, albeit\nfunctionally accurate, frequently display a pronounced stylistic deviation from\nthe ground truth code in the codebase. In this paper, we extend the\nfoundational GAR framework and propose a simple yet effective method that\nadditionally Rewrites the Code (ReCo) within the codebase for style\nnormalization. Experimental results demonstrate that ReCo significantly boosts\nretrieval accuracy across sparse (up to 35.7%), zero-shot dense (up to 27.6%),\nand fine-tuned dense (up to 23.6%) retrieval settings in diverse search\nscenarios. To further elucidate the advantages of ReCo and stimulate research\nin code style normalization, we introduce Code Style Similarity, the first\nmetric tailored to quantify stylistic similarities in code. Notably, our\nempirical findings reveal the inadequacy of existing metrics in capturing\nstylistic nuances. The source code and data are available at\n\\url{https://github.com/Alex-HaochenLi/ReCo}.\n","authors":["Haochen Li","Xin Zhou","Zhiqi Shen"],"pdf_url":"https://arxiv.org/pdf/2401.04514v2.pdf","comment":"Accepted to ACL 2024"},{"id":"http://arxiv.org/abs/2402.14526v2","updated":"2024-06-03T06:48:34Z","published":"2024-02-22T13:20:53Z","title":"Balanced Data Sampling for Language Model Training with Clustering","summary":"  Data plays a fundamental role in the training of Large Language Models\n(LLMs). While attention has been paid to the collection and composition of\ndatasets, determining the data sampling strategy in training remains an open\nquestion. Most LLMs are trained with a simple strategy, random sampling.\nHowever, this sampling strategy ignores the unbalanced nature of training data\ndistribution, which can be sub-optimal. In this paper, we propose ClusterClip\nSampling to balance the text distribution of training data for better model\ntraining. Specifically, ClusterClip Sampling utilizes data clustering to\nreflect the data distribution of the training set and balances the common\nsamples and rare samples during training based on the cluster results. A\nrepetition clip operation is introduced to mitigate the overfitting issue led\nby samples from certain clusters. Extensive experiments validate the\neffectiveness of ClusterClip Sampling, which outperforms random sampling and\nother cluster-based sampling variants under various training datasets and large\nlanguage models.\n","authors":["Yunfan Shao","Linyang Li","Zhaoye Fei","Hang Yan","Dahua Lin","Xipeng Qiu"],"pdf_url":"https://arxiv.org/pdf/2402.14526v2.pdf","comment":"ACL 2024 (findings), Code is released at\n  https://github.com/choosewhatulike/cluster-clip"},{"id":"http://arxiv.org/abs/2405.10738v2","updated":"2024-06-03T06:42:32Z","published":"2024-05-17T12:32:53Z","title":"Feature-Adaptive and Data-Scalable In-Context Learning","summary":"  In-context learning (ICL), which promotes inference with several\ndemonstrations, has become a widespread paradigm to stimulate LLM capabilities\nfor downstream tasks. Due to context length constraints, it cannot be further\nimproved in spite of more training data, and general features directly from\nLLMs in ICL are not adaptive to the specific downstream task. In this paper, we\npropose a feature-adaptive and data-scalable in-context learning framework\n(FADS-ICL), which can leverage task-adaptive features to promote inference on\nthe downstream task, with the supervision of beyond-context samples.\nSpecifically, it first extracts general features of beyond-context samples via\nthe LLM with ICL input form one by one, and introduces a task-specific\nmodulator to perform feature refinement and prediction after fitting a specific\ndownstream task. We conduct extensive experiments on FADS-ICL under varying\ndata settings (4$\\sim$128 shots) and LLM scale (0.8$\\sim$70B) settings.\nExperimental results show that FADS-ICL consistently outperforms previous\nstate-of-the-art methods by a significant margin under all settings, verifying\nthe effectiveness and superiority of FADS-ICL. For example, under the 1.5B and\n32 shots setting, FADS-ICL can achieve \\textbf{+14.3} average accuracy from\nfeature adaptation over vanilla ICL on 10 datasets, with \\textbf{+6.2} average\naccuracy over the previous state-of-the-art method, and the performance can\nfurther improve with increasing training data. Code and data are publicly\navailable at \\url{https://github.com/jiahaozhenbang/FADS-ICL}.\n","authors":["Jiahao Li","Quan Wang","Licheng Zhang","Guoqing Jin","Zhendong Mao"],"pdf_url":"https://arxiv.org/pdf/2405.10738v2.pdf","comment":"Accepted at ACL 2024 main conference"},{"id":"http://arxiv.org/abs/2405.20680v2","updated":"2024-06-03T06:20:18Z","published":"2024-05-31T08:22:49Z","title":"Unraveling and Mitigating Retriever Inconsistencies in\n  Retrieval-Augmented Large Language Models","summary":"  Although Retrieval-Augmented Large Language Models (RALMs) demonstrate their\nsuperiority in terms of factuality, they do not consistently outperform the\noriginal retrieval-free Language Models (LMs). Our experiments reveal that this\nexample-level performance inconsistency exists not only between\nretrieval-augmented and retrieval-free LM but also among different retrievers.\nTo understand this phenomenon, we investigate the degeneration behavior of\nRALMs and theoretically decompose it into four categories. Further analysis\nbased on our decomposition reveals that the innate difference in knowledge\nsources and the unpredictable degeneration of the reader model contribute most\nto the inconsistency. Drawing from our analysis, we introduce Ensemble of\nRetrievers (EoR), a trainable framework that can adaptively retrieve from\ndifferent knowledge sources and effectively decrease unpredictable reader\nerrors. Our experiments on Open Domain Question Answering show that EoR\nsubstantially improves performance over the RALM with a single retriever by\nconsiderably reducing inconsistent behaviors.\n","authors":["Mingda Li","Xinyu Li","Yifan Chen","Wenfeng Xuan","Weinan Zhang"],"pdf_url":"https://arxiv.org/pdf/2405.20680v2.pdf","comment":"ACL 2024 (findings)"},{"id":"http://arxiv.org/abs/2311.09071v2","updated":"2024-06-03T06:11:06Z","published":"2023-11-15T16:13:14Z","title":"How Vocabulary Sharing Facilitates Multilingualism in LLaMA?","summary":"  Large Language Models (LLMs), often show strong performance on English tasks,\nwhile exhibiting limitations on other languages. What is an LLM's multilingual\ncapability when it is trained only on certain languages? The underlying\nmechanism remains unclear. This study endeavors to examine the multilingual\ncapability of LLMs from the vocabulary sharing perspective by conducting an\nexhaustive analysis across 101 languages. Through the investigation of the\nperformance gap before and after embedding fine-tuning, we discovered four\ndistinct quadrants. By delving into each quadrant we provide actionable and\nefficient guidelines for tuning these languages. Extensive experiments reveal\nthat existing LLMs possess multilingual capabilities that surpass our\nexpectations, and we can significantly improve the multilingual performance of\nLLMs based on these attributes of each\nquadrant~\\footnote{\\url{https://github.com/CONE-MT/Vocabulary-Sharing-Facilitates-Multilingualism}.}.\n","authors":["Fei Yuan","Shuai Yuan","Zhiyong Wu","Lei Li"],"pdf_url":"https://arxiv.org/pdf/2311.09071v2.pdf","comment":"ACL-2024 Findings"},{"id":"http://arxiv.org/abs/2402.15043v2","updated":"2024-06-03T06:02:39Z","published":"2024-02-23T01:30:39Z","title":"KIEval: A Knowledge-grounded Interactive Evaluation Framework for Large\n  Language Models","summary":"  Automatic evaluation methods for large language models (LLMs) are hindered by\ndata contamination, leading to inflated assessments of their effectiveness.\nExisting strategies, which aim to detect contaminated texts, focus on\nquantifying contamination status instead of accurately gauging model\nperformance. In this paper, we introduce KIEval, a Knowledge-grounded\nInteractive Evaluation framework, which incorporates an LLM-powered\n\"interactor\" role for the first time to accomplish a dynamic\ncontamination-resilient evaluation. Starting with a question in a conventional\nLLM benchmark involving domain-specific knowledge, KIEval utilizes dynamically\ngenerated, multi-round, and knowledge-focused dialogues to determine whether a\nmodel's response is merely a recall of benchmark answers or demonstrates a deep\ncomprehension to apply knowledge in more complex conversations. Extensive\nexperiments on seven leading LLMs across five datasets validate KIEval's\neffectiveness and generalization. We also reveal that data contamination brings\nno contribution or even negative effect to models' real-world applicability and\nunderstanding, and existing contamination detection methods for LLMs can only\nidentify contamination in pre-training but not during supervised fine-tuning.\n","authors":["Zhuohao Yu","Chang Gao","Wenjin Yao","Yidong Wang","Wei Ye","Jindong Wang","Xing Xie","Yue Zhang","Shikun Zhang"],"pdf_url":"https://arxiv.org/pdf/2402.15043v2.pdf","comment":"Accepted to ACL 2024 (main conference); 19 pages, 5 figures, 19\n  tables, code is available at: https://github.com/zhuohaoyu/KIEval"},{"id":"http://arxiv.org/abs/2404.03653v2","updated":"2024-06-03T06:02:34Z","published":"2024-04-04T17:59:46Z","title":"CoMat: Aligning Text-to-Image Diffusion Model with Image-to-Text Concept\n  Matching","summary":"  Diffusion models have demonstrated great success in the field of\ntext-to-image generation. However, alleviating the misalignment between the\ntext prompts and images is still challenging. The root reason behind the\nmisalignment has not been extensively investigated. We observe that the\nmisalignment is caused by inadequate token attention activation. We further\nattribute this phenomenon to the diffusion model's insufficient condition\nutilization, which is caused by its training paradigm. To address the issue, we\npropose CoMat, an end-to-end diffusion model fine-tuning strategy with an\nimage-to-text concept matching mechanism. We leverage an image captioning model\nto measure image-to-text alignment and guide the diffusion model to revisit\nignored tokens. A novel attribute concentration module is also proposed to\naddress the attribute binding problem. Without any image or human preference\ndata, we use only 20K text prompts to fine-tune SDXL to obtain CoMat-SDXL.\nExtensive experiments show that CoMat-SDXL significantly outperforms the\nbaseline model SDXL in two text-to-image alignment benchmarks and achieves\nstart-of-the-art performance.\n","authors":["Dongzhi Jiang","Guanglu Song","Xiaoshi Wu","Renrui Zhang","Dazhong Shen","Zhuofan Zong","Yu Liu","Hongsheng Li"],"pdf_url":"https://arxiv.org/pdf/2404.03653v2.pdf","comment":"Project Page: https://caraj7.github.io/comat"},{"id":"http://arxiv.org/abs/2402.11078v3","updated":"2024-06-03T05:39:10Z","published":"2024-02-16T21:10:33Z","title":"Model Editing by Standard Fine-Tuning","summary":"  Standard fine-tuning is considered not as effective as specialized methods\nfor model editing due to its comparatively poor performance. However, it is\nsimple, agnostic to the architectural details of the model being edited, and\nable to leverage advances in standard training techniques with no additional\nwork (e.g., black-box PEFT for computational efficiency), making it an\nappealing choice for a model editor. In this work, we show that standard\nfine-tuning alone can yield competitive model editing performance with two\nminor modifications. First, we optimize the conditional likelihood rather than\nthe full likelihood. Second, in addition to the typical practice of training on\nrandomly paraphrased edit prompts to encourage generalization, we also train on\nrandom or similar unedited facts to encourage locality. Our experiments on the\nZsRE and CounterFact datasets demonstrate that these simple modifications allow\nstandard fine-tuning to match or outperform highly specialized editors in terms\nof edit score.\n","authors":["Govind Gangadhar","Karl Stratos"],"pdf_url":"https://arxiv.org/pdf/2402.11078v3.pdf","comment":"Findings of ACL 2024"},{"id":"http://arxiv.org/abs/2402.11281v2","updated":"2024-06-03T04:53:00Z","published":"2024-02-17T13:41:44Z","title":"Can Large Multimodal Models Uncover Deep Semantics Behind Images?","summary":"  Understanding the deep semantics of images is essential in the era dominated\nby social media. However, current research works primarily on the superficial\ndescription of images, revealing a notable deficiency in the systematic\ninvestigation of the inherent deep semantics. In this work, we introduce\nDEEPEVAL, a comprehensive benchmark to assess Large Multimodal Models' (LMMs)\ncapacities of visual deep semantics. DEEPEVAL includes human-annotated dataset\nand three progressive subtasks: fine-grained description selection, in-depth\ntitle matching, and deep semantics understanding. Utilizing DEEPEVAL, we\nevaluate 9 open-source LMMs and GPT-4V(ision). Our evaluation demonstrates a\nsubstantial gap between the deep semantic comprehension capabilities of\nexisting LMMs and humans. For example, GPT-4V is 30% behind humans in\nunderstanding deep semantics, even though it achieves human-comparable\nperformance in image description. Further analysis reveals that LMM performance\non DEEPEVAL varies according to the specific facets of deep semantics explored,\nindicating the fundamental challenges remaining in developing LMMs.\n","authors":["Yixin Yang","Zheng Li","Qingxiu Dong","Heming Xia","Zhifang Sui"],"pdf_url":"https://arxiv.org/pdf/2402.11281v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.08540v5","updated":"2024-06-03T04:18:11Z","published":"2023-10-12T17:32:09Z","title":"Do pretrained Transformers Learn In-Context by Gradient Descent?","summary":"  The emergence of In-Context Learning (ICL) in LLMs remains a remarkable\nphenomenon that is partially understood. To explain ICL, recent studies have\ncreated theoretical connections to Gradient Descent (GD). We ask, do such\nconnections hold up in actual pre-trained language models? We highlight the\nlimiting assumptions in prior works that make their setup considerably\ndifferent from the practical setup in which language models are trained. For\nexample, their experimental verification uses \\emph{ICL objective} (training\nmodels explicitly for ICL), which differs from the emergent ICL in the wild.\nFurthermore, the theoretical hand-constructed weights used in these studies\nhave properties that don't match those of real LLMs. We also look for evidence\nin real models. We observe that ICL and GD have different sensitivity to the\norder in which they observe demonstrations. Finally, we probe and compare the\nICL vs. GD hypothesis in a natural setting. We conduct comprehensive empirical\nanalyses on language models pre-trained on natural data (LLaMa-7B). Our\ncomparisons of three performance metrics highlight the inconsistent behavior of\nICL and GD as a function of various factors such as datasets, models, and the\nnumber of demonstrations. We observe that ICL and GD modify the output\ndistribution of language models differently. These results indicate that\n\\emph{the equivalence between ICL and GD remains an open hypothesis} and calls\nfor further studies.\n","authors":["Lingfeng Shen","Aayush Mishra","Daniel Khashabi"],"pdf_url":"https://arxiv.org/pdf/2310.08540v5.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.12754v2","updated":"2024-06-03T03:17:01Z","published":"2023-12-20T04:27:13Z","title":"Spectral Prompt Tuning:Unveiling Unseen Classes for Zero-Shot Semantic\n  Segmentation","summary":"  Recently, CLIP has found practical utility in the domain of pixel-level\nzero-shot segmentation tasks. The present landscape features two-stage\nmethodologies beset by issues such as intricate pipelines and elevated\ncomputational costs. While current one-stage approaches alleviate these\nconcerns and incorporate Visual Prompt Training (VPT) to uphold CLIP's\ngeneralization capacity, they still fall short in fully harnessing CLIP's\npotential for pixel-level unseen class demarcation and precise pixel\npredictions. To further stimulate CLIP's zero-shot dense prediction capability,\nwe propose SPT-SEG, a one-stage approach that improves CLIP's adaptability from\nimage to pixel. Specifically, we initially introduce Spectral Prompt Tuning\n(SPT), incorporating spectral prompts into the CLIP visual encoder's shallow\nlayers to capture structural intricacies of images, thereby enhancing\ncomprehension of unseen classes. Subsequently, we introduce the Spectral Guided\nDecoder (SGD), utilizing both high and low-frequency information to steer the\nnetwork's spatial focus towards more prominent classification features,\nenabling precise pixel-level prediction outcomes. Through extensive experiments\non two public datasets, we demonstrate the superiority of our method over\nstate-of-the-art approaches, performing well across all classes and\nparticularly excelling in handling unseen classes. Code is available\nat:https://github.com/clearxu/SPT.\n","authors":["Wenhao Xu","Rongtao Xu","Changwei Wang","Shibiao Xu","Li Guo","Man Zhang","Xiaopeng Zhang"],"pdf_url":"https://arxiv.org/pdf/2312.12754v2.pdf","comment":"AAAI2024 Accepted"},{"id":"http://arxiv.org/abs/2311.09154v3","updated":"2024-06-03T03:06:55Z","published":"2023-11-15T17:50:30Z","title":"CLEAN-EVAL: Clean Evaluation on Contaminated Large Language Models","summary":"  We are currently in an era of fierce competition among various large language\nmodels (LLMs) continuously pushing the boundaries of benchmark performance.\nHowever, genuinely assessing the capabilities of these LLMs has become a\nchallenging and critical issue due to potential data contamination, and it\nwastes dozens of time and effort for researchers and engineers to download and\ntry those contaminated models. To save our precious time, we propose a novel\nand useful method, Clean-Eval, which mitigates the issue of data contamination\nand evaluates the LLMs in a cleaner manner. Clean-Eval employs an LLM to\nparaphrase and back-translate the contaminated data into a candidate set,\ngenerating expressions with the same meaning but in different surface forms. A\nsemantic detector is then used to filter the generated low-quality samples to\nnarrow down this candidate set. The best candidate is finally selected from\nthis set based on the BLEURT score. According to human assessment, this best\ncandidate is semantically similar to the original contamination data but\nexpressed differently. All candidates can form a new benchmark to evaluate the\nmodel. Our experiments illustrate that Clean-Eval substantially restores the\nactual evaluation results on contaminated LLMs under both few-shot learning and\nfine-tuning scenarios.\n","authors":["Wenhong Zhu","Hongkun Hao","Zhiwei He","Yunze Song","Yumeng Zhang","Hanxu Hu","Yiran Wei","Rui Wang","Hongyuan Lu"],"pdf_url":"https://arxiv.org/pdf/2311.09154v3.pdf","comment":"NAACL2024(findings)"},{"id":"http://arxiv.org/abs/2402.18223v2","updated":"2024-06-03T03:02:44Z","published":"2024-02-28T10:38:21Z","title":"Improving Open-Ended Text Generation via Adaptive Decoding","summary":"  Current language models decode text token by token according to probabilistic\ndistribution, and determining the appropriate candidates for the next token is\ncrucial to ensure generation quality. This study introduces adaptive decoding,\na mechanism that dynamically empowers language models to ascertain a sensible\ncandidate set during generation. Specifically, we introduce an entropy-based\nmetric called confidence and conceptualize determining the optimal candidate\nset as a confidence-increasing process. The rationality of including a token in\nthe candidate set is assessed by leveraging the increment of confidence.\nExperimental results reveal that our method balances diversity and coherence\nwell. The human evaluation shows that our method can generate human-preferred\ntext. Additionally, our method can potentially improve the reasoning ability of\nlanguage models.\n","authors":["Wenhong Zhu","Hongkun Hao","Zhiwei He","Yiming Ai","Rui Wang"],"pdf_url":"https://arxiv.org/pdf/2402.18223v2.pdf","comment":"ICML2024"},{"id":"http://arxiv.org/abs/2312.14591v3","updated":"2024-06-03T02:24:38Z","published":"2023-12-22T10:29:43Z","title":"Reasons to Reject? Aligning Language Models with Judgments","summary":"  As humans, we consistently interact with our peers and receive feedback in\nthe form of natural language. This language feedback allows us to maintain\nappropriate behavior, and rectify potential errors. The question arises\nnaturally: can we use language feedback to align large language models (LLMs)?\nIn contrast to previous research that aligns LLMs with scalar rewards, we\npresent the first systematic exploration of alignment through the lens of\nlanguage feedback (i.e., judgment). We start with an in-depth investigation of\npotential methods that can be adapted for aligning LLMs with judgments,\nrevealing that these methods cannot fully capitalize on judgments. To\nfacilitate more effective utilization of judgments, we propose a novel\nframework, Contrastive Unlikelihood Training (CUT), that allows for\nfine-grained inappropriate content detection and correction based on judgments.\nOur results show that, with merely 1317 off-the-shelf judgment data, CUT\n(LLaMA2-13b) can beat the 175B DaVinci003 and surpass the best baseline by\n50.84 points on AlpacaEval. CUT (LLaMA2-chat-13b) can also align LLMs in an\niterative fashion using up-to-date model-specific judgments, improving\nperformance from 81.09 to 91.68 points on AlpacaEval. Further analysis suggests\nthat judgments hold greater potential than rewards in LLM alignment.\n","authors":["Weiwen Xu","Deng Cai","Zhisong Zhang","Wai Lam","Shuming Shi"],"pdf_url":"https://arxiv.org/pdf/2312.14591v3.pdf","comment":"Accepted at ACL 2024 Findings. Our source codes and models are\n  publicly available at https://github.com/wwxu21/CUT"},{"id":"http://arxiv.org/abs/2402.04411v2","updated":"2024-06-03T01:40:46Z","published":"2024-02-06T21:14:45Z","title":"DFA-RAG: Conversational Semantic Router for Large Language Model with\n  Definite Finite Automaton","summary":"  This paper introduces the retrieval-augmented large language model with\nDefinite Finite Automaton (DFA-RAG), a novel framework designed to enhance the\ncapabilities of conversational agents using large language models (LLMs).\nTraditional LLMs face challenges in generating regulated and compliant\nresponses in special scenarios with predetermined response guidelines, like\nemotional support and customer service. Our framework addresses these\nchallenges by embedding a Definite Finite Automaton (DFA), learned from\ntraining dialogues, within the LLM. This structured approach acts as a semantic\nrouter which enables the LLM to adhere to a deterministic response pathway. The\nrouting is achieved by the retrieval-augmentation generation (RAG) strategy,\nwhich carefully selects dialogue examples aligned with the current\nconversational context. The advantages of DFA-RAG include an interpretable\nstructure through human-readable DFA, context-aware retrieval for responses in\nconversations, and plug-and-play compatibility with existing LLMs. Extensive\nbenchmarks validate DFA-RAG's effectiveness, indicating its potential as a\nvaluable contribution to the conversational agent.\n","authors":["Yiyou Sun","Junjie Hu","Wei Cheng","Haifeng Chen"],"pdf_url":"https://arxiv.org/pdf/2402.04411v2.pdf","comment":"Accepted to ICML 2024"},{"id":"http://arxiv.org/abs/2401.08417v4","updated":"2024-06-03T01:28:06Z","published":"2024-01-16T15:04:51Z","title":"Contrastive Preference Optimization: Pushing the Boundaries of LLM\n  Performance in Machine Translation","summary":"  Moderate-sized large language models (LLMs) -- those with 7B or 13B\nparameters -- exhibit promising machine translation (MT) performance. However,\neven the top-performing 13B LLM-based translation models, like ALMA, does not\nmatch the performance of state-of-the-art conventional encoder-decoder\ntranslation models or larger-scale LLMs such as GPT-4. In this study, we bridge\nthis performance gap. We first assess the shortcomings of supervised\nfine-tuning for LLMs in the MT task, emphasizing the quality issues present in\nthe reference data, despite being human-generated. Then, in contrast to SFT\nwhich mimics reference translations, we introduce Contrastive Preference\nOptimization (CPO), a novel approach that trains models to avoid generating\nadequate but not perfect translations. Applying CPO to ALMA models with only\n22K parallel sentences and 12M parameters yields significant improvements. The\nresulting model, called ALMA-R, can match or exceed the performance of the WMT\ncompetition winners and GPT-4 on WMT'21, WMT'22 and WMT'23 test datasets.\n","authors":["Haoran Xu","Amr Sharaf","Yunmo Chen","Weiting Tan","Lingfeng Shen","Benjamin Van Durme","Kenton Murray","Young Jin Kim"],"pdf_url":"https://arxiv.org/pdf/2401.08417v4.pdf","comment":"Accepted at ICML 2024"},{"id":"http://arxiv.org/abs/2404.18239v3","updated":"2024-06-03T01:10:53Z","published":"2024-04-28T16:31:32Z","title":"SOUL: Unlocking the Power of Second-Order Optimization for LLM\n  Unlearning","summary":"  Large Language Models (LLMs) have highlighted the necessity of effective\nunlearning mechanisms to comply with data regulations and ethical AI practices.\nLLM unlearning aims at removing undesired data influences and associated model\ncapabilities without compromising utility out of the scope of unlearning. While\ninterest in studying LLM unlearning is growing,the impact of the optimizer\nchoice for LLM unlearning remains under-explored. In this work, we shed light\non the significance of optimizer selection in LLM unlearning for the first\ntime, establishing a clear connection between {second-order optimization} and\ninfluence unlearning (a classical approach using influence functions to update\nthe model for data influence removal). This insight propels us to develop a\nsecond-order unlearning framework, termed SOUL, built upon the second-order\nclipped stochastic optimization (Sophia)-based LLM training method. SOUL\nextends the static, one-shot model update using influence unlearning to a\ndynamic, iterative unlearning process. Our extensive experiments show that SOUL\nconsistently outperforms conventional first-order methods across various\nunlearning tasks, models, and metrics, suggesting the promise of second-order\noptimization in providing a scalable and easily implementable solution for LLM\nunlearning.\n","authors":["Jinghan Jia","Yihua Zhang","Yimeng Zhang","Jiancheng Liu","Bharat Runwal","James Diffenderfer","Bhavya Kailkhura","Sijia Liu"],"pdf_url":"https://arxiv.org/pdf/2404.18239v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.11058v3","updated":"2024-06-03T01:09:38Z","published":"2024-02-16T20:14:47Z","title":"II-MMR: Identifying and Improving Multi-modal Multi-hop Reasoning in\n  Visual Question Answering","summary":"  Visual Question Answering (VQA) often involves diverse reasoning scenarios\nacross Vision and Language (V&L). Most prior VQA studies, however, have merely\nfocused on assessing the model's overall accuracy without evaluating it on\ndifferent reasoning cases. Furthermore, some recent works observe that\nconventional Chain-of-Thought (CoT) prompting fails to generate effective\nreasoning for VQA, especially for complex scenarios requiring multi-hop\nreasoning. In this paper, we propose II-MMR, a novel idea to identify and\nimprove multi-modal multi-hop reasoning in VQA. In specific, II-MMR takes a VQA\nquestion with an image and finds a reasoning path to reach its answer using two\nnovel language promptings: (i) answer prediction-guided CoT prompt, or (ii)\nknowledge triplet-guided prompt. II-MMR then analyzes this path to identify\ndifferent reasoning cases in current VQA benchmarks by estimating how many hops\nand what types (i.e., visual or beyond-visual) of reasoning are required to\nanswer the question. On popular benchmarks including GQA and A-OKVQA, II-MMR\nobserves that most of their VQA questions are easy to answer, simply demanding\n\"single-hop\" reasoning, whereas only a few questions require \"multi-hop\"\nreasoning. Moreover, while the recent V&L model struggles with such complex\nmulti-hop reasoning questions even using the traditional CoT method, II-MMR\nshows its effectiveness across all reasoning cases in both zero-shot and\nfine-tuning settings.\n","authors":["Jihyung Kil","Farideh Tavazoee","Dongyeop Kang","Joo-Kyung Kim"],"pdf_url":"https://arxiv.org/pdf/2402.11058v3.pdf","comment":"Accepted to ACL 2024 Findings"}],"Computer Vision and Pattern Recognition":[{"id":"http://arxiv.org/abs/2312.00700v2","updated":"2024-06-03T17:57:39Z","published":"2023-12-01T16:33:57Z","title":"GIFT: Generative Interpretable Fine-Tuning","summary":"  We present Generative Interpretable Fine-Tuning (GIFT) for\nparameter-efficient fine-tuning of pretrained Transformer backbones, which can\nbe formulated as a simple factorized matrix multiplication in the parameter\nspace or equivalently in the activation space, and thus embraces built-in\ninterpretability. For a pretrained layer with weights $\\omega\\in\n\\mathbb{R}^{d_{out}\\times d_{in}}$, our proposed GIFT learns the fine-tuned\nweights $\\hat{\\omega}$ directly from $\\omega$ as $\\hat{\\omega}=\\omega \\cdot\n(\\mathbb{I}+\\phi_{d_{in}\\times r}\\cdot \\psi_{r\\times d_{in}})$ where\n$\\mathbb{I}$ is an identity matrix. $\\Theta=(\\phi, \\psi)$ are the learnable\nparameters of the two linear layers of GIFT with $r$ being a hyper-parameter.\n$\\Theta$ is shared by all the layers selected for fine-tuning, resulting in\nsignificantly fewer trainable parameters compared to Low-Rank Adaptation\n(LoRA). We perform comprehensive evaluations on natural language tasks\n(commonsense reasoning and sequence classification) and computer vision tasks\n(visual fine-grained classification). We obtain the best accuracy and parameter\nefficiency among baselines both on the Commonsense170k reasoning benchmark\nusing LLaMA-1 (7B) and Llama-2 (7B)/-3 (8B) and on the FGVC and VTAB visual\nrecognition benchmarks using ImageNet-21k pretrained Vision Transformer\n(ViT-B/16). Notably, we obtain 5.9% absolute increase in average accuracy with\n53.8 times reduction of parameters on Commonsense170k using Llama-3 (8B)\ncompared to LoRA. We obtain performance comparable to LoRA on the GLUE\nbenchmark but with significantly fewer parameters using RoBERTa-Base/Large. We\nshow the output of the first linear layer (i.e., $\\omega\\cdot \\phi$) is\nsurprisingly interpretable, which can play the role of a token-clustering head\nas a by-product to localize meaningful objects/parts in images for computer\nvision tasks. Our code is publicly available.\n","authors":["Chinmay Savadikar","Xi Song","Tianfu Wu"],"pdf_url":"https://arxiv.org/pdf/2312.00700v2.pdf","comment":"Project page and code: https://savadikarc.github.io/gift"},{"id":"http://arxiv.org/abs/2403.19780v2","updated":"2024-06-03T17:56:14Z","published":"2024-03-28T19:06:37Z","title":"Mitigating Motion Blur in Neural Radiance Fields with Events and Frames","summary":"  Neural Radiance Fields (NeRFs) have shown great potential in novel view\nsynthesis. However, they struggle to render sharp images when the data used for\ntraining is affected by motion blur. On the other hand, event cameras excel in\ndynamic scenes as they measure brightness changes with microsecond resolution\nand are thus only marginally affected by blur. Recent methods attempt to\nenhance NeRF reconstructions under camera motion by fusing frames and events.\nHowever, they face challenges in recovering accurate color content or constrain\nthe NeRF to a set of predefined camera poses, harming reconstruction quality in\nchallenging conditions. This paper proposes a novel formulation addressing\nthese issues by leveraging both model- and learning-based modules. We\nexplicitly model the blur formation process, exploiting the event double\nintegral as an additional model-based prior. Additionally, we model the\nevent-pixel response using an end-to-end learnable response function, allowing\nour method to adapt to non-idealities in the real event-camera sensor. We show,\non synthetic and real data, that the proposed approach outperforms existing\ndeblur NeRFs that use only frames as well as those that combine frames and\nevents by +6.13dB and +2.48dB, respectively.\n","authors":["Marco Cannici","Davide Scaramuzza"],"pdf_url":"https://arxiv.org/pdf/2403.19780v2.pdf","comment":"IEEE Conference on Computer Vision and Pattern Recognition (CVPR),\n  2024"},{"id":"http://arxiv.org/abs/2402.10093v2","updated":"2024-06-03T17:51:58Z","published":"2024-02-15T16:46:16Z","title":"MIM-Refiner: A Contrastive Learning Boost from Intermediate Pre-Trained\n  Representations","summary":"  We introduce MIM (Masked Image Modeling)-Refiner, a contrastive learning\nboost for pre-trained MIM models. MIM-Refiner is motivated by the insight that\nstrong representations within MIM models generally reside in intermediate\nlayers. Accordingly, MIM-Refiner leverages multiple contrastive heads that are\nconnected to different intermediate layers. In each head, a modified nearest\nneighbor objective constructs semantic clusters that capture semantic\ninformation which improves performance on downstream tasks, including\noff-the-shelf and fine-tuning settings.\n  The refinement process is short and simple - yet highly effective. Within a\nfew epochs, we refine the features of MIM models from subpar to\nstate-of-the-art, off-the-shelf features. Refining a ViT-H, pre-trained with\ndata2vec 2.0 on ImageNet-1K, sets a new state-of-the-art in linear probing\n(84.7%) and low-shot classification among models that are pre-trained on\nImageNet-1K. At ImageNet-1K 1-shot classification, MIM-Refiner advances the\nstate-of-the-art to 64.2%, outperforming larger models that were trained on up\nto 2000 times more data such as DINOv2-g, OpenCLIP-G and MAWS-6.5B.\n","authors":["Benedikt Alkin","Lukas Miklautz","Sepp Hochreiter","Johannes Brandstetter"],"pdf_url":"https://arxiv.org/pdf/2402.10093v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.10689v3","updated":"2024-06-03T17:50:58Z","published":"2023-09-19T15:23:52Z","title":"ReShader: View-Dependent Highlights for Single Image View-Synthesis","summary":"  In recent years, novel view synthesis from a single image has seen\nsignificant progress thanks to the rapid advancements in 3D scene\nrepresentation and image inpainting techniques. While the current approaches\nare able to synthesize geometrically consistent novel views, they often do not\nhandle the view-dependent effects properly. Specifically, the highlights in\ntheir synthesized images usually appear to be glued to the surfaces, making the\nnovel views unrealistic. To address this major problem, we make a key\nobservation that the process of synthesizing novel views requires changing the\nshading of the pixels based on the novel camera, and moving them to appropriate\nlocations. Therefore, we propose to split the view synthesis process into two\nindependent tasks of pixel reshading and relocation. During the reshading\nprocess, we take the single image as the input and adjust its shading based on\nthe novel camera. This reshaded image is then used as the input to an existing\nview synthesis method to relocate the pixels and produce the final novel view\nimage. We propose to use a neural network to perform reshading and generate a\nlarge set of synthetic input-reshaded pairs to train our network. We\ndemonstrate that our approach produces plausible novel view images with\nrealistic moving highlights on a variety of real world scenes.\n","authors":["Avinash Paliwal","Brandon Nguyen","Andrii Tsarov","Nima Khademi Kalantari"],"pdf_url":"https://arxiv.org/pdf/2309.10689v3.pdf","comment":"SIGGRAPH Asia 2023. Project page at\n  https://people.engr.tamu.edu/nimak/Papers/SIGAsia2023_Reshader/index.html and\n  video at https://www.youtube.com/watch?v=XW-tl48D3Ok"},{"id":"http://arxiv.org/abs/2311.18107v5","updated":"2024-06-03T17:46:49Z","published":"2023-11-29T21:45:33Z","title":"A Stochastic-Geometrical Framework for Object Pose Estimation based on\n  Mixture Models Avoiding the Correspondence Problem","summary":"  Background: Pose estimation of rigid objects is a practical challenge in\noptical metrology and computer vision. This paper presents a novel\nstochastic-geometrical modeling framework for object pose estimation based on\nobserving multiple feature points.\n  Methods: This framework utilizes mixture models for feature point densities\nin object space and for interpreting real measurements. Advantages are the\navoidance to resolve individual feature correspondences and to incorporate\ncorrect stochastic dependencies in multi-view applications. First, the general\nmodeling framework is presented, second, a general algorithm for pose\nestimation is derived, and third, two example models (camera and lateration\nsetup) are presented.\n  Results: Numerical experiments show the effectiveness of this modeling and\ngeneral algorithm by presenting four simulation scenarios for three observation\nsystems, including the dependence on measurement resolution, object\ndeformations and measurement noise. Probabilistic modeling utilizing mixture\nmodels shows the potential for accurate and robust pose estimations while\navoiding the correspondence problem.\n","authors":["Wolfgang Hoegele"],"pdf_url":"https://arxiv.org/pdf/2311.18107v5.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.01016v2","updated":"2024-06-03T17:36:47Z","published":"2024-05-02T05:35:10Z","title":"Addressing Diverging Training Costs using Local Restoration for Precise\n  Bird's Eye View Map Construction","summary":"  Recent advancements in Bird's Eye View (BEV) fusion for map construction have\ndemonstrated remarkable mapping of urban environments. However, their deep and\nbulky architecture incurs substantial amounts of backpropagation memory and\ncomputing latency. Consequently, the problem poses an unavoidable bottleneck in\nconstructing high-resolution (HR) BEV maps, as their large-sized features cause\nsignificant increases in costs including GPU memory consumption and computing\nlatency, named diverging training costs issue. Affected by the problem, most\nexisting methods adopt low-resolution (LR) BEV and struggle to estimate the\nprecise locations of urban scene components like road lanes, and sidewalks. As\nthe imprecision leads to risky self-driving, the diverging training costs issue\nhas to be resolved. In this paper, we address the issue with our novel Trumpet\nNeural Network (TNN) mechanism. The framework utilizes LR BEV space and outputs\nan up-sampled semantic BEV map to create a memory-efficient pipeline. To this\nend, we introduce Local Restoration of BEV representation. Specifically, the\nup-sampled BEV representation has severely aliased, blocky signals, and thick\nsemantic labels. Our proposed Local Restoration restores the signals and thins\n(or narrows down) the width of the labels. Our extensive experiments show that\nthe TNN mechanism provides a plug-and-play memory-efficient pipeline, thereby\nenabling the effective estimation of real-sized (or precise) semantic labels\nfor BEV map construction.\n","authors":["Minsu Kim","Giseop Kim","Sunwook Choi"],"pdf_url":"https://arxiv.org/pdf/2405.01016v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.02730v2","updated":"2024-06-03T17:14:56Z","published":"2024-05-04T18:27:29Z","title":"U-DiTs: Downsample Tokens in U-Shaped Diffusion Transformers","summary":"  Diffusion Transformers (DiTs) introduce the transformer architecture to\ndiffusion tasks for latent-space image generation. With an isotropic\narchitecture that chains a series of transformer blocks, DiTs demonstrate\ncompetitive performance and good scalability; but meanwhile, the abandonment of\nU-Net by DiTs and their following improvements is worth rethinking. To this\nend, we conduct a simple toy experiment by comparing a U-Net architectured DiT\nwith an isotropic one. It turns out that the U-Net architecture only gain a\nslight advantage amid the U-Net inductive bias, indicating potential\nredundancies within the U-Net-style DiT. Inspired by the discovery that U-Net\nbackbone features are low-frequency-dominated, we perform token downsampling on\nthe query-key-value tuple for self-attention that bring further improvements\ndespite a considerable amount of reduction in computation. Based on\nself-attention with downsampled tokens, we propose a series of U-shaped DiTs\n(U-DiTs) in the paper and conduct extensive experiments to demonstrate the\nextraordinary performance of U-DiT models. The proposed U-DiT could outperform\nDiT-XL/2 with only 1/6 of its computation cost. Codes are available at\nhttps://github.com/YuchuanTian/U-DiT.\n","authors":["Yuchuan Tian","Zhijun Tu","Hanting Chen","Jie Hu","Chao Xu","Yunhe Wang"],"pdf_url":"https://arxiv.org/pdf/2405.02730v2.pdf","comment":"12 pages, 5 figures"},{"id":"http://arxiv.org/abs/2403.17846v2","updated":"2024-06-03T17:12:25Z","published":"2024-03-26T16:36:43Z","title":"Hierarchical Open-Vocabulary 3D Scene Graphs for Language-Grounded Robot\n  Navigation","summary":"  Recent open-vocabulary robot mapping methods enrich dense geometric maps with\npre-trained visual-language features. While these maps allow for the prediction\nof point-wise saliency maps when queried for a certain language concept,\nlarge-scale environments and abstract queries beyond the object level still\npose a considerable hurdle, ultimately limiting language-grounded robotic\nnavigation. In this work, we present HOV-SG, a hierarchical open-vocabulary 3D\nscene graph mapping approach for language-grounded robot navigation. Leveraging\nopen-vocabulary vision foundation models, we first obtain state-of-the-art\nopen-vocabulary segment-level maps in 3D and subsequently construct a 3D scene\ngraph hierarchy consisting of floor, room, and object concepts, each enriched\nwith open-vocabulary features. Our approach is able to represent multi-story\nbuildings and allows robotic traversal of those using a cross-floor Voronoi\ngraph. HOV-SG is evaluated on three distinct datasets and surpasses previous\nbaselines in open-vocabulary semantic accuracy on the object, room, and floor\nlevel while producing a 75% reduction in representation size compared to dense\nopen-vocabulary maps. In order to prove the efficacy and generalization\ncapabilities of HOV-SG, we showcase successful long-horizon\nlanguage-conditioned robot navigation within real-world multi-storage\nenvironments. We provide code and trial video data at http://hovsg.github.io/.\n","authors":["Abdelrhman Werby","Chenguang Huang","Martin Büchner","Abhinav Valada","Wolfram Burgard"],"pdf_url":"https://arxiv.org/pdf/2403.17846v2.pdf","comment":"Code and video are available at http://hovsg.github.io/"},{"id":"http://arxiv.org/abs/2312.14867v2","updated":"2024-06-03T16:59:20Z","published":"2023-12-22T17:45:19Z","title":"VIEScore: Towards Explainable Metrics for Conditional Image Synthesis\n  Evaluation","summary":"  In the rapidly advancing field of conditional image generation research,\nchallenges such as limited explainability lie in effectively evaluating the\nperformance and capabilities of various models. This paper introduces VIEScore,\na Visual Instruction-guided Explainable metric for evaluating any conditional\nimage generation tasks. VIEScore leverages general knowledge from Multimodal\nLarge Language Models (MLLMs) as the backbone and does not require training or\nfine-tuning. We evaluate VIEScore on seven prominent tasks in conditional image\ntasks and found: (1) VIEScore (GPT4-o) achieves a high Spearman correlation of\n0.4 with human evaluations, while the human-to-human correlation is 0.45. (2)\nVIEScore (with open-source MLLM) is significantly weaker than GPT-4o and GPT-4v\nin evaluating synthetic images. (3) VIEScore achieves a correlation on par with\nhuman ratings in the generation tasks but struggles in editing tasks. With\nthese results, we believe VIEScore shows its great potential to replace human\njudges in evaluating image synthesis tasks.\n","authors":["Max Ku","Dongfu Jiang","Cong Wei","Xiang Yue","Wenhu Chen"],"pdf_url":"https://arxiv.org/pdf/2312.14867v2.pdf","comment":"Accepted to ACL2024 main"},{"id":"http://arxiv.org/abs/2405.16277v3","updated":"2024-06-03T16:42:55Z","published":"2024-05-25T15:28:22Z","title":"Picturing Ambiguity: A Visual Twist on the Winograd Schema Challenge","summary":"  Large Language Models (LLMs) have demonstrated remarkable success in tasks\nlike the Winograd Schema Challenge (WSC), showcasing advanced textual\ncommon-sense reasoning. However, applying this reasoning to multimodal domains,\nwhere understanding text and images together is essential, remains a\nsubstantial challenge. To address this, we introduce WinoVis, a novel dataset\nspecifically designed to probe text-to-image models on pronoun disambiguation\nwithin multimodal contexts. Utilizing GPT-4 for prompt generation and Diffusion\nAttentive Attribution Maps (DAAM) for heatmap analysis, we propose a novel\nevaluation framework that isolates the models' ability in pronoun\ndisambiguation from other visual processing challenges. Evaluation of\nsuccessive model versions reveals that, despite incremental advancements,\nStable Diffusion 2.0 achieves a precision of 56.7% on WinoVis, only marginally\nsurpassing random guessing. Further error analysis identifies important areas\nfor future research aimed at advancing text-to-image models in their ability to\ninterpret and interact with the complex visual world.\n","authors":["Brendan Park","Madeline Janecek","Naser Ezzati-Jivan","Yifeng Li","Ali Emami"],"pdf_url":"https://arxiv.org/pdf/2405.16277v3.pdf","comment":"9 pages (excluding references), accepted to ACL 2024 Main Conference"},{"id":"http://arxiv.org/abs/2401.01163v3","updated":"2024-06-03T16:09:55Z","published":"2024-01-02T11:46:42Z","title":"NU-Class Net: A Novel Approach for Video Quality Enhancement","summary":"  Video content has experienced a surge in popularity, asserting its dominance\nover internet traffic and Internet of Things (IoT) networks. Video compression\nhas long been regarded as the primary means of efficiently managing the\nsubstantial multimedia traffic generated by video-capturing devices.\nNevertheless, video compression algorithms entail significant computational\ndemands in order to achieve substantial compression ratios. This complexity\npresents a formidable challenge when implementing efficient video coding\nstandards in resource-constrained embedded systems, such as IoT edge node\ncameras. To tackle this challenge, this paper introduces NU-Class Net, an\ninnovative deep-learning model designed to mitigate compression artifacts\nstemming from lossy compression codecs. This enhancement significantly elevates\nthe perceptible quality of low-bit-rate videos. By employing the NU-Class Net,\nthe video encoder within the video-capturing node can reduce output quality,\nthereby generating low-bit-rate videos and effectively curtailing both\ncomputation and bandwidth requirements at the edge. On the decoder side, which\nis typically less encumbered by resource limitations, NU-Class Net is applied\nafter the video decoder to compensate for artifacts and approximate the quality\nof the original video. Experimental results affirm the efficacy of the proposed\nmodel in enhancing the perceptible quality of videos, especially those streamed\nat low bit rates.\n","authors":["Parham Zilouchian Moghaddam","Mehdi Modarressi","Mohammad Amin Sadeghi"],"pdf_url":"https://arxiv.org/pdf/2401.01163v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.00479v2","updated":"2024-06-03T15:55:06Z","published":"2023-07-02T05:26:54Z","title":"Domain Transfer Through Image-to-Image Translation for Uncertainty-Aware\n  Prostate Cancer Classification","summary":"  Prostate Cancer (PCa) is a prevalent disease among men, and multi-parametric\nMRIs offer a non-invasive method for its detection. While MRI-based deep\nlearning solutions have shown promise in supporting PCa diagnosis, acquiring\nsufficient training data, particularly in local clinics remains challenging.\nOne potential solution is to take advantage of publicly available datasets to\npre-train deep models and fine-tune them on the local data, but multi-source\nMRIs can pose challenges due to cross-domain distribution differences. These\nlimitations hinder the adoption of explainable and reliable deep-learning\nsolutions in local clinics for PCa diagnosis. In this work, we present a novel\napproach for unpaired image-to-image translation of prostate multi-parametric\nMRIs and an uncertainty-aware training approach for classifying clinically\nsignificant PCa, to be applied in data-constrained settings such as local and\nsmall clinics. Our approach involves a novel pipeline for translating unpaired\n3.0T multi-parametric prostate MRIs to 1.5T, thereby augmenting the available\ntraining data. Additionally, we introduce an evidential deep learning approach\nto estimate model uncertainty and employ dataset filtering techniques during\ntraining. Furthermore, we propose a simple, yet efficient Evidential Focal\nLoss, combining focal loss with evidential uncertainty, to train our model\neffectively. Our experiments demonstrate that the proposed method significantly\nimproves the Area Under ROC Curve (AUC) by over 20% compared to the previous\nwork. Our code is available at https://github.com/med-i-lab/DT_UE_PCa\n","authors":["Meng Zhou","Amoon Jamzad","Jason Izard","Alexandre Menard","Robert Siemens","Parvin Mousavi"],"pdf_url":"https://arxiv.org/pdf/2307.00479v2.pdf","comment":"Preprint. In Submission"},{"id":"http://arxiv.org/abs/2405.14959v2","updated":"2024-06-03T15:51:49Z","published":"2024-05-23T18:10:26Z","title":"EvGGS: A Collaborative Learning Framework for Event-based Generalizable\n  Gaussian Splatting","summary":"  Event cameras offer promising advantages such as high dynamic range and low\nlatency, making them well-suited for challenging lighting conditions and\nfast-moving scenarios. However, reconstructing 3D scenes from raw event streams\nis difficult because event data is sparse and does not carry absolute color\ninformation. To release its potential in 3D reconstruction, we propose the\nfirst event-based generalizable 3D reconstruction framework, called EvGGS,\nwhich reconstructs scenes as 3D Gaussians from only event input in a\nfeedforward manner and can generalize to unseen cases without any retraining.\nThis framework includes a depth estimation module, an intensity reconstruction\nmodule, and a Gaussian regression module. These submodules connect in a\ncascading manner, and we collaboratively train them with a designed joint loss\nto make them mutually promote. To facilitate related studies, we build a novel\nevent-based 3D dataset with various material objects and calibrated labels of\ngrayscale images, depth maps, camera poses, and silhouettes. Experiments show\nmodels that have jointly trained significantly outperform those trained\nindividually. Our approach performs better than all baselines in reconstruction\nquality, and depth/intensity predictions with satisfactory rendering speed.\n","authors":["Jiaxu Wang","Junhao He","Ziyi Zhang","Mingyuan Sun","Jingkai Sun","Renjing Xu"],"pdf_url":"https://arxiv.org/pdf/2405.14959v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.20310v3","updated":"2024-06-03T15:13:55Z","published":"2024-05-30T17:52:52Z","title":"A Pixel Is Worth More Than One 3D Gaussians in Single-View 3D\n  Reconstruction","summary":"  Learning 3D scene representation from a single-view image is a long-standing\nfundamental problem in computer vision, with the inherent ambiguity in\npredicting contents unseen from the input view. Built on the recently proposed\n3D Gaussian Splatting (3DGS), the Splatter Image method has made promising\nprogress on fast single-image novel view synthesis via learning a single 3D\nGaussian for each pixel based on the U-Net feature map of an input image.\nHowever, it has limited expressive power to represent occluded components that\nare not observable in the input view. To address this problem, this paper\npresents a Hierarchical Splatter Image method in which a pixel is worth more\nthan one 3D Gaussians. Specifically, each pixel is represented by a parent 3D\nGaussian and a small number of child 3D Gaussians. Parent 3D Gaussians are\nlearned as done in the vanilla Splatter Image. Child 3D Gaussians are learned\nvia a lightweight Multi-Layer Perceptron (MLP) which takes as input the\nprojected image features of a parent 3D Gaussian and the embedding of a target\ncamera view. Both parent and child 3D Gaussians are learned end-to-end in a\nstage-wise way. The joint condition of input image features from eyes of the\nparent Gaussians and the target camera position facilitates learning to\nallocate child Gaussians to ``see the unseen'', recovering the occluded details\nthat are often missed by parent Gaussians.\n  In experiments, the proposed method is tested on the ShapeNet-SRN and CO3D\ndatasets with state-of-the-art performance obtained, especially showing\npromising capabilities of reconstructing occluded contents in the input view.\n","authors":["Jianghao Shen","Nan Xue","Tianfu Wu"],"pdf_url":"https://arxiv.org/pdf/2405.20310v3.pdf","comment":"preprint, under review"},{"id":"http://arxiv.org/abs/2402.01516v2","updated":"2024-06-03T14:53:20Z","published":"2024-02-02T15:57:13Z","title":"Cross-view Masked Diffusion Transformers for Person Image Synthesis","summary":"  We present X-MDPT ($\\underline{Cross}$-view $\\underline{M}$asked\n$\\underline{D}$iffusion $\\underline{P}$rediction $\\underline{T}$ransformers), a\nnovel diffusion model designed for pose-guided human image generation. X-MDPT\ndistinguishes itself by employing masked diffusion transformers that operate on\nlatent patches, a departure from the commonly-used Unet structures in existing\nworks. The model comprises three key modules: 1) a denoising diffusion\nTransformer, 2) an aggregation network that consolidates conditions into a\nsingle vector for the diffusion process, and 3) a mask cross-prediction module\nthat enhances representation learning with semantic information from the\nreference image. X-MDPT demonstrates scalability, improving FID, SSIM, and\nLPIPS with larger models. Despite its simple design, our model outperforms\nstate-of-the-art approaches on the DeepFashion dataset while exhibiting\nefficiency in terms of training parameters, training time, and inference speed.\nOur compact 33MB model achieves an FID of 7.42, surpassing a prior Unet latent\ndiffusion approach (FID 8.07) using only $11\\times$ fewer parameters. Our best\nmodel surpasses the pixel-based diffusion with $\\frac{2}{3}$ of the parameters\nand achieves $5.43 \\times$ faster inference. The code is available at\nhttps://github.com/trungpx/xmdpt.\n","authors":["Trung X. Pham","Zhang Kang","Chang D. Yoo"],"pdf_url":"https://arxiv.org/pdf/2402.01516v2.pdf","comment":"ICML 2024"},{"id":"http://arxiv.org/abs/2312.11538v2","updated":"2024-06-03T14:42:35Z","published":"2023-12-15T22:38:24Z","title":"Iterative Motion Editing with Natural Language","summary":"  Text-to-motion diffusion models can generate realistic animations from text\nprompts, but do not support fine-grained motion editing controls. In this\npaper, we present a method for using natural language to iteratively specify\nlocal edits to existing character animations, a task that is common in most\ncomputer animation workflows. Our key idea is to represent a space of motion\nedits using a set of kinematic motion editing operators (MEOs) whose effects on\nthe source motion is well-aligned with user expectations. We provide an\nalgorithm that leverages pre-existing language models to translate textual\ndescriptions of motion edits into source code for programs that define and\nexecute sequences of MEOs on a source animation. We execute MEOs by first\ntranslating them into keyframe constraints, and then use diffusion-based motion\nmodels to generate output motions that respect these constraints. Through a\nuser study and quantitative evaluation, we demonstrate that our system can\nperform motion edits that respect the animator's editing intent, remain\nfaithful to the original animation (it edits the original animation, but does\nnot dramatically change it), and yield realistic character animation results.\n","authors":["Purvi Goel","Kuan-Chieh Wang","C. Karen Liu","Kayvon Fatahalian"],"pdf_url":"https://arxiv.org/pdf/2312.11538v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.04848v4","updated":"2024-06-03T14:18:29Z","published":"2023-06-08T00:56:33Z","title":"Interpreting and Improving Diffusion Models from an Optimization\n  Perspective","summary":"  Denoising is intuitively related to projection. Indeed, under the manifold\nhypothesis, adding random noise is approximately equivalent to orthogonal\nperturbation. Hence, learning to denoise is approximately learning to project.\nIn this paper, we use this observation to interpret denoising diffusion models\nas approximate gradient descent applied to the Euclidean distance function. We\nthen provide straight-forward convergence analysis of the DDIM sampler under\nsimple assumptions on the projection error of the denoiser. Finally, we propose\na new gradient-estimation sampler, generalizing DDIM using insights from our\ntheoretical results. In as few as 5-10 function evaluations, our sampler\nachieves state-of-the-art FID scores on pretrained CIFAR-10 and CelebA models\nand can generate high quality samples on latent diffusion models.\n","authors":["Frank Permenter","Chenyang Yuan"],"pdf_url":"https://arxiv.org/pdf/2306.04848v4.pdf","comment":"24 pages, 9 figures, 4 tables. To appear in ICML 2024"},{"id":"http://arxiv.org/abs/2402.08567v2","updated":"2024-06-03T14:15:03Z","published":"2024-02-13T16:06:17Z","title":"Agent Smith: A Single Image Can Jailbreak One Million Multimodal LLM\n  Agents Exponentially Fast","summary":"  A multimodal large language model (MLLM) agent can receive instructions,\ncapture images, retrieve histories from memory, and decide which tools to use.\nNonetheless, red-teaming efforts have revealed that adversarial images/prompts\ncan jailbreak an MLLM and cause unaligned behaviors. In this work, we report an\neven more severe safety issue in multi-agent environments, referred to as\ninfectious jailbreak. It entails the adversary simply jailbreaking a single\nagent, and without any further intervention from the adversary, (almost) all\nagents will become infected exponentially fast and exhibit harmful behaviors.\nTo validate the feasibility of infectious jailbreak, we simulate multi-agent\nenvironments containing up to one million LLaVA-1.5 agents, and employ\nrandomized pair-wise chat as a proof-of-concept instantiation for multi-agent\ninteraction. Our results show that feeding an (infectious) adversarial image\ninto the memory of any randomly chosen agent is sufficient to achieve\ninfectious jailbreak. Finally, we derive a simple principle for determining\nwhether a defense mechanism can provably restrain the spread of infectious\njailbreak, but how to design a practical defense that meets this principle\nremains an open question to investigate. Our project page is available at\nhttps://sail-sg.github.io/Agent-Smith/.\n","authors":["Xiangming Gu","Xiaosen Zheng","Tianyu Pang","Chao Du","Qian Liu","Ye Wang","Jing Jiang","Min Lin"],"pdf_url":"https://arxiv.org/pdf/2402.08567v2.pdf","comment":"ICML 2024"},{"id":"http://arxiv.org/abs/2402.04050v2","updated":"2024-06-03T13:22:12Z","published":"2024-02-06T14:53:19Z","title":"Connecting the Dots: Collaborative Fine-tuning for Black-Box\n  Vision-Language Models","summary":"  With the emergence of pretrained vision-language models (VLMs), considerable\nefforts have been devoted to fine-tuning them for downstream tasks. Despite the\nprogress made in designing efficient fine-tuning methods, such methods require\naccess to the model's parameters, which can be challenging as model owners\noften opt to provide their models as a black box to safeguard model ownership.\nThis paper proposes a \\textbf{C}ollabo\\textbf{ra}tive\n\\textbf{F}ine-\\textbf{T}uning (\\textbf{CraFT}) approach for fine-tuning\nblack-box VLMs to downstream tasks, where one only has access to the input\nprompts and the output predictions of the model. CraFT comprises two modules, a\nprompt generation module for learning text prompts and a prediction refinement\nmodule for enhancing output predictions in residual style. Additionally, we\nintroduce an auxiliary prediction-consistent loss to promote consistent\noptimization across these modules. These modules are optimized by a novel\ncollaborative training algorithm. Extensive experiments on few-shot\nclassification over 15 datasets demonstrate the superiority of CraFT. The\nresults show that CraFT achieves a decent gain of about 12\\% with 16-shot\ndatasets and only 8,000 queries. Moreover, CraFT trains faster and uses only\nabout 1/80 of the memory footprint for deployment, while sacrificing only\n1.62\\% compared to the white-box method. Our code is publicly available at\nhttps://github.com/mrflogs/CraFT .\n","authors":["Zhengbo Wang","Jian Liang","Ran He","Zilei Wang","Tieniu Tan"],"pdf_url":"https://arxiv.org/pdf/2402.04050v2.pdf","comment":"Accepted by ICML 2024"},{"id":"http://arxiv.org/abs/2310.18651v5","updated":"2024-06-03T13:02:54Z","published":"2023-10-28T09:35:30Z","title":"Patch-Wise Self-Supervised Visual Representation Learning: A\n  Fine-Grained Approach","summary":"  Self-supervised visual representation learning traditionally focuses on\nimage-level instance discrimination. Our study introduces an innovative,\nfine-grained dimension by integrating patch-level discrimination into these\nmethodologies. This integration allows for the simultaneous analysis of local\nand global visual features, thereby enriching the quality of the learned\nrepresentations. Initially, the original images undergo spatial augmentation.\nSubsequently, we employ a distinctive photometric patch-level augmentation,\nwhere each patch is individually augmented, independent from other patches\nwithin the same view. This approach generates a diverse training dataset with\ndistinct color variations in each segment. The augmented images are then\nprocessed through a self-distillation learning framework, utilizing the Vision\nTransformer (ViT) as its backbone. The proposed method minimizes the\nrepresentation distances across both image and patch levels to capture details\nfrom macro to micro perspectives. To this end, we present a simple yet\neffective patch-matching algorithm to find the corresponding patches across the\naugmented views. Thanks to the efficient structure of the patch-matching\nalgorithm, our method reduces computational complexity compared to similar\napproaches. Consequently, we achieve an advanced understanding of the model\nwithout adding significant computational requirements. We have extensively\npretrained our method on datasets of varied scales, such as Cifar10,\nImageNet-100, and ImageNet-1K. It demonstrates superior performance over\nstate-of-the-art self-supervised representation learning methods in image\nclassification and downstream tasks, such as copy detection and image\nretrieval. The implementation of our method is accessible on GitHub.\n","authors":["Ali Javidani","Mohammad Amin Sadeghi","Babak Nadjar Araabi"],"pdf_url":"https://arxiv.org/pdf/2310.18651v5.pdf","comment":"15 pages"},{"id":"http://arxiv.org/abs/2303.06458v3","updated":"2024-06-03T12:47:12Z","published":"2023-03-11T17:14:33Z","title":"ZeroNLG: Aligning and Autoencoding Domains for Zero-Shot Multimodal and\n  Multilingual Natural Language Generation","summary":"  Natural Language Generation (NLG) accepts input data in the form of images,\nvideos, or text and generates corresponding natural language text as output.\nExisting NLG methods mainly adopt a supervised approach and rely heavily on\ncoupled data-to-text pairs. However, for many targeted scenarios and for\nnon-English languages, sufficient quantities of labeled data are often not\navailable. To relax the dependency on labeled data of downstream tasks, we\npropose an intuitive and effective zero-shot learning framework, ZeroNLG, which\ncan deal with multiple NLG tasks, including image-to-text (image captioning),\nvideo-to-text (video captioning), and text-to-text (neural machine\ntranslation), across English, Chinese, German, and French within a unified\nframework. ZeroNLG does not require any labeled downstream pairs for training.\nDuring training, ZeroNLG (i) projects different domains (across modalities and\nlanguages) to corresponding coordinates in a shared common latent space; (ii)\nbridges different domains by aligning their corresponding coordinates in this\nspace; and (iii) builds an unsupervised multilingual auto-encoder to learn to\ngenerate text by reconstructing the input text given its coordinate in shared\nlatent space. Consequently, during inference, based on the data-to-text\npipeline, ZeroNLG can generate target sentences across different languages\ngiven the coordinate of input data in the common space. Within this unified\nframework, given visual (imaging or video) data as input, ZeroNLG can perform\nzero-shot visual captioning; given textual sentences as input, ZeroNLG can\nperform zero-shot machine translation. We present the results of extensive\nexperiments on twelve NLG tasks, showing that, without using any labeled\ndownstream pairs for training, ZeroNLG generates high-quality and believable\noutputs and significantly outperforms existing zero-shot methods.\n","authors":["Bang Yang","Fenglin Liu","Yuexian Zou","Xian Wu","Yaowei Wang","David A. Clifton"],"pdf_url":"https://arxiv.org/pdf/2303.06458v3.pdf","comment":"Accepted by TPAMI (Our code and data are available at\n  https://github.com/yangbang18/ZeroNLG)"},{"id":"http://arxiv.org/abs/2403.13341v2","updated":"2024-06-03T12:11:52Z","published":"2024-03-20T06:48:48Z","title":"FissionFusion: Fast Geometric Generation and Hierarchical Souping for\n  Medical Image Analysis","summary":"  The scarcity of well-annotated medical datasets requires leveraging transfer\nlearning from broader datasets like ImageNet or pre-trained models like CLIP.\nModel soups averages multiple fine-tuned models aiming to improve performance\non In-Domain (ID) tasks and enhance robustness against Out-of-Distribution\n(OOD) datasets. However, applying these methods to the medical imaging domain\nfaces challenges and results in suboptimal performance. This is primarily due\nto differences in error surface characteristics that stem from data\ncomplexities such as heterogeneity, domain shift, class imbalance, and\ndistributional shifts between training and testing phases. To address this\nissue, we propose a hierarchical merging approach that involves local and\nglobal aggregation of models at various levels based on models' hyperparameter\nconfigurations. Furthermore, to alleviate the need for training a large number\nof models in the hyperparameter search, we introduce a computationally\nefficient method using a cyclical learning rate scheduler to produce multiple\nmodels for aggregation in the weight space. Our method demonstrates significant\nimprovements over the model souping approach across multiple datasets (around\n6% gain in HAM10000 and CheXpert datasets) while maintaining low computational\ncosts for model generation and selection. Moreover, we achieve better results\non OOD datasets than model soups. The code is available at\nhttps://github.com/BioMedIA-MBZUAI/FissionFusion.\n","authors":["Santosh Sanjeev","Nuren Zhaksylyk","Ibrahim Almakky","Anees Ur Rehman Hashmi","Mohammad Areeb Qazi","Mohammad Yaqub"],"pdf_url":"https://arxiv.org/pdf/2403.13341v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2203.08717v3","updated":"2024-06-03T12:06:06Z","published":"2022-03-16T16:14:19Z","title":"Weak Augmentation Guided Relational Self-Supervised Learning","summary":"  Self-supervised Learning (SSL) including the mainstream contrastive learning\nhas achieved great success in learning visual representations without data\nannotations. However, most methods mainly focus on the instance level\ninformation (\\ie, the different augmented images of the same instance should\nhave the same feature or cluster into the same class), but there is a lack of\nattention on the relationships between different instances. In this paper, we\nintroduce a novel SSL paradigm, which we term as relational self-supervised\nlearning (ReSSL) framework that learns representations by modeling the\nrelationship between different instances. Specifically, our proposed method\nemploys sharpened distribution of pairwise similarities among different\ninstances as \\textit{relation} metric, which is thus utilized to match the\nfeature embeddings of different augmentations. To boost the performance, we\nargue that weak augmentations matter to represent a more reliable relation, and\nleverage momentum strategy for practical efficiency. The designed asymmetric\npredictor head and an InfoNCE warm-up strategy enhance the robustness to\nhyper-parameters and benefit the resulting performance. Experimental results\nshow that our proposed ReSSL substantially outperforms the state-of-the-art\nmethods across different network architectures, including various lightweight\nnetworks (\\eg, EfficientNet and MobileNet).\n","authors":["Mingkai Zheng","Shan You","Fei Wang","Chen Qian","Changshui Zhang","Xiaogang Wang","Chang Xu"],"pdf_url":"https://arxiv.org/pdf/2203.08717v3.pdf","comment":"Extended version of NeurIPS 2021 paper. arXiv admin note: substantial\n  text overlap with arXiv:2107.09282"},{"id":"http://arxiv.org/abs/2311.17425v3","updated":"2024-06-03T11:58:25Z","published":"2023-11-29T07:57:30Z","title":"SpeechAct: Towards Generating Whole-body Motion from Speech","summary":"  This paper addresses the problem of generating whole-body motion from speech.\nDespite great successes, prior methods still struggle to produce reasonable and\ndiverse whole-body motions from speech. This is due to their reliance on\nsuboptimal representations and a lack of strategies for generating diverse\nresults. To address these challenges, we present a novel hybrid point\nrepresentation to achieve accurate and continuous motion generation, e.g.,\navoiding foot skating, and this representation can be transformed into an\neasy-to-use representation, i.e., SMPL-X body mesh, for many applications. To\ngenerate whole-body motion from speech, for facial motion, closely tied to the\naudio signal, we introduce an encoder-decoder architecture to achieve\ndeterministic outcomes. However, for the body and hands, which have weaker\nconnections to the audio signal, we aim to generate diverse yet reasonable\nmotions. To boost diversity in motion generation, we propose a contrastive\nmotion learning method to encourage the model to produce more distinctive\nrepresentations. Specifically, we design a robust VQ-VAE to learn a quantized\nmotion codebook using our hybrid representation. Then, we regress the motion\nrepresentation from the audio signal by a translation model employing our\ncontrastive motion learning method. Experimental results validate the superior\nperformance and the correctness of our model. The project page is available for\nresearch purposes at http://cic.tju.edu.cn/faculty/likun/projects/SpeechAct.\n","authors":["Jinsong Zhang","Minjie Zhu","Yuxiang Zhang","Yebin Liu","Kun Li"],"pdf_url":"https://arxiv.org/pdf/2311.17425v3.pdf","comment":"the manuscript should be revised"},{"id":"http://arxiv.org/abs/2403.06807v2","updated":"2024-06-03T11:33:51Z","published":"2024-03-11T15:26:34Z","title":"Multistep Consistency Models","summary":"  Diffusion models are relatively easy to train but require many steps to\ngenerate samples. Consistency models are far more difficult to train, but\ngenerate samples in a single step.\n  In this paper we propose Multistep Consistency Models: A unification between\nConsistency Models (Song et al., 2023) and TRACT (Berthelot et al., 2023) that\ncan interpolate between a consistency model and a diffusion model: a trade-off\nbetween sampling speed and sampling quality. Specifically, a 1-step consistency\nmodel is a conventional consistency model whereas a $\\infty$-step consistency\nmodel is a diffusion model.\n  Multistep Consistency Models work really well in practice. By increasing the\nsample budget from a single step to 2-8 steps, we can train models more easily\nthat generate higher quality samples, while retaining much of the sampling\nspeed benefits. Notable results are 1.4 FID on Imagenet 64 in 8 step and 2.1\nFID on Imagenet128 in 8 steps with consistency distillation, using simple\nlosses without adversarial training. We also show that our method scales to a\ntext-to-image diffusion model, generating samples that are close to the quality\nof the original model.\n","authors":["Jonathan Heek","Emiel Hoogeboom","Tim Salimans"],"pdf_url":"https://arxiv.org/pdf/2403.06807v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.19996v3","updated":"2024-06-03T11:32:40Z","published":"2024-05-30T12:32:35Z","title":"DP-IQA: Utilizing Diffusion Prior for Blind Image Quality Assessment in\n  the Wild","summary":"  Image quality assessment (IQA) plays a critical role in selecting\nhigh-quality images and guiding compression and enhancement methods in a series\nof applications. The blind IQA, which assesses the quality of in-the-wild\nimages containing complex authentic distortions without reference images, poses\ngreater challenges. Existing methods are limited to modeling a uniform\ndistribution with local patches and are bothered by the gap between low and\nhigh-level visions (caused by widely adopted pre-trained classification\nnetworks). In this paper, we propose a novel IQA method called diffusion\npriors-based IQA (DP-IQA), which leverages the prior knowledge from the\npre-trained diffusion model with its excellent powers to bridge semantic gaps\nin the perception of the visual quality of images. Specifically, we use\npre-trained stable diffusion as the backbone, extract multi-level features from\nthe denoising U-Net during the upsampling process at a specified timestep, and\ndecode them to estimate the image quality score. The text and image adapters\nare adopted to mitigate the domain gap for downstream tasks and correct the\ninformation loss caused by the variational autoencoder bottleneck. Finally, we\ndistill the knowledge in the above model into a CNN-based student model,\nsignificantly reducing the parameter to enhance applicability, with the student\nmodel performing similarly or even better than the teacher model surprisingly.\nExperimental results demonstrate that our DP-IQA achieves state-of-the-art\nresults on various in-the-wild datasets with better generalization capability,\nwhich shows the superiority of our method in global modeling and utilizing the\nhierarchical feature clues of diffusion for evaluating image quality.\n","authors":["Honghao Fu","Yufei Wang","Wenhan Yang","Bihan Wen"],"pdf_url":"https://arxiv.org/pdf/2405.19996v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2302.14431v2","updated":"2024-06-03T11:06:53Z","published":"2023-02-28T09:21:12Z","title":"Efficient Masked Autoencoders with Self-Consistency","summary":"  Inspired by the masked language modeling (MLM) in natural language processing\ntasks, the masked image modeling (MIM) has been recognized as a strong\nself-supervised pre-training method in computer vision. However, the high\nrandom mask ratio of MIM results in two serious problems: 1) the inadequate\ndata utilization of images within each iteration brings prolonged pre-training,\nand 2) the high inconsistency of predictions results in unreliable generations,\n$i.e.$, the prediction of the identical patch may be inconsistent in different\nmask rounds, leading to divergent semantics in the ultimately generated\noutcomes. To tackle these problems, we propose the efficient masked\nautoencoders with self-consistency (EMAE) to improve the pre-training\nefficiency and increase the consistency of MIM. In particular, we present a\nparallel mask strategy that divides the image into K non-overlapping parts,\neach of which is generated by a random mask with the same mask ratio. Then the\nMIM task is conducted parallelly on all parts in an iteration and the model\nminimizes the loss between the predictions and the masked patches. Besides, we\ndesign the self-consistency learning to further maintain the consistency of\npredictions of overlapping masked patches among parts. Overall, our method is\nable to exploit the data more efficiently and obtains reliable representations.\nExperiments on ImageNet show that EMAE achieves the best performance on\nViT-Large with only 13% of MAE pre-training time using NVIDIA A100 GPUs. After\npre-training on diverse datasets, EMAE consistently obtains state-of-the-art\ntransfer ability on a variety of downstream tasks, such as image\nclassification, object detection, and semantic segmentation.\n","authors":["Zhaowen Li","Yousong Zhu","Zhiyang Chen","Wei Li","Chaoyang Zhao","Rui Zhao","Ming Tang","Jinqiao Wang"],"pdf_url":"https://arxiv.org/pdf/2302.14431v2.pdf","comment":"Accept by IEEE Transactions on Pattern Analysis and Machine\n  Intelligence (TPAMI)"},{"id":"http://arxiv.org/abs/2402.02085v3","updated":"2024-06-03T11:00:25Z","published":"2024-02-03T08:52:06Z","title":"DeCoF: Generated Video Detection via Frame Consistency: The First\n  Benchmark Dataset","summary":"  The escalating quality of video generated by advanced video generation\nmethods results in new security challenges, while there have been few relevant\nresearch efforts: 1) There is no open-source dataset for generated video\ndetection, 2) No generated video detection method has been proposed so far. To\nthis end, we propose an open-source dataset and a detection method for\ngenerated video for the first time. First, we propose a scalable dataset\nconsisting of 964 prompts, covering various forgery targets, scenes, behaviors,\nand actions, as well as various generation models with different architectures\nand generation methods, including the most popular commercial models like\nOpenAI's Sora and Google's Veo. Second, we found via probing experiments that\nspatial artifact-based detectors lack generalizability. Hence, we propose a\nsimple yet effective \\textbf{de}tection model based on \\textbf{f}rame\n\\textbf{co}nsistency (\\textbf{DeCoF}), which focuses on temporal artifacts by\neliminating the impact of spatial artifacts during feature learning. Extensive\nexperiments demonstrate the efficacy of DeCoF in detecting videos generated by\nunseen video generation models and confirm its powerful generalizability across\nseveral commercially proprietary models. Our code and dataset will be released\nat \\url{https://anonymous.4open.science/r/DeCoF-8394}.\n","authors":["Long Ma","Jiajia Zhang","Hongping Deng","Ningyu Zhang","Qinglang Guo","Haiyang Yu","Yong Liao","Pengyuan Zhou"],"pdf_url":"https://arxiv.org/pdf/2402.02085v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.13106v2","updated":"2024-06-03T10:44:26Z","published":"2024-04-19T14:43:43Z","title":"Automatic Cranial Defect Reconstruction with Self-Supervised Deep\n  Deformable Masked Autoencoders","summary":"  Thousands of people suffer from cranial injuries every year. They require\npersonalized implants that need to be designed and manufactured before the\nreconstruction surgery. The manual design is expensive and time-consuming\nleading to searching for algorithms whose goal is to automatize the process.\nThe problem can be formulated as volumetric shape completion and solved by deep\nneural networks dedicated to supervised image segmentation. However, such an\napproach requires annotating the ground-truth defects which is costly and\ntime-consuming. Usually, the process is replaced with synthetic defect\ngeneration. However, even the synthetic ground-truth generation is\ntime-consuming and limits the data heterogeneity, thus the deep models'\ngeneralizability. In our work, we propose an alternative and simple approach to\nuse a self-supervised masked autoencoder to solve the problem. This approach by\ndesign increases the heterogeneity of the training set and can be seen as a\nform of data augmentation. We compare the proposed method with several\nstate-of-the-art deep neural networks and show both the quantitative and\nqualitative improvement on the SkullBreak and SkullFix datasets. The proposed\nmethod can be used to efficiently reconstruct the cranial defects in real time.\n","authors":["Marek Wodzinski","Daria Hemmerling","Mateusz Daniol"],"pdf_url":"https://arxiv.org/pdf/2404.13106v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.06116v2","updated":"2024-06-03T08:59:51Z","published":"2024-05-09T21:47:46Z","title":"Rethinking Efficient and Effective Point-based Networks for Event Camera\n  Classification and Regression: EventMamba","summary":"  Event cameras, drawing inspiration from biological systems, efficiently\ndetect changes in ambient light with low latency and high dynamic range while\nconsuming minimal power. The most current approach to processing event data\noften involves converting it into frame-based representations, which is\nwell-established in traditional vision. However, this approach neglects the\nsparsity of event data, loses fine-grained temporal information during the\ntransformation process, and increases the computational burden, making it\nineffective for characterizing event camera properties. In contrast, Point\nCloud is a popular representation for 3D processing and is better suited to\nmatch the sparse and asynchronous nature of the event camera. Nevertheless,\ndespite the theoretical compatibility of point-based methods with event\ncameras, the results show a performance gap that is not yet satisfactory\ncompared to frame-based methods. In order to bridge the performance gap, we\npropose EventMamba, an efficient and effective Point Cloud framework that\nachieves competitive results even compared to the state-of-the-art (SOTA)\nframe-based method in both classification and regression tasks. This notable\naccomplishment is facilitated by our rethinking of the distinction between\nEvent Cloud and Point Cloud, emphasizing effective temporal information\nextraction through optimized network structures. Specifically, EventMamba\nleverages temporal aggregation and State Space Model (SSM) based Mamba boasting\nenhanced temporal information extraction capabilities. Through a hierarchical\nstructure, EventMamba is adept at abstracting local and global spatial features\nand implicit and explicit temporal features. By adhering to the lightweight\ndesign principle, EventMamba delivers impressive results with minimal\ncomputational resource utilization, demonstrating its efficiency and\neffectiveness.\n","authors":["Hongwei Ren","Yue Zhou","Jiadong Zhu","Haotian Fu","Yulong Huang","Xiaopeng Lin","Yuetong Fang","Fei Ma","Hao Yu","Bojun Cheng"],"pdf_url":"https://arxiv.org/pdf/2405.06116v2.pdf","comment":"Extension Journal of TTPOINT and PEPNet"},{"id":"http://arxiv.org/abs/2405.16094v2","updated":"2024-06-03T08:27:09Z","published":"2024-05-25T06:58:20Z","title":"PLUG: Revisiting Amodal Segmentation with Foundation Model and\n  Hierarchical Focus","summary":"  Aiming to predict the complete shapes of partially occluded objects, amodal\nsegmentation is an important step towards visual intelligence. With crucial\nsignificance, practical prior knowledge derives from sufficient training, while\nlimited amodal annotations pose challenges to achieve better performance. To\ntackle this problem, utilizing the mighty priors accumulated in the foundation\nmodel, we propose the first SAM-based amodal segmentation approach, PLUG.\nMethodologically, a novel framework with hierarchical focus is presented to\nbetter adapt the task characteristics and unleash the potential capabilities of\nSAM. In the region level, due to the association and division in visible and\noccluded areas, inmodal and amodal regions are assigned as the focuses of\ndistinct branches to avoid mutual disturbance. In the point level, we introduce\nthe concept of uncertainty to explicitly assist the model in identifying and\nfocusing on ambiguous points. Guided by the uncertainty map, a\ncomputation-economic point loss is applied to improve the accuracy of predicted\nboundaries. Experiments are conducted on several prominent datasets, and the\nresults show that our proposed method outperforms existing methods with large\nmargins. Even with fewer total parameters, our method still exhibits remarkable\nadvantages.\n","authors":["Zhaochen Liu","Limeng Qiao","Xiangxiang Chu","Tingting Jiang"],"pdf_url":"https://arxiv.org/pdf/2405.16094v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.18729v2","updated":"2024-06-03T08:16:22Z","published":"2023-11-30T17:26:33Z","title":"Portrait4D: Learning One-Shot 4D Head Avatar Synthesis using Synthetic\n  Data","summary":"  Existing one-shot 4D head synthesis methods usually learn from monocular\nvideos with the aid of 3DMM reconstruction, yet the latter is evenly\nchallenging which restricts them from reasonable 4D head synthesis. We present\na method to learn one-shot 4D head synthesis via large-scale synthetic data.\nThe key is to first learn a part-wise 4D generative model from monocular images\nvia adversarial learning, to synthesize multi-view images of diverse identities\nand full motions as training data; then leverage a transformer-based animatable\ntriplane reconstructor to learn 4D head reconstruction using the synthetic\ndata. A novel learning strategy is enforced to enhance the generalizability to\nreal images by disentangling the learning process of 3D reconstruction and\nreenactment. Experiments demonstrate our superiority over the prior art.\n","authors":["Yu Deng","Duomin Wang","Xiaohang Ren","Xingyu Chen","Baoyuan Wang"],"pdf_url":"https://arxiv.org/pdf/2311.18729v2.pdf","comment":"CVPR24 camera ready version. Project page:\n  https://yudeng.github.io/Portrait4D/"},{"id":"http://arxiv.org/abs/2402.05443v3","updated":"2024-06-03T08:12:13Z","published":"2024-02-08T06:45:03Z","title":"Scalable Wasserstein Gradient Flow for Generative Modeling through\n  Unbalanced Optimal Transport","summary":"  Wasserstein Gradient Flow (WGF) describes the gradient dynamics of\nprobability density within the Wasserstein space. WGF provides a promising\napproach for conducting optimization over the probability distributions.\nNumerically approximating the continuous WGF requires the time discretization\nmethod. The most well-known method for this is the JKO scheme. In this regard,\nprevious WGF models employ the JKO scheme and parametrize transport map for\neach JKO step. However, this approach results in quadratic training complexity\n$O(K^2)$ with the number of JKO step $K$. This severely limits the scalability\nof WGF models. In this paper, we introduce a scalable WGF-based generative\nmodel, called Semi-dual JKO (S-JKO). Our model is based on the semi-dual form\nof the JKO step, derived from the equivalence between the JKO step and the\nUnbalanced Optimal Transport. Our approach reduces the training complexity to\n$O(K)$. We demonstrate that our model significantly outperforms existing\nWGF-based generative models, achieving FID scores of 2.62 on CIFAR-10 and 5.46\non CelebA-HQ-256, which are comparable to state-of-the-art image generative\nmodels.\n","authors":["Jaemoo Choi","Jaewoong Choi","Myungjoo Kang"],"pdf_url":"https://arxiv.org/pdf/2402.05443v3.pdf","comment":"22 pages, 11 figures"},{"id":"http://arxiv.org/abs/2402.03161v3","updated":"2024-06-03T08:09:09Z","published":"2024-02-05T16:30:49Z","title":"Video-LaVIT: Unified Video-Language Pre-training with Decoupled\n  Visual-Motional Tokenization","summary":"  In light of recent advances in multimodal Large Language Models (LLMs), there\nis increasing attention to scaling them from image-text data to more\ninformative real-world videos. Compared to static images, video poses unique\nchallenges for effective large-scale pre-training due to the modeling of its\nspatiotemporal dynamics. In this paper, we address such limitations in\nvideo-language pre-training with an efficient video decomposition that\nrepresents each video as keyframes and temporal motions. These are then adapted\nto an LLM using well-designed tokenizers that discretize visual and temporal\ninformation as a few tokens, thus enabling unified generative pre-training of\nvideos, images, and text. At inference, the generated tokens from the LLM are\ncarefully recovered to the original continuous pixel space to create various\nvideo content. Our proposed framework is both capable of comprehending and\ngenerating image and video content, as demonstrated by its competitive\nperformance across 13 multimodal benchmarks in image and video understanding\nand generation. Our code and models are available at\nhttps://video-lavit.github.io.\n","authors":["Yang Jin","Zhicheng Sun","Kun Xu","Kun Xu","Liwei Chen","Hao Jiang","Quzhe Huang","Chengru Song","Yuliang Liu","Di Zhang","Yang Song","Kun Gai","Yadong Mu"],"pdf_url":"https://arxiv.org/pdf/2402.03161v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.16506v3","updated":"2024-06-03T08:01:25Z","published":"2024-02-26T11:41:28Z","title":"Stochastic Conditional Diffusion Models for Robust Semantic Image\n  Synthesis","summary":"  Semantic image synthesis (SIS) is a task to generate realistic images\ncorresponding to semantic maps (labels). However, in real-world applications,\nSIS often encounters noisy user inputs. To address this, we propose Stochastic\nConditional Diffusion Model (SCDM), which is a robust conditional diffusion\nmodel that features novel forward and generation processes tailored for SIS\nwith noisy labels. It enhances robustness by stochastically perturbing the\nsemantic label maps through Label Diffusion, which diffuses the labels with\ndiscrete diffusion. Through the diffusion of labels, the noisy and clean\nsemantic maps become similar as the timestep increases, eventually becoming\nidentical at $t=T$. This facilitates the generation of an image close to a\nclean image, enabling robust generation. Furthermore, we propose a class-wise\nnoise schedule to differentially diffuse the labels depending on the class. We\ndemonstrate that the proposed method generates high-quality samples through\nextensive experiments and analyses on benchmark datasets, including a novel\nexperimental setup simulating human errors during real-world applications. Code\nis available at https://github.com/mlvlab/SCDM.\n","authors":["Juyeon Ko","Inho Kong","Dogyun Park","Hyunwoo J. Kim"],"pdf_url":"https://arxiv.org/pdf/2402.16506v3.pdf","comment":"ICML 2024"},{"id":"http://arxiv.org/abs/2305.08389v2","updated":"2024-06-03T07:47:36Z","published":"2023-05-15T07:12:19Z","title":"Edit As You Wish: Video Caption Editing with Multi-grained User Control","summary":"  Automatically narrating videos in natural language complying with user\nrequests, i.e. Controllable Video Captioning task, can help people manage\nmassive videos with desired intentions. However, existing works suffer from two\nshortcomings: 1) the control signal is single-grained which can not satisfy\ndiverse user intentions; 2) the video description is generated in a single\nround which can not be further edited to meet dynamic needs. In this paper, we\npropose a novel \\textbf{V}ideo \\textbf{C}aption \\textbf{E}diting \\textbf{(VCE)}\ntask to automatically revise an existing video description guided by\nmulti-grained user requests. Inspired by human writing-revision habits, we\ndesign the user command as a pivotal triplet \\{\\textit{operation, position,\nattribute}\\} to cover diverse user needs from coarse-grained to fine-grained.\nTo facilitate the VCE task, we \\textit{automatically} construct an open-domain\nbenchmark dataset named VATEX-EDIT and \\textit{manually} collect an e-commerce\ndataset called EMMAD-EDIT. We further propose a specialized small-scale model\n(i.e., OPA) compared with two generalist Large Multi-modal Models to perform an\nexhaustive analysis of the novel task. For evaluation, we adopt comprehensive\nmetrics considering caption fluency, command-caption consistency, and\nvideo-caption alignment. Experiments reveal the task challenges of fine-grained\nmulti-modal semantics understanding and processing. Our datasets, codes, and\nevaluation tools are ready to be open-sourced.\n","authors":["Linli Yao","Yuanmeng Zhang","Ziheng Wang","Xinglin Hou","Tiezheng Ge","Yuning Jiang","Xu Sun","Qin Jin"],"pdf_url":"https://arxiv.org/pdf/2305.08389v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.11549v2","updated":"2024-06-03T07:45:33Z","published":"2024-03-18T08:00:23Z","title":"Boosting Continual Learning of Vision-Language Models via\n  Mixture-of-Experts Adapters","summary":"  Continual learning can empower vision-language models to continuously acquire\nnew knowledge, without the need for access to the entire historical dataset.\nHowever, mitigating the performance degradation in large-scale models is\nnon-trivial due to (i) parameter shifts throughout lifelong learning and (ii)\nsignificant computational burdens associated with full-model tuning. In this\nwork, we present a parameter-efficient continual learning framework to\nalleviate long-term forgetting in incremental learning with vision-language\nmodels. Our approach involves the dynamic expansion of a pre-trained CLIP\nmodel, through the integration of Mixture-of-Experts (MoE) adapters in response\nto new tasks. To preserve the zero-shot recognition capability of\nvision-language models, we further introduce a Distribution Discriminative\nAuto-Selector (DDAS) that automatically routes in-distribution and\nout-of-distribution inputs to the MoE Adapter and the original CLIP,\nrespectively. Through extensive experiments across various settings, our\nproposed method consistently outperforms previous state-of-the-art approaches\nwhile concurrently reducing parameter training burdens by 60%. Our code locates\nat https://github.com/JiazuoYu/MoE-Adapters4CL\n","authors":["Jiazuo Yu","Yunzhi Zhuge","Lu Zhang","Ping Hu","Dong Wang","Huchuan Lu","You He"],"pdf_url":"https://arxiv.org/pdf/2403.11549v2.pdf","comment":"This work is accepted by CVPR2024. More modifications may be\n  performed"},{"id":"http://arxiv.org/abs/2403.05874v2","updated":"2024-06-03T07:37:23Z","published":"2024-03-09T10:53:11Z","title":"SPAFormer: Sequential 3D Part Assembly with Transformers","summary":"  We introduce SPAFormer, an innovative model designed to overcome the\ncombinatorial explosion challenge in the 3D Part Assembly (3D-PA) task. This\ntask requires accurate prediction of each part's pose and shape in sequential\nsteps, and as the number of parts increases, the possible assembly combinations\nincrease exponentially, leading to a combinatorial explosion that severely\nhinders the efficacy of 3D-PA. SPAFormer addresses this problem by leveraging\nweak constraints from assembly sequences, effectively reducing the solution\nspace's complexity. Since assembly part sequences convey construction rules\nsimilar to sentences being structured through words, our model explores both\nparallel and autoregressive generation. It further enhances assembly through\nknowledge enhancement strategies that utilize the attributes of parts and their\nsequence information, enabling it to capture the inherent assembly pattern and\nrelationships among sequentially ordered parts. We also construct a more\nchallenging benchmark named PartNet-Assembly covering 21 varied categories to\nmore comprehensively validate the effectiveness of SPAFormer. Extensive\nexperiments demonstrate the superior generalization capabilities of SPAFormer,\nparticularly with multi-tasking and in scenarios requiring long-horizon\nassembly. Codes and model weights will be released at\nhttps://github.com/xuboshen/SPAFormer.\n","authors":["Boshen Xu","Sipeng Zheng","Qin Jin"],"pdf_url":"https://arxiv.org/pdf/2403.05874v2.pdf","comment":"Code: https://github.com/xuboshen/SPAFormer"},{"id":"http://arxiv.org/abs/2405.17719v2","updated":"2024-06-03T07:29:18Z","published":"2024-05-28T00:27:29Z","title":"EgoNCE++: Do Egocentric Video-Language Models Really Understand\n  Hand-Object Interactions?","summary":"  Egocentric video-language pretraining is a crucial paradigm to advance the\nlearning of egocentric hand-object interactions (EgoHOI). Despite the great\nsuccess on existing testbeds, these benchmarks focus more on closed-set visual\nconcepts or limited scenarios. Due to the occurrence of diverse EgoHOIs in the\nreal world, we propose an open-vocabulary benchmark named EgoHOIBench to reveal\nthe diminished performance of current egocentric video-language models (EgoVLM)\non fined-grained concepts, indicating that these models still lack a full\nspectrum of egocentric understanding. We attribute this performance gap to\ninsufficient fine-grained supervision and strong bias towards understanding\nobjects rather than temporal dynamics in current methods. To tackle these\nissues, we introduce a novel asymmetric contrastive objective for EgoHOI named\nEgoNCE++. For video-to-text loss, we enhance text supervision through the\ngeneration of negative captions by leveraging the in-context learning of large\nlanguage models to perform HOI-related word substitution. For text-to-video\nloss, we propose an object-centric positive video sampling strategy that\naggregates video representations by the same nouns. Our extensive experiments\ndemonstrate that EgoNCE++ significantly boosts open-vocabulary HOI recognition,\nmulti-instance retrieval, and action recognition tasks across various\negocentric models, with improvements of up to +26.55%. Our code is available at\nhttps://github.com/xuboshen/EgoNCEpp.\n","authors":["Boshen Xu","Ziheng Wang","Yang Du","Zhinan Song","Sipeng Zheng","Qin Jin"],"pdf_url":"https://arxiv.org/pdf/2405.17719v2.pdf","comment":"Code: https://github.com/xuboshen/EgoNCEpp"},{"id":"http://arxiv.org/abs/2402.09353v5","updated":"2024-06-03T07:27:15Z","published":"2024-02-14T17:59:34Z","title":"DoRA: Weight-Decomposed Low-Rank Adaptation","summary":"  Among the widely used parameter-efficient fine-tuning (PEFT) methods, LoRA\nand its variants have gained considerable popularity because of avoiding\nadditional inference costs. However, there still often exists an accuracy gap\nbetween these methods and full fine-tuning (FT). In this work, we first\nintroduce a novel weight decomposition analysis to investigate the inherent\ndifferences between FT and LoRA. Aiming to resemble the learning capacity of FT\nfrom the findings, we propose Weight-Decomposed Low-Rank Adaptation (DoRA).\nDoRA decomposes the pre-trained weight into two components, magnitude and\ndirection, for fine-tuning, specifically employing LoRA for directional updates\nto efficiently minimize the number of trainable parameters. By employing \\ours,\nwe enhance both the learning capacity and training stability of LoRA while\navoiding any additional inference overhead. \\ours~consistently outperforms LoRA\non fine-tuning LLaMA, LLaVA, and VL-BART on various downstream tasks, such as\ncommonsense reasoning, visual instruction tuning, and image/video-text\nunderstanding. Code is available at https://github.com/NVlabs/DoRA.\n","authors":["Shih-Yang Liu","Chien-Yi Wang","Hongxu Yin","Pavlo Molchanov","Yu-Chiang Frank Wang","Kwang-Ting Cheng","Min-Hung Chen"],"pdf_url":"https://arxiv.org/pdf/2402.09353v5.pdf","comment":"Code available at https://github.com/NVlabs/DoRA"},{"id":"http://arxiv.org/abs/2404.18706v2","updated":"2024-06-03T07:19:35Z","published":"2024-04-29T13:57:02Z","title":"The Socface Project: Large-Scale Collection, Processing, and Analysis of\n  a Century of French Censuses","summary":"  This paper presents a complete processing workflow for extracting information\nfrom French census lists from 1836 to 1936. These lists contain information\nabout individuals living in France and their households. We aim at extracting\nall the information contained in these tables using automatic handwritten table\nrecognition. At the end of the Socface project, in which our work is taking\nplace, the extracted information will be redistributed to the departmental\narchives, and the nominative lists will be freely available to the public,\nallowing anyone to browse hundreds of millions of records. The extracted data\nwill be used by demographers to analyze social change over time, significantly\nimproving our understanding of French economic and social structures. For this\nproject, we developed a complete processing workflow: large-scale data\ncollection from French departmental archives, collaborative annotation of\ndocuments, training of handwritten table text and structure recognition models,\nand mass processing of millions of images. We present the tools we have\ndeveloped to easily collect and process millions of pages. We also show that it\nis possible to process such a wide variety of tables with a single table\nrecognition model that uses the image of the entire page to recognize\ninformation about individuals, categorize them and automatically group them\ninto households. The entire process has been successfully used to process the\ndocuments of a departmental archive, representing more than 450,000 images.\n","authors":["Mélodie Boillet","Solène Tarride","Manon Blanco","Valentin Rigal","Yoann Schneider","Bastien Abadie","Lionel Kesztenbaum","Christopher Kermorvant"],"pdf_url":"https://arxiv.org/pdf/2404.18706v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.00376v2","updated":"2024-06-03T07:09:39Z","published":"2024-03-01T09:01:53Z","title":"Spurious Feature Eraser: Stabilizing Test-Time Adaptation for\n  Vision-Language Foundation Model","summary":"  Vision-language foundation models have exhibited remarkable success across a\nmultitude of downstream tasks due to their scalability on extensive image-text\npaired data. However, these models also display significant limitations when\napplied to downstream tasks, such as fine-grained image classification, as a\nresult of ``decision shortcuts'' that hinder their generalization capabilities.\nIn this work, we find that the CLIP model possesses a rich set of features,\nencompassing both \\textit{desired invariant causal features} and\n\\textit{undesired decision shortcuts}. Moreover, the underperformance of CLIP\non downstream tasks originates from its inability to effectively utilize\npre-trained features in accordance with specific task requirements. To address\nthis challenge, we propose a simple yet effective method, Spurious Feature\nEraser (SEraser), to alleviate the decision shortcuts by erasing the spurious\nfeatures. Specifically, we introduce a test-time prompt tuning paradigm that\noptimizes a learnable prompt, thereby compelling the model to exploit invariant\nfeatures while disregarding decision shortcuts during the inference phase. The\nproposed method effectively alleviates excessive dependence on potentially\nmisleading spurious information. We conduct comparative analysis of the\nproposed method against various approaches which validates the significant\nsuperiority.\n","authors":["Huan Ma","Yan Zhu","Changqing Zhang","Peilin Zhao","Baoyuan Wu","Long-Kai Huang","Qinghua Hu","Bingzhe Wu"],"pdf_url":"https://arxiv.org/pdf/2403.00376v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.20299v3","updated":"2024-06-03T06:36:22Z","published":"2024-05-30T17:46:23Z","title":"Scaling White-Box Transformers for Vision","summary":"  CRATE, a white-box transformer architecture designed to learn compressed and\nsparse representations, offers an intriguing alternative to standard vision\ntransformers (ViTs) due to its inherent mathematical interpretability. Despite\nextensive investigations into the scaling behaviors of language and vision\ntransformers, the scalability of CRATE remains an open question which this\npaper aims to address. Specifically, we propose CRATE-$\\alpha$, featuring\nstrategic yet minimal modifications to the sparse coding block in the CRATE\narchitecture design, and a light training recipe designed to improve the\nscalability of CRATE. Through extensive experiments, we demonstrate that\nCRATE-$\\alpha$ can effectively scale with larger model sizes and datasets. For\nexample, our CRATE-$\\alpha$-B substantially outperforms the prior best CRATE-B\nmodel accuracy on ImageNet classification by 3.7%, achieving an accuracy of\n83.2%. Meanwhile, when scaling further, our CRATE-$\\alpha$-L obtains an\nImageNet classification accuracy of 85.1%. More notably, these model\nperformance improvements are achieved while preserving, and potentially even\nenhancing the interpretability of learned CRATE models, as we demonstrate\nthrough showing that the learned token representations of increasingly larger\ntrained CRATE-$\\alpha$ models yield increasingly higher-quality unsupervised\nobject segmentation of images. The project page is\nhttps://rayjryang.github.io/CRATE-alpha/.\n","authors":["Jinrui Yang","Xianhang Li","Druv Pai","Yuyin Zhou","Yi Ma","Yaodong Yu","Cihang Xie"],"pdf_url":"https://arxiv.org/pdf/2405.20299v3.pdf","comment":"project page: https://rayjryang.github.io/CRATE-alpha/"},{"id":"http://arxiv.org/abs/2404.03653v2","updated":"2024-06-03T06:02:34Z","published":"2024-04-04T17:59:46Z","title":"CoMat: Aligning Text-to-Image Diffusion Model with Image-to-Text Concept\n  Matching","summary":"  Diffusion models have demonstrated great success in the field of\ntext-to-image generation. However, alleviating the misalignment between the\ntext prompts and images is still challenging. The root reason behind the\nmisalignment has not been extensively investigated. We observe that the\nmisalignment is caused by inadequate token attention activation. We further\nattribute this phenomenon to the diffusion model's insufficient condition\nutilization, which is caused by its training paradigm. To address the issue, we\npropose CoMat, an end-to-end diffusion model fine-tuning strategy with an\nimage-to-text concept matching mechanism. We leverage an image captioning model\nto measure image-to-text alignment and guide the diffusion model to revisit\nignored tokens. A novel attribute concentration module is also proposed to\naddress the attribute binding problem. Without any image or human preference\ndata, we use only 20K text prompts to fine-tune SDXL to obtain CoMat-SDXL.\nExtensive experiments show that CoMat-SDXL significantly outperforms the\nbaseline model SDXL in two text-to-image alignment benchmarks and achieves\nstart-of-the-art performance.\n","authors":["Dongzhi Jiang","Guanglu Song","Xiaoshi Wu","Renrui Zhang","Dazhong Shen","Zhuofan Zong","Yu Liu","Hongsheng Li"],"pdf_url":"https://arxiv.org/pdf/2404.03653v2.pdf","comment":"Project Page: https://caraj7.github.io/comat"},{"id":"http://arxiv.org/abs/2405.20786v2","updated":"2024-06-03T05:36:47Z","published":"2024-05-30T06:25:42Z","title":"Stratified Avatar Generation from Sparse Observations","summary":"  Estimating 3D full-body avatars from AR/VR devices is essential for creating\nimmersive experiences in AR/VR applications. This task is challenging due to\nthe limited input from Head Mounted Devices, which capture only sparse\nobservations from the head and hands. Predicting the full-body avatars,\nparticularly the lower body, from these sparse observations presents\nsignificant difficulties. In this paper, we are inspired by the inherent\nproperty of the kinematic tree defined in the Skinned Multi-Person Linear\n(SMPL) model, where the upper body and lower body share only one common\nancestor node, bringing the potential of decoupled reconstruction. We propose a\nstratified approach to decouple the conventional full-body avatar\nreconstruction pipeline into two stages, with the reconstruction of the upper\nbody first and a subsequent reconstruction of the lower body conditioned on the\nprevious stage. To implement this straightforward idea, we leverage the latent\ndiffusion model as a powerful probabilistic generator, and train it to follow\nthe latent distribution of decoupled motions explored by a VQ-VAE\nencoder-decoder model. Extensive experiments on AMASS mocap dataset demonstrate\nour state-of-the-art performance in the reconstruction of full-body motions.\n","authors":["Han Feng","Wenchao Ma","Quankai Gao","Xianwei Zheng","Nan Xue","Huijuan Xu"],"pdf_url":"https://arxiv.org/pdf/2405.20786v2.pdf","comment":"Accepted by CVPR 2024 (Oral)"},{"id":"http://arxiv.org/abs/2306.08964v2","updated":"2024-06-03T04:50:18Z","published":"2023-06-15T08:56:58Z","title":"Exploring Multi-Timestep Multi-Stage Diffusion Features for\n  Hyperspectral Image Classification","summary":"  The effectiveness of spectral-spatial feature learning is crucial for the\nhyperspectral image (HSI) classification task. Diffusion models, as a new class\nof groundbreaking generative models, have the ability to learn both contextual\nsemantics and textual details from the distinct timestep dimension, enabling\nthe modeling of complex spectral-spatial relations in HSIs. However, existing\ndiffusion-based HSI classification methods only utilize manually selected\nsingle-timestep single-stage features, limiting the full exploration and\nexploitation of rich contextual semantics and textual information hidden in the\ndiffusion model. To address this issue, we propose a novel diffusion-based\nfeature learning framework that explores Multi-Timestep Multi-Stage Diffusion\nfeatures for HSI classification for the first time, called MTMSD. Specifically,\nthe diffusion model is first pretrained with unlabeled HSI patches to mine the\nconnotation of unlabeled data, and then is used to extract the multi-timestep\nmulti-stage diffusion features. To effectively and efficiently leverage\nmulti-timestep multi-stage features,two strategies are further developed. One\nstrategy is class & timestep-oriented multi-stage feature purification module\nwith the inter-class and inter-timestep prior for reducing the redundancy of\nmulti-stage features and alleviating memory constraints. The other one is\nselective timestep feature fusion module with the guidance of global features\nto adaptively select different timestep features for integrating texture and\nsemantics. Both strategies facilitate the generality and adaptability of the\nMTMSD framework for diverse patterns of different HSI data. Extensive\nexperiments are conducted on four public HSI datasets, and the results\ndemonstrate that our method outperforms state-of-the-art methods for HSI\nclassification, especially on the challenging Houston 2018 dataset.\n","authors":["Jingyi Zhou","Jiamu Sheng","Jiayuan Fan","Peng Ye","Tong He","Bin Wang","Tao Chen"],"pdf_url":"https://arxiv.org/pdf/2306.08964v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.05234v2","updated":"2024-06-03T04:39:51Z","published":"2024-03-08T11:48:44Z","title":"Benchmarking Micro-action Recognition: Dataset, Methods, and\n  Applications","summary":"  Micro-action is an imperceptible non-verbal behaviour characterised by\nlow-intensity movement. It offers insights into the feelings and intentions of\nindividuals and is important for human-oriented applications such as emotion\nrecognition and psychological assessment. However, the identification,\ndifferentiation, and understanding of micro-actions pose challenges due to the\nimperceptible and inaccessible nature of these subtle human behaviors in\neveryday life. In this study, we innovatively collect a new micro-action\ndataset designated as Micro-action-52 (MA-52), and propose a benchmark named\nmicro-action network (MANet) for micro-action recognition (MAR) task. Uniquely,\nMA-52 provides the whole-body perspective including gestures, upper- and\nlower-limb movements, attempting to reveal comprehensive micro-action cues. In\ndetail, MA-52 contains 52 micro-action categories along with seven body part\nlabels, and encompasses a full array of realistic and natural micro-actions,\naccounting for 205 participants and 22,422 video instances collated from the\npsychological interviews. Based on the proposed dataset, we assess MANet and\nother nine prevalent action recognition methods. MANet incorporates squeeze-and\nexcitation (SE) and temporal shift module (TSM) into the ResNet architecture\nfor modeling the spatiotemporal characteristics of micro-actions. Then a\njoint-embedding loss is designed for semantic matching between video and action\nlabels; the loss is used to better distinguish between visually similar yet\ndistinct micro-action categories. The extended application in emotion\nrecognition has demonstrated one of the important values of our proposed\ndataset and method. In the future, further exploration of human behaviour,\nemotion, and psychological assessment will be conducted in depth. The dataset\nand source code are released at https://github.com/VUT-HFUT/Micro-Action.\n","authors":["Dan Guo","Kun Li","Bin Hu","Yan Zhang","Meng Wang"],"pdf_url":"https://arxiv.org/pdf/2403.05234v2.pdf","comment":"Accepted by IEEE Transactions on Circuits and Systems for Video\n  Technology"},{"id":"http://arxiv.org/abs/2405.20881v2","updated":"2024-06-03T04:38:42Z","published":"2024-05-31T14:55:31Z","title":"S4Fusion: Saliency-aware Selective State Space Model for Infrared\n  Visible Image Fusion","summary":"  As one of the tasks in Image Fusion, Infrared and Visible Image Fusion aims\nto integrate complementary information captured by sensors of different\nmodalities into a single image. The Selective State Space Model (SSSM), known\nfor its ability to capture long-range dependencies, has demonstrated its\npotential in the field of computer vision. However, in image fusion, current\nmethods underestimate the potential of SSSM in capturing the global spatial\ninformation of both modalities. This limitation prevents the simultaneous\nconsideration of the global spatial information from both modalities during\ninteraction, leading to a lack of comprehensive perception of salient targets.\nConsequently, the fusion results tend to bias towards one modality instead of\nadaptively preserving salient targets. To address this issue, we propose the\nSaliency-aware Selective State Space Fusion Model (S4Fusion). In our S4Fusion,\nthe designed Cross-Modal Spatial Awareness Module (CMSA) can simultaneously\nfocus on global spatial information from both modalities while facilitating\ntheir interaction, thereby comprehensively capturing complementary information.\nAdditionally, S4Fusion leverages a pre-trained network to perceive uncertainty\nin the fused images. By minimizing this uncertainty, S4Fusion adaptively\nhighlights salient targets from both images. Extensive experiments demonstrate\nthat our approach produces high-quality images and enhances performance in\ndownstream tasks.\n","authors":["Haolong Ma","Hui Li","Chunyang Cheng","Gaoang Wang","Xiaoning Song","Xiaojun Wu"],"pdf_url":"https://arxiv.org/pdf/2405.20881v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.07586v5","updated":"2024-06-03T04:17:49Z","published":"2023-12-11T02:40:40Z","title":"Characteristic Guidance: Non-linear Correction for Diffusion Model at\n  Large Guidance Scale","summary":"  Popular guidance for denoising diffusion probabilistic model (DDPM) linearly\ncombines distinct conditional models together to provide enhanced control over\nsamples. However, this approach overlooks nonlinear effects that become\nsignificant when guidance scale is large. To address this issue, we propose\ncharacteristic guidance, a guidance method that provides first-principle\nnon-linear correction for classifier-free guidance. Such correction forces the\nguided DDPMs to respect the Fokker-Planck (FP) equation of diffusion process,\nin a way that is training-free and compatible with existing sampling methods.\nExperiments show that characteristic guidance enhances semantic characteristics\nof prompts and mitigate irregularities in image generation, proving effective\nin diverse applications ranging from simulating magnet phase transitions to\nlatent space sampling.\n","authors":["Candi Zheng","Yuan Lan"],"pdf_url":"https://arxiv.org/pdf/2312.07586v5.pdf","comment":"8 pages, 7 figures"},{"id":"http://arxiv.org/abs/2403.00476v3","updated":"2024-06-03T04:13:39Z","published":"2024-03-01T12:02:19Z","title":"TempCompass: Do Video LLMs Really Understand Videos?","summary":"  Recently, there is a surge in interest surrounding video large language\nmodels (Video LLMs). However, existing benchmarks fail to provide a\ncomprehensive feedback on the temporal perception ability of Video LLMs. On the\none hand, most of them are unable to distinguish between different temporal\naspects (e.g., speed, direction) and thus cannot reflect the nuanced\nperformance on these specific aspects. On the other hand, they are limited in\nthe diversity of task formats (e.g., only multi-choice QA), which hinders the\nunderstanding of how temporal perception performance may vary across different\ntypes of tasks. Motivated by these two problems, we propose the\n\\textbf{TempCompass} benchmark, which introduces a diversity of temporal\naspects and task formats. To collect high-quality test data, we devise two\nnovel strategies: (1) In video collection, we construct conflicting videos that\nshare the same static content but differ in a specific temporal aspect, which\nprevents Video LLMs from leveraging single-frame bias or language priors. (2)\nTo collect the task instructions, we propose a paradigm where humans first\nannotate meta-information for a video and then an LLM generates the\ninstruction. We also design an LLM-based approach to automatically and\naccurately evaluate the responses from Video LLMs. Based on TempCompass, we\ncomprehensively evaluate 8 state-of-the-art (SOTA) Video LLMs and 3 Image LLMs,\nand reveal the discerning fact that these models exhibit notably poor temporal\nperception ability. Our data will be available at\nhttps://github.com/llyx97/TempCompass.\n","authors":["Yuanxin Liu","Shicheng Li","Yi Liu","Yuxiang Wang","Shuhuai Ren","Lei Li","Sishuo Chen","Xu Sun","Lu Hou"],"pdf_url":"https://arxiv.org/pdf/2403.00476v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.07847v2","updated":"2024-06-03T04:02:52Z","published":"2024-04-11T15:42:53Z","title":"The Effectiveness of a Simplified Model Structure for Crowd Counting","summary":"  In the field of crowd counting research, many recent deep learning based\nmethods have demonstrated robust capabilities for accurately estimating crowd\nsizes. However, the enhancement in their performance often arises from an\nincrease in the complexity of the model structure. This paper discusses how to\nconstruct high-performance crowd counting models using only simple structures.\nWe proposes the Fuss-Free Network (FFNet) that is characterized by its simple\nand efficieny structure, consisting of only a backbone network and a\nmulti-scale feature fusion structure. The multi-scale feature fusion structure\nis a simple structure consisting of three branches, each only equipped with a\nfocus transition module, and combines the features from these branches through\nthe concatenation operation. Our proposed crowd counting model is trained and\nevaluated on four widely used public datasets, and it achieves accuracy that is\ncomparable to that of existing complex models. Furthermore, we conduct a\ncomprehensive evaluation by replacing the existing backbones of various models\nsuch as FFNet and CCTrans with different networks, including MobileNet-v3,\nConvNeXt-Tiny, and Swin-Transformer-Small. The experimental results further\nindicate that excellent crowd counting performance can be achieved with the\nsimplied structure proposed by us.\n","authors":["Lei Chen","Xinghang Gao","Fei Chao","Chih Min Lin","Xingen Gao","Hongyi Zhang","Juqiang Lin"],"pdf_url":"https://arxiv.org/pdf/2404.07847v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.11973v4","updated":"2024-06-03T03:51:38Z","published":"2023-12-19T09:11:49Z","title":"Continual Learning: Forget-free Winning Subnetworks for Video\n  Representations","summary":"  Inspired by the Lottery Ticket Hypothesis (LTH), which highlights the\nexistence of efficient subnetworks within larger, dense networks, a\nhigh-performing Winning Subnetwork (WSN) in terms of task performance under\nappropriate sparsity conditions is considered for various continual learning\ntasks. It leverages pre-existing weights from dense networks to achieve\nefficient learning in Task Incremental Learning (TIL) and Task-agnostic\nIncremental Learning (TaIL) scenarios. In Few-Shot Class Incremental Learning\n(FSCIL), a variation of WSN referred to as the Soft subnetwork (SoftNet) is\ndesigned to prevent overfitting when the data samples are scarce. Furthermore,\nthe sparse reuse of WSN weights is considered for Video Incremental Learning\n(VIL). The use of Fourier Subneural Operator (FSO) within WSN is considered. It\nenables compact encoding of videos and identifies reusable subnetworks across\nvarying bandwidths. We have integrated FSO into different architectural\nframeworks for continual learning, including VIL, TIL, and FSCIL. Our\ncomprehensive experiments demonstrate FSO's effectiveness, significantly\nimproving task performance at various convolutional representational levels.\nSpecifically, FSO enhances higher-layer performance in TIL and FSCIL and\nlower-layer performance in VIL.\n","authors":["Haeyong Kang","Jaehong Yoon","Sung Ju Hwang","Chang D. Yoo"],"pdf_url":"https://arxiv.org/pdf/2312.11973v4.pdf","comment":"arXiv admin note: substantial text overlap with arXiv:2303.14962,\n  arXiv:2306.11305"},{"id":"http://arxiv.org/abs/2312.12754v2","updated":"2024-06-03T03:17:01Z","published":"2023-12-20T04:27:13Z","title":"Spectral Prompt Tuning:Unveiling Unseen Classes for Zero-Shot Semantic\n  Segmentation","summary":"  Recently, CLIP has found practical utility in the domain of pixel-level\nzero-shot segmentation tasks. The present landscape features two-stage\nmethodologies beset by issues such as intricate pipelines and elevated\ncomputational costs. While current one-stage approaches alleviate these\nconcerns and incorporate Visual Prompt Training (VPT) to uphold CLIP's\ngeneralization capacity, they still fall short in fully harnessing CLIP's\npotential for pixel-level unseen class demarcation and precise pixel\npredictions. To further stimulate CLIP's zero-shot dense prediction capability,\nwe propose SPT-SEG, a one-stage approach that improves CLIP's adaptability from\nimage to pixel. Specifically, we initially introduce Spectral Prompt Tuning\n(SPT), incorporating spectral prompts into the CLIP visual encoder's shallow\nlayers to capture structural intricacies of images, thereby enhancing\ncomprehension of unseen classes. Subsequently, we introduce the Spectral Guided\nDecoder (SGD), utilizing both high and low-frequency information to steer the\nnetwork's spatial focus towards more prominent classification features,\nenabling precise pixel-level prediction outcomes. Through extensive experiments\non two public datasets, we demonstrate the superiority of our method over\nstate-of-the-art approaches, performing well across all classes and\nparticularly excelling in handling unseen classes. Code is available\nat:https://github.com/clearxu/SPT.\n","authors":["Wenhao Xu","Rongtao Xu","Changwei Wang","Shibiao Xu","Li Guo","Man Zhang","Xiaopeng Zhang"],"pdf_url":"https://arxiv.org/pdf/2312.12754v2.pdf","comment":"AAAI2024 Accepted"},{"id":"http://arxiv.org/abs/2403.02998v2","updated":"2024-06-03T03:04:42Z","published":"2024-03-04T11:23:40Z","title":"Towards Calibrated Deep Clustering Network","summary":"  Deep clustering has exhibited remarkable performance; however, the\nover-confidence problem, i.e., the estimated confidence for a sample belonging\nto a particular cluster greatly exceeds its actual prediction accuracy, has\nbeen overlooked in prior research. To tackle this critical issue, we pioneer\nthe development of a calibrated deep clustering framework. Specifically, we\npropose a novel dual-head (calibration head and clustering head) deep\nclustering model that can effectively calibrate the estimated confidence and\nthe actual accuracy. The calibration head adjusts the overconfident predictions\nof the clustering head, generating prediction confidence that match the model\nlearning status. Then, the clustering head dynamically select reliable\nhigh-confidence samples estimated by the calibration head for pseudo-label\nself-training. Additionally, we introduce an effective network initialization\nstrategy that enhances both training speed and network robustness. The\neffectiveness of the proposed calibration approach and initialization strategy\nare both endorsed with solid theoretical guarantees. Extensive experiments\ndemonstrate the proposed calibrated deep clustering model not only surpasses\nstate-of-the-art deep clustering methods by 10 times in terms of expected\ncalibration error but also significantly outperforms them in terms of\nclustering accuracy.\n","authors":["Yuheng Jia","Jianhong Cheng","Hui Liu","Junhui Hou"],"pdf_url":"https://arxiv.org/pdf/2403.02998v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.11473v2","updated":"2024-06-03T03:04:12Z","published":"2024-05-19T07:48:41Z","title":"FIFO-Diffusion: Generating Infinite Videos from Text without Training","summary":"  We propose a novel inference technique based on a pretrained diffusion model\nfor text-conditional video generation. Our approach, called FIFO-Diffusion, is\nconceptually capable of generating infinitely long videos without additional\ntraining. This is achieved by iteratively performing diagonal denoising, which\nconcurrently processes a series of consecutive frames with increasing noise\nlevels in a queue; our method dequeues a fully denoised frame at the head while\nenqueuing a new random noise frame at the tail. However, diagonal denoising is\na double-edged sword as the frames near the tail can take advantage of cleaner\nones by forward reference but such a strategy induces the discrepancy between\ntraining and inference. Hence, we introduce latent partitioning to reduce the\ntraining-inference gap and lookahead denoising to leverage the benefit of\nforward referencing. Practically, FIFO-Diffusion consumes a constant amount of\nmemory regardless of the target video length given a baseline model, while\nwell-suited for parallel inference on multiple GPUs. We have demonstrated the\npromising results and effectiveness of the proposed methods on existing\ntext-to-video generation baselines. Generated video samples and source codes\nare available at our project page.\n","authors":["Jihwan Kim","Junoh Kang","Jinyoung Choi","Bohyung Han"],"pdf_url":"https://arxiv.org/pdf/2405.11473v2.pdf","comment":"Project Page: https://jjihwan.github.io/projects/FIFO-Diffusion"},{"id":"http://arxiv.org/abs/2405.21013v2","updated":"2024-06-03T02:43:16Z","published":"2024-05-31T16:55:04Z","title":"StrucTexTv3: An Efficient Vision-Language Model for Text-rich Image\n  Perception, Comprehension, and Beyond","summary":"  Text-rich images have significant and extensive value, deeply integrated into\nvarious aspects of human life. Notably, both visual cues and linguistic symbols\nin text-rich images play crucial roles in information transmission but are\naccompanied by diverse challenges. Therefore, the efficient and effective\nunderstanding of text-rich images is a crucial litmus test for the capability\nof Vision-Language Models. We have crafted an efficient vision-language model,\nStrucTexTv3, tailored to tackle various intelligent tasks for text-rich images.\nThe significant design of StrucTexTv3 is presented in the following aspects:\nFirstly, we adopt a combination of an effective multi-scale reduced visual\ntransformer and a multi-granularity token sampler (MG-Sampler) as a visual\ntoken generator, successfully solving the challenges of high-resolution input\nand complex representation learning for text-rich images. Secondly, we enhance\nthe perception and comprehension abilities of StrucTexTv3 through instruction\nlearning, seamlessly integrating various text-oriented tasks into a unified\nframework. Thirdly, we have curated a comprehensive collection of high-quality\ntext-rich images, abbreviated as TIM-30M, encompassing diverse scenarios like\nincidental scenes, office documents, web pages, and screenshots, thereby\nimproving the robustness of our model. Our method achieved SOTA results in\ntext-rich image perception tasks, and significantly improved performance in\ncomprehension tasks. Among multimodal models with LLM decoder of approximately\n1.8B parameters, it stands out as a leader, which also makes the deployment of\nedge devices feasible. In summary, the StrucTexTv3 model, featuring efficient\nstructural design, outstanding performance, and broad adaptability, offers\nrobust support for diverse intelligent application tasks involving text-rich\nimages, thus exhibiting immense potential for widespread application.\n","authors":["Pengyuan Lyu","Yulin Li","Hao Zhou","Weihong Ma","Xingyu Wan","Qunyi Xie","Liang Wu","Chengquan Zhang","Kun Yao","Errui Ding","Jingdong Wang"],"pdf_url":"https://arxiv.org/pdf/2405.21013v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.02072v4","updated":"2024-06-03T02:15:03Z","published":"2024-04-02T16:20:02Z","title":"EGTR: Extracting Graph from Transformer for Scene Graph Generation","summary":"  Scene Graph Generation (SGG) is a challenging task of detecting objects and\npredicting relationships between objects. After DETR was developed, one-stage\nSGG models based on a one-stage object detector have been actively studied.\nHowever, complex modeling is used to predict the relationship between objects,\nand the inherent relationship between object queries learned in the multi-head\nself-attention of the object detector has been neglected. We propose a\nlightweight one-stage SGG model that extracts the relation graph from the\nvarious relationships learned in the multi-head self-attention layers of the\nDETR decoder. By fully utilizing the self-attention by-products, the relation\ngraph can be extracted effectively with a shallow relation extraction head.\nConsidering the dependency of the relation extraction task on the object\ndetection task, we propose a novel relation smoothing technique that adjusts\nthe relation label adaptively according to the quality of the detected objects.\nBy the relation smoothing, the model is trained according to the continuous\ncurriculum that focuses on object detection task at the beginning of training\nand performs multi-task learning as the object detection performance gradually\nimproves. Furthermore, we propose a connectivity prediction task that predicts\nwhether a relation exists between object pairs as an auxiliary task of the\nrelation extraction. We demonstrate the effectiveness and efficiency of our\nmethod for the Visual Genome and Open Image V6 datasets. Our code is publicly\navailable at https://github.com/naver-ai/egtr.\n","authors":["Jinbae Im","JeongYeon Nam","Nokyung Park","Hyungmin Lee","Seunghyun Park"],"pdf_url":"https://arxiv.org/pdf/2404.02072v4.pdf","comment":"CVPR 2024 (Best paper award candidate)"},{"id":"http://arxiv.org/abs/2402.10717v2","updated":"2024-06-03T02:14:12Z","published":"2024-02-16T14:19:33Z","title":"BioFusionNet: Deep Learning-Based Survival Risk Stratification in ER+\n  Breast Cancer Through Multifeature and Multimodal Data Fusion","summary":"  Breast cancer is a significant health concern affecting millions of women\nworldwide. Accurate survival risk stratification plays a crucial role in\nguiding personalised treatment decisions and improving patient outcomes. Here\nwe present BioFusionNet, a deep learning framework that fuses image-derived\nfeatures with genetic and clinical data to obtain a holistic profile and\nachieve survival risk stratification of ER+ breast cancer patients. We employ\nmultiple self-supervised feature extractors (DINO and MoCoV3) pretrained on\nhistopathological patches to capture detailed image features. These features\nare then fused by a variational autoencoder and fed to a self-attention network\ngenerating patient-level features. A co-dual-cross-attention mechanism combines\nthe histopathological features with genetic data, enabling the model to capture\nthe interplay between them. Additionally, clinical data is incorporated using a\nfeed-forward network, further enhancing predictive performance and achieving\ncomprehensive multimodal feature integration. Furthermore, we introduce a\nweighted Cox loss function, specifically designed to handle imbalanced survival\ndata, which is a common challenge. Our model achieves a mean concordance index\nof 0.77 and a time-dependent area under the curve of 0.84, outperforming\nstate-of-the-art methods. It predicts risk (high versus low) with prognostic\nsignificance for overall survival in univariate analysis (HR=2.99, 95% CI:\n1.88--4.78, p<0.005), and maintains independent significance in multivariate\nanalysis incorporating standard clinicopathological variables (HR=2.91, 95\\%\nCI: 1.80--4.68, p<0.005).\n","authors":["Raktim Kumar Mondol","Ewan K. A. Millar","Arcot Sowmya","Erik Meijering"],"pdf_url":"https://arxiv.org/pdf/2402.10717v2.pdf","comment":"Keywords: Multimodal Fusion, Breast Cancer, Whole Slide Images, Deep\n  Neural Network, Survival Prediction"},{"id":"http://arxiv.org/abs/2401.06127v2","updated":"2024-06-03T02:09:38Z","published":"2024-01-11T18:59:14Z","title":"E$^{2}$GAN: Efficient Training of Efficient GANs for Image-to-Image\n  Translation","summary":"  One highly promising direction for enabling flexible real-time on-device\nimage editing is utilizing data distillation by leveraging large-scale\ntext-to-image diffusion models to generate paired datasets used for training\ngenerative adversarial networks (GANs). This approach notably alleviates the\nstringent requirements typically imposed by high-end commercial GPUs for\nperforming image editing with diffusion models. However, unlike text-to-image\ndiffusion models, each distilled GAN is specialized for a specific image\nediting task, necessitating costly training efforts to obtain models for\nvarious concepts. In this work, we introduce and address a novel research\ndirection: can the process of distilling GANs from diffusion models be made\nsignificantly more efficient? To achieve this goal, we propose a series of\ninnovative techniques. First, we construct a base GAN model with generalized\nfeatures, adaptable to different concepts through fine-tuning, eliminating the\nneed for training from scratch. Second, we identify crucial layers within the\nbase GAN model and employ Low-Rank Adaptation (LoRA) with a simple yet\neffective rank search process, rather than fine-tuning the entire base model.\nThird, we investigate the minimal amount of data necessary for fine-tuning,\nfurther reducing the overall training time. Extensive experiments show that we\ncan efficiently empower GANs with the ability to perform real-time high-quality\nimage editing on mobile devices with remarkably reduced training and storage\ncosts for each concept.\n","authors":["Yifan Gong","Zheng Zhan","Qing Jin","Yanyu Li","Yerlan Idelbayev","Xian Liu","Andrey Zharkov","Kfir Aberman","Sergey Tulyakov","Yanzhi Wang","Jian Ren"],"pdf_url":"https://arxiv.org/pdf/2401.06127v2.pdf","comment":"ICML 2024. Project Page: https://yifanfanfanfan.github.io/e2gan/"},{"id":"http://arxiv.org/abs/2404.16666v3","updated":"2024-06-03T02:07:14Z","published":"2024-04-25T15:06:58Z","title":"PhyRecon: Physically Plausible Neural Scene Reconstruction","summary":"  Neural implicit representations have gained popularity in multi-view 3D\nreconstruction. However, most previous work struggles to yield physically\nplausible results, limiting their utility in domains requiring rigorous\nphysical accuracy, such as embodied AI and robotics. This lack of plausibility\nstems from the absence of physics modeling in existing methods and their\ninability to recover intricate geometrical structures. In this paper, we\nintroduce PhyRecon, the first approach to leverage both differentiable\nrendering and differentiable physics simulation to learn implicit surface\nrepresentations. PhyRecon features a novel differentiable particle-based\nphysical simulator built on neural implicit representations. Central to this\ndesign is an efficient transformation between SDF-based implicit\nrepresentations and explicit surface points via our proposed Surface Points\nMarching Cubes (SP-MC), enabling differentiable learning with both rendering\nand physical losses. Additionally, PhyRecon models both rendering and physical\nuncertainty to identify and compensate for inconsistent and inaccurate\nmonocular geometric priors. This physical uncertainty further facilitates a\nnovel physics-guided pixel sampling to enhance the learning of slender\nstructures. By integrating these techniques, our model supports differentiable\njoint modeling of appearance, geometry, and physics. Extensive experiments\ndemonstrate that PhyRecon significantly outperforms all state-of-the-art\nmethods. Our results also exhibit superior physical stability in physical\nsimulators, with at least a 40% improvement across all datasets, paving the way\nfor future physics-based applications.\n","authors":["Junfeng Ni","Yixin Chen","Bohan Jing","Nan Jiang","Bin Wang","Bo Dai","Puhao Li","Yixin Zhu","Song-Chun Zhu","Siyuan Huang"],"pdf_url":"https://arxiv.org/pdf/2404.16666v3.pdf","comment":"project page: https://phyrecon.github.io/"},{"id":"http://arxiv.org/abs/2403.16286v2","updated":"2024-06-03T01:43:08Z","published":"2024-03-24T20:31:42Z","title":"HemoSet: The First Blood Segmentation Dataset for Automation of\n  Hemostasis Management","summary":"  Hemorrhaging occurs in surgeries of all types, forcing surgeons to quickly\nadapt to the visual interference that results from blood rapidly filling the\nsurgical field. Introducing automation into the crucial surgical task of\nhemostasis management would offload mental and physical tasks from the surgeon\nand surgical assistants while simultaneously increasing the efficiency and\nsafety of the operation. The first step in automation of hemostasis management\nis detection of blood in the surgical field. To propel the development of blood\ndetection algorithms in surgeries, we present HemoSet, the first blood\nsegmentation dataset based on bleeding during a live animal robotic surgery.\nOur dataset features vessel hemorrhage scenarios where turbulent flow leads to\nabnormal pooling geometries in surgical fields. These pools are formed in\nconditions endemic to surgical procedures -- uneven heterogeneous tissue, under\nglossy lighting conditions and rapid tool movement. We benchmark several\nstate-of-the-art segmentation models and provide insight into the difficulties\nspecific to blood detection. We intend for HemoSet to spur development of\nautonomous blood suction tools by providing a platform for training and\nrefining blood segmentation models, addressing the precision needed for such\nrobotics.\n","authors":["Albert J. Miao","Shan Lin","Jingpei Lu","Florian Richter","Benjamin Ostrander","Emily K. Funk","Ryan K. Orosco","Michael C. Yip"],"pdf_url":"https://arxiv.org/pdf/2403.16286v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.10754v2","updated":"2024-06-03T01:35:11Z","published":"2023-05-18T06:54:56Z","title":"Brain Imaging-to-Graph Generation using Adversarial Hierarchical\n  Diffusion Models for MCI Causality Analysis","summary":"  Effective connectivity can describe the causal patterns among brain regions.\nThese patterns have the potential to reveal the pathological mechanism and\npromote early diagnosis and effective drug development for cognitive disease.\nHowever, the current methods utilize software toolkits to extract empirical\nfeatures from brain imaging to estimate effective connectivity. These methods\nheavily rely on manual parameter settings and may result in large errors during\neffective connectivity estimation. In this paper, a novel brain\nimaging-to-graph generation (BIGG) framework is proposed to map functional\nmagnetic resonance imaging (fMRI) into effective connectivity for mild\ncognitive impairment (MCI) analysis. To be specific, the proposed BIGG\nframework is based on the diffusion denoising probabilistic models (DDPM),\nwhere each denoising step is modeled as a generative adversarial network (GAN)\nto progressively translate the noise and conditional fMRI to effective\nconnectivity. The hierarchical transformers in the generator are designed to\nestimate the noise at multiple scales. Each scale concentrates on both spatial\nand temporal information between brain regions, enabling good quality in noise\nremoval and better inference of causal relations. Meanwhile, the\ntransformer-based discriminator constrains the generator to further capture\nglobal and local patterns for improving high-quality and diversity generation.\nBy introducing the diffusive factor, the denoising inference with a large\nsampling step size is more efficient and can maintain high-quality results for\neffective connectivity generation. Evaluations of the ADNI dataset demonstrate\nthe feasibility and efficacy of the proposed model. The proposed model not only\nachieves superior prediction performance compared with other competing methods\nbut also predicts MCI-related causal connections that are consistent with\nclinical studies.\n","authors":["Qiankun Zuo","Hao Tian","Chi-Man Pun","Hongfei Wang","Yudong Zhang","Jin Hong"],"pdf_url":"https://arxiv.org/pdf/2305.10754v2.pdf","comment":"10 pages, 12 figures"},{"id":"http://arxiv.org/abs/2402.11058v3","updated":"2024-06-03T01:09:38Z","published":"2024-02-16T20:14:47Z","title":"II-MMR: Identifying and Improving Multi-modal Multi-hop Reasoning in\n  Visual Question Answering","summary":"  Visual Question Answering (VQA) often involves diverse reasoning scenarios\nacross Vision and Language (V&L). Most prior VQA studies, however, have merely\nfocused on assessing the model's overall accuracy without evaluating it on\ndifferent reasoning cases. Furthermore, some recent works observe that\nconventional Chain-of-Thought (CoT) prompting fails to generate effective\nreasoning for VQA, especially for complex scenarios requiring multi-hop\nreasoning. In this paper, we propose II-MMR, a novel idea to identify and\nimprove multi-modal multi-hop reasoning in VQA. In specific, II-MMR takes a VQA\nquestion with an image and finds a reasoning path to reach its answer using two\nnovel language promptings: (i) answer prediction-guided CoT prompt, or (ii)\nknowledge triplet-guided prompt. II-MMR then analyzes this path to identify\ndifferent reasoning cases in current VQA benchmarks by estimating how many hops\nand what types (i.e., visual or beyond-visual) of reasoning are required to\nanswer the question. On popular benchmarks including GQA and A-OKVQA, II-MMR\nobserves that most of their VQA questions are easy to answer, simply demanding\n\"single-hop\" reasoning, whereas only a few questions require \"multi-hop\"\nreasoning. Moreover, while the recent V&L model struggles with such complex\nmulti-hop reasoning questions even using the traditional CoT method, II-MMR\nshows its effectiveness across all reasoning cases in both zero-shot and\nfine-tuning settings.\n","authors":["Jihyung Kil","Farideh Tavazoee","Dongyeop Kang","Joo-Kyung Kim"],"pdf_url":"https://arxiv.org/pdf/2402.11058v3.pdf","comment":"Accepted to ACL 2024 Findings"},{"id":"http://arxiv.org/abs/2401.05604v2","updated":"2024-06-03T23:49:45Z","published":"2024-01-11T00:30:28Z","title":"REBUS: A Robust Evaluation Benchmark of Understanding Symbols","summary":"  We propose a new benchmark evaluating the performance of multimodal large\nlanguage models on rebus puzzles. The dataset covers 333 original examples of\nimage-based wordplay, cluing 13 categories such as movies, composers, major\ncities, and food. To achieve good performance on the benchmark of identifying\nthe clued word or phrase, models must combine image recognition and string\nmanipulation with hypothesis testing, multi-step reasoning, and an\nunderstanding of human cognition, making for a complex, multimodal evaluation\nof capabilities. We find that GPT-4o significantly outperforms all other\nmodels, followed by proprietary models outperforming all other evaluated\nmodels. However, even the best model has a final accuracy of only 42\\%, which\ngoes down to just 7\\% on hard puzzles, highlighting the need for substantial\nimprovements in reasoning. Further, models rarely understand all parts of a\npuzzle, and are almost always incapable of retroactively explaining the correct\nanswer. Our benchmark can therefore be used to identify major shortcomings in\nthe knowledge and reasoning of multimodal large language models.\n","authors":["Andrew Gritsevskiy","Arjun Panickssery","Aaron Kirtland","Derik Kauffman","Hans Gundlach","Irina Gritsevskaya","Joe Cavanagh","Jonathan Chiang","Lydia La Roux","Michelle Hung"],"pdf_url":"https://arxiv.org/pdf/2401.05604v2.pdf","comment":"20 pages, 5 figures. For code, see http://github.com/cvndsh/rebus"},{"id":"http://arxiv.org/abs/2402.01103v3","updated":"2024-06-03T23:30:33Z","published":"2024-02-02T02:40:51Z","title":"Compositional Generative Modeling: A Single Model is Not All You Need","summary":"  Large monolithic generative models trained on massive amounts of data have\nbecome an increasingly dominant approach in AI research. In this paper, we\nargue that we should instead construct large generative systems by composing\nsmaller generative models together. We show how such a compositional generative\napproach enables us to learn distributions in a more data-efficient manner,\nenabling generalization to parts of the data distribution unseen at training\ntime. We further show how this enables us to program and construct new\ngenerative models for tasks completely unseen at training. Finally, we show\nthat in many cases, we can discover separate compositional components from\ndata.\n","authors":["Yilun Du","Leslie Kaelbling"],"pdf_url":"https://arxiv.org/pdf/2402.01103v3.pdf","comment":"ICML 2024 (Position Track)"},{"id":"http://arxiv.org/abs/2406.01843v1","updated":"2024-06-03T23:28:57Z","published":"2024-06-03T23:28:57Z","title":"L-MAGIC: Language Model Assisted Generation of Images with Coherence","summary":"  In the current era of generative AI breakthroughs, generating panoramic\nscenes from a single input image remains a key challenge. Most existing methods\nuse diffusion-based iterative or simultaneous multi-view inpainting. However,\nthe lack of global scene layout priors leads to subpar outputs with duplicated\nobjects (e.g., multiple beds in a bedroom) or requires time-consuming human\ntext inputs for each view. We propose L-MAGIC, a novel method leveraging large\nlanguage models for guidance while diffusing multiple coherent views of 360\ndegree panoramic scenes. L-MAGIC harnesses pre-trained diffusion and language\nmodels without fine-tuning, ensuring zero-shot performance. The output quality\nis further enhanced by super-resolution and multi-view fusion techniques.\nExtensive experiments demonstrate that the resulting panoramic scenes feature\nbetter scene layouts and perspective view rendering quality compared to related\nworks, with >70% preference in human evaluations. Combined with conditional\ndiffusion models, L-MAGIC can accept various input modalities, including but\nnot limited to text, depth maps, sketches, and colored scripts. Applying depth\nestimation further enables 3D point cloud generation and dynamic scene\nexploration with fluid camera motion. Code is available at\nhttps://github.com/IntelLabs/MMPano. The video presentation is available at\nhttps://youtu.be/XDMNEzH4-Ec?list=PLG9Zyvu7iBa0-a7ccNLO8LjcVRAoMn57s.\n","authors":["Zhipeng Cai","Matthias Mueller","Reiner Birkl","Diana Wofk","Shao-Yen Tseng","JunDa Cheng","Gabriela Ben-Melech Stan","Vasudev Lal","Michael Paulitsch"],"pdf_url":"https://arxiv.org/pdf/2406.01843v1.pdf","comment":"accepted to CVPR 2024"},{"id":"http://arxiv.org/abs/2405.17698v3","updated":"2024-06-03T23:24:39Z","published":"2024-05-27T23:09:37Z","title":"BaboonLand Dataset: Tracking Primates in the Wild and Automating\n  Behaviour Recognition from Drone Videos","summary":"  Using drones to track multiple individuals simultaneously in their natural\nenvironment is a powerful approach for better understanding group primate\nbehavior. Previous studies have demonstrated that it is possible to automate\nthe classification of primate behavior from video data, but these studies have\nbeen carried out in captivity or from ground-based cameras. To understand group\nbehavior and the self-organization of a collective, the whole troop needs to be\nseen at a scale where behavior can be seen in relation to the natural\nenvironment in which ecological decisions are made. This study presents a novel\ndataset from drone videos for baboon detection, tracking, and behavior\nrecognition. The baboon detection dataset was created by manually annotating\nall baboons in drone videos with bounding boxes. A tiling method was\nsubsequently applied to create a pyramid of images at various scales from the\noriginal 5.3K resolution images, resulting in approximately 30K images used for\nbaboon detection. The tracking dataset is derived from the detection dataset,\nwhere all bounding boxes are assigned the same ID throughout the video. This\nprocess resulted in half an hour of very dense tracking data. The behavior\nrecognition dataset was generated by converting tracks into mini-scenes, a\nvideo subregion centered on each animal; each mini-scene was manually annotated\nwith 12 distinct behavior types, resulting in over 20 hours of data. Benchmark\nresults show mean average precision (mAP) of 92.62\\% for the YOLOv8-X detection\nmodel, multiple object tracking precision (MOTA) of 63.81\\% for the BotSort\ntracking algorithm, and micro top-1 accuracy of 63.97\\% for the X3D behavior\nrecognition model. Using deep learning to classify wildlife behavior from drone\nfootage facilitates non-invasive insight into the collective behavior of an\nentire group.\n","authors":["Isla Duporge","Maksim Kholiavchenko","Roi Harel","Scott Wolf","Dan Rubenstein","Meg Crofoot","Tanya Berger-Wolf","Stephen Lee","Julie Barreau","Jenna Kline","Michelle Ramirez","Charles Stewart"],"pdf_url":"https://arxiv.org/pdf/2405.17698v3.pdf","comment":"Dataset will be published shortly"},{"id":"http://arxiv.org/abs/2406.01837v1","updated":"2024-06-03T23:09:30Z","published":"2024-06-03T23:09:30Z","title":"Boosting Vision-Language Models with Transduction","summary":"  Transduction is a powerful paradigm that leverages the structure of unlabeled\ndata to boost predictive accuracy. We present TransCLIP, a novel and\ncomputationally efficient transductive approach designed for Vision-Language\nModels (VLMs). TransCLIP is applicable as a plug-and-play module on top of\npopular inductive zero- and few-shot models, consistently improving their\nperformances. Our new objective function can be viewed as a regularized\nmaximum-likelihood estimation, constrained by a KL divergence penalty that\nintegrates the text-encoder knowledge and guides the transductive learning\nprocess. We further derive an iterative Block Majorize-Minimize (BMM) procedure\nfor optimizing our objective, with guaranteed convergence and decoupled\nsample-assignment updates, yielding computationally efficient transduction for\nlarge-scale datasets. We report comprehensive evaluations, comparisons, and\nablation studies that demonstrate: (i) Transduction can greatly enhance the\ngeneralization capabilities of inductive pretrained zero- and few-shot VLMs;\n(ii) TransCLIP substantially outperforms standard transductive few-shot\nlearning methods relying solely on vision features, notably due to the KL-based\nlanguage constraint.\n","authors":["Maxime Zanella","Benoît Gérin","Ismail Ben Ayed"],"pdf_url":"https://arxiv.org/pdf/2406.01837v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.08400v2","updated":"2024-06-03T23:02:26Z","published":"2024-02-13T11:59:43Z","title":"Adaptive Hierarchical Certification for Segmentation using Randomized\n  Smoothing","summary":"  Certification for machine learning is proving that no adversarial sample can\nevade a model within a range under certain conditions, a necessity for\nsafety-critical domains. Common certification methods for segmentation use a\nflat set of fine-grained classes, leading to high abstain rates due to model\nuncertainty across many classes. We propose a novel, more practical setting,\nwhich certifies pixels within a multi-level hierarchy, and adaptively relaxes\nthe certification to a coarser level for unstable components classic methods\nwould abstain from, effectively lowering the abstain rate whilst providing more\ncertified semantically meaningful information. We mathematically formulate the\nproblem setup, introduce an adaptive hierarchical certification algorithm and\nprove the correctness of its guarantees. Since certified accuracy does not take\nthe loss of information into account for coarser classes, we introduce the\nCertified Information Gain ($\\mathrm{CIG}$) metric, which is proportional to\nthe class granularity level. Our extensive experiments on the datasets\nCityscapes, PASCAL-Context, ACDC and COCO-Stuff demonstrate that our adaptive\nalgorithm achieves a higher $\\mathrm{CIG}$ and lower abstain rate compared to\nthe current state-of-the-art certification method. Our code can be found here:\nhttps://github.com/AlaaAnani/adaptive-certify.\n","authors":["Alaa Anani","Tobias Lorenz","Bernt Schiele","Mario Fritz"],"pdf_url":"https://arxiv.org/pdf/2402.08400v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.14830v2","updated":"2024-06-03T22:59:54Z","published":"2023-12-22T17:06:08Z","title":"Dreaming of Electrical Waves: Generative Modeling of Cardiac Excitation\n  Waves using Diffusion Models","summary":"  Electrical waves in the heart form rotating spiral or scroll waves during\nlife-threatening arrhythmias such as atrial or ventricular fibrillation. The\nwave dynamics are typically modeled using coupled partial differential\nequations, which describe reaction-diffusion dynamics in excitable media. More\nrecently, data-driven generative modeling has emerged as an alternative to\ngenerate spatio-temporal patterns in physical and biological systems. Here, we\nexplore denoising diffusion probabilistic models for the generative modeling of\nelectrical wave patterns in cardiac tissue. We trained diffusion models with\nsimulated electrical wave patterns to be able to generate such wave patterns in\nunconditional and conditional generation tasks. For instance, we explored the\ndiffusion-based i) parameter-specific generation, ii) evolution and iii)\ninpainting of spiral wave dynamics, including reconstructing three-dimensional\nscroll wave dynamics from superficial two-dimensional measurements. Further, we\ngenerated arbitrarily shaped bi-ventricular geometries and simultaneously\ninitiated scroll wave patterns inside these geometries using diffusion. We\ncharacterized and compared the diffusion-generated solutions to solutions\nobtained with corresponding biophysical models and found that diffusion models\nlearn to replicate spiral and scroll waves dynamics so well that they could be\nused for data-driven modeling of excitation waves in cardiac tissue. For\ninstance, an ensemble of diffusion-generated spiral wave dynamics exhibits\nsimilar self-termination statistics as the corresponding ensemble simulated\nwith a biophysical model. However, we also found that diffusion models {produce\nartifacts if training data is lacking, e.g. during self-termination,} and\n`hallucinate' wave patterns when insufficiently constrained.\n","authors":["Tanish Baranwal","Jan Lebert","Jan Christoph"],"pdf_url":"https://arxiv.org/pdf/2312.14830v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.01829v1","updated":"2024-06-03T22:56:40Z","published":"2024-06-03T22:56:40Z","title":"FacAID: A Transformer Model for Neuro-Symbolic Facade Reconstruction","summary":"  We introduce a neuro-symbolic transformer-based model that converts flat,\nsegmented facade structures into procedural definitions using a custom-designed\nsplit grammar. To facilitate this, we first develop a semi-complex split\ngrammar tailored for architectural facades and then generate a dataset\ncomprising of facades alongside their corresponding procedural representations.\nThis dataset is used to train our transformer model to convert segmented, flat\nfacades into the procedural language of our grammar. During inference, the\nmodel applies this learned transformation to new facade segmentations,\nproviding a procedural representation that users can adjust to generate varied\nfacade designs. This method not only automates the conversion of static facade\nimages into dynamic, editable procedural formats but also enhances the design\nflexibility, allowing for easy modifications and variations by architects and\ndesigners. Our approach sets a new standard in facade design by combining the\nprecision of procedural generation with the adaptability of neuro-symbolic\nlearning.\n","authors":["Aleksander Płocharski","Jan Swidzinski","Joanna Porter-Sobieraj","Przemyslaw Musialski"],"pdf_url":"https://arxiv.org/pdf/2406.01829v1.pdf","comment":"11 pages, 10 figures, preprint"},{"id":"http://arxiv.org/abs/2405.07842v2","updated":"2024-06-03T22:51:32Z","published":"2024-05-13T15:30:41Z","title":"Ground-based image deconvolution with Swin Transformer UNet","summary":"  As ground-based all-sky astronomical surveys will gather millions of images\nin the coming years, a critical requirement emerges for the development of fast\ndeconvolution algorithms capable of efficiently improving the spatial\nresolution of these images. By successfully recovering clean and\nhigh-resolution images from these surveys, the objective is to deepen the\nunderstanding of galaxy formation and evolution through accurate photometric\nmeasurements. We introduce a two-step deconvolution framework using a Swin\nTransformer architecture. Our study reveals that the deep learning-based\nsolution introduces a bias, constraining the scope of scientific analysis. To\naddress this limitation, we propose a novel third step relying on the active\ncoefficients in the sparsity wavelet framework. We conducted a performance\ncomparison between our deep learning-based method and Firedec, a classical\ndeconvolution algorithm, based on an analysis of a subset of the EDisCS cluster\nsamples. We demonstrate the advantage of our method in terms of resolution\nrecovery, generalisation to different noise properties, and computational\nefficiency. The analysis of this cluster sample not only allowed us to assess\nthe efficiency of our method, but it also enabled us to quantify the number of\nclumps within these galaxies in relation to their disc colour. This robust\ntechnique that we propose holds promise for identifying structures in the\ndistant universe through ground-based images.\n","authors":["Utsav Akhaury","Pascale Jablonka","Jean-Luc Starck","Frédéric Courbin"],"pdf_url":"https://arxiv.org/pdf/2405.07842v2.pdf","comment":"11 pages, 14 figures"},{"id":"http://arxiv.org/abs/2311.01623v4","updated":"2024-06-03T22:36:36Z","published":"2023-11-03T16:58:10Z","title":"VQPy: An Object-Oriented Approach to Modern Video Analytics","summary":"  Video analytics is widely used in contemporary systems and services. At the\nforefront of video analytics are video queries that users develop to find\nobjects of particular interest. Building upon the insight that video objects\n(e.g., human, animals, cars, etc.), the center of video analytics, are similar\nin spirit to objects modeled by traditional object-oriented languages, we\npropose to develop an object-oriented approach to video analytics. This\napproach, named VQPy, consists of a frontend$\\unicode{x2015}$a Python variant\nwith constructs that make it easy for users to express video objects and their\ninteractions$\\unicode{x2015}$as well as an extensible backend that can\nautomatically construct and optimize pipelines based on video objects. We have\nimplemented and open-sourced VQPy, which has been productized in Cisco as part\nof its DeepVision framework.\n","authors":["Shan Yu","Zhenting Zhu","Yu Chen","Hanchen Xu","Pengzhan Zhao","Yang Wang","Arthi Padmanabhan","Hugo Latapie","Harry Xu"],"pdf_url":"https://arxiv.org/pdf/2311.01623v4.pdf","comment":"MLSys'24"},{"id":"http://arxiv.org/abs/2405.20510v2","updated":"2024-06-03T22:34:58Z","published":"2024-05-30T21:59:29Z","title":"Physically Compatible 3D Object Modeling from a Single Image","summary":"  We present a computational framework that transforms single images into 3D\nphysical objects. The visual geometry of a physical object in an image is\ndetermined by three orthogonal attributes: mechanical properties, external\nforces, and rest-shape geometry. Existing single-view 3D reconstruction methods\noften overlook this underlying composition, presuming rigidity or neglecting\nexternal forces. Consequently, the reconstructed objects fail to withstand\nreal-world physical forces, resulting in instability or undesirable deformation\n-- diverging from their intended designs as depicted in the image. Our\noptimization framework addresses this by embedding physical compatibility into\nthe reconstruction process. We explicitly decompose the three physical\nattributes and link them through static equilibrium, which serves as a hard\nconstraint, ensuring that the optimized physical shapes exhibit desired\nphysical behaviors. Evaluations on a dataset collected from Objaverse\ndemonstrate that our framework consistently enhances the physical realism of 3D\nmodels over existing methods. The utility of our framework extends to practical\napplications in dynamic simulations and 3D printing, where adherence to\nphysical compatibility is paramount.\n","authors":["Minghao Guo","Bohan Wang","Pingchuan Ma","Tianyuan Zhang","Crystal Elaine Owens","Chuang Gan","Joshua B. Tenenbaum","Kaiming He","Wojciech Matusik"],"pdf_url":"https://arxiv.org/pdf/2405.20510v2.pdf","comment":null}],"Information Retrieval":[{"id":"http://arxiv.org/abs/2405.12473v2","updated":"2024-06-03T15:05:57Z","published":"2024-05-21T03:25:32Z","title":"Learning Partially Aligned Item Representation for Cross-Domain\n  Sequential Recommendation","summary":"  Cross-domain sequential recommendation (CDSR) aims to uncover and transfer\nusers' sequential preferences across multiple recommendation domains. While\nsignificant endeavors have been made, they primarily concentrated on developing\nadvanced transfer modules and aligning user representations using\nself-supervised learning techniques. However, the problem of aligning item\nrepresentations has received limited attention, and misaligned item\nrepresentations can potentially lead to sub-optimal sequential modeling and\nuser representation alignment. To this end, we propose a model-agnostic\nframework called \\textbf{C}ross-domain item representation \\textbf{A}lignment\nfor \\textbf{C}ross-\\textbf{D}omain \\textbf{S}equential \\textbf{R}ecommendation\n(\\textbf{CA-CDSR}), which achieves sequence-aware generation and adaptively\npartial alignment for item representations. Specifically, we first develop a\nsequence-aware feature augmentation strategy, which captures both collaborative\nand sequential item correlations, thus facilitating holistic item\nrepresentation generation. Next, we conduct an empirical study to investigate\nthe partial representation alignment problem from a spectrum perspective. It\nmotivates us to devise an adaptive spectrum filter, achieving partial alignment\nadaptively. Furthermore, the aligned item representations can be fed into\ndifferent sequential encoders to obtain user representations. The entire\nframework is optimized in a multi-task learning paradigm with an annealing\nstrategy. Extensive experiments have demonstrated that CA-CDSR can surpass\nstate-of-the-art baselines by a significant margin and can effectively align\nitems in representation spaces to enhance performance.\n","authors":["Mingjia Yin","Hao Wang","Wei Guo","Yong Liu","Zhi Li","Sirui Zhao","Defu Lian","Enhong Chen"],"pdf_url":"https://arxiv.org/pdf/2405.12473v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.17795v2","updated":"2024-06-03T15:02:52Z","published":"2024-05-28T03:45:34Z","title":"Dataset Regeneration for Sequential Recommendation","summary":"  The sequential recommender (SR) system is a crucial component of modern\nrecommender systems, as it aims to capture the evolving preferences of users.\nSignificant efforts have been made to enhance the capabilities of SR systems.\nThese methods typically follow the model-centric paradigm, which involves\ndeveloping effective models based on fixed datasets. However, this approach\noften overlooks potential quality issues and flaws inherent in the data. Driven\nby the potential of data-centric AI, we propose a novel data-centric paradigm\nfor developing an ideal training dataset using a model-agnostic dataset\nregeneration framework called DR4SR. This framework enables the regeneration of\na dataset with exceptional cross-architecture generalizability. Additionally,\nwe introduce the DR4SR+ framework, which incorporates a model-aware dataset\npersonalizer to tailor the regenerated dataset specifically for a target model.\nTo demonstrate the effectiveness of the data-centric paradigm, we integrate our\nframework with various model-centric methods and observe significant\nperformance improvements across four widely adopted datasets. Furthermore, we\nconduct in-depth analyses to explore the potential of the data-centric paradigm\nand provide valuable insights. The code can be found at\nhttps://anonymous.4open.science/r/KDD2024-86EA\n","authors":["Mingjia Yin","Hao Wang","Wei Guo","Yong Liu","Suojuan Zhang","Sirui Zhao","Defu Lian","Enhong Chen"],"pdf_url":"https://arxiv.org/pdf/2405.17795v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.15641v2","updated":"2024-06-03T11:11:13Z","published":"2024-01-28T12:33:14Z","title":"PRE: A Peer Review Based Large Language Model Evaluator","summary":"  The impressive performance of large language models (LLMs) has attracted\nconsiderable attention from the academic and industrial communities. Besides\nhow to construct and train LLMs, how to effectively evaluate and compare the\ncapacity of LLMs has also been well recognized as an important yet difficult\nproblem. Existing paradigms rely on either human annotators or model-based\nevaluators to evaluate the performance of LLMs on different tasks. However,\nthese paradigms often suffer from high cost, low generalizability, and\ninherited biases in practice, which make them incapable of supporting the\nsustainable development of LLMs in long term. In order to address these issues,\ninspired by the peer review systems widely used in academic publication\nprocess, we propose a novel framework that can automatically evaluate LLMs\nthrough a peer-review process. Specifically, for the evaluation of a specific\ntask, we first construct a small qualification exam to select \"reviewers\" from\na couple of powerful LLMs. Then, to actually evaluate the \"submissions\" written\nby different candidate LLMs, i.e., the evaluatees, we use the reviewer LLMs to\nrate or compare the submissions. The final ranking of evaluatee LLMs is\ngenerated based on the results provided by all reviewers. We conducted\nextensive experiments on text summarization tasks with eleven LLMs including\nGPT-4. The results demonstrate the existence of biasness when evaluating using\na single LLM. Also, our PRE model outperforms all the baselines, illustrating\nthe effectiveness of the peer review mechanism.\n","authors":["Zhumin Chu","Qingyao Ai","Yiteng Tu","Haitao Li","Yiqun Liu"],"pdf_url":"https://arxiv.org/pdf/2401.15641v2.pdf","comment":"11 pages"},{"id":"http://arxiv.org/abs/2401.04514v2","updated":"2024-06-03T06:50:26Z","published":"2024-01-09T12:12:50Z","title":"Rewriting the Code: A Simple Method for Large Language Model Augmented\n  Code Search","summary":"  In code search, the Generation-Augmented Retrieval (GAR) framework, which\ngenerates exemplar code snippets to augment queries, has emerged as a promising\nstrategy to address the principal challenge of modality misalignment between\ncode snippets and natural language queries, particularly with the demonstrated\ncode generation capabilities of Large Language Models (LLMs). Nevertheless, our\npreliminary investigations indicate that the improvements conferred by such an\nLLM-augmented framework are somewhat constrained. This limitation could\npotentially be ascribed to the fact that the generated codes, albeit\nfunctionally accurate, frequently display a pronounced stylistic deviation from\nthe ground truth code in the codebase. In this paper, we extend the\nfoundational GAR framework and propose a simple yet effective method that\nadditionally Rewrites the Code (ReCo) within the codebase for style\nnormalization. Experimental results demonstrate that ReCo significantly boosts\nretrieval accuracy across sparse (up to 35.7%), zero-shot dense (up to 27.6%),\nand fine-tuned dense (up to 23.6%) retrieval settings in diverse search\nscenarios. To further elucidate the advantages of ReCo and stimulate research\nin code style normalization, we introduce Code Style Similarity, the first\nmetric tailored to quantify stylistic similarities in code. Notably, our\nempirical findings reveal the inadequacy of existing metrics in capturing\nstylistic nuances. The source code and data are available at\n\\url{https://github.com/Alex-HaochenLi/ReCo}.\n","authors":["Haochen Li","Xin Zhou","Zhiqi Shen"],"pdf_url":"https://arxiv.org/pdf/2401.04514v2.pdf","comment":"Accepted to ACL 2024"},{"id":"http://arxiv.org/abs/2305.03972v3","updated":"2024-06-03T03:30:35Z","published":"2023-05-06T08:12:11Z","title":"Category-Oriented Representation Learning for Image to Multi-Modal\n  Retrieval","summary":"  The rise of multi-modal search requests from users has highlighted the\nimportance of multi-modal retrieval (i.e. image-to-text or text-to-image\nretrieval), yet the more complex task of image-to-multi-modal retrieval,\ncrucial for many industry applications, remains under-explored. To address this\ngap and promote further research, we introduce and define the concept of\nImage-to-Multi-Modal Retrieval (IMMR), a process designed to retrieve rich\nmulti-modal (i.e. image and text) documents based on image queries. We focus on\nrepresentation learning for IMMR and analyze three key challenges for it: 1)\nskewed data and noisy label in real-world industrial data, 2) the\ninformation-inequality between image and text modality of documents when\nlearning representations, 3) effective and efficient training in large-scale\nindustrial contexts. To tackle the above challenges, we propose a novel\nframework named organizing categories and learning by classification for\nretrieval (OCLEAR). It consists of three components: 1) a novel\ncategory-oriented data governance scheme coupled with a large-scale\nclassification-based learning paradigm, which handles the skewed and noisy data\nfrom a data perspective. 2) model architecture specially designed for\nmulti-modal learning, where information-inequality between image and text\nmodality of documents is considered for modality fusion. 3) a hybrid parallel\ntraining approach for tackling large-scale training in industrial scenario. The\nproposed framework achieves SOTA performance on public datasets and has been\ndeployed in a real-world industrial e-commence system, leading to significant\nbusiness growth. Code will be made publicly available.\n","authors":["Zida Cheng","Chen Ju","Xu Chen","Zhonghua Zhai","Shuai Xiao","Xiaoyi Zeng","Weilin Huang","Junchi Yan"],"pdf_url":"https://arxiv.org/pdf/2305.03972v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2304.11300v3","updated":"2024-06-03T19:35:25Z","published":"2023-04-22T03:13:05Z","title":"MAWSEO: Adversarial Wiki Search Poisoning for Illicit Online Promotion","summary":"  As a prominent instance of vandalism edits, Wiki search poisoning for illicit\npromotion is a cybercrime in which the adversary aims at editing Wiki articles\nto promote illicit businesses through Wiki search results of relevant queries.\nIn this paper, we report a study that, for the first time, shows that such\nstealthy blackhat SEO on Wiki can be automated. Our technique, called MAWSEO,\nemploys adversarial revisions to achieve real-world cybercriminal objectives,\nincluding rank boosting, vandalism detection evasion, topic relevancy, semantic\nconsistency, user awareness (but not alarming) of promotional content, etc. Our\nevaluation and user study demonstrate that MAWSEO is capable of effectively and\nefficiently generating adversarial vandalism edits, which can bypass\nstate-of-the-art built-in Wiki vandalism detectors, and also get promotional\ncontent through to Wiki users without triggering their alarms. In addition, we\ninvestigated potential defense, including coherence based detection and\nadversarial training of vandalism detection, against our attack in the Wiki\necosystem.\n","authors":["Zilong Lin","Zhengyi Li","Xiaojing Liao","XiaoFeng Wang","Xiaozhong Liu"],"pdf_url":"https://arxiv.org/pdf/2304.11300v3.pdf","comment":"Accepted at the 45th IEEE Symposium on Security and Privacy (IEEE S&P\n  2024)"},{"id":"http://arxiv.org/abs/2406.01702v1","updated":"2024-06-03T18:02:13Z","published":"2024-06-03T18:02:13Z","title":"Session Context Embedding for Intent Understanding in Product Search","summary":"  It is often noted that single query-item pair relevance training in search\ndoes not capture the customer intent. User intent can be better deduced from a\nseries of engagements (Clicks, ATCs, Orders) in a given search session. We\npropose a novel method for vectorizing session context for capturing and\nutilizing context in retrieval and rerank. In the runtime, session embedding is\nan alternative to query embedding, saved and updated after each request in the\nsession, it can be used for retrieval and ranking. We outline session\nembedding's solution to session-based intent understanding and its\narchitecture, the background to this line of thought in search and\nrecommendation, detail the methodologies implemented, and finally present the\nresults of an implementation of session embedding for query product type\nclassification. We demonstrate improvements over strategies ignoring session\ncontext in the runtime for user intent understanding.\n","authors":["Navid Mehrdad","Vishal Rathi","Sravanthi Rajanala"],"pdf_url":"https://arxiv.org/pdf/2406.01702v1.pdf","comment":"5 pages, 1 Figure, 5 Tables, SIGIR 2024, LLM for Individuals, Groups,\n  and Society"},{"id":"http://arxiv.org/abs/2406.01363v1","updated":"2024-06-03T14:31:47Z","published":"2024-06-03T14:31:47Z","title":"Privacy in LLM-based Recommendation: Recent Advances and Future\n  Directions","summary":"  Nowadays, large language models (LLMs) have been integrated with conventional\nrecommendation models to improve recommendation performance. However, while\nmost of the existing works have focused on improving the model performance, the\nprivacy issue has only received comparatively less attention. In this paper, we\nreview recent advancements in privacy within LLM-based recommendation,\ncategorizing them into privacy attacks and protection mechanisms. Additionally,\nwe highlight several challenges and propose future directions for the community\nto address these critical problems.\n","authors":["Sichun Luo","Wei Shao","Yuxuan Yao","Jian Xu","Mingyang Liu","Qintong Li","Bowei He","Maolin Wang","Guanzhi Deng","Hanxu Hou","Xinyi Zhang","Linqi Song"],"pdf_url":"https://arxiv.org/pdf/2406.01363v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.01285v1","updated":"2024-06-03T12:53:37Z","published":"2024-06-03T12:53:37Z","title":"Large Language Models as Recommender Systems: A Study of Popularity Bias","summary":"  The issue of popularity bias -- where popular items are disproportionately\nrecommended, overshadowing less popular but potentially relevant items --\nremains a significant challenge in recommender systems. Recent advancements\nhave seen the integration of general-purpose Large Language Models (LLMs) into\nthe architecture of such systems. This integration raises concerns that it\nmight exacerbate popularity bias, given that the LLM's training data is likely\ndominated by popular items. However, it simultaneously presents a novel\nopportunity to address the bias via prompt tuning. Our study explores this\ndichotomy, examining whether LLMs contribute to or can alleviate popularity\nbias in recommender systems. We introduce a principled way to measure\npopularity bias by discussing existing metrics and proposing a novel metric\nthat fulfills a series of desiderata. Based on our new metric, we compare a\nsimple LLM-based recommender to traditional recommender systems on a movie\nrecommendation task. We find that the LLM recommender exhibits less popularity\nbias, even without any explicit mitigation.\n","authors":["Jan Malte Lichtenberg","Alexander Buchholz","Pola Schwöbel"],"pdf_url":"https://arxiv.org/pdf/2406.01285v1.pdf","comment":"Accepted at Gen-IR@SIGIR24 workshop"},{"id":"http://arxiv.org/abs/2406.01280v1","updated":"2024-06-03T12:48:38Z","published":"2024-06-03T12:48:38Z","title":"Demo: Soccer Information Retrieval via Natural Queries using SoccerRAG","summary":"  The rapid evolution of digital sports media necessitates sophisticated\ninformation retrieval systems that can efficiently parse extensive multimodal\ndatasets. This paper demonstrates SoccerRAG, an innovative framework designed\nto harness the power of Retrieval Augmented Generation (RAG) and Large Language\nModels (LLMs) to extract soccer-related information through natural language\nqueries. By leveraging a multimodal dataset, SoccerRAG supports dynamic\nquerying and automatic data validation, enhancing user interaction and\naccessibility to sports archives. We present a novel interactive user interface\n(UI) based on the Chainlit framework which wraps around the core functionality,\nand enable users to interact with the SoccerRAG framework in a chatbot-like\nvisual manner.\n","authors":["Aleksander Theo Strand","Sushant Gautam","Cise Midoglu","Pål Halvorsen"],"pdf_url":"https://arxiv.org/pdf/2406.01280v1.pdf","comment":"accepted to CBMI 2024 as a demonstration;\n  https://github.com/simula/soccer-rag"},{"id":"http://arxiv.org/abs/2406.01273v1","updated":"2024-06-03T12:39:04Z","published":"2024-06-03T12:39:04Z","title":"SoccerRAG: Multimodal Soccer Information Retrieval via Natural Queries","summary":"  The rapid evolution of digital sports media necessitates sophisticated\ninformation retrieval systems that can efficiently parse extensive multimodal\ndatasets. This paper introduces SoccerRAG, an innovative framework designed to\nharness the power of Retrieval Augmented Generation (RAG) and Large Language\nModels (LLMs) to extract soccer-related information through natural language\nqueries. By leveraging a multimodal dataset, SoccerRAG supports dynamic\nquerying and automatic data validation, enhancing user interaction and\naccessibility to sports archives. Our evaluations indicate that SoccerRAG\neffectively handles complex queries, offering significant improvements over\ntraditional retrieval systems in terms of accuracy and user engagement. The\nresults underscore the potential of using RAG and LLMs in sports analytics,\npaving the way for future advancements in the accessibility and real-time\nprocessing of sports data.\n","authors":["Aleksander Theo Strand","Sushant Gautam","Cise Midoglu","Pål Halvorsen"],"pdf_url":"https://arxiv.org/pdf/2406.01273v1.pdf","comment":"accepted to CBMI 2024 as a regular paper;\n  https://github.com/simula/soccer-rag"},{"id":"http://arxiv.org/abs/2406.01233v1","updated":"2024-06-03T11:52:52Z","published":"2024-06-03T11:52:52Z","title":"Multi-word Term Embeddings Improve Lexical Product Retrieval","summary":"  Product search is uniquely different from search for documents, Internet\nresources or vacancies, therefore it requires the development of specialized\nsearch systems. The present work describes the H1 embdedding model, designed\nfor an offline term indexing of product descriptions at e-commerce platforms.\nThe model is compared to other state-of-the-art (SoTA) embedding models within\na framework of hybrid product search system that incorporates the advantages of\nlexical methods for product retrieval and semantic embedding-based methods. We\npropose an approach to building semantically rich term vocabularies for search\nindexes. Compared to other production semantic models, H1 paired with the\nproposed approach stands out due to its ability to process multi-word product\nterms as one token. As an example, for search queries \"new balance shoes\",\n\"gloria jeans kids wear\" brand entity will be represented as one token - \"new\nbalance\", \"gloria jeans\". This results in an increased precision of the system\nwithout affecting the recall. The hybrid search system with proposed model\nscores mAP@12 = 56.1% and R@1k = 86.6% on the WANDS public dataset, beating\nother SoTA analogues.\n","authors":["Viktor Shcherbakov","Fedor Krasnov"],"pdf_url":"https://arxiv.org/pdf/2406.01233v1.pdf","comment":"10 pages, 4 figures"},{"id":"http://arxiv.org/abs/2406.01022v1","updated":"2024-06-03T06:08:02Z","published":"2024-06-03T06:08:02Z","title":"Poisoning Attacks and Defenses in Recommender Systems: A Survey","summary":"  Modern recommender systems (RS) have profoundly enhanced user experience\nacross digital platforms, yet they face significant threats from poisoning\nattacks. These attacks, aimed at manipulating recommendation outputs for\nunethical gains, exploit vulnerabilities in RS through injecting malicious data\nor intervening model training. This survey presents a unique perspective by\nexamining these threats through the lens of an attacker, offering fresh\ninsights into their mechanics and impacts. Concretely, we detail a systematic\npipeline that encompasses four stages of a poisoning attack: setting attack\ngoals, assessing attacker capabilities, analyzing victim architecture, and\nimplementing poisoning strategies. The pipeline not only aligns with various\nattack tactics but also serves as a comprehensive taxonomy to pinpoint focuses\nof distinct poisoning attacks. Correspondingly, we further classify defensive\nstrategies into two main categories: poisoning data filtering and robust\ntraining from the defender's perspective. Finally, we highlight existing\nlimitations and suggest innovative directions for further exploration in this\nfield.\n","authors":["Zongwei Wang","Junliang Yu","Min Gao","Guanhua Ye","Shazia Sadiq","Hongzhi Yin"],"pdf_url":"https://arxiv.org/pdf/2406.01022v1.pdf","comment":"22 pages, 8 figures"},{"id":"http://arxiv.org/abs/2406.00973v1","updated":"2024-06-03T04:03:24Z","published":"2024-06-03T04:03:24Z","title":"Cold-start Recommendation by Personalized Embedding Region Elicitation","summary":"  Rating elicitation is a success element for recommender systems to perform\nwell at cold-starting, in which the systems need to recommend items to a newly\narrived user with no prior knowledge about the user's preference. Existing\nelicitation methods employ a fixed set of items to learn the user's preference\nand then infer the users' preferences on the remaining items. Using a fixed\nseed set can limit the performance of the recommendation system since the seed\nset is unlikely optimal for all new users with potentially diverse preferences.\nThis paper addresses this challenge using a 2-phase, personalized elicitation\nscheme. First, the elicitation scheme asks users to rate a small set of popular\nitems in a ``burn-in'' phase. Second, it sequentially asks the user to rate\nadaptive items to refine the preference and the user's representation.\nThroughout the process, the system represents the user's embedding value not by\na point estimate but by a region estimate. The value of information obtained by\nasking the user's rating on an item is quantified by the distance from the\nregion center embedding space that contains with high confidence the true\nembedding value of the user. Finally, the recommendations are successively\ngenerated by considering the preference region of the user. We show that each\nsubproblem in the elicitation scheme can be efficiently implemented. Further,\nwe empirically demonstrate the effectiveness of the proposed method against\nexisting rating-elicitation methods on several prominent datasets.\n","authors":["Hieu Trung Nguyen","Duy Nguyen","Khoa Doan","Viet Anh Nguyen"],"pdf_url":"https://arxiv.org/pdf/2406.00973v1.pdf","comment":"Accepted at UAI 2024"},{"id":"http://arxiv.org/abs/2406.00944v1","updated":"2024-06-03T02:56:14Z","published":"2024-06-03T02:56:14Z","title":"Unveil the Duality of Retrieval-Augmented Generation: Theoretical\n  Analysis and Practical Solution","summary":"  Retrieval-augmented generation (RAG) utilizes retrieved texts to enhance\nlarge language models (LLMs). However, studies show that RAG is not\nconsistently effective and can even mislead LLMs due to noisy or incorrect\nretrieved texts. This suggests that RAG possesses a duality including both\nbenefit and detriment. Although many existing methods attempt to address this\nissue, they lack a theoretical explanation for the duality in RAG. The benefit\nand detriment within this duality remain a black box that cannot be quantified\nor compared in an explainable manner. This paper takes the first step in\ntheoretically giving the essential explanation of benefit and detriment in RAG\nby: (1) decoupling and formalizing them from RAG prediction, (2) approximating\nthe gap between their values by representation similarity and (3) establishing\nthe trade-off mechanism between them, to make them explainable, quantifiable,\nand comparable. We demonstrate that the distribution difference between\nretrieved texts and LLMs' knowledge acts as double-edged sword, bringing both\nbenefit and detriment. We also prove that the actual effect of RAG can be\npredicted at token level. Based on our theory, we propose a practical novel\nmethod, X-RAG, which achieves collaborative generation between pure LLM and RAG\nat token level to preserve benefit and avoid detriment. Experiments in\nreal-world tasks based on LLMs including OPT, LLaMA-2, and Mistral show the\neffectiveness of our method and support our theoretical results.\n","authors":["Shicheng Xu","Liang Pang","Huawei Shen","Xueqi Cheng"],"pdf_url":"https://arxiv.org/pdf/2406.00944v1.pdf","comment":"23 pages"},{"id":"http://arxiv.org/abs/2406.00083v1","updated":"2024-06-03T02:25:33Z","published":"2024-06-03T02:25:33Z","title":"BadRAG: Identifying Vulnerabilities in Retrieval Augmented Generation of\n  Large Language Models","summary":"  Large Language Models (LLMs) are constrained by outdated information and a\ntendency to generate incorrect data, commonly referred to as \"hallucinations.\"\nRetrieval-Augmented Generation (RAG) addresses these limitations by combining\nthe strengths of retrieval-based methods and generative models. This approach\ninvolves retrieving relevant information from a large, up-to-date dataset and\nusing it to enhance the generation process, leading to more accurate and\ncontextually appropriate responses. Despite its benefits, RAG introduces a new\nattack surface for LLMs, particularly because RAG databases are often sourced\nfrom public data, such as the web. In this paper, we propose \\TrojRAG{} to\nidentify the vulnerabilities and attacks on retrieval parts (RAG database) and\ntheir indirect attacks on generative parts (LLMs). Specifically, we identify\nthat poisoning several customized content passages could achieve a retrieval\nbackdoor, where the retrieval works well for clean queries but always returns\ncustomized poisoned adversarial queries. Triggers and poisoned passages can be\nhighly customized to implement various attacks. For example, a trigger could be\na semantic group like \"The Republican Party, Donald Trump, etc.\" Adversarial\npassages can be tailored to different contents, not only linked to the triggers\nbut also used to indirectly attack generative LLMs without modifying them.\nThese attacks can include denial-of-service attacks on RAG and semantic\nsteering attacks on LLM generations conditioned by the triggers. Our\nexperiments demonstrate that by just poisoning 10 adversarial passages can\ninduce 98.2\\% success rate to retrieve the adversarial passages. Then, these\npassages can increase the reject ratio of RAG-based GPT-4 from 0.01\\% to 74.6\\%\nor increase the rate of negative responses from 0.22\\% to 72\\% for targeted\nqueries.\n","authors":["Jiaqi Xue","Mengxin Zheng","Yebowen Hu","Fei Liu","Xun Chen","Qian Lou"],"pdf_url":"https://arxiv.org/pdf/2406.00083v1.pdf","comment":null}],"Machine Learning":[{"id":"http://arxiv.org/abs/2312.00700v2","updated":"2024-06-03T17:57:39Z","published":"2023-12-01T16:33:57Z","title":"GIFT: Generative Interpretable Fine-Tuning","summary":"  We present Generative Interpretable Fine-Tuning (GIFT) for\nparameter-efficient fine-tuning of pretrained Transformer backbones, which can\nbe formulated as a simple factorized matrix multiplication in the parameter\nspace or equivalently in the activation space, and thus embraces built-in\ninterpretability. For a pretrained layer with weights $\\omega\\in\n\\mathbb{R}^{d_{out}\\times d_{in}}$, our proposed GIFT learns the fine-tuned\nweights $\\hat{\\omega}$ directly from $\\omega$ as $\\hat{\\omega}=\\omega \\cdot\n(\\mathbb{I}+\\phi_{d_{in}\\times r}\\cdot \\psi_{r\\times d_{in}})$ where\n$\\mathbb{I}$ is an identity matrix. $\\Theta=(\\phi, \\psi)$ are the learnable\nparameters of the two linear layers of GIFT with $r$ being a hyper-parameter.\n$\\Theta$ is shared by all the layers selected for fine-tuning, resulting in\nsignificantly fewer trainable parameters compared to Low-Rank Adaptation\n(LoRA). We perform comprehensive evaluations on natural language tasks\n(commonsense reasoning and sequence classification) and computer vision tasks\n(visual fine-grained classification). We obtain the best accuracy and parameter\nefficiency among baselines both on the Commonsense170k reasoning benchmark\nusing LLaMA-1 (7B) and Llama-2 (7B)/-3 (8B) and on the FGVC and VTAB visual\nrecognition benchmarks using ImageNet-21k pretrained Vision Transformer\n(ViT-B/16). Notably, we obtain 5.9% absolute increase in average accuracy with\n53.8 times reduction of parameters on Commonsense170k using Llama-3 (8B)\ncompared to LoRA. We obtain performance comparable to LoRA on the GLUE\nbenchmark but with significantly fewer parameters using RoBERTa-Base/Large. We\nshow the output of the first linear layer (i.e., $\\omega\\cdot \\phi$) is\nsurprisingly interpretable, which can play the role of a token-clustering head\nas a by-product to localize meaningful objects/parts in images for computer\nvision tasks. Our code is publicly available.\n","authors":["Chinmay Savadikar","Xi Song","Tianfu Wu"],"pdf_url":"https://arxiv.org/pdf/2312.00700v2.pdf","comment":"Project page and code: https://savadikarc.github.io/gift"},{"id":"http://arxiv.org/abs/2401.03955v6","updated":"2024-06-03T17:57:22Z","published":"2024-01-08T15:21:21Z","title":"Tiny Time Mixers (TTMs): Fast Pre-trained Models for Enhanced\n  Zero/Few-Shot Forecasting of Multivariate Time Series","summary":"  Large pre-trained models excel in zero/few-shot learning for language and\nvision tasks but face challenges in multivariate time series (TS) forecasting\ndue to diverse data characteristics. Consequently, recent research efforts have\nfocused on developing pre-trained TS forecasting models. These models, whether\nbuilt from scratch or adapted from large language models (LLMs), excel in\nzero/few-shot forecasting tasks. However, they are limited by slow performance,\nhigh computational demands, and neglect of cross-channel and exogenous\ncorrelations. To address this, we introduce Tiny Time Mixers (TTM), a compact\nmodel (starting from 1M parameters) with effective transfer learning\ncapabilities, trained exclusively on public TS datasets. TTM, based on the\nlight-weight TSMixer architecture, incorporates innovations like adaptive\npatching, diverse resolution sampling, and resolution prefix tuning to handle\npre-training on varied dataset resolutions with minimal model capacity.\nAdditionally, it employs multi-level modeling to capture channel correlations\nand infuse exogenous signals during fine-tuning. TTM outperforms existing\npopular benchmarks in zero/few-shot forecasting by (4-40\\%), while reducing\ncomputational requirements significantly. Moreover, TTMs are lightweight and\ncan be executed even on CPU-only machines, enhancing usability and fostering\nwider adoption in resource-constrained environments. Model weights for our\ninitial variant (TTM-Q) are available at\nhttps://huggingface.co/ibm-granite/granite-timeseries-ttm-v1. Model weights for\nmore sophisticated variants (TTM-B, TTM-E, and TTM-A) will be shared soon. The\nsource code for TTM can be accessed at\nhttps://github.com/ibm-granite/granite-tsfm/tree/main/tsfm_public/models/tinytimemixer.\n","authors":["Vijay Ekambaram","Arindam Jati","Pankaj Dayama","Sumanta Mukherjee","Nam H. Nguyen","Wesley M. Gifford","Chandra Reddy","Jayant Kalagnanam"],"pdf_url":"https://arxiv.org/pdf/2401.03955v6.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.10093v2","updated":"2024-06-03T17:51:58Z","published":"2024-02-15T16:46:16Z","title":"MIM-Refiner: A Contrastive Learning Boost from Intermediate Pre-Trained\n  Representations","summary":"  We introduce MIM (Masked Image Modeling)-Refiner, a contrastive learning\nboost for pre-trained MIM models. MIM-Refiner is motivated by the insight that\nstrong representations within MIM models generally reside in intermediate\nlayers. Accordingly, MIM-Refiner leverages multiple contrastive heads that are\nconnected to different intermediate layers. In each head, a modified nearest\nneighbor objective constructs semantic clusters that capture semantic\ninformation which improves performance on downstream tasks, including\noff-the-shelf and fine-tuning settings.\n  The refinement process is short and simple - yet highly effective. Within a\nfew epochs, we refine the features of MIM models from subpar to\nstate-of-the-art, off-the-shelf features. Refining a ViT-H, pre-trained with\ndata2vec 2.0 on ImageNet-1K, sets a new state-of-the-art in linear probing\n(84.7%) and low-shot classification among models that are pre-trained on\nImageNet-1K. At ImageNet-1K 1-shot classification, MIM-Refiner advances the\nstate-of-the-art to 64.2%, outperforming larger models that were trained on up\nto 2000 times more data such as DINOv2-g, OpenCLIP-G and MAWS-6.5B.\n","authors":["Benedikt Alkin","Lukas Miklautz","Sepp Hochreiter","Johannes Brandstetter"],"pdf_url":"https://arxiv.org/pdf/2402.10093v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.07193v2","updated":"2024-06-03T17:49:41Z","published":"2024-02-11T13:00:04Z","title":"Loss Symmetry and Noise Equilibrium of Stochastic Gradient Descent","summary":"  Symmetries exist abundantly in the loss function of neural networks. We\ncharacterize the learning dynamics of stochastic gradient descent (SGD) when\nexponential symmetries, a broad subclass of continuous symmetries, exist in the\nloss function. We establish that when gradient noises do not balance, SGD has\nthe tendency to move the model parameters toward a point where noises from\ndifferent directions are balanced. Here, a special type of fixed point in the\nconstant directions of the loss function emerges as a candidate for solutions\nfor SGD. As the main theoretical result, we prove that every parameter $\\theta$\nconnects without loss function barrier to a unique noise-balanced fixed point\n$\\theta^*$. The theory implies that the balancing of gradient noise can serve\nas a novel alternative mechanism for relevant phenomena such as progressive\nsharpening and flattening and can be applied to understand common practical\nproblems such as representation normalization, matrix factorization, warmup,\nand formation of latent representations.\n","authors":["Liu Ziyin","Mingze Wang","Hongchao Li","Lei Wu"],"pdf_url":"https://arxiv.org/pdf/2402.07193v2.pdf","comment":"preprint"},{"id":"http://arxiv.org/abs/2401.12179v2","updated":"2024-06-03T17:37:53Z","published":"2024-01-22T18:10:10Z","title":"DITTO: Diffusion Inference-Time T-Optimization for Music Generation","summary":"  We propose Diffusion Inference-Time T-Optimization (DITTO), a general-purpose\nframe-work for controlling pre-trained text-to-music diffusion models at\ninference-time via optimizing initial noise latents. Our method can be used to\noptimize through any differentiable feature matching loss to achieve a target\n(stylized) output and leverages gradient checkpointing for memory efficiency.\nWe demonstrate a surprisingly wide-range of applications for music generation\nincluding inpainting, outpainting, and looping as well as intensity, melody,\nand musical structure control - all without ever fine-tuning the underlying\nmodel. When we compare our approach against related training, guidance, and\noptimization-based methods, we find DITTO achieves state-of-the-art performance\non nearly all tasks, including outperforming comparable approaches on\ncontrollability, audio quality, and computational efficiency, thus opening the\ndoor for high-quality, flexible, training-free control of diffusion models.\nSound examples can be found at https://DITTO-Music.github.io/web/.\n","authors":["Zachary Novack","Julian McAuley","Taylor Berg-Kirkpatrick","Nicholas J. Bryan"],"pdf_url":"https://arxiv.org/pdf/2401.12179v2.pdf","comment":"Oral at ICML 2024"},{"id":"http://arxiv.org/abs/2401.17505v3","updated":"2024-06-03T17:35:04Z","published":"2024-01-30T23:46:35Z","title":"Arrows of Time for Large Language Models","summary":"  We study the probabilistic modeling performed by Autoregressive Large\nLanguage Models (LLMs) through the angle of time directionality, addressing a\nquestion first raised in (Shannon, 1951). For large enough models, we\nempirically find a time asymmetry in their ability to learn natural language: a\ndifference in the average log-perplexity when trying to predict the next token\nversus when trying to predict the previous one. This difference is at the same\ntime subtle and very consistent across various modalities (language, model\nsize, training time, ...). Theoretically, this is surprising: from an\ninformation-theoretic point of view, there should be no such difference. We\nprovide a theoretical framework to explain how such an asymmetry can appear\nfrom sparsity and computational complexity considerations, and outline a number\nof perspectives opened by our results.\n","authors":["Vassilis Papadopoulos","Jérémie Wenger","Clément Hongler"],"pdf_url":"https://arxiv.org/pdf/2401.17505v3.pdf","comment":"Re-arranged and updated figures. Added experiments. 12 figures, 20\n  pages"},{"id":"http://arxiv.org/abs/2309.11028v3","updated":"2024-06-03T17:22:24Z","published":"2023-09-20T03:15:11Z","title":"The Topology and Geometry of Neural Representations","summary":"  A central question for neuroscience is how to characterize brain\nrepresentations of perceptual and cognitive content. An ideal characterization\nshould distinguish different functional regions with robustness to noise and\nidiosyncrasies of individual brains that do not correspond to computational\ndifferences. Previous studies have characterized brain representations by their\nrepresentational geometry, which is defined by the representational\ndissimilarity matrix (RDM), a summary statistic that abstracts from the roles\nof individual neurons (or responses channels) and characterizes the\ndiscriminability of stimuli. Here we explore a further step of abstraction:\nfrom the geometry to the topology of brain representations. We propose\ntopological representational similarity analysis (tRSA), an extension of\nrepresentational similarity analysis (RSA) that uses a family of\ngeo-topological summary statistics that generalizes the RDM to characterize the\ntopology while de-emphasizing the geometry. We evaluate this new family of\nstatistics in terms of the sensitivity and specificity for model selection\nusing both simulations and fMRI data. In the simulations, the ground truth is a\ndata-generating layer representation in a neural network model and the models\nare the same and other layers in different model instances (trained from\ndifferent random seeds). In fMRI, the ground truth is a visual area and the\nmodels are the same and other areas measured in different subjects. Results\nshow that topology-sensitive characterizations of population codes are robust\nto noise and interindividual variability and maintain excellent sensitivity to\nthe unique representational signatures of different neural network layers and\nbrain regions. These methods enable researchers to calibrate comparisons among\nrepresentations in brains and models to be sensitive to the geometry, the\ntopology, or a combination of both.\n","authors":["Baihan Lin","Nikolaus Kriegeskorte"],"pdf_url":"https://arxiv.org/pdf/2309.11028v3.pdf","comment":"codes: https://github.com/doerlbh/TopologicalRSA"},{"id":"http://arxiv.org/abs/2403.17846v2","updated":"2024-06-03T17:12:25Z","published":"2024-03-26T16:36:43Z","title":"Hierarchical Open-Vocabulary 3D Scene Graphs for Language-Grounded Robot\n  Navigation","summary":"  Recent open-vocabulary robot mapping methods enrich dense geometric maps with\npre-trained visual-language features. While these maps allow for the prediction\nof point-wise saliency maps when queried for a certain language concept,\nlarge-scale environments and abstract queries beyond the object level still\npose a considerable hurdle, ultimately limiting language-grounded robotic\nnavigation. In this work, we present HOV-SG, a hierarchical open-vocabulary 3D\nscene graph mapping approach for language-grounded robot navigation. Leveraging\nopen-vocabulary vision foundation models, we first obtain state-of-the-art\nopen-vocabulary segment-level maps in 3D and subsequently construct a 3D scene\ngraph hierarchy consisting of floor, room, and object concepts, each enriched\nwith open-vocabulary features. Our approach is able to represent multi-story\nbuildings and allows robotic traversal of those using a cross-floor Voronoi\ngraph. HOV-SG is evaluated on three distinct datasets and surpasses previous\nbaselines in open-vocabulary semantic accuracy on the object, room, and floor\nlevel while producing a 75% reduction in representation size compared to dense\nopen-vocabulary maps. In order to prove the efficacy and generalization\ncapabilities of HOV-SG, we showcase successful long-horizon\nlanguage-conditioned robot navigation within real-world multi-storage\nenvironments. We provide code and trial video data at http://hovsg.github.io/.\n","authors":["Abdelrhman Werby","Chenguang Huang","Martin Büchner","Abhinav Valada","Wolfram Burgard"],"pdf_url":"https://arxiv.org/pdf/2403.17846v2.pdf","comment":"Code and video are available at http://hovsg.github.io/"},{"id":"http://arxiv.org/abs/2310.17807v3","updated":"2024-06-03T16:59:37Z","published":"2023-10-26T22:58:19Z","title":"Clover: Closed-Loop Verifiable Code Generation","summary":"  The use of large language models for code generation is a rapidly growing\ntrend in software development. However, without effective methods for ensuring\nthe correctness of generated code, this trend could lead to any number of\nundesirable outcomes. In this paper, we lay out a vision for addressing this\nchallenge: the Clover paradigm, short for Closed-Loop Verifiable Code\nGeneration, which reduces correctness checking to the more accessible problem\nof consistency checking. At the core of Clover lies a checker that performs\nconsistency checks among code, docstrings, and formal annotations. The checker\nis implemented using a novel integration of formal verification tools and large\nlanguage models. We provide a theoretical analysis to support our thesis that\nClover should be effective at consistency checking. We also empirically\ninvestigate its feasibility on a hand-designed dataset (CloverBench) featuring\nannotated Dafny programs at a textbook level of difficulty. Experimental\nresults show that for this dataset, (i) LLMs are reasonably successful at\nautomatically generating formal specifications; and (ii) our consistency\nchecker achieves a promising acceptance rate (up to 87%) for correct instances\nwhile maintaining zero tolerance for incorrect ones (no false positives).\n","authors":["Chuyue Sun","Ying Sheng","Oded Padon","Clark Barrett"],"pdf_url":"https://arxiv.org/pdf/2310.17807v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.08277v5","updated":"2024-06-03T16:48:59Z","published":"2024-02-13T08:12:48Z","title":"Towards Faithful and Robust LLM Specialists for Evidence-Based\n  Question-Answering","summary":"  Advances towards more faithful and traceable answers of Large Language Models\n(LLMs) are crucial for various research and practical endeavors. One avenue in\nreaching this goal is basing the answers on reliable sources. However, this\nEvidence-Based QA has proven to work insufficiently with LLMs in terms of\nciting the correct sources (source quality) and truthfully representing the\ninformation within sources (answer attributability). In this work, we\nsystematically investigate how to robustly fine-tune LLMs for better source\nquality and answer attributability. Specifically, we introduce a data\ngeneration pipeline with automated data quality filters, which can synthesize\ndiversified high-quality training and testing data at scale. We further\nintroduce four test sets to benchmark the robustness of fine-tuned specialist\nmodels. Extensive evaluation shows that fine-tuning on synthetic data improves\nperformance on both in- and out-of-distribution. Furthermore, we show that data\nquality, which can be drastically improved by proposed quality filters, matters\nmore than quantity in improving Evidence-Based QA.\n","authors":["Tobias Schimanski","Jingwei Ni","Mathias Kraus","Elliott Ash","Markus Leippold"],"pdf_url":"https://arxiv.org/pdf/2402.08277v5.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.13784v3","updated":"2024-06-03T16:44:31Z","published":"2024-03-20T17:47:08Z","title":"The Model Openness Framework: Promoting Completeness and Openness for\n  Reproducibility, Transparency, and Usability in Artificial Intelligence","summary":"  Generative AI (GAI) offers unprecedented opportunities for research and\ninnovation, but its commercialization has raised concerns about transparency,\nreproducibility, and safety. Many open GAI models lack the necessary components\nfor full understanding and reproducibility, and some use restrictive licenses\nwhilst claiming to be ``open-source''. To address these concerns, we propose\nthe Model Openness Framework (MOF), a ranked classification system that rates\nmachine learning models based on their completeness and openness, following\nprinciples of open science, open source, open data, and open access. The MOF\nrequires specific components of the model development lifecycle to be included\nand released under appropriate open licenses. This framework aims to prevent\nmisrepresentation of models claiming to be open, guide researchers and\ndevelopers in providing all model components under permissive licenses, and\nhelp individuals and organizations identify models that can be safely adopted\nwithout restrictions. By promoting transparency and reproducibility, the MOF\ncombats ``openwashing'' practices and establishes completeness and openness as\nprimary criteria alongside the core tenets of responsible AI. Wide adoption of\nthe MOF will foster a more open AI ecosystem, benefiting research, innovation,\nand adoption of state-of-the-art models.\n","authors":["Matt White","Ibrahim Haddad","Cailean Osborne","Xiao-Yang Liu Yanglet","Ahmed Abdelmonsef","Sachin Varghese"],"pdf_url":"https://arxiv.org/pdf/2403.13784v3.pdf","comment":"22 pages"},{"id":"http://arxiv.org/abs/2405.14555v4","updated":"2024-06-03T16:43:16Z","published":"2024-05-23T13:35:34Z","title":"Subtle Biases Need Subtler Measures: Dual Metrics for Evaluating\n  Representative and Affinity Bias in Large Language Models","summary":"  Research on Large Language Models (LLMs) has often neglected subtle biases\nthat, although less apparent, can significantly influence the models' outputs\ntoward particular social narratives. This study addresses two such biases\nwithin LLMs: representative bias, which denotes a tendency of LLMs to generate\noutputs that mirror the experiences of certain identity groups, and affinity\nbias, reflecting the models' evaluative preferences for specific narratives or\nviewpoints. We introduce two novel metrics to measure these biases: the\nRepresentative Bias Score (RBS) and the Affinity Bias Score (ABS), and present\nthe Creativity-Oriented Generation Suite (CoGS), a collection of open-ended\ntasks such as short story writing and poetry composition, designed with\ncustomized rubrics to detect these subtle biases. Our analysis uncovers marked\nrepresentative biases in prominent LLMs, with a preference for identities\nassociated with being white, straight, and men. Furthermore, our investigation\nof affinity bias reveals distinctive evaluative patterns within each model,\nakin to `bias fingerprints'. This trend is also seen in human evaluators,\nhighlighting a complex interplay between human and machine bias perceptions.\n","authors":["Abhishek Kumar","Sarfaroz Yunusov","Ali Emami"],"pdf_url":"https://arxiv.org/pdf/2405.14555v4.pdf","comment":"9 pages (excluding references), accepted to ACL 2024 Main Conference"},{"id":"http://arxiv.org/abs/2405.16277v3","updated":"2024-06-03T16:42:55Z","published":"2024-05-25T15:28:22Z","title":"Picturing Ambiguity: A Visual Twist on the Winograd Schema Challenge","summary":"  Large Language Models (LLMs) have demonstrated remarkable success in tasks\nlike the Winograd Schema Challenge (WSC), showcasing advanced textual\ncommon-sense reasoning. However, applying this reasoning to multimodal domains,\nwhere understanding text and images together is essential, remains a\nsubstantial challenge. To address this, we introduce WinoVis, a novel dataset\nspecifically designed to probe text-to-image models on pronoun disambiguation\nwithin multimodal contexts. Utilizing GPT-4 for prompt generation and Diffusion\nAttentive Attribution Maps (DAAM) for heatmap analysis, we propose a novel\nevaluation framework that isolates the models' ability in pronoun\ndisambiguation from other visual processing challenges. Evaluation of\nsuccessive model versions reveals that, despite incremental advancements,\nStable Diffusion 2.0 achieves a precision of 56.7% on WinoVis, only marginally\nsurpassing random guessing. Further error analysis identifies important areas\nfor future research aimed at advancing text-to-image models in their ability to\ninterpret and interact with the complex visual world.\n","authors":["Brendan Park","Madeline Janecek","Naser Ezzati-Jivan","Yifeng Li","Ali Emami"],"pdf_url":"https://arxiv.org/pdf/2405.16277v3.pdf","comment":"9 pages (excluding references), accepted to ACL 2024 Main Conference"},{"id":"http://arxiv.org/abs/2405.16282v3","updated":"2024-06-03T16:41:53Z","published":"2024-05-25T15:42:04Z","title":"Confidence Under the Hood: An Investigation into the\n  Confidence-Probability Alignment in Large Language Models","summary":"  As the use of Large Language Models (LLMs) becomes more widespread,\nunderstanding their self-evaluation of confidence in generated responses\nbecomes increasingly important as it is integral to the reliability of the\noutput of these models. We introduce the concept of Confidence-Probability\nAlignment, that connects an LLM's internal confidence, quantified by token\nprobabilities, to the confidence conveyed in the model's response when\nexplicitly asked about its certainty. Using various datasets and prompting\ntechniques that encourage model introspection, we probe the alignment between\nmodels' internal and expressed confidence. These techniques encompass using\nstructured evaluation scales to rate confidence, including answer options when\nprompting, and eliciting the model's confidence level for outputs it does not\nrecognize as its own. Notably, among the models analyzed, OpenAI's GPT-4 showed\nthe strongest confidence-probability alignment, with an average Spearman's\n$\\hat{\\rho}$ of 0.42, across a wide range of tasks. Our work contributes to the\nongoing efforts to facilitate risk assessment in the application of LLMs and to\nfurther our understanding of model trustworthiness.\n","authors":["Abhishek Kumar","Robert Morabito","Sanzhar Umbet","Jad Kabbara","Ali Emami"],"pdf_url":"https://arxiv.org/pdf/2405.16282v3.pdf","comment":"9 pages (excluding references), accepted to ACL 2024 Main Conference"},{"id":"http://arxiv.org/abs/2402.08845v3","updated":"2024-06-03T16:29:05Z","published":"2024-02-13T23:25:01Z","title":"Feature Attribution with Necessity and Sufficiency via Dual-stage\n  Perturbation Test for Causal Explanation","summary":"  We investigate the problem of explainability for machine learning models,\nfocusing on Feature Attribution Methods (FAMs) that evaluate feature importance\nthrough perturbation tests. Despite their utility, FAMs struggle to distinguish\nthe contributions of different features, when their prediction changes are\nsimilar after perturbation. To enhance FAMs' discriminative power, we introduce\nFeature Attribution with Necessity and Sufficiency (FANS), which find a\nneighborhood of the input such that perturbing samples within this neighborhood\nhave a high Probability of being Necessity and Sufficiency (PNS) cause for the\nchange in predictions, and use this PNS as the importance of the feature.\nSpecifically, FANS compute this PNS via a heuristic strategy for estimating the\nneighborhood and a perturbation test involving two stages (factual and\ninterventional) for counterfactual reasoning. To generate counterfactual\nsamples, we use a resampling-based approach on the observed samples to\napproximate the required conditional distribution. We demonstrate that FANS\noutperforms existing attribution methods on six benchmarks. Please refer to the\nsource code via \\url{https://github.com/DMIRLAB-Group/FANS}.\n","authors":["Xuexin Chen","Ruichu Cai","Zhengting Huang","Yuxuan Zhu","Julien Horwood","Zhifeng Hao","Zijian Li","Jose Miguel Hernandez-Lobato"],"pdf_url":"https://arxiv.org/pdf/2402.08845v3.pdf","comment":"Accepted in the Proceedings of the 41st International Conference on\n  Machine Learning (ICML2024)"},{"id":"http://arxiv.org/abs/2405.06270v3","updated":"2024-06-03T16:23:28Z","published":"2024-05-10T06:52:44Z","title":"XAI4LLM. Let Machine Learning Models and LLMs Collaborate for Enhanced\n  In-Context Learning in Healthcare","summary":"  The integration of Large Language Models (LLMs) into healthcare diagnostics\noffers a promising avenue for clinical decision-making. This study outlines the\ndevelopment of a novel method for zero-shot/few-shot in-context learning (ICL)\nby integrating medical domain knowledge using a multi-layered structured\nprompt. We also explore the efficacy of two communication styles between the\nuser and LLMs: the Numerical Conversational (NC) style, which processes data\nincrementally, and the Natural Language Single-Turn (NL-ST) style, which\nemploys long narrative prompts.\n  Our study systematically evaluates the diagnostic accuracy and risk factors,\nincluding gender bias and false negative rates, using a dataset of 920 patient\nrecords in various few-shot scenarios. Results indicate that traditional\nclinical machine learning (ML) models generally outperform LLMs in zero-shot\nand few-shot settings. However, the performance gap narrows significantly when\nemploying few-shot examples alongside effective explainable AI (XAI) methods as\nsources of domain knowledge. Moreover, with sufficient time and an increased\nnumber of examples, the conversational style (NC) nearly matches the\nperformance of ML models. Most notably, LLMs demonstrate comparable or superior\ncost-sensitive accuracy relative to ML models.\n  This research confirms that, with appropriate domain knowledge and tailored\ncommunication strategies, LLMs can significantly enhance diagnostic processes.\nThe findings highlight the importance of optimizing the number of training\nexamples and communication styles to improve accuracy and reduce biases in LLM\napplications.\n","authors":["Fatemeh Nazary","Yashar Deldjoo","Tommaso Di Noia","Eugenio di Sciascio"],"pdf_url":"https://arxiv.org/pdf/2405.06270v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.10046v2","updated":"2024-06-03T16:14:51Z","published":"2024-02-15T16:07:56Z","title":"How Flawed Is ECE? An Analysis via Logit Smoothing","summary":"  Informally, a model is calibrated if its predictions are correct with a\nprobability that matches the confidence of the prediction. By far the most\ncommon method in the literature for measuring calibration is the expected\ncalibration error (ECE). Recent work, however, has pointed out drawbacks of\nECE, such as the fact that it is discontinuous in the space of predictors. In\nthis work, we ask: how fundamental are these issues, and what are their impacts\non existing results? Towards this end, we completely characterize the\ndiscontinuities of ECE with respect to general probability measures on Polish\nspaces. We then use the nature of these discontinuities to motivate a novel\ncontinuous, easily estimated miscalibration metric, which we term\nLogit-Smoothed ECE (LS-ECE). By comparing the ECE and LS-ECE of pre-trained\nimage classification models, we show in initial experiments that binned ECE\nclosely tracks LS-ECE, indicating that the theoretical pathologies of ECE may\nbe avoidable in practice.\n","authors":["Muthu Chidambaram","Holden Lee","Colin McSwiggen","Semon Rezchikov"],"pdf_url":"https://arxiv.org/pdf/2402.10046v2.pdf","comment":"23 pages, 6 figures"},{"id":"http://arxiv.org/abs/2405.20407v2","updated":"2024-06-03T16:11:03Z","published":"2024-05-30T18:25:19Z","title":"Convolutional L2LFlows: Generating Accurate Showers in Highly Granular\n  Calorimeters Using Convolutional Normalizing Flows","summary":"  In the quest to build generative surrogate models as computationally\nefficient alternatives to rule-based simulations, the quality of the generated\nsamples remains a crucial frontier. So far, normalizing flows have been among\nthe models with the best fidelity. However, as the latent space in such models\nis required to have the same dimensionality as the data space, scaling up\nnormalizing flows to high dimensional datasets is not straightforward. The\nprior L2LFlows approach successfully used a series of separate normalizing\nflows and sequence of conditioning steps to circumvent this problem. In this\nwork, we extend L2LFlows to simulate showers with a 9-times larger profile in\nthe lateral direction. To achieve this, we introduce convolutional layers and\nU-Net-type connections, move from masked autoregressive flows to coupling\nlayers, and demonstrate the successful modelling of showers in the ILD\nElectromagnetic Calorimeter as well as Dataset 3 from the public CaloChallenge\ndataset.\n","authors":["Thorsten Buss","Frank Gaede","Gregor Kasieczka","Claudius Krause","David Shih"],"pdf_url":"https://arxiv.org/pdf/2405.20407v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.17212v4","updated":"2024-06-03T15:57:47Z","published":"2023-05-26T19:14:01Z","title":"Rotational Equilibrium: How Weight Decay Balances Learning Across Neural\n  Networks","summary":"  This study investigates how weight decay affects the update behavior of\nindividual neurons in deep neural networks through a combination of applied\nanalysis and experimentation. Weight decay can cause the expected magnitude and\nangular updates of a neuron's weight vector to converge to a steady state we\ncall rotational equilibrium. These states can be highly homogeneous,\neffectively balancing the average rotation -- a proxy for the effective\nlearning rate -- across different layers and neurons. Our work analyzes these\ndynamics across optimizers like Adam, Lion, and SGD with momentum, offering a\nnew simple perspective on training that elucidates the efficacy of widely used\nbut poorly understood methods in deep learning. We demonstrate how balanced\nrotation plays a key role in the effectiveness of normalization like Weight\nStandardization, as well as that of AdamW over Adam with L2-regularization.\nFinally, we show that explicitly controlling the rotation provides the benefits\nof weight decay while substantially reducing the need for learning rate warmup.\n","authors":["Atli Kosson","Bettina Messmer","Martin Jaggi"],"pdf_url":"https://arxiv.org/pdf/2305.17212v4.pdf","comment":"Accepted to ICML 2024; Code available at https://github.com/epfml/REQ"},{"id":"http://arxiv.org/abs/2402.14991v3","updated":"2024-06-03T15:42:55Z","published":"2024-02-22T22:03:16Z","title":"Quantum Theory and Application of Contextual Optimal Transport","summary":"  Optimal Transport (OT) has fueled machine learning (ML) across many domains.\nWhen paired data measurements $(\\boldsymbol{\\mu}, \\boldsymbol{\\nu})$ are\ncoupled to covariates, a challenging conditional distribution learning setting\narises. Existing approaches for learning a $\\textit{global}$ transport map\nparameterized through a potentially unseen context utilize Neural OT and\nlargely rely on Brenier's theorem. Here, we propose a first-of-its-kind quantum\ncomputing formulation for amortized optimization of contextualized\ntransportation plans. We exploit a direct link between doubly stochastic\nmatrices and unitary operators thus unravelling a natural connection between OT\nand quantum computation. We verify our method (QontOT) on synthetic and real\ndata by predicting variations in cell type distributions conditioned on drug\ndosage. Importantly we conduct a 24-qubit hardware experiment on a task\nchallenging for classical computers and report a performance that cannot be\nmatched with our classical neural OT approach. In sum, this is a first step\ntoward learning to predict contextualized transportation plans through quantum\ncomputing.\n","authors":["Nicola Mariella","Albert Akhriev","Francesco Tacchino","Christa Zoufal","Juan Carlos Gonzalez-Espitia","Benedek Harsanyi","Eugene Koskin","Ivano Tavernelli","Stefan Woerner","Marianna Rapsomaniki","Sergiy Zhuk","Jannis Born"],"pdf_url":"https://arxiv.org/pdf/2402.14991v3.pdf","comment":"ICML 2024"},{"id":"http://arxiv.org/abs/2312.03654v2","updated":"2024-06-03T15:42:45Z","published":"2023-12-06T18:20:46Z","title":"Efficient Inverse Design Optimization through Multi-fidelity\n  Simulations, Machine Learning, and Search Space Reduction Strategies","summary":"  This paper introduces a methodology designed to augment the inverse design\noptimization process in scenarios constrained by limited compute, through the\nstrategic synergy of multi-fidelity evaluations, machine learning models, and\noptimization algorithms. The proposed methodology is analyzed on two distinct\nengineering inverse design problems: airfoil inverse design and the scalar\nfield reconstruction problem. It leverages a machine learning model trained\nwith low-fidelity simulation data, in each optimization cycle, thereby\nproficiently predicting a target variable and discerning whether a\nhigh-fidelity simulation is necessitated, which notably conserves computational\nresources. Additionally, the machine learning model is strategically deployed\nprior to optimization to compress the design space boundaries, thereby further\naccelerating convergence toward the optimal solution. The methodology has been\nemployed to enhance two optimization algorithms, namely Differential Evolution\nand Particle Swarm Optimization. Comparative analyses illustrate performance\nimprovements across both algorithms. Notably, this method is adaptable across\nany inverse design application, facilitating a synergy between a representative\nlow-fidelity ML model, and high-fidelity simulation, and can be seamlessly\napplied across any variety of population-based optimization algorithms.}\n","authors":["Luka Grbcic","Juliane Müller","Wibe Albert de Jong"],"pdf_url":"https://arxiv.org/pdf/2312.03654v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.01712v3","updated":"2024-06-03T15:35:12Z","published":"2024-04-02T07:54:18Z","title":"Efficient and Generalizable Certified Unlearning: A Hessian-free\n  Recollection Approach","summary":"  Machine unlearning strives to uphold the data owners' right to be forgotten\nby enabling models to selectively forget specific data. Recent advances suggest\nprecomputing and storing statistics extracted from second-order information and\nimplementing unlearning through Newton-style updates. However, the theoretical\nanalysis of these works often depends on restrictive assumptions of convexity\nand smoothness, and those mentioned operations on Hessian matrix are extremely\ncostly. As a result, applying these works to high-dimensional models becomes\nchallenging. In this paper, we propose an efficient Hessian-free certified\nunlearning. We propose to maintain a statistical vector for each data, computed\nthrough affine stochastic recursion approximation of the difference between\nretrained and learned models. Our analysis does not involve inverting Hessian\nand thus can be extended to non-convex non-smooth objectives. Under same\nassumptions, we demonstrate advancements of proposed method beyond the\nstate-of-the-art theoretical studies, in terms of generalization, unlearning\nguarantee, deletion capacity, and computation/storage complexity, and we show\nthat the unlearned model of our proposed approach is close to or same as the\nretrained model. Based on the strategy of recollecting statistics for\nforgetting data, we develop an algorithm that achieves near-instantaneous\nunlearning as it only requires a vector addition operation. Experiments\ndemonstrate that the proposed scheme surpasses existing results by orders of\nmagnitude in terms of time/storage costs, while also enhancing accuracy.\n","authors":["Xinbao Qiao","Meng Zhang","Ming Tang","Ermin Wei"],"pdf_url":"https://arxiv.org/pdf/2404.01712v3.pdf","comment":"31 pages, 10 figures"},{"id":"http://arxiv.org/abs/2403.15933v3","updated":"2024-06-03T15:30:52Z","published":"2024-03-23T21:16:56Z","title":"Understanding Domain-Size Generalization in Markov Logic Networks","summary":"  We study the generalization behavior of Markov Logic Networks (MLNs) across\nrelational structures of different sizes. Multiple works have noticed that MLNs\nlearned on a given domain generalize poorly across domains of different sizes.\nThis behavior emerges from a lack of internal consistency within an MLN when\nused across different domain sizes. In this paper, we quantify this\ninconsistency and bound it in terms of the variance of the MLN parameters. The\nparameter variance also bounds the KL divergence between an MLN's marginal\ndistributions taken from different domain sizes. We use these bounds to show\nthat maximizing the data log-likelihood while simultaneously minimizing the\nparameter variance corresponds to two natural notions of generalization across\ndomain sizes. Our theoretical results apply to Exponential Random Graphs and\nother Markov network based relational models. Finally, we observe that\nsolutions known to decrease the variance of the MLN parameters, like\nregularization and Domain-Size Aware MLNs, increase the internal consistency of\nthe MLNs. We empirically verify our results on four different datasets, with\ndifferent methods to control parameter variance, showing that controlling\nparameter variance leads to better generalization.\n","authors":["Florian Chen","Felix Weitkämper","Sagar Malhotra"],"pdf_url":"https://arxiv.org/pdf/2403.15933v3.pdf","comment":"To Appear in Proceedings of ECML 2024-Research Track"},{"id":"http://arxiv.org/abs/2402.05724v2","updated":"2024-06-03T15:29:09Z","published":"2024-02-08T14:54:47Z","title":"Model-Based RL for Mean-Field Games is not Statistically Harder than\n  Single-Agent RL","summary":"  We study the sample complexity of reinforcement learning (RL) in Mean-Field\nGames (MFGs) with model-based function approximation that requires strategic\nexploration to find a Nash Equilibrium policy. We introduce the Partial\nModel-Based Eluder Dimension (P-MBED), a more effective notion to characterize\nthe model class complexity. Notably, P-MBED measures the complexity of the\nsingle-agent model class converted from the given mean-field model class, and\npotentially, can be exponentially lower than the MBED proposed by\n\\citet{huang2023statistical}. We contribute a model elimination algorithm\nfeaturing a novel exploration strategy and establish sample complexity results\npolynomial w.r.t.~P-MBED. Crucially, our results reveal that, under the basic\nrealizability and Lipschitz continuity assumptions, \\emph{learning Nash\nEquilibrium in MFGs is no more statistically challenging than solving a\nlogarithmic number of single-agent RL problems}. We further extend our results\nto Multi-Type MFGs, generalizing from conventional MFGs and involving multiple\ntypes of agents. This extension implies statistical tractability of a broader\nclass of Markov Games through the efficacy of mean-field approximation.\nFinally, inspired by our theoretical algorithm, we present a heuristic approach\nwith improved computational efficiency and empirically demonstrate its\neffectiveness.\n","authors":["Jiawei Huang","Niao He","Andreas Krause"],"pdf_url":"https://arxiv.org/pdf/2402.05724v2.pdf","comment":"ICML 2024; 55 Pages"},{"id":"http://arxiv.org/abs/2311.06103v2","updated":"2024-06-03T15:20:13Z","published":"2023-11-10T15:12:04Z","title":"1-Lipschitz Neural Networks are more expressive with N-Activations","summary":"  A crucial property for achieving secure, trustworthy and interpretable deep\nlearning systems is their robustness: small changes to a system's inputs should\nnot result in large changes to its outputs. Mathematically, this means one\nstrives for networks with a small Lipschitz constant. Several recent works have\nfocused on how to construct such Lipschitz networks, typically by imposing\nconstraints on the weight matrices. In this work, we study an orthogonal\naspect, namely the role of the activation function. We show that commonly used\nactivation functions, such as MaxMin, as well as all piece-wise linear ones\nwith two segments unnecessarily restrict the class of representable functions,\neven in the simplest one-dimensional setting. We furthermore introduce the new\nN-activation function that is provably more expressive than currently popular\nactivation functions. We provide code at\nhttps://github.com/berndprach/NActivation.\n","authors":["Bernd Prach","Christoph H. Lampert"],"pdf_url":"https://arxiv.org/pdf/2311.06103v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.20233v2","updated":"2024-06-03T15:16:26Z","published":"2024-03-29T15:22:03Z","title":"Functional Bilevel Optimization for Machine Learning","summary":"  In this paper, we introduce a new functional point of view on bilevel\noptimization problems for machine learning, where the inner objective is\nminimized over a function space. These types of problems are most often solved\nby using methods developed in the parametric setting, where the inner objective\nis strongly convex with respect to the parameters of the prediction function.\nThe functional point of view does not rely on this assumption and notably\nallows using over-parameterized neural networks as the inner prediction\nfunction. We propose scalable and efficient algorithms for the functional\nbilevel optimization problem and illustrate the benefits of our approach on\ninstrumental regression and reinforcement learning tasks.\n","authors":["Ieva Petrulionyte","Julien Mairal","Michael Arbel"],"pdf_url":"https://arxiv.org/pdf/2403.20233v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.18293v2","updated":"2024-06-03T15:07:01Z","published":"2024-05-28T15:48:27Z","title":"CF-OPT: Counterfactual Explanations for Structured Prediction","summary":"  Optimization layers in deep neural networks have enjoyed a growing popularity\nin structured learning, improving the state of the art on a variety of\napplications. Yet, these pipelines lack interpretability since they are made of\ntwo opaque layers: a highly non-linear prediction model, such as a deep neural\nnetwork, and an optimization layer, which is typically a complex black-box\nsolver. Our goal is to improve the transparency of such methods by providing\ncounterfactual explanations. We build upon variational autoencoders a\nprincipled way of obtaining counterfactuals: working in the latent space leads\nto a natural notion of plausibility of explanations. We finally introduce a\nvariant of the classic loss for VAE training that improves their performance in\nour specific structured context. These provide the foundations of CF-OPT, a\nfirst-order optimization algorithm that can find counterfactual explanations\nfor a broad class of structured learning architectures. Our numerical results\nshow that both close and plausible explanations can be obtained for problems\nfrom the recent literature.\n","authors":["Germain Vivier-Ardisson","Alexandre Forel","Axel Parmentier","Thibaut Vidal"],"pdf_url":"https://arxiv.org/pdf/2405.18293v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.15301v2","updated":"2024-06-03T14:56:28Z","published":"2024-03-22T15:51:39Z","title":"Planning with a Learned Policy Basis to Optimally Solve Complex Tasks","summary":"  Conventional reinforcement learning (RL) methods can successfully solve a\nwide range of sequential decision problems. However, learning policies that can\ngeneralize predictably across multiple tasks in a setting with non-Markovian\nreward specifications is a challenging problem. We propose to use successor\nfeatures to learn a policy basis so that each (sub)policy in it solves a\nwell-defined subproblem. In a task described by a finite state automaton (FSA)\nthat involves the same set of subproblems, the combination of these\n(sub)policies can then be used to generate an optimal solution without\nadditional learning. In contrast to other methods that combine (sub)policies\nvia planning, our method asymptotically attains global optimality, even in\nstochastic environments.\n","authors":["Guillermo Infante","David Kuric","Anders Jonsson","Vicenç Gómez","Herke van Hoof"],"pdf_url":"https://arxiv.org/pdf/2403.15301v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.20448v2","updated":"2024-06-03T14:40:28Z","published":"2024-05-30T19:47:34Z","title":"Knockout: A simple way to handle missing inputs","summary":"  Deep learning models can extract predictive and actionable information from\ncomplex inputs. The richer the inputs, the better these models usually perform.\nHowever, models that leverage rich inputs (e.g., multi-modality) can be\ndifficult to deploy widely, because some inputs may be missing at inference.\nCurrent popular solutions to this problem include marginalization, imputation,\nand training multiple models. Marginalization can obtain calibrated predictions\nbut it is computationally costly and therefore only feasible for low\ndimensional inputs. Imputation may result in inaccurate predictions because it\nemploys point estimates for missing variables and does not work well for high\ndimensional inputs (e.g., images). Training multiple models whereby each model\ntakes different subsets of inputs can work well but requires knowing missing\ninput patterns in advance. Furthermore, training and retaining multiple models\ncan be costly. We propose an efficient way to learn both the conditional\ndistribution using full inputs and the marginal distributions. Our method,\nKnockout, randomly replaces input features with appropriate placeholder values\nduring training. We provide a theoretical justification of Knockout and show\nthat it can be viewed as an implicit marginalization strategy. We evaluate\nKnockout in a wide range of simulations and real-world datasets and show that\nit can offer strong empirical performance.\n","authors":["Minh Nguyen","Batuhan K. Karaman","Heejong Kim","Alan Q. Wang","Fengbei Liu","Mert R. Sabuncu"],"pdf_url":"https://arxiv.org/pdf/2405.20448v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.02416v3","updated":"2024-06-03T14:33:45Z","published":"2024-02-04T09:24:51Z","title":"Aligner: Efficient Alignment by Learning to Correct","summary":"  With the rapid development of large language models (LLMs) and ever-evolving\npractical requirements, finding an efficient and effective alignment method has\nnever been more critical. However, the tension between the complexity of\ncurrent alignment methods and the need for rapid iteration in deployment\nscenarios necessitates the development of a model-agnostic alignment approach\nthat can operate under these constraints. In this paper, we introduce Aligner,\na novel and simple alignment paradigm that learns the correctional residuals\nbetween preferred and dispreferred answers using a small model. Designed as a\nmodel-agnostic, plug-and-play module, Aligner can be directly applied to\nvarious open-source and API-based models with only one-off training, making it\nsuitable for rapid iteration. Notably, Aligner can be applied to any powerful,\nlarge-scale upstream models. Moreover, it can even iteratively bootstrap the\nupstream models using corrected responses as synthetic human preference data,\nbreaking through the model's performance ceiling. Our experiments demonstrate\nperformance improvements by deploying the same Aligner model across 11\ndifferent LLMs, evaluated on the 3H dimensions (helpfulness, harmlessness, and\nhonesty). Specifically, Aligner-7B has achieved an average improvement of\n68.9\\% in helpfulness and 23.8\\% in harmlessness across the tested LLMs while\nalso effectively reducing hallucination. In the Alpaca-Eval leaderboard,\nstacking Aligner-2B on GPT-4 Turbo improved its LC Win Rate from 55.0\\% to\n58.3\\%, surpassing GPT-4 Omni's 57.5\\% Win Rate (community report).\n","authors":["Jiaming Ji","Boyuan Chen","Hantao Lou","Donghai Hong","Borong Zhang","Xuehai Pan","Juntao Dai","Tianyi Qiu","Yaodong Yang"],"pdf_url":"https://arxiv.org/pdf/2402.02416v3.pdf","comment":"29 pages"},{"id":"http://arxiv.org/abs/2405.06418v2","updated":"2024-06-03T14:27:59Z","published":"2024-05-10T12:03:53Z","title":"PAC-Bayesian Generalization Bounds for Knowledge Graph Representation\n  Learning","summary":"  While a number of knowledge graph representation learning (KGRL) methods have\nbeen proposed over the past decade, very few theoretical analyses have been\nconducted on them. In this paper, we present the first PAC-Bayesian\ngeneralization bounds for KGRL methods. To analyze a broad class of KGRL\nmodels, we propose a generic framework named ReED (Relation-aware\nEncoder-Decoder), which consists of a relation-aware message passing encoder\nand a triplet classification decoder. Our ReED framework can express at least\n15 different existing KGRL models, including not only graph neural\nnetwork-based models such as R-GCN and CompGCN but also shallow-architecture\nmodels such as RotatE and ANALOGY. Our generalization bounds for the ReED\nframework provide theoretical grounds for the commonly used tricks in KGRL,\ne.g., parameter-sharing and weight normalization schemes, and guide desirable\ndesign choices for practical KGRL methods. We empirically show that the\ncritical factors in our generalization bounds can explain actual generalization\nerrors on three real-world knowledge graphs.\n","authors":["Jaejun Lee","Minsung Hwang","Joyce Jiyoung Whang"],"pdf_url":"https://arxiv.org/pdf/2405.06418v2.pdf","comment":"32 pages, 3 figures, 4 tables, The 41st International Conference on\n  Machine Learning (ICML 2024)"},{"id":"http://arxiv.org/abs/2402.07723v2","updated":"2024-06-03T14:20:34Z","published":"2024-02-12T15:35:32Z","title":"Generalization Bounds for Heavy-Tailed SDEs through the Fractional\n  Fokker-Planck Equation","summary":"  Understanding the generalization properties of heavy-tailed stochastic\noptimization algorithms has attracted increasing attention over the past years.\nWhile illuminating interesting aspects of stochastic optimizers by using\nheavy-tailed stochastic differential equations as proxies, prior works either\nprovided expected generalization bounds, or introduced non-computable\ninformation theoretic terms. Addressing these drawbacks, in this work, we prove\nhigh-probability generalization bounds for heavy-tailed SDEs which do not\ncontain any nontrivial information theoretic terms. To achieve this goal, we\ndevelop new proof techniques based on estimating the entropy flows associated\nwith the so-called fractional Fokker-Planck equation (a partial differential\nequation that governs the evolution of the distribution of the corresponding\nheavy-tailed SDE). In addition to obtaining high-probability bounds, we show\nthat our bounds have a better dependence on the dimension of parameters as\ncompared to prior art. Our results further identify a phase transition\nphenomenon, which suggests that heavy tails can be either beneficial or harmful\ndepending on the problem structure. We support our theory with experiments\nconducted in a variety of settings.\n","authors":["Benjamin Dupuis","Umut Şimşekli"],"pdf_url":"https://arxiv.org/pdf/2402.07723v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.21061v2","updated":"2024-06-03T14:20:27Z","published":"2024-05-31T17:50:27Z","title":"Graph External Attention Enhanced Transformer","summary":"  The Transformer architecture has recently gained considerable attention in\nthe field of graph representation learning, as it naturally overcomes several\nlimitations of Graph Neural Networks (GNNs) with customized attention\nmechanisms or positional and structural encodings. Despite making some\nprogress, existing works tend to overlook external information of graphs,\nspecifically the correlation between graphs. Intuitively, graphs with similar\nstructures should have similar representations. Therefore, we propose Graph\nExternal Attention (GEA) -- a novel attention mechanism that leverages multiple\nexternal node/edge key-value units to capture inter-graph correlations\nimplicitly. On this basis, we design an effective architecture called Graph\nExternal Attention Enhanced Transformer (GEAET), which integrates local\nstructure and global interaction information for more comprehensive graph\nrepresentations. Extensive experiments on benchmark datasets demonstrate that\nGEAET achieves state-of-the-art empirical performance. The source code is\navailable for reproducibility at: https://github.com/icm1018/GEAET.\n","authors":["Jianqing Liang","Min Chen","Jiye Liang"],"pdf_url":"https://arxiv.org/pdf/2405.21061v2.pdf","comment":"In Proceedings of ICML 2024"},{"id":"http://arxiv.org/abs/2306.04848v4","updated":"2024-06-03T14:18:29Z","published":"2023-06-08T00:56:33Z","title":"Interpreting and Improving Diffusion Models from an Optimization\n  Perspective","summary":"  Denoising is intuitively related to projection. Indeed, under the manifold\nhypothesis, adding random noise is approximately equivalent to orthogonal\nperturbation. Hence, learning to denoise is approximately learning to project.\nIn this paper, we use this observation to interpret denoising diffusion models\nas approximate gradient descent applied to the Euclidean distance function. We\nthen provide straight-forward convergence analysis of the DDIM sampler under\nsimple assumptions on the projection error of the denoiser. Finally, we propose\na new gradient-estimation sampler, generalizing DDIM using insights from our\ntheoretical results. In as few as 5-10 function evaluations, our sampler\nachieves state-of-the-art FID scores on pretrained CIFAR-10 and CelebA models\nand can generate high quality samples on latent diffusion models.\n","authors":["Frank Permenter","Chenyang Yuan"],"pdf_url":"https://arxiv.org/pdf/2306.04848v4.pdf","comment":"24 pages, 9 figures, 4 tables. To appear in ICML 2024"},{"id":"http://arxiv.org/abs/2402.08567v2","updated":"2024-06-03T14:15:03Z","published":"2024-02-13T16:06:17Z","title":"Agent Smith: A Single Image Can Jailbreak One Million Multimodal LLM\n  Agents Exponentially Fast","summary":"  A multimodal large language model (MLLM) agent can receive instructions,\ncapture images, retrieve histories from memory, and decide which tools to use.\nNonetheless, red-teaming efforts have revealed that adversarial images/prompts\ncan jailbreak an MLLM and cause unaligned behaviors. In this work, we report an\neven more severe safety issue in multi-agent environments, referred to as\ninfectious jailbreak. It entails the adversary simply jailbreaking a single\nagent, and without any further intervention from the adversary, (almost) all\nagents will become infected exponentially fast and exhibit harmful behaviors.\nTo validate the feasibility of infectious jailbreak, we simulate multi-agent\nenvironments containing up to one million LLaVA-1.5 agents, and employ\nrandomized pair-wise chat as a proof-of-concept instantiation for multi-agent\ninteraction. Our results show that feeding an (infectious) adversarial image\ninto the memory of any randomly chosen agent is sufficient to achieve\ninfectious jailbreak. Finally, we derive a simple principle for determining\nwhether a defense mechanism can provably restrain the spread of infectious\njailbreak, but how to design a practical defense that meets this principle\nremains an open question to investigate. Our project page is available at\nhttps://sail-sg.github.io/Agent-Smith/.\n","authors":["Xiangming Gu","Xiaosen Zheng","Tianyu Pang","Chao Du","Qian Liu","Ye Wang","Jing Jiang","Min Lin"],"pdf_url":"https://arxiv.org/pdf/2402.08567v2.pdf","comment":"ICML 2024"},{"id":"http://arxiv.org/abs/2402.16801v2","updated":"2024-06-03T14:12:27Z","published":"2024-02-26T18:19:07Z","title":"Craftax: A Lightning-Fast Benchmark for Open-Ended Reinforcement\n  Learning","summary":"  Benchmarks play a crucial role in the development and analysis of\nreinforcement learning (RL) algorithms. We identify that existing benchmarks\nused for research into open-ended learning fall into one of two categories.\nEither they are too slow for meaningful research to be performed without\nenormous computational resources, like Crafter, NetHack and Minecraft, or they\nare not complex enough to pose a significant challenge, like Minigrid and\nProcgen. To remedy this, we first present Craftax-Classic: a ground-up rewrite\nof Crafter in JAX that runs up to 250x faster than the Python-native original.\nA run of PPO using 1 billion environment interactions finishes in under an hour\nusing only a single GPU and averages 90% of the optimal reward. To provide a\nmore compelling challenge we present the main Craftax benchmark, a significant\nextension of the Crafter mechanics with elements inspired from NetHack. Solving\nCraftax requires deep exploration, long term planning and memory, as well as\ncontinual adaptation to novel situations as more of the world is discovered. We\nshow that existing methods including global and episodic exploration, as well\nas unsupervised environment design fail to make material progress on the\nbenchmark. We believe that Craftax can for the first time allow researchers to\nexperiment in a complex, open-ended environment with limited computational\nresources.\n","authors":["Michael Matthews","Michael Beukman","Benjamin Ellis","Mikayel Samvelyan","Matthew Jackson","Samuel Coward","Jakob Foerster"],"pdf_url":"https://arxiv.org/pdf/2402.16801v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.11742v2","updated":"2024-06-03T14:09:10Z","published":"2024-02-18T23:59:54Z","title":"Balanced Data, Imbalanced Spectra: Unveiling Class Disparities with\n  Spectral Imbalance","summary":"  Classification models are expected to perform equally well for different\nclasses, yet in practice, there are often large gaps in their performance. This\nissue of class bias is widely studied in cases of datasets with sample\nimbalance, but is relatively overlooked in balanced datasets. In this work, we\nintroduce the concept of spectral imbalance in features as a potential source\nfor class disparities and study the connections between spectral imbalance and\nclass bias in both theory and practice. To build the connection between\nspectral imbalance and class gap, we develop a theoretical framework for\nstudying class disparities and derive exact expressions for the per-class error\nin a high-dimensional mixture model setting. We then study this phenomenon in\n11 different state-of-the-art pretrained encoders and show how our proposed\nframework can be used to compare the quality of encoders, as well as evaluate\nand combine data augmentation strategies to mitigate the issue. Our work sheds\nlight on the class-dependent effects of learning, and provides new insights\ninto how state-of-the-art pretrained features may have unknown biases that can\nbe diagnosed through their spectra.\n","authors":["Chiraag Kaushik","Ran Liu","Chi-Heng Lin","Amrit Khera","Matthew Y Jin","Wenrui Ma","Vidya Muthukumar","Eva L Dyer"],"pdf_url":"https://arxiv.org/pdf/2402.11742v2.pdf","comment":"25 pages, 9 figures"},{"id":"http://arxiv.org/abs/2402.02827v2","updated":"2024-06-03T13:51:16Z","published":"2024-02-05T09:24:52Z","title":"PowerGraph: A power grid benchmark dataset for graph neural networks","summary":"  Power grids are critical infrastructures of paramount importance to modern\nsociety and, therefore, engineered to operate under diverse conditions and\nfailures. The ongoing energy transition poses new challenges for the\ndecision-makers and system operators. Therefore, we must develop grid analysis\nalgorithms to ensure reliable operations. These key tools include power flow\nanalysis and system security analysis, both needed for effective operational\nand strategic planning. The literature review shows a growing trend of machine\nlearning (ML) models that perform these analyses effectively. In particular,\nGraph Neural Networks (GNNs) stand out in such applications because of the\ngraph-based structure of power grids. However, there is a lack of publicly\navailable graph datasets for training and benchmarking ML models in electrical\npower grid applications. First, we present PowerGraph, which comprises\nGNN-tailored datasets for i) power flows, ii) optimal power flows, and iii)\ncascading failure analyses of power grids. Second, we provide ground-truth\nexplanations for the cascading failure analysis. Finally, we perform a complete\nbenchmarking of GNN methods for node-level and graph-level tasks and\nexplainability. Overall, PowerGraph is a multifaceted GNN dataset for diverse\ntasks that includes power flow and fault scenarios with real-world\nexplanations, providing a valuable resource for developing improved GNN models\nfor node-level, graph-level tasks and explainability methods in power system\nmodeling. The dataset is available at\nhttps://figshare.com/articles/dataset/PowerGraph/22820534 and the code at\nhttps://github.com/PowerGraph-Datasets.\n","authors":["Anna Varbella","Kenza Amara","Blazhe Gjorgiev","Mennatallah El-Assady","Giovanni Sansavini"],"pdf_url":"https://arxiv.org/pdf/2402.02827v2.pdf","comment":"21 pages, 8 figures, conference paper"},{"id":"http://arxiv.org/abs/2403.19289v2","updated":"2024-06-03T13:49:20Z","published":"2024-03-28T10:19:36Z","title":"Uplift Modeling Under Limited Supervision","summary":"  Estimating causal effects in e-commerce tends to involve costly treatment\nassignments which can be impractical in large-scale settings. Leveraging\nmachine learning to predict such treatment effects without actual intervention\nis a standard practice to diminish the risk. However, existing methods for\ntreatment effect prediction tend to rely on training sets of substantial size,\nwhich are built from real experiments and are thus inherently risky to create.\nIn this work we propose a graph neural network to diminish the required\ntraining set size, relying on graphs that are common in e-commerce data.\nSpecifically, we view the problem as node regression with a restricted number\nof labeled instances, develop a two-model neural architecture akin to previous\ncausal effect estimators, and test varying message-passing layers for encoding.\nFurthermore, as an extra step, we combine the model with an acquisition\nfunction to guide the creation of the training set in settings with extremely\nlow experimental budget. The framework is flexible since each step can be used\nseparately with other models or treatment policies. The experiments on real\nlarge-scale networks indicate a clear advantage of our methodology over the\nstate of the art, which in many cases performs close to random, underlining the\nneed for models that can generalize with limited supervision to reduce\nexperimental risks.\n","authors":["George Panagopoulos","Daniele Malitesta","Fragkiskos D. Malliaros","Jun Pang"],"pdf_url":"https://arxiv.org/pdf/2403.19289v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2208.10134v3","updated":"2024-06-03T13:48:59Z","published":"2022-08-22T08:23:53Z","title":"Machine Learning with Confidential Computing: A Systematization of\n  Knowledge","summary":"  Privacy and security challenges in Machine Learning (ML) have become\nincreasingly severe, along with ML's pervasive development and the recent\ndemonstration of large attack surfaces. As a mature system-oriented approach,\nConfidential Computing has been utilized in both academia and industry to\nmitigate privacy and security issues in various ML scenarios. In this paper,\nthe conjunction between ML and Confidential Computing is investigated. We\nsystematize the prior work on Confidential Computing-assisted ML techniques\nthat provide i) confidentiality guarantees and ii) integrity assurances, and\ndiscuss their advanced features and drawbacks. Key challenges are further\nidentified, and we provide dedicated analyses of the limitations in existing\nTrusted Execution Environment (TEE) systems for ML use cases. Finally,\nprospective works are discussed, including grounded privacy definitions for\nclosed-loop protection, partitioned executions of efficient ML, dedicated\nTEE-assisted designs for ML, TEE-aware ML, and ML full pipeline guarantees. By\nproviding these potential solutions in our systematization of knowledge, we aim\nto build the bridge to help achieve a much stronger TEE-enabled ML for privacy\nguarantees without introducing computation and system costs.\n","authors":["Fan Mo","Zahra Tarkhani","Hamed Haddadi"],"pdf_url":"https://arxiv.org/pdf/2208.10134v3.pdf","comment":"Survey paper, 37 pages, accepted to ACM Computing Surveys"},{"id":"http://arxiv.org/abs/2405.07839v2","updated":"2024-06-03T13:48:52Z","published":"2024-05-13T15:25:03Z","title":"Constrained Exploration via Reflected Replica Exchange Stochastic\n  Gradient Langevin Dynamics","summary":"  Replica exchange stochastic gradient Langevin dynamics (reSGLD) is an\neffective sampler for non-convex learning in large-scale datasets. However, the\nsimulation may encounter stagnation issues when the high-temperature chain\ndelves too deeply into the distribution tails. To tackle this issue, we propose\nreflected reSGLD (r2SGLD): an algorithm tailored for constrained non-convex\nexploration by utilizing reflection steps within a bounded domain.\nTheoretically, we observe that reducing the diameter of the domain enhances\nmixing rates, exhibiting a $\\textit{quadratic}$ behavior. Empirically, we test\nits performance through extensive experiments, including identifying dynamical\nsystems with physical constraints, simulations of constrained multi-modal\ndistributions, and image classification tasks. The theoretical and empirical\nfindings highlight the crucial role of constrained exploration in improving the\nsimulation efficiency.\n","authors":["Haoyang Zheng","Hengrong Du","Qi Feng","Wei Deng","Guang Lin"],"pdf_url":"https://arxiv.org/pdf/2405.07839v2.pdf","comment":"28 pages, 13 figures"},{"id":"http://arxiv.org/abs/2311.18639v2","updated":"2024-06-03T13:45:44Z","published":"2023-11-30T15:46:22Z","title":"Targeted Reduction of Causal Models","summary":"  Why does a phenomenon occur? Addressing this question is central to most\nscientific inquiries and often relies on simulations of scientific models. As\nmodels become more intricate, deciphering the causes behind phenomena in\nhigh-dimensional spaces of interconnected variables becomes increasingly\nchallenging. Causal Representation Learning (CRL) offers a promising avenue to\nuncover interpretable causal patterns within these simulations through an\ninterventional lens. However, developing general CRL frameworks suitable for\npractical applications remains an open challenge. We introduce Targeted Causal\nReduction (TCR), a method for condensing complex intervenable models into a\nconcise set of causal factors that explain a specific target phenomenon. We\npropose an information theoretic objective to learn TCR from interventional\ndata of simulations, establish identifiability for continuous variables under\nshift interventions and present a practical algorithm for learning TCRs. Its\nability to generate interpretable high-level explanations from complex models\nis demonstrated on toy and mechanical systems, illustrating its potential to\nassist scientists in the study of complex phenomena in a broad range of\ndisciplines.\n","authors":["Armin Kekić","Bernhard Schölkopf","Michel Besserve"],"pdf_url":"https://arxiv.org/pdf/2311.18639v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.15479v2","updated":"2024-06-03T13:43:52Z","published":"2023-06-27T13:57:16Z","title":"Predictive Coding beyond Correlations","summary":"  Recently, there has been extensive research on the capabilities of\nbiologically plausible algorithms. In this work, we show how one of such\nalgorithms, called predictive coding, is able to perform causal inference\ntasks. First, we show how a simple change in the inference process of\npredictive coding enables to compute interventions without the need to mutilate\nor redefine a causal graph. Then, we explore applications in cases where the\ngraph is unknown, and has to be inferred from observational data. Empirically,\nwe show how such findings can be used to improve the performance of predictive\ncoding in image classification tasks, and conclude that such models are able to\nperform simple end-to-end causal inference tasks.\n","authors":["Tommaso Salvatori","Luca Pinchetti","Amine M'Charrak","Beren Millidge","Thomas Lukasiewicz"],"pdf_url":"https://arxiv.org/pdf/2306.15479v2.pdf","comment":"44 Pages, 24 Figures. Changed title and abstract, following the ICML\n  accepted version"},{"id":"http://arxiv.org/abs/2401.08381v2","updated":"2024-06-03T13:40:44Z","published":"2024-01-16T14:11:54Z","title":"Robotic Imitation of Human Actions","summary":"  Imitation can allow us to quickly gain an understanding of a new task.\nThrough a demonstration, we can gain direct knowledge about which actions need\nto be performed and which goals they have. In this paper, we introduce a new\napproach to imitation learning that tackles the challenges of a robot imitating\na human, such as the change in perspective and body schema. Our approach can\nuse a single human demonstration to abstract information about the demonstrated\ntask, and use that information to generalise and replicate it. We facilitate\nthis ability by a new integration of two state-of-the-art methods: a diffusion\naction segmentation model to abstract temporal information from the\ndemonstration and an open vocabulary object detector for spatial information.\nFurthermore, we refine the abstracted information and use symbolic reasoning to\ncreate an action plan utilising inverse kinematics, to allow the robot to\nimitate the demonstrated action.\n","authors":["Josua Spisak","Matthias Kerzel","Stefan Wermter"],"pdf_url":"https://arxiv.org/pdf/2401.08381v2.pdf","comment":"Accepted at the ICDL 2024"},{"id":"http://arxiv.org/abs/2401.07039v2","updated":"2024-06-03T13:37:50Z","published":"2024-01-13T10:56:34Z","title":"Quantum Generative Diffusion Model: A Fully Quantum-Mechanical Model for\n  Generating Quantum State Ensemble","summary":"  Classical diffusion models have shown superior generative results and have\nbeen applied to many problems. Exploring these models in the quantum domain can\nadvance the field of quantum generative learning. In this paper, we introduce\nthe Quantum Generative Diffusion Model (QGDM), a simple and elegant quantum\ncounterpart of classical diffusion models.\n  The core idea of QGDM is that any target quantum state can be transformed\ninto a completely mixed state, which has the highest entropy and maximum\nuncertainty about the system, through a non-unitary forward process.\nSubsequently, a trainable backward process can be used to recover the target\nstate from the completely mixed state. The design requirements for QGDM's\nbackward process include ensuring non-unitarity while maintaining a low number\nof parameters. To achieve this, we introduce partial trace operations in the\nbackward process to enforce non-unitary. Additionally, we control the number of\ntrainable parameters by using a parameter-sharing strategy and incorporating\ntemporal information as an input in the backward process. Furthermore, we\nintroduce a resource-efficient version of QGDM, which reduces the number of\nauxiliary qubits while preserving impressive generative capabilities.\n  Our proposed models exhibit better convergence performance than Quantum\nGenerative Adversarial Networks (QGANs) because our models optimize a convex\ndistance function using gradient descent. Comparative results with QGANs\ndemonstrate the effectiveness of our models in generating both pure and mixed\nquantum states. Notably, our models achieve 53.03% higher fidelity in\nmixed-state generation tasks compared to QGANs. These results highlight the\npotential of the proposed models to tackle challenging quantum generation\ntasks.\n","authors":["Chuangtao Chen","Qinglin Zhao","MengChu Zhou","Zhimin He","Zhili Sun","Haozhen Situ"],"pdf_url":"https://arxiv.org/pdf/2401.07039v2.pdf","comment":"Comments are welcome"},{"id":"http://arxiv.org/abs/2306.04974v2","updated":"2024-06-03T13:30:28Z","published":"2023-06-08T07:05:36Z","title":"Conservative Prediction via Data-Driven Confidence Minimization","summary":"  In safety-critical applications of machine learning, it is often desirable\nfor a model to be conservative, abstaining from making predictions on unknown\ninputs which are not well-represented in the training data. However, detecting\nunknown examples is challenging, as it is impossible to anticipate all\npotential inputs at test time. To address this, prior work (Hendrycks et al.,\n2018) minimizes model confidence on an auxiliary outlier dataset carefully\ncurated to be disjoint from the training distribution. We theoretically analyze\nthe choice of auxiliary dataset for confidence minimization, revealing two\nactionable insights: (1) if the auxiliary set contains unknown examples similar\nto those seen at test time, confidence minimization leads to provable detection\nof unknown test examples, and (2) if the first condition is satisfied, it is\nunnecessary to filter out known examples for out-of-distribution (OOD)\ndetection. Motivated by these guidelines, we propose the Data-Driven Confidence\nMinimization (DCM) framework, which minimizes confidence on an uncertainty\ndataset. We apply DCM to two problem settings in which conservative prediction\nis paramount -- selective classification and OOD detection -- and provide a\nrealistic way to gather uncertainty data for each setting. In our experiments,\nDCM consistently outperforms existing selective classification approaches on 4\ndatasets when tested on unseen distributions and outperforms state-of-the-art\nOOD detection methods on 12 ID-OOD dataset pairs, reducing FPR (at TPR $95\\%$)\nby $6.3\\%$ and $58.1\\%$ on CIFAR-10 and CIFAR-100 compared to Outlier Exposure.\n","authors":["Caroline Choi","Fahim Tajwar","Yoonho Lee","Huaxiu Yao","Ananya Kumar","Chelsea Finn"],"pdf_url":"https://arxiv.org/pdf/2306.04974v2.pdf","comment":"Transactions on Machine Learning Research (TMLR), 2024"},{"id":"http://arxiv.org/abs/2402.04050v2","updated":"2024-06-03T13:22:12Z","published":"2024-02-06T14:53:19Z","title":"Connecting the Dots: Collaborative Fine-tuning for Black-Box\n  Vision-Language Models","summary":"  With the emergence of pretrained vision-language models (VLMs), considerable\nefforts have been devoted to fine-tuning them for downstream tasks. Despite the\nprogress made in designing efficient fine-tuning methods, such methods require\naccess to the model's parameters, which can be challenging as model owners\noften opt to provide their models as a black box to safeguard model ownership.\nThis paper proposes a \\textbf{C}ollabo\\textbf{ra}tive\n\\textbf{F}ine-\\textbf{T}uning (\\textbf{CraFT}) approach for fine-tuning\nblack-box VLMs to downstream tasks, where one only has access to the input\nprompts and the output predictions of the model. CraFT comprises two modules, a\nprompt generation module for learning text prompts and a prediction refinement\nmodule for enhancing output predictions in residual style. Additionally, we\nintroduce an auxiliary prediction-consistent loss to promote consistent\noptimization across these modules. These modules are optimized by a novel\ncollaborative training algorithm. Extensive experiments on few-shot\nclassification over 15 datasets demonstrate the superiority of CraFT. The\nresults show that CraFT achieves a decent gain of about 12\\% with 16-shot\ndatasets and only 8,000 queries. Moreover, CraFT trains faster and uses only\nabout 1/80 of the memory footprint for deployment, while sacrificing only\n1.62\\% compared to the white-box method. Our code is publicly available at\nhttps://github.com/mrflogs/CraFT .\n","authors":["Zhengbo Wang","Jian Liang","Ran He","Zilei Wang","Tieniu Tan"],"pdf_url":"https://arxiv.org/pdf/2402.04050v2.pdf","comment":"Accepted by ICML 2024"},{"id":"http://arxiv.org/abs/2402.14029v2","updated":"2024-06-03T13:12:18Z","published":"2024-02-20T03:14:45Z","title":"Partial Search in a Frozen Network is Enough to Find a Strong Lottery\n  Ticket","summary":"  Randomly initialized dense networks contain subnetworks that achieve high\naccuracy without weight learning -- strong lottery tickets (SLTs). Recently,\nGadhikar et al. (2023) demonstrated that SLTs can also be found within a\nrandomly pruned source network, thus reducing the SLT search space. However,\nthis limits the search to SLTs that are even sparser than the source, leading\nto worse accuracy due to unintentionally high sparsity. This paper proposes a\nmethod that reduces the SLT search space by an arbitrary ratio independent of\nthe desired SLT sparsity. A random subset of the initial weights is excluded\nfrom the search space by freezing it -- i.e., by either permanently pruning\nthem or locking them as a fixed part of the SLT. In addition to reducing search\nspace, the proposed random freezing can also provide the benefit of reducing\nthe model size for inference. Furthermore, experimental results show that the\nproposed method finds SLTs with better accuracy-to-model size trade-off than\nthe SLTs obtained from dense or randomly pruned source networks. In particular,\nthe SLTs found in Frozen ResNets on image classification using ImageNet\nsignificantly improve the accuracy-to-search space and accuracy-to-model size\ntrade-offs over SLTs within dense (non-freezing) or sparse (non-locking) random\nnetworks.\n","authors":["Hikari Otsuka","Daiki Chijiwa","Ángel López García-Arias","Yasuyuki Okoshi","Kazushi Kawamura","Thiem Van Chu","Daichi Fujiki","Susumu Takeuchi","Masato Motomura"],"pdf_url":"https://arxiv.org/pdf/2402.14029v2.pdf","comment":"v2: Updates include additional experiments and revisions of some\n  experiments"},{"id":"http://arxiv.org/abs/2210.04872v3","updated":"2024-06-03T13:07:57Z","published":"2022-10-10T17:45:37Z","title":"Sequential Neural Score Estimation: Likelihood-Free Inference with\n  Conditional Score Based Diffusion Models","summary":"  We introduce Sequential Neural Posterior Score Estimation (SNPSE), a\nscore-based method for Bayesian inference in simulator-based models. Our\nmethod, inspired by the remarkable success of score-based methods in generative\nmodelling, leverages conditional score-based diffusion models to generate\nsamples from the posterior distribution of interest. The model is trained using\nan objective function which directly estimates the score of the posterior. We\nembed the model into a sequential training procedure, which guides simulations\nusing the current approximation of the posterior at the observation of\ninterest, thereby reducing the simulation cost. We also introduce several\nalternative sequential approaches, and discuss their relative merits. We then\nvalidate our method, as well as its amortised, non-sequential, variant on\nseveral numerical examples, demonstrating comparable or superior performance to\nexisting state-of-the-art methods such as Sequential Neural Posterior\nEstimation (SNPE).\n","authors":["Louis Sharrock","Jack Simons","Song Liu","Mark Beaumont"],"pdf_url":"https://arxiv.org/pdf/2210.04872v3.pdf","comment":"ICML 2024"},{"id":"http://arxiv.org/abs/2402.02805v2","updated":"2024-06-03T13:07:06Z","published":"2024-02-05T08:26:33Z","title":"Graph-enhanced Large Language Models in Asynchronous Plan Reasoning","summary":"  Planning is a fundamental property of human intelligence. Reasoning about\nasynchronous plans is challenging since it requires sequential and parallel\nplanning to optimize time costs. Can large language models (LLMs) succeed at\nthis task? Here, we present the first large-scale study investigating this\nquestion. We find that a representative set of closed and open-source LLMs,\nincluding GPT-4 and LLaMA-2, behave poorly when not supplied with illustrations\nabout the task-solving process in our benchmark AsyncHow. We propose a novel\ntechnique called Plan Like a Graph (PLaG) that combines graphs with natural\nlanguage prompts and achieves state-of-the-art results. We show that although\nPLaG can boost model performance, LLMs still suffer from drastic degradation\nwhen task complexity increases, highlighting the limits of utilizing LLMs for\nsimulating digital devices. We see our study as an exciting step towards using\nLLMs as efficient autonomous agents. Our code and data are available at\nhttps://github.com/fangru-lin/graph-llm-asynchow-plan.\n","authors":["Fangru Lin","Emanuele La Malfa","Valentin Hofmann","Elle Michelle Yang","Anthony Cohn","Janet B. Pierrehumbert"],"pdf_url":"https://arxiv.org/pdf/2402.02805v2.pdf","comment":"Accepted at ICML-2024"},{"id":"http://arxiv.org/abs/2305.12639v2","updated":"2024-06-03T13:06:52Z","published":"2023-05-22T02:22:14Z","title":"Accelerating Graph Neural Networks via Edge Pruning for Power Allocation\n  in Wireless Networks","summary":"  Graph Neural Networks (GNNs) have recently emerged as a promising approach to\ntackling power allocation problems in wireless networks. Since unpaired\ntransmitters and receivers are often spatially distant, the distance-based\nthreshold is proposed to reduce the computation time by excluding or including\nthe channel state information in GNNs. In this paper, we are the first to\nintroduce a neighbour-based threshold approach to GNNs to reduce the time\ncomplexity. Furthermore, we conduct a comprehensive analysis of both\ndistance-based and neighbour-based thresholds and provide recommendations for\nselecting the appropriate value in different communication channel scenarios.\nWe design the corresponding neighbour-based Graph Neural Networks (N-GNN) with\nthe aim of allocating transmit powers to maximise the network throughput. Our\nresults show that our proposed N-GNN offer significant advantages in terms of\nreducing time complexity while preserving strong performance and generalisation\ncapacity. Besides, we show that by choosing a suitable threshold, the time\ncomplexity is reduced from O(|V|^2) to O(|V|), where |V| is the total number of\ntransceiver pairs.\n","authors":["Lili Chen","Jingge Zhu","Jamie Evans"],"pdf_url":"https://arxiv.org/pdf/2305.12639v2.pdf","comment":"Published in 2023 IEEE Global Communications Conference Workshops (GC\n  Workshops)"},{"id":"http://arxiv.org/abs/2405.11349v2","updated":"2024-06-03T12:55:58Z","published":"2024-05-18T17:38:25Z","title":"Unlock the Power of Algorithm Features: A Generalization Analysis for\n  Algorithm Selection","summary":"  In the algorithm selection research, the discussion surrounding algorithm\nfeatures has been significantly overshadowed by the emphasis on problem\nfeatures. Although a few empirical studies have yielded evidence regarding the\neffectiveness of algorithm features, the potential benefits of incorporating\nalgorithm features into algorithm selection models and their suitability for\ndifferent scenarios remain unclear. In this paper, we address this gap by\nproposing the first provable guarantee for algorithm selection based on\nalgorithm features, taking a generalization perspective. We analyze the\nbenefits and costs associated with algorithm features and investigate how the\ngeneralization error is affected by different factors. Specifically, we examine\nadaptive and predefined algorithm features under transductive and inductive\nlearning paradigms, respectively, and derive upper bounds for the\ngeneralization error based on their model's Rademacher complexity. Our\ntheoretical findings not only provide tight upper bounds, but also offer\nanalytical insights into the impact of various factors, such as the training\nscale of problem instances and candidate algorithms, model parameters, feature\nvalues, and distributional differences between the training and test data.\nNotably, we demonstrate how models will benefit from algorithm features in\ncomplex scenarios involving many algorithms, and proves the positive\ncorrelation between generalization error bound and $\\chi^2$-divergence of\ndistributions.\n","authors":["Xingyu Wu","Yan Zhong","Jibin Wu","Yuxiao Huang","Sheng-hao Wu","Kay Chen Tan"],"pdf_url":"https://arxiv.org/pdf/2405.11349v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.09445v2","updated":"2024-06-03T12:42:50Z","published":"2024-01-31T16:53:50Z","title":"iMove: Exploring Bio-impedance Sensing for Fitness Activity Recognition","summary":"  Automatic and precise fitness activity recognition can be beneficial in\naspects from promoting a healthy lifestyle to personalized preventative\nhealthcare. While IMUs are currently the prominent fitness tracking modality,\nthrough iMove, we show bio-impedence can help improve IMU-based fitness\ntracking through sensor fusion and contrastive learning.To evaluate our\nmethods, we conducted an experiment including six upper body fitness activities\nperformed by ten subjects over five days to collect synchronized data from\nbio-impedance across two wrists and IMU on the left wrist.The contrastive\nlearning framework uses the two modalities to train a better IMU-only\nclassification model, where bio-impedance is only required at the training\nphase, by which the average Macro F1 score with the input of a single IMU was\nimproved by 3.22 \\% reaching 84.71 \\% compared to the 81.49 \\% of the IMU\nbaseline model. We have also shown how bio-impedance can improve human activity\nrecognition (HAR) directly through sensor fusion, reaching an average Macro F1\nscore of 89.57 \\% (two modalities required for both training and inference)\neven if Bio-impedance alone has an average macro F1 score of 75.36 \\%, which is\noutperformed by IMU alone. In addition, similar results were obtained in an\nextended study on lower body fitness activity classification, demonstrating the\ngeneralisability of our approach.Our findings underscore the potential of\nsensor fusion and contrastive learning as valuable tools for advancing fitness\nactivity recognition, with bio-impedance playing a pivotal role in augmenting\nthe capabilities of IMU-based systems.\n","authors":["Mengxi Liu","Vitor Fortes Rey","Yu Zhang","Lala Shakti Swarup Ray","Bo Zhou","Paul Lukowicz"],"pdf_url":"https://arxiv.org/pdf/2402.09445v2.pdf","comment":"Accepted by percom2024"},{"id":"http://arxiv.org/abs/2405.11143v2","updated":"2024-06-03T12:19:18Z","published":"2024-05-20T01:04:40Z","title":"OpenRLHF: An Easy-to-use, Scalable and High-performance RLHF Framework","summary":"  As large language models (LLMs) continue to grow by scaling laws,\nreinforcement learning from human feedback (RLHF) has gained significant\nattention due to its outstanding performance. However, unlike pretraining or\nfine-tuning a single model, scaling reinforcement learning from human feedback\n(RLHF) for training large language models poses coordination challenges across\nfour models. We present OpenRLHF, an open-source framework enabling efficient\nRLHF scaling. Unlike existing RLHF frameworks that co-locate four models on the\nsame GPUs, OpenRLHF re-designs scheduling for the models beyond 70B parameters\nusing Ray, vLLM, and DeepSpeed, leveraging improved resource utilization and\ndiverse training approaches. Integrating seamlessly with Hugging Face, OpenRLHF\nprovides an out-of-the-box solution with optimized algorithms and launch\nscripts, which ensures user-friendliness. OpenRLHF implements RLHF, DPO,\nrejection sampling, and other alignment techniques. Empowering state-of-the-art\nLLM development, OpenRLHF's code is available at\nhttps://github.com/OpenLLMAI/OpenRLHF.\n","authors":["Jian Hu","Xibin Wu","Weixun Wang"," Xianyu","Dehao Zhang","Yu Cao"],"pdf_url":"https://arxiv.org/pdf/2405.11143v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.07105v3","updated":"2024-06-03T12:14:34Z","published":"2024-01-13T16:09:49Z","title":"Graph Language Models","summary":"  While Language Models (LMs) are the workhorses of NLP, their interplay with\nstructured knowledge graphs (KGs) is still actively researched. Current methods\nfor encoding such graphs typically either (i) linearize them for embedding with\nLMs -- which underutilize structural information, or (ii) use Graph Neural\nNetworks (GNNs) to preserve the graph structure -- but GNNs cannot represent\ntext features as well as pretrained LMs. In our work we introduce a novel LM\ntype, the Graph Language Model (GLM), that integrates the strengths of both\napproaches and mitigates their weaknesses. The GLM parameters are initialized\nfrom a pretrained LM to enhance understanding of individual graph concepts and\ntriplets. Simultaneously, we design the GLM's architecture to incorporate graph\nbiases, thereby promoting effective knowledge distribution within the graph.\nThis enables GLMs to process graphs, texts, and interleaved inputs of both.\nEmpirical evaluations on relation classification tasks show that GLM embeddings\nsurpass both LM- and GNN-based baselines in supervised and zero-shot setting,\ndemonstrating their versatility.\n","authors":["Moritz Plenz","Anette Frank"],"pdf_url":"https://arxiv.org/pdf/2401.07105v3.pdf","comment":"Accepted at ACL 2024. 9 pages, 10 figures, 9 tables"},{"id":"http://arxiv.org/abs/2403.17436v2","updated":"2024-06-03T12:12:25Z","published":"2024-03-26T07:05:06Z","title":"Particle identification with machine learning from incomplete data in\n  the ALICE experiment","summary":"  The ALICE experiment at the LHC measures properties of the strongly\ninteracting matter formed in ultrarelativistic heavy-ion collisions. Such\nstudies require accurate particle identification (PID). ALICE provides PID\ninformation via several detectors for particles with momentum from about 100\nMeV/c up to 20 GeV/c. Traditionally, particles are selected with rectangular\ncuts. A much better performance can be achieved with machine learning (ML)\nmethods. Our solution uses multiple neural networks (NN) serving as binary\nclassifiers. Moreover, we extended our particle classifier with Feature Set\nEmbedding and attention in order to train on data with incomplete samples. We\nalso present the integration of the ML project with the ALICE analysis\nsoftware, and we discuss domain adaptation, the ML technique needed to transfer\nthe knowledge between simulated and real experimental data.\n","authors":["Maja Karwowska","Łukasz Graczykowski","Kamil Deja","Miłosz Kasak","Małgorzata Janik"],"pdf_url":"https://arxiv.org/pdf/2403.17436v2.pdf","comment":"Proceedings of 3rd Artificial Intelligence for the Electron Ion\n  Collider workshop -- AI4EIC2023, 28.11-1.12.2023. Accepted in JINST"},{"id":"http://arxiv.org/abs/2306.12330v2","updated":"2024-06-03T12:09:10Z","published":"2023-06-21T15:17:39Z","title":"ProtoGate: Prototype-based Neural Networks with Global-to-local Feature\n  Selection for Tabular Biomedical Data","summary":"  Tabular biomedical data poses challenges in machine learning because it is\noften high-dimensional and typically low-sample-size (HDLSS). Previous research\nhas attempted to address these challenges via local feature selection, but\nexisting approaches often fail to achieve optimal performance due to their\nlimitation in identifying globally important features and their susceptibility\nto the co-adaptation problem. In this paper, we propose ProtoGate, a\nprototype-based neural model for feature selection on HDLSS data. ProtoGate\nfirst selects instance-wise features via adaptively balancing global and local\nfeature selection. Furthermore, ProtoGate employs a non-parametric\nprototype-based prediction mechanism to tackle the co-adaptation problem,\nensuring the feature selection results and predictions are consistent with\nunderlying data clusters. We conduct comprehensive experiments to evaluate the\nperformance and interpretability of ProtoGate on synthetic and real-world\ndatasets. The results show that ProtoGate generally outperforms\nstate-of-the-art methods in prediction accuracy by a clear margin while\nproviding high-fidelity feature selection and explainable predictions. Code is\navailable at https://github.com/SilenceX12138/ProtoGate.\n","authors":["Xiangjian Jiang","Andrei Margeloiu","Nikola Simidjievski","Mateja Jamnik"],"pdf_url":"https://arxiv.org/pdf/2306.12330v2.pdf","comment":"Accepted by the Forty-first International Conference on Machine\n  Learning (ICML2024)"},{"id":"http://arxiv.org/abs/2203.08717v3","updated":"2024-06-03T12:06:06Z","published":"2022-03-16T16:14:19Z","title":"Weak Augmentation Guided Relational Self-Supervised Learning","summary":"  Self-supervised Learning (SSL) including the mainstream contrastive learning\nhas achieved great success in learning visual representations without data\nannotations. However, most methods mainly focus on the instance level\ninformation (\\ie, the different augmented images of the same instance should\nhave the same feature or cluster into the same class), but there is a lack of\nattention on the relationships between different instances. In this paper, we\nintroduce a novel SSL paradigm, which we term as relational self-supervised\nlearning (ReSSL) framework that learns representations by modeling the\nrelationship between different instances. Specifically, our proposed method\nemploys sharpened distribution of pairwise similarities among different\ninstances as \\textit{relation} metric, which is thus utilized to match the\nfeature embeddings of different augmentations. To boost the performance, we\nargue that weak augmentations matter to represent a more reliable relation, and\nleverage momentum strategy for practical efficiency. The designed asymmetric\npredictor head and an InfoNCE warm-up strategy enhance the robustness to\nhyper-parameters and benefit the resulting performance. Experimental results\nshow that our proposed ReSSL substantially outperforms the state-of-the-art\nmethods across different network architectures, including various lightweight\nnetworks (\\eg, EfficientNet and MobileNet).\n","authors":["Mingkai Zheng","Shan You","Fei Wang","Chen Qian","Changshui Zhang","Xiaogang Wang","Chang Xu"],"pdf_url":"https://arxiv.org/pdf/2203.08717v3.pdf","comment":"Extended version of NeurIPS 2021 paper. arXiv admin note: substantial\n  text overlap with arXiv:2107.09282"},{"id":"http://arxiv.org/abs/2403.06833v2","updated":"2024-06-03T12:04:50Z","published":"2024-03-11T15:48:56Z","title":"Can LLMs Separate Instructions From Data? And What Do We Even Mean By\n  That?","summary":"  Instruction-tuned Large Language Models (LLMs) show impressive results in\nnumerous practical applications, but they lack essential safety features that\nare common in other areas of computer science, particularly an explicit\nseparation of instructions and data. This makes them vulnerable to\nmanipulations such as indirect prompt injections and generally unsuitable for\nsafety-critical tasks. Surprisingly, there is currently no established\ndefinition or benchmark to quantify this phenomenon. In this work, we close\nthis gap by introducing a formal measure for instruction-data separation and an\nempirical variant that is calculable from a model's outputs. We also present a\nnew dataset, SEP, that allows estimating the measure for real-world models. Our\nresults on various LLMs show that the problem of instruction-data separation is\nreal: all models fail to achieve high separation, and canonical mitigation\ntechniques, such as prompt engineering and fine-tuning, either fail to\nsubstantially improve separation or reduce model utility. The source code and\nSEP dataset are openly accessible at\nhttps://github.com/egozverev/Shold-It-Be-Executed-Or-Processed.\n","authors":["Egor Zverev","Sahar Abdelnabi","Soroush Tabesh","Mario Fritz","Christoph H. Lampert"],"pdf_url":"https://arxiv.org/pdf/2403.06833v2.pdf","comment":"GitHub:\n  https://github.com/egozverev/Shold-It-Be-Executed-Or-Processed. 10 pages main\n  text, 30 pages in total"},{"id":"http://arxiv.org/abs/2405.12807v5","updated":"2024-06-03T11:55:11Z","published":"2024-05-21T13:58:17Z","title":"FAdam: Adam is a natural gradient optimizer using diagonal empirical\n  Fisher information","summary":"  This paper establishes a mathematical foundation for the Adam optimizer,\nelucidating its connection to natural gradient descent through Riemannian and\ninformation geometry. We rigorously analyze the diagonal empirical Fisher\ninformation matrix (FIM) in Adam, clarifying all detailed approximations and\nadvocating for the use of log probability functions as loss, which should be\nbased on discrete distributions, due to the limitations of empirical FIM. Our\nanalysis uncovers flaws in the original Adam algorithm, leading to proposed\ncorrections such as enhanced momentum calculations, adjusted bias corrections,\nadaptive epsilon, and gradient clipping. We refine the weight decay term based\non our theoretical framework. Our modified algorithm, Fisher Adam (FAdam),\ndemonstrates superior performance across diverse domains including LLM, ASR,\nand VQ-VAE, achieving state-of-the-art results in ASR.\n","authors":["Dongseong Hwang"],"pdf_url":"https://arxiv.org/pdf/2405.12807v5.pdf","comment":"21 pages, 4 figures, 6 tables"},{"id":"http://arxiv.org/abs/2402.13891v2","updated":"2024-06-03T11:40:32Z","published":"2024-02-21T16:02:14Z","title":"Overcoming Saturation in Density Ratio Estimation by Iterated\n  Regularization","summary":"  Estimating the ratio of two probability densities from finitely many samples,\nis a central task in machine learning and statistics. In this work, we show\nthat a large class of kernel methods for density ratio estimation suffers from\nerror saturation, which prevents algorithms from achieving fast error\nconvergence rates on highly regular learning problems. To resolve saturation,\nwe introduce iterated regularization in density ratio estimation to achieve\nfast error rates. Our methods outperform its non-iteratively regularized\nversions on benchmarks for density ratio estimation as well as on large-scale\nevaluations for importance-weighted ensembling of deep unsupervised domain\nadaptation models.\n","authors":["Lukas Gruber","Markus Holzleitner","Johannes Lehner","Sepp Hochreiter","Werner Zellinger"],"pdf_url":"https://arxiv.org/pdf/2402.13891v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.08045v4","updated":"2024-06-03T11:34:05Z","published":"2023-11-14T10:10:31Z","title":"Adversarial Preference Optimization: Enhancing Your Alignment via RM-LLM\n  Game","summary":"  Human preference alignment is essential to improve the interaction quality of\nlarge language models (LLMs). Existing alignment methods depend on manually\nannotated preference data to guide the LLM optimization directions. However,\ncontinuously updating LLMs for alignment raises a distribution gap between\nmodel-generated samples and human-annotated responses, hindering training\neffectiveness. To mitigate this issue, previous methods require additional\npreference annotation on newly generated samples to adapt to the shifted\ndistribution, which consumes a large amount of annotation resources. Targeting\nmore efficient human preference optimization, we propose an Adversarial\nPreference Optimization (APO) framework, in which the LLM and the reward model\nupdate alternatively via a min-max game. Through adversarial training, the\nreward model can adapt to the shifted generation distribution of the LLM\nwithout any additional annotation. With comprehensive experiments, we find the\nproposed adversarial training framework further enhances existing alignment\nbaselines in terms of LLM helpfulness and harmlessness. The code is at\nhttps://github.com/Linear95/APO.\n","authors":["Pengyu Cheng","Yifan Yang","Jian Li","Yong Dai","Tianhao Hu","Peixin Cao","Nan Du","Xiaolong Li"],"pdf_url":"https://arxiv.org/pdf/2311.08045v4.pdf","comment":"Accepted by ACL2024 findings"},{"id":"http://arxiv.org/abs/2403.06807v2","updated":"2024-06-03T11:33:51Z","published":"2024-03-11T15:26:34Z","title":"Multistep Consistency Models","summary":"  Diffusion models are relatively easy to train but require many steps to\ngenerate samples. Consistency models are far more difficult to train, but\ngenerate samples in a single step.\n  In this paper we propose Multistep Consistency Models: A unification between\nConsistency Models (Song et al., 2023) and TRACT (Berthelot et al., 2023) that\ncan interpolate between a consistency model and a diffusion model: a trade-off\nbetween sampling speed and sampling quality. Specifically, a 1-step consistency\nmodel is a conventional consistency model whereas a $\\infty$-step consistency\nmodel is a diffusion model.\n  Multistep Consistency Models work really well in practice. By increasing the\nsample budget from a single step to 2-8 steps, we can train models more easily\nthat generate higher quality samples, while retaining much of the sampling\nspeed benefits. Notable results are 1.4 FID on Imagenet 64 in 8 step and 2.1\nFID on Imagenet128 in 8 steps with consistency distillation, using simple\nlosses without adversarial training. We also show that our method scales to a\ntext-to-image diffusion model, generating samples that are close to the quality\nof the original model.\n","authors":["Jonathan Heek","Emiel Hoogeboom","Tim Salimans"],"pdf_url":"https://arxiv.org/pdf/2403.06807v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.17406v2","updated":"2024-06-03T11:32:11Z","published":"2024-05-27T17:55:05Z","title":"Deep Learning Calabi-Yau four folds with hybrid and recurrent neural\n  network architectures","summary":"  In this work, we report the results of applying deep learning based on hybrid\nconvolutional-recurrent and purely recurrent neural network architectures to\nthe dataset of almost one million complete intersection Calabi-Yau four-folds\n(CICY4) to machine-learn their four Hodge numbers $h^{1,1}, h^{2,1}, h^{3,1},\nh^{2,2}$. In particular, we explored and experimented with twelve different\nneural network models, nine of which are convolutional-recurrent (CNN-RNN)\nhybrids with the RNN unit being either GRU (Gated Recurrent Unit) or Long Short\nTerm Memory (LSTM). The remaining four models are purely recurrent neural\nnetworks based on LSTM. In terms of the $h^{1,1}, h^{2,1}, h^{3,1}, h^{2,2}$\nprediction accuracies, at 72% training ratio, our best performing individual\nmodel is CNN-LSTM-400, a hybrid CNN-LSTM with the LSTM hidden size of 400,\nwhich obtained 99.74%, 98.07%, 95.19%, 81.01%, our second best performing\nindividual model is LSTM-448, an LSTM-based model with the hidden size of 448,\nwhich obtained 99.74%, 97.51%, 94.24%, and 78.63%. These results were improved\nby forming ensembles of the top two, three or even four models. Our best\nensemble, consisting of the top four models, achieved the accuracies of 99.84%,\n98.71%, 96.26%, 85.03%. At 80% training ratio, the top two performing models\nLSTM-448 and LSTM-424 are both LSTM-based with the hidden sizes of 448 and 424.\nCompared with the 72% training ratio, there is a significant improvement of\naccuracies, which reached 99.85%, 98.66%, 96.26%, 84.77% for the best\nindividual model and 99.90%, 99.03%, 97.97%, 87.34% for the best ensemble.\n","authors":["H. L. Dao"],"pdf_url":"https://arxiv.org/pdf/2405.17406v2.pdf","comment":"v2: new (improved) results added, references added, typos corrected"},{"id":"http://arxiv.org/abs/2405.18983v2","updated":"2024-06-03T11:16:55Z","published":"2024-05-29T10:56:13Z","title":"Federated Learning under Partially Class-Disjoint Data via Manifold\n  Reshaping","summary":"  Statistical heterogeneity severely limits the performance of federated\nlearning (FL), motivating several explorations e.g., FedProx, MOON and FedDyn,\nto alleviate this problem. Despite effectiveness, their considered scenario\ngenerally requires samples from almost all classes during the local training of\neach client, although some covariate shifts may exist among clients. In fact,\nthe natural case of partially class-disjoint data (PCDD), where each client\ncontributes a few classes (instead of all classes) of samples, is practical yet\nunderexplored. Specifically, the unique collapse and invasion characteristics\nof PCDD can induce the biased optimization direction in local training, which\nprevents the efficiency of federated learning. To address this dilemma, we\npropose a manifold reshaping approach called FedMR to calibrate the feature\nspace of local training. Our FedMR adds two interplaying losses to the vanilla\nfederated learning: one is intra-class loss to decorrelate feature dimensions\nfor anti-collapse; and the other one is inter-class loss to guarantee the\nproper margin among categories in the feature expansion. We conduct extensive\nexperiments on a range of datasets to demonstrate that our FedMR achieves much\nhigher accuracy and better communication efficiency. Source code is available\nat: https://github.com/MediaBrain-SJTU/FedMR.git.\n","authors":["Ziqing Fan","Jiangchao Yao","Ruipeng Zhang","Lingjuan Lyu","Ya Zhang","Yanfeng Wang"],"pdf_url":"https://arxiv.org/pdf/2405.18983v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.16444v2","updated":"2024-06-03T10:57:57Z","published":"2024-05-26T06:00:17Z","title":"CacheBlend: Fast Large Language Model Serving for RAG with Cached\n  Knowledge Fusion","summary":"  Large language models (LLMs) often incorporate multiple text chunks in their\ninputs to provide the necessary contexts. To speed up the prefill of the long\nLLM inputs, one can pre-compute the KV cache of a text and re-use the KV cache\nwhen the context is reused as the prefix of another LLM input. However, the\nreused text chunks are not always the input prefix, and when they are not,\ntheir precomputed KV caches cannot be directly used since they ignore the\ntext's cross-attention with the preceding text in the LLM input. Thus, the\nbenefits of reusing KV caches remain largely unrealized.\n  This paper tackles just one question: when an LLM input contains multiple\ntext chunks, how to quickly combine their precomputed KV caches in order to\nachieve the same generation quality as the expensive full prefill (i.e.,\nwithout reusing KV cache)? We present CacheBlend, a scheme that reuses the\npre-computed KV caches, regardless prefix or not, and selectively recomputes\nthe KV values of a small subset of tokens to partially update each reused KV\ncache. In the meantime,the small extra delay for recomputing some tokens can be\npipelined with the retrieval of KV caches within the same job,allowing\nCacheBlend to store KV caches in slower devices with more storage capacity\nwhile retrieving them without increasing the inference delay. By comparing\nCacheBlend with the state-of-the-art KV cache reusing schemes on three\nopen-source LLMs of various sizes and four popular benchmark datasets of\ndifferent tasks, we show that CacheBlend reduces time-to-first-token (TTFT) by\n2.2-3.3X and increases the inference throughput by 2.8-5X, compared with full\nKV recompute, without compromising generation quality or incurring more storage\ncost.\n","authors":["Jiayi Yao","Hanchen Li","Yuhan Liu","Siddhant Ray","Yihua Cheng","Qizheng Zhang","Kuntai Du","Shan Lu","Junchen Jiang"],"pdf_url":"https://arxiv.org/pdf/2405.16444v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.06970v3","updated":"2024-06-03T10:50:08Z","published":"2023-10-10T19:47:58Z","title":"Flood and Echo Net: Algorithmically Aligned GNNs that Generalize","summary":"  Most Graph Neural Networks follow the standard message-passing framework\nwhere, in each step, all nodes simultaneously communicate with each other. We\nwant to challenge this paradigm by aligning the computation more closely to the\nexecution of distributed algorithms and propose the Flood and Echo Net. A\nsingle round of a Flood and Echo Net consists of an origin node and a flooding\nphase followed by an echo phase. First, during the flooding, messages are sent\nfrom the origin and propagated outwards throughout the entire graph. Then,\nduring the echo, the message flow reverses and messages are sent back towards\nthe origin. As nodes are only sparsely activated upon receiving a message, this\nleads to a wave-like activation pattern that traverses the graph. Through these\nsparse but parallel activations, the Net becomes more expressive than\ntraditional MPNNs which are limited by the 1-WL test and also is provably more\nefficient in terms of message complexity. Moreover, the mechanism's inherent\nability to generalize across graphs of varying sizes positions it as a\npractical architecture for the task of algorithmic learning. We test the Flood\nand Echo Net on a variety of synthetic tasks and the SALSA-CLRS benchmark and\nfind that the algorithmic alignment of the execution improves generalization to\nlarger graph sizes.\n","authors":["Joël Mathys","Florian Grötschla","Kalyan Varma Nadimpalli","Roger Wattenhofer"],"pdf_url":"https://arxiv.org/pdf/2310.06970v3.pdf","comment":"9 pages"},{"id":"http://arxiv.org/abs/2405.16956v2","updated":"2024-06-03T10:42:50Z","published":"2024-05-27T08:46:57Z","title":"Functional Programming Paradigm of Python for Scientific Computation\n  Pipeline Integration","summary":"  The advent of modern data processing has led to an increasing tendency\ntowards interdisciplinarity, which frequently involves the importation of\ndifferent technical approaches. Consequently, there is an urgent need for a\nunified data control system to facilitate the integration of varying libraries.\nThis integration is of profound significance in accelerating prototype\nverification, optimising algorithm performance and minimising maintenance\ncosts. This paper presents a novel functional programming (FP) paradigm based\non the Python architecture and associated suites in programming practice,\ndesigned for the integration of pipelines of different data mapping operations.\nIn particular, the solution is intended for the integration of scientific\ncomputation flows, which affords a robust yet flexible solution for the\naforementioned challenges.\n","authors":["Chen Zhang","Lecheng Jia","Wei Zhang","Ning Wen"],"pdf_url":"https://arxiv.org/pdf/2405.16956v2.pdf","comment":"16 pages"},{"id":"http://arxiv.org/abs/2405.16088v2","updated":"2024-06-03T10:26:10Z","published":"2024-05-25T06:39:39Z","title":"Estimating the normal-inverse-Wishart distribution","summary":"  The normal-inverse-Wishart (NIW) distribution is commonly used as a prior\ndistribution for the mean and covariance parameters of a multivariate normal\ndistribution. The family of NIW distributions is also a minimal exponential\nfamily. In this short note we describe a convergent procedure for converting\nfrom mean parameters to natural parameters in the NIW family, or --\nequivalently -- for performing maximum likelihood estimation of the natural\nparameters given observed sufficient statistics. This is needed, for example,\nwhen using a NIW base family in expectation propagation.\n","authors":["Jonathan So"],"pdf_url":"https://arxiv.org/pdf/2405.16088v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.10634v2","updated":"2024-06-03T10:26:05Z","published":"2024-02-16T12:33:31Z","title":"Graph-based Forecasting with Missing Data through Spatiotemporal\n  Downsampling","summary":"  Given a set of synchronous time series, each associated with a sensor-point\nin space and characterized by inter-series relationships, the problem of\nspatiotemporal forecasting consists of predicting future observations for each\npoint. Spatiotemporal graph neural networks achieve striking results by\nrepresenting the relationships across time series as a graph. Nonetheless, most\nexisting methods rely on the often unrealistic assumption that inputs are\nalways available and fail to capture hidden spatiotemporal dynamics when part\nof the data is missing. In this work, we tackle this problem through\nhierarchical spatiotemporal downsampling. The input time series are\nprogressively coarsened over time and space, obtaining a pool of\nrepresentations that capture heterogeneous temporal and spatial dynamics.\nConditioned on observations and missing data patterns, such representations are\ncombined by an interpretable attention mechanism to generate the forecasts. Our\napproach outperforms state-of-the-art methods on synthetic and real-world\nbenchmarks under different missing data distributions, particularly in the\npresence of contiguous blocks of missing values.\n","authors":["Ivan Marisca","Cesare Alippi","Filippo Maria Bianchi"],"pdf_url":"https://arxiv.org/pdf/2402.10634v2.pdf","comment":"Accepted at ICML 2024"},{"id":"http://arxiv.org/abs/2402.09631v3","updated":"2024-06-03T10:24:22Z","published":"2024-02-15T00:20:30Z","title":"Representation Surgery: Theory and Practice of Affine Steering","summary":"  Language models often exhibit undesirable behavior, e.g., generating toxic or\ngender-biased text. In the case of neural language models, an encoding of the\nundesirable behavior is often present in the model's representations. Thus, one\nnatural (and common) approach to prevent the model from exhibiting undesirable\nbehavior is to steer the model's representations in a manner that reduces the\nprobability of it generating undesirable text. This paper investigates the\nformal and empirical properties of steering functions, i.e., transformation of\nthe neural language model's representations that alter its behavior. First, we\nderive two optimal, in the least-squares sense, affine steering functions under\ndifferent constraints. Our theory provides justification for existing\napproaches and offers a novel, improved steering approach. Second, we offer a\nseries of experiments that demonstrate the empirical effectiveness of the\nmethods in mitigating bias and reducing toxic generation.\n","authors":["Shashwat Singh","Shauli Ravfogel","Jonathan Herzig","Roee Aharoni","Ryan Cotterell","Ponnurangam Kumaraguru"],"pdf_url":"https://arxiv.org/pdf/2402.09631v3.pdf","comment":"Accepted in ICML 2024"},{"id":"http://arxiv.org/abs/2212.05260v2","updated":"2024-06-03T10:16:12Z","published":"2022-12-10T10:34:35Z","title":"Examining properness in the external validation of survival models with\n  squared and logarithmic losses","summary":"  Scoring rules promote rational and honest decision-making, which is becoming\nincreasingly important for automated procedures in `auto-ML'. In this paper we\nsurvey common squared and logarithmic scoring rules for survival analysis and\ndetermine which losses are proper and improper. We prove that commonly utilised\nsquared and logarithmic scoring rules that are claimed to be proper are in fact\nimproper, such as the Integrated Survival Brier Score (ISBS). We further prove\nthat under a strict set of assumptions a class of scoring rules is strictly\nproper for, what we term, `approximate' survival losses. Despite the difference\nin properness, experiments in simulated and real-world datasets show there is\nno major difference between improper and proper versions of the widely-used\nISBS, ensuring that we can reasonably trust previous experiments utilizing the\noriginal score for evaluation purposes. We still advocate for the use of proper\nscoring rules, as even minor differences between losses can have important\nimplications in automated processes such as model tuning. We hope our findings\nencourage further research into the properties of survival measures so that\nrobust and honest evaluation of survival models can be achieved.\n","authors":["Raphael Sonabend","John Zobolas","Philipp Kopper","Lukas Burk","Andreas Bender"],"pdf_url":"https://arxiv.org/pdf/2212.05260v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.00463v2","updated":"2024-06-03T10:07:13Z","published":"2023-11-01T11:57:43Z","title":"Robust and Conjugate Gaussian Process Regression","summary":"  To enable closed form conditioning, a common assumption in Gaussian process\n(GP) regression is independent and identically distributed Gaussian observation\nnoise. This strong and simplistic assumption is often violated in practice,\nwhich leads to unreliable inferences and uncertainty quantification.\nUnfortunately, existing methods for robustifying GPs break closed-form\nconditioning, which makes them less attractive to practitioners and\nsignificantly more computationally expensive. In this paper, we demonstrate how\nto perform provably robust and conjugate Gaussian process (RCGP) regression at\nvirtually no additional cost using generalised Bayesian inference. RCGP is\nparticularly versatile as it enables exact conjugate closed form updates in all\nsettings where standard GPs admit them. To demonstrate its strong empirical\nperformance, we deploy RCGP for problems ranging from Bayesian optimisation to\nsparse variational Gaussian processes.\n","authors":["Matias Altamirano","François-Xavier Briol","Jeremias Knoblauch"],"pdf_url":"https://arxiv.org/pdf/2311.00463v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.03945v2","updated":"2024-06-03T09:55:44Z","published":"2024-03-06T18:52:39Z","title":"SPEAR:Exact Gradient Inversion of Batches in Federated Learning","summary":"  Federated learning is a framework for collaborative machine learning where\nclients only share gradient updates and not their private data with a server.\nHowever, it was recently shown that gradient inversion attacks can reconstruct\nthis data from the shared gradients. In the important honest-but-curious\nsetting, existing attacks enable exact reconstruction only for a batch size of\n$b=1$, with larger batches permitting only approximate reconstruction. In this\nwork, we propose SPEAR, the first algorithm reconstructing whole batches with\n$b >1$ exactly. SPEAR combines insights into the explicit low-rank structure of\ngradients with a sampling-based algorithm. Crucially, we leverage ReLU-induced\ngradient sparsity to precisely filter out large numbers of incorrect samples,\nmaking a final reconstruction step tractable. We provide an efficient GPU\nimplementation for fully connected networks and show that it recovers\nhigh-dimensional ImageNet inputs in batches of up to $b \\lesssim 25$ exactly\nwhile scaling to large networks. Finally, we show theoretically that much\nlarger batches can be reconstructed with high probability given exponential\ntime.\n","authors":["Dimitar I. Dimitrov","Maximilian Baader","Mark Niklas Müller","Martin Vechev"],"pdf_url":"https://arxiv.org/pdf/2403.03945v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2211.16275v2","updated":"2024-06-03T09:53:22Z","published":"2022-11-29T15:04:09Z","title":"A survey on multi-player bandits","summary":"  Due mostly to its application to cognitive radio networks, multiplayer\nbandits gained a lot of interest in the last decade. A considerable progress\nhas been made on its theoretical aspect. However, the current algorithms are\nfar from applicable and many obstacles remain between these theoretical results\nand a possible implementation of multiplayer bandits algorithms in real\ncognitive radio networks. This survey contextualizes and organizes the rich\nmultiplayer bandits literature. In light of the existing works, some clear\ndirections for future research appear. We believe that a further study of these\ndifferent directions might lead to theoretical algorithms adapted to real-world\nsituations.\n","authors":["Etienne Boursier","Vianney Perchet"],"pdf_url":"https://arxiv.org/pdf/2211.16275v2.pdf","comment":"final version, accepted at JMLR"},{"id":"http://arxiv.org/abs/2402.03819v2","updated":"2024-06-03T09:53:06Z","published":"2024-02-06T09:07:41Z","title":"Do we need rebalancing strategies? A theoretical and empirical study\n  around SMOTE and its variants","summary":"  Synthetic Minority Oversampling Technique (SMOTE) is a common rebalancing\nstrategy for handling imbalanced tabular data sets. However, few works analyze\nSMOTE theoretically. In this paper, we prove that SMOTE (with default\nparameter) simply copies the original minority samples asymptotically. We also\nprove that SMOTE exhibits boundary artifacts, thus justifying existing SMOTE\nvariants. Then we introduce two new SMOTE-related strategies, and compare them\nwith state-of-the-art rebalancing procedures. Surprisingly, for most data sets,\nwe observe that applying no rebalancing strategy is competitive in terms of\npredictive performances, with tuned random forests. For highly imbalanced data\nsets, our new method, named Multivariate Gaussian SMOTE, is competitive.\nBesides, our analysis sheds some lights on the behavior of common rebalancing\nstrategies, when used in conjunction with random forests.\n","authors":["Abdoulaye Sakho","Emmanuel Malherbe","Erwan Scornet"],"pdf_url":"https://arxiv.org/pdf/2402.03819v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.09930v3","updated":"2024-06-03T09:46:32Z","published":"2024-03-15T00:09:47Z","title":"Quality-Diversity Actor-Critic: Learning High-Performing and Diverse\n  Behaviors via Value and Successor Features Critics","summary":"  A key aspect of intelligence is the ability to demonstrate a broad spectrum\nof behaviors for adapting to unexpected situations. Over the past decade,\nadvancements in deep reinforcement learning have led to groundbreaking\nachievements to solve complex continuous control tasks. However, most\napproaches return only one solution specialized for a specific problem. We\nintroduce Quality-Diversity Actor-Critic (QDAC), an off-policy actor-critic\ndeep reinforcement learning algorithm that leverages a value function critic\nand a successor features critic to learn high-performing and diverse behaviors.\nIn this framework, the actor optimizes an objective that seamlessly unifies\nboth critics using constrained optimization to (1) maximize return, while (2)\nexecuting diverse skills. Compared with other Quality-Diversity methods, QDAC\nachieves significantly higher performance and more diverse behaviors on six\nchallenging continuous control locomotion tasks. We also demonstrate that we\ncan harness the learned skills to adapt better than other baselines to five\nperturbed environments. Finally, qualitative analyses showcase a range of\nremarkable behaviors: adaptive-intelligent-robotics.github.io/QDAC.\n","authors":["Luca Grillotti","Maxence Faldor","Borja G. León","Antoine Cully"],"pdf_url":"https://arxiv.org/pdf/2403.09930v3.pdf","comment":"The first two authors contributed equally to this work. Accepted at\n  ICML 2024"},{"id":"http://arxiv.org/abs/2405.06582v2","updated":"2024-06-03T09:27:03Z","published":"2024-05-10T16:36:59Z","title":"The Role of Learning Algorithms in Collective Action","summary":"  Collective action in machine learning is the study of the control that a\ncoordinated group can have over machine learning algorithms. While previous\nresearch has concentrated on assessing the impact of collectives against\nBayes~(sub)-optimal classifiers, this perspective is limited in that it does\nnot account for the choice of learning algorithm. Classifiers seldom behave\nlike Bayes classifiers and are influenced by the choice of learning algorithms\nalong with their inherent biases. In this work, we initiate the study of how\nthe choice of the learning algorithm plays a role in the success of a\ncollective in practical settings. Specifically, we focus on distributionally\nrobust optimization (DRO), popular for improving a worst group error, and on\nthe ubiquitous stochastic gradient descent (SGD), due to its inductive bias for\n\"simpler\" functions. Our empirical results, supported by a theoretical\nfoundation, show that the effective size and success of the collective are\nhighly dependent on properties of the learning algorithm. This highlights the\nnecessity of taking the learning algorithm into account when studying the\nimpact of collective action in machine learning.\n","authors":["Omri Ben-Dov","Jake Fawkes","Samira Samadi","Amartya Sanyal"],"pdf_url":"https://arxiv.org/pdf/2405.06582v2.pdf","comment":"Accepted at the International Conference in Machine Learning (ICML),\n  2024"},{"id":"http://arxiv.org/abs/2311.18741v2","updated":"2024-06-03T09:15:29Z","published":"2023-11-30T17:38:54Z","title":"VREM-FL: Mobility-Aware Computation-Scheduling Co-Design for Vehicular\n  Federated Learning","summary":"  Assisted and autonomous driving are rapidly gaining momentum and will soon\nbecome a reality. Artificial intelligence and machine learning are regarded as\nkey enablers thanks to the massive amount of data that smart vehicles will\ncollect from onboard sensors. Federated learning is one of the most promising\ntechniques for training global machine learning models while preserving data\nprivacy of vehicles and optimizing communications resource usage. In this\narticle, we propose vehicular radio environment map federated learning\n(VREM-FL), a computation-scheduling co-design for vehicular federated learning\nthat combines mobility of vehicles with 5G radio environment maps. VREM-FL\njointly optimizes learning performance of the global model and wisely allocates\ncommunication and computation resources. This is achieved by orchestrating\nlocal computations at the vehicles in conjunction with transmission of their\nlocal models in an adaptive and predictive fashion, by exploiting radio channel\nmaps. The proposed algorithm can be tuned to trade training time for radio\nresource usage. Experimental results demonstrate that VREM-FL outperforms\nliterature benchmarks for both a linear regression model (learning time reduced\nby 28%) and a deep neural network for semantic image segmentation (doubling the\nnumber of model updates within the same time window).\n","authors":["Luca Ballotta","Nicolò Dal Fabbro","Giovanni Perin","Luca Schenato","Michele Rossi","Giuseppe Piro"],"pdf_url":"https://arxiv.org/pdf/2311.18741v2.pdf","comment":"This work has been submitted to IEEE for possible publication.\n  Copyright may be transferred without notice, after which this version may no\n  longer be accessible"},{"id":"http://arxiv.org/abs/2403.09389v2","updated":"2024-06-03T09:10:27Z","published":"2024-03-14T13:40:26Z","title":"Learning to optimize with convergence guarantees using nonlinear system\n  theory","summary":"  The increasing reliance on numerical methods for controlling dynamical\nsystems and training machine learning models underscores the need to devise\nalgorithms that dependably and efficiently navigate complex optimization\nlandscapes. Classical gradient descent methods offer strong theoretical\nguarantees for convex problems; however, they demand meticulous hyperparameter\ntuning for non-convex ones. The emerging paradigm of learning to optimize (L2O)\nautomates the discovery of algorithms with optimized performance leveraging\nlearning models and data - yet, it lacks a theoretical framework to analyze\nconvergence of the learned algorithms. In this paper, we fill this gap by\nharnessing nonlinear system theory. Specifically, we propose an unconstrained\nparametrization of all convergent algorithms for smooth non-convex objective\nfunctions. Notably, our framework is directly compatible with automatic\ndifferentiation tools, ensuring convergence by design while learning to\noptimize.\n","authors":["Andrea Martin","Luca Furieri"],"pdf_url":"https://arxiv.org/pdf/2403.09389v2.pdf","comment":"Published in the IEEE Control Systems Letters"},{"id":"http://arxiv.org/abs/2404.00074v2","updated":"2024-06-03T09:03:10Z","published":"2024-03-28T19:57:48Z","title":"A finite operator learning technique for mapping the elastic properties\n  of microstructures to their mechanical deformations","summary":"  To obtain fast solutions for governing physical equations in solid mechanics,\nwe introduce a method that integrates the core ideas of the finite element\nmethod with physics-informed neural networks and concept of neural operators.\nThis approach generalizes and enhances each method, learning the parametric\nsolution for mechanical problems without relying on data from other resources\n(e.g. other numerical solvers). We propose directly utilizing the available\ndiscretized weak form in finite element packages to construct the loss\nfunctions algebraically, thereby demonstrating the ability to find solutions\neven in the presence of sharp discontinuities. Our focus is on micromechanics\nas an example, where knowledge of deformation and stress fields for a given\nheterogeneous microstructure is crucial for further design applications. The\nprimary parameter under investigation is the Young's modulus distribution\nwithin the heterogeneous solid system. Our investigations reveal that\nphysics-based training yields higher accuracy compared to purely data-driven\napproaches for unseen microstructures. Additionally, we offer two methods to\ndirectly improve the process of obtaining high-resolution solutions, avoiding\nthe need to use basic interpolation techniques. First is based on an\nautoencoder approach to enhance the efficiency for calculation on high\nresolution grid point. Next, Fourier-based parametrization is utilized to\naddress complex 2D and 3D problems in micromechanics. The latter idea aims to\nrepresent complex microstructures efficiently using Fourier coefficients.\nComparisons with other well-known operator learning algorithms, further\nemphasize the advantages of the newly proposed method.\n","authors":["Shahed Rezaei","Reza Najian Asl","Shirko Faroughi","Mahdi Asgharzadeh","Ali Harandi","Rasoul Najafi Koopas","Gottfried Laschet","Stefanie Reese","Markus Apel"],"pdf_url":"https://arxiv.org/pdf/2404.00074v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.06395v3","updated":"2024-06-03T08:54:38Z","published":"2024-04-09T15:36:50Z","title":"MiniCPM: Unveiling the Potential of Small Language Models with Scalable\n  Training Strategies","summary":"  The burgeoning interest in developing Large Language Models (LLMs) with up to\ntrillion parameters has been met with concerns regarding resource efficiency\nand practical expense, particularly given the immense cost of experimentation.\nThis scenario underscores the importance of exploring the potential of Small\nLanguage Models (SLMs) as a resource-efficient alternative. In this context, we\nintroduce MiniCPM, specifically the 1.2B and 2.4B non-embedding parameter\nvariants, not only excel in their respective categories but also demonstrate\ncapabilities on par with 7B-13B LLMs. While focusing on SLMs, our approach\nexhibits scalability in both model and data dimensions for future LLM research.\nRegarding model scaling, we employ extensive model wind tunnel experiments for\nstable and optimal scaling. For data scaling, we introduce a\nWarmup-Stable-Decay (WSD) learning rate scheduler (LRS), conducive to\ncontinuous training and domain adaptation. We present an in-depth analysis of\nthe intriguing training dynamics that occurred in the WSD LRS. With WSD LRS, we\nare now able to efficiently study data-model scaling law without extensive\nretraining experiments on both axes of model and data, from which we derive the\nmuch higher compute optimal data-model ratio than Chinchilla Optimal.\nAdditionally, we introduce MiniCPM family, including MiniCPM-DPO, MiniCPM-MoE\nand MiniCPM-128K, whose excellent performance further cementing MiniCPM's\nfoundation in diverse SLM applications. MiniCPM models are available publicly\nat https://github.com/OpenBMB/MiniCPM .\n","authors":["Shengding Hu","Yuge Tu","Xu Han","Chaoqun He","Ganqu Cui","Xiang Long","Zhi Zheng","Yewei Fang","Yuxiang Huang","Weilin Zhao","Xinrong Zhang","Zheng Leng Thai","Kaihuo Zhang","Chongyi Wang","Yuan Yao","Chenyang Zhao","Jie Zhou","Jie Cai","Zhongwu Zhai","Ning Ding","Chao Jia","Guoyang Zeng","Dahai Li","Zhiyuan Liu","Maosong Sun"],"pdf_url":"https://arxiv.org/pdf/2404.06395v3.pdf","comment":"revise according to peer review"},{"id":"http://arxiv.org/abs/2201.05745v4","updated":"2024-06-03T08:51:23Z","published":"2022-01-15T03:13:02Z","title":"Deep Optimal Transport for Domain Adaptation on SPD Manifolds","summary":"  The machine learning community has shown increasing interest in addressing\nthe domain adaptation problem on symmetric positive definite (SPD) manifolds.\nThis interest is primarily driven by the complexities of neuroimaging data\ngenerated from brain signals, which often exhibit shifts in data distribution\nacross recording sessions. These neuroimaging data, represented by signal\ncovariance matrices, possess the mathematical properties of symmetry and\npositive definiteness. However, applying conventional domain adaptation methods\nis challenging because these mathematical properties can be disrupted when\noperating on covariance matrices. In this study, we introduce a novel geometric\ndeep learning-based approach utilizing optimal transport on SPD manifolds to\nmanage discrepancies in both marginal and conditional distributions between the\nsource and target domains. We evaluate the effectiveness of this approach in\nthree cross-session brain-computer interface scenarios and provide visualized\nresults for further insights. The GitHub repository of this study can be\naccessed at\nhttps://github.com/GeometricBCI/Deep-Optimal-Transport-for-Domain-Adaptation-on-SPD-Manifolds.\n","authors":["Ce Ju","Cuntai Guan"],"pdf_url":"https://arxiv.org/pdf/2201.05745v4.pdf","comment":"This work has been submitted to the IEEE for possible publication.\n  Copyright may be transferred without notice, after which this version may no\n  longer be accessible"},{"id":"http://arxiv.org/abs/2405.04657v2","updated":"2024-06-03T08:50:31Z","published":"2024-05-07T20:30:14Z","title":"ACEGEN: Reinforcement learning of generative chemical agents for drug\n  discovery","summary":"  In recent years, reinforcement learning (RL) has emerged as a valuable tool\nin drug design, offering the potential to propose and optimize molecules with\ndesired properties. However, striking a balance between capabilities,\nflexibility, reliability, and efficiency remains challenging due to the\ncomplexity of advanced RL algorithms and the significant reliance on\nspecialized code. In this work, we introduce ACEGEN, a comprehensive and\nstreamlined toolkit tailored for generative drug design, built using TorchRL, a\nmodern RL library that offers thoroughly tested reusable components. We\nvalidate ACEGEN by benchmarking against other published generative modeling\nalgorithms and show comparable or improved performance. We also show examples\nof ACEGEN applied in multiple drug discovery case studies. ACEGEN is accessible\nat \\url{https://github.com/acellera/acegen-open} and available for use under\nthe MIT license.\n","authors":["Albert Bou","Morgan Thomas","Sebastian Dittert","Carles Navarro Ramírez","Maciej Majewski","Ye Wang","Shivam Patel","Gary Tresadern","Mazen Ahmad","Vincent Moens","Woody Sherman","Simone Sciabola","Gianni De Fabritiis"],"pdf_url":"https://arxiv.org/pdf/2405.04657v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.05440v3","updated":"2024-06-03T08:49:54Z","published":"2023-11-09T15:24:44Z","title":"A Practical Approach to Novel Class Discovery in Tabular Data","summary":"  The problem of Novel Class Discovery (NCD) consists in extracting knowledge\nfrom a labeled set of known classes to accurately partition an unlabeled set of\nnovel classes. While NCD has recently received a lot of attention from the\ncommunity, it is often solved on computer vision problems and under unrealistic\nconditions. In particular, the number of novel classes is usually assumed to be\nknown in advance, and their labels are sometimes used to tune hyperparameters.\nMethods that rely on these assumptions are not applicable in real-world\nscenarios. In this work, we focus on solving NCD in tabular data when no prior\nknowledge of the novel classes is available. To this end, we propose to tune\nthe hyperparameters of NCD methods by adapting the $k$-fold cross-validation\nprocess and hiding some of the known classes in each fold. Since we have found\nthat methods with too many hyperparameters are likely to overfit these hidden\nclasses, we define a simple deep NCD model. This method is composed of only the\nessential elements necessary for the NCD problem and performs impressively well\nunder realistic conditions. Furthermore, we find that the latent space of this\nmethod can be used to reliably estimate the number of novel classes.\nAdditionally, we adapt two unsupervised clustering algorithms ($k$-means and\nSpectral Clustering) to leverage the knowledge of the known classes. Extensive\nexperiments are conducted on 7 tabular datasets and demonstrate the\neffectiveness of the proposed method and hyperparameter tuning process, and\nshow that the NCD problem can be solved without relying on knowledge from the\nnovel classes.\n","authors":["Colin Troisemaine","Alexandre Reiffers-Masson","Stéphane Gosselin","Vincent Lemaire","Sandrine Vaton"],"pdf_url":"https://arxiv.org/pdf/2311.05440v3.pdf","comment":"30 pages, including 7 pages of annexes"},{"id":"http://arxiv.org/abs/2405.21027v2","updated":"2024-06-03T08:43:51Z","published":"2024-05-31T17:16:29Z","title":"Fusion-PSRO: Nash Policy Fusion for Policy Space Response Oracles","summary":"  A popular approach for solving zero-sum games is to maintain populations of\npolicies to approximate the Nash Equilibrium (NE). Previous studies have shown\nthat Policy Space Response Oracle (PSRO) algorithm is an effective multi-agent\nreinforcement learning framework for solving such games. However, repeatedly\ntraining new policies from scratch to approximate Best Response (BR) to\nopponents' mixed policies at each iteration is both inefficient and costly.\nWhile some PSRO variants initialize a new policy by inheriting from past BR\npolicies, this approach limits the exploration of new policies, especially\nagainst challenging opponents. To address this issue, we propose Fusion-PSRO,\nwhich employs policy fusion to initialize policies for better approximation to\nBR. By selecting high-quality base policies from meta-NE, policy fusion fuses\nthe base policies into a new policy through model averaging. This approach\nallows the initialized policies to incorporate multiple expert policies, making\nit easier to handle difficult opponents compared to inheriting from past BR\npolicies or initializing from scratch. Moreover, our method only modifies the\npolicy initialization phase, allowing its application to nearly all PSRO\nvariants without additional training overhead. Our experiments on\nnon-transitive matrix games, Leduc Poker, and the more complex Liars Dice\ndemonstrate that Fusion-PSRO enhances the performance of nearly all PSRO\nvariants, achieving lower exploitability.\n","authors":["Jiesong Lian","Yucong Huang","Mingzhi Wang","Chengdong Ma","Yixue Hao","Ying Wen","Yaodong Yang"],"pdf_url":"https://arxiv.org/pdf/2405.21027v2.pdf","comment":"20 pages, 5 figures"},{"id":"http://arxiv.org/abs/2308.13049v3","updated":"2024-06-03T08:23:19Z","published":"2023-08-24T19:35:58Z","title":"Bayesian Exploration Networks","summary":"  Bayesian reinforcement learning (RL) offers a principled and elegant approach\nfor sequential decision making under uncertainty. Most notably, Bayesian agents\ndo not face an exploration/exploitation dilemma, a major pathology of\nfrequentist methods. However theoretical understanding of model-free approaches\nis lacking. In this paper, we introduce a novel Bayesian model-free formulation\nand the first analysis showing that model-free approaches can yield\nBayes-optimal policies. We show all existing model-free approaches make\napproximations that yield policies that can be arbitrarily Bayes-suboptimal. As\na first step towards model-free Bayes optimality, we introduce the Bayesian\nexploration network (BEN) which uses normalising flows to model both the\naleatoric uncertainty (via density estimation) and epistemic uncertainty (via\nvariational inference) in the Bellman operator. In the limit of complete\noptimisation, BEN learns true Bayes-optimal policies, but like in variational\nexpectation-maximisation, partial optimisation renders our approach tractable.\nEmpirical results demonstrate that BEN can learn true Bayes-optimal policies in\ntasks where existing model-free approaches fail.\n","authors":["Mattie Fellows","Brandon Kaplowitz","Christian Schroeder de Witt","Shimon Whiteson"],"pdf_url":"https://arxiv.org/pdf/2308.13049v3.pdf","comment":"ICML 2024 Version Update"},{"id":"http://arxiv.org/abs/2302.09826v3","updated":"2024-06-03T08:20:31Z","published":"2023-02-20T08:19:19Z","title":"On the Expressivity of Persistent Homology in Graph Learning","summary":"  Persistent homology, a technique from computational topology, has recently\nshown strong empirical performance in the context of graph classification.\nBeing able to capture long range graph properties via higher-order topological\nfeatures, such as cycles of arbitrary length, in combination with multi-scale\ntopological descriptors, has improved predictive performance for data sets with\nprominent topological structures, such as molecules. At the same time, the\ntheoretical properties of persistent homology have not been formally assessed\nin this context. This paper intends to bridge the gap between computational\ntopology and graph machine learning by providing a brief introduction to\npersistent homology in the context of graphs, as well as a theoretical\ndiscussion and empirical analysis of its expressivity for graph learning tasks.\n","authors":["Rubén Ballester","Bastian Rieck"],"pdf_url":"https://arxiv.org/pdf/2302.09826v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.05890v3","updated":"2024-06-03T08:14:56Z","published":"2024-03-09T12:04:56Z","title":"Towards Efficient Replay in Federated Incremental Learning","summary":"  In Federated Learning (FL), the data in each client is typically assumed\nfixed or static. However, data often comes in an incremental manner in\nreal-world applications, where the data domain may increase dynamically. In\nthis work, we study catastrophic forgetting with data heterogeneity in\nFederated Incremental Learning (FIL) scenarios where edge clients may lack\nenough storage space to retain full data. We propose to employ a simple,\ngeneric framework for FIL named Re-Fed, which can coordinate each client to\ncache important samples for replay. More specifically, when a new task arrives,\neach client first caches selected previous samples based on their global and\nlocal importance. Then, the client trains the local model with both the cached\nsamples and the samples from the new task. Theoretically, we analyze the\nability of Re-Fed to discover important samples for replay thus alleviating the\ncatastrophic forgetting problem. Moreover, we empirically show that Re-Fed\nachieves competitive performance compared to state-of-the-art methods.\n","authors":["Yichen Li","Qunwei Li","Haozhao Wang","Ruixuan Li","Wenliang Zhong","Guannan Zhang"],"pdf_url":"https://arxiv.org/pdf/2403.05890v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.05443v3","updated":"2024-06-03T08:12:13Z","published":"2024-02-08T06:45:03Z","title":"Scalable Wasserstein Gradient Flow for Generative Modeling through\n  Unbalanced Optimal Transport","summary":"  Wasserstein Gradient Flow (WGF) describes the gradient dynamics of\nprobability density within the Wasserstein space. WGF provides a promising\napproach for conducting optimization over the probability distributions.\nNumerically approximating the continuous WGF requires the time discretization\nmethod. The most well-known method for this is the JKO scheme. In this regard,\nprevious WGF models employ the JKO scheme and parametrize transport map for\neach JKO step. However, this approach results in quadratic training complexity\n$O(K^2)$ with the number of JKO step $K$. This severely limits the scalability\nof WGF models. In this paper, we introduce a scalable WGF-based generative\nmodel, called Semi-dual JKO (S-JKO). Our model is based on the semi-dual form\nof the JKO step, derived from the equivalence between the JKO step and the\nUnbalanced Optimal Transport. Our approach reduces the training complexity to\n$O(K)$. We demonstrate that our model significantly outperforms existing\nWGF-based generative models, achieving FID scores of 2.62 on CIFAR-10 and 5.46\non CelebA-HQ-256, which are comparable to state-of-the-art image generative\nmodels.\n","authors":["Jaemoo Choi","Jaewoong Choi","Myungjoo Kang"],"pdf_url":"https://arxiv.org/pdf/2402.05443v3.pdf","comment":"22 pages, 11 figures"},{"id":"http://arxiv.org/abs/2205.09622v5","updated":"2024-06-03T08:02:04Z","published":"2022-05-19T15:37:26Z","title":"What Is Fairness? On the Role of Protected Attributes and Fictitious\n  Worlds","summary":"  A growing body of literature in fairness-aware machine learning (fairML) aims\nto mitigate machine learning (ML)-related unfairness in automated\ndecision-making (ADM) by defining metrics that measure fairness of an ML model\nand by proposing methods to ensure that trained ML models achieve low scores on\nthese metrics. However, the underlying concept of fairness, i.e., the question\nof what fairness is, is rarely discussed, leaving a significant gap between\ncenturies of philosophical discussion and the recent adoption of the concept in\nthe ML community. In this work, we try to bridge this gap by formalizing a\nconsistent concept of fairness and by translating the philosophical\nconsiderations into a formal framework for the training and evaluation of ML\nmodels in ADM systems. We argue that fairness problems can arise even without\nthe presence of protected attributes (PAs), and point out that fairness and\npredictive performance are not irreconcilable opposites, but that the latter is\nnecessary to achieve the former. Furthermore, we argue why and how causal\nconsiderations are necessary when assessing fairness in the presence of PAs by\nproposing a fictitious, normatively desired (FiND) world in which PAs have no\ncausal effects. In practice, this FiND world must be approximated by a warped\nworld in which the causal effects of the PAs are removed from the real-world\ndata. Finally, we achieve greater linguistic clarity in the discussion of\nfairML. We outline algorithms for practical applications and present\nillustrative experiments on COMPAS data.\n","authors":["Ludwig Bothmann","Kristina Peters","Bernd Bischl"],"pdf_url":"https://arxiv.org/pdf/2205.09622v5.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.06709v2","updated":"2024-06-03T07:52:21Z","published":"2023-05-11T10:34:27Z","title":"NUBO: A Transparent Python Package for Bayesian Optimization","summary":"  NUBO, short for Newcastle University Bayesian Optimization, is a Bayesian\noptimization framework for optimizing expensive-to-evaluate black-box\nfunctions, such as physical experiments and computer simulators. Bayesian\noptimization is a cost-efficient optimization strategy that uses surrogate\nmodeling via Gaussian processes to represent an objective function and\nacquisition functions to guide the selection of candidate points to approximate\nthe global optimum of the objective function. NUBO focuses on transparency and\nuser experience to make Bayesian optimization accessible to researchers from\nall disciplines. Clean and understandable code, precise references, and\nthorough documentation ensure transparency, while a modular and flexible\ndesign, easy-to-write syntax, and careful selection of Bayesian optimization\nalgorithms ensure a good user experience. NUBO allows users to tailor Bayesian\noptimization to their problem by writing a custom optimization loop using the\nprovided building blocks. It supports sequential single-point, parallel\nmulti-point, and asynchronous optimization of bounded, constrained, and mixed\n(discrete and continuous) parameter input spaces. Only algorithms and methods\nextensively tested and validated to perform well are included in NUBO. This\nensures that the package remains compact and does not overwhelm the user with\nan unnecessarily large number of options. The package is written in Python but\ndoes not require expert knowledge of Python to optimize simulators and\nexperiments. NUBO is distributed as open-source software under the BSD 3-Clause\nlicense.\n","authors":["Mike Diessner","Kevin J. Wilson","Richard D. Whalley"],"pdf_url":"https://arxiv.org/pdf/2305.06709v2.pdf","comment":"Accepted for publication by the Journal of Statistical Software"},{"id":"http://arxiv.org/abs/2303.01140v2","updated":"2024-06-03T07:51:54Z","published":"2023-03-02T10:39:13Z","title":"Cardinality Estimation over Knowledge Graphs with Embeddings and Graph\n  Neural Networks","summary":"  Cardinality Estimation over Knowledge Graphs (KG) is crucial for query\noptimization, yet remains a challenging task due to the semi-structured nature\nand complex correlations of typical Knowledge Graphs. In this work, we propose\nGNCE, a novel approach that leverages knowledge graph embeddings and Graph\nNeural Networks (GNN) to accurately predict the cardinality of conjunctive\nqueries. GNCE first creates semantically meaningful embeddings for all entities\nin the KG, which are then integrated into the given query, which is processed\nby a GNN to estimate the cardinality of the query. We evaluate GNCE on several\nKGs in terms of q-Error and demonstrate that it outperforms state-of-the-art\napproaches based on sampling, summaries, and (machine) learning in terms of\nestimation accuracy while also having lower execution time and less parameters.\nAdditionally, we show that GNCE can inductively generalise to unseen entities,\nmaking it suitable for use in dynamic query processing scenarios. Our proposed\napproach has the potential to significantly improve query optimization and\nrelated applications that rely on accurate cardinality estimates of conjunctive\nqueries.\n","authors":["Tim Schwabe","Maribel Acosta"],"pdf_url":"https://arxiv.org/pdf/2303.01140v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.04854v3","updated":"2024-06-03T07:48:19Z","published":"2024-02-07T13:54:06Z","title":"Hierarchical Tree-structured Knowledge Graph For Academic Insight Survey","summary":"  Research surveys have always posed a challenge for beginner researchers who\nlack of research training. These researchers struggle to understand the\ndirections within their research topic, and the discovery of new research\nfindings within a short time. One way to provide intuitive assistance to\nbeginner researchers is by offering relevant knowledge graphs(KG) and\nrecommending related academic papers. However, existing navigation knowledge\ngraphs primarily rely on keywords in the research field and often fail to\npresent the logical hierarchy among multiple related papers clearly. Moreover,\nmost recommendation systems for academic papers simply rely on high text\nsimilarity, which can leave researchers confused as to why a particular article\nis being recommended. They may lack of grasp important information about the\ninsight connection between \"Issue resolved\" and \"Issue finding\" that they hope\nto obtain. To address these issues, this study aims to support research insight\nsurveys for beginner researchers by establishing a hierarchical tree-structured\nknowledge graph that reflects the inheritance insight of research topics and\nthe relevance insight among the academic papers.\n","authors":["Jinghong Li","Huy Phan","Wen Gu","Koichi Ota","Shinobu Hasegawa"],"pdf_url":"https://arxiv.org/pdf/2402.04854v3.pdf","comment":"This paper will be submitted to 'The 18TH International Conference on\n  INnovations in Intelligent SysTems and Applications (INISTA 2024)'"},{"id":"http://arxiv.org/abs/2402.02801v2","updated":"2024-06-03T07:35:25Z","published":"2024-02-05T08:19:56Z","title":"KS-Lottery: Finding Certified Lottery Tickets for Multilingual Language\n  Models","summary":"  The lottery ticket hypothesis posits the existence of ``winning tickets''\nwithin a randomly initialized neural network. Do winning tickets exist for LLMs\nin fine-tuning scenarios? How can we find such winning tickets? In this paper,\nwe propose KS-Lottery, a method to identify a small subset of LLM parameters\nhighly effective in multilingual fine-tuning. Our key idea is to use\nKolmogorov-Smirnov Test to analyze the distribution shift of parameters before\nand after fine-tuning. We further theoretically prove that KS-Lottery can find\nthe certified winning tickets in the embedding layer, fine-tuning on the found\nparameters is guaranteed to perform as well as full fine-tuning. Comparing\nKS-Lottery with other parameter-efficient tuning algorithms on translation\ntasks, the experimental results show that KS-Lottery finds a much smaller set\nof parameters for fine-tuning while achieving the comparable performance as\nfull fine-tuning LLM. Surprisingly, we find that fine-tuning 18 tokens'\nembedding of LLaMA suffices to reach the fine-tuning translation\nperformance~\\footnote{https://github.com/CONE-MT/KS-Lottery.}.\n","authors":["Fei Yuan","Chang Ma","Shuai Yuan","Qiushi Sun","Lei Li"],"pdf_url":"https://arxiv.org/pdf/2402.02801v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.10198v3","updated":"2024-06-03T07:34:37Z","published":"2024-02-15T18:55:05Z","title":"SAMformer: Unlocking the Potential of Transformers in Time Series\n  Forecasting with Sharpness-Aware Minimization and Channel-Wise Attention","summary":"  Transformer-based architectures achieved breakthrough performance in natural\nlanguage processing and computer vision, yet they remain inferior to simpler\nlinear baselines in multivariate long-term forecasting. To better understand\nthis phenomenon, we start by studying a toy linear forecasting problem for\nwhich we show that transformers are incapable of converging to their true\nsolution despite their high expressive power. We further identify the attention\nof transformers as being responsible for this low generalization capacity.\nBuilding upon this insight, we propose a shallow lightweight transformer model\nthat successfully escapes bad local minima when optimized with sharpness-aware\noptimization. We empirically demonstrate that this result extends to all\ncommonly used real-world multivariate time series datasets. In particular,\nSAMformer surpasses current state-of-the-art methods and is on par with the\nbiggest foundation model MOIRAI while having significantly fewer parameters.\nThe code is available at https://github.com/romilbert/samformer.\n","authors":["Romain Ilbert","Ambroise Odonnat","Vasilii Feofanov","Aladin Virmaux","Giuseppe Paolo","Themis Palpanas","Ievgen Redko"],"pdf_url":"https://arxiv.org/pdf/2402.10198v3.pdf","comment":"Accepted as an Oral at ICML 2024, Vienna. The first two authors\n  contributed equally"},{"id":"http://arxiv.org/abs/2405.00946v2","updated":"2024-06-03T07:13:37Z","published":"2024-05-02T02:15:23Z","title":"SparseTSF: Modeling Long-term Time Series Forecasting with 1k Parameters","summary":"  This paper introduces SparseTSF, a novel, extremely lightweight model for\nLong-term Time Series Forecasting (LTSF), designed to address the challenges of\nmodeling complex temporal dependencies over extended horizons with minimal\ncomputational resources. At the heart of SparseTSF lies the Cross-Period Sparse\nForecasting technique, which simplifies the forecasting task by decoupling the\nperiodicity and trend in time series data. This technique involves downsampling\nthe original sequences to focus on cross-period trend prediction, effectively\nextracting periodic features while minimizing the model's complexity and\nparameter count. Based on this technique, the SparseTSF model uses fewer than\n*1k* parameters to achieve competitive or superior performance compared to\nstate-of-the-art models. Furthermore, SparseTSF showcases remarkable\ngeneralization capabilities, making it well-suited for scenarios with limited\ncomputational resources, small samples, or low-quality data. The code is\npublicly available at this repository: https://github.com/lss-1138/SparseTSF.\n","authors":["Shengsheng Lin","Weiwei Lin","Wentai Wu","Haojun Chen","Junjie Yang"],"pdf_url":"https://arxiv.org/pdf/2405.00946v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.00376v2","updated":"2024-06-03T07:09:39Z","published":"2024-03-01T09:01:53Z","title":"Spurious Feature Eraser: Stabilizing Test-Time Adaptation for\n  Vision-Language Foundation Model","summary":"  Vision-language foundation models have exhibited remarkable success across a\nmultitude of downstream tasks due to their scalability on extensive image-text\npaired data. However, these models also display significant limitations when\napplied to downstream tasks, such as fine-grained image classification, as a\nresult of ``decision shortcuts'' that hinder their generalization capabilities.\nIn this work, we find that the CLIP model possesses a rich set of features,\nencompassing both \\textit{desired invariant causal features} and\n\\textit{undesired decision shortcuts}. Moreover, the underperformance of CLIP\non downstream tasks originates from its inability to effectively utilize\npre-trained features in accordance with specific task requirements. To address\nthis challenge, we propose a simple yet effective method, Spurious Feature\nEraser (SEraser), to alleviate the decision shortcuts by erasing the spurious\nfeatures. Specifically, we introduce a test-time prompt tuning paradigm that\noptimizes a learnable prompt, thereby compelling the model to exploit invariant\nfeatures while disregarding decision shortcuts during the inference phase. The\nproposed method effectively alleviates excessive dependence on potentially\nmisleading spurious information. We conduct comparative analysis of the\nproposed method against various approaches which validates the significant\nsuperiority.\n","authors":["Huan Ma","Yan Zhu","Changqing Zhang","Peilin Zhao","Baoyuan Wu","Long-Kai Huang","Qinghua Hu","Bingzhe Wu"],"pdf_url":"https://arxiv.org/pdf/2403.00376v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2211.03316v2","updated":"2024-06-03T07:01:54Z","published":"2022-11-07T05:36:30Z","title":"Accented Text-to-Speech Synthesis with a Conditional Variational\n  Autoencoder","summary":"  Accent plays a significant role in speech communication, influencing one's\ncapability to understand as well as conveying a person's identity. This paper\nintroduces a novel and efficient framework for accented Text-to-Speech (TTS)\nsynthesis based on a Conditional Variational Autoencoder. It has the ability to\nsynthesize a selected speaker's voice, which is converted to any desired target\naccent. Our thorough experiments validate the effectiveness of the proposed\nframework using both objective and subjective evaluations. The results also\nshow remarkable performance in terms of the ability to manipulate accents in\nthe synthesized speech and provide a promising avenue for future accented TTS\nresearch.\n","authors":["Jan Melechovsky","Ambuj Mehrish","Berrak Sisman","Dorien Herremans"],"pdf_url":"https://arxiv.org/pdf/2211.03316v2.pdf","comment":"preprint submitted to a conference, under review"},{"id":"http://arxiv.org/abs/2401.04679v7","updated":"2024-06-03T06:59:31Z","published":"2024-01-09T17:09:01Z","title":"RoSA: Accurate Parameter-Efficient Fine-Tuning via Robust Adaptation","summary":"  We investigate parameter-efficient fine-tuning (PEFT) methods that can\nprovide good accuracy under limited computational and memory budgets in the\ncontext of large language models (LLMs). We present a new PEFT method called\nRobust Adaptation (RoSA) inspired by robust principal component analysis that\njointly trains $\\textit{low-rank}$ and $\\textit{highly-sparse}$ components on\ntop of a set of fixed pretrained weights to efficiently approximate the\nperformance of a full-fine-tuning (FFT) solution. Across a series of\nchallenging generative tasks such as grade-school math and SQL query\ngeneration, which require fine-tuning for good performance, we show that RoSA\noutperforms LoRA, pure sparse fine-tuning, and alternative hybrid methods at\nthe same parameter budget, and can even recover the performance of FFT on some\ntasks. We provide system support for RoSA to complement the training algorithm,\nspecifically in the form of sparse GPU kernels which enable memory- and\ncomputationally-efficient training, and show that it is also compatible with\nlow-precision base weights, resulting in the first joint representation\ncombining quantization, low-rank and sparse approximations. Our code is\navailable at https://github.com/IST-DASLab/RoSA.\n","authors":["Mahdi Nikdan","Soroush Tabesh","Elvir Crnčević","Dan Alistarh"],"pdf_url":"https://arxiv.org/pdf/2401.04679v7.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.18018v4","updated":"2024-06-03T06:52:58Z","published":"2024-01-31T17:28:24Z","title":"On Prompt-Driven Safeguarding for Large Language Models","summary":"  Prepending model inputs with safety prompts is a common practice for\nsafeguarding large language models (LLMs) against queries with harmful intents.\nHowever, the underlying working mechanisms of safety prompts have not been\nunraveled yet, restricting the possibility of automatically optimizing them to\nimprove LLM safety. In this work, we investigate how LLMs' behavior (i.e.,\ncomplying with or refusing user queries) is affected by safety prompts from the\nperspective of model representation. We find that in the representation space,\nthe input queries are typically moved by safety prompts in a \"higher-refusal\"\ndirection, in which models become more prone to refusing to provide assistance,\neven when the queries are harmless. On the other hand, LLMs are naturally\ncapable of distinguishing harmful and harmless queries without safety prompts.\nInspired by these findings, we propose a method for safety prompt optimization,\nnamely DRO (Directed Representation Optimization). Treating a safety prompt as\ncontinuous, trainable embeddings, DRO learns to move the queries'\nrepresentations along or opposite the refusal direction, depending on their\nharmfulness. Experiments with eight LLMs on out-of-domain and jailbreak\nbenchmarks demonstrate that DRO remarkably improves the safeguarding\nperformance of human-crafted safety prompts, without compromising the models'\ngeneral performance.\n","authors":["Chujie Zheng","Fan Yin","Hao Zhou","Fandong Meng","Jie Zhou","Kai-Wei Chang","Minlie Huang","Nanyun Peng"],"pdf_url":"https://arxiv.org/pdf/2401.18018v4.pdf","comment":"ICML 2024"},{"id":"http://arxiv.org/abs/2401.04514v2","updated":"2024-06-03T06:50:26Z","published":"2024-01-09T12:12:50Z","title":"Rewriting the Code: A Simple Method for Large Language Model Augmented\n  Code Search","summary":"  In code search, the Generation-Augmented Retrieval (GAR) framework, which\ngenerates exemplar code snippets to augment queries, has emerged as a promising\nstrategy to address the principal challenge of modality misalignment between\ncode snippets and natural language queries, particularly with the demonstrated\ncode generation capabilities of Large Language Models (LLMs). Nevertheless, our\npreliminary investigations indicate that the improvements conferred by such an\nLLM-augmented framework are somewhat constrained. This limitation could\npotentially be ascribed to the fact that the generated codes, albeit\nfunctionally accurate, frequently display a pronounced stylistic deviation from\nthe ground truth code in the codebase. In this paper, we extend the\nfoundational GAR framework and propose a simple yet effective method that\nadditionally Rewrites the Code (ReCo) within the codebase for style\nnormalization. Experimental results demonstrate that ReCo significantly boosts\nretrieval accuracy across sparse (up to 35.7%), zero-shot dense (up to 27.6%),\nand fine-tuned dense (up to 23.6%) retrieval settings in diverse search\nscenarios. To further elucidate the advantages of ReCo and stimulate research\nin code style normalization, we introduce Code Style Similarity, the first\nmetric tailored to quantify stylistic similarities in code. Notably, our\nempirical findings reveal the inadequacy of existing metrics in capturing\nstylistic nuances. The source code and data are available at\n\\url{https://github.com/Alex-HaochenLi/ReCo}.\n","authors":["Haochen Li","Xin Zhou","Zhiqi Shen"],"pdf_url":"https://arxiv.org/pdf/2401.04514v2.pdf","comment":"Accepted to ACL 2024"},{"id":"http://arxiv.org/abs/2312.12558v3","updated":"2024-06-03T06:17:33Z","published":"2023-12-19T19:53:58Z","title":"Sample Efficient Reinforcement Learning with Partial Dynamics Knowledge","summary":"  The problem of sample complexity of online reinforcement learning is often\nstudied in the literature without taking into account any partial knowledge\nabout the system dynamics that could potentially accelerate the learning\nprocess. In this paper, we study the sample complexity of online Q-learning\nmethods when some prior knowledge about the dynamics is available or can be\nlearned efficiently. We focus on systems that evolve according to an additive\ndisturbance model of the form $S_{h+1} = f(S_h, A_h) + W_h$, where $f$\nrepresents the underlying system dynamics, and $W_h$ are unknown disturbances\nindependent of states and actions. In the setting of finite episodic Markov\ndecision processes with $S$ states, $A$ actions, and episode length $H$, we\npresent an optimistic Q-learning algorithm that achieves\n$\\tilde{\\mathcal{O}}(\\text{Poly}(H)\\sqrt{T})$ regret under perfect knowledge of\n$f$, where $T$ is the total number of interactions with the system. This is in\ncontrast to the typical $\\tilde{\\mathcal{O}}(\\text{Poly}(H)\\sqrt{SAT})$ regret\nfor existing Q-learning methods. Further, if only a noisy estimate $\\hat{f}$ of\n$f$ is available, our method can learn an approximately optimal policy in a\nnumber of samples that is independent of the cardinalities of state and action\nspaces. The sub-optimality gap depends on the approximation error $\\hat{f}-f$,\nas well as the Lipschitz constant of the corresponding optimal value function.\nOur approach does not require modeling of the transition probabilities and\nenjoys the same memory complexity as model-free methods.\n","authors":["Meshal Alharbi","Mardavij Roozbehani","Munther Dahleh"],"pdf_url":"https://arxiv.org/pdf/2312.12558v3.pdf","comment":"Published in the 38th Annual AAAI Conference on Artificial\n  Intelligence"},{"id":"http://arxiv.org/abs/2402.00138v2","updated":"2024-06-03T06:05:29Z","published":"2024-01-31T19:32:33Z","title":"Decomposable Submodular Maximization in Federated Setting","summary":"  Submodular functions, as well as the sub-class of decomposable submodular\nfunctions, and their optimization appear in a wide range of applications in\nmachine learning, recommendation systems, and welfare maximization. However,\noptimization of decomposable submodular functions with millions of component\nfunctions is computationally prohibitive. Furthermore, the component functions\nmay be private (they might represent user preference function, for example) and\ncannot be widely shared. To address these issues, we propose a {\\em federated\noptimization} setting for decomposable submodular optimization. In this\nsetting, clients have their own preference functions, and a weighted sum of\nthese preferences needs to be maximized. We implement the popular {\\em\ncontinuous greedy} algorithm in this setting where clients take parallel small\nlocal steps towards the local solution and then the local changes are\naggregated at a central server. To address the large number of clients, the\naggregation is performed only on a subsampled set. Further, the aggregation is\nperformed only intermittently between stretches of parallel local steps, which\nreduces communication cost significantly. We show that our federated algorithm\nis guaranteed to provide a good approximate solution, even in the presence of\nabove cost-cutting measures. Finally, we show how the federated setting can be\nincorporated in solving fundamental discrete submodular optimization problems\nsuch as Maximum Coverage and Facility Location.\n","authors":["Akbar Rafiey"],"pdf_url":"https://arxiv.org/pdf/2402.00138v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.15043v2","updated":"2024-06-03T06:02:39Z","published":"2024-02-23T01:30:39Z","title":"KIEval: A Knowledge-grounded Interactive Evaluation Framework for Large\n  Language Models","summary":"  Automatic evaluation methods for large language models (LLMs) are hindered by\ndata contamination, leading to inflated assessments of their effectiveness.\nExisting strategies, which aim to detect contaminated texts, focus on\nquantifying contamination status instead of accurately gauging model\nperformance. In this paper, we introduce KIEval, a Knowledge-grounded\nInteractive Evaluation framework, which incorporates an LLM-powered\n\"interactor\" role for the first time to accomplish a dynamic\ncontamination-resilient evaluation. Starting with a question in a conventional\nLLM benchmark involving domain-specific knowledge, KIEval utilizes dynamically\ngenerated, multi-round, and knowledge-focused dialogues to determine whether a\nmodel's response is merely a recall of benchmark answers or demonstrates a deep\ncomprehension to apply knowledge in more complex conversations. Extensive\nexperiments on seven leading LLMs across five datasets validate KIEval's\neffectiveness and generalization. We also reveal that data contamination brings\nno contribution or even negative effect to models' real-world applicability and\nunderstanding, and existing contamination detection methods for LLMs can only\nidentify contamination in pre-training but not during supervised fine-tuning.\n","authors":["Zhuohao Yu","Chang Gao","Wenjin Yao","Yidong Wang","Wei Ye","Jindong Wang","Xing Xie","Yue Zhang","Shikun Zhang"],"pdf_url":"https://arxiv.org/pdf/2402.15043v2.pdf","comment":"Accepted to ACL 2024 (main conference); 19 pages, 5 figures, 19\n  tables, code is available at: https://github.com/zhuohaoyu/KIEval"},{"id":"http://arxiv.org/abs/2402.11078v3","updated":"2024-06-03T05:39:10Z","published":"2024-02-16T21:10:33Z","title":"Model Editing by Standard Fine-Tuning","summary":"  Standard fine-tuning is considered not as effective as specialized methods\nfor model editing due to its comparatively poor performance. However, it is\nsimple, agnostic to the architectural details of the model being edited, and\nable to leverage advances in standard training techniques with no additional\nwork (e.g., black-box PEFT for computational efficiency), making it an\nappealing choice for a model editor. In this work, we show that standard\nfine-tuning alone can yield competitive model editing performance with two\nminor modifications. First, we optimize the conditional likelihood rather than\nthe full likelihood. Second, in addition to the typical practice of training on\nrandomly paraphrased edit prompts to encourage generalization, we also train on\nrandom or similar unedited facts to encourage locality. Our experiments on the\nZsRE and CounterFact datasets demonstrate that these simple modifications allow\nstandard fine-tuning to match or outperform highly specialized editors in terms\nof edit score.\n","authors":["Govind Gangadhar","Karl Stratos"],"pdf_url":"https://arxiv.org/pdf/2402.11078v3.pdf","comment":"Findings of ACL 2024"},{"id":"http://arxiv.org/abs/2405.11930v2","updated":"2024-06-03T05:21:54Z","published":"2024-05-20T10:12:23Z","title":"Data Contamination Calibration for Black-box LLMs","summary":"  The rapid advancements of Large Language Models (LLMs) tightly associate with\nthe expansion of the training data size. However, the unchecked\nultra-large-scale training sets introduce a series of potential risks like data\ncontamination, i.e. the benchmark data is used for training. In this work, we\npropose a holistic method named Polarized Augment Calibration (PAC) along with\na new to-be-released dataset to detect the contaminated data and diminish the\ncontamination effect. PAC extends the popular MIA (Membership Inference Attack)\n-- from machine learning community -- by forming a more global target at\ndetecting training data to Clarify invisible training data. As a pioneering\nwork, PAC is very much plug-and-play that can be integrated with most (if not\nall) current white- and black-box LLMs. By extensive experiments, PAC\noutperforms existing methods by at least 4.5%, towards data contamination\ndetection on more 4 dataset formats, with more than 10 base LLMs. Besides, our\napplication in real-world scenarios highlights the prominent presence of\ncontamination and related issues.\n","authors":["Wentao Ye","Jiaqi Hu","Liyao Li","Haobo Wang","Gang Chen","Junbo Zhao"],"pdf_url":"https://arxiv.org/pdf/2405.11930v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.01773v2","updated":"2024-06-03T05:05:24Z","published":"2024-03-04T07:03:10Z","title":"Improving out-of-distribution generalization in graphs via hierarchical\n  semantic environments","summary":"  Out-of-distribution (OOD) generalization in the graph domain is challenging\ndue to complex distribution shifts and a lack of environmental contexts. Recent\nmethods attempt to enhance graph OOD generalization by generating flat\nenvironments. However, such flat environments come with inherent limitations to\ncapture more complex data distributions. Considering the DrugOOD dataset, which\ncontains diverse training environments (e.g., scaffold, size, etc.), flat\ncontexts cannot sufficiently address its high heterogeneity. Thus, a new\nchallenge is posed to generate more semantically enriched environments to\nenhance graph invariant learning for handling distribution shifts. In this\npaper, we propose a novel approach to generate hierarchical semantic\nenvironments for each graph. Firstly, given an input graph, we explicitly\nextract variant subgraphs from the input graph to generate proxy predictions on\nlocal environments. Then, stochastic attention mechanisms are employed to\nre-extract the subgraphs for regenerating global environments in a hierarchical\nmanner. In addition, we introduce a new learning objective that guides our\nmodel to learn the diversity of environments within the same hierarchy while\nmaintaining consistency across different hierarchies. This approach enables our\nmodel to consider the relationships between environments and facilitates robust\ngraph invariant learning. Extensive experiments on real-world graph data have\ndemonstrated the effectiveness of our framework. Particularly, in the\nchallenging dataset DrugOOD, our method achieves up to 1.29% and 2.83%\nimprovement over the best baselines on IC50 and EC50 prediction tasks,\nrespectively.\n","authors":["Yinhua Piao","Sangseon Lee","Yijingxiu Lu","Sun Kim"],"pdf_url":"https://arxiv.org/pdf/2403.01773v2.pdf","comment":"CVPR 2024"},{"id":"http://arxiv.org/abs/2310.08540v5","updated":"2024-06-03T04:18:11Z","published":"2023-10-12T17:32:09Z","title":"Do pretrained Transformers Learn In-Context by Gradient Descent?","summary":"  The emergence of In-Context Learning (ICL) in LLMs remains a remarkable\nphenomenon that is partially understood. To explain ICL, recent studies have\ncreated theoretical connections to Gradient Descent (GD). We ask, do such\nconnections hold up in actual pre-trained language models? We highlight the\nlimiting assumptions in prior works that make their setup considerably\ndifferent from the practical setup in which language models are trained. For\nexample, their experimental verification uses \\emph{ICL objective} (training\nmodels explicitly for ICL), which differs from the emergent ICL in the wild.\nFurthermore, the theoretical hand-constructed weights used in these studies\nhave properties that don't match those of real LLMs. We also look for evidence\nin real models. We observe that ICL and GD have different sensitivity to the\norder in which they observe demonstrations. Finally, we probe and compare the\nICL vs. GD hypothesis in a natural setting. We conduct comprehensive empirical\nanalyses on language models pre-trained on natural data (LLaMa-7B). Our\ncomparisons of three performance metrics highlight the inconsistent behavior of\nICL and GD as a function of various factors such as datasets, models, and the\nnumber of demonstrations. We observe that ICL and GD modify the output\ndistribution of language models differently. These results indicate that\n\\emph{the equivalence between ICL and GD remains an open hypothesis} and calls\nfor further studies.\n","authors":["Lingfeng Shen","Aayush Mishra","Daniel Khashabi"],"pdf_url":"https://arxiv.org/pdf/2310.08540v5.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.07586v5","updated":"2024-06-03T04:17:49Z","published":"2023-12-11T02:40:40Z","title":"Characteristic Guidance: Non-linear Correction for Diffusion Model at\n  Large Guidance Scale","summary":"  Popular guidance for denoising diffusion probabilistic model (DDPM) linearly\ncombines distinct conditional models together to provide enhanced control over\nsamples. However, this approach overlooks nonlinear effects that become\nsignificant when guidance scale is large. To address this issue, we propose\ncharacteristic guidance, a guidance method that provides first-principle\nnon-linear correction for classifier-free guidance. Such correction forces the\nguided DDPMs to respect the Fokker-Planck (FP) equation of diffusion process,\nin a way that is training-free and compatible with existing sampling methods.\nExperiments show that characteristic guidance enhances semantic characteristics\nof prompts and mitigate irregularities in image generation, proving effective\nin diverse applications ranging from simulating magnet phase transitions to\nlatent space sampling.\n","authors":["Candi Zheng","Yuan Lan"],"pdf_url":"https://arxiv.org/pdf/2312.07586v5.pdf","comment":"8 pages, 7 figures"},{"id":"http://arxiv.org/abs/2405.18395v2","updated":"2024-06-03T03:53:16Z","published":"2024-05-28T17:35:05Z","title":"MC-GTA: Metric-Constrained Model-Based Clustering using Goodness-of-fit\n  Tests with Autocorrelations","summary":"  A wide range of (multivariate) temporal (1D) and spatial (2D) data analysis\ntasks, such as grouping vehicle sensor trajectories, can be formulated as\nclustering with given metric constraints. Existing metric-constrained\nclustering algorithms overlook the rich correlation between feature similarity\nand metric distance, i.e., metric autocorrelation. The model-based variations\nof these clustering algorithms (e.g. TICC and STICC) achieve SOTA performance,\nyet suffer from computational instability and complexity by using a\nmetric-constrained Expectation-Maximization procedure. In order to address\nthese two problems, we propose a novel clustering algorithm, MC-GTA\n(Model-based Clustering via Goodness-of-fit Tests with Autocorrelations). Its\nobjective is only composed of pairwise weighted sums of feature similarity\nterms (square Wasserstein-2 distance) and metric autocorrelation terms (a novel\nmultivariate generalization of classic semivariogram). We show that MC-GTA is\neffectively minimizing the total hinge loss for intra-cluster observation pairs\nnot passing goodness-of-fit tests, i.e., statistically not originating from the\nsame distribution. Experiments on 1D/2D synthetic and real-world datasets\ndemonstrate that MC-GTA successfully incorporates metric autocorrelation. It\noutperforms strong baselines by large margins (up to 14.3% in ARI and 32.1% in\nNMI) with faster and stabler optimization (>10x speedup).\n","authors":["Zhangyu Wang","Gengchen Mai","Krzysztof Janowicz","Ni Lao"],"pdf_url":"https://arxiv.org/pdf/2405.18395v2.pdf","comment":"ICML-2024 Proceedings"},{"id":"http://arxiv.org/abs/2312.11973v4","updated":"2024-06-03T03:51:38Z","published":"2023-12-19T09:11:49Z","title":"Continual Learning: Forget-free Winning Subnetworks for Video\n  Representations","summary":"  Inspired by the Lottery Ticket Hypothesis (LTH), which highlights the\nexistence of efficient subnetworks within larger, dense networks, a\nhigh-performing Winning Subnetwork (WSN) in terms of task performance under\nappropriate sparsity conditions is considered for various continual learning\ntasks. It leverages pre-existing weights from dense networks to achieve\nefficient learning in Task Incremental Learning (TIL) and Task-agnostic\nIncremental Learning (TaIL) scenarios. In Few-Shot Class Incremental Learning\n(FSCIL), a variation of WSN referred to as the Soft subnetwork (SoftNet) is\ndesigned to prevent overfitting when the data samples are scarce. Furthermore,\nthe sparse reuse of WSN weights is considered for Video Incremental Learning\n(VIL). The use of Fourier Subneural Operator (FSO) within WSN is considered. It\nenables compact encoding of videos and identifies reusable subnetworks across\nvarying bandwidths. We have integrated FSO into different architectural\nframeworks for continual learning, including VIL, TIL, and FSCIL. Our\ncomprehensive experiments demonstrate FSO's effectiveness, significantly\nimproving task performance at various convolutional representational levels.\nSpecifically, FSO enhances higher-layer performance in TIL and FSCIL and\nlower-layer performance in VIL.\n","authors":["Haeyong Kang","Jaehong Yoon","Sung Ju Hwang","Chang D. Yoo"],"pdf_url":"https://arxiv.org/pdf/2312.11973v4.pdf","comment":"arXiv admin note: substantial text overlap with arXiv:2303.14962,\n  arXiv:2306.11305"},{"id":"http://arxiv.org/abs/2310.07160v3","updated":"2024-06-03T03:35:01Z","published":"2023-10-11T03:12:47Z","title":"LLark: A Multimodal Instruction-Following Language Model for Music","summary":"  Music has a unique and complex structure which is challenging for both expert\nhumans and existing AI systems to understand, and presents unique challenges\nrelative to other forms of audio. We present LLark, an instruction-tuned\nmultimodal model for \\emph{music} understanding. We detail our process for\ndataset creation, which involves augmenting the annotations of diverse\nopen-source music datasets and converting them to a unified instruction-tuning\nformat. We propose a multimodal architecture for LLark, integrating a\npretrained generative model for music with a pretrained language model. In\nevaluations on three types of tasks (music understanding, captioning,\nreasoning), we show that LLark matches or outperforms existing baselines in\nmusic understanding, and that humans show a high degree of agreement with its\nresponses in captioning and reasoning tasks. LLark is trained entirely from\nopen-source music data and models, and we make our training code available\nalong with the release of this paper. Additional results and audio examples are\nat https://bit.ly/llark, and our source code is available at\nhttps://github.com/spotify-research/llark .\n","authors":["Josh Gardner","Simon Durand","Daniel Stoller","Rachel M. Bittner"],"pdf_url":"https://arxiv.org/pdf/2310.07160v3.pdf","comment":"ICML camera-ready version"},{"id":"http://arxiv.org/abs/2405.07374v2","updated":"2024-06-03T03:32:56Z","published":"2024-05-12T20:27:34Z","title":"Conformalized Survival Distributions: A Generic Post-Process to Increase\n  Calibration","summary":"  Discrimination and calibration represent two important properties of survival\nanalysis, with the former assessing the model's ability to accurately rank\nsubjects and the latter evaluating the alignment of predicted outcomes with\nactual events. With their distinct nature, it is hard for survival models to\nsimultaneously optimize both of them especially as many previous results found\nimproving calibration tends to diminish discrimination performance. This paper\nintroduces a novel approach utilizing conformal regression that can improve a\nmodel's calibration without degrading discrimination. We provide theoretical\nguarantees for the above claim, and rigorously validate the efficiency of our\napproach across 11 real-world datasets, showcasing its practical applicability\nand robustness in diverse scenarios.\n","authors":["Shi-ang Qi","Yakun Yu","Russell Greiner"],"pdf_url":"https://arxiv.org/pdf/2405.07374v2.pdf","comment":"Accepted to ICML 2024; 37 pages, 19 figures"},{"id":"http://arxiv.org/abs/2402.14859v2","updated":"2024-06-03T03:29:07Z","published":"2024-02-20T23:08:21Z","title":"The Wolf Within: Covert Injection of Malice into MLLM Societies via an\n  MLLM Operative","summary":"  Due to their unprecedented ability to process and respond to various types of\ndata, Multimodal Large Language Models (MLLMs) are constantly defining the new\nboundary of Artificial General Intelligence (AGI). As these advanced generative\nmodels increasingly form collaborative networks for complex tasks, the\nintegrity and security of these systems are crucial. Our paper, ``The Wolf\nWithin'', explores a novel vulnerability in MLLM societies - the indirect\npropagation of malicious content. Unlike direct harmful output generation for\nMLLMs, our research demonstrates how a single MLLM agent can be subtly\ninfluenced to generate prompts that, in turn, induce other MLLM agents in the\nsociety to output malicious content. Our findings reveal that, an MLLM agent,\nwhen manipulated to produce specific prompts or instructions, can effectively\n``infect'' other agents within a society of MLLMs. This infection leads to the\ngeneration and circulation of harmful outputs, such as dangerous instructions\nor misinformation, across the society. We also show the transferability of\nthese indirectly generated prompts, highlighting their possibility in\npropagating malice through inter-agent communication. This research provides a\ncritical insight into a new dimension of threat posed by MLLMs, where a single\nagent can act as a catalyst for widespread malevolent influence. Our work\nunderscores the urgent need for developing robust mechanisms to detect and\nmitigate such covert manipulations within MLLM societies, ensuring their safe\nand ethical utilization in societal applications.\n","authors":["Zhen Tan","Chengshuai Zhao","Raha Moraffah","Yifan Li","Yu Kong","Tianlong Chen","Huan Liu"],"pdf_url":"https://arxiv.org/pdf/2402.14859v2.pdf","comment":"Accepted to workshop on ReGenAI@CVPR 2024"},{"id":"http://arxiv.org/abs/2405.12489v2","updated":"2024-06-03T03:26:59Z","published":"2024-05-21T04:18:57Z","title":"Exploring and Exploiting the Asymmetric Valley of Deep Neural Networks","summary":"  Exploring the loss landscape offers insights into the inherent principles of\ndeep neural networks (DNNs). Recent work suggests an additional asymmetry of\nthe valley beyond the flat and sharp ones, yet without thoroughly examining its\ncauses or implications. Our study methodically explores the factors affecting\nthe symmetry of DNN valleys, encompassing (1) the dataset, network\narchitecture, initialization, and hyperparameters that influence the\nconvergence point; and (2) the magnitude and direction of the noise for 1D\nvisualization. Our major observation shows that the {\\it degree of sign\nconsistency} between the noise and the convergence point is a critical\nindicator of valley symmetry. Theoretical insights from the aspects of ReLU\nactivation and softmax function could explain the interesting phenomenon. Our\ndiscovery propels novel understanding and applications in the scenario of Model\nFusion: (1) the efficacy of interpolating separate models significantly\ncorrelates with their sign consistency ratio, and (2) imposing sign alignment\nduring federated learning emerges as an innovative approach for model parameter\nalignment.\n","authors":["Xin-Chun Li","Jin-Lin Tang","Bo Zhang","Lan Li","De-Chuan Zhan"],"pdf_url":"https://arxiv.org/pdf/2405.12489v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.13536v2","updated":"2024-06-03T03:13:35Z","published":"2023-09-24T03:19:40Z","title":"Tackling the Unlimited Staleness in Federated Learning with Intertwined\n  Data and Device Heterogeneities","summary":"  The efficiency of Federated Learning (FL) is often affected by both data and\ndevice heterogeneities. Data heterogeneity is defined as the heterogeneity of\ndata distributions on different clients. Device heterogeneity is defined as the\nclients' variant latencies in uploading their local model updates due to\nheterogeneous conditions of local hardware resources, and causes the problem of\nstaleness when being addressed by asynchronous FL. Traditional schemes of\ntackling the impact of staleness consider data and device heterogeneities as\ntwo separate and independent aspects in FL, but this assumption is unrealistic\nin many practical FL scenarios where data and device heterogeneities are\nintertwined. In these cases, traditional schemes of weighted aggregation in FL\nhave been proved to be ineffective, and a better approach is to convert a stale\nmodel update into a non-stale one. In this paper, we present a new FL framework\nthat leverages the gradient inversion technique for such conversion, hence\nefficiently tackling unlimited staleness in clients' model updates. Our basic\nidea is to use gradient inversion to get estimations of clients' local training\ndata from their uploaded stale model updates, and use these estimations to\ncompute non-stale client model updates. In this way, we address the problem of\npossible data quality drop when using gradient inversion, while still\npreserving the clients' local data privacy. We compared our approach with the\nexisting FL strategies on mainstream datasets and models, and experiment\nresults demonstrate that when tackling unlimited staleness, our approach can\nsignificantly improve the trained model accuracy by up to 20% and speed up the\nFL training progress by up to 35%.\n","authors":["Haoming Wang","Wei Gao"],"pdf_url":"https://arxiv.org/pdf/2309.13536v2.pdf","comment":"15 pages"},{"id":"http://arxiv.org/abs/2405.20630v2","updated":"2024-06-03T03:11:45Z","published":"2024-05-31T05:42:47Z","title":"Stochastic Optimal Control for Diffusion Bridges in Function Spaces","summary":"  Recent advancements in diffusion models and diffusion bridges primarily focus\non finite-dimensional spaces, yet many real-world problems necessitate\noperations in infinite-dimensional function spaces for more natural and\ninterpretable formulations. In this paper, we present a theory of stochastic\noptimal control (SOC) tailored to infinite-dimensional spaces, aiming to extend\ndiffusion-based algorithms to function spaces. Specifically, we demonstrate how\nDoob's $h$-transform, the fundamental tool for constructing diffusion bridges,\ncan be derived from the SOC perspective and expanded to infinite dimensions.\nThis expansion presents a challenge, as infinite-dimensional spaces typically\nlack closed-form densities. Leveraging our theory, we establish that solving\nthe optimal control problem with a specific objective function choice is\nequivalent to learning diffusion-based generative models. We propose two\napplications: (1) learning bridges between two infinite-dimensional\ndistributions and (2) generative models for sampling from an\ninfinite-dimensional distribution. Our approach proves effective for diverse\nproblems involving continuous function space representations, such as\nresolution-free images, time-series data, and probability density functions.\n","authors":["Byoungwoo Park","Jungwon Choi","Sungbin Lim","Juho Lee"],"pdf_url":"https://arxiv.org/pdf/2405.20630v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.15524v3","updated":"2024-06-03T03:02:54Z","published":"2023-10-24T05:07:31Z","title":"On the Inherent Privacy Properties of Discrete Denoising Diffusion\n  Models","summary":"  Privacy concerns have led to a surge in the creation of synthetic datasets,\nwith diffusion models emerging as a promising avenue. Although prior studies\nhave performed empirical evaluations on these models, there has been a gap in\nproviding a mathematical characterization of their privacy-preserving\ncapabilities. To address this, we present the pioneering theoretical\nexploration of the privacy preservation inherent in discrete diffusion models\n(DDMs) for discrete dataset generation. Focusing on per-instance differential\nprivacy (pDP), our framework elucidates the potential privacy leakage for each\ndata point in a given training dataset, offering insights into how the privacy\nloss of each point correlates with the dataset's distribution. Our bounds also\nshow that training with $s$-sized data points leads to a surge in privacy\nleakage from $(\\epsilon, O(\\frac{1}{s^2\\epsilon}))$-pDP to $(\\epsilon,\nO(\\frac{1}{s\\epsilon}))$-pDP of the DDM during the transition from the pure\nnoise to the synthetic clean data phase, and a faster decay in diffusion\ncoefficients amplifies the privacy guarantee. Finally, we empirically verify\nour theoretical findings on both synthetic and real-world datasets.\n","authors":["Rongzhe Wei","Eleonora Kreačić","Haoyu Wang","Haoteng Yin","Eli Chien","Vamsi K. Potluru","Pan Li"],"pdf_url":"https://arxiv.org/pdf/2310.15524v3.pdf","comment":"58 pages"},{"id":"http://arxiv.org/abs/2404.09005v4","updated":"2024-06-03T02:51:46Z","published":"2024-04-13T13:18:40Z","title":"Proof-of-Learning with Incentive Security","summary":"  Most concurrent blockchain systems rely heavily on the Proof-of-Work (PoW) or\nProof-of-Stake (PoS) mechanisms for decentralized consensus and security\nassurance. However, the substantial energy expenditure stemming from\ncomputationally intensive yet meaningless tasks has raised considerable\nconcerns surrounding traditional PoW approaches, The PoS mechanism, while free\nof energy consumption, is subject to security and economic issues. Addressing\nthese issues, the paradigm of Proof-of-Useful-Work (PoUW) seeks to employ\nchallenges of practical significance as PoW, thereby imbuing energy consumption\nwith tangible value. While previous efforts in Proof of Learning (PoL) explored\nthe utilization of deep learning model training SGD tasks as PoUW challenges,\nrecent research has revealed its vulnerabilities to adversarial attacks and the\ntheoretical hardness in crafting a byzantine-secure PoL mechanism. In this\npaper, we introduce the concept of incentive-security that incentivizes\nrational provers to behave honestly for their best interest, bypassing the\nexisting hardness to design a PoL mechanism with computational efficiency, a\nprovable incentive-security guarantee and controllable difficulty.\nParticularly, our work is secure against two attacks to the recent work of Jia\net al. [2021], and also improves the computational overhead from $\\Theta(1)$ to\n$O(\\frac{\\log E}{E})$. Furthermore, while most recent research assumes trusted\nproblem providers and verifiers, our design also guarantees frontend\nincentive-security even when problem providers are untrusted, and verifier\nincentive-security that bypasses the Verifier's Dilemma. By incorporating ML\ntraining into blockchain consensus mechanisms with provable guarantees, our\nresearch not only proposes an eco-friendly solution to blockchain systems, but\nalso provides a proposal for a completely decentralized computing power market\nin the new AI age.\n","authors":["Zishuo Zhao","Zhixuan Fang","Xuechao Wang","Xi Chen","Yuan Zhou"],"pdf_url":"https://arxiv.org/pdf/2404.09005v4.pdf","comment":"16 pages"},{"id":"http://arxiv.org/abs/2405.03664v2","updated":"2024-06-03T02:50:35Z","published":"2024-05-06T17:41:13Z","title":"A New Robust Partial $p$-Wasserstein-Based Metric for Comparing\n  Distributions","summary":"  The $2$-Wasserstein distance is sensitive to minor geometric differences\nbetween distributions, making it a very powerful dissimilarity metric. However,\ndue to this sensitivity, a small outlier mass can also cause a significant\nincrease in the $2$-Wasserstein distance between two similar distributions.\nSimilarly, sampling discrepancy can cause the empirical $2$-Wasserstein\ndistance on $n$ samples in $\\mathbb{R}^2$ to converge to the true distance at a\nrate of $n^{-1/4}$, which is significantly slower than the rate of $n^{-1/2}$\nfor $1$-Wasserstein distance. We introduce a new family of distances\nparameterized by $k \\ge 0$, called $k$-RPW that is based on computing the\npartial $2$-Wasserstein distance. We show that (1) $k$-RPW satisfies the metric\nproperties, (2) $k$-RPW is robust to small outlier mass while retaining the\nsensitivity of $2$-Wasserstein distance to minor geometric differences, and (3)\nwhen $k$ is a constant, $k$-RPW distance between empirical distributions on $n$\nsamples in $\\mathbb{R}^2$ converges to the true distance at a rate of\n$n^{-1/3}$, which is faster than the convergence rate of $n^{-1/4}$ for the\n$2$-Wasserstein distance. Using the partial $p$-Wasserstein distance, we extend\nour distance to any $p \\in [1,\\infty]$. By setting parameters $k$ or $p$\nappropriately, we can reduce our distance to the total variation,\n$p$-Wasserstein, and the L\\'evy-Prokhorov distances. Experiments show that our\ndistance function achieves higher accuracy in comparison to the\n$1$-Wasserstein, $2$-Wasserstein, and TV distances for image retrieval tasks on\nnoisy real-world data sets.\n","authors":["Sharath Raghvendra","Pouyan Shirzadian","Kaiyi Zhang"],"pdf_url":"https://arxiv.org/pdf/2405.03664v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.14285v3","updated":"2024-06-03T02:47:27Z","published":"2024-02-22T04:55:58Z","title":"Symbolic Music Generation with Non-Differentiable Rule Guided Diffusion","summary":"  We study the problem of symbolic music generation (e.g., generating piano\nrolls), with a technical focus on non-differentiable rule guidance. Musical\nrules are often expressed in symbolic form on note characteristics, such as\nnote density or chord progression, many of which are non-differentiable which\npose a challenge when using them for guided diffusion. We propose \\oursfull\n(\\ours), a novel guidance method that only requires forward evaluation of rule\nfunctions that can work with pre-trained diffusion models in a plug-and-play\nway, thus achieving training-free guidance for non-differentiable rules for the\nfirst time. Additionally, we introduce a latent diffusion architecture for\nsymbolic music generation with high time resolution, which can be composed with\nSCG in a plug-and-play fashion. Compared to standard strong baselines in\nsymbolic music generation, this framework demonstrates marked advancements in\nmusic quality and rule-based controllability, outperforming current\nstate-of-the-art generators in a variety of settings. For detailed\ndemonstrations, code and model checkpoints, please visit our project website:\nhttps://scg-rule-guided-music.github.io/.\n","authors":["Yujia Huang","Adishree Ghatare","Yuanzhe Liu","Ziniu Hu","Qinsheng Zhang","Chandramouli S Sastry","Siddharth Gururani","Sageev Oore","Yisong Yue"],"pdf_url":"https://arxiv.org/pdf/2402.14285v3.pdf","comment":"ICML 2024 (Oral)"},{"id":"http://arxiv.org/abs/2405.17233v2","updated":"2024-06-03T02:46:53Z","published":"2024-05-27T14:49:39Z","title":"CLAQ: Pushing the Limits of Low-Bit Post-Training Quantization for LLMs","summary":"  Parameter quantization for Large Language Models (LLMs) has attracted\nincreasing attentions recently in reducing memory costs and improving\ncomputational efficiency. Early approaches have been widely adopted. However,\nthe existing methods suffer from poor performance in low-bit (such as 2 to 3\nbits) scenarios. In this paper, we present a novel and effective Column-Level\nAdaptive weight Quantization (CLAQ) framework by introducing three different\ntypes of adaptive strategies for LLM quantization. Firstly, a K-Means\nclustering based algorithm is proposed that allows dynamic generation of\nquantization centroids for each column of a parameter matrix. Secondly, we\ndesign an outlier-guided adaptive precision search strategy which can\ndynamically assign varying bit-widths to different columns. Finally, a dynamic\noutlier reservation scheme is developed to retain some parameters in their\noriginal float point precision, in trade off of boosted model performance.\nExperiments on various mainstream open source LLMs including LLaMA-1, LLaMA-2\nand Yi demonstrate that our methods achieve the state-of-the-art results across\ndifferent bit settings, especially in extremely low-bit scenarios. Code is\navailable at https://github.com/fayuge/CLAQ.\n","authors":["Haoyu Wang","Bei Liu","Hang Shao","Bo Xiao","Ke Zeng","Guanglu Wan","Yanmin Qian"],"pdf_url":"https://arxiv.org/pdf/2405.17233v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.08147v3","updated":"2024-06-03T02:43:24Z","published":"2024-03-13T00:19:06Z","title":"Representing Molecules as Random Walks Over Interpretable Grammars","summary":"  Recent research in molecular discovery has primarily been devoted to small,\ndrug-like molecules, leaving many similarly important applications in material\ndesign without adequate technology. These applications often rely on more\ncomplex molecular structures with fewer examples that are carefully designed\nusing known substructures. We propose a data-efficient and interpretable model\nfor representing and reasoning over such molecules in terms of graph grammars\nthat explicitly describe the hierarchical design space featuring motifs to be\nthe design basis. We present a novel representation in the form of random walks\nover the design space, which facilitates both molecule generation and property\nprediction. We demonstrate clear advantages over existing methods in terms of\nperformance, efficiency, and synthesizability of predicted molecules, and we\nprovide detailed insights into the method's chemical interpretability.\n","authors":["Michael Sun","Minghao Guo","Weize Yuan","Veronika Thost","Crystal Elaine Owens","Aristotle Franklin Grosz","Sharvaa Selvan","Katelyn Zhou","Hassan Mohiuddin","Benjamin J Pedretti","Zachary P Smith","Jie Chen","Wojciech Matusik"],"pdf_url":"https://arxiv.org/pdf/2403.08147v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.16324v2","updated":"2024-06-03T02:37:28Z","published":"2024-02-26T06:08:25Z","title":"Achieving $\\tilde{O}(1/ε)$ Sample Complexity for Constrained\n  Markov Decision Process","summary":"  We consider the reinforcement learning problem for the constrained Markov\ndecision process (CMDP), which plays a central role in satisfying safety or\nresource constraints in sequential learning and decision-making. In this\nproblem, we are given finite resources and a MDP with unknown transition\nprobabilities. At each stage, we take an action, collecting a reward and\nconsuming some resources, all assumed to be unknown and need to be learned over\ntime. In this work, we take the first step towards deriving optimal\nproblem-dependent guarantees for the CMDP problems. We derive a logarithmic\nregret bound, which translates into a\n$O(\\frac{1}{\\Delta\\cdot\\eps}\\cdot\\log^2(1/\\eps))$ sample complexity bound, with\n$\\Delta$ being a problem-dependent parameter, yet independent of $\\eps$. Our\nsample complexity bound improves upon the state-of-art $O(1/\\eps^2)$ sample\ncomplexity for CMDP problems established in the previous literature, in terms\nof the dependency on $\\eps$. To achieve this advance, we develop a new\nframework for analyzing CMDP problems. To be specific, our algorithm operates\nin the primal space and we resolve the primal LP for the CMDP problem at each\nperiod in an online manner, with \\textit{adaptive} remaining resource\ncapacities. The key elements of our algorithm are: i) a characterization of the\ninstance hardness via LP basis, ii) an eliminating procedure that identifies\none optimal basis of the primal LP, and; iii) a resolving procedure that is\nadaptive to the remaining resources and sticks to the characterized optimal\nbasis.\n","authors":["Jiashuo Jiang","Yinyu Ye"],"pdf_url":"https://arxiv.org/pdf/2402.16324v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.04795v2","updated":"2024-06-03T02:36:51Z","published":"2024-05-08T04:01:40Z","title":"Variational Schrödinger Diffusion Models","summary":"  Schr\\\"odinger bridge (SB) has emerged as the go-to method for optimizing\ntransportation plans in diffusion models. However, SB requires estimating the\nintractable forward score functions, inevitably resulting in the costly\nimplicit training loss based on simulated trajectories. To improve the\nscalability while preserving efficient transportation plans, we leverage\nvariational inference to linearize the forward score functions (variational\nscores) of SB and restore simulation-free properties in training backward\nscores. We propose the variational Schr\\\"odinger diffusion model (VSDM), where\nthe forward process is a multivariate diffusion and the variational scores are\nadaptively optimized for efficient transport. Theoretically, we use stochastic\napproximation to prove the convergence of the variational scores and show the\nconvergence of the adaptively generated samples based on the optimal\nvariational scores. Empirically, we test the algorithm in simulated examples\nand observe that VSDM is efficient in generations of anisotropic shapes and\nyields straighter sample trajectories compared to the single-variate diffusion.\nWe also verify the scalability of the algorithm in real-world data and achieve\ncompetitive unconditional generation performance in CIFAR10 and conditional\ngeneration in time series modeling. Notably, VSDM no longer depends on warm-up\ninitializations and has become tuning-friendly in training large-scale\nexperiments.\n","authors":["Wei Deng","Weijian Luo","Yixin Tan","Marin Biloš","Yu Chen","Yuriy Nevmyvaka","Ricky T. Q. Chen"],"pdf_url":"https://arxiv.org/pdf/2405.04795v2.pdf","comment":"ICML 2024"},{"id":"http://arxiv.org/abs/2402.01306v2","updated":"2024-06-03T02:36:09Z","published":"2024-02-02T10:53:36Z","title":"KTO: Model Alignment as Prospect Theoretic Optimization","summary":"  Kahneman & Tversky's $\\textit{prospect theory}$ tells us that humans perceive\nrandom variables in a biased but well-defined manner (1992); for example,\nhumans are famously loss-averse. We show that objectives for aligning LLMs with\nhuman feedback implicitly incorporate many of these biases -- the success of\nthese objectives (e.g., DPO) over cross-entropy minimization can partly be\nascribed to them belonging to a family of loss functions that we call\n$\\textit{human-aware losses}$ (HALOs). However, the utility functions these\nmethods attribute to humans still differ from those in the prospect theory\nliterature. Using a Kahneman-Tversky model of human utility, we propose a HALO\nthat directly maximizes the utility of generations instead of maximizing the\nlog-likelihood of preferences, as current methods do. We call this approach\nKTO, and it matches or exceeds the performance of preference-based methods at\nscales from 1B to 30B, despite only learning from a binary signal of whether an\noutput is desirable. More broadly, our work suggests that there is no one HALO\nthat is universally superior; the best loss depends on the inductive biases\nmost appropriate for a given setting, an oft-overlooked consideration.\n","authors":["Kawin Ethayarajh","Winnie Xu","Niklas Muennighoff","Dan Jurafsky","Douwe Kiela"],"pdf_url":"https://arxiv.org/pdf/2402.01306v2.pdf","comment":"ICML 2024"},{"id":"http://arxiv.org/abs/2302.14509v2","updated":"2024-06-03T02:18:44Z","published":"2023-02-28T11:58:39Z","title":"Policy Dispersion in Non-Markovian Environment","summary":"  Markov Decision Process (MDP) presents a mathematical framework to formulate\nthe learning processes of agents in reinforcement learning. MDP is limited by\nthe Markovian assumption that a reward only depends on the immediate state and\naction. However, a reward sometimes depends on the history of states and\nactions, which may result in the decision process in a non-Markovian\nenvironment. In such environments, agents receive rewards via\ntemporally-extended behaviors sparsely, and the learned policies may be\nsimilar. This leads the agents acquired with similar policies generally overfit\nto the given task and can not quickly adapt to perturbations of environments.\nTo resolve this problem, this paper tries to learn the diverse policies from\nthe history of state-action pairs under a non-Markovian environment, in which a\npolicy dispersion scheme is designed for seeking diverse policy representation.\nSpecifically, we first adopt a transformer-based method to learn policy\nembeddings. Then, we stack the policy embeddings to construct a dispersion\nmatrix to induce a set of diverse policies. Finally, we prove that if the\ndispersion matrix is positive definite, the dispersed embeddings can\neffectively enlarge the disagreements across policies, yielding a diverse\nexpression for the original policy embedding distribution. Experimental results\nshow that this dispersion scheme can obtain more expressive diverse policies,\nwhich then derive more robust performance than recent learning baselines under\nvarious learning environments.\n","authors":["Bohao Qu","Xiaofeng Cao","Jielong Yang","Hechang Chen","Chang Yi","Ivor W. Tsang","Yew-Soon Ong"],"pdf_url":"https://arxiv.org/pdf/2302.14509v2.pdf","comment":"In further research, we found that the core content of the paper\n  requires significant modification and that the entire paper needs to be\n  restructured. To enhance the scientific quality and contributions of the\n  paper, we have decided to resubmit it after completing the necessary\n  revisions and improvements"},{"id":"http://arxiv.org/abs/2404.02072v4","updated":"2024-06-03T02:15:03Z","published":"2024-04-02T16:20:02Z","title":"EGTR: Extracting Graph from Transformer for Scene Graph Generation","summary":"  Scene Graph Generation (SGG) is a challenging task of detecting objects and\npredicting relationships between objects. After DETR was developed, one-stage\nSGG models based on a one-stage object detector have been actively studied.\nHowever, complex modeling is used to predict the relationship between objects,\nand the inherent relationship between object queries learned in the multi-head\nself-attention of the object detector has been neglected. We propose a\nlightweight one-stage SGG model that extracts the relation graph from the\nvarious relationships learned in the multi-head self-attention layers of the\nDETR decoder. By fully utilizing the self-attention by-products, the relation\ngraph can be extracted effectively with a shallow relation extraction head.\nConsidering the dependency of the relation extraction task on the object\ndetection task, we propose a novel relation smoothing technique that adjusts\nthe relation label adaptively according to the quality of the detected objects.\nBy the relation smoothing, the model is trained according to the continuous\ncurriculum that focuses on object detection task at the beginning of training\nand performs multi-task learning as the object detection performance gradually\nimproves. Furthermore, we propose a connectivity prediction task that predicts\nwhether a relation exists between object pairs as an auxiliary task of the\nrelation extraction. We demonstrate the effectiveness and efficiency of our\nmethod for the Visual Genome and Open Image V6 datasets. Our code is publicly\navailable at https://github.com/naver-ai/egtr.\n","authors":["Jinbae Im","JeongYeon Nam","Nokyung Park","Hyungmin Lee","Seunghyun Park"],"pdf_url":"https://arxiv.org/pdf/2404.02072v4.pdf","comment":"CVPR 2024 (Best paper award candidate)"},{"id":"http://arxiv.org/abs/2401.06127v2","updated":"2024-06-03T02:09:38Z","published":"2024-01-11T18:59:14Z","title":"E$^{2}$GAN: Efficient Training of Efficient GANs for Image-to-Image\n  Translation","summary":"  One highly promising direction for enabling flexible real-time on-device\nimage editing is utilizing data distillation by leveraging large-scale\ntext-to-image diffusion models to generate paired datasets used for training\ngenerative adversarial networks (GANs). This approach notably alleviates the\nstringent requirements typically imposed by high-end commercial GPUs for\nperforming image editing with diffusion models. However, unlike text-to-image\ndiffusion models, each distilled GAN is specialized for a specific image\nediting task, necessitating costly training efforts to obtain models for\nvarious concepts. In this work, we introduce and address a novel research\ndirection: can the process of distilling GANs from diffusion models be made\nsignificantly more efficient? To achieve this goal, we propose a series of\ninnovative techniques. First, we construct a base GAN model with generalized\nfeatures, adaptable to different concepts through fine-tuning, eliminating the\nneed for training from scratch. Second, we identify crucial layers within the\nbase GAN model and employ Low-Rank Adaptation (LoRA) with a simple yet\neffective rank search process, rather than fine-tuning the entire base model.\nThird, we investigate the minimal amount of data necessary for fine-tuning,\nfurther reducing the overall training time. Extensive experiments show that we\ncan efficiently empower GANs with the ability to perform real-time high-quality\nimage editing on mobile devices with remarkably reduced training and storage\ncosts for each concept.\n","authors":["Yifan Gong","Zheng Zhan","Qing Jin","Yanyu Li","Yerlan Idelbayev","Xian Liu","Andrey Zharkov","Kfir Aberman","Sergey Tulyakov","Yanzhi Wang","Jian Ren"],"pdf_url":"https://arxiv.org/pdf/2401.06127v2.pdf","comment":"ICML 2024. Project Page: https://yifanfanfanfan.github.io/e2gan/"},{"id":"http://arxiv.org/abs/2405.20445v2","updated":"2024-06-03T02:08:54Z","published":"2024-05-30T19:43:29Z","title":"GraphAny: A Foundation Model for Node Classification on Any Graph","summary":"  Foundation models that can perform inference on any new task without\nrequiring specific training have revolutionized machine learning in vision and\nlanguage applications. However, applications involving graph-structured data\nremain a tough nut for foundation models, due to challenges in the unique\nfeature- and label spaces associated with each graph. Traditional graph ML\nmodels such as graph neural networks (GNNs) trained on graphs cannot perform\ninference on a new graph with feature and label spaces different from the\ntraining ones. Furthermore, existing models learn functions specific to the\ntraining graph and cannot generalize to new graphs. In this work, we tackle\nthese two challenges with a new foundational architecture for inductive node\nclassification named GraphAny. GraphAny models inference on a new graph as an\nanalytical solution to a LinearGNN, thereby solving the first challenge. To\nsolve the second challenge, we learn attention scores for each node to fuse the\npredictions of multiple LinearGNNs. Specifically, the attention module is\ncarefully parameterized as a function of the entropy-normalized\ndistance-features between multiple LinearGNNs predictions to ensure\ngeneralization to new graphs. Empirically, GraphAny trained on the Wisconsin\ndataset with only 120 labeled nodes can effectively generalize to 30 new graphs\nwith an average accuracy of 67.26\\% in an inductive manner, surpassing GCN and\nGAT trained in the supervised regime, as well as other inductive baselines.\n","authors":["Jianan Zhao","Hesham Mostafa","Mikhail Galkin","Michael Bronstein","Zhaocheng Zhu","Jian Tang"],"pdf_url":"https://arxiv.org/pdf/2405.20445v2.pdf","comment":"Preprint. Work in progress"},{"id":"http://arxiv.org/abs/2401.07463v2","updated":"2024-06-03T01:55:52Z","published":"2024-01-15T04:20:39Z","title":"Consistency of semi-supervised learning, stochastic tug-of-war games,\n  and the p-Laplacian","summary":"  In this paper we give a broad overview of the intersection of partial\ndifferential equations (PDEs) and graph-based semi-supervised learning. The\noverview is focused on a large body of recent work on PDE continuum limits of\ngraph-based learning, which have been used to prove well-posedness of\nsemi-supervised learning algorithms in the large data limit. We highlight some\ninteresting research directions revolving around consistency of graph-based\nsemi-supervised learning, and present some new results on the consistency of\n$p$-Laplacian semi-supervised learning using the stochastic tug-of-war game\ninterpretation of the $p$-Laplacian. We also present the results of some\nnumerical experiments that illustrate our results and suggest directions for\nfuture work.\n","authors":["Jeff Calder","Nadejda Drenska"],"pdf_url":"https://arxiv.org/pdf/2401.07463v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.04411v2","updated":"2024-06-03T01:40:46Z","published":"2024-02-06T21:14:45Z","title":"DFA-RAG: Conversational Semantic Router for Large Language Model with\n  Definite Finite Automaton","summary":"  This paper introduces the retrieval-augmented large language model with\nDefinite Finite Automaton (DFA-RAG), a novel framework designed to enhance the\ncapabilities of conversational agents using large language models (LLMs).\nTraditional LLMs face challenges in generating regulated and compliant\nresponses in special scenarios with predetermined response guidelines, like\nemotional support and customer service. Our framework addresses these\nchallenges by embedding a Definite Finite Automaton (DFA), learned from\ntraining dialogues, within the LLM. This structured approach acts as a semantic\nrouter which enables the LLM to adhere to a deterministic response pathway. The\nrouting is achieved by the retrieval-augmentation generation (RAG) strategy,\nwhich carefully selects dialogue examples aligned with the current\nconversational context. The advantages of DFA-RAG include an interpretable\nstructure through human-readable DFA, context-aware retrieval for responses in\nconversations, and plug-and-play compatibility with existing LLMs. Extensive\nbenchmarks validate DFA-RAG's effectiveness, indicating its potential as a\nvaluable contribution to the conversational agent.\n","authors":["Yiyou Sun","Junjie Hu","Wei Cheng","Haifeng Chen"],"pdf_url":"https://arxiv.org/pdf/2402.04411v2.pdf","comment":"Accepted to ICML 2024"},{"id":"http://arxiv.org/abs/2211.13316v3","updated":"2024-06-03T01:24:38Z","published":"2022-11-23T21:34:35Z","title":"Understanding Sample Generation Strategies for Learning Heuristic\n  Functions in Classical Planning","summary":"  We study the problem of learning good heuristic functions for classical\nplanning tasks with neural networks based on samples represented by states with\ntheir cost-to-goal estimates. The heuristic function is learned for a state\nspace and goal condition with the number of samples limited to a fraction of\nthe size of the state space, and must generalize well for all states of the\nstate space with the same goal condition. Our main goal is to better understand\nthe influence of sample generation strategies on the performance of a greedy\nbest-first heuristic search (GBFS) guided by a learned heuristic function. In a\nset of controlled experiments, we find that two main factors determine the\nquality of the learned heuristic: the algorithm used to generate the sample set\nand how close the sample estimates to the perfect cost-to-goal are. These two\nfactors are dependent: having perfect cost-to-goal estimates is insufficient if\nthe samples are not well distributed across the state space. We also study\nother effects, such as adding samples with high-value estimates. Based on our\nfindings, we propose practical strategies to improve the quality of learned\nheuristics: three strategies that aim to generate more representative states\nand two strategies that improve the cost-to-goal estimates. Our practical\nstrategies result in a learned heuristic that, when guiding a GBFS algorithm,\nincreases by more than 30% the mean coverage compared to a baseline learned\nheuristic.\n","authors":["R. V. Bettker","P. P. Minini","A. G. Pereira","M. Ritt"],"pdf_url":"https://arxiv.org/pdf/2211.13316v3.pdf","comment":"29 pages"},{"id":"http://arxiv.org/abs/2404.18239v3","updated":"2024-06-03T01:10:53Z","published":"2024-04-28T16:31:32Z","title":"SOUL: Unlocking the Power of Second-Order Optimization for LLM\n  Unlearning","summary":"  Large Language Models (LLMs) have highlighted the necessity of effective\nunlearning mechanisms to comply with data regulations and ethical AI practices.\nLLM unlearning aims at removing undesired data influences and associated model\ncapabilities without compromising utility out of the scope of unlearning. While\ninterest in studying LLM unlearning is growing,the impact of the optimizer\nchoice for LLM unlearning remains under-explored. In this work, we shed light\non the significance of optimizer selection in LLM unlearning for the first\ntime, establishing a clear connection between {second-order optimization} and\ninfluence unlearning (a classical approach using influence functions to update\nthe model for data influence removal). This insight propels us to develop a\nsecond-order unlearning framework, termed SOUL, built upon the second-order\nclipped stochastic optimization (Sophia)-based LLM training method. SOUL\nextends the static, one-shot model update using influence unlearning to a\ndynamic, iterative unlearning process. Our extensive experiments show that SOUL\nconsistently outperforms conventional first-order methods across various\nunlearning tasks, models, and metrics, suggesting the promise of second-order\noptimization in providing a scalable and easily implementable solution for LLM\nunlearning.\n","authors":["Jinghan Jia","Yihua Zhang","Yimeng Zhang","Jiancheng Liu","Bharat Runwal","James Diffenderfer","Bhavya Kailkhura","Sijia Liu"],"pdf_url":"https://arxiv.org/pdf/2404.18239v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2212.06470v2","updated":"2024-06-03T01:03:49Z","published":"2022-12-13T10:41:12Z","title":"Position: Considerations for Differentially Private Learning with\n  Large-Scale Public Pretraining","summary":"  The performance of differentially private machine learning can be boosted\nsignificantly by leveraging the transfer learning capabilities of non-private\nmodels pretrained on large public datasets. We critically review this approach.\n  We primarily question whether the use of large Web-scraped datasets should be\nviewed as differential-privacy-preserving. We caution that publicizing these\nmodels pretrained on Web data as \"private\" could lead to harm and erode the\npublic's trust in differential privacy as a meaningful definition of privacy.\n  Beyond the privacy considerations of using public data, we further question\nthe utility of this paradigm. We scrutinize whether existing machine learning\nbenchmarks are appropriate for measuring the ability of pretrained models to\ngeneralize to sensitive domains, which may be poorly represented in public Web\ndata. Finally, we notice that pretraining has been especially impactful for the\nlargest available models -- models sufficiently large to prohibit end users\nrunning them on their own devices. Thus, deploying such models today could be a\nnet loss for privacy, as it would require (private) data to be outsourced to a\nmore compute-powerful third party.\n  We conclude by discussing potential paths forward for the field of private\nlearning, as public pretraining becomes more popular and powerful.\n","authors":["Florian Tramèr","Gautam Kamath","Nicholas Carlini"],"pdf_url":"https://arxiv.org/pdf/2212.06470v2.pdf","comment":"ICML 2024"},{"id":"http://arxiv.org/abs/2401.15800v2","updated":"2024-06-03T00:49:43Z","published":"2024-01-28T23:14:51Z","title":"Provably Stable Feature Rankings with SHAP and LIME","summary":"  Feature attributions are ubiquitous tools for understanding the predictions\nof machine learning models. However, the calculation of popular methods for\nscoring input variables such as SHAP and LIME suffers from high instability due\nto random sampling. Leveraging ideas from multiple hypothesis testing, we\ndevise attribution methods that ensure the most important features are ranked\ncorrectly with high probability. Given SHAP estimates from KernelSHAP or\nShapley Sampling, we demonstrate how to retrospectively verify the number of\nstable rankings. Further, we introduce efficient sampling algorithms for SHAP\nand LIME that guarantee the $K$ highest-ranked features have the proper\nordering. Finally, we show how to adapt these local feature attribution methods\nfor the global importance setting.\n","authors":["Jeremy Goldwasser","Giles Hooker"],"pdf_url":"https://arxiv.org/pdf/2401.15800v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.05100v2","updated":"2024-06-03T00:32:53Z","published":"2023-06-08T10:58:46Z","title":"Communication-Efficient Gradient Descent-Accent Methods for Distributed\n  Variational Inequalities: Unified Analysis and Local Updates","summary":"  Distributed and federated learning algorithms and techniques associated\nprimarily with minimization problems. However, with the increase of minimax\noptimization and variational inequality problems in machine learning, the\nnecessity of designing efficient distributed/federated learning approaches for\nthese problems is becoming more apparent. In this paper, we provide a unified\nconvergence analysis of communication-efficient local training methods for\ndistributed variational inequality problems (VIPs). Our approach is based on a\ngeneral key assumption on the stochastic estimates that allows us to propose\nand analyze several novel local training algorithms under a single framework\nfor solving a class of structured non-monotone VIPs. We present the first local\ngradient descent-accent algorithms with provable improved communication\ncomplexity for solving distributed variational inequalities on heterogeneous\ndata. The general algorithmic framework recovers state-of-the-art algorithms\nand their sharp convergence guarantees when the setting is specialized to\nminimization or minimax optimization problems. Finally, we demonstrate the\nstrong performance of the proposed algorithms compared to state-of-the-art\nmethods when solving federated minimax optimization problems.\n","authors":["Siqi Zhang","Sayantan Choudhury","Sebastian U Stich","Nicolas Loizou"],"pdf_url":"https://arxiv.org/pdf/2306.05100v2.pdf","comment":"ICLR 2024"},{"id":"http://arxiv.org/abs/2405.18749v2","updated":"2024-06-03T00:17:05Z","published":"2024-05-29T04:22:18Z","title":"A SARS-CoV-2 Interaction Dataset and VHH Sequence Corpus for Antibody\n  Language Models","summary":"  Antibodies are crucial proteins produced by the immune system to eliminate\nharmful foreign substances and have become pivotal therapeutic agents for\ntreating human diseases. To accelerate the discovery of antibody therapeutics,\nthere is growing interest in constructing language models using antibody\nsequences. However, the applicability of pre-trained language models for\nantibody discovery has not been thoroughly evaluated due to the scarcity of\nlabeled datasets. To overcome these limitations, we introduce AVIDa-SARS-CoV-2,\na dataset featuring the antigen-variable domain of heavy chain of heavy chain\nantibody (VHH) interactions obtained from two alpacas immunized with severe\nacute respiratory syndrome coronavirus 2 (SARS-CoV-2) spike proteins.\nAVIDa-SARS-CoV-2 includes binary labels indicating the binding or non-binding\nof diverse VHH sequences to 12 SARS-CoV-2 mutants, such as the Delta and\nOmicron variants. Furthermore, we release VHHCorpus-2M, a pre-training dataset\nfor antibody language models, containing over two million VHH sequences. We\nreport benchmark results for predicting SARS-CoV-2-VHH binding using VHHBERT\npre-trained on VHHCorpus-2M and existing general protein and antibody-specific\npre-trained language models. These results confirm that AVIDa-SARS-CoV-2\nprovides valuable benchmarks for evaluating the representation capabilities of\nantibody language models for binding prediction, thereby facilitating the\ndevelopment of AI-driven antibody discovery. The datasets are available at\nhttps://datasets.cognanous.com.\n","authors":["Hirofumi Tsuruta","Hiroyuki Yamazaki","Ryota Maeda","Ryotaro Tamura","Akihiro Imura"],"pdf_url":"https://arxiv.org/pdf/2405.18749v2.pdf","comment":null}],"Multimedia":[{"id":"http://arxiv.org/abs/2312.14867v2","updated":"2024-06-03T16:59:20Z","published":"2023-12-22T17:45:19Z","title":"VIEScore: Towards Explainable Metrics for Conditional Image Synthesis\n  Evaluation","summary":"  In the rapidly advancing field of conditional image generation research,\nchallenges such as limited explainability lie in effectively evaluating the\nperformance and capabilities of various models. This paper introduces VIEScore,\na Visual Instruction-guided Explainable metric for evaluating any conditional\nimage generation tasks. VIEScore leverages general knowledge from Multimodal\nLarge Language Models (MLLMs) as the backbone and does not require training or\nfine-tuning. We evaluate VIEScore on seven prominent tasks in conditional image\ntasks and found: (1) VIEScore (GPT4-o) achieves a high Spearman correlation of\n0.4 with human evaluations, while the human-to-human correlation is 0.45. (2)\nVIEScore (with open-source MLLM) is significantly weaker than GPT-4o and GPT-4v\nin evaluating synthetic images. (3) VIEScore achieves a correlation on par with\nhuman ratings in the generation tasks but struggles in editing tasks. With\nthese results, we believe VIEScore shows its great potential to replace human\njudges in evaluating image synthesis tasks.\n","authors":["Max Ku","Dongfu Jiang","Cong Wei","Xiang Yue","Wenhu Chen"],"pdf_url":"https://arxiv.org/pdf/2312.14867v2.pdf","comment":"Accepted to ACL2024 main"},{"id":"http://arxiv.org/abs/2401.01163v3","updated":"2024-06-03T16:09:55Z","published":"2024-01-02T11:46:42Z","title":"NU-Class Net: A Novel Approach for Video Quality Enhancement","summary":"  Video content has experienced a surge in popularity, asserting its dominance\nover internet traffic and Internet of Things (IoT) networks. Video compression\nhas long been regarded as the primary means of efficiently managing the\nsubstantial multimedia traffic generated by video-capturing devices.\nNevertheless, video compression algorithms entail significant computational\ndemands in order to achieve substantial compression ratios. This complexity\npresents a formidable challenge when implementing efficient video coding\nstandards in resource-constrained embedded systems, such as IoT edge node\ncameras. To tackle this challenge, this paper introduces NU-Class Net, an\ninnovative deep-learning model designed to mitigate compression artifacts\nstemming from lossy compression codecs. This enhancement significantly elevates\nthe perceptible quality of low-bit-rate videos. By employing the NU-Class Net,\nthe video encoder within the video-capturing node can reduce output quality,\nthereby generating low-bit-rate videos and effectively curtailing both\ncomputation and bandwidth requirements at the edge. On the decoder side, which\nis typically less encumbered by resource limitations, NU-Class Net is applied\nafter the video decoder to compensate for artifacts and approximate the quality\nof the original video. Experimental results affirm the efficacy of the proposed\nmodel in enhancing the perceptible quality of videos, especially those streamed\nat low bit rates.\n","authors":["Parham Zilouchian Moghaddam","Mehdi Modarressi","Mohammad Amin Sadeghi"],"pdf_url":"https://arxiv.org/pdf/2401.01163v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.19802v2","updated":"2024-06-03T12:55:12Z","published":"2024-05-30T08:12:08Z","title":"Exploring the Robustness of Decision-Level Through Adversarial Attacks\n  on LLM-Based Embodied Models","summary":"  Embodied intelligence empowers agents with a profound sense of perception,\nenabling them to respond in a manner closely aligned with real-world\nsituations. Large Language Models (LLMs) delve into language instructions with\ndepth, serving a crucial role in generating plans for intricate tasks. Thus,\nLLM-based embodied models further enhance the agent's capacity to comprehend\nand process information. However, this amalgamation also ushers in new\nchallenges in the pursuit of heightened intelligence. Specifically, attackers\ncan manipulate LLMs to produce irrelevant or even malicious outputs by altering\ntheir prompts. Confronted with this challenge, we observe a notable absence of\nmulti-modal datasets essential for comprehensively evaluating the robustness of\nLLM-based embodied models. Consequently, we construct the Embodied Intelligent\nRobot Attack Dataset (EIRAD), tailored specifically for robustness evaluation.\nAdditionally, two attack strategies are devised, including untargeted attacks\nand targeted attacks, to effectively simulate a range of diverse attack\nscenarios. At the same time, during the attack process, to more accurately\nascertain whether our method is successful in attacking the LLM-based embodied\nmodel, we devise a new attack success evaluation method utilizing the BLIP2\nmodel. Recognizing the time and cost-intensive nature of the GCG algorithm in\nattacks, we devise a scheme for prompt suffix initialization based on various\ntarget tasks, thus expediting the convergence process. Experimental results\ndemonstrate that our method exhibits a superior attack success rate when\ntargeting LLM-based embodied models, indicating a lower level of decision-level\nrobustness in these models.\n","authors":["Shuyuan Liu","Jiawei Chen","Shouwei Ruan","Hang Su","Zhaoxia Yin"],"pdf_url":"https://arxiv.org/pdf/2405.19802v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.08389v2","updated":"2024-06-03T07:47:36Z","published":"2023-05-15T07:12:19Z","title":"Edit As You Wish: Video Caption Editing with Multi-grained User Control","summary":"  Automatically narrating videos in natural language complying with user\nrequests, i.e. Controllable Video Captioning task, can help people manage\nmassive videos with desired intentions. However, existing works suffer from two\nshortcomings: 1) the control signal is single-grained which can not satisfy\ndiverse user intentions; 2) the video description is generated in a single\nround which can not be further edited to meet dynamic needs. In this paper, we\npropose a novel \\textbf{V}ideo \\textbf{C}aption \\textbf{E}diting \\textbf{(VCE)}\ntask to automatically revise an existing video description guided by\nmulti-grained user requests. Inspired by human writing-revision habits, we\ndesign the user command as a pivotal triplet \\{\\textit{operation, position,\nattribute}\\} to cover diverse user needs from coarse-grained to fine-grained.\nTo facilitate the VCE task, we \\textit{automatically} construct an open-domain\nbenchmark dataset named VATEX-EDIT and \\textit{manually} collect an e-commerce\ndataset called EMMAD-EDIT. We further propose a specialized small-scale model\n(i.e., OPA) compared with two generalist Large Multi-modal Models to perform an\nexhaustive analysis of the novel task. For evaluation, we adopt comprehensive\nmetrics considering caption fluency, command-caption consistency, and\nvideo-caption alignment. Experiments reveal the task challenges of fine-grained\nmulti-modal semantics understanding and processing. Our datasets, codes, and\nevaluation tools are ready to be open-sourced.\n","authors":["Linli Yao","Yuanmeng Zhang","Ziheng Wang","Xinglin Hou","Tiezheng Ge","Yuning Jiang","Xu Sun","Qin Jin"],"pdf_url":"https://arxiv.org/pdf/2305.08389v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.18063v2","updated":"2024-06-03T18:22:30Z","published":"2024-03-26T19:29:21Z","title":"Heracles: A Hybrid SSM-Transformer Model for High-Resolution Image and\n  Time-Series Analysis","summary":"  Transformers have revolutionized image modeling tasks with adaptations like\nDeIT, Swin, SVT, Biformer, STVit, and FDVIT. However, these models often face\nchallenges with inductive bias and high quadratic complexity, making them less\nefficient for high-resolution images. State space models (SSMs) such as Mamba,\nV-Mamba, ViM, and SiMBA offer an alternative to handle high resolution images\nin computer vision tasks. These SSMs encounter two major issues. First, they\nbecome unstable when scaled to large network sizes. Second, although they\nefficiently capture global information in images, they inherently struggle with\nhandling local information. To address these challenges, we introduce Heracles,\na novel SSM that integrates a local SSM, a global SSM, and an attention-based\ntoken interaction module. Heracles leverages a Hartely kernel-based state space\nmodel for global image information, a localized convolutional network for local\ndetails, and attention mechanisms in deeper layers for token interactions. Our\nextensive experiments demonstrate that Heracles-C-small achieves\nstate-of-the-art performance on the ImageNet dataset with 84.5\\% top-1\naccuracy. Heracles-C-Large and Heracles-C-Huge further improve accuracy to\n85.9\\% and 86.4\\%, respectively. Additionally, Heracles excels in transfer\nlearning tasks on datasets such as CIFAR-10, CIFAR-100, Oxford Flowers, and\nStanford Cars, and in instance segmentation on the MSCOCO dataset. Heracles\nalso proves its versatility by achieving state-of-the-art results on seven\ntime-series datasets, showcasing its ability to generalize across domains with\nspectral data, capturing both local and global information. The project page is\navailable at this link.\\url{https://github.com/badripatro/heracles}\n","authors":["Badri N. Patro","Suhas Ranganath","Vinay P. Namboodiri","Vijay S. Agneeswaran"],"pdf_url":"https://arxiv.org/pdf/2403.18063v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.01451v1","updated":"2024-06-03T15:42:30Z","published":"2024-06-03T15:42:30Z","title":"SAM as the Guide: Mastering Pseudo-Label Refinement in Semi-Supervised\n  Referring Expression Segmentation","summary":"  In this paper, we introduce SemiRES, a semi-supervised framework that\neffectively leverages a combination of labeled and unlabeled data to perform\nRES. A significant hurdle in applying semi-supervised techniques to RES is the\nprevalence of noisy pseudo-labels, particularly at the boundaries of objects.\nSemiRES incorporates the Segment Anything Model (SAM), renowned for its precise\nboundary demarcation, to improve the accuracy of these pseudo-labels. Within\nSemiRES, we offer two alternative matching strategies: IoU-based Optimal\nMatching (IOM) and Composite Parts Integration (CPI). These strategies are\ndesigned to extract the most accurate masks from SAM's output, thus guiding the\ntraining of the student model with enhanced precision. In instances where a\nprecise mask cannot be matched from the available candidates, we develop the\nPixel-Wise Adjustment (PWA) strategy, guiding the student model's training\ndirectly by the pseudo-labels. Extensive experiments on three RES\nbenchmarks--RefCOCO, RefCOCO+, and G-Ref reveal its superior performance\ncompared to fully supervised methods. Remarkably, with only 1% labeled data,\nour SemiRES outperforms the supervised baseline by a large margin, e.g. +18.64%\ngains on RefCOCO val set. The project code is available at\n\\url{https://github.com/nini0919/SemiRES}.\n","authors":["Danni Yang","Jiayi Ji","Yiwei Ma","Tianyu Guo","Haowei Wang","Xiaoshuai Sun","Rongrong Ji"],"pdf_url":"https://arxiv.org/pdf/2406.01451v1.pdf","comment":"Accepted by ICML2024"},{"id":"http://arxiv.org/abs/2406.01321v1","updated":"2024-06-03T13:42:10Z","published":"2024-06-03T13:42:10Z","title":"Sequence-to-Sequence Multi-Modal Speech In-Painting","summary":"  Speech in-painting is the task of regenerating missing audio contents using\nreliable context information. Despite various recent studies in multi-modal\nperception of audio in-painting, there is still a need for an effective\ninfusion of visual and auditory information in speech in-painting. In this\npaper, we introduce a novel sequence-to-sequence model that leverages the\nvisual information to in-paint audio signals via an encoder-decoder\narchitecture. The encoder plays the role of a lip-reader for facial recordings\nand the decoder takes both encoder outputs as well as the distorted audio\nspectrograms to restore the original speech. Our model outperforms an\naudio-only speech in-painting model and has comparable results with a recent\nmulti-modal speech in-painter in terms of speech quality and intelligibility\nmetrics for distortions of 300 ms to 1500 ms duration, which proves the\neffectiveness of the introduced multi-modality in speech in-painting.\n","authors":["Mahsa Kadkhodaei Elyaderani","Shahram Shirani"],"pdf_url":"https://arxiv.org/pdf/2406.01321v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.01280v1","updated":"2024-06-03T12:48:38Z","published":"2024-06-03T12:48:38Z","title":"Demo: Soccer Information Retrieval via Natural Queries using SoccerRAG","summary":"  The rapid evolution of digital sports media necessitates sophisticated\ninformation retrieval systems that can efficiently parse extensive multimodal\ndatasets. This paper demonstrates SoccerRAG, an innovative framework designed\nto harness the power of Retrieval Augmented Generation (RAG) and Large Language\nModels (LLMs) to extract soccer-related information through natural language\nqueries. By leveraging a multimodal dataset, SoccerRAG supports dynamic\nquerying and automatic data validation, enhancing user interaction and\naccessibility to sports archives. We present a novel interactive user interface\n(UI) based on the Chainlit framework which wraps around the core functionality,\nand enable users to interact with the SoccerRAG framework in a chatbot-like\nvisual manner.\n","authors":["Aleksander Theo Strand","Sushant Gautam","Cise Midoglu","Pål Halvorsen"],"pdf_url":"https://arxiv.org/pdf/2406.01280v1.pdf","comment":"accepted to CBMI 2024 as a demonstration;\n  https://github.com/simula/soccer-rag"},{"id":"http://arxiv.org/abs/2406.01273v1","updated":"2024-06-03T12:39:04Z","published":"2024-06-03T12:39:04Z","title":"SoccerRAG: Multimodal Soccer Information Retrieval via Natural Queries","summary":"  The rapid evolution of digital sports media necessitates sophisticated\ninformation retrieval systems that can efficiently parse extensive multimodal\ndatasets. This paper introduces SoccerRAG, an innovative framework designed to\nharness the power of Retrieval Augmented Generation (RAG) and Large Language\nModels (LLMs) to extract soccer-related information through natural language\nqueries. By leveraging a multimodal dataset, SoccerRAG supports dynamic\nquerying and automatic data validation, enhancing user interaction and\naccessibility to sports archives. Our evaluations indicate that SoccerRAG\neffectively handles complex queries, offering significant improvements over\ntraditional retrieval systems in terms of accuracy and user engagement. The\nresults underscore the potential of using RAG and LLMs in sports analytics,\npaving the way for future advancements in the accessibility and real-time\nprocessing of sports data.\n","authors":["Aleksander Theo Strand","Sushant Gautam","Cise Midoglu","Pål Halvorsen"],"pdf_url":"https://arxiv.org/pdf/2406.01273v1.pdf","comment":"accepted to CBMI 2024 as a regular paper;\n  https://github.com/simula/soccer-rag"},{"id":"http://arxiv.org/abs/2406.01033v1","updated":"2024-06-03T06:35:11Z","published":"2024-06-03T06:35:11Z","title":"Generalized Jersey Number Recognition Using Multi-task Learning With\n  Orientation-guided Weight Refinement","summary":"  Jersey number recognition (JNR) has always been an important task in sports\nanalytics. Improving recognition accuracy remains an ongoing challenge because\nimages are subject to blurring, occlusion, deformity, and low resolution.\nRecent research has addressed these problems using number localization and\noptical character recognition. Some approaches apply player identification\nschemes to image sequences, ignoring the impact of human body rotation angles\non jersey digit identification. Accurately predicting the number of jersey\ndigits by using a multi-task scheme to recognize each individual digit enables\nmore robust results. Based on the above considerations, this paper proposes a\nmulti-task learning method called the angle-digit refine scheme (ADRS), which\ncombines human body orientation angles and digit number clues to recognize\nathletic jersey numbers. Based on our experimental results, our approach\nincreases inference information, significantly improving prediction accuracy.\nCompared to state-of-the-art methods, which can only handle a single type of\nsport, the proposed method produces a more diverse and practical JNR\napplication. The incorporation of diverse types of team sports such as soccer,\nfootball, basketball, volleyball, and baseball into our dataset contributes\ngreatly to generalized JNR in sports analytics. Our accuracy achieves 64.07% on\nTop-1 and 89.97% on Top-2, with corresponding F1 scores of 67.46% and 90.64%,\nrespectively.\n","authors":["Yung-Hui Lin","Yu-Wen Chang","Huang-Chia Shih","Takahiro Ogawa"],"pdf_url":"https://arxiv.org/pdf/2406.01033v1.pdf","comment":"10 pages, 6 figures, 5 tables"},{"id":"http://arxiv.org/abs/2406.00919v1","updated":"2024-06-03T01:09:15Z","published":"2024-06-03T01:09:15Z","title":"Advancing Weakly-Supervised Audio-Visual Video Parsing via Segment-wise\n  Pseudo Labeling","summary":"  The Audio-Visual Video Parsing task aims to identify and temporally localize\nthe events that occur in either or both the audio and visual streams of audible\nvideos. It often performs in a weakly-supervised manner, where only video event\nlabels are provided, \\ie, the modalities and the timestamps of the labels are\nunknown. Due to the lack of densely annotated labels, recent work attempts to\nleverage pseudo labels to enrich the supervision. A commonly used strategy is\nto generate pseudo labels by categorizing the known video event labels for each\nmodality. However, the labels are still confined to the video level, and the\ntemporal boundaries of events remain unlabeled. In this paper, we propose a new\npseudo label generation strategy that can explicitly assign labels to each\nvideo segment by utilizing prior knowledge learned from the open world.\nSpecifically, we exploit the large-scale pretrained models, namely CLIP and\nCLAP, to estimate the events in each video segment and generate segment-level\nvisual and audio pseudo labels, respectively. We then propose a new loss\nfunction to exploit these pseudo labels by taking into account their\ncategory-richness and segment-richness. A label denoising strategy is also\nadopted to further improve the visual pseudo labels by flipping them whenever\nabnormally large forward losses occur. We perform extensive experiments on the\nLLP dataset and demonstrate the effectiveness of each proposed design and we\nachieve state-of-the-art video parsing performance on all types of event\nparsing, \\ie, audio event, visual event, and audio-visual event. We also\nexamine the proposed pseudo label generation strategy on a relevant\nweakly-supervised audio-visual event localization task and the experimental\nresults again verify the benefits and generalization of our method.\n","authors":["Jinxing Zhou","Dan Guo","Yiran Zhong","Meng Wang"],"pdf_url":"https://arxiv.org/pdf/2406.00919v1.pdf","comment":"IJCV 2024 Accepted. arXiv admin note: substantial text overlap with\n  arXiv:2303.02344"}]},"2024-06-02T00:00:00Z":{"Computation and Language":[{"id":"http://arxiv.org/abs/2405.18672v2","updated":"2024-06-02T23:30:46Z","published":"2024-05-29T00:36:56Z","title":"LLM-based Hierarchical Concept Decomposition for Interpretable\n  Fine-Grained Image Classification","summary":"  (Renyi Qu's Master's Thesis) Recent advancements in interpretable models for\nvision-language tasks have achieved competitive performance; however, their\ninterpretability often suffers due to the reliance on unstructured text outputs\nfrom large language models (LLMs). This introduces randomness and compromises\nboth transparency and reliability, which are essential for addressing safety\nissues in AI systems. We introduce \\texttt{Hi-CoDe} (Hierarchical Concept\nDecomposition), a novel framework designed to enhance model interpretability\nthrough structured concept analysis. Our approach consists of two main\ncomponents: (1) We use GPT-4 to decompose an input image into a structured\nhierarchy of visual concepts, thereby forming a visual concept tree. (2) We\nthen employ an ensemble of simple linear classifiers that operate on\nconcept-specific features derived from CLIP to perform classification. Our\napproach not only aligns with the performance of state-of-the-art models but\nalso advances transparency by providing clear insights into the decision-making\nprocess and highlighting the importance of various concepts. This allows for a\ndetailed analysis of potential failure modes and improves model compactness,\ntherefore setting a new benchmark in interpretability without compromising the\naccuracy.\n","authors":["Renyi Qu","Mark Yatskar"],"pdf_url":"https://arxiv.org/pdf/2405.18672v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.13673v3","updated":"2024-06-02T23:16:50Z","published":"2023-05-23T04:28:16Z","title":"Physics of Language Models: Part 1, Learning Hierarchical Language\n  Structures","summary":"  Transformer-based language models are effective but complex, and\nunderstanding their inner workings is a significant challenge. Previous\nresearch has primarily explored how these models handle simple tasks like name\ncopying or selection, and we extend this by investigating how these models\ngrasp complex, recursive language structures defined by context-free grammars\n(CFGs). We introduce a family of synthetic CFGs that produce hierarchical\nrules, capable of generating lengthy sentences (e.g., hundreds of tokens) that\nare locally ambiguous and require dynamic programming to parse. Despite this\ncomplexity, we demonstrate that generative models like GPT can accurately learn\nthis CFG language and generate sentences based on it. We explore the model's\ninternals, revealing that its hidden states precisely capture the structure of\nCFGs, and its attention patterns resemble the information passing in a dynamic\nprogramming algorithm.\n  This paper also presents several corollaries, including showing why\npositional embedding is inferior to relative attention or rotary embedding;\ndemonstrating that encoder-based models (e.g., BERT, deBERTa) cannot learn very\ndeeply nested CFGs as effectively as generative models (e.g., GPT); and\nhighlighting the necessity of adding structural and syntactic errors to the\npretraining data to make the model more robust to corrupted language prefixes.\n","authors":["Zeyuan Allen-Zhu","Yuanzhi Li"],"pdf_url":"https://arxiv.org/pdf/2305.13673v3.pdf","comment":"V2+V3 polishes writing; V3 includes Figures 6 and 10 for better\n  illustrations of our results"},{"id":"http://arxiv.org/abs/2404.14454v2","updated":"2024-06-02T22:42:00Z","published":"2024-04-21T09:20:16Z","title":"Reinforcement of Explainability of ChatGPT Prompts by Embedding Breast\n  Cancer Self-Screening Rules into AI Responses","summary":"  Addressing the global challenge of breast cancer, this research explores the\nfusion of generative AI, focusing on ChatGPT 3.5 turbo model, and the\nintricacies of breast cancer risk assessment. The research aims to evaluate\nChatGPT's reasoning capabilities, emphasizing its potential to process rules\nand provide explanations for screening recommendations. The study seeks to\nbridge the technology gap between intelligent machines and clinicians by\ndemonstrating ChatGPT's unique proficiency in natural language reasoning. The\nmethodology employs a supervised prompt-engineering approach to enforce\ndetailed explanations for ChatGPT's recommendations. Synthetic use cases,\ngenerated algorithmically, serve as the testing ground for the encoded rules,\nevaluating the model's processing prowess. Findings highlight ChatGPT's\npromising capacity in processing rules comparable to Expert System Shells, with\na focus on natural language reasoning. The research introduces the concept of\nreinforcement explainability, showcasing its potential in elucidating outcomes\nand facilitating user-friendly interfaces for breast cancer risk assessment.\n","authors":["Yousef Khan","Ahmed Abdeen Hamed"],"pdf_url":"https://arxiv.org/pdf/2404.14454v2.pdf","comment":"9 pages, 5 figures, 3 algorithms, 1 table, submitted to the IEEE\n  MedAI'24 Conference"},{"id":"http://arxiv.org/abs/2401.11356v3","updated":"2024-06-02T21:05:36Z","published":"2024-01-21T00:58:31Z","title":"ProLex: A Benchmark for Language Proficiency-oriented Lexical\n  Substitution","summary":"  Lexical Substitution discovers appropriate substitutes for a given target\nword in a context sentence. However, the task fails to consider substitutes\nthat are of equal or higher proficiency than the target, an aspect that could\nbe beneficial for language learners looking to improve their writing. To bridge\nthis gap, we propose a new task, language proficiency-oriented lexical\nsubstitution. We also introduce ProLex, a novel benchmark designed to assess\nsystems' ability to generate not only appropriate substitutes but also\nsubstitutes that demonstrate better language proficiency. Besides the\nbenchmark, we propose models that can automatically perform the new task. We\nshow that our best model, a Llama2-13B model fine-tuned with task-specific\nsynthetic data, outperforms ChatGPT by an average of 3.2% in F-score and\nachieves comparable results with GPT-4 on ProLex.\n","authors":["Xuanming Zhang","Zixun Chen","Zhou Yu"],"pdf_url":"https://arxiv.org/pdf/2401.11356v3.pdf","comment":"In ACL 2024 Findings, 19 pages, 4 figures, 14 tables"},{"id":"http://arxiv.org/abs/2306.12916v3","updated":"2024-06-02T20:38:10Z","published":"2023-06-22T14:31:18Z","title":"Cross-lingual Cross-temporal Summarization: Dataset, Models, Evaluation","summary":"  While summarization has been extensively researched in natural language\nprocessing (NLP), cross-lingual cross-temporal summarization (CLCTS) is a\nlargely unexplored area that has the potential to improve cross-cultural\naccessibility and understanding. This paper comprehensively addresses the CLCTS\ntask, including dataset creation, modeling, and evaluation. We (1) build the\nfirst CLCTS corpus with 328 instances for hDe-En (extended version with 455\ninstances) and 289 for hEn-De (extended version with 501 instances), leveraging\nhistorical fiction texts and Wikipedia summaries in English and German; (2)\nexamine the effectiveness of popular transformer end-to-end models with\ndifferent intermediate finetuning tasks; (3) explore the potential of GPT-3.5\nas a summarizer; (4) report evaluations from humans, GPT-4, and several recent\nautomatic evaluation metrics. Our results indicate that intermediate task\nfinetuned end-to-end models generate bad to moderate quality summaries while\nGPT-3.5, as a zero-shot summarizer, provides moderate to good quality outputs.\nGPT-3.5 also seems very adept at normalizing historical text. To assess data\ncontamination in GPT-3.5, we design an adversarial attack scheme in which we\nfind that GPT-3.5 performs slightly worse for unseen source documents compared\nto seen documents. Moreover, it sometimes hallucinates when the source\nsentences are inverted against its prior knowledge with a summarization\naccuracy of 0.67 for plot omission, 0.71 for entity swap, and 0.53 for plot\nnegation. Overall, our regression results of model performances suggest that\nlonger, older, and more complex source texts (all of which are more\ncharacteristic for historical language variants) are harder to summarize for\nall models, indicating the difficulty of the CLCTS task.\n","authors":["Ran Zhang","Jihed Ouni","Steffen Eger"],"pdf_url":"https://arxiv.org/pdf/2306.12916v3.pdf","comment":"Computational Linguistics. Submitted manuscript.\n  https://direct.mit.edu/coli/article/doi/10.1162/coli_a_00519/121095/Cross-lingual-Cross-temporal-Summarization-Dataset"},{"id":"http://arxiv.org/abs/2404.18400v2","updated":"2024-06-02T20:17:59Z","published":"2024-04-29T03:30:06Z","title":"LLM-SR: Scientific Equation Discovery via Programming with Large\n  Language Models","summary":"  Mathematical equations have been unreasonably effective in describing complex\nnatural phenomena across various scientific disciplines. However, discovering\nsuch insightful equations from data presents significant challenges due to the\nnecessity of navigating extremely high-dimensional combinatorial and nonlinear\nhypothesis spaces. Traditional methods of equation discovery, commonly known as\nsymbolic regression, largely focus on extracting equations from data alone,\noften neglecting the rich domain-specific prior knowledge that scientists\ntypically depend on. To bridge this gap, we introduce LLM-SR, a novel approach\nthat leverages the extensive scientific knowledge and robust code generation\ncapabilities of Large Language Models (LLMs) to discover scientific equations\nfrom data in an efficient manner. Specifically, LLM-SR treats equations as\nprograms with mathematical operators and combines LLMs' scientific priors with\nevolutionary search over equation programs. The LLM iteratively proposes new\nequation skeleton hypotheses, drawing from its physical understanding, which\nare then optimized against data to estimate skeleton parameters. We demonstrate\nLLM-SR's effectiveness across three diverse scientific domains, where it\ndiscovers physically accurate equations that provide significantly better fits\nto in-domain and out-of-domain data compared to the well-established symbolic\nregression baselines. Incorporating scientific prior knowledge also enables\nLLM-SR to search the equation space more efficiently than baselines. Code is\navailable at: https://github.com/deep-symbolic-mathematics/LLM-SR\n","authors":["Parshin Shojaee","Kazem Meidani","Shashank Gupta","Amir Barati Farimani","Chandan K Reddy"],"pdf_url":"https://arxiv.org/pdf/2404.18400v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.17394v2","updated":"2024-06-02T19:43:55Z","published":"2024-05-27T17:46:57Z","title":"The Expressive Capacity of State Space Models: A Formal Language\n  Perspective","summary":"  Recently, recurrent models based on linear state space models (SSMs) have\nshown promising performance in language modeling (LM), competititve with\ntransformers. However, there is little understanding of the in-principle\nabilities of such models, which could provide useful guidance to the search for\nbetter LM architectures. We present a comprehensive theoretical study of the\ncapacity of such SSMs as it compares to that of transformers and traditional\nRNNs. We find that SSMs and transformers have overlapping but distinct\nstrengths. In star-free state tracking, SSMs implement straightforward and\nexact solutions to problems that transformers struggle to represent exactly.\nThey can also model bounded hierarchical structure with optimal memory even\nwithout simulating a stack. On the other hand, we identify a design choice in\ncurrent SSMs that limits their expressive power. We discuss implications for\nSSM and LM research, and verify results empirically on a recent SSM, Mamba.\n","authors":["Yash Sarrof","Yana Veitsman","Michael Hahn"],"pdf_url":"https://arxiv.org/pdf/2405.17394v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.15051v2","updated":"2024-06-02T19:17:34Z","published":"2023-05-24T11:41:33Z","title":"A Monte Carlo Language Model Pipeline for Zero-Shot Sociopolitical Event\n  Extraction","summary":"  Current social science efforts automatically populate event databases of \"who\ndid what to whom?\" tuples, by applying event extraction (EE) to text such as\nnews. The event databases are used to analyze sociopolitical dynamics between\nactor pairs (dyads) in, e.g., international relations. While most EE methods\nheavily rely on rules or supervised learning, \\emph{zero-shot} event extraction\ncould potentially allow researchers to flexibly specify arbitrary event classes\nfor new research questions. Unfortunately, we find that current zero-shot EE\nmethods, as well as a naive zero-shot approach of simple generative language\nmodel (LM) prompting, perform poorly for dyadic event extraction; most suffer\nfrom word sense ambiguity, modality sensitivity, and computational\ninefficiency. We address these challenges with a new fine-grained, multi-stage\ninstruction-following generative LM pipeline, proposing a Monte Carlo approach\nto deal with, and even take advantage of, nondeterminism of generative outputs.\nOur pipeline includes explicit stages of linguistic analysis (synonym\ngeneration, contextual disambiguation, argument realization, event modality),\n\\textit{improving control and interpretability} compared to purely neural\nmethods. This method outperforms other zero-shot EE approaches, and outperforms\nnaive applications of generative LMs by at least 17 F1 percent points. The\npipeline's filtering mechanism greatly improves computational efficiency,\nallowing it to perform as few as 12% of queries that a previous zero-shot\nmethod uses. Finally, we demonstrate our pipeline's application to dyadic\ninternational relations analysis.\n","authors":["Erica Cai","Brendan O'Connor"],"pdf_url":"https://arxiv.org/pdf/2305.15051v2.pdf","comment":"Accepted at NeurIPS 2023 Workshop on Instruction Tuning and\n  Instruction Following; oral presentation at New England Natural Language\n  Processing, 2023; 17 pages of text including references and appendix"},{"id":"http://arxiv.org/abs/2308.00264v4","updated":"2024-06-02T19:12:57Z","published":"2023-08-01T03:54:27Z","title":"Multimodal Multi-loss Fusion Network for Sentiment Analysis","summary":"  This paper investigates the optimal selection and fusion of feature encoders\nacross multiple modalities and combines these in one neural network to improve\nsentiment detection. We compare different fusion methods and examine the impact\nof multi-loss training within the multi-modality fusion network, identifying\nsurprisingly important findings relating to subnet performance. We have also\nfound that integrating context significantly enhances model performance. Our\nbest model achieves state-of-the-art performance for three datasets (CMU-MOSI,\nCMU-MOSEI and CH-SIMS). These results suggest a roadmap toward an optimized\nfeature selection and fusion approach for enhancing sentiment detection in\nneural networks.\n","authors":["Zehui Wu","Ziwei Gong","Jaywon Koo","Julia Hirschberg"],"pdf_url":"https://arxiv.org/pdf/2308.00264v4.pdf","comment":"First two authors contributed equally to the paper"},{"id":"http://arxiv.org/abs/2405.12933v2","updated":"2024-06-02T18:48:56Z","published":"2024-05-21T17:04:44Z","title":"Skin-in-the-Game: Decision Making via Multi-Stakeholder Alignment in\n  LLMs","summary":"  Large Language Models (LLMs) have shown remarkable capabilities in tasks such\nas summarization, arithmetic reasoning, and question answering. However, they\nencounter significant challenges in the domain of moral reasoning and ethical\ndecision-making, especially in complex scenarios with multiple stakeholders.\nThis paper introduces the Skin-in-the-Game (SKIG) framework, aimed at enhancing\nmoral reasoning in LLMs by exploring decisions' consequences from multiple\nstakeholder perspectives. Central to SKIG's mechanism is simulating\naccountability for actions, which, alongside empathy exercises and risk\nassessment, is pivotal to its effectiveness. We validate SKIG's performance\nacross various moral reasoning benchmarks with proprietary and opensource LLMs,\nand investigate its crucial components through extensive ablation analyses.\n","authors":["Bilgehan Sel","Priya Shanmugasundaram","Mohammad Kachuee","Kun Zhou","Ruoxi Jia","Ming Jin"],"pdf_url":"https://arxiv.org/pdf/2405.12933v2.pdf","comment":"ACL 2024, long paper"},{"id":"http://arxiv.org/abs/2405.05189v2","updated":"2024-06-02T18:47:44Z","published":"2024-05-08T16:25:42Z","title":"MIDGARD: Self-Consistency Using Minimum Description Length for\n  Structured Commonsense Reasoning","summary":"  We study the task of conducting structured reasoning as generating a\nreasoning graph from natural language input using large language models (LLMs).\nPrevious approaches have explored various prompting schemes, yet they suffer\nfrom error propagation due to the autoregressive nature and single-pass-based\ndecoding, which lack error correction capability. Additionally, relying solely\non a single sample may result in the omission of true nodes and edges. To\ncounter this, we draw inspiration from self-consistency (SC), which involves\nsampling a diverse set of reasoning chains and taking the majority vote as the\nfinal answer. To tackle the substantial challenge of applying SC on generated\ngraphs, we propose MIDGARD (MInimum Description length Guided Aggregation of\nReasoning in Directed acyclic graph) that leverages Minimum Description Length\n(MDL)-based formulation to identify consistent properties among the different\ngraph samples generated by an LLM. This formulation helps reject properties\nthat appear in only a few samples, which are likely to be erroneous, while\nenabling the inclusion of missing elements without compromising precision. Our\nmethod demonstrates superior performance than comparisons across various\nstructured reasoning tasks, including argument structure extraction,\nexplanation graph generation, inferring dependency relations among actions for\neveryday tasks, and semantic graph generation from natural texts.\n","authors":["Inderjeet Nair","Lu Wang"],"pdf_url":"https://arxiv.org/pdf/2405.05189v2.pdf","comment":"Accepted at ACL 2024(main)"},{"id":"http://arxiv.org/abs/2402.11073v3","updated":"2024-06-02T18:35:25Z","published":"2024-02-16T20:59:57Z","title":"AFaCTA: Assisting the Annotation of Factual Claim Detection with\n  Reliable LLM Annotators","summary":"  With the rise of generative AI, automated fact-checking methods to combat\nmisinformation are becoming more and more important. However, factual claim\ndetection, the first step in a fact-checking pipeline, suffers from two key\nissues that limit its scalability and generalizability: (1) inconsistency in\ndefinitions of the task and what a claim is, and (2) the high cost of manual\nannotation. To address (1), we review the definitions in related work and\npropose a unifying definition of factual claims that focuses on verifiability.\nTo address (2), we introduce AFaCTA (Automatic Factual Claim deTection\nAnnotator), a novel framework that assists in the annotation of factual claims\nwith the help of large language models (LLMs). AFaCTA calibrates its annotation\nconfidence with consistency along three predefined reasoning paths. Extensive\nevaluation and experiments in the domain of political speech reveal that AFaCTA\ncan efficiently assist experts in annotating factual claims and training\nhigh-quality classifiers, and can work with or without expert supervision. Our\nanalyses also result in PoliClaim, a comprehensive claim detection dataset\nspanning diverse political topics.\n","authors":["Jingwei Ni","Minjing Shi","Dominik Stammbach","Mrinmaya Sachan","Elliott Ash","Markus Leippold"],"pdf_url":"https://arxiv.org/pdf/2402.11073v3.pdf","comment":"ACL2024 Main Conference"},{"id":"http://arxiv.org/abs/2205.04355v2","updated":"2024-06-02T16:40:26Z","published":"2022-05-09T14:58:34Z","title":"XSTEM: An exemplar-based stemming algorithm","summary":"  Stemming is the process of reducing related words to a standard form by\nremoving affixes from them. Existing algorithms vary with respect to their\ncomplexity, configurability, handling of unknown words, and ability to avoid\nunder- and over-stemming. This paper presents a fast, simple, configurable,\nhigh-precision, high-recall stemming algorithm that combines the simplicity and\nperformance of word-based lookup tables with the strong generalizability of\nrule-based methods to avert problems with out-of-vocabulary words.\n","authors":["Kirk Baker"],"pdf_url":"https://arxiv.org/pdf/2205.04355v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.14259v3","updated":"2024-06-02T16:30:00Z","published":"2024-05-23T07:39:42Z","title":"Let's Fuse Step by Step: A Generative Fusion Decoding Algorithm with\n  LLMs for Multi-modal Text Recognition","summary":"  We introduce \"Generative Fusion Decoding\" (GFD), a novel shallow fusion\nframework, utilized to integrate Large Language Models (LLMs) into multi-modal\ntext recognition systems such as automatic speech recognition (ASR) and optical\ncharacter recognition (OCR). We derive the formulas necessary to enable GFD to\noperate across mismatched token spaces of different models by mapping text\ntoken space to byte token space, enabling seamless fusion during the decoding\nprocess. The framework is plug-and-play, compatible with various\nauto-regressive models, and does not require re-training for feature alignment,\nthus overcoming limitations of previous fusion techniques. We highlight three\nmain advantages of GFD: First, by simplifying the complexity of aligning\ndifferent model sample spaces, GFD allows LLMs to correct errors in tandem with\nthe recognition model, reducing computation latencies. Second, the in-context\nlearning ability of LLMs is fully capitalized by GFD, increasing robustness in\nlong-form speech recognition and instruction aware speech recognition. Third,\nGFD enables fusing recognition models deficient in Chinese text recognition\nwith LLMs extensively trained on Chinese. Our evaluation demonstrates that GFD\nsignificantly improves performance in ASR and OCR tasks, with ASR reaching\nstate-of-the-art in the NTUML2021 benchmark. GFD provides a significant step\nforward in model integration, offering a unified solution that could be widely\napplicable to leveraging existing pre-trained models through step by step\nfusion.\n","authors":["Chan-Jan Hsu","Yi-Chang Chen","Feng-Ting Liao","Pei-Chen Ho","Yu-Hsiang Wang","Po-Chun Hsu","Da-shan Shiu"],"pdf_url":"https://arxiv.org/pdf/2405.14259v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.11999v3","updated":"2024-06-02T16:21:59Z","published":"2024-04-18T08:49:38Z","title":"Token-level Direct Preference Optimization","summary":"  Fine-tuning pre-trained Large Language Models (LLMs) is essential to align\nthem with human values and intentions. This process often utilizes methods like\npairwise comparisons and KL divergence against a reference LLM, focusing on the\nevaluation of full answers generated by the models. However, the generation of\nthese responses occurs in a token level, following a sequential,\nauto-regressive fashion. In this paper, we introduce Token-level Direct\nPreference Optimization (TDPO), a novel approach to align LLMs with human\npreferences by optimizing policy at the token level. Unlike previous methods,\nwhich face challenges in divergence efficiency, TDPO incorporates forward KL\ndivergence constraints for each token, improving alignment and diversity.\nUtilizing the Bradley-Terry model for a token-based reward system, TDPO\nenhances the regulation of KL divergence, while preserving simplicity without\nthe need for explicit reward modeling. Experimental results across various text\ntasks demonstrate TDPO's superior performance in balancing alignment with\ngeneration diversity. Notably, fine-tuning with TDPO strikes a better balance\nthan DPO in the controlled sentiment generation and single-turn dialogue\ndatasets, and significantly improves the quality of generated responses\ncompared to both DPO and PPO-based RLHF methods. Our code is open-sourced at\nhttps://github.com/Vance0124/Token-level-Direct-Preference-Optimization.\n","authors":["Yongcheng Zeng","Guoqing Liu","Weiyu Ma","Ning Yang","Haifeng Zhang","Jun Wang"],"pdf_url":"https://arxiv.org/pdf/2404.11999v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.10379v3","updated":"2024-06-02T16:01:35Z","published":"2023-08-20T22:36:23Z","title":"Algorithm of Thoughts: Enhancing Exploration of Ideas in Large Language\n  Models","summary":"  Current literature, aiming to surpass the \"Chain-of-Thought\" approach, often\nresorts to external modi operandi involving halting, modifying, and then\nresuming the generation process to boost Large Language Models' (LLMs)\nreasoning capacities. Due to their myopic perspective, they escalate the number\nof query requests, leading to increased costs, memory, and computational\noverheads. Addressing this, we propose the Algorithm of Thoughts -- a novel\nstrategy that propels LLMs through algorithmic reasoning pathways. By employing\nalgorithmic examples fully in-context, this overarching view of the whole\nprocess exploits the innate recurrence dynamics of LLMs, expanding their idea\nexploration with merely one or a few queries. Our technique outperforms earlier\nsingle-query methods and even more recent multi-query strategies that employ an\nextensive tree search algorithms while using significantly fewer tokens.\nIntriguingly, our results suggest that instructing an LLM using an algorithm\ncan lead to performance surpassing that of the algorithm itself, hinting at\nLLM's inherent ability to weave its intuition into optimized searches. We probe\ninto the underpinnings of our method's efficacy and its nuances in application.\nThe code and related content can be found in:\nhttps://algorithm-of-thoughts.github.io.\n","authors":["Bilgehan Sel","Ahmad Al-Tawaha","Vanshaj Khattar","Ruoxi Jia","Ming Jin"],"pdf_url":"https://arxiv.org/pdf/2308.10379v3.pdf","comment":"ICML 2024"},{"id":"http://arxiv.org/abs/2402.04601v2","updated":"2024-06-02T15:50:40Z","published":"2024-02-07T05:56:54Z","title":"Alirector: Alignment-Enhanced Chinese Grammatical Error Corrector","summary":"  Chinese grammatical error correction (CGEC) faces serious overcorrection\nchallenges when employing autoregressive generative models such as\nsequence-to-sequence (Seq2Seq) models and decoder-only large language models\n(LLMs). While previous methods aim to address overcorrection in Seq2Seq models,\nthey are difficult to adapt to decoder-only LLMs. In this paper, we propose an\nalignment-enhanced corrector for the overcorrection problem that applies to\nboth Seq2Seq models and decoder-only LLMs. Our method first trains a correction\nmodel to generate an initial correction of the source sentence. Then, we\ncombine the source sentence with the initial correction and feed it through an\nalignment model for another round of correction, aiming to enforce the\nalignment model to focus on potential overcorrection. Moreover, to enhance the\nmodel's ability to identify nuances, we further explore the reverse alignment\nof the source sentence and the initial correction. Finally, we transfer the\nalignment knowledge from two alignment models to the correction model,\ninstructing it on how to avoid overcorrection. Experimental results on three\nCGEC datasets demonstrate the effectiveness of our approach in alleviating\novercorrection and improving overall performance. Our code has been made\npublicly available.\n","authors":["Haihui Yang","Xiaojun Quan"],"pdf_url":"https://arxiv.org/pdf/2402.04601v2.pdf","comment":"Accepted to Findings of ACL 2024"},{"id":"http://arxiv.org/abs/2403.00231v3","updated":"2024-06-02T15:47:16Z","published":"2024-03-01T02:21:30Z","title":"Multimodal ArXiv: A Dataset for Improving Scientific Comprehension of\n  Large Vision-Language Models","summary":"  Large vision-language models (LVLMs) excel across diverse tasks involving\nconcrete images from natural scenes. However, their ability to interpret\nabstract figures, such as geometry shapes and scientific plots, remains limited\ndue to a scarcity of training datasets in scientific domains. To fill this gap,\nwe introduce Multimodal ArXiv, consisting of ArXivCap and ArXivQA, for\nenhancing LVLMs scientific comprehension. ArXivCap is a figure-caption dataset\ncomprising 6.4M images and 3.9M captions, sourced from 572K ArXiv papers\nspanning various scientific domains. Drawing from ArXivCap, we introduce\nArXivQA, a question-answering dataset generated by prompting GPT-4V based on\nscientific figures. ArXivQA greatly enhances open-sourced LVLMs' mathematical\nreasoning capabilities, achieving a 10.4\\% absolute accuracy gain on a\nmultimodal mathematical reasoning benchmark. Furthermore, employing ArXivCap,\nwe devise four vision-to-text tasks for benchmarking LVLMs. Evaluation results\nwith state-of-the-art LVLMs underscore their struggle with the nuanced\nsemantics of academic figures, while domain-specific training yields\nsubstantial performance gains. Our error analysis uncovers misinterpretations\nof visual context, recognition errors, and the production of overly simplified\ncaptions by current LVLMs, shedding light on future improvements.\n","authors":["Lei Li","Yuqi Wang","Runxin Xu","Peiyi Wang","Xiachong Feng","Lingpeng Kong","Qi Liu"],"pdf_url":"https://arxiv.org/pdf/2403.00231v3.pdf","comment":"Project page: https://mm-arxiv.github.io, Camera Ready Version of ACL\n  2024"},{"id":"http://arxiv.org/abs/2405.19076v2","updated":"2024-06-02T15:03:24Z","published":"2024-05-29T13:34:32Z","title":"Cephalo: Multi-Modal Vision-Language Models for Bio-Inspired Materials\n  Analysis and Design","summary":"  We present Cephalo, a series of multimodal vision large language models\n(V-LLMs) designed for materials science applications, integrating visual and\nlinguistic data for enhanced understanding and interaction within human-AI and\nmulti-agent AI frameworks. A key innovation of Cephalo is its advanced dataset\ngeneration method, which employs a sophisticated algorithm to accurately detect\nand separate images and their corresponding textual descriptions from PDF\ndocuments, such as scientific papers. The method includes a careful refinement\nof image-text pairs through integrated vision and language processing, ensuring\nhigh-quality, contextually relevant, and well reasoned training data. Cephalo\nis trained on integrated image and text data extracted from thousands of\nscientific papers and science-focused Wikipedia pages demonstrates can\ninterpret complex visual scenes, generate precise language descriptions, and\nanswer queries about images effectively. The combination of a vision encoder\nwith an autoregressive transformer supports complex natural language\nunderstanding in an integrated model, which can be coupled with other\ngenerative methods to create an image-to-text-to-image or image-to-text-to-3D\npipeline. To explore the development of larger models from smaller ones, we\nreport both mixture-of-expert methods and model merging. These hybrid\napproaches allow us to leverage the domain-specific expertise and general\nconversational capabilities to harness the strengths of multiple models. We\nexamine the models in diverse use cases that incorporate biological materials,\nfracture and engineering analysis, protein biophysics, and bio-inspired design\nbased on insect behavior. Generative applications include bio-inspired designs,\nincluding pollen-inspired architected materials, as well as the synthesis of\nbio-inspired material microstructures from a photograph of a solar eclipse.\n","authors":["Markus J. Buehler"],"pdf_url":"https://arxiv.org/pdf/2405.19076v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.18628v2","updated":"2024-06-02T14:58:48Z","published":"2024-05-28T22:19:30Z","title":"Hardware-Aware Parallel Prompt Decoding for Memory-Efficient\n  Acceleration of LLM Inference","summary":"  The auto-regressive decoding of Large Language Models (LLMs) results in\nsignificant overheads in their hardware performance. While recent research has\ninvestigated various speculative decoding techniques for multi-token\ngeneration, these efforts have primarily focused on improving processing speed\nsuch as throughput. Crucially, they often neglect other metrics essential for\nreal-life deployments, such as memory consumption and training cost. To\novercome these limitations, we propose a novel parallel prompt decoding that\nrequires only $0.0002$% trainable parameters, enabling efficient training on a\nsingle A100-40GB GPU in just 16 hours. Inspired by the human natural language\ngeneration process, $PPD$ approximates outputs generated at future timesteps in\nparallel by using multiple prompt tokens. This approach partially recovers the\nmissing conditional dependency information necessary for multi-token\ngeneration, resulting in up to a 28% higher acceptance rate for long-range\npredictions. Furthermore, we present a hardware-aware dynamic sparse tree\ntechnique that adaptively optimizes this decoding scheme to fully leverage the\ncomputational capacities on different GPUs. Through extensive experiments\nacross LLMs ranging from MobileLlama to Vicuna-13B on a wide range of\nbenchmarks, our approach demonstrates up to 2.49$\\times$ speedup and maintains\na minimal runtime memory overhead of just $0.0004$%. More importantly, our\nparallel prompt decoding can serve as an orthogonal optimization for\nsynergistic integration with existing speculative decoding, showing up to\n$1.22\\times$ further speed improvement. Our code is available at\nhttps://github.com/hmarkc/parallel-prompt-decoding.\n","authors":["Hao Mark Chen","Wayne Luk","Ka Fai Cedric Yiu","Rui Li","Konstantin Mishchenko","Stylianos I. Venieris","Hongxiang Fan"],"pdf_url":"https://arxiv.org/pdf/2405.18628v2.pdf","comment":"The code for this implementation is available at\n  https://github.com/hmarkc/parallel-prompt-decoding"},{"id":"http://arxiv.org/abs/2402.13113v2","updated":"2024-06-02T14:48:13Z","published":"2024-02-20T16:09:49Z","title":"When Only Time Will Tell: Interpreting How Transformers Process Local\n  Ambiguities Through the Lens of Restart-Incrementality","summary":"  Incremental models that process sentences one token at a time will sometimes\nencounter points where more than one interpretation is possible. Causal models\nare forced to output one interpretation and continue, whereas models that can\nrevise may edit their previous output as the ambiguity is resolved. In this\nwork, we look at how restart-incremental Transformers build and update internal\nstates, in an effort to shed light on what processes cause revisions not viable\nin autoregressive models. We propose an interpretable way to analyse the\nincremental states, showing that their sequential structure encodes information\non the garden path effect and its resolution. Our method brings insights on\nvarious bidirectional encoders for contextualised meaning representation and\ndependency parsing, contributing to show their advantage over causal models\nwhen it comes to revisions.\n","authors":["Brielen Madureira","Patrick Kahardipraja","David Schlangen"],"pdf_url":"https://arxiv.org/pdf/2402.13113v2.pdf","comment":"Accepted to ACL 2024"},{"id":"http://arxiv.org/abs/2402.10659v3","updated":"2024-06-02T13:50:14Z","published":"2024-02-16T13:10:14Z","title":"Network Formation and Dynamics Among Multi-LLMs","summary":"  Social networks shape opinions, behaviors, and information dissemination in\nhuman societies. As large language models (LLMs) increasingly integrate into\nsocial and professional environments, understanding their behavior within the\ncontext of social interactions and networks becomes essential. Our study\nanalyzes LLMs' network formation behavior to examine whether the dynamics of\nmultiple LLMs are similar to or different from human social dynamics. We\nobserve that LLMs exhibit key social network principles, including preferential\nattachment, triadic closure, homophily, community structure, and the\nsmall-world phenomenon, when asked about their preferences in network\nformation. We also investigate LLMs' decision-making based on real-world\nnetworks, revealing that triadic closure and homophily have a stronger\ninfluence than preferential attachment and that LLMs perform well in network\nformation predictions. Overall, our study opens up new possibilities for using\nLLMs in network science research and helps develop socially aware LLMs by\nshedding light on their social interaction behaviors and exploring their\nimpacts on social dynamics.\n","authors":["Marios Papachristou","Yuan Yuan"],"pdf_url":"https://arxiv.org/pdf/2402.10659v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.15799v2","updated":"2024-06-02T13:49:32Z","published":"2023-06-27T20:58:41Z","title":"FLuRKA: Fast and accurate unified Low-Rank & Kernel Attention","summary":"  Many efficient $\\textit{approximate}$ self-attention techniques have become\nprevalent since the inception of the transformer architecture. Two popular\nclasses of these techniques are low-rank and kernel methods. Each of these\nmethods has its strengths. We observe these strengths synergistically\ncomplement each other and exploit them to fuse low-rank and kernel methods,\nproducing a new class of transformers: FLuRKA ($\\textbf{F}$ast\n$\\textbf{L}$ow-$\\textbf{R}$ank & $\\textbf{K}$ernel$ \\textbf{A}$ttention).\nFLuRKA are highly $\\textit{training-efficient}$ with faster model speeds\n$\\textit{and}$ similar model qualities compared to constituent low-rank and\nkernel methods. We theoretically and empirically evaluate the speed and quality\nof FLuRKA. Our model speed analysis posits a variety of parameter\nconfigurations where FLuRKA exhibit speedups over low-rank and kernel\napproximations and our model quality analysis bounds the error of FLuRKA with\nrespect to full-attention. Empirically, we instantiate three FLuRKA variants\nwhich experience speedups of up to 3.3x and 1.7x over low-rank and kernel\nmethods respectively. This translates to speedups of up to 20x over models with\nflash-attention. Across a diverse set of tasks spanning language modeling,\nlanguage understanding, long sequence modeling, machine translation, and image\nclassification, FLuRKA achieve comparable accuracy with underlying low-rank and\nkernel approximations, occasionally surpassing both.\n","authors":["Ahan Gupta","Hao Guo","Yueming Yuan","Yanqi Zhou","Charith Mendis"],"pdf_url":"https://arxiv.org/pdf/2306.15799v2.pdf","comment":"21 pages, 5 figures"},{"id":"http://arxiv.org/abs/2403.00862v3","updated":"2024-06-02T13:38:01Z","published":"2024-02-29T21:05:14Z","title":"NewsBench: A Systematic Evaluation Framework for Assessing Editorial\n  Capabilities of Large Language Models in Chinese Journalism","summary":"  We present NewsBench, a novel evaluation framework to systematically assess\nthe capabilities of Large Language Models (LLMs) for editorial capabilities in\nChinese journalism. Our constructed benchmark dataset is focused on four facets\nof writing proficiency and six facets of safety adherence, and it comprises\nmanually and carefully designed 1,267 test samples in the types of multiple\nchoice questions and short answer questions for five editorial tasks in 24 news\ndomains. To measure performances, we propose different GPT-4 based automatic\nevaluation protocols to assess LLM generations for short answer questions in\nterms of writing proficiency and safety adherence, and both are validated by\nthe high correlations with human evaluations. Based on the systematic\nevaluation framework, we conduct a comprehensive analysis of ten popular LLMs\nwhich can handle Chinese. The experimental results highlight GPT-4 and ERNIE\nBot as top performers, yet reveal a relative deficiency in journalistic safety\nadherence in creative writing tasks. Our findings also underscore the need for\nenhanced ethical guidance in machine-generated journalistic content, marking a\nstep forward in aligning LLMs with journalistic standards and safety\nconsiderations.\n","authors":["Miao Li","Ming-Bin Chen","Bo Tang","Shengbin Hou","Pengyu Wang","Haiying Deng","Zhiyu Li","Feiyu Xiong","Keming Mao","Peng Cheng","Yi Luo"],"pdf_url":"https://arxiv.org/pdf/2403.00862v3.pdf","comment":"Long paper, ACL 2024 Main"},{"id":"http://arxiv.org/abs/2402.11941v3","updated":"2024-06-02T13:25:05Z","published":"2024-02-19T08:29:03Z","title":"CoCo-Agent: A Comprehensive Cognitive MLLM Agent for Smartphone GUI\n  Automation","summary":"  Multimodal large language models (MLLMs) have shown remarkable potential as\nhuman-like autonomous language agents to interact with real-world environments,\nespecially for graphical user interface (GUI) automation. However, those GUI\nagents require comprehensive cognition ability including exhaustive perception\nand reliable action response. We propose a Comprehensive Cognitive LLM Agent,\nCoCo-Agent, with two novel approaches, comprehensive environment perception\n(CEP) and conditional action prediction (CAP), to systematically improve the\nGUI automation performance. First, CEP facilitates the GUI perception through\ndifferent aspects and granularity, including screenshots and complementary\ndetailed layouts for the visual channel and historical actions for the textual\nchannel. Second, CAP decomposes the action prediction into sub-problems: action\ntype prediction and action target conditioned on the action type. With our\ntechnical design, our agent achieves new state-of-the-art performance on AITW\nand META-GUI benchmarks, showing promising abilities in realistic scenarios.\nCode is available at https://github.com/xbmxb/CoCo-Agent.\n","authors":["Xinbei Ma","Zhuosheng Zhang","Hai Zhao"],"pdf_url":"https://arxiv.org/pdf/2402.11941v3.pdf","comment":"ACL'2024 Findings"},{"id":"http://arxiv.org/abs/2402.17959v2","updated":"2024-06-02T10:46:13Z","published":"2024-02-28T00:49:06Z","title":"An Iterative Associative Memory Model for Empathetic Response Generation","summary":"  Empathetic response generation aims to comprehend the cognitive and emotional\nstates in dialogue utterances and generate proper responses. Psychological\ntheories posit that comprehending emotional and cognitive states necessitates\niteratively capturing and understanding associated words across dialogue\nutterances. However, existing approaches regard dialogue utterances as either a\nlong sequence or independent utterances for comprehension, which are prone to\noverlook the associated words between them. To address this issue, we propose\nan Iterative Associative Memory Model (IAMM) for empathetic response\ngeneration. Specifically, we employ a novel second-order interaction attention\nmechanism to iteratively capture vital associated words between dialogue\nutterances and situations, dialogue history, and a memory module (for storing\nassociated words), thereby accurately and nuancedly comprehending the\nutterances. We conduct experiments on the Empathetic-Dialogue dataset. Both\nautomatic and human evaluations validate the efficacy of the model. Variant\nexperiments on LLMs also demonstrate that attending to associated words\nimproves empathetic comprehension and expression.\n","authors":["Zhou Yang","Zhaochun Ren","Yufeng Wang","Chao Chen","Haizhou Sun","Xiaofei Zhu","Xiangwen Liao"],"pdf_url":"https://arxiv.org/pdf/2402.17959v2.pdf","comment":"12 pages, 4 figures"},{"id":"http://arxiv.org/abs/2402.12026v3","updated":"2024-06-02T10:28:13Z","published":"2024-02-19T10:34:48Z","title":"Acquiring Clean Language Models from Backdoor Poisoned Datasets by\n  Downscaling Frequency Space","summary":"  Despite the notable success of language models (LMs) in various natural\nlanguage processing (NLP) tasks, the reliability of LMs is susceptible to\nbackdoor attacks. Prior research attempts to mitigate backdoor learning while\ntraining the LMs on the poisoned dataset, yet struggles against complex\nbackdoor attacks in real-world scenarios. In this paper, we investigate the\nlearning mechanisms of backdoor LMs in the frequency space by Fourier analysis.\nOur findings indicate that the backdoor mapping presented on the poisoned\ndatasets exhibits a more discernible inclination towards lower frequency\ncompared to clean mapping, resulting in the faster convergence of backdoor\nmapping. To alleviate this dilemma, we propose Multi-Scale Low-Rank Adaptation\n(MuScleLoRA), which deploys multiple radial scalings in the frequency space\nwith low-rank adaptation to the target model and further aligns the gradients\nwhen updating parameters. Through downscaling in the frequency space,\nMuScleLoRA encourages the model to prioritize the learning of relatively\nhigh-frequency clean mapping, consequently mitigating backdoor learning.\nExperimental results demonstrate that MuScleLoRA outperforms baselines\nsignificantly. Notably, MuScleLoRA reduces the average success rate of diverse\nbackdoor attacks to below 15\\% across multiple datasets and generalizes to\nvarious backbone LMs, including BERT, RoBERTa, GPT2-XL, and Llama2. The codes\nare publicly available at https://github.com/ZrW00/MuScleLoRA.\n","authors":["Zongru Wu","Zhuosheng Zhang","Pengzhou Cheng","Gongshen Liu"],"pdf_url":"https://arxiv.org/pdf/2402.12026v3.pdf","comment":"Accepted at ACL 2024 (Long Paper. Main Conference)"},{"id":"http://arxiv.org/abs/2308.08758v3","updated":"2024-06-02T10:09:01Z","published":"2023-08-17T03:10:17Z","title":"Discrete Prompt Compression with Reinforcement Learning","summary":"  Compressed prompts aid instruction-tuned language models (LMs) in overcoming\ncontext window limitations and reducing computational costs. Existing methods,\nwhich primarily based on training embeddings, face various challenges\nassociated with interpretability, the fixed number of embedding tokens,\nreusability across different LMs, and inapplicability when interacting with\nblack-box APIs. This study proposes prompt compression with reinforcement\nlearning (PCRL), which is a discrete prompt compression method that addresses\nthese issues. The proposed PCRL method utilizes a computationally efficient\npolicy network that edits prompts directly. The training approach employed in\nthe proposed PCRLs can be applied flexibly to various types of LMs, including\nboth decoder-only and encoder-decoder architecture and it can be trained\nwithout gradient access to the LMs or labeled data. The proposed PCRL achieves\nan average reduction of 24.6% in terms of the token count across various\ninstruction prompts while maintaining sufficient performance. In addition, we\ndemonstrate that the learned policy can be transferred to larger LMs, and\nthrough a comprehensive analysis, we explore the token importance within the\nprompts. Our code is accessible at\nhttps://github.com/nenomigami/PromptCompressor.\n","authors":["Hoyoun Jung","Kyung-Joong Kim"],"pdf_url":"https://arxiv.org/pdf/2308.08758v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.13963v4","updated":"2024-06-02T10:02:00Z","published":"2024-02-21T17:47:20Z","title":"Towards Building Multilingual Language Model for Medicine","summary":"  The development of open-source, multilingual medical language models can\nbenefit a wide, linguistically diverse audience from different regions. To\npromote this domain, we present contributions from the following: First, we\nconstruct a multilingual medical corpus, containing approximately 25.5B tokens\nencompassing 6 main languages, termed as MMedC, enabling auto-regressive domain\nadaptation for general LLMs; Second, to monitor the development of multilingual\nmedical LLMs, we propose a multilingual medical multi-choice question-answering\nbenchmark with rationale, termed as MMedBench; Third, we have assessed a number\nof open-source large language models (LLMs) on our benchmark, along with those\nfurther auto-regressive trained on MMedC. Our final model, MMed-Llama 3, with\nonly 8B parameters, achieves superior performance compared to all other\nopen-source models on both MMedBench and English benchmarks, even rivaling\nGPT-4. In conclusion, in this work, we present a large-scale corpus, a\nbenchmark and a series of models to support the development of multilingual\nmedical LLMs.\n","authors":["Pengcheng Qiu","Chaoyi Wu","Xiaoman Zhang","Weixiong Lin","Haicheng Wang","Ya Zhang","Yanfeng Wang","Weidi Xie"],"pdf_url":"https://arxiv.org/pdf/2402.13963v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.11900v2","updated":"2024-06-02T09:17:37Z","published":"2024-02-19T07:34:10Z","title":"Investigating Multi-Hop Factual Shortcuts in Knowledge Editing of Large\n  Language Models","summary":"  Recent work has showcased the powerful capability of large language models\n(LLMs) in recalling knowledge and reasoning. However, the reliability of LLMs\nin combining these two capabilities into reasoning through multi-hop facts has\nnot been widely explored. This paper systematically investigates the\npossibilities for LLMs to utilize shortcuts based on direct connections between\nthe initial and terminal entities of multi-hop knowledge. We first explore the\nexistence of factual shortcuts through Knowledge Neurons, revealing that: (i)\nthe strength of factual shortcuts is highly correlated with the frequency of\nco-occurrence of initial and terminal entities in the pre-training corpora;\n(ii) few-shot prompting leverage more shortcuts in answering multi-hop\nquestions compared to chain-of-thought prompting. Then, we analyze the risks\nposed by factual shortcuts from the perspective of multi-hop knowledge editing.\nAnalysis shows that approximately 20% of the failures are attributed to\nshortcuts, and the initial and terminal entities in these failure instances\nusually have higher co-occurrences in the pre-training corpus. Finally, we\npropose erasing shortcut neurons to mitigate the associated risks and find that\nthis approach significantly reduces failures in multiple-hop knowledge editing\ncaused by shortcuts.\n","authors":["Tianjie Ju","Yijin Chen","Xinwei Yuan","Zhuosheng Zhang","Wei Du","Yubin Zheng","Gongshen Liu"],"pdf_url":"https://arxiv.org/pdf/2402.11900v2.pdf","comment":"Accepted at ACL 2024 (Long Paper. Main Conference)"},{"id":"http://arxiv.org/abs/2402.15179v3","updated":"2024-06-02T09:05:31Z","published":"2024-02-23T08:21:02Z","title":"Advancing Parameter Efficiency in Fine-tuning via Representation Editing","summary":"  Parameter Efficient Fine-Tuning (PEFT) techniques have drawn significant\nattention due to their ability to yield competitive results while updating only\na small portion of the adjustable parameters. However, existing PEFT methods\npose challenges in hyperparameter selection, such as choosing the rank for LoRA\nor Adapter, or specifying the length of soft prompts. To address these\nchallenges, we propose a novel fine-tuning approach for neural models, named\nRepresentation EDiting (RED), which modifies the representations generated at\nsome layers through the application of scaling and biasing operations. While\nexisting PEFT methods still demonstrate over-parameterization that could\npotentially undermine the generalization ability acquired from pre-training,\nRED can substantially reduce the number of trainable parameters by a factor of\n25, 700 compared to full parameter fine-tuning and by a factor of 32 relative\nto LoRA. Remarkably, RED achieves results comparable or superior to both full\nparameter fine-tuning and other PEFT methods. Extensive experiments across\nvarious model architectures and scales, including RoBERTa, GPT-2, T5, and\nLLaMA-2, have demonstrated the effectiveness and efficiency of RED1, thereby\npositioning it as a promising PEFT strategy for large-scale neural models.\n","authors":["Muling Wu","Wenhao Liu","Xiaohua Wang","Tianlong Li","Changze Lv","Zixuan Ling","Jianhao Zhu","Cenyuan Zhang","Xiaoqing Zheng","Xuanjing Huang"],"pdf_url":"https://arxiv.org/pdf/2402.15179v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.17653v2","updated":"2024-06-02T08:50:02Z","published":"2024-05-27T20:53:22Z","title":"InversionView: A General-Purpose Method for Reading Information from\n  Neural Activations","summary":"  The inner workings of neural networks can be better understood if we can\nfully decipher the information encoded in neural activations. In this paper, we\nargue that this information is embodied by the subset of inputs that give rise\nto similar activations. Computing such subsets is nontrivial as the input space\nis exponentially large. We propose InversionView, which allows us to\npractically inspect this subset by sampling from a trained decoder model\nconditioned on activations. This helps uncover the information content of\nactivation vectors, and facilitates understanding of the algorithms implemented\nby transformer models. We present three case studies where we investigate\nmodels ranging from small transformers to GPT-2. In these studies, we\ndemonstrate the characteristics of our method, show the distinctive advantages\nit offers, and provide causally verified circuits.\n","authors":["Xinting Huang","Madhur Panwar","Navin Goyal","Michael Hahn"],"pdf_url":"https://arxiv.org/pdf/2405.17653v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.10288v2","updated":"2024-06-02T08:04:26Z","published":"2024-05-16T17:48:21Z","title":"Timeline-based Sentence Decomposition with In-Context Learning for\n  Temporal Fact Extraction","summary":"  Facts extraction is pivotal for constructing knowledge graphs. Recently, the\nincreasing demand for temporal facts in downstream tasks has led to the\nemergence of the task of temporal fact extraction. In this paper, we\nspecifically address the extraction of temporal facts from natural language\ntext. Previous studies fail to handle the challenge of establishing\ntime-to-fact correspondences in complex sentences. To overcome this hurdle, we\npropose a timeline-based sentence decomposition strategy using large language\nmodels (LLMs) with in-context learning, ensuring a fine-grained understanding\nof the timeline associated with various facts. In addition, we evaluate the\nperformance of LLMs for direct temporal fact extraction and get unsatisfactory\nresults. To this end, we introduce TSDRE, a method that incorporates the\ndecomposition capabilities of LLMs into the traditional fine-tuning of smaller\npre-trained language models (PLMs). To support the evaluation, we construct\nComplexTRED, a complex temporal fact extraction dataset. Our experiments show\nthat TSDRE achieves state-of-the-art results on both HyperRED-Temporal and\nComplexTRED datasets.\n","authors":["Jianhao Chen","Haoyuan Ouyang","Junyang Ren","Wentao Ding","Wei Hu","Yuzhong Qu"],"pdf_url":"https://arxiv.org/pdf/2405.10288v2.pdf","comment":"Accepted to ACL2024 main conference"},{"id":"http://arxiv.org/abs/2401.17244v2","updated":"2024-06-02T07:50:21Z","published":"2024-01-30T18:37:45Z","title":"LLaMP: Large Language Model Made Powerful for High-fidelity Materials\n  Knowledge Retrieval and Distillation","summary":"  Reducing hallucination of Large Language Models (LLMs) is imperative for use\nin the sciences, where reliability and reproducibility are crucial. However,\nLLMs inherently lack long-term memory, making it a nontrivial, ad hoc, and\ninevitably biased task to fine-tune them on domain-specific literature and\ndata. Here we introduce LLaMP, a multimodal retrieval-augmented generation\n(RAG) framework of hierarchical reasoning-and-acting (ReAct) agents that can\ndynamically and recursively interact with computational and experimental data\non Materials Project (MP) and run atomistic simulations via high-throughput\nworkflow interface. Without fine-tuning, LLaMP demonstrates strong tool usage\nability to comprehend and integrate various modalities of materials science\nconcepts, fetch relevant data stores on the fly, process higher-order data\n(such as crystal structure and elastic tensor), and streamline complex tasks in\ncomputational materials and chemistry. We propose a simple metric combining\nuncertainty and confidence estimates to evaluate the self-consistency of\nresponses by LLaMP and vanilla LLMs. Our benchmark shows that LLaMP effectively\nmitigates the intrinsic bias in LLMs, counteracting the errors on bulk moduli,\nelectronic bandgaps, and formation energies that seem to derive from mixed data\nsources. We also demonstrate LLaMP's capability to edit crystal structures and\nrun annealing molecular dynamics simulations using pre-trained machine-learning\nforce fields. The framework offers an intuitive and nearly hallucination-free\napproach to exploring and scaling materials informatics, and establishes a\npathway for knowledge distillation and fine-tuning other language models. Code\nand live demo are available at https://github.com/chiang-yuan/llamp\n","authors":["Yuan Chiang","Elvis Hsieh","Chia-Hong Chou","Janosh Riebesell"],"pdf_url":"https://arxiv.org/pdf/2401.17244v2.pdf","comment":"31 pages, 5 figures"},{"id":"http://arxiv.org/abs/2405.19327v3","updated":"2024-06-02T06:09:49Z","published":"2024-05-29T17:57:16Z","title":"MAP-Neo: Highly Capable and Transparent Bilingual Large Language Model\n  Series","summary":"  Large Language Models (LLMs) have made great strides in recent years to\nachieve unprecedented performance across different tasks. However, due to\ncommercial interest, the most competitive models like GPT, Gemini, and Claude\nhave been gated behind proprietary interfaces without disclosing the training\ndetails. Recently, many institutions have open-sourced several strong LLMs like\nLLaMA-3, comparable to existing closed-source LLMs. However, only the model's\nweights are provided with most details (e.g., intermediate checkpoints,\npre-training corpus, and training code, etc.) being undisclosed. To improve the\ntransparency of LLMs, the research community has formed to open-source truly\nopen LLMs (e.g., Pythia, Amber, OLMo), where more details (e.g., pre-training\ncorpus and training code) are being provided. These models have greatly\nadvanced the scientific study of these large models including their strengths,\nweaknesses, biases and risks. However, we observe that the existing truly open\nLLMs on reasoning, knowledge, and coding tasks are still inferior to existing\nstate-of-the-art LLMs with similar model sizes. To this end, we open-source\nMAP-Neo, a highly capable and transparent bilingual language model with 7B\nparameters trained from scratch on 4.5T high-quality tokens. Our MAP-Neo is the\nfirst fully open-sourced bilingual LLM with comparable performance compared to\nexisting state-of-the-art LLMs. Moreover, we open-source all details to\nreproduce our MAP-Neo, where the cleaned pre-training corpus, data cleaning\npipeline, checkpoints, and well-optimized training/evaluation framework are\nprovided. Finally, we hope our MAP-Neo will enhance and strengthen the open\nresearch community and inspire more innovations and creativities to facilitate\nthe further improvements of LLMs.\n","authors":["Ge Zhang","Scott Qu","Jiaheng Liu","Chenchen Zhang","Chenghua Lin","Chou Leuang Yu","Danny Pan","Esther Cheng","Jie Liu","Qunshu Lin","Raven Yuan","Tuney Zheng","Wei Pang","Xinrun Du","Yiming Liang","Yinghao Ma","Yizhi Li","Ziyang Ma","Bill Lin","Emmanouil Benetos","Huan Yang","Junting Zhou","Kaijing Ma","Minghao Liu","Morry Niu","Noah Wang","Quehry Que","Ruibo Liu","Sine Liu","Shawn Guo","Soren Gao","Wangchunshu Zhou","Xinyue Zhang","Yizhi Zhou","Yubo Wang","Yuelin Bai","Yuhan Zhang","Yuxiang Zhang","Zenith Wang","Zhenzhu Yang","Zijian Zhao","Jiajun Zhang","Wanli Ouyang","Wenhao Huang","Wenhu Chen"],"pdf_url":"https://arxiv.org/pdf/2405.19327v3.pdf","comment":"https://map-neo.github.io/"},{"id":"http://arxiv.org/abs/2402.02619v5","updated":"2024-06-02T05:56:31Z","published":"2024-02-04T21:33:18Z","title":"Increasing Trust in Language Models through the Reuse of Verified\n  Circuits","summary":"  Language Models (LMs) are increasingly used for a wide range of prediction\ntasks, but their training can often neglect rare edge cases, reducing their\nreliability. Here, we define a stringent standard of trustworthiness whereby\nthe task algorithm and circuit implementation must be verified, accounting for\nedge cases, with no known failure modes. We show that a transformer model can\nbe trained to meet this standard if built using mathematically and logically\nspecified frameworks. In this paper, we fully verify a model for n-digit\ninteger addition. To exhibit the reusability of verified modules, we insert the\ntrained integer addition model into an untrained model and train the combined\nmodel to perform both addition and subtraction. We find extensive reuse of the\naddition circuits for both tasks, easing verification of the more complex\nsubtractor model. We discuss how inserting verified task modules into LMs can\nleverage model reuse to improve verifiability and trustworthiness of language\nmodels built using them. The reuse of verified circuits reduces the effort to\nverify more complex composite models which we believe to be a significant step\ntowards safety of language models.\n","authors":["Philip Quirke","Clement Neo","Fazl Barez"],"pdf_url":"https://arxiv.org/pdf/2402.02619v5.pdf","comment":"8 pages, 6 figures"},{"id":"http://arxiv.org/abs/2404.03635v4","updated":"2024-06-02T04:56:32Z","published":"2024-04-04T17:54:33Z","title":"WorDepth: Variational Language Prior for Monocular Depth Estimation","summary":"  Three-dimensional (3D) reconstruction from a single image is an ill-posed\nproblem with inherent ambiguities, i.e. scale. Predicting a 3D scene from text\ndescription(s) is similarly ill-posed, i.e. spatial arrangements of objects\ndescribed. We investigate the question of whether two inherently ambiguous\nmodalities can be used in conjunction to produce metric-scaled reconstructions.\nTo test this, we focus on monocular depth estimation, the problem of predicting\na dense depth map from a single image, but with an additional text caption\ndescribing the scene. To this end, we begin by encoding the text caption as a\nmean and standard deviation; using a variational framework, we learn the\ndistribution of the plausible metric reconstructions of 3D scenes corresponding\nto the text captions as a prior. To \"select\" a specific reconstruction or depth\nmap, we encode the given image through a conditional sampler that samples from\nthe latent space of the variational text encoder, which is then decoded to the\noutput depth map. Our approach is trained alternatingly between the text and\nimage branches: in one optimization step, we predict the mean and standard\ndeviation from the text description and sample from a standard Gaussian, and in\nthe other, we sample using a (image) conditional sampler. Once trained, we\ndirectly predict depth from the encoded text using the conditional sampler. We\ndemonstrate our approach on indoor (NYUv2) and outdoor (KITTI) scenarios, where\nwe show that language can consistently improve performance in both.\n","authors":["Ziyao Zeng","Daniel Wang","Fengyu Yang","Hyoungseob Park","Yangchao Wu","Stefano Soatto","Byung-Woo Hong","Dong Lao","Alex Wong"],"pdf_url":"https://arxiv.org/pdf/2404.03635v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.01789v2","updated":"2024-06-02T04:48:36Z","published":"2024-02-02T02:43:10Z","title":"The Political Preferences of LLMs","summary":"  I report here a comprehensive analysis about the political preferences\nembedded in Large Language Models (LLMs). Namely, I administer 11 political\norientation tests, designed to identify the political preferences of the test\ntaker, to 24 state-of-the-art conversational LLMs, both closed and open source.\nWhen probed with questions/statements with political connotations, most\nconversational LLMs tend to generate responses that are diagnosed by most\npolitical test instruments as manifesting preferences for left-of-center\nviewpoints. This does not appear to be the case for five additional base (i.e.\nfoundation) models upon which LLMs optimized for conversation with humans are\nbuilt. However, the weak performance of the base models at coherently answering\nthe tests' questions makes this subset of results inconclusive. Finally, I\ndemonstrate that LLMs can be steered towards specific locations in the\npolitical spectrum through Supervised Fine-Tuning (SFT) with only modest\namounts of politically aligned data, suggesting SFT's potential to embed\npolitical orientation in LLMs. With LLMs beginning to partially displace\ntraditional information sources like search engines and Wikipedia, the societal\nimplications of political biases embedded in LLMs are substantial.\n","authors":["David Rozado"],"pdf_url":"https://arxiv.org/pdf/2402.01789v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.12999v4","updated":"2024-06-02T03:57:06Z","published":"2023-12-20T12:59:31Z","title":"Machine Mindset: An MBTI Exploration of Large Language Models","summary":"  We present a novel approach for integrating Myers-Briggs Type Indicator\n(MBTI) personality traits into large language models (LLMs), addressing the\nchallenges of personality consistency in personalized AI. Our method, \"Machine\nMindset,\" involves a two-phase fine-tuning and Direct Preference Optimization\n(DPO) to embed MBTI traits into LLMs. This approach ensures that models\ninternalize these traits, offering a stable and consistent personality profile.\nWe demonstrate the effectiveness of our models across various domains, showing\nalignment between model performance and their respective MBTI traits. The paper\nhighlights significant contributions in the development of personality datasets\nand a new training methodology for personality integration in LLMs, enhancing\nthe potential for personalized AI applications. We also open-sourced our model\nand part of the data at \\url{https://github.com/PKU-YuanGroup/Machine-Mindset}.\n","authors":["Jiaxi Cui","Liuzhenghao Lv","Jing Wen","Rongsheng Wang","Jing Tang","YongHong Tian","Li Yuan"],"pdf_url":"https://arxiv.org/pdf/2312.12999v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.17820v4","updated":"2024-06-02T03:48:21Z","published":"2023-06-30T17:38:10Z","title":"Meta-Reasoning: Semantics-Symbol Deconstruction for Large Language\n  Models","summary":"  Neural-symbolic methods have demonstrated efficiency in enhancing the\nreasoning abilities of large language models (LLMs). However, existing methods\nmainly rely on syntactically mapping natural languages to complete formal\nlanguages like Python and SQL. Those methods require that reasoning tasks be\nconvertible into programs, which cater to the computer execution mindset and\ndeviate from human reasoning habits. To broaden symbolic methods' applicability\nand adaptability in the real world, we propose the Meta-Reasoning from a\nlinguistic perspective. This method empowers LLMs to deconstruct\nreasoning-independent semantic information into generic symbolic\nrepresentations, thereby efficiently capturing more generalized reasoning\nknowledge. We conduct extensive experiments on more than ten datasets\nencompassing conventional reasoning tasks like arithmetic, symbolic, and\nlogical reasoning, and the more complex interactive reasoning tasks like\ntheory-of-mind reasoning. Experimental results demonstrate that Meta-Reasoning\nsignificantly enhances in-context reasoning accuracy, learning efficiency,\nout-of-domain generalization, and output stability compared to the\nChain-of-Thought technique. Code and data are publicly available at\n\\url{https://github.com/Alsace08/Meta-Reasoning}.\n","authors":["Yiming Wang","Zhuosheng Zhang","Pei Zhang","Baosong Yang","Rui Wang"],"pdf_url":"https://arxiv.org/pdf/2306.17820v4.pdf","comment":"Accepted by ACL 2024 Findings"},{"id":"http://arxiv.org/abs/2403.09732v4","updated":"2024-06-02T02:58:53Z","published":"2024-03-13T02:32:41Z","title":"PET-SQL: A Prompt-Enhanced Two-Round Refinement of Text-to-SQL with\n  Cross-consistency","summary":"  Recent advancements in Text-to-SQL (Text2SQL) emphasize stimulating the large\nlanguage models (LLM) on in-context learning, achieving significant results.\nNevertheless, they face challenges when dealing with verbose database\ninformation and complex user intentions. This paper presents a two-stage\nframework to enhance the performance of current LLM-based natural language to\nSQL systems. We first introduce a novel prompt representation, called\nreference-enhanced representation, which includes schema information and\nrandomly sampled cell values from tables to instruct LLMs in generating SQL\nqueries. Then, in the first stage, question-SQL pairs are retrieved as few-shot\ndemonstrations, prompting the LLM to generate a preliminary SQL (PreSQL). After\nthat, the mentioned entities in PreSQL are parsed to conduct schema linking,\nwhich can significantly compact the useful information. In the second stage,\nwith the linked schema, we simplify the prompt's schema information and\ninstruct the LLM to produce the final SQL. Finally, as the post-refinement\nmodule, we propose using cross-consistency across different LLMs rather than\nself-consistency within a particular LLM. Our methods achieve new SOTA results\non the Spider benchmark, with an execution accuracy of 87.6%.\n","authors":["Zhishuai Li","Xiang Wang","Jingjing Zhao","Sun Yang","Guoqing Du","Xiaoru Hu","Bin Zhang","Yuxiao Ye","Ziyue Li","Rui Zhao","Hangyu Mao"],"pdf_url":"https://arxiv.org/pdf/2403.09732v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.14168v2","updated":"2024-06-02T02:44:32Z","published":"2024-03-21T06:43:59Z","title":"M$^3$AV: A Multimodal, Multigenre, and Multipurpose Audio-Visual\n  Academic Lecture Dataset","summary":"  Publishing open-source academic video recordings is an emergent and prevalent\napproach to sharing knowledge online. Such videos carry rich multimodal\ninformation including speech, the facial and body movements of the speakers, as\nwell as the texts and pictures in the slides and possibly even the papers.\nAlthough multiple academic video datasets have been constructed and released,\nfew of them support both multimodal content recognition and understanding\ntasks, which is partially due to the lack of high-quality human annotations. In\nthis paper, we propose a novel multimodal, multigenre, and multipurpose\naudio-visual academic lecture dataset (M$^3$AV), which has almost 367 hours of\nvideos from five sources covering computer science, mathematics, and medical\nand biology topics. With high-quality human annotations of the slide text and\nspoken words, in particular high-valued name entities, the dataset can be used\nfor multiple audio-visual recognition and understanding tasks. Evaluations\nperformed on contextual speech recognition, speech synthesis, and slide and\nscript generation tasks demonstrate that the diversity of M$^3$AV makes it a\nchallenging dataset.\n","authors":["Zhe Chen","Heyang Liu","Wenyi Yu","Guangzhi Sun","Hongcheng Liu","Ji Wu","Chao Zhang","Yu Wang","Yanfeng Wang"],"pdf_url":"https://arxiv.org/pdf/2403.14168v2.pdf","comment":"ACL 2024 Main Conference. Project website:\n  https://jack-zc8.github.io/M3AV-dataset-page"},{"id":"http://arxiv.org/abs/2405.19086v2","updated":"2024-06-02T02:32:31Z","published":"2024-05-29T13:49:44Z","title":"MEMoE: Enhancing Model Editing with Mixture of Experts Adaptors","summary":"  Model editing aims to efficiently alter the behavior of Large Language Models\n(LLMs) within a desired scope, while ensuring no adverse impact on other\ninputs. Recent years have witnessed various model editing methods been\nproposed. However, these methods either exhibit poor overall performance or\nstruggle to strike a balance between generalization and locality. We propose\nMEMoE, a model editing adapter utilizing a Mixture of Experts (MoE)\narchitecture with a knowledge anchor routing strategy. MEMoE updates knowledge\nusing a bypass MoE structure, keeping the original parameters unchanged to\npreserve the general ability of LLMs. And, the knowledge anchor routing ensures\nthat inputs requiring similar knowledge are routed to the same expert, thereby\nenhancing the generalization of the updated knowledge. Experimental results\nshow the superiority of our approach over both batch editing and sequential\nbatch editing tasks, exhibiting exceptional overall performance alongside\noutstanding balance between generalization and locality. Our code will be\navailable.\n","authors":["Renzhi Wang","Piji Li"],"pdf_url":"https://arxiv.org/pdf/2405.19086v2.pdf","comment":null}],"Computer Vision and Pattern Recognition":[{"id":"http://arxiv.org/abs/2405.18672v2","updated":"2024-06-02T23:30:46Z","published":"2024-05-29T00:36:56Z","title":"LLM-based Hierarchical Concept Decomposition for Interpretable\n  Fine-Grained Image Classification","summary":"  (Renyi Qu's Master's Thesis) Recent advancements in interpretable models for\nvision-language tasks have achieved competitive performance; however, their\ninterpretability often suffers due to the reliance on unstructured text outputs\nfrom large language models (LLMs). This introduces randomness and compromises\nboth transparency and reliability, which are essential for addressing safety\nissues in AI systems. We introduce \\texttt{Hi-CoDe} (Hierarchical Concept\nDecomposition), a novel framework designed to enhance model interpretability\nthrough structured concept analysis. Our approach consists of two main\ncomponents: (1) We use GPT-4 to decompose an input image into a structured\nhierarchy of visual concepts, thereby forming a visual concept tree. (2) We\nthen employ an ensemble of simple linear classifiers that operate on\nconcept-specific features derived from CLIP to perform classification. Our\napproach not only aligns with the performance of state-of-the-art models but\nalso advances transparency by providing clear insights into the decision-making\nprocess and highlighting the importance of various concepts. This allows for a\ndetailed analysis of potential failure modes and improves model compactness,\ntherefore setting a new benchmark in interpretability without compromising the\naccuracy.\n","authors":["Renyi Qu","Mark Yatskar"],"pdf_url":"https://arxiv.org/pdf/2405.18672v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.03458v2","updated":"2024-06-02T23:04:43Z","published":"2024-03-06T04:49:02Z","title":"Slot Abstractors: Toward Scalable Abstract Visual Reasoning","summary":"  Abstract visual reasoning is a characteristically human ability, allowing the\nidentification of relational patterns that are abstracted away from object\nfeatures, and the systematic generalization of those patterns to unseen\nproblems. Recent work has demonstrated strong systematic generalization in\nvisual reasoning tasks involving multi-object inputs, through the integration\nof slot-based methods used for extracting object-centric representations\ncoupled with strong inductive biases for relational abstraction. However, this\napproach was limited to problems containing a single rule, and was not scalable\nto visual reasoning problems containing a large number of objects. Other recent\nwork proposed Abstractors, an extension of Transformers that incorporates\nstrong relational inductive biases, thereby inheriting the Transformer's\nscalability and multi-head architecture, but it has yet to be demonstrated how\nthis approach might be applied to multi-object visual inputs. Here we combine\nthe strengths of the above approaches and propose Slot Abstractors, an approach\nto abstract visual reasoning that can be scaled to problems involving a large\nnumber of objects and multiple relations among them. The approach displays\nstate-of-the-art performance across four abstract visual reasoning tasks, as\nwell as an abstract reasoning task involving real-world images.\n","authors":["Shanka Subhra Mondal","Jonathan D. Cohen","Taylor W. Webb"],"pdf_url":"https://arxiv.org/pdf/2403.03458v2.pdf","comment":"18 pages, 9 figures"},{"id":"http://arxiv.org/abs/2405.16517v2","updated":"2024-06-02T22:05:39Z","published":"2024-05-26T11:01:39Z","title":"Sp2360: Sparse-view 360 Scene Reconstruction using Cascaded 2D Diffusion\n  Priors","summary":"  We aim to tackle sparse-view reconstruction of a 360 3D scene using priors\nfrom latent diffusion models (LDM). The sparse-view setting is ill-posed and\nunderconstrained, especially for scenes where the camera rotates 360 degrees\naround a point, as no visual information is available beyond some frontal views\nfocused on the central object(s) of interest. In this work, we show that\npretrained 2D diffusion models can strongly improve the reconstruction of a\nscene with low-cost fine-tuning. Specifically, we present SparseSplat360\n(Sp2360), a method that employs a cascade of in-painting and artifact removal\nmodels to fill in missing details and clean novel views. Due to superior\ntraining and rendering speeds, we use an explicit scene representation in the\nform of 3D Gaussians over NeRF-based implicit representations. We propose an\niterative update strategy to fuse generated pseudo novel views with existing 3D\nGaussians fitted to the initial sparse inputs. As a result, we obtain a\nmulti-view consistent scene representation with details coherent with the\nobserved inputs. Our evaluation on the challenging Mip-NeRF360 dataset shows\nthat our proposed 2D to 3D distillation algorithm considerably improves the\nperformance of a regularized version of 3DGS adapted to a sparse-view setting\nand outperforms existing sparse-view reconstruction methods in 360 scene\nreconstruction. Qualitatively, our method generates entire 360 scenes from as\nfew as 9 input views, with a high degree of foreground and background detail.\n","authors":["Soumava Paul","Christopher Wewer","Bernt Schiele","Jan Eric Lenssen"],"pdf_url":"https://arxiv.org/pdf/2405.16517v2.pdf","comment":"18 pages, 11 figures, 4 tables"},{"id":"http://arxiv.org/abs/2307.03887v3","updated":"2024-06-02T21:30:13Z","published":"2023-07-08T03:42:54Z","title":"Improving Prototypical Part Networks with Reward Reweighing,\n  Reselection, and Retraining","summary":"  In recent years, work has gone into developing deep interpretable methods for\nimage classification that clearly attributes a model's output to specific\nfeatures of the data. One such of these methods is the Prototypical Part\nNetwork (ProtoPNet), which attempts to classify images based on meaningful\nparts of the input. While this architecture is able to produce visually\ninterpretable classifications, it often learns to classify based on parts of\nthe image that are not semantically meaningful. To address this problem, we\npropose the Reward Reweighing, Reselecting, and Retraining (R3) post-processing\nframework, which performs three additional corrective updates to a pretrained\nProtoPNet in an offline and efficient manner. The first two steps involve\nlearning a reward model based on collected human feedback and then aligning the\nprototypes with human preferences. The final step is retraining, which realigns\nthe base features and the classifier layer of the original model with the\nupdated prototypes. We find that our R3 framework consistently improves both\nthe interpretability and the predictive accuracy of ProtoPNet and its variants.\n","authors":["Aaron J. Li","Robin Netzorg","Zhihan Cheng","Zhuoqin Zhang","Bin Yu"],"pdf_url":"https://arxiv.org/pdf/2307.03887v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.18316v3","updated":"2024-06-02T20:57:53Z","published":"2024-04-28T20:57:55Z","title":"Position: Do Not Explain Vision Models Without Context","summary":"  Does the stethoscope in the picture make the adjacent person a doctor or a\npatient? This, of course, depends on the contextual relationship of the two\nobjects. If it's obvious, why don't explanation methods for vision models use\ncontextual information? In this paper, we (1) review the most popular methods\nof explaining computer vision models by pointing out that they do not take into\naccount context information, (2) show examples of failures of popular XAI\nmethods, (3) provide examples of real-world use cases where spatial context\nplays a significant role, (4) propose new research directions that may lead to\nbetter use of context information in explaining computer vision models, (5)\nargue that a change in approach to explanations is needed from 'where' to\n'how'.\n","authors":["Paulina Tomaszewska","Przemysław Biecek"],"pdf_url":"https://arxiv.org/pdf/2404.18316v3.pdf","comment":"Accepted at International Conference on Machine Learning (ICML) 2024"},{"id":"http://arxiv.org/abs/2310.06085v3","updated":"2024-06-02T20:12:11Z","published":"2023-08-20T22:27:54Z","title":"Quantile-based Maximum Likelihood Training for Outlier Detection","summary":"  Discriminative learning effectively predicts true object class for image\nclassification. However, it often results in false positives for outliers,\nposing critical concerns in applications like autonomous driving and video\nsurveillance systems. Previous attempts to address this challenge involved\ntraining image classifiers through contrastive learning using actual outlier\ndata or synthesizing outliers for self-supervised learning. Furthermore,\nunsupervised generative modeling of inliers in pixel space has shown limited\nsuccess for outlier detection. In this work, we introduce a quantile-based\nmaximum likelihood objective for learning the inlier distribution to improve\nthe outlier separation during inference. Our approach fits a normalizing flow\nto pre-trained discriminative features and detects the outliers according to\nthe evaluated log-likelihood. The experimental evaluation demonstrates the\neffectiveness of our method as it surpasses the performance of the\nstate-of-the-art unsupervised methods for outlier detection. The results are\nalso competitive compared with a recent self-supervised approach for outlier\ndetection. Our work allows to reduce dependency on well-sampled negative\ntraining data, which is especially important for domains like medical\ndiagnostics or remote sensing.\n","authors":["Masoud Taghikhah","Nishant Kumar","Siniša Šegvić","Abouzar Eslami","Stefan Gumhold"],"pdf_url":"https://arxiv.org/pdf/2310.06085v3.pdf","comment":"Camera Ready Version. Accepted at AAAI 2024. Code available at\n  https://github.com/taghikhah/QuantOD"},{"id":"http://arxiv.org/abs/2311.17833v2","updated":"2024-06-02T19:18:37Z","published":"2023-11-29T17:35:29Z","title":"DiG-IN: Diffusion Guidance for Investigating Networks -- Uncovering\n  Classifier Differences, Neuron Visualisations, and Visual Counterfactual\n  Explanations","summary":"  While deep learning has led to huge progress in complex image classification\ntasks like ImageNet, unexpected failure modes, e.g. via spurious features, call\ninto question how reliably these classifiers work in the wild. Furthermore, for\nsafety-critical tasks the black-box nature of their decisions is problematic,\nand explanations or at least methods which make decisions plausible are needed\nurgently. In this paper, we address these problems by generating images that\noptimize a classifier-derived objective using a framework for guided image\ngeneration. We analyze the decisions of image classifiers by visual\ncounterfactual explanations (VCEs), detection of systematic mistakes by\nanalyzing images where classifiers maximally disagree, and visualization of\nneurons and spurious features. In this way, we validate existing observations,\ne.g. the shape bias of adversarially robust models, as well as novel failure\nmodes, e.g. systematic errors of zero-shot CLIP classifiers. Moreover, our VCEs\noutperform previous work while being more versatile.\n","authors":["Maximilian Augustin","Yannic Neuhaus","Matthias Hein"],"pdf_url":"https://arxiv.org/pdf/2311.17833v2.pdf","comment":"CVPR 2024"},{"id":"http://arxiv.org/abs/2402.12843v3","updated":"2024-06-02T18:08:19Z","published":"2024-02-20T09:13:11Z","title":"Solar Panel Segmentation :Self-Supervised Learning Solutions for\n  Imperfect Datasets","summary":"  The increasing adoption of solar energy necessitates advanced methodologies\nfor monitoring and maintenance to ensure optimal performance of solar panel\ninstallations. A critical component in this context is the accurate\nsegmentation of solar panels from aerial or satellite imagery, which is\nessential for identifying operational issues and assessing efficiency. This\npaper addresses the significant challenges in panel segmentation, particularly\nthe scarcity of annotated data and the labour-intensive nature of manual\nannotation for supervised learning. We explore and apply Self-Supervised\nLearning (SSL) to solve these challenges. We demonstrate that SSL significantly\nenhances model generalization under various conditions and reduces dependency\non manually annotated data, paving the way for robust and adaptable solar panel\nsegmentation solutions.\n","authors":["Sankarshanaa Sagaram","Krish Didwania","Laven Srivastava","Aditya Kasliwal","Pallavi Kailas","Ujjwal Verma"],"pdf_url":"https://arxiv.org/pdf/2402.12843v3.pdf","comment":"Published at ICLR Tiny Paper 2024"},{"id":"http://arxiv.org/abs/2402.11248v4","updated":"2024-06-02T17:34:18Z","published":"2024-02-17T11:03:02Z","title":"CoLLaVO: Crayon Large Language and Vision mOdel","summary":"  The remarkable success of Large Language Models (LLMs) and instruction tuning\ndrives the evolution of Vision Language Models (VLMs) towards a versatile\ngeneral-purpose model. Yet, it remains unexplored whether current VLMs\ngenuinely possess quality object-level image understanding capabilities\ndetermined from 'what objects are in the image?' or 'which object corresponds\nto a specified bounding box?'. Our findings reveal that the image understanding\ncapabilities of current VLMs are strongly correlated with their zero-shot\nperformance on vision language (VL) tasks. This suggests that prioritizing\nbasic image understanding is crucial for VLMs to excel at VL tasks. To enhance\nobject-level image understanding, we propose Crayon Large Language and Vision\nmOdel (CoLLaVO), which incorporates instruction tuning with Crayon Prompt as a\nnew visual prompt tuning scheme based on panoptic color maps. Furthermore, we\npresent a learning strategy of Dual QLoRA to preserve object-level image\nunderstanding without forgetting it during visual instruction tuning, thereby\nachieving a significant leap in numerous VL benchmarks in a zero-shot setting.\n","authors":["Byung-Kwan Lee","Beomchan Park","Chae Won Kim","Yong Man Ro"],"pdf_url":"https://arxiv.org/pdf/2402.11248v4.pdf","comment":"ACL 2024 Findings. Code available:\n  https://github.com/ByungKwanLee/CoLLaVO"},{"id":"http://arxiv.org/abs/2403.03173v7","updated":"2024-06-02T16:35:23Z","published":"2024-03-05T18:08:29Z","title":"Solving the bongard-logo problem by modeling a probabilistic model","summary":"  Abstract reasoning problems pose challenges to the perception and cognition\nabilities of AI algorithms, demanding deeper pattern recognition and inductive\nreasoning beyond mere identification of explicit image features. In this study,\nwe introduce PMoC, a probabilistic model tailored for the Bongard-Logo problem,\nachieving high reasoning accuracy through the construction of an conditional\nprobabilistic model. Additionally, we have designed the Pose-Transformer, an\nenhanced Transformer-Encoder specifically crafted for complex abstract\nreasoning tasks, including Bongard-Logo, RAVEN, I-RAVEN, and PGM. Inspired by\nthe pose matrix in capsule networks, Pose-Transformer strengthens the focus on\npositional relationships between local features when processing image data.\nWhen combined with PMoC, it can further enhance reasoning accuracy. Our\nPose-Transformer effectively addresses reasoning difficulties associated with\nchanges in the position of abstract entities, outperforming previous models on\nRAVEN's OIG, D3$\\times$3 subsets, and the PGM dataset. Finally, considering the\ndeployment difficulties arising from the large number of Pose-Transformer\nparameters, this paper presents a lightweight version, Straw-Pose-Transformer,\nwhich maintains performance while significantly reducing the parameter count.\nThis study contributes to enhancing AI capabilities in abstract reasoning and\ncognitive pattern recognition.\n","authors":["Ruizhuo Song","Beiming Yuan"],"pdf_url":"https://arxiv.org/pdf/2403.03173v7.pdf","comment":"14 pages, 11 figures, 4 tables"},{"id":"http://arxiv.org/abs/2404.13756v2","updated":"2024-06-02T16:29:39Z","published":"2024-04-21T19:42:28Z","title":"BC-MRI-SEG: A Breast Cancer MRI Tumor Segmentation Benchmark","summary":"  Binary breast cancer tumor segmentation with Magnetic Resonance Imaging (MRI)\ndata is typically trained and evaluated on private medical data, which makes\ncomparing deep learning approaches difficult. We propose a benchmark\n(BC-MRI-SEG) for binary breast cancer tumor segmentation based on publicly\navailable MRI datasets. The benchmark consists of four datasets in total, where\ntwo datasets are used for supervised training and evaluation, and two are used\nfor zero-shot evaluation. Additionally we compare state-of-the-art (SOTA)\napproaches on our benchmark and provide an exhaustive list of available public\nbreast cancer MRI datasets. The source code has been made available at\nhttps://irulenot.github.io/BC_MRI_SEG_Benchmark.\n","authors":["Anthony Bilic","Chen Chen"],"pdf_url":"https://arxiv.org/pdf/2404.13756v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.13331v2","updated":"2024-06-02T16:08:17Z","published":"2024-05-22T04:20:30Z","title":"Comparative Analysis of Hyperspectral Image Reconstruction Using Deep\n  Learning for Agricultural and Biological Applications","summary":"  Hyperspectral imaging (HSI) has become a key technology for non-invasive\nquality evaluation in various fields, offering detailed insights through\nspatial and spectral data. Despite its efficacy, the complexity and high cost\nof HSI systems have hindered their widespread adoption. This study addressed\nthese challenges by exploring deep learning-based hyperspectral image\nreconstruction from RGB (Red, Green, Blue) images, particularly for\nagricultural products. Specifically, different hyperspectral reconstruction\nalgorithms, such as Hyperspectral Convolutional Neural Network - Dense\n(HSCNN-D), High-Resolution Network (HRNET), and Multi-Scale Transformer Plus\nPlus (MST++), were compared to assess the dry matter content of sweet potatoes.\nAmong the tested reconstruction methods, HRNET demonstrated superior\nperformance, achieving the lowest mean relative absolute error (MRAE) of 0.07,\nroot mean square error (RMSE) of 0.03, and the highest peak signal-to-noise\nratio (PSNR) of 32.28 decibels (dB). Some key features were selected using the\ngenetic algorithm (GA), and their importance was interpreted using explainable\nartificial intelligence (XAI). Partial least squares regression (PLSR) models\nwere developed using the RGB, reconstructed, and ground truth (GT) data. The\nvisual and spectra quality of these reconstructed methods was compared with GT\ndata, and predicted maps were generated. The results revealed the prospect of\ndeep learning-based hyperspectral image reconstruction as a cost-effective and\nefficient quality assessment tool for agricultural and biological applications.\n","authors":["Md. Toukir Ahmed","Arthur Villordon","Mohammed Kamruzzaman"],"pdf_url":"https://arxiv.org/pdf/2405.13331v2.pdf","comment":"Under review"},{"id":"http://arxiv.org/abs/2303.12130v2","updated":"2024-06-02T15:50:37Z","published":"2023-03-21T18:40:59Z","title":"MV-MR: multi-views and multi-representations for self-supervised\n  learning and knowledge distillation","summary":"  We present a new method of self-supervised learning and knowledge\ndistillation based on the multi-views and multi-representations (MV-MR). The\nMV-MR is based on the maximization of dependence between learnable embeddings\nfrom augmented and non-augmented views, jointly with the maximization of\ndependence between learnable embeddings from augmented view and multiple\nnon-learnable representations from non-augmented view. We show that the\nproposed method can be used for efficient self-supervised classification and\nmodel-agnostic knowledge distillation. Unlike other self-supervised techniques,\nour approach does not use any contrastive learning, clustering, or stop\ngradients. MV-MR is a generic framework allowing the incorporation of\nconstraints on the learnable embeddings via the usage of image\nmulti-representations as regularizers. Along this line, knowledge distillation\nis considered a particular case of such a regularization. MV-MR provides the\nstate-of-the-art performance on the STL10 and ImageNet-1K datasets among\nnon-contrastive and clustering-free methods. We show that a lower complexity\nResNet50 model pretrained using proposed knowledge distillation based on the\nCLIP ViT model achieves state-of-the-art performance on STL10 linear\nevaluation. The code is available at: https://github.com/vkinakh/mv-mr\n","authors":["Vitaliy Kinakh","Mariia Drozdova","Slava Voloshynovskiy"],"pdf_url":"https://arxiv.org/pdf/2303.12130v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.10182v3","updated":"2024-06-02T15:47:23Z","published":"2023-07-02T11:09:08Z","title":"Enhancing Super-Resolution Networks through Realistic Thick-Slice CT\n  Simulation","summary":"  Deep learning-based Generative Models have the potential to convert\nlow-resolution CT images into high-resolution counterparts without long\nacquisition times and increased radiation exposure in thin-slice CT imaging.\nHowever, procuring appropriate training data for these Super-Resolution (SR)\nmodels is challenging. Previous SR research has simulated thick-slice CT images\nfrom thin-slice CT images to create training pairs. However, these methods\neither rely on simplistic interpolation techniques that lack realism or\nsinogram reconstruction, which require the release of raw data and complex\nreconstruction algorithms. Thus, we introduce a simple yet realistic method to\ngenerate thick CT images from thin-slice CT images, facilitating the creation\nof training pairs for SR algorithms. The training pairs produced by our method\nclosely resemble real data distributions (PSNR=49.74 vs. 40.66, p$<$0.05). A\nmultivariate Cox regression analysis involving thick slice CT images with lung\nfibrosis revealed that only the radiomics features extracted using our method\ndemonstrated a significant correlation with mortality (HR=1.19 and HR=1.14,\np$<$0.005). This paper represents the first to identify and address the\nchallenge of generating appropriate paired training data for Deep\nLearning-based CT SR models, which enhances the efficacy and applicability of\nSR models in real-world scenarios.\n","authors":["Zeyu Tang","Xiaodan Xing","Guang Yang"],"pdf_url":"https://arxiv.org/pdf/2307.10182v3.pdf","comment":"11 pages, 4 figures"},{"id":"http://arxiv.org/abs/2403.00231v3","updated":"2024-06-02T15:47:16Z","published":"2024-03-01T02:21:30Z","title":"Multimodal ArXiv: A Dataset for Improving Scientific Comprehension of\n  Large Vision-Language Models","summary":"  Large vision-language models (LVLMs) excel across diverse tasks involving\nconcrete images from natural scenes. However, their ability to interpret\nabstract figures, such as geometry shapes and scientific plots, remains limited\ndue to a scarcity of training datasets in scientific domains. To fill this gap,\nwe introduce Multimodal ArXiv, consisting of ArXivCap and ArXivQA, for\nenhancing LVLMs scientific comprehension. ArXivCap is a figure-caption dataset\ncomprising 6.4M images and 3.9M captions, sourced from 572K ArXiv papers\nspanning various scientific domains. Drawing from ArXivCap, we introduce\nArXivQA, a question-answering dataset generated by prompting GPT-4V based on\nscientific figures. ArXivQA greatly enhances open-sourced LVLMs' mathematical\nreasoning capabilities, achieving a 10.4\\% absolute accuracy gain on a\nmultimodal mathematical reasoning benchmark. Furthermore, employing ArXivCap,\nwe devise four vision-to-text tasks for benchmarking LVLMs. Evaluation results\nwith state-of-the-art LVLMs underscore their struggle with the nuanced\nsemantics of academic figures, while domain-specific training yields\nsubstantial performance gains. Our error analysis uncovers misinterpretations\nof visual context, recognition errors, and the production of overly simplified\ncaptions by current LVLMs, shedding light on future improvements.\n","authors":["Lei Li","Yuqi Wang","Runxin Xu","Peiyi Wang","Xiachong Feng","Lingpeng Kong","Qi Liu"],"pdf_url":"https://arxiv.org/pdf/2403.00231v3.pdf","comment":"Project page: https://mm-arxiv.github.io, Camera Ready Version of ACL\n  2024"},{"id":"http://arxiv.org/abs/2405.19076v2","updated":"2024-06-02T15:03:24Z","published":"2024-05-29T13:34:32Z","title":"Cephalo: Multi-Modal Vision-Language Models for Bio-Inspired Materials\n  Analysis and Design","summary":"  We present Cephalo, a series of multimodal vision large language models\n(V-LLMs) designed for materials science applications, integrating visual and\nlinguistic data for enhanced understanding and interaction within human-AI and\nmulti-agent AI frameworks. A key innovation of Cephalo is its advanced dataset\ngeneration method, which employs a sophisticated algorithm to accurately detect\nand separate images and their corresponding textual descriptions from PDF\ndocuments, such as scientific papers. The method includes a careful refinement\nof image-text pairs through integrated vision and language processing, ensuring\nhigh-quality, contextually relevant, and well reasoned training data. Cephalo\nis trained on integrated image and text data extracted from thousands of\nscientific papers and science-focused Wikipedia pages demonstrates can\ninterpret complex visual scenes, generate precise language descriptions, and\nanswer queries about images effectively. The combination of a vision encoder\nwith an autoregressive transformer supports complex natural language\nunderstanding in an integrated model, which can be coupled with other\ngenerative methods to create an image-to-text-to-image or image-to-text-to-3D\npipeline. To explore the development of larger models from smaller ones, we\nreport both mixture-of-expert methods and model merging. These hybrid\napproaches allow us to leverage the domain-specific expertise and general\nconversational capabilities to harness the strengths of multiple models. We\nexamine the models in diverse use cases that incorporate biological materials,\nfracture and engineering analysis, protein biophysics, and bio-inspired design\nbased on insect behavior. Generative applications include bio-inspired designs,\nincluding pollen-inspired architected materials, as well as the synthesis of\nbio-inspired material microstructures from a photograph of a solar eclipse.\n","authors":["Markus J. Buehler"],"pdf_url":"https://arxiv.org/pdf/2405.19076v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.04965v2","updated":"2024-06-02T14:31:09Z","published":"2024-03-08T00:30:25Z","title":"StereoDiffusion: Training-Free Stereo Image Generation Using Latent\n  Diffusion Models","summary":"  The demand for stereo images increases as manufacturers launch more XR\ndevices. To meet this demand, we introduce StereoDiffusion, a method that,\nunlike traditional inpainting pipelines, is trainning free, remarkably\nstraightforward to use, and it seamlessly integrates into the original Stable\nDiffusion model. Our method modifies the latent variable to provide an\nend-to-end, lightweight capability for fast generation of stereo image pairs,\nwithout the need for fine-tuning model weights or any post-processing of\nimages. Using the original input to generate a left image and estimate a\ndisparity map for it, we generate the latent vector for the right image through\nStereo Pixel Shift operations, complemented by Symmetric Pixel Shift Masking\nDenoise and Self-Attention Layers Modification methods to align the right-side\nimage with the left-side image. Moreover, our proposed method maintains a high\nstandard of image quality throughout the stereo generation process, achieving\nstate-of-the-art scores in various quantitative evaluations.\n","authors":["Lezhong Wang","Jeppe Revall Frisvad","Mark Bo Jensen","Siavash Arjomand Bigdeli"],"pdf_url":"https://arxiv.org/pdf/2403.04965v2.pdf","comment":"Updated to CVPR 2024 GCV accepted version"},{"id":"http://arxiv.org/abs/2405.18715v2","updated":"2024-06-02T14:15:27Z","published":"2024-05-29T02:53:40Z","title":"NeRF On-the-go: Exploiting Uncertainty for Distractor-free NeRFs in the\n  Wild","summary":"  Neural Radiance Fields (NeRFs) have shown remarkable success in synthesizing\nphotorealistic views from multi-view images of static scenes, but face\nchallenges in dynamic, real-world environments with distractors like moving\nobjects, shadows, and lighting changes. Existing methods manage controlled\nenvironments and low occlusion ratios but fall short in render quality,\nespecially under high occlusion scenarios. In this paper, we introduce NeRF\nOn-the-go, a simple yet effective approach that enables the robust synthesis of\nnovel views in complex, in-the-wild scenes from only casually captured image\nsequences. Delving into uncertainty, our method not only efficiently eliminates\ndistractors, even when they are predominant in captures, but also achieves a\nnotably faster convergence speed. Through comprehensive experiments on various\nscenes, our method demonstrates a significant improvement over state-of-the-art\ntechniques. This advancement opens new avenues for NeRF in diverse and dynamic\nreal-world applications.\n","authors":["Weining Ren","Zihan Zhu","Boyang Sun","Jiaqi Chen","Marc Pollefeys","Songyou Peng"],"pdf_url":"https://arxiv.org/pdf/2405.18715v2.pdf","comment":"CVPR 2024, first two authors contributed equally. Project Page:\n  https://rwn17.github.io/nerf-on-the-go/"},{"id":"http://arxiv.org/abs/2310.17109v2","updated":"2024-06-02T12:38:47Z","published":"2023-10-26T02:37:08Z","title":"LP-OVOD: Open-Vocabulary Object Detection by Linear Probing","summary":"  This paper addresses the challenging problem of open-vocabulary object\ndetection (OVOD) where an object detector must identify both seen and unseen\nclasses in test images without labeled examples of the unseen classes in\ntraining. A typical approach for OVOD is to use joint text-image embeddings of\nCLIP to assign box proposals to their closest text label. However, this method\nhas a critical issue: many low-quality boxes, such as over- and\nunder-covered-object boxes, have the same similarity score as high-quality\nboxes since CLIP is not trained on exact object location information. To\naddress this issue, we propose a novel method, LP-OVOD, that discards\nlow-quality boxes by training a sigmoid linear classifier on pseudo labels\nretrieved from the top relevant region proposals to the novel text.\nExperimental results on COCO affirm the superior performance of our approach\nover the state of the art, achieving $\\textbf{40.5}$ in $\\text{AP}_{novel}$\nusing ResNet50 as the backbone and without external datasets or knowing novel\nclasses during training. Our code will be available at\nhttps://github.com/VinAIResearch/LP-OVOD.\n","authors":["Chau Pham","Truong Vu","Khoi Nguyen"],"pdf_url":"https://arxiv.org/pdf/2310.17109v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.04314v2","updated":"2024-06-02T11:32:19Z","published":"2023-12-07T14:11:00Z","title":"GPT4SGG: Synthesizing Scene Graphs from Holistic and Region-specific\n  Narratives","summary":"  Training Scene Graph Generation (SGG) models with natural language captions\nhas become increasingly popular due to the abundant, cost-effective, and\nopen-world generalization supervision signals that natural language offers.\nHowever, such unstructured caption data and its processing pose significant\nchallenges in learning accurate and comprehensive scene graphs. The challenges\ncan be summarized as three aspects: 1) traditional scene graph parsers based on\nlinguistic representation often fail to extract meaningful relationship\ntriplets from caption data. 2) grounding unlocalized objects of parsed triplets\nwill meet ambiguity issues in visual-language alignment. 3) caption data\ntypically are sparse and exhibit bias to partial observations of image content.\nAiming to address these problems, we propose a divide-and-conquer strategy with\na novel framework named \\textit{GPT4SGG}, to obtain more accurate and\ncomprehensive scene graph signals. This framework decomposes a complex scene\ninto a bunch of simple regions, resulting in a set of region-specific\nnarratives. With these region-specific narratives (partial observations) and a\nholistic narrative (global observation) for an image, a large language model\n(LLM) performs the relationship reasoning to synthesize an accurate and\ncomprehensive scene graph. Experimental results demonstrate \\textit{GPT4SGG}\nsignificantly improves the performance of SGG models trained on image-caption\ndata, in which the ambiguity issue and long-tail bias have been well-handled\nwith more accurate and comprehensive scene graphs.\n","authors":["Zuyao Chen","Jinlin Wu","Zhen Lei","Zhaoxiang Zhang","Changwen Chen"],"pdf_url":"https://arxiv.org/pdf/2312.04314v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.07133v2","updated":"2024-06-02T10:50:54Z","published":"2023-12-12T10:07:37Z","title":"LatentMan: Generating Consistent Animated Characters using Image\n  Diffusion Models","summary":"  We propose a zero-shot approach for generating consistent videos of animated\ncharacters based on Text-to-Image (T2I) diffusion models. Existing\nText-to-Video (T2V) methods are expensive to train and require large-scale\nvideo datasets to produce diverse characters and motions. At the same time,\ntheir zero-shot alternatives fail to produce temporally consistent videos with\ncontinuous motion. We strive to bridge this gap, and we introduce LatentMan,\nwhich leverages existing text-based motion diffusion models to generate diverse\ncontinuous motions to guide the T2I model. To boost the temporal consistency,\nwe introduce the Spatial Latent Alignment module that exploits cross-frame\ndense correspondences that we compute to align the latents of the video frames.\nFurthermore, we propose Pixel-Wise Guidance to steer the diffusion process in a\ndirection that minimizes visual discrepancies between frames. Our proposed\napproach outperforms existing zero-shot T2V approaches in generating videos of\nanimated characters in terms of pixel-wise consistency and user preference.\nProject page https://abdo-eldesokey.github.io/latentman/.\n","authors":["Abdelrahman Eldesokey","Peter Wonka"],"pdf_url":"https://arxiv.org/pdf/2312.07133v2.pdf","comment":"CVPRW 2024. Project page: https://abdo-eldesokey.github.io/latentman/"},{"id":"http://arxiv.org/abs/2405.09552v2","updated":"2024-06-02T10:49:47Z","published":"2024-04-15T11:49:37Z","title":"ODFormer: Semantic Fundus Image Segmentation Using Transformer for Optic\n  Nerve Head Detection","summary":"  Optic nerve head (ONH) detection has been a crucial area of study in\nophthalmology for years. However, the significant discrepancy between fundus\nimage datasets, each generated using a single type of fundus camera, poses\nchallenges to the generalizability of ONH detection approaches developed based\non semantic segmentation networks. Despite the numerous recent advancements in\ngeneral-purpose semantic segmentation methods using convolutional neural\nnetworks (CNNs) and Transformers, there is currently a lack of benchmarks for\nthese state-of-the-art (SoTA) networks specifically trained for ONH detection.\nTherefore, in this article, we make contributions from three key aspects:\nnetwork design, the publication of a dataset, and the establishment of a\ncomprehensive benchmark. Our newly developed ONH detection network, referred to\nas ODFormer, is based upon the Swin Transformer architecture and incorporates\ntwo novel components: a multi-scale context aggregator and a lightweight\nbidirectional feature recalibrator. Our published large-scale dataset, known as\nTongjiU-DROD, provides multi-resolution fundus images for each participant,\ncaptured using two distinct types of cameras. Our established benchmark\ninvolves three datasets: DRIONS-DB, DRISHTI-GS1, and TongjiU-DROD, created by\nresearchers from different countries and containing fundus images captured from\nparticipants of diverse races and ages. Extensive experimental results\ndemonstrate that our proposed ODFormer outperforms other state-of-the-art\n(SoTA) networks in terms of performance and generalizability. Our dataset and\nsource code are publicly available at mias.group/ODFormer.\n","authors":["Jiayi Wang","Yi-An Mao","Xiaoyu Ma","Sicen Guo","Yuting Shao","Xiao Lv","Wenting Han","Mark Christopher","Linda M. Zangwill","Yanlong Bi","Rui Fan"],"pdf_url":"https://arxiv.org/pdf/2405.09552v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2105.04014v2","updated":"2024-06-02T10:45:26Z","published":"2021-05-09T20:06:25Z","title":"DiagSet: a dataset for prostate cancer histopathological image\n  classification","summary":"  Cancer diseases constitute one of the most significant societal challenges.\nIn this paper, we introduce a novel histopathological dataset for prostate\ncancer detection. The proposed dataset, consisting of over 2.6 million tissue\npatches extracted from 430 fully annotated scans, 4675 scans with assigned\nbinary diagnoses, and 46 scans with diagnoses independently provided by a group\nof histopathologists can be found at\nhttps://github.com/michalkoziarski/DiagSet. Furthermore, we propose a machine\nlearning framework for detection of cancerous tissue regions and prediction of\nscan-level diagnosis, utilizing thresholding to abstain from the decision in\nuncertain cases. The proposed approach, composed of ensembles of deep neural\nnetworks operating on the histopathological scans at different scales, achieves\n94.6% accuracy in patch-level recognition and is compared in a scan-level\ndiagnosis with 9 human histopathologists showing high statistical agreement.\n","authors":["Michał Koziarski","Bogusław Cyganek","Przemysław Niedziela","Bogusław Olborski","Zbigniew Antosz","Marcin Żydak","Bogdan Kwolek","Paweł Wąsowicz","Andrzej Bukała","Jakub Swadźba","Piotr Sitkowski"],"pdf_url":"https://arxiv.org/pdf/2105.04014v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.02149v2","updated":"2024-06-02T10:39:45Z","published":"2024-02-03T13:35:39Z","title":"Improving Diffusion Models for Inverse Problems Using Optimal Posterior\n  Covariance","summary":"  Recent diffusion models provide a promising zero-shot solution to noisy\nlinear inverse problems without retraining for specific inverse problems. In\nthis paper, we reveal that recent methods can be uniformly interpreted as\nemploying a Gaussian approximation with hand-crafted isotropic covariance for\nthe intractable denoising posterior to approximate the conditional posterior\nmean. Inspired by this finding, we propose to improve recent methods by using\nmore principled covariance determined by maximum likelihood estimation. To\nachieve posterior covariance optimization without retraining, we provide\ngeneral plug-and-play solutions based on two approaches specifically designed\nfor leveraging pre-trained models with and without reverse covariance. We\nfurther propose a scalable method for learning posterior covariance prediction\nbased on representation with orthonormal basis. Experimental results\ndemonstrate that the proposed methods significantly enhance reconstruction\nperformance without requiring hyperparameter tuning.\n","authors":["Xinyu Peng","Ziyang Zheng","Wenrui Dai","Nuoqian Xiao","Chenglin Li","Junni Zou","Hongkai Xiong"],"pdf_url":"https://arxiv.org/pdf/2402.02149v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.19703v2","updated":"2024-06-02T10:24:49Z","published":"2024-05-30T05:27:46Z","title":"Towards a Better Evaluation of Out-of-Domain Generalization","summary":"  The objective of Domain Generalization (DG) is to devise algorithms and\nmodels capable of achieving high performance on previously unseen test\ndistributions. In the pursuit of this objective, average measure has been\nemployed as the prevalent measure for evaluating models and comparing\nalgorithms in the existing DG studies. Despite its significance, a\ncomprehensive exploration of the average measure has been lacking and its\nsuitability in approximating the true domain generalization performance has\nbeen questionable. In this study, we carefully investigate the limitations\ninherent in the average measure and propose worst+gap measure as a robust\nalternative. We establish theoretical grounds of the proposed measure by\nderiving two theorems starting from two different assumptions. We conduct\nextensive experimental investigations to compare the proposed worst+gap measure\nwith the conventional average measure. Given the indispensable need to access\nthe true DG performance for studying measures, we modify five existing datasets\nto come up with SR-CMNIST, C-Cats&Dogs, L-CIFAR10, PACS-corrupted, and\nVLCS-corrupted datasets. The experiment results unveil an inferior performance\nof the average measure in approximating the true DG performance and confirm the\nrobustness of the theoretically supported worst+gap measure.\n","authors":["Duhun Hwang","Suhyun Kang","Moonjung Eo","Jimyeong Kim","Wonjong Rhee"],"pdf_url":"https://arxiv.org/pdf/2405.19703v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.20222v2","updated":"2024-06-02T10:14:56Z","published":"2024-05-30T16:22:22Z","title":"MOFA-Video: Controllable Image Animation via Generative Motion Field\n  Adaptions in Frozen Image-to-Video Diffusion Model","summary":"  We present MOFA-Video, an advanced controllable image animation method that\ngenerates video from the given image using various additional controllable\nsignals (such as human landmarks reference, manual trajectories, and another\neven provided video) or their combinations. This is different from previous\nmethods which only can work on a specific motion domain or show weak control\nabilities with diffusion prior. To achieve our goal, we design several\ndomain-aware motion field adapters (\\ie, MOFA-Adapters) to control the\ngenerated motions in the video generation pipeline. For MOFA-Adapters, we\nconsider the temporal motion consistency of the video and generate the dense\nmotion flow from the given sparse control conditions first, and then, the\nmulti-scale features of the given image are wrapped as a guided feature for\nstable video diffusion generation. We naively train two motion adapters for the\nmanual trajectories and the human landmarks individually since they both\ncontain sparse information about the control. After training, the MOFA-Adapters\nin different domains can also work together for more controllable video\ngeneration. Project Page: https://myniuuu.github.io/MOFA_Video/\n","authors":["Muyao Niu","Xiaodong Cun","Xintao Wang","Yong Zhang","Ying Shan","Yinqiang Zheng"],"pdf_url":"https://arxiv.org/pdf/2405.20222v2.pdf","comment":"Project Page: https://myniuuu.github.io/MOFA_Video/ ; Codes:\n  https://github.com/MyNiuuu/MOFA-Video"},{"id":"http://arxiv.org/abs/2405.15223v2","updated":"2024-06-02T09:44:20Z","published":"2024-05-24T05:29:12Z","title":"iVideoGPT: Interactive VideoGPTs are Scalable World Models","summary":"  World models empower model-based agents to interactively explore, reason, and\nplan within imagined environments for real-world decision-making. However, the\nhigh demand for interactivity poses challenges in harnessing recent\nadvancements in video generative models for developing world models at scale.\nThis work introduces Interactive VideoGPT (iVideoGPT), a scalable\nautoregressive transformer framework that integrates multimodal signals--visual\nobservations, actions, and rewards--into a sequence of tokens, facilitating an\ninteractive experience of agents via next-token prediction. iVideoGPT features\na novel compressive tokenization technique that efficiently discretizes\nhigh-dimensional visual observations. Leveraging its scalable architecture, we\nare able to pre-train iVideoGPT on millions of human and robotic manipulation\ntrajectories, establishing a versatile foundation that is adaptable to serve as\ninteractive world models for a wide range of downstream tasks. These include\naction-conditioned video prediction, visual planning, and model-based\nreinforcement learning, where iVideoGPT achieves competitive performance\ncompared with state-of-the-art methods. Our work advances the development of\ninteractive general world models, bridging the gap between generative video\nmodels and practical model-based reinforcement learning applications.\n","authors":["Jialong Wu","Shaofeng Yin","Ningya Feng","Xu He","Dong Li","Jianye Hao","Mingsheng Long"],"pdf_url":"https://arxiv.org/pdf/2405.15223v2.pdf","comment":"Project website: https://thuml.github.io/iVideoGPT"},{"id":"http://arxiv.org/abs/2312.04557v2","updated":"2024-06-02T09:30:39Z","published":"2023-12-07T18:59:30Z","title":"GenTron: Diffusion Transformers for Image and Video Generation","summary":"  In this study, we explore Transformer-based diffusion models for image and\nvideo generation. Despite the dominance of Transformer architectures in various\nfields due to their flexibility and scalability, the visual generative domain\nprimarily utilizes CNN-based U-Net architectures, particularly in\ndiffusion-based models. We introduce GenTron, a family of Generative models\nemploying Transformer-based diffusion, to address this gap. Our initial step\nwas to adapt Diffusion Transformers (DiTs) from class to text conditioning, a\nprocess involving thorough empirical exploration of the conditioning mechanism.\nWe then scale GenTron from approximately 900M to over 3B parameters, observing\nsignificant improvements in visual quality. Furthermore, we extend GenTron to\ntext-to-video generation, incorporating novel motion-free guidance to enhance\nvideo quality. In human evaluations against SDXL, GenTron achieves a 51.1% win\nrate in visual quality (with a 19.8% draw rate), and a 42.3% win rate in text\nalignment (with a 42.9% draw rate). GenTron also excels in the T2I-CompBench,\nunderscoring its strengths in compositional generation. We believe this work\nwill provide meaningful insights and serve as a valuable reference for future\nresearch.\n","authors":["Shoufa Chen","Mengmeng Xu","Jiawei Ren","Yuren Cong","Sen He","Yanping Xie","Animesh Sinha","Ping Luo","Tao Xiang","Juan-Manuel Perez-Rua"],"pdf_url":"https://arxiv.org/pdf/2312.04557v2.pdf","comment":"CVPR2024 Camera Ready. Website:\n  https://www.shoufachen.com/gentron_website/"},{"id":"http://arxiv.org/abs/2312.03031v2","updated":"2024-06-02T09:29:00Z","published":"2023-12-05T11:32:31Z","title":"Is Ego Status All You Need for Open-Loop End-to-End Autonomous Driving?","summary":"  End-to-end autonomous driving recently emerged as a promising research\ndirection to target autonomy from a full-stack perspective. Along this line,\nmany of the latest works follow an open-loop evaluation setting on nuScenes to\nstudy the planning behavior. In this paper, we delve deeper into the problem by\nconducting thorough analyses and demystifying more devils in the details. We\ninitially observed that the nuScenes dataset, characterized by relatively\nsimple driving scenarios, leads to an under-utilization of perception\ninformation in end-to-end models incorporating ego status, such as the ego\nvehicle's velocity. These models tend to rely predominantly on the ego\nvehicle's status for future path planning. Beyond the limitations of the\ndataset, we also note that current metrics do not comprehensively assess the\nplanning quality, leading to potentially biased conclusions drawn from existing\nbenchmarks. To address this issue, we introduce a new metric to evaluate\nwhether the predicted trajectories adhere to the road. We further propose a\nsimple baseline able to achieve competitive results without relying on\nperception annotations. Given the current limitations on the benchmark and\nmetrics, we suggest the community reassess relevant prevailing research and be\ncautious whether the continued pursuit of state-of-the-art would yield\nconvincing and universal conclusions. Code and models are available at\n\\url{https://github.com/NVlabs/BEV-Planner}\n","authors":["Zhiqi Li","Zhiding Yu","Shiyi Lan","Jiahan Li","Jan Kautz","Tong Lu","Jose M. Alvarez"],"pdf_url":"https://arxiv.org/pdf/2312.03031v2.pdf","comment":"Accept to cvpr 2024"},{"id":"http://arxiv.org/abs/2307.08924v4","updated":"2024-06-02T08:52:13Z","published":"2023-07-18T01:53:18Z","title":"Towards Task Sampler Learning for Meta-Learning","summary":"  Meta-learning aims to learn general knowledge with diverse training tasks\nconducted from limited data, and then transfer it to new tasks. It is commonly\nbelieved that increasing task diversity will enhance the generalization ability\nof meta-learning models. However, this paper challenges this view through\nempirical and theoretical analysis. We obtain three conclusions: (i) there is\nno universal task sampling strategy that can guarantee the optimal performance\nof meta-learning models; (ii) over-constraining task diversity may incur the\nrisk of under-fitting or over-fitting during training; and (iii) the\ngeneralization performance of meta-learning models are affected by task\ndiversity, task entropy, and task difficulty. Based on this insight, we design\na novel task sampler, called Adaptive Sampler (ASr). ASr is a plug-and-play\nmodule that can be integrated into any meta-learning framework. It dynamically\nadjusts task weights according to task diversity, task entropy, and task\ndifficulty, thereby obtaining the optimal probability distribution for\nmeta-training tasks. Finally, we conduct experiments on a series of benchmark\ndatasets across various scenarios, and the results demonstrate that ASr has\nclear advantages.\n","authors":["Jingyao Wang","Wenwen Qiang","Xingzhe Su","Changwen Zheng","Fuchun Sun","Hui Xiong"],"pdf_url":"https://arxiv.org/pdf/2307.08924v4.pdf","comment":"accepted by IJCV"},{"id":"http://arxiv.org/abs/2302.13251v2","updated":"2024-06-02T08:42:11Z","published":"2023-02-26T07:10:09Z","title":"Unsupervised Domain Adaptation for Low-dose CT Reconstruction via\n  Bayesian Uncertainty Alignment","summary":"  Low-dose computed tomography (LDCT) image reconstruction techniques can\nreduce patient radiation exposure while maintaining acceptable imaging quality.\nDeep learning is widely used in this problem, but the performance of testing\ndata (a.k.a. target domain) is often degraded in clinical scenarios due to the\nvariations that were not encountered in training data (a.k.a. source domain).\nUnsupervised domain adaptation (UDA) of LDCT reconstruction has been proposed\nto solve this problem through distribution alignment. However, existing UDA\nmethods fail to explore the usage of uncertainty quantification, which is\ncrucial for reliable intelligent medical systems in clinical scenarios with\nunexpected variations. Moreover, existing direct alignment for different\npatients would lead to content mismatch issues. To address these issues, we\npropose to leverage a probabilistic reconstruction framework to conduct a joint\ndiscrepancy minimization between source and target domains in both the latent\nand image spaces. In the latent space, we devise a Bayesian uncertainty\nalignment to reduce the epistemic gap between the two domains. This approach\nreduces the uncertainty level of target domain data, making it more likely to\nrender well-reconstructed results on target domains. In the image space, we\npropose a sharpness-aware distribution alignment to achieve a match of\nsecond-order information, which can ensure that the reconstructed images from\nthe target domain have similar sharpness to normal-dose CT images from the\nsource domain. Experimental results on two simulated datasets and one clinical\nlow-dose imaging dataset show that our proposed method outperforms other\nmethods in quantitative and visualized performance.\n","authors":["Kecheng Chen","Jie Liu","Renjie Wan","Victor Ho-Fun Lee","Varut Vardhanabhuti","Hong Yan","Haoliang Li"],"pdf_url":"https://arxiv.org/pdf/2302.13251v2.pdf","comment":"Accepted by IEEE Transactions on Neural Networks and Learning Systems"},{"id":"http://arxiv.org/abs/2305.08117v2","updated":"2024-06-02T08:30:21Z","published":"2023-05-14T10:17:09Z","title":"MBQuant: A Novel Multi-Branch Topology Method for Arbitrary Bit-width\n  Network Quantization","summary":"  Arbitrary bit-width network quantization has received significant attention\ndue to its high adaptability to various bit-width requirements during runtime.\nHowever, in this paper, we investigate existing methods and observe a\nsignificant accumulation of quantization errors caused by switching weight and\nactivations bit-widths, leading to limited performance. To address this issue,\nwe propose MBQuant, a novel method that utilizes a multi-branch topology for\narbitrary bit-width quantization. MBQuant duplicates the network body into\nmultiple independent branches, where the weights of each branch are quantized\nto a fixed 2-bit and the activations remain in the input bit-width. The\ncomputation of a desired bit-width is completed by selecting an appropriate\nnumber of branches that satisfy the original computational constraint. By\nfixing the weight bit-width, this approach substantially reduces quantization\nerrors caused by switching weight bit-widths. Additionally, we introduce an\namortization branch selection strategy to distribute quantization errors caused\nby switching activation bit-widths among branches to improve performance.\nFinally, we adopt an in-place distillation strategy that facilitates guidance\nbetween branches to further enhance MBQuant's performance. Extensive\nexperiments demonstrate that MBQuant achieves significant performance gains\ncompared to existing arbitrary bit-width quantization methods. Code is at\nhttps://github.com/zysxmu/MultiQuant.\n","authors":["Yunshan Zhong","Yuyao Zhou","Fei Chao","Rongrong Ji"],"pdf_url":"https://arxiv.org/pdf/2305.08117v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.11874v3","updated":"2024-06-02T07:44:51Z","published":"2024-02-19T06:32:23Z","title":"Language-guided Image Reflection Separation","summary":"  This paper studies the problem of language-guided reflection separation,\nwhich aims at addressing the ill-posed reflection separation problem by\nintroducing language descriptions to provide layer content. We propose a\nunified framework to solve this problem, which leverages the cross-attention\nmechanism with contrastive learning strategies to construct the correspondence\nbetween language descriptions and image layers. A gated network design and a\nrandomized training strategy are employed to tackle the recognizable layer\nambiguity. The effectiveness of the proposed method is validated by the\nsignificant performance advantage over existing reflection separation methods\non both quantitative and qualitative comparisons.\n","authors":["Haofeng Zhong","Yuchen Hong","Shuchen Weng","Jinxiu Liang","Boxin Shi"],"pdf_url":"https://arxiv.org/pdf/2402.11874v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.03703v2","updated":"2024-06-02T07:34:08Z","published":"2023-12-06T18:59:44Z","title":"Skeleton-in-Context: Unified Skeleton Sequence Modeling with In-Context\n  Learning","summary":"  In-context learning provides a new perspective for multi-task modeling for\nvision and NLP. Under this setting, the model can perceive tasks from prompts\nand accomplish them without any extra task-specific head predictions or model\nfine-tuning. However, Skeleton sequence modeling via in-context learning\nremains unexplored. Directly applying existing in-context models from other\nareas onto skeleton sequences fails due to the inter-frame and cross-task pose\nsimilarity that makes it outstandingly hard to perceive the task correctly from\na subtle context. To address this challenge, we propose Skeleton-in-Context\n(SiC), an effective framework for in-context skeleton sequence modeling. Our\nSiC is able to handle multiple skeleton-based tasks simultaneously after a\nsingle training process and accomplish each task from context according to the\ngiven prompt. It can further generalize to new, unseen tasks according to\ncustomized prompts. To facilitate context perception, we additionally propose a\ntask-unified prompt, which adaptively learns tasks of different natures, such\nas partial joint-level generation, sequence-level prediction, or 2D-to-3D\nmotion prediction. We conduct extensive experiments to evaluate the\neffectiveness of our SiC on multiple tasks, including motion prediction, pose\nestimation, joint completion, and future pose estimation. We also evaluate its\ngeneralization capability on unseen tasks such as motion-in-between. These\nexperiments show that our model achieves state-of-the-art multi-task\nperformance and even outperforms single-task methods on certain tasks.\n","authors":["Xinshun Wang","Zhongbin Fang","Xia Li","Xiangtai Li","Mengyuan Liu"],"pdf_url":"https://arxiv.org/pdf/2312.03703v2.pdf","comment":"Project page: https://github.com/fanglaosi/Skeleton-in-Context"},{"id":"http://arxiv.org/abs/2303.09975v5","updated":"2024-06-02T07:32:59Z","published":"2023-03-17T13:48:17Z","title":"MedNeXt: Transformer-driven Scaling of ConvNets for Medical Image\n  Segmentation","summary":"  There has been exploding interest in embracing Transformer-based\narchitectures for medical image segmentation. However, the lack of large-scale\nannotated medical datasets make achieving performances equivalent to those in\nnatural images challenging. Convolutional networks, in contrast, have higher\ninductive biases and consequently, are easily trainable to high performance.\nRecently, the ConvNeXt architecture attempted to modernize the standard ConvNet\nby mirroring Transformer blocks. In this work, we improve upon this to design a\nmodernized and scalable convolutional architecture customized to challenges of\ndata-scarce medical settings. We introduce MedNeXt, a Transformer-inspired\nlarge kernel segmentation network which introduces - 1) A fully ConvNeXt 3D\nEncoder-Decoder Network for medical image segmentation, 2) Residual ConvNeXt up\nand downsampling blocks to preserve semantic richness across scales, 3) A novel\ntechnique to iteratively increase kernel sizes by upsampling small kernel\nnetworks, to prevent performance saturation on limited medical data, 4)\nCompound scaling at multiple levels (depth, width, kernel size) of MedNeXt.\nThis leads to state-of-the-art performance on 4 tasks on CT and MRI modalities\nand varying dataset sizes, representing a modernized deep architecture for\nmedical image segmentation. Our code is made publicly available at:\nhttps://github.com/MIC-DKFZ/MedNeXt.\n","authors":["Saikat Roy","Gregor Koehler","Constantin Ulrich","Michael Baumgartner","Jens Petersen","Fabian Isensee","Paul F. Jaeger","Klaus Maier-Hein"],"pdf_url":"https://arxiv.org/pdf/2303.09975v5.pdf","comment":"Accepted at MICCAI 2023"},{"id":"http://arxiv.org/abs/2402.11435v2","updated":"2024-06-02T05:40:18Z","published":"2024-02-18T03:04:38Z","title":"Momentor: Advancing Video Large Language Model with Fine-Grained\n  Temporal Reasoning","summary":"  Large Language Models (LLMs) demonstrate remarkable proficiency in\ncomprehending and handling text-based tasks. Many efforts are being made to\ntransfer these attributes to video modality, which are termed Video-LLMs.\nHowever, existing Video-LLMs can only capture the coarse-grained semantics and\nare unable to effectively handle tasks related to comprehension or localization\nof specific video segments. In light of these challenges, we propose Momentor,\na Video-LLM capable of accomplishing fine-grained temporal understanding tasks.\nTo support the training of Momentor, we design an automatic data generation\nengine to construct Moment-10M, a large-scale video instruction dataset with\nsegment-level instruction data. We train Momentor on Moment-10M, enabling it to\nperform segment-level reasoning and localization. Zero-shot evaluations on\nseveral tasks demonstrate that Momentor excels in fine-grained temporally\ngrounded comprehension and localization.\n","authors":["Long Qian","Juncheng Li","Yu Wu","Yaobo Ye","Hao Fei","Tat-Seng Chua","Yueting Zhuang","Siliang Tang"],"pdf_url":"https://arxiv.org/pdf/2402.11435v2.pdf","comment":"Accepted by ICML 2024"},{"id":"http://arxiv.org/abs/2405.17158v2","updated":"2024-06-02T05:27:34Z","published":"2024-05-27T13:31:46Z","title":"PatchScaler: An Efficient Patch-Independent Diffusion Model for\n  Super-Resolution","summary":"  Diffusion models significantly improve the quality of super-resolved images\nwith their impressive content generation capabilities. However, the huge\ncomputational costs limit the applications of these methods.Recent efforts have\nexplored reasonable inference acceleration to reduce the number of sampling\nsteps, but the computational cost remains high as each step is performed on the\nentire image.This paper introduces PatchScaler, a patch-independent\ndiffusion-based single image super-resolution (SR) method, designed to enhance\nthe efficiency of the inference process.The proposed method is motivated by the\nobservation that not all the image patches within an image need the same\nsampling steps for reconstructing high-resolution images.Based on this\nobservation, we thus develop a Patch-adaptive Group Sampling (PGS) to divide\nfeature patches into different groups according to the patch-level\nreconstruction difficulty and dynamically assign an appropriate sampling\nconfiguration for each group so that the inference speed can be better\naccelerated.In addition, to improve the denoising ability at each step of the\nsampling, we develop a texture prompt to guide the estimations of the diffusion\nmodel by retrieving high-quality texture priors from a patch-independent\nreference texture memory.Experiments show that our PatchScaler achieves\nfavorable performance in both quantitative and qualitative evaluations with\nfast inference speed.Our code and model are available at\n\\url{https://github.com/yongliuy/PatchScaler}.\n","authors":["Yong Liu","Hang Dong","Jinshan Pan","Qingji Dong","Kai Chen","Rongxiang Zhang","Xing Mei","Lean Fu","Fei Wang"],"pdf_url":"https://arxiv.org/pdf/2405.17158v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.17261v2","updated":"2024-06-02T05:20:57Z","published":"2023-10-26T09:25:09Z","title":"Attribute Based Interpretable Evaluation Metrics for Generative Models","summary":"  When the training dataset comprises a 1:1 proportion of dogs to cats, a\ngenerative model that produces 1:1 dogs and cats better resembles the training\nspecies distribution than another model with 3:1 dogs and cats. Can we capture\nthis phenomenon using existing metrics? Unfortunately, we cannot, because these\nmetrics do not provide any interpretability beyond \"diversity\". In this\ncontext, we propose a new evaluation protocol that measures the divergence of a\nset of generated images from the training set regarding the distribution of\nattribute strengths as follows. Single-attribute Divergence (SaD) measures the\ndivergence regarding PDFs of a single attribute. Paired-attribute Divergence\n(PaD) measures the divergence regarding joint PDFs of a pair of attributes.\nThey provide which attributes the models struggle. For measuring the attribute\nstrengths of an image, we propose Heterogeneous CLIPScore (HCS) which measures\nthe cosine similarity between image and text vectors with heterogeneous initial\npoints. With SaD and PaD, we reveal the following about existing generative\nmodels. ProjectedGAN generates implausible attribute relationships such as a\nbaby with a beard even though it has competitive scores of existing metrics.\nDiffusion models struggle to capture diverse colors in the datasets. The larger\nsampling timesteps of latent diffusion model generate the more minor objects\nincluding earrings and necklaces. Stable Diffusion v1.5 better captures the\nattributes than v2.1. Our metrics lay a foundation for explainable evaluations\nof generative models.\n","authors":["Dongkyun Kim","Mingi Kwon","Youngjung Uh"],"pdf_url":"https://arxiv.org/pdf/2310.17261v2.pdf","comment":"Accepted by ICML2024"},{"id":"http://arxiv.org/abs/2404.03635v4","updated":"2024-06-02T04:56:32Z","published":"2024-04-04T17:54:33Z","title":"WorDepth: Variational Language Prior for Monocular Depth Estimation","summary":"  Three-dimensional (3D) reconstruction from a single image is an ill-posed\nproblem with inherent ambiguities, i.e. scale. Predicting a 3D scene from text\ndescription(s) is similarly ill-posed, i.e. spatial arrangements of objects\ndescribed. We investigate the question of whether two inherently ambiguous\nmodalities can be used in conjunction to produce metric-scaled reconstructions.\nTo test this, we focus on monocular depth estimation, the problem of predicting\na dense depth map from a single image, but with an additional text caption\ndescribing the scene. To this end, we begin by encoding the text caption as a\nmean and standard deviation; using a variational framework, we learn the\ndistribution of the plausible metric reconstructions of 3D scenes corresponding\nto the text captions as a prior. To \"select\" a specific reconstruction or depth\nmap, we encode the given image through a conditional sampler that samples from\nthe latent space of the variational text encoder, which is then decoded to the\noutput depth map. Our approach is trained alternatingly between the text and\nimage branches: in one optimization step, we predict the mean and standard\ndeviation from the text description and sample from a standard Gaussian, and in\nthe other, we sample using a (image) conditional sampler. Once trained, we\ndirectly predict depth from the encoded text using the conditional sampler. We\ndemonstrate our approach on indoor (NYUv2) and outdoor (KITTI) scenarios, where\nwe show that language can consistently improve performance in both.\n","authors":["Ziyao Zeng","Daniel Wang","Fengyu Yang","Hyoungseob Park","Yangchao Wu","Stefano Soatto","Byung-Woo Hong","Dong Lao","Alex Wong"],"pdf_url":"https://arxiv.org/pdf/2404.03635v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.19385v2","updated":"2024-06-02T04:45:00Z","published":"2024-02-29T17:36:39Z","title":"Towards Safe and Reliable Autonomous Driving: Dynamic Occupancy Set\n  Prediction","summary":"  In the rapidly evolving field of autonomous driving, reliable prediction is\npivotal for vehicular safety. However, trajectory predictions often deviate\nfrom actual paths, particularly in complex and challenging environments,\nleading to significant errors. To address this issue, our study introduces a\nnovel method for Dynamic Occupancy Set (DOS) prediction, it effectively\ncombines advanced trajectory prediction networks with a DOS prediction module,\novercoming the shortcomings of existing models. It provides a comprehensive and\nadaptable framework for predicting the potential occupancy sets of traffic\nparticipants. The innovative contributions of this study include the\ndevelopment of a novel DOS prediction model specifically tailored for\nnavigating complex scenarios, the introduction of precise DOS mathematical\nrepresentations, and the formulation of optimized loss functions that\ncollectively advance the safety and efficiency of autonomous systems. Through\nrigorous validation, our method demonstrates marked improvements over\ntraditional models, establishing a new benchmark for safety and operational\nefficiency in intelligent transportation systems.\n","authors":["Wenbo Shao","Jiahui Xu","Wenhao Yu","Jun Li","Hong Wang"],"pdf_url":"https://arxiv.org/pdf/2402.19385v2.pdf","comment":"Accepted by IEEE IV 2024"},{"id":"http://arxiv.org/abs/2405.19442v2","updated":"2024-06-02T04:16:01Z","published":"2024-05-29T18:40:11Z","title":"Large-scale DSM registration via motion averaging","summary":"  Generating wide-area digital surface models (DSMs) requires registering a\nlarge number of individual, and partially overlapped DSMs. This presents a\nchallenging problem for a typical registration algorithm, since when a large\nnumber of observations from these multiple DSMs are considered, it may easily\ncause memory overflow. Sequential registration algorithms, although can\nsignificantly reduce the computation, are especially vulnerable for small\noverlapped pairs, leading to a large error accumulation. In this work, we\npropose a novel solution that builds the DSM registration task as a motion\naveraging problem: pair-wise DSMs are registered to build a scene graph, with\nedges representing relative poses between DSMs. Specifically, based on the grid\nstructure of the large DSM, the pair-wise registration is performed using a\nnovel nearest neighbor search method. We show that the scene graph can be\noptimized via an extremely fast motion average algorithm with O(N) complexity\n(N refers to the number of images). Evaluation of high-resolution\nsatellite-derived DSM demonstrates significant improvement in computation and\naccuracy.\n","authors":["Ningli Xu","Rongjun Qin"],"pdf_url":"https://arxiv.org/pdf/2405.19442v2.pdf","comment":"9 Figures"},{"id":"http://arxiv.org/abs/2311.17091v2","updated":"2024-06-02T03:00:49Z","published":"2023-11-28T05:17:25Z","title":"Beyond Sole Strength: Customized Ensembles for Generalized\n  Vision-Language Models","summary":"  Fine-tuning pre-trained vision-language models (VLMs), e.g., CLIP, for the\nopen-world generalization has gained increasing popularity due to its practical\nvalue. However, performance advancements are limited when relying solely on\nintricate algorithmic designs for a single model, even one exhibiting strong\nperformance, e.g., CLIP-ViT-B/16. This paper, for the first time, explores the\ncollaborative potential of leveraging much weaker VLMs to enhance the\ngeneralization of a robust single model. The affirmative findings motivate us\nto address the generalization problem from a novel perspective, i.e., ensemble\nof pre-trained VLMs. We introduce three customized ensemble strategies, each\ntailored to one specific scenario. Firstly, we introduce the zero-shot\nensemble, automatically adjusting the logits of different models based on their\nconfidence when only pre-trained VLMs are available. Furthermore, for scenarios\nwith extra few-shot samples, we propose the training-free and tuning ensemble,\noffering flexibility based on the availability of computing resources. The\nproposed ensemble strategies are evaluated on zero-shot, base-to-new, and\ncross-dataset generalization, achieving new state-of-the-art performance.\nNotably, this work represents an initial stride toward enhancing the\ngeneralization performance of VLMs via ensemble. The code is available at\nhttps://github.com/zhiheLu/Ensemble_VLM.git.\n","authors":["Zhihe Lu","Jiawang Bai","Xin Li","Zeyu Xiao","Xinchao Wang"],"pdf_url":"https://arxiv.org/pdf/2311.17091v2.pdf","comment":"Accepted on ICML 2024"},{"id":"http://arxiv.org/abs/2404.14329v2","updated":"2024-06-02T01:58:41Z","published":"2024-04-22T16:40:11Z","title":"X-Ray: A Sequential 3D Representation For Generation","summary":"  We introduce X-Ray, a novel 3D sequential representation inspired by the\npenetrability of x-ray scans. X-Ray transforms a 3D object into a series of\nsurface frames at different layers, making it suitable for generating 3D models\nfrom images. Our method utilizes ray casting from the camera center to capture\ngeometric and textured details, including depth, normal, and color, across all\nintersected surfaces. This process efficiently condenses the whole 3D object\ninto a multi-frame video format, motivating the utilize of a network\narchitecture similar to those in video diffusion models. This design ensures an\nefficient 3D representation by focusing solely on surface information. Also, we\npropose a two-stage pipeline to generate 3D objects from X-Ray Diffusion Model\nand Upsampler. We demonstrate the practicality and adaptability of our X-Ray\nrepresentation by synthesizing the complete visible and hidden surfaces of a 3D\nobject from a single input image. Experimental results reveal the\nstate-of-the-art superiority of our representation in enhancing the accuracy of\n3D generation, paving the way for new 3D representation research and practical\napplications.\n","authors":["Tao Hu","Wenhang Ge","Yuyang Zhao","Gim Hee Lee"],"pdf_url":"https://arxiv.org/pdf/2404.14329v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.05963v3","updated":"2024-06-02T01:35:28Z","published":"2024-03-09T17:05:43Z","title":"Robust Emotion Recognition in Context Debiasing","summary":"  Context-aware emotion recognition (CAER) has recently boosted the practical\napplications of affective computing techniques in unconstrained environments.\nMainstream CAER methods invariably extract ensemble representations from\ndiverse contexts and subject-centred characteristics to perceive the target\nperson's emotional state. Despite advancements, the biggest challenge remains\ndue to context bias interference. The harmful bias forces the models to rely on\nspurious correlations between background contexts and emotion labels in\nlikelihood estimation, causing severe performance bottlenecks and confounding\nvaluable context priors. In this paper, we propose a counterfactual emotion\ninference (CLEF) framework to address the above issue. Specifically, we first\nformulate a generalized causal graph to decouple the causal relationships among\nthe variables in CAER. Following the causal graph, CLEF introduces a\nnon-invasive context branch to capture the adverse direct effect caused by the\ncontext bias. During the inference, we eliminate the direct context effect from\nthe total causal effect by comparing factual and counterfactual outcomes,\nresulting in bias mitigation and robust prediction. As a model-agnostic\nframework, CLEF can be readily integrated into existing methods, bringing\nconsistent performance gains.\n","authors":["Dingkang Yang","Kun Yang","Mingcheng Li","Shunli Wang","Shuaibing Wang","Lihua Zhang"],"pdf_url":"https://arxiv.org/pdf/2403.05963v3.pdf","comment":"Accepted by CVPR 2024"},{"id":"http://arxiv.org/abs/2311.03356v3","updated":"2024-06-02T00:33:53Z","published":"2023-11-06T18:59:57Z","title":"GLaMM: Pixel Grounding Large Multimodal Model","summary":"  Large Multimodal Models (LMMs) extend Large Language Models to the vision\ndomain. Initial LMMs used holistic images and text prompts to generate\nungrounded textual responses. Recently, region-level LMMs have been used to\ngenerate visually grounded responses. However, they are limited to only\nreferring to a single object category at a time, require users to specify the\nregions, or cannot offer dense pixel-wise object grounding. In this work, we\npresent Grounding LMM (GLaMM), the first model that can generate natural\nlanguage responses seamlessly intertwined with corresponding object\nsegmentation masks. GLaMM not only grounds objects appearing in the\nconversations but is flexible enough to accept both textual and optional visual\nprompts (region of interest) as input. This empowers users to interact with the\nmodel at various levels of granularity, both in textual and visual domains. Due\nto the lack of standard benchmarks for the novel setting of visually Grounded\nConversation Generation (GCG), we introduce a comprehensive evaluation protocol\nwith our curated grounded conversations. Our proposed GCG task requires densely\ngrounded concepts in natural scenes at a large-scale. To this end, we propose a\ndensely annotated Grounding-anything Dataset (GranD) using our proposed\nautomated annotation pipeline that encompasses 7.5M unique concepts grounded in\na total of 810M regions available with segmentation masks. Besides GCG, GLaMM\nalso performs effectively on several downstream tasks, e.g., referring\nexpression segmentation, image and region-level captioning and vision-language\nconversations.\n","authors":["Hanoona Rasheed","Muhammad Maaz","Sahal Shaji Mullappilly","Abdelrahman Shaker","Salman Khan","Hisham Cholakkal","Rao M. Anwer","Erix Xing","Ming-Hsuan Yang","Fahad S. Khan"],"pdf_url":"https://arxiv.org/pdf/2311.03356v3.pdf","comment":"CVPR 2024"},{"id":"http://arxiv.org/abs/2405.05477v2","updated":"2024-06-02T00:00:13Z","published":"2024-05-09T00:30:45Z","title":"DynaSeg: A Deep Dynamic Fusion Method for Unsupervised Image\n  Segmentation Incorporating Feature Similarity and Spatial Continuity","summary":"  Our work tackles the fundamental challenge of image segmentation in computer\nvision, which is crucial for diverse applications. While supervised methods\ndemonstrate proficiency, their reliance on extensive pixel-level annotations\nlimits scalability. We introduce DynaSeg, an innovative unsupervised image\nsegmentation approach that overcomes the challenge of balancing feature\nsimilarity and spatial continuity without relying on extensive hyperparameter\ntuning. Unlike traditional methods, DynaSeg employs a dynamic weighting scheme\nthat automates parameter tuning, adapts flexibly to image characteristics, and\nfacilitates easy integration with other segmentation networks. By incorporating\na Silhouette Score Phase, DynaSeg prevents undersegmentation failures where the\nnumber of predicted clusters might converge to one. DynaSeg uses CNN-based and\npre-trained ResNet feature extraction, making it computationally efficient and\nmore straightforward than other complex models. Experimental results showcase\nstate-of-the-art performance, achieving a 12.2% and 14.12% mIOU improvement\nover current unsupervised segmentation approaches on COCO-All and COCO-Stuff\ndatasets, respectively. We provide qualitative and quantitative results on five\nbenchmark datasets, demonstrating the efficacy of the proposed approach.\n","authors":["Boujemaa Guermazi","Naimul Khan"],"pdf_url":"https://arxiv.org/pdf/2405.05477v2.pdf","comment":null}],"Machine Learning":[{"id":"http://arxiv.org/abs/2402.01481v4","updated":"2024-06-02T23:17:38Z","published":"2024-02-02T15:07:09Z","title":"Pre-Training Protein Bi-level Representation Through Span Mask Strategy\n  On 3D Protein Chains","summary":"  In recent years, there has been a surge in the development of 3D\nstructure-based pre-trained protein models, representing a significant\nadvancement over pre-trained protein language models in various downstream\ntasks. However, most existing structure-based pre-trained models primarily\nfocus on the residue level, i.e., alpha carbon atoms, while ignoring other\natoms like side chain atoms. We argue that modeling proteins at both residue\nand atom levels is important since the side chain atoms can also be crucial for\nnumerous downstream tasks, for example, molecular docking. Nevertheless, we\nfind that naively combining residue and atom information during pre-training\ntypically fails. We identify a key reason is the information leakage caused by\nthe inclusion of atom structure in the input, which renders residue-level\npre-training tasks trivial and results in insufficiently expressive residue\nrepresentations. To address this issue, we introduce a span mask pre-training\nstrategy on 3D protein chains to learn meaningful representations of both\nresidues and atoms. This leads to a simple yet effective approach to learning\nprotein representation suitable for diverse downstream tasks. Extensive\nexperimental results on binding site prediction and function prediction tasks\ndemonstrate our proposed pre-training approach significantly outperforms other\nmethods. Our code will be made public.\n","authors":["Jiale Zhao","Wanru Zhuang","Jia Song","Yaqi Li","Shuqi Lu"],"pdf_url":"https://arxiv.org/pdf/2402.01481v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.13673v3","updated":"2024-06-02T23:16:50Z","published":"2023-05-23T04:28:16Z","title":"Physics of Language Models: Part 1, Learning Hierarchical Language\n  Structures","summary":"  Transformer-based language models are effective but complex, and\nunderstanding their inner workings is a significant challenge. Previous\nresearch has primarily explored how these models handle simple tasks like name\ncopying or selection, and we extend this by investigating how these models\ngrasp complex, recursive language structures defined by context-free grammars\n(CFGs). We introduce a family of synthetic CFGs that produce hierarchical\nrules, capable of generating lengthy sentences (e.g., hundreds of tokens) that\nare locally ambiguous and require dynamic programming to parse. Despite this\ncomplexity, we demonstrate that generative models like GPT can accurately learn\nthis CFG language and generate sentences based on it. We explore the model's\ninternals, revealing that its hidden states precisely capture the structure of\nCFGs, and its attention patterns resemble the information passing in a dynamic\nprogramming algorithm.\n  This paper also presents several corollaries, including showing why\npositional embedding is inferior to relative attention or rotary embedding;\ndemonstrating that encoder-based models (e.g., BERT, deBERTa) cannot learn very\ndeeply nested CFGs as effectively as generative models (e.g., GPT); and\nhighlighting the necessity of adding structural and syntactic errors to the\npretraining data to make the model more robust to corrupted language prefixes.\n","authors":["Zeyuan Allen-Zhu","Yuanzhi Li"],"pdf_url":"https://arxiv.org/pdf/2305.13673v3.pdf","comment":"V2+V3 polishes writing; V3 includes Figures 6 and 10 for better\n  illustrations of our results"},{"id":"http://arxiv.org/abs/2403.03458v2","updated":"2024-06-02T23:04:43Z","published":"2024-03-06T04:49:02Z","title":"Slot Abstractors: Toward Scalable Abstract Visual Reasoning","summary":"  Abstract visual reasoning is a characteristically human ability, allowing the\nidentification of relational patterns that are abstracted away from object\nfeatures, and the systematic generalization of those patterns to unseen\nproblems. Recent work has demonstrated strong systematic generalization in\nvisual reasoning tasks involving multi-object inputs, through the integration\nof slot-based methods used for extracting object-centric representations\ncoupled with strong inductive biases for relational abstraction. However, this\napproach was limited to problems containing a single rule, and was not scalable\nto visual reasoning problems containing a large number of objects. Other recent\nwork proposed Abstractors, an extension of Transformers that incorporates\nstrong relational inductive biases, thereby inheriting the Transformer's\nscalability and multi-head architecture, but it has yet to be demonstrated how\nthis approach might be applied to multi-object visual inputs. Here we combine\nthe strengths of the above approaches and propose Slot Abstractors, an approach\nto abstract visual reasoning that can be scaled to problems involving a large\nnumber of objects and multiple relations among them. The approach displays\nstate-of-the-art performance across four abstract visual reasoning tasks, as\nwell as an abstract reasoning task involving real-world images.\n","authors":["Shanka Subhra Mondal","Jonathan D. Cohen","Taylor W. Webb"],"pdf_url":"https://arxiv.org/pdf/2403.03458v2.pdf","comment":"18 pages, 9 figures"},{"id":"http://arxiv.org/abs/2402.01116v4","updated":"2024-06-02T23:01:08Z","published":"2024-02-02T03:19:54Z","title":"Scalable Multi-modal Model Predictive Control via Duality-based\n  Interaction Predictions","summary":"  We propose a hierarchical architecture designed for scalable real-time Model\nPredictive Control (MPC) in complex, multi-modal traffic scenarios. This\narchitecture comprises two key components: 1) RAID-Net, a novel attention-based\nRecurrent Neural Network that predicts relevant interactions along the MPC\nprediction horizon between the autonomous vehicle and the surrounding vehicles\nusing Lagrangian duality, and 2) a reduced Stochastic MPC problem that\neliminates irrelevant collision avoidance constraints, enhancing computational\nefficiency. Our approach is demonstrated in a simulated traffic intersection\nwith interactive surrounding vehicles, showcasing a 12x speed-up in solving the\nmotion planning problem. A video demonstrating the proposed architecture in\nmultiple complex traffic scenarios can be found here:\nhttps://youtu.be/-pRiOnPb9_c. GitHub:\nhttps://github.com/MPC-Berkeley/hmpc_raidnet\n","authors":["Hansung Kim","Siddharth H. Nair","Francesco Borrelli"],"pdf_url":"https://arxiv.org/pdf/2402.01116v4.pdf","comment":"Accepted at IEEE Intelligent Vehicles Symposium 2024"},{"id":"http://arxiv.org/abs/2402.06122v3","updated":"2024-06-02T22:41:02Z","published":"2024-02-09T01:11:34Z","title":"Peeking with PEAK: Sequential, Nonparametric Composite Hypothesis Tests\n  for Means of Multiple Data Streams","summary":"  We propose a novel nonparametric sequential test for composite hypotheses for\nmeans of multiple data streams. Our proposed method, \\emph{peeking with\nexpectation-based averaged capital} (PEAK), builds upon the testing-by-betting\nframework and provides a non-asymptotic $\\alpha$-level test across any stopping\ntime. Our contributions are two-fold: (1) we propose a novel betting scheme and\nprovide theoretical guarantees on type-I error control, power, and asymptotic\ngrowth rate/$e$-power in the setting of a single data stream; (2) we introduce\nPEAK, a generalization of this betting scheme to multiple streams, that (i)\navoids using wasteful union bounds via averaging, (ii) is a test of power one\nunder mild regularity conditions on the sampling scheme of the streams, and\n(iii) reduces computational overhead when applying the testing-as-betting\napproaches for pure-exploration bandit problems. We illustrate the practical\nbenefits of PEAK using both synthetic and real-world HeartSteps datasets. Our\nexperiments show that PEAK provides up to an 85\\% reduction in the number of\nsamples before stopping compared to existing stopping rules for\npure-exploration bandit problems, and matches the performance of\nstate-of-the-art sequential tests while improving upon computational\ncomplexity.\n","authors":["Brian Cho","Kyra Gan","Nathan Kallus"],"pdf_url":"https://arxiv.org/pdf/2402.06122v3.pdf","comment":"To appear at the Forty-first International Conference on Machine\n  Learning (ICML 2024)"},{"id":"http://arxiv.org/abs/2305.14689v2","updated":"2024-06-02T22:26:44Z","published":"2023-05-24T03:52:48Z","title":"Least Squares Regression Can Exhibit Under-Parameterized Double Descent","summary":"  The relationship between the number of training data points, the number of\nparameters, and the generalization capabilities has been widely studied.\nPrevious work has shown that double descent can occur in the over-parameterized\nregime, and believe that the standard bias-variance trade-off holds in the\nunder-parameterized regime. These works provide multiple reasons for the\nexistence of the peak. We postulate that the location of the peak depends on\nthe technical properties of both the spectrum as well as the eigenvectors of\nthe sample covariance. We present two simple examples that provably exhibit\ndouble descent in the under-parameterized regime and do not seem to occur for\nreasons provided in prior work.\n","authors":["Xinyue Li","Rishi Sonthalia"],"pdf_url":"https://arxiv.org/pdf/2305.14689v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.09778v3","updated":"2024-06-02T22:15:18Z","published":"2023-12-15T13:30:04Z","title":"Hypergraph-MLP: Learning on Hypergraphs without Message Passing","summary":"  Hypergraphs are vital in modelling data with higher-order relations\ncontaining more than two entities, gaining prominence in machine learning and\nsignal processing. Many hypergraph neural networks leverage message passing\nover hypergraph structures to enhance node representation learning, yielding\nimpressive performances in tasks like hypergraph node classification. However,\nthese message-passing-based models face several challenges, including\noversmoothing as well as high latency and sensitivity to structural\nperturbations at inference time. To tackle those challenges, we propose an\nalternative approach where we integrate the information about hypergraph\nstructures into training supervision without explicit message passing, thus\nalso removing the reliance on it at inference. Specifically, we introduce\nHypergraph-MLP, a novel learning framework for hypergraph-structured data,\nwhere the learning model is a straightforward multilayer perceptron (MLP)\nsupervised by a loss function based on a notion of signal smoothness on\nhypergraphs. Experiments on hypergraph node classification tasks demonstrate\nthat Hypergraph-MLP achieves competitive performance compared to existing\nbaselines, and is considerably faster and more robust against structural\nperturbations at inference.\n","authors":["Bohan Tang","Siheng Chen","Xiaowen Dong"],"pdf_url":"https://arxiv.org/pdf/2312.09778v3.pdf","comment":"Accepted by ICASSP 2024"},{"id":"http://arxiv.org/abs/2404.14367v3","updated":"2024-06-02T22:00:42Z","published":"2024-04-22T17:20:18Z","title":"Preference Fine-Tuning of LLMs Should Leverage Suboptimal, On-Policy\n  Data","summary":"  Learning from preference labels plays a crucial role in fine-tuning large\nlanguage models. There are several distinct approaches for preference\nfine-tuning, including supervised learning, on-policy reinforcement learning\n(RL), and contrastive learning. Different methods come with different\nimplementation tradeoffs and performance differences, and existing empirical\nfindings present different conclusions, for instance, some results show that\nonline RL is quite important to attain good fine-tuning results, while others\nfind (offline) contrastive or even purely supervised methods sufficient. This\nraises a natural question: what kind of approaches are important for\nfine-tuning with preference data and why? In this paper, we answer this\nquestion by performing a rigorous analysis of a number of fine-tuning\ntechniques on didactic and full-scale LLM problems. Our main finding is that,\nin general, approaches that use on-policy sampling or attempt to push down the\nlikelihood on certain responses (i.e., employ a \"negative gradient\") outperform\noffline and maximum likelihood objectives. We conceptualize our insights and\nunify methods that use on-policy sampling or negative gradient under a notion\nof mode-seeking objectives for categorical distributions. Mode-seeking\nobjectives are able to alter probability mass on specific bins of a categorical\ndistribution at a fast rate compared to maximum likelihood, allowing them to\nrelocate masses across bins more effectively. Our analysis prescribes\nactionable insights for preference fine-tuning of LLMs and informs how data\nshould be collected for maximal improvement.\n","authors":["Fahim Tajwar","Anikait Singh","Archit Sharma","Rafael Rafailov","Jeff Schneider","Tengyang Xie","Stefano Ermon","Chelsea Finn","Aviral Kumar"],"pdf_url":"https://arxiv.org/pdf/2404.14367v3.pdf","comment":"International Conference on Machine Learning (ICML), 2024"},{"id":"http://arxiv.org/abs/2402.04493v2","updated":"2024-06-02T21:38:47Z","published":"2024-02-07T00:33:11Z","title":"A Primal-Dual Algorithm for Offline Constrained Reinforcement Learning\n  with Linear MDPs","summary":"  We study offline reinforcement learning (RL) with linear MDPs under the\ninfinite-horizon discounted setting which aims to learn a policy that maximizes\nthe expected discounted cumulative reward using a pre-collected dataset.\nExisting algorithms for this setting either require a uniform data coverage\nassumptions or are computationally inefficient for finding an\n$\\epsilon$-optimal policy with $O(\\epsilon^{-2})$ sample complexity. In this\npaper, we propose a primal dual algorithm for offline RL with linear MDPs in\nthe infinite-horizon discounted setting. Our algorithm is the first\ncomputationally efficient algorithm in this setting that achieves sample\ncomplexity of $O(\\epsilon^{-2})$ with partial data coverage assumption. Our\nwork is an improvement upon a recent work that requires $O(\\epsilon^{-4})$\nsamples. Moreover, we extend our algorithm to work in the offline constrained\nRL setting that enforces constraints on additional reward signals.\n","authors":["Kihyuk Hong","Ambuj Tewari"],"pdf_url":"https://arxiv.org/pdf/2402.04493v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.03887v3","updated":"2024-06-02T21:30:13Z","published":"2023-07-08T03:42:54Z","title":"Improving Prototypical Part Networks with Reward Reweighing,\n  Reselection, and Retraining","summary":"  In recent years, work has gone into developing deep interpretable methods for\nimage classification that clearly attributes a model's output to specific\nfeatures of the data. One such of these methods is the Prototypical Part\nNetwork (ProtoPNet), which attempts to classify images based on meaningful\nparts of the input. While this architecture is able to produce visually\ninterpretable classifications, it often learns to classify based on parts of\nthe image that are not semantically meaningful. To address this problem, we\npropose the Reward Reweighing, Reselecting, and Retraining (R3) post-processing\nframework, which performs three additional corrective updates to a pretrained\nProtoPNet in an offline and efficient manner. The first two steps involve\nlearning a reward model based on collected human feedback and then aligning the\nprototypes with human preferences. The final step is retraining, which realigns\nthe base features and the classifier layer of the original model with the\nupdated prototypes. We find that our R3 framework consistently improves both\nthe interpretability and the predictive accuracy of ProtoPNet and its variants.\n","authors":["Aaron J. Li","Robin Netzorg","Zhihan Cheng","Zhuoqin Zhang","Bin Yu"],"pdf_url":"https://arxiv.org/pdf/2307.03887v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.03507v2","updated":"2024-06-02T21:24:12Z","published":"2024-03-06T07:29:57Z","title":"GaLore: Memory-Efficient LLM Training by Gradient Low-Rank Projection","summary":"  Training Large Language Models (LLMs) presents significant memory challenges,\npredominantly due to the growing size of weights and optimizer states. Common\nmemory-reduction approaches, such as low-rank adaptation (LoRA), add a\ntrainable low-rank matrix to the frozen pre-trained weight in each layer,\nreducing trainable parameters and optimizer states. However, such approaches\ntypically underperform training with full-rank weights in both pre-training and\nfine-tuning stages since they limit the parameter search to a low-rank subspace\nand alter the training dynamics, and further, may require full-rank warm start.\nIn this work, we propose Gradient Low-Rank Projection (GaLore), a training\nstrategy that allows full-parameter learning but is more memory-efficient than\ncommon low-rank adaptation methods such as LoRA. Our approach reduces memory\nusage by up to 65.5% in optimizer states while maintaining both efficiency and\nperformance for pre-training on LLaMA 1B and 7B architectures with C4 dataset\nwith up to 19.7B tokens, and on fine-tuning RoBERTa on GLUE tasks. Our 8-bit\nGaLore further reduces optimizer memory by up to 82.5% and total training\nmemory by 63.3%, compared to a BF16 baseline. Notably, we demonstrate, for the\nfirst time, the feasibility of pre-training a 7B model on consumer GPUs with\n24GB memory (e.g., NVIDIA RTX 4090) without model parallel, checkpointing, or\noffloading strategies.\n","authors":["Jiawei Zhao","Zhenyu Zhang","Beidi Chen","Zhangyang Wang","Anima Anandkumar","Yuandong Tian"],"pdf_url":"https://arxiv.org/pdf/2403.03507v2.pdf","comment":"ICML 2024 (Oral)"}],"Multimedia":[{"id":"http://arxiv.org/abs/2308.00264v4","updated":"2024-06-02T19:12:57Z","published":"2023-08-01T03:54:27Z","title":"Multimodal Multi-loss Fusion Network for Sentiment Analysis","summary":"  This paper investigates the optimal selection and fusion of feature encoders\nacross multiple modalities and combines these in one neural network to improve\nsentiment detection. We compare different fusion methods and examine the impact\nof multi-loss training within the multi-modality fusion network, identifying\nsurprisingly important findings relating to subnet performance. We have also\nfound that integrating context significantly enhances model performance. Our\nbest model achieves state-of-the-art performance for three datasets (CMU-MOSI,\nCMU-MOSEI and CH-SIMS). These results suggest a roadmap toward an optimized\nfeature selection and fusion approach for enhancing sentiment detection in\nneural networks.\n","authors":["Zehui Wu","Ziwei Gong","Jaywon Koo","Julia Hirschberg"],"pdf_url":"https://arxiv.org/pdf/2308.00264v4.pdf","comment":"First two authors contributed equally to the paper"},{"id":"http://arxiv.org/abs/2404.03635v4","updated":"2024-06-02T04:56:32Z","published":"2024-04-04T17:54:33Z","title":"WorDepth: Variational Language Prior for Monocular Depth Estimation","summary":"  Three-dimensional (3D) reconstruction from a single image is an ill-posed\nproblem with inherent ambiguities, i.e. scale. Predicting a 3D scene from text\ndescription(s) is similarly ill-posed, i.e. spatial arrangements of objects\ndescribed. We investigate the question of whether two inherently ambiguous\nmodalities can be used in conjunction to produce metric-scaled reconstructions.\nTo test this, we focus on monocular depth estimation, the problem of predicting\na dense depth map from a single image, but with an additional text caption\ndescribing the scene. To this end, we begin by encoding the text caption as a\nmean and standard deviation; using a variational framework, we learn the\ndistribution of the plausible metric reconstructions of 3D scenes corresponding\nto the text captions as a prior. To \"select\" a specific reconstruction or depth\nmap, we encode the given image through a conditional sampler that samples from\nthe latent space of the variational text encoder, which is then decoded to the\noutput depth map. Our approach is trained alternatingly between the text and\nimage branches: in one optimization step, we predict the mean and standard\ndeviation from the text description and sample from a standard Gaussian, and in\nthe other, we sample using a (image) conditional sampler. Once trained, we\ndirectly predict depth from the encoded text using the conditional sampler. We\ndemonstrate our approach on indoor (NYUv2) and outdoor (KITTI) scenarios, where\nwe show that language can consistently improve performance in both.\n","authors":["Ziyao Zeng","Daniel Wang","Fengyu Yang","Hyoungseob Park","Yangchao Wu","Stefano Soatto","Byung-Woo Hong","Dong Lao","Alex Wong"],"pdf_url":"https://arxiv.org/pdf/2404.03635v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.00901v1","updated":"2024-06-02T23:51:43Z","published":"2024-06-02T23:51:43Z","title":"Robust Multi-Modal Speech In-Painting: A Sequence-to-Sequence Approach","summary":"  The process of reconstructing missing parts of speech audio from context is\ncalled speech in-painting. Human perception of speech is inherently\nmulti-modal, involving both audio and visual (AV) cues. In this paper, we\nintroduce and study a sequence-to-sequence (seq2seq) speech in-painting model\nthat incorporates AV features. Our approach extends AV speech in-painting\ntechniques to scenarios where both audio and visual data may be jointly\ncorrupted. To achieve this, we employ a multi-modal training paradigm that\nboosts the robustness of our model across various conditions involving acoustic\nand visual distortions. This makes our distortion-aware model a plausible\nsolution for real-world challenging environments. We compare our method with\nexisting transformer-based and recurrent neural network-based models, which\nattempt to reconstruct missing speech gaps ranging from a few milliseconds to\nover a second. Our experimental results demonstrate that our novel seq2seq\narchitecture outperforms the state-of-the-art transformer solution by 38.8% in\nterms of enhancing speech quality and 7.14% in terms of improving speech\nintelligibility. We exploit a multi-task learning framework that simultaneously\nperforms lip-reading (transcribing video components to text) while\nreconstructing missing parts of the associated speech.\n","authors":["Mahsa Kadkhodaei Elyaderani","Shahram Shirani"],"pdf_url":"https://arxiv.org/pdf/2406.00901v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.00791v1","updated":"2024-06-02T16:13:57Z","published":"2024-06-02T16:13:57Z","title":"Towards Point Cloud Compression for Machine Perception: A Simple and\n  Strong Baseline by Learning the Octree Depth Level Predictor","summary":"  Point cloud compression has garnered significant interest in computer vision.\nHowever, existing algorithms primarily cater to human vision, while most point\ncloud data is utilized for machine vision tasks. To address this, we propose a\npoint cloud compression framework that simultaneously handles both human and\nmachine vision tasks. Our framework learns a scalable bit-stream, using only\nsubsets for different machine vision tasks to save bit-rate, while employing\nthe entire bit-stream for human vision tasks. Building on mainstream\noctree-based frameworks like VoxelContext-Net, OctAttention, and G-PCC, we\nintroduce a new octree depth-level predictor. This predictor adaptively\ndetermines the optimal depth level for each octree constructed from a point\ncloud, controlling the bit-rate for machine vision tasks. For simpler tasks\n(\\textit{e.g.}, classification) or objects/scenarios, we use fewer depth levels\nwith fewer bits, saving bit-rate. Conversely, for more complex tasks\n(\\textit{e.g}., segmentation) or objects/scenarios, we use deeper depth levels\nwith more bits to enhance performance. Experimental results on various datasets\n(\\textit{e.g}., ModelNet10, ModelNet40, ShapeNet, ScanNet, and KITTI) show that\nour point cloud compression approach improves performance for machine vision\ntasks without compromising human vision quality.\n","authors":["Lei Liu","Zhihao Hu","Zhenghao Chen"],"pdf_url":"https://arxiv.org/pdf/2406.00791v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.00758v1","updated":"2024-06-02T14:22:09Z","published":"2024-06-02T14:22:09Z","title":"Once-for-All: Controllable Generative Image Compression with Dynamic\n  Granularity Adaption","summary":"  Although recent generative image compression methods have demonstrated\nimpressive potential in optimizing the rate-distortion-perception trade-off,\nthey still face the critical challenge of flexible rate adaption to diverse\ncompression necessities and scenarios. To overcome this challenge, this paper\nproposes a Controllable Generative Image Compression framework, Control-GIC,\nthe first capable of fine-grained bitrate adaption across a broad spectrum\nwhile ensuring high-fidelity and generality compression. We base Control-GIC on\na VQGAN framework representing an image as a sequence of variable-length codes\n(i.e. VQ-indices), which can be losslessly compressed and exhibits a direct\npositive correlation with the bitrates. Therefore, drawing inspiration from the\nclassical coding principle, we naturally correlate the information density of\nlocal image patches with their granular representations, to achieve dynamic\nadjustment of the code quantity following different granularity decisions. This\nimplies we can flexibly determine a proper allocation of granularity for the\npatches to acquire desirable compression rates. We further develop a\nprobabilistic conditional decoder that can trace back to historic encoded\nmulti-granularity representations according to transmitted codes, and then\nreconstruct hierarchical granular features in the formalization of conditional\nprobability, enabling more informative aggregation to improve reconstruction\nrealism. Our experiments show that Control-GIC allows highly flexible and\ncontrollable bitrate adaption and even once compression on an entire dataset to\nfulfill constrained bitrate conditions. Experimental results demonstrate its\nsuperior performance over recent state-of-the-art methods.\n","authors":["Anqi Li","Yuxi Liu","Huihui Bai","Feng Li","Runmin Cong","Meng Wang","Yao Zhao"],"pdf_url":"https://arxiv.org/pdf/2406.00758v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.00683v1","updated":"2024-06-02T09:36:37Z","published":"2024-06-02T09:36:37Z","title":"Exploiting Frequency Correlation for Hyperspectral Image Reconstruction","summary":"  Deep priors have emerged as potent methods in hyperspectral image (HSI)\nreconstruction. While most methods emphasize space-domain learning using image\nspace priors like non-local similarity, frequency-domain learning using image\nfrequency priors remains neglected, limiting the reconstruction capability of\nnetworks. In this paper, we first propose a Hyperspectral Frequency Correlation\n(HFC) prior rooted in in-depth statistical frequency analyses of existent HSI\ndatasets. Leveraging the HFC prior, we subsequently establish the frequency\ndomain learning composed of a Spectral-wise self-Attention of Frequency (SAF)\nand a Spectral-spatial Interaction of Frequency (SIF) targeting low-frequency\nand high-frequency components, respectively. The outputs of SAF and SIF are\nadaptively merged by a learnable gating filter, thus achieving a thorough\nexploitation of image frequency priors. Integrating the frequency domain\nlearning and the existing space domain learning, we finally develop the\nCorrelation-driven Mixing Domains Transformer (CMDT) for HSI reconstruction.\nExtensive experiments highlight that our method surpasses various\nstate-of-the-art (SOTA) methods in reconstruction quality and computational\nefficiency.\n","authors":["Muge Yan","Lizhi Wang","Lin Zhu","Hua Huang"],"pdf_url":"https://arxiv.org/pdf/2406.00683v1.pdf","comment":"14 pages, 11 figures"},{"id":"http://arxiv.org/abs/2406.00626v1","updated":"2024-06-02T06:08:41Z","published":"2024-06-02T06:08:41Z","title":"Intelligent Text-Conditioned Music Generation","summary":"  CLIP (Contrastive Language-Image Pre-Training) is a multimodal neural network\ntrained on (text, image) pairs to predict the most relevant text caption given\nan image. It has been used extensively in image generation by connecting its\noutput with a generative model such as VQGAN, with the most notable example\nbeing OpenAI's DALLE-2. In this project, we apply a similar approach to bridge\nthe gap between natural language and music. Our model is split into two steps:\nfirst, we train a CLIP-like model on pairs of text and music over contrastive\nloss to align a piece of music with its most probable text caption. Then, we\ncombine the alignment model with a music decoder to generate music. To the best\nof our knowledge, this is the first attempt at text-conditioned deep music\ngeneration. Our experiments show that it is possible to train the text-music\nalignment model using contrastive loss and train a decoder to generate music\nfrom text prompts.\n","authors":["Zhouyao Xie","Nikhil Yadala","Xinyi Chen","Jing Xi Liu"],"pdf_url":"https://arxiv.org/pdf/2406.00626v1.pdf","comment":null}],"Information Retrieval":[{"id":"http://arxiv.org/abs/2406.00725v1","updated":"2024-06-02T12:21:10Z","published":"2024-06-02T12:21:10Z","title":"Maximum-Entropy Regularized Decision Transformer with Reward Relabelling\n  for Dynamic Recommendation","summary":"  Reinforcement learning-based recommender systems have recently gained\npopularity. However, due to the typical limitations of simulation environments\n(e.g., data inefficiency), most of the work cannot be broadly applied in all\ndomains. To counter these challenges, recent advancements have leveraged\noffline reinforcement learning methods, notable for their data-driven approach\nutilizing offline datasets. A prominent example of this is the Decision\nTransformer. Despite its popularity, the Decision Transformer approach has\ninherent drawbacks, particularly evident in recommendation methods based on it.\nThis paper identifies two key shortcomings in existing Decision\nTransformer-based methods: a lack of stitching capability and limited\neffectiveness in online adoption. In response, we introduce a novel methodology\nnamed Max-Entropy enhanced Decision Transformer with Reward Relabeling for\nOffline RLRS (EDT4Rec). Our approach begins with a max entropy perspective,\nleading to the development of a max entropy enhanced exploration strategy. This\nstrategy is designed to facilitate more effective exploration in online\nenvironments. Additionally, to augment the model's capability to stitch\nsub-optimal trajectories, we incorporate a unique reward relabeling technique.\nTo validate the effectiveness and superiority of EDT4Rec, we have conducted\ncomprehensive experiments across six real-world offline datasets and in an\nonline simulator.\n","authors":["Xiaocong Chen","Siyu Wang","Lina Yao"],"pdf_url":"https://arxiv.org/pdf/2406.00725v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.00682v1","updated":"2024-06-02T09:36:33Z","published":"2024-06-02T09:36:33Z","title":"A lexicon obtained and validated by a data-driven approach for organic\n  residues valorization in emerging and developing countries","summary":"  The text mining method presented in this paper was used for annotation of\nterms related to biological transformation and valorization of organic residues\nin agriculture in low and middle-income country. Specialized lexicon was\nobtained through different steps: corpus and extraction of terms, annotation of\nextracted terms, selection of relevant terms.\n","authors":["Christiane Rakotomalala","Jean-Marie Paillat","Frédéric Feder","Angel Avadí","Laurent Thuriès","Marie-Liesse Vermeire","Jean-Michel Médoc","Tom Wassenaar","Caroline Hottelart","Lilou Kieffer","Elisa Ndjie","Mathieu Picart","Jorel Tchamgoue","Alvin Tulle","Laurine Valade","Annie Boyer","Marie-Christine Duchamp","Mathieu Roche"],"pdf_url":"https://arxiv.org/pdf/2406.00682v1.pdf","comment":"5 pages, 2 tables"},{"id":"http://arxiv.org/abs/2406.00638v1","updated":"2024-06-02T06:48:43Z","published":"2024-06-02T06:48:43Z","title":"COS-Mix: Cosine Similarity and Distance Fusion for Improved Information\n  Retrieval","summary":"  This study proposes a novel hybrid retrieval strategy for Retrieval-Augmented\nGeneration (RAG) that integrates cosine similarity and cosine distance measures\nto improve retrieval performance, particularly for sparse data. The traditional\ncosine similarity measure is widely used to capture the similarity between\nvectors in high-dimensional spaces. However, it has been shown that this\nmeasure can yield arbitrary results in certain scenarios. To address this\nlimitation, we incorporate cosine distance measures to provide a complementary\nperspective by quantifying the dissimilarity between vectors. Our approach is\nexperimented on proprietary data, unlike recent publications that have used\nopen-source datasets. The proposed method demonstrates enhanced retrieval\nperformance and provides a more comprehensive understanding of the semantic\nrelationships between documents or items. This hybrid strategy offers a\npromising solution for efficiently and accurately retrieving relevant\ninformation in knowledge-intensive applications, leveraging techniques such as\nBM25 (sparse) retrieval , vector (Dense) retrieval, and cosine distance based\nretrieval to facilitate efficient information retrieval.\n","authors":["Kush Juvekar","Anupam Purwar"],"pdf_url":"https://arxiv.org/pdf/2406.00638v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.00615v1","updated":"2024-06-02T04:33:52Z","published":"2024-06-02T04:33:52Z","title":"Making Recommender Systems More Knowledgeable: A Framework to\n  Incorporate Side Information","summary":"  Session-based recommender systems typically focus on using only the triplet\n(user_id, timestamp, item_id) to make predictions of users' next actions. In\nthis paper, we aim to utilize side information to help recommender systems\ncatch patterns and signals otherwise undetectable. Specifically, we propose a\ngeneral framework for incorporating item-specific side information into the\nrecommender system to enhance its performance without much modification on the\noriginal model architecture. Experimental results on several models and\ndatasets prove that with side information, our recommender system outperforms\nstate-of-the-art models by a considerable margin and converges much faster.\nAdditionally, we propose a new type of loss to regularize the attention\nmechanism used by recommender systems and evaluate its influence on model\nperformance. Furthermore, through analysis, we put forward a few insights on\npotential further improvements.\n","authors":["Yukun Jiang","Leo Guo","Xinyi Chen","Jing Xi Liu"],"pdf_url":"https://arxiv.org/pdf/2406.00615v1.pdf","comment":"15 pages, 8 figures"},{"id":"http://arxiv.org/abs/2406.02606v1","updated":"2024-06-02T18:26:50Z","published":"2024-06-02T18:26:50Z","title":"Know Your Neighborhood: General and Zero-Shot Capable Binary Function\n  Search Powered by Call Graphlets","summary":"  Binary code similarity detection is an important problem with applications in\nareas like malware analysis, vulnerability research and plagiarism detection.\nThis paper proposes a novel graph neural network architecture combined with a\nnovel graph data representation called call graphlets. A call graphlet encodes\nthe neighborhood around each function in a binary executable, capturing the\nlocal and global context through a series of statistical features. A\nspecialized graph neural network model is then designed to operate on this\ngraph representation, learning to map it to a feature vector that encodes\nsemantic code similarities using deep metric learning. The proposed approach is\nevaluated across four distinct datasets covering different architectures,\ncompiler toolchains, and optimization levels. Experimental results demonstrate\nthat the combination of call graphlets and the novel graph neural network\narchitecture achieves state-of-the-art performance compared to baseline\ntechniques across cross-architecture, mono-architecture and zero shot tasks. In\naddition, our proposed approach also performs well when evaluated against an\nout-of-domain function inlining task. Overall, the work provides a general and\neffective graph neural network-based solution for conducting binary code\nsimilarity detection.\n","authors":["Joshua Collyer","Tim Watson","Iain Phillips"],"pdf_url":"https://arxiv.org/pdf/2406.02606v1.pdf","comment":null}]},"2024-06-01T00:00:00Z":{"Computation and Language":[{"id":"http://arxiv.org/abs/2401.06853v4","updated":"2024-06-01T22:38:32Z","published":"2024-01-12T19:00:26Z","title":"Large Language Models Can Learn Temporal Reasoning","summary":"  While large language models (LLMs) have demonstrated remarkable reasoning\ncapabilities, they are not without their flaws and inaccuracies. Recent studies\nhave introduced various methods to mitigate these limitations. Temporal\nreasoning (TR), in particular, presents a significant challenge for LLMs due to\nits reliance on diverse temporal concepts and intricate temporal logic. In this\npaper, we propose TG-LLM, a novel framework towards language-based TR. Instead\nof reasoning over the original context, we adopt a latent representation,\ntemporal graph (TG) that enhances the learning of TR. A synthetic dataset\n(TGQA), which is fully controllable and requires minimal supervision, is\nconstructed for fine-tuning LLMs on this text-to-TG translation task. We\nconfirmed in experiments that the capability of TG translation learned on our\ndataset can be transferred to other TR tasks and benchmarks. On top of that, we\nteach LLM to perform deliberate reasoning over the TGs via Chain-of-Thought\n(CoT) bootstrapping and graph data augmentation. We observed that those\nstrategies, which maintain a balance between usefulness and diversity, bring\nmore reliable CoTs and final results than the vanilla CoT distillation.\n","authors":["Siheng Xiong","Ali Payani","Ramana Kompella","Faramarz Fekri"],"pdf_url":"https://arxiv.org/pdf/2401.06853v4.pdf","comment":"ACL24 (main)"},{"id":"http://arxiv.org/abs/2402.08017v2","updated":"2024-06-01T21:46:50Z","published":"2024-02-12T19:27:26Z","title":"Lumos : Empowering Multimodal LLMs with Scene Text Recognition","summary":"  We introduce Lumos, the first end-to-end multimodal question-answering system\nwith text understanding capabilities. At the core of Lumos is a Scene Text\nRecognition (STR) component that extracts text from first person point-of-view\nimages, the output of which is used to augment input to a Multimodal Large\nLanguage Model (MM-LLM). While building Lumos, we encountered numerous\nchallenges related to STR quality, overall latency, and model inference. In\nthis paper, we delve into those challenges, and discuss the system\narchitecture, design choices, and modeling techniques employed to overcome\nthese obstacles. We also provide a comprehensive evaluation for each component,\nshowcasing high quality and efficiency.\n","authors":["Ashish Shenoy","Yichao Lu","Srihari Jayakumar","Debojeet Chatterjee","Mohsen Moslehpour","Pierce Chuang","Abhay Harpale","Vikas Bhardwaj","Di Xu","Shicong Zhao","Longfang Zhao","Ankit Ramchandani","Xin Luna Dong","Anuj Kumar"],"pdf_url":"https://arxiv.org/pdf/2402.08017v2.pdf","comment":"Accepted to KDD 2024 (ADS Track)"},{"id":"http://arxiv.org/abs/2306.10193v2","updated":"2024-06-01T21:40:33Z","published":"2023-06-16T21:55:08Z","title":"Conformal Language Modeling","summary":"  We propose a novel approach to conformal prediction for generative language\nmodels (LMs). Standard conformal prediction produces prediction sets -- in\nplace of single predictions -- that have rigorous, statistical performance\nguarantees. LM responses are typically sampled from the model's predicted\ndistribution over the large, combinatorial output space of natural language.\nTranslating this process to conformal prediction, we calibrate a stopping rule\nfor sampling different outputs from the LM that get added to a growing set of\ncandidates until we are confident that the output set is sufficient. Since some\nsamples may be low-quality, we also simultaneously calibrate and apply a\nrejection rule for removing candidates from the output set to reduce noise.\nSimilar to conformal prediction, we prove that the sampled set returned by our\nprocedure contains at least one acceptable answer with high probability, while\nstill being empirically precise (i.e., small) on average. Furthermore, within\nthis set of candidate responses, we show that we can also accurately identify\nsubsets of individual components -- such as phrases or sentences -- that are\neach independently correct (e.g., that are not \"hallucinations\"), again with\nstatistical guarantees. We demonstrate the promise of our approach on multiple\ntasks in open-domain question answering, text summarization, and radiology\nreport generation using different LM variants.\n","authors":["Victor Quach","Adam Fisch","Tal Schuster","Adam Yala","Jae Ho Sohn","Tommi S. Jaakkola","Regina Barzilay"],"pdf_url":"https://arxiv.org/pdf/2306.10193v2.pdf","comment":"ICLR 2024"},{"id":"http://arxiv.org/abs/2310.12815v3","updated":"2024-06-01T21:21:07Z","published":"2023-10-19T15:12:09Z","title":"Formalizing and Benchmarking Prompt Injection Attacks and Defenses","summary":"  A prompt injection attack aims to inject malicious instruction/data into the\ninput of an LLM-Integrated Application such that it produces results as an\nattacker desires. Existing works are limited to case studies. As a result, the\nliterature lacks a systematic understanding of prompt injection attacks and\ntheir defenses. We aim to bridge the gap in this work. In particular, we\npropose a framework to formalize prompt injection attacks. Existing attacks are\nspecial cases in our framework. Moreover, based on our framework, we design a\nnew attack by combining existing ones. Using our framework, we conduct a\nsystematic evaluation on 5 prompt injection attacks and 10 defenses with 10\nLLMs and 7 tasks. Our work provides a common benchmark for quantitatively\nevaluating future prompt injection attacks and defenses. To facilitate research\non this topic, we make our platform public at\nhttps://github.com/liu00222/Open-Prompt-Injection.\n","authors":["Yupei Liu","Yuqi Jia","Runpeng Geng","Jinyuan Jia","Neil Zhenqiang Gong"],"pdf_url":"https://arxiv.org/pdf/2310.12815v3.pdf","comment":"To appear in USENIX Security Symposium 2024"},{"id":"http://arxiv.org/abs/2401.04518v2","updated":"2024-06-01T17:52:14Z","published":"2024-01-09T12:20:41Z","title":"The Critique of Critique","summary":"  Critique, as a natural language description for assessing the quality of\nmodel-generated content, has played a vital role in the training, evaluation,\nand refinement of LLMs. However, a systematic method to evaluate the quality of\ncritique is lacking. In this paper, we pioneer the critique of critique, termed\nMetaCritique, which builds specific quantification criteria. To achieve a\nreliable evaluation outcome, we propose Atomic Information Units (AIUs), which\ndescribe the critique in a more fine-grained manner. MetaCritique aggregates\neach AIU's judgment for the overall score. Moreover, MetaCritique delivers a\nnatural language rationale for the intricate reasoning within each judgment.\nLastly, we construct a meta-evaluation dataset covering 4 tasks across 16\npublic datasets involving human-written and LLM-generated critiques.\nExperiments demonstrate that MetaCritique can achieve near-human performance.\nOur study can facilitate future research in LLM critiques based on our\nfollowing observations and released resources: (1) superior critiques judged by\nMetaCritique can lead to better refinements, indicating that it can potentially\nenhance the alignment of existing LLMs; (2) the leaderboard of critique models\nreveals that open-source critique models commonly suffer from factuality\nissues; (3) relevant code and data are publicly available at\nhttps://github.com/GAIR-NLP/MetaCritique to support deeper exploration; (4) an\nAPI at PyPI with the usage documentation in Appendix C allows users to assess\nthe critique conveniently.\n","authors":["Shichao Sun","Junlong Li","Weizhe Yuan","Ruifeng Yuan","Wenjie Li","Pengfei Liu"],"pdf_url":"https://arxiv.org/pdf/2401.04518v2.pdf","comment":"Accepted to Findings of ACL 2024"},{"id":"http://arxiv.org/abs/2402.02456v2","updated":"2024-06-01T15:54:54Z","published":"2024-02-04T12:06:13Z","title":"tnGPS: Discovering Unknown Tensor Network Structure Search Algorithms\n  via Large Language Models (LLMs)","summary":"  Tensor networks are efficient for extremely high-dimensional representation,\nbut their model selection, known as tensor network structure search (TN-SS), is\na challenging problem. Although several works have targeted TN-SS, most\nexisting algorithms are manually crafted heuristics with poor performance,\nsuffering from the curse of dimensionality and local convergence. In this work,\nwe jump out of the box, studying how to harness large language models (LLMs) to\nautomatically discover new TN-SS algorithms, replacing the involvement of human\nexperts. By observing how human experts innovate in research, we model their\ncommon workflow and propose an automatic algorithm discovery framework called\ntnGPS. The proposed framework is an elaborate prompting pipeline that instruct\nLLMs to generate new TN-SS algorithms through iterative refinement and\nenhancement. The experimental results demonstrate that the algorithms\ndiscovered by tnGPS exhibit superior performance in benchmarks compared to the\ncurrent state-of-the-art methods.\n","authors":["Junhua Zeng","Chao Li","Zhun Sun","Qibin Zhao","Guoxu Zhou"],"pdf_url":"https://arxiv.org/pdf/2402.02456v2.pdf","comment":"Accepted by ICML2024, pre-printed version"},{"id":"http://arxiv.org/abs/2405.20314v2","updated":"2024-06-01T15:24:10Z","published":"2024-05-30T17:54:35Z","title":"S3D: A Simple and Cost-Effective Self-Speculative Decoding Scheme for\n  Low-Memory GPUs","summary":"  Speculative decoding (SD) has attracted a significant amount of research\nattention due to the substantial speedup it can achieve for LLM inference.\nHowever, despite the high speedups they offer, speculative decoding methods\noften achieve optimal performance on high-end devices or with a substantial GPU\nmemory overhead. Given limited memory and the necessity of quantization, a\nhigh-performing model on a high-end GPU can slow down by up to 7 times. To this\nend, we propose Skippy Simultaneous Speculative Decoding (or S3D), a\ncost-effective self-speculative SD method based on simultaneous multi-token\ndecoding and mid-layer skipping. When compared against recent effective\nopen-source SD systems, our method has achieved one of the top\nperformance-memory ratios while requiring minimal architecture changes and\ntraining data. Leveraging our memory efficiency, we created a smaller yet more\neffective SD model based on Phi-3. It is 1.4 to 2 times faster than the\nquantized EAGLE model and operates in half-precision while using less VRAM.\n","authors":["Wei Zhong","Manasa Bharadwaj"],"pdf_url":"https://arxiv.org/pdf/2405.20314v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.07120v2","updated":"2024-06-01T15:20:25Z","published":"2023-08-14T13:00:53Z","title":"Position: Key Claims in LLM Research Have a Long Tail of Footnotes","summary":"  Much of the recent discourse within the ML community has been centered around\nLarge Language Models (LLMs), their functionality and potential -- yet not only\ndo we not have a working definition of LLMs, but much of this discourse relies\non claims and assumptions that are worth re-examining. We contribute a\ndefinition of LLMs, critically examine five common claims regarding their\nproperties (including 'emergent properties'), and conclude with suggestions for\nfuture research directions and their framing.\n","authors":["Anna Rogers","Alexandra Sasha Luccioni"],"pdf_url":"https://arxiv.org/pdf/2308.07120v2.pdf","comment":"ICML 2024 camera-ready (https://openreview.net/forum?id=M2cwkGleRL)"},{"id":"http://arxiv.org/abs/2405.19426v2","updated":"2024-06-01T14:16:42Z","published":"2024-05-29T18:09:35Z","title":"Deep Learning for Assessment of Oral Reading Fluency","summary":"  Reading fluency assessment is a critical component of literacy programmes,\nserving to guide and monitor early education interventions. Given the resource\nintensive nature of the exercise when conducted by teachers, the development of\nautomatic tools that can operate on audio recordings of oral reading is\nattractive as an objective and highly scalable solution. Multiple complex\naspects such as accuracy, rate and expressiveness underlie human judgements of\nreading fluency. In this work, we investigate end-to-end modeling on a training\ndataset of children's audio recordings of story texts labeled by human experts.\nThe pre-trained wav2vec2.0 model is adopted due its potential to alleviate the\nchallenges from the limited amount of labeled data. We report the performance\nof a number of system variations on the relevant measures, and also probe the\nlearned embeddings for lexical and acoustic-prosodic features known to be\nimportant to the perception of reading fluency.\n","authors":["Mithilesh Vaidya","Binaya Kumar Sahoo","Preeti Rao"],"pdf_url":"https://arxiv.org/pdf/2405.19426v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.04044v3","updated":"2024-06-01T14:04:21Z","published":"2023-11-07T14:55:52Z","title":"PrivLM-Bench: A Multi-level Privacy Evaluation Benchmark for Language\n  Models","summary":"  The rapid development of language models (LMs) brings unprecedented\naccessibility and usage for both models and users. On the one hand, powerful\nLMs achieve state-of-the-art performance over numerous downstream NLP tasks. On\nthe other hand, more and more attention is paid to unrestricted model accesses\nthat may bring malicious privacy risks of data leakage. To address these\nissues, many recent works propose privacy-preserving language models (PPLMs)\nwith differential privacy (DP). Unfortunately, different DP implementations\nmake it challenging for a fair comparison among existing PPLMs. In this paper,\nwe present PrivLM-Bench, a multi-perspective privacy evaluation benchmark to\nempirically and intuitively quantify the privacy leakage of LMs. Instead of\nonly reporting DP parameters, PrivLM-Bench sheds light on the neglected\ninference data privacy during actual usage. PrivLM-Bench first clearly defines\nmulti-faceted privacy objectives. Then, PrivLM-Bench constructs a unified\npipeline to perform private fine-tuning. Lastly, PrivLM-Bench performs existing\nprivacy attacks on LMs with pre-defined privacy objectives as the empirical\nevaluation results. The empirical attack results are used to fairly and\nintuitively evaluate the privacy leakage of various PPLMs. We conduct extensive\nexperiments on three datasets of GLUE for mainstream LMs.\n","authors":["Haoran Li","Dadi Guo","Donghao Li","Wei Fan","Qi Hu","Xin Liu","Chunkit Chan","Duanyi Yao","Yuan Yao","Yangqiu Song"],"pdf_url":"https://arxiv.org/pdf/2311.04044v3.pdf","comment":"To appear at ACL 2024"},{"id":"http://arxiv.org/abs/2402.09025v3","updated":"2024-06-01T12:10:48Z","published":"2024-02-14T09:01:13Z","title":"SLEB: Streamlining LLMs through Redundancy Verification and Elimination\n  of Transformer Blocks","summary":"  Large language models (LLMs) have proven to be highly effective across\nvarious natural language processing tasks. However, their large number of\nparameters poses significant challenges for practical deployment. Pruning, a\ntechnique aimed at reducing the size and complexity of LLMs, offers a potential\nsolution by removing redundant components from the network. Despite the promise\nof pruning, existing methods often struggle to achieve substantial end-to-end\nLLM inference speedup. In this paper, we introduce SLEB, a novel approach\ndesigned to streamline LLMs by eliminating redundant transformer blocks. We\nchoose the transformer block as the fundamental unit for pruning, because LLMs\nexhibit block-level redundancy with high similarity between the outputs of\nneighboring blocks. This choice allows us to effectively enhance the processing\nspeed of LLMs. Our experimental results demonstrate that SLEB outperforms\nprevious LLM pruning methods in accelerating LLM inference while also\nmaintaining superior perplexity and accuracy, making SLEB as a promising\ntechnique for enhancing the efficiency of LLMs. The code is available at:\nhttps://github.com/jiwonsong-dev/SLEB.\n","authors":["Jiwon Song","Kyungseok Oh","Taesu Kim","Hyungjun Kim","Yulhwa Kim","Jae-Joon Kim"],"pdf_url":"https://arxiv.org/pdf/2402.09025v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.19778v2","updated":"2024-06-01T11:27:53Z","published":"2024-05-30T07:44:16Z","title":"Enhancing Consistency and Role-Specific Knowledge Capturing by\n  Rebuilding Fictional Character's Persona","summary":"  With the recent introduction of Assistants API, it is expected that\ndocument-based language models will be actively used in various domains,\nespecially Role-playing. However, a key challenge lies in utilizing\nprotagonist's persona: Assistants API often fails to achieve with its search\nbecause the information extraction part is different each time and it often\nomits important information such as protagonist's backstory or relationships.\nIt is hard to maintain a consistent persona simply by using the persona\ndocument as input to the Assistants API. To address the challenge of achieving\nstable persona consistency, we propose CharacterGPT, a novel persona\nreconstruction framework to alleviate the shortcomings of the Assistants API.\nOur method involves Character Persona Training (CPT), an effective persona\nrebuilding process that updates the character persona by extracting the\ncharacter's traits from given summary of the novel for each character as if the\nstory in a novel progresses. In our experiments, we ask each character to take\nthe Big Five Inventory personality test in various settings and analyze the\nresults. To assess whether it can think outside the box, we let each character\ngenerate short novels. Extensive experiments and human evaluation demonstrate\nthat CharacterGPT presents new possibilities for role-playing agent research.\n","authors":["Jeiyoon Park","Chanjun Park","Heuiseok Lim"],"pdf_url":"https://arxiv.org/pdf/2405.19778v2.pdf","comment":"preprint"},{"id":"http://arxiv.org/abs/2405.11282v2","updated":"2024-06-01T10:28:41Z","published":"2024-05-18T12:58:02Z","title":"Estimating the Level of Dialectness Predicts Interannotator Agreement in\n  Multi-dialect Arabic Datasets","summary":"  On annotating multi-dialect Arabic datasets, it is common to randomly assign\nthe samples across a pool of native Arabic speakers. Recent analyses\nrecommended routing dialectal samples to native speakers of their respective\ndialects to build higher-quality datasets. However, automatically identifying\nthe dialect of samples is hard. Moreover, the pool of annotators who are native\nspeakers of specific Arabic dialects might be scarce. Arabic Level of\nDialectness (ALDi) was recently introduced as a quantitative variable that\nmeasures how sentences diverge from Standard Arabic. On randomly assigning\nsamples to annotators, we hypothesize that samples of higher ALDi scores are\nharder to label especially if they are written in dialects that the annotators\ndo not speak. We test this by analyzing the relation between ALDi scores and\nthe annotators' agreement, on 15 public datasets having raw individual sample\nannotations for various sentence-classification tasks. We find strong evidence\nsupporting our hypothesis for 11 of them. Consequently, we recommend\nprioritizing routing samples of high ALDi scores to native speakers of each\nsample's dialect, for which the dialect could be automatically identified at\nhigher accuracies.\n","authors":["Amr Keleg","Walid Magdy","Sharon Goldwater"],"pdf_url":"https://arxiv.org/pdf/2405.11282v2.pdf","comment":"Accepted to ACL 2024 - Main (camera-ready version)"},{"id":"http://arxiv.org/abs/2403.00226v3","updated":"2024-06-01T09:23:22Z","published":"2024-03-01T02:09:25Z","title":"A Semantic Distance Metric Learning approach for Lexical Semantic Change\n  Detection","summary":"  Detecting temporal semantic changes of words is an important task for various\nNLP applications that must make time-sensitive predictions. Lexical Semantic\nChange Detection (SCD) task involves predicting whether a given target word,\n$w$, changes its meaning between two different text corpora, $C_1$ and $C_2$.\nFor this purpose, we propose a supervised two-staged SCD method that uses\nexisting Word-in-Context (WiC) datasets. In the first stage, for a target word\n$w$, we learn two sense-aware encoders that represent the meaning of $w$ in a\ngiven sentence selected from a corpus. Next, in the second stage, we learn a\nsense-aware distance metric that compares the semantic representations of a\ntarget word across all of its occurrences in $C_1$ and $C_2$. Experimental\nresults on multiple benchmark datasets for SCD show that our proposed method\nachieves strong performance in multiple languages. Additionally, our method\nachieves significant improvements on WiC benchmarks compared to a sense-aware\nencoder with conventional distance functions. Source code is available at\nhttps://github.com/LivNLP/svp-sdml .\n","authors":["Taichi Aida","Danushka Bollegala"],"pdf_url":"https://arxiv.org/pdf/2403.00226v3.pdf","comment":"Findings of ACL2024"},{"id":"http://arxiv.org/abs/2311.07466v3","updated":"2024-06-01T07:57:52Z","published":"2023-11-13T16:53:51Z","title":"On Measuring Faithfulness or Self-consistency of Natural Language\n  Explanations","summary":"  Large language models (LLMs) can explain their predictions through post-hoc\nor Chain-of-Thought (CoT) explanations. But an LLM could make up reasonably\nsounding explanations that are unfaithful to its underlying reasoning. Recent\nwork has designed tests that aim to judge the faithfulness of post-hoc or CoT\nexplanations. In this work we argue that these faithfulness tests do not\nmeasure faithfulness to the models' inner workings -- but rather their\nself-consistency at output level. Our contributions are three-fold: i) We\nclarify the status of faithfulness tests in view of model explainability,\ncharacterising them as self-consistency tests instead. This assessment we\nunderline by ii) constructing a Comparative Consistency Bank for\nself-consistency tests that for the first time compares existing tests on a\ncommon suite of 11 open LLMs and 5 tasks -- including iii) our new\nself-consistency measure CC-SHAP. CC-SHAP is a fine-grained measure (not a\ntest) of LLM self-consistency. It compares how a model's input contributes to\nthe predicted answer and to generating the explanation. Our fine-grained\nCC-SHAP metric allows us iii) to compare LLM behaviour when making predictions\nand to analyse the effect of other consistency tests at a deeper level, which\ntakes us one step further towards measuring faithfulness by bringing us closer\nto the internals of the model than strictly surface output-oriented tests. Our\ncode is available at \\url{https://github.com/Heidelberg-NLP/CC-SHAP}\n","authors":["Letitia Parcalabescu","Anette Frank"],"pdf_url":"https://arxiv.org/pdf/2311.07466v3.pdf","comment":"Paper accepted for publication at ACL 2024 Main (Bangkok, Thailand);\n  10 main paper pages, 30 appendix pages"},{"id":"http://arxiv.org/abs/2402.14809v4","updated":"2024-06-01T07:46:28Z","published":"2024-02-22T18:59:02Z","title":"CriticBench: Benchmarking LLMs for Critique-Correct Reasoning","summary":"  The ability of Large Language Models (LLMs) to critique and refine their\nreasoning is crucial for their application in evaluation, feedback provision,\nand self-improvement. This paper introduces CriticBench, a comprehensive\nbenchmark designed to assess LLMs' abilities to critique and rectify their\nreasoning across a variety of tasks. CriticBench encompasses five reasoning\ndomains: mathematical, commonsense, symbolic, coding, and algorithmic. It\ncompiles 15 datasets and incorporates responses from three LLM families.\nUtilizing CriticBench, we evaluate and dissect the performance of 17 LLMs in\ngeneration, critique, and correction reasoning, i.e., GQC reasoning. Our\nfindings reveal: (1) a linear relationship in GQC capabilities, with\ncritique-focused training markedly enhancing performance; (2) a task-dependent\nvariation in correction effectiveness, with logic-oriented tasks being more\namenable to correction; (3) GQC knowledge inconsistencies that decrease as\nmodel size increases; and (4) an intriguing inter-model critiquing dynamic,\nwhere stronger models are better at critiquing weaker ones, while weaker models\ncan surprisingly surpass stronger ones in their self-critique. We hope these\ninsights into the nuanced critique-correct reasoning of LLMs will foster\nfurther research in LLM critique and self-improvement.\n","authors":["Zicheng Lin","Zhibin Gou","Tian Liang","Ruilin Luo","Haowei Liu","Yujiu Yang"],"pdf_url":"https://arxiv.org/pdf/2402.14809v4.pdf","comment":"ACL 2024 Findings"},{"id":"http://arxiv.org/abs/2405.12059v2","updated":"2024-06-01T07:38:37Z","published":"2024-05-20T14:28:25Z","title":"STYLE: Improving Domain Transferability of Asking Clarification\n  Questions in Large Language Model Powered Conversational Agents","summary":"  Equipping a conversational search engine with strategies regarding when to\nask clarification questions is becoming increasingly important across various\ndomains. Attributing to the context understanding capability of LLMs and their\naccess to domain-specific sources of knowledge, LLM-based clarification\nstrategies feature rapid transfer to various domains in a post-hoc manner.\nHowever, they still struggle to deliver promising performance on unseen\ndomains, struggling to achieve effective domain transferability. We take the\nfirst step to investigate this issue and existing methods tend to produce\none-size-fits-all strategies across diverse domains, limiting their search\neffectiveness. In response, we introduce a novel method, called Style, to\nachieve effective domain transferability. Our experimental results indicate\nthat Style bears strong domain transferability, resulting in an average search\nperformance improvement of ~10% on four unseen domains.\n","authors":["Yue Chen","Chen Huang","Yang Deng","Wenqiang Lei","Dingnan Jin","Jia Liu","Tat-Seng Chua"],"pdf_url":"https://arxiv.org/pdf/2405.12059v2.pdf","comment":"Accepted to Findings of ACL 2024. Camera Ready"},{"id":"http://arxiv.org/abs/2405.12063v2","updated":"2024-06-01T07:35:26Z","published":"2024-05-20T14:34:01Z","title":"CLAMBER: A Benchmark of Identifying and Clarifying Ambiguous Information\n  Needs in Large Language Models","summary":"  Large language models (LLMs) are increasingly used to meet user information\nneeds, but their effectiveness in dealing with user queries that contain\nvarious types of ambiguity remains unknown, ultimately risking user trust and\nsatisfaction. To this end, we introduce CLAMBER, a benchmark for evaluating\nLLMs using a well-organized taxonomy. Building upon the taxonomy, we construct\n~12K high-quality data to assess the strengths, weaknesses, and potential risks\nof various off-the-shelf LLMs. Our findings indicate the limited practical\nutility of current LLMs in identifying and clarifying ambiguous user queries,\neven enhanced by chain-of-thought (CoT) and few-shot prompting. These\ntechniques may result in overconfidence in LLMs and yield only marginal\nenhancements in identifying ambiguity. Furthermore, current LLMs fall short in\ngenerating high-quality clarifying questions due to a lack of conflict\nresolution and inaccurate utilization of inherent knowledge. In this paper,\nCLAMBER presents a guidance and promotes further research on proactive and\ntrustworthy LLMs. Our dataset is available at\nhttps://github.com/zt991211/CLAMBER\n","authors":["Tong Zhang","Peixin Qin","Yang Deng","Chen Huang","Wenqiang Lei","Junhong Liu","Dingnan Jin","Hongru Liang","Tat-Seng Chua"],"pdf_url":"https://arxiv.org/pdf/2405.12063v2.pdf","comment":"Accepted to ACL 2024. Camera Ready. Our dataset is available at\n  https://github.com/zt991211/CLAMBER"},{"id":"http://arxiv.org/abs/2405.11912v2","updated":"2024-06-01T07:30:51Z","published":"2024-05-20T09:48:15Z","title":"ARAIDA: Analogical Reasoning-Augmented Interactive Data Annotation","summary":"  Human annotation is a time-consuming task that requires a significant amount\nof effort. To address this issue, interactive data annotation utilizes an\nannotation model to provide suggestions for humans to approve or correct.\nHowever, annotation models trained with limited labeled data are prone to\ngenerating incorrect suggestions, leading to extra human correction effort. To\ntackle this challenge, we propose Araida, an analogical reasoning-based\napproach that enhances automatic annotation accuracy in the interactive data\nannotation setting and reduces the need for human corrections. Araida involves\nan error-aware integration strategy that dynamically coordinates an annotation\nmodel and a k-nearest neighbors (KNN) model, giving more importance to KNN's\npredictions when predictions from the annotation model are deemed inaccurate.\nEmpirical studies demonstrate that Araida is adaptable to different annotation\ntasks and models. On average, it reduces human correction labor by 11.02%\ncompared to vanilla interactive data annotation methods.\n","authors":["Chen Huang","Yiping Jin","Ilija Ilievski","Wenqiang Lei","Jiancheng Lv"],"pdf_url":"https://arxiv.org/pdf/2405.11912v2.pdf","comment":"Accepted to ACL 2024. Camera Ready"},{"id":"http://arxiv.org/abs/2402.11896v2","updated":"2024-06-01T07:13:15Z","published":"2024-02-19T07:22:29Z","title":"SIBO: A Simple Booster for Parameter-Efficient Fine-Tuning","summary":"  Fine-tuning all parameters of large language models (LLMs) necessitates\nsubstantial computational power and extended time. Latest advancements in\nparameter-efficient fine-tuning (PEFT) techniques, such as Adapter tuning and\nLoRA, allow for adjustments to only a minor fraction of the parameters of these\nLLMs. Concurrently, it has been noted that the issue of over-smoothing\ndiminishes the effectiveness of these Transformer-based LLMs, resulting in\nsuboptimal performances in downstream tasks. In this paper, we present SIBO,\nwhich is a SImple BOoster to enhance PEFT, by injecting an initial residual.\nSIBO is straightforward and readily extensible to a range of state-of-the-art\nPEFT techniques to alleviate over-smoothing and enhance performance. Extensive\nexperiments on 22 benchmark datasets demonstrate that SIBO significantly\nenhances the performance of various strong baselines, achieving up to 15.7% and\n23.5% improvement over existing PEFT methods on the arithmetic and commonsense\nreasoning tasks, respectively.\n","authors":["Zhihao Wen","Jie Zhang","Yuan Fang"],"pdf_url":"https://arxiv.org/pdf/2402.11896v2.pdf","comment":"Accepted by ACL 2024, 17 pages"},{"id":"http://arxiv.org/abs/2404.13611v2","updated":"2024-06-01T06:56:16Z","published":"2024-04-21T10:41:04Z","title":"Video sentence grounding with temporally global textual knowledge","summary":"  Temporal sentence grounding involves the retrieval of a video moment with a\nnatural language query. Many existing works directly incorporate the given\nvideo and temporally localized query for temporal grounding, overlooking the\ninherent domain gap between different modalities. In this paper, we utilize\npseudo-query features containing extensive temporally global textual knowledge\nsourced from the same video-query pair, to enhance the bridging of domain gaps\nand attain a heightened level of similarity between multi-modal features.\nSpecifically, we propose a Pseudo-query Intermediary Network (PIN) to achieve\nan improved alignment of visual and comprehensive pseudo-query features within\nthe feature space through contrastive learning. Subsequently, we utilize\nlearnable prompts to encapsulate the knowledge of pseudo-queries, propagating\nthem into the textual encoder and multi-modal fusion module, further enhancing\nthe feature alignment between visual and language for better temporal\ngrounding. Extensive experiments conducted on the Charades-STA and\nActivityNet-Captions datasets demonstrate the effectiveness of our method.\n","authors":["Cai Chen","Runzhong Zhang","Jianjun Gao","Kejun Wu","Kim-Hui Yap","Yi Wang"],"pdf_url":"https://arxiv.org/pdf/2404.13611v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.07529v3","updated":"2024-06-01T06:14:37Z","published":"2024-01-15T08:19:22Z","title":"MM-SAP: A Comprehensive Benchmark for Assessing Self-Awareness of\n  Multimodal Large Language Models in Perception","summary":"  Recent advancements in Multimodal Large Language Models (MLLMs) have\ndemonstrated exceptional capabilities in visual perception and understanding.\nHowever, these models also suffer from hallucinations, which limit their\nreliability as AI systems. We believe that these hallucinations are partially\ndue to the models' struggle with understanding what they can and cannot\nperceive from images, a capability we refer to as self-awareness in perception.\nDespite its importance, this aspect of MLLMs has been overlooked in prior\nstudies. In this paper, we aim to define and evaluate the self-awareness of\nMLLMs in perception. To do this, we first introduce the knowledge quadrant in\nperception, which helps define what MLLMs know and do not know about images.\nUsing this framework, we propose a novel benchmark, the Self-Awareness in\nPerception for MLLMs (MM-SAP), specifically designed to assess this capability.\nWe apply MM-SAP to a variety of popular MLLMs, offering a comprehensive\nanalysis of their self-awareness and providing detailed insights. The\nexperiment results reveal that current MLLMs possess limited self-awareness\ncapabilities, pointing to a crucial area for future advancement in the\ndevelopment of trustworthy MLLMs. Code and data are available at\nhttps://github.com/YHWmz/MM-SAP.\n","authors":["Yuhao Wang","Yusheng Liao","Heyang Liu","Hongcheng Liu","Yu Wang","Yanfeng Wang"],"pdf_url":"https://arxiv.org/pdf/2401.07529v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.07616v3","updated":"2024-06-01T04:52:17Z","published":"2024-02-12T12:48:02Z","title":"Anchor-based Large Language Models","summary":"  Large language models (LLMs) predominantly employ decoder-only transformer\narchitectures, necessitating the retention of keys/values information for\nhistorical tokens to provide contextual information and avoid redundant\ncomputation. However, the substantial size and parameter volume of these LLMs\nrequire massive GPU memory. This memory demand increases with the length of the\ninput text, leading to an urgent need for more efficient methods of information\nstorage and processing. This study introduces Anchor-based LLMs (AnLLMs), which\nutilize an innovative anchor-based self-attention network (AnSAN) and also an\nanchor-based inference strategy. This approach enables LLMs to compress\nsequence information into an anchor token, reducing the keys/values cache and\nenhancing inference efficiency. Experiments on question-answering benchmarks\nreveal that AnLLMs maintain similar accuracy levels while achieving up to 99%\nkeys/values cache reduction and up to 3.5 times faster inference. Despite a\nminor compromise in accuracy, the substantial enhancements of AnLLMs employing\nthe AnSAN technique in resource utilization and computational efficiency\nunderscore their potential for practical LLM applications.\n","authors":["Jianhui Pang","Fanghua Ye","Derek Fai Wong","Xin He","Wanshun Chen","Longyue Wang"],"pdf_url":"https://arxiv.org/pdf/2402.07616v3.pdf","comment":"The paper has been accepted by the ACL2024 conference. Work was done\n  when Jianhui Pang and Fanghua Ye were interning at Tencent AI Lab"},{"id":"http://arxiv.org/abs/2402.11192v2","updated":"2024-06-01T03:36:23Z","published":"2024-02-17T05:05:31Z","title":"I Learn Better If You Speak My Language: Understanding the Superior\n  Performance of Fine-Tuning Large Language Models with LLM-Generated Responses","summary":"  This paper explores an intriguing observation: fine-tuning a large language\nmodel (LLM) with responses generated by a LLM often yields better results than\nusing responses generated by humans. We conduct an in-depth investigation to\nunderstand why this occurs. Contrary to the common belief that these instances\nis simply due to the more detailed nature of LLM-generated content, our study\nidentifies another contributing factor: an LLM is inherently more \"familiar\"\nwith LLM generated responses. This familiarity is evidenced by lower perplexity\nbefore fine-tuning. We design a series of experiments to understand the impact\nof the \"familiarity\" and our conclusion reveals that this \"familiarity\"\nsignificantly impacts learning performance. Training with LLM-generated\nresponses not only enhances performance but also helps maintain the model's\ncapabilities in other tasks after fine-tuning on a specific task.\n","authors":["Xuan Ren","Biao Wu","Lingqiao Liu"],"pdf_url":"https://arxiv.org/pdf/2402.11192v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.19616v2","updated":"2024-06-01T03:00:37Z","published":"2024-05-30T02:09:51Z","title":"Easy Problems That LLMs Get Wrong","summary":"  We introduce a comprehensive Linguistic Benchmark designed to evaluate the\nlimitations of Large Language Models (LLMs) in domains such as logical\nreasoning, spatial intelligence, and linguistic understanding, among others.\nThrough a series of straightforward questions, it uncovers the significant\nlimitations of well-regarded models to perform tasks that humans manage with\nease. It also highlights the potential of prompt engineering to mitigate some\nerrors and underscores the necessity for better training methodologies. Our\nfindings stress the importance of grounding LLMs with human reasoning and\ncommon sense, emphasising the need for human-in-the-loop for enterprise\napplications. We hope this work paves the way for future research to enhance\nthe usefulness and reliability of new models.\n","authors":["Sean Williams","James Huckle"],"pdf_url":"https://arxiv.org/pdf/2405.19616v2.pdf","comment":"AutogenAI Ltd. GitHub Repo:\n  https://github.com/autogenai/easy-problems-that-llms-get-wrong"},{"id":"http://arxiv.org/abs/2405.18952v2","updated":"2024-06-01T02:18:06Z","published":"2024-05-29T10:08:31Z","title":"Are You Sure? Rank Them Again: Repeated Ranking For Better Preference\n  Datasets","summary":"  Training Large Language Models (LLMs) with Reinforcement Learning from AI\nFeedback (RLAIF) aligns model outputs more closely with human preferences. This\ninvolves an evaluator model ranking multiple candidate responses to user\nprompts. However, the rankings from popular evaluator models such as GPT-4 can\nbe inconsistent. We propose the Repeat Ranking method - where we evaluate the\nsame responses multiple times and train only on those responses which are\nconsistently ranked. Using 2,714 prompts in 62 languages, we generated\nresponses from 7 top multilingual LLMs and had GPT-4 rank them five times each.\nEvaluating on MT-Bench chat benchmarks in six languages, our method\noutperformed the standard practice of training on all available prompts. Our\nwork highlights the quality versus quantity trade-off in RLAIF dataset\ngeneration and offers a stackable strategy for enhancing dataset and thus model\nquality.\n","authors":["Peter Devine"],"pdf_url":"https://arxiv.org/pdf/2405.18952v2.pdf","comment":null}],"Computer Vision and Pattern Recognition":[{"id":"http://arxiv.org/abs/2404.13445v2","updated":"2024-06-01T23:28:55Z","published":"2024-04-20T18:52:51Z","title":"DMesh: A Differentiable Mesh Representation","summary":"  We present a differentiable representation, DMesh, for general 3D triangular\nmeshes. DMesh considers both the geometry and connectivity information of a\nmesh. In our design, we first get a set of convex tetrahedra that compactly\ntessellates the domain based on Weighted Delaunay Triangulation (WDT), and\nselect triangular faces on the tetrahedra to define the final mesh. We\nformulate probability of faces to exist on the actual surface in a\ndifferentiable manner based on the WDT. This enables DMesh to represent meshes\nof various topology in a differentiable way, and allows us to reconstruct the\nmesh under various observations, such as point cloud and multi-view images\nusing gradient-based optimization. The source code and full paper is available\nat: https://sonsang.github.io/dmesh-project.\n","authors":["Sanghyun Son","Matheus Gadelha","Yang Zhou","Zexiang Xu","Ming C. Lin","Yi Zhou"],"pdf_url":"https://arxiv.org/pdf/2404.13445v2.pdf","comment":"35 pages, 22 figures. Updated with more analysis and experimental\n  results"},{"id":"http://arxiv.org/abs/2405.11133v2","updated":"2024-06-01T22:48:27Z","published":"2024-05-18T01:09:02Z","title":"XCAT-3.0: A Comprehensive Library of Personalized Digital Twins Derived\n  from CT Scans","summary":"  Virtual Imaging Trials (VIT) offer a cost-effective and scalable approach for\nevaluating medical imaging technologies. Computational phantoms, which mimic\nreal patient anatomy and physiology, play a central role in VIT. However, the\ncurrent libraries of computational phantoms face limitations, particularly in\nterms of sample size and diversity. Insufficient representation of the\npopulation hampers accurate assessment of imaging technologies across different\npatient groups. Traditionally, phantoms were created by manual segmentation,\nwhich is a laborious and time-consuming task, impeding the expansion of phantom\nlibraries. This study presents a framework for realistic computational phantom\nmodeling using a suite of four deep learning segmentation models, followed by\nthree forms of automated organ segmentation quality control. Over 2500\ncomputational phantoms with up to 140 structures illustrating a sophisticated\napproach to detailed anatomical modeling are released. Phantoms are available\nin both voxelized and surface mesh formats. The framework is aggregated with an\nin-house CT scanner simulator to produce realistic CT images. The framework can\npotentially advance virtual imaging trials, facilitating comprehensive and\nreliable evaluations of medical imaging technologies. Phantoms may be requested\nat https://cvit.duke.edu/resources/, code, model weights, and sample CT images\nare available at https://xcat-3.github.io.\n","authors":["Lavsen Dahal","Mobina Ghojoghnejad","Dhrubajyoti Ghosh","Yubraj Bhandari","David Kim","Fong Chi Ho","Fakrul Islam Tushar","Sheng Luoa","Kyle J. Lafata","Ehsan Abadi","Ehsan Samei","Joseph Y. Lo","W. Paul Segars"],"pdf_url":"https://arxiv.org/pdf/2405.11133v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.17166v2","updated":"2024-06-01T22:06:36Z","published":"2023-03-30T05:57:59Z","title":"Deep Single Image Camera Calibration by Heatmap Regression to Recover\n  Fisheye Images Under Manhattan World Assumption","summary":"  A Manhattan world lying along cuboid buildings is useful for camera angle\nestimation. However, accurate and robust angle estimation from fisheye images\nin the Manhattan world has remained an open challenge because general scene\nimages tend to lack constraints such as lines, arcs, and vanishing points. To\nachieve higher accuracy and robustness, we propose a learning-based calibration\nmethod that uses heatmap regression, which is similar to pose estimation using\nkeypoints, to detect the directions of labeled image coordinates.\nSimultaneously, our two estimators recover the rotation and remove fisheye\ndistortion by remapping from a general scene image. Without considering\nvanishing-point constraints, we find that additional points for learning-based\nmethods can be defined. To compensate for the lack of vanishing points in\nimages, we introduce auxiliary diagonal points that have the optimal 3D\narrangement of spatial uniformity. Extensive experiments demonstrated that our\nmethod outperforms conventional methods on large-scale datasets and with\noff-the-shelf cameras.\n","authors":["Nobuhiko Wakai","Satoshi Sato","Yasunori Ishii","Takayoshi Yamashita"],"pdf_url":"https://arxiv.org/pdf/2303.17166v2.pdf","comment":"Accepted by CVPR2024"},{"id":"http://arxiv.org/abs/2402.08017v2","updated":"2024-06-01T21:46:50Z","published":"2024-02-12T19:27:26Z","title":"Lumos : Empowering Multimodal LLMs with Scene Text Recognition","summary":"  We introduce Lumos, the first end-to-end multimodal question-answering system\nwith text understanding capabilities. At the core of Lumos is a Scene Text\nRecognition (STR) component that extracts text from first person point-of-view\nimages, the output of which is used to augment input to a Multimodal Large\nLanguage Model (MM-LLM). While building Lumos, we encountered numerous\nchallenges related to STR quality, overall latency, and model inference. In\nthis paper, we delve into those challenges, and discuss the system\narchitecture, design choices, and modeling techniques employed to overcome\nthese obstacles. We also provide a comprehensive evaluation for each component,\nshowcasing high quality and efficiency.\n","authors":["Ashish Shenoy","Yichao Lu","Srihari Jayakumar","Debojeet Chatterjee","Mohsen Moslehpour","Pierce Chuang","Abhay Harpale","Vikas Bhardwaj","Di Xu","Shicong Zhao","Longfang Zhao","Ankit Ramchandani","Xin Luna Dong","Anuj Kumar"],"pdf_url":"https://arxiv.org/pdf/2402.08017v2.pdf","comment":"Accepted to KDD 2024 (ADS Track)"},{"id":"http://arxiv.org/abs/2405.18541v2","updated":"2024-06-01T20:53:23Z","published":"2024-05-28T19:16:59Z","title":"Low-Rank Few-Shot Adaptation of Vision-Language Models","summary":"  Recent progress in the few-shot adaptation of Vision-Language Models (VLMs)\nhas further pushed their generalization capabilities, at the expense of just a\nfew labeled samples within the target downstream task. However, this promising,\nalready quite abundant few-shot literature has focused principally on prompt\nlearning and, to a lesser extent, on adapters, overlooking the recent advances\nin Parameter-Efficient Fine-Tuning (PEFT). Furthermore, existing few-shot\nlearning methods for VLMs often rely on heavy training procedures and/or\ncarefully chosen, task-specific hyper-parameters, which might impede their\napplicability. In response, we introduce Low-Rank Adaptation (LoRA) in few-shot\nlearning for VLMs, and show its potential on 11 datasets, in comparison to\ncurrent state-of-the-art prompt- and adapter-based approaches. Surprisingly,\nour simple CLIP-LoRA method exhibits substantial improvements, while reducing\nthe training times and keeping the same hyper-parameters in all the target\ntasks, i.e., across all the datasets and numbers of shots. Certainly, our\nsurprising results do not dismiss the potential of prompt-learning and\nadapter-based research. However, we believe that our strong baseline could be\nused to evaluate progress in these emergent subjects in few-shot VLMs.\n","authors":["Maxime Zanella","Ismail Ben Ayed"],"pdf_url":"https://arxiv.org/pdf/2405.18541v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.11021v2","updated":"2024-06-01T20:07:53Z","published":"2024-05-17T18:00:07Z","title":"Enhanced 3D Urban Scene Reconstruction and Point Cloud Densification\n  using Gaussian Splatting and Google Earth Imagery","summary":"  3D urban scene reconstruction and modelling is a crucial research area in\nremote sensing with numerous applications in academia, commerce, industry, and\nadministration. Recent advancements in view synthesis models have facilitated\nphotorealistic 3D reconstruction solely from 2D images. Leveraging Google Earth\nimagery, we construct a 3D Gaussian Splatting model of the Waterloo region\ncentered on the University of Waterloo and are able to achieve view-synthesis\nresults far exceeding previous 3D view-synthesis results based on neural\nradiance fields which we demonstrate in our benchmark. Additionally, we\nretrieved the 3D geometry of the scene using the 3D point cloud extracted from\nthe 3D Gaussian Splatting model which we benchmarked against our Multi-\nView-Stereo dense reconstruction of the scene, thereby reconstructing both the\n3D geometry and photorealistic lighting of the large-scale urban scene through\n3D Gaussian Splatting\n","authors":["Kyle Gao","Dening Lu","Hongjie He","Linlin Xu","Jonathan Li"],"pdf_url":"https://arxiv.org/pdf/2405.11021v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.14874v2","updated":"2024-06-01T20:05:26Z","published":"2024-04-01T14:18:15Z","title":"Investigating Robustness of Open-Vocabulary Foundation Object Detectors\n  under Distribution Shifts","summary":"  The challenge of Out-Of-Distribution (OOD) robustness remains a critical\nhurdle towards deploying deep vision models. Open-vocabulary object detection\nextends the capabilities of traditional object detection frameworks to\nrecognize and classify objects beyond predefined categories. Investigating OOD\nrobustness in open-vocabulary object detection is essential to increase the\ntrustworthiness of these models. This study presents a comprehensive robustness\nevaluation of zero-shot capabilities of three recent open-vocabulary foundation\nobject detection models, namely OWL-ViT, YOLO World, and Grounding DINO.\nExperiments carried out on the COCO-O and COCO-C benchmarks encompassing\ndistribution shifts highlight the challenges of the models' robustness. Source\ncode shall be made available to the research community on GitHub.\n","authors":["Prakash Chandra Chhipa","Kanjar De","Meenakshi Subhash Chippa","Rajkumar Saini","Marcus Liwicki"],"pdf_url":"https://arxiv.org/pdf/2405.14874v2.pdf","comment":"13 + 3 single column pages"},{"id":"http://arxiv.org/abs/2403.00174v3","updated":"2024-06-01T16:35:45Z","published":"2024-02-29T22:58:13Z","title":"A citizen science toolkit to collect human perceptions of urban\n  environments using open street view images","summary":"  Street View Imagery (SVI) is a valuable data source for studies (e.g.,\nenvironmental assessments, green space identification or land cover\nclassification). While commercial SVI is available, such providers commonly\nrestrict copying or reuse in ways necessary for research. Open SVI datasets are\nreadily available from less restrictive sources, such as Mapillary, but due to\nthe heterogeneity of the images, these require substantial preprocessing,\nfiltering, and careful quality checks. We present an efficient method for\nautomated downloading, processing, cropping, and filtering open SVI, to be used\nin a survey of human perceptions of the streets portrayed in these images. We\ndemonstrate our open-source reusable SVI preparation and smartphone-friendly\nperception-survey software with Amsterdam (Netherlands) as the case study.\nUsing a citizen science approach, we collected from 331 people 22,637 ratings\nabout their perceptions for various criteria. We have published our software in\na public repository for future re-use and reproducibility.\n","authors":["Matthew Danish","SM Labib","Britta Ricker","Marco Helbich"],"pdf_url":"https://arxiv.org/pdf/2403.00174v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.15264v2","updated":"2024-06-01T16:28:16Z","published":"2023-11-26T10:38:47Z","title":"ChAda-ViT : Channel Adaptive Attention for Joint Representation Learning\n  of Heterogeneous Microscopy Images","summary":"  Unlike color photography images, which are consistently encoded into RGB\nchannels, biological images encompass various modalities, where the type of\nmicroscopy and the meaning of each channel varies with each experiment.\nImportantly, the number of channels can range from one to a dozen and their\ncorrelation is often comparatively much lower than RGB, as each of them brings\nspecific information content. This aspect is largely overlooked by methods\ndesigned out of the bioimage field, and current solutions mostly focus on\nintra-channel spatial attention, often ignoring the relationship between\nchannels, yet crucial in most biological applications. Importantly, the\nvariable channel type and count prevent the projection of several experiments\nto a unified representation for large scale pre-training. In this study, we\npropose ChAda-ViT, a novel Channel Adaptive Vision Transformer architecture\nemploying an Inter-Channel Attention mechanism on images with an arbitrary\nnumber, order and type of channels. We also introduce IDRCell100k, a bioimage\ndataset with a rich set of 79 experiments covering 7 microscope modalities,\nwith a multitude of channel types, and counts varying from 1 to 10 per\nexperiment. Our architecture, trained in a self-supervised manner, outperforms\nexisting approaches in several biologically relevant downstream tasks.\nAdditionally, it can be used to bridge the gap for the first time between\nassays with different microscopes, channel numbers or types by embedding\nvarious image and experimental modalities into a unified biological image\nrepresentation. The latter should facilitate interdisciplinary studies and pave\nthe way for better adoption of deep learning in biological image-based\nanalyses. Code and Data available at https://github.com/nicoboou/chadavit.\n","authors":["Nicolas Bourriez","Ihab Bendidi","Ethan Cohen","Gabriel Watkinson","Maxime Sanchez","Guillaume Bollot","Auguste Genovesio"],"pdf_url":"https://arxiv.org/pdf/2311.15264v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.00521v4","updated":"2024-06-01T16:22:54Z","published":"2024-03-31T01:41:36Z","title":"CHAIN: Enhancing Generalization in Data-Efficient GANs via lipsCHitz\n  continuity constrAIned Normalization","summary":"  Generative Adversarial Networks (GANs) significantly advanced image\ngeneration but their performance heavily depends on abundant training data. In\nscenarios with limited data, GANs often struggle with discriminator overfitting\nand unstable training. Batch Normalization (BN), despite being known for\nenhancing generalization and training stability, has rarely been used in the\ndiscriminator of Data-Efficient GANs. Our work addresses this gap by\nidentifying a critical flaw in BN: the tendency for gradient explosion during\nthe centering and scaling steps. To tackle this issue, we present CHAIN\n(lipsCHitz continuity constrAIned Normalization), which replaces the\nconventional centering step with zero-mean regularization and integrates a\nLipschitz continuity constraint in the scaling step. CHAIN further enhances GAN\ntraining by adaptively interpolating the normalized and unnormalized features,\neffectively avoiding discriminator overfitting. Our theoretical analyses firmly\nestablishes CHAIN's effectiveness in reducing gradients in latent features and\nweights, improving stability and generalization in GAN training. Empirical\nevidence supports our theory. CHAIN achieves state-of-the-art results in\ndata-limited scenarios on CIFAR-10/100, ImageNet, five low-shot and seven\nhigh-resolution few-shot image datasets. Code:\nhttps://github.com/MaxwellYaoNi/CHAIN\n","authors":["Yao Ni","Piotr Koniusz"],"pdf_url":"https://arxiv.org/pdf/2404.00521v4.pdf","comment":"Accepted by CVPR 2024. 26 pages. Improve Lemma 3.1 - Prop. 3.1 logic\n  flow. Code: https://github.com/MaxwellYaoNi/CHAIN"},{"id":"http://arxiv.org/abs/2405.14832v2","updated":"2024-06-01T16:18:53Z","published":"2024-05-23T17:49:37Z","title":"Direct3D: Scalable Image-to-3D Generation via 3D Latent Diffusion\n  Transformer","summary":"  Generating high-quality 3D assets from text and images has long been\nchallenging, primarily due to the absence of scalable 3D representations\ncapable of capturing intricate geometry distributions. In this work, we\nintroduce Direct3D, a native 3D generative model scalable to in-the-wild input\nimages, without requiring a multiview diffusion model or SDS optimization. Our\napproach comprises two primary components: a Direct 3D Variational Auto-Encoder\n(D3D-VAE) and a Direct 3D Diffusion Transformer (D3D-DiT). D3D-VAE efficiently\nencodes high-resolution 3D shapes into a compact and continuous latent triplane\nspace. Notably, our method directly supervises the decoded geometry using a\nsemi-continuous surface sampling strategy, diverging from previous methods\nrelying on rendered images as supervision signals. D3D-DiT models the\ndistribution of encoded 3D latents and is specifically designed to fuse\npositional information from the three feature maps of the triplane latent,\nenabling a native 3D generative model scalable to large-scale 3D datasets.\nAdditionally, we introduce an innovative image-to-3D generation pipeline\nincorporating semantic and pixel-level image conditions, allowing the model to\nproduce 3D shapes consistent with the provided conditional image input.\nExtensive experiments demonstrate the superiority of our large-scale\npre-trained Direct3D over previous image-to-3D approaches, achieving\nsignificantly better generation quality and generalization ability, thus\nestablishing a new state-of-the-art for 3D content creation. Project page:\nhttps://nju-3dv.github.io/projects/Direct3D/.\n","authors":["Shuang Wu","Youtian Lin","Feihu Zhang","Yifei Zeng","Jingxi Xu","Philip Torr","Xun Cao","Yao Yao"],"pdf_url":"https://arxiv.org/pdf/2405.14832v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.05474v2","updated":"2024-06-01T16:17:52Z","published":"2023-08-10T10:01:56Z","title":"Spatio-Temporal Encoding of Brain Dynamics with Surface Masked\n  Autoencoders","summary":"  The development of robust and generalisable models for encoding the\nspatio-temporal dynamics of human brain activity is crucial for advancing\nneuroscientific discoveries. However, significant individual variation in the\norganisation of the human cerebral cortex makes it difficult to identify\npopulation-level trends in these signals. Recently, Surface Vision Transformers\n(SiTs) have emerged as a promising approach for modelling cortical signals, yet\nthey face some limitations in low-data scenarios due to the lack of inductive\nbiases in their architecture. To address these challenges, this paper proposes\nthe surface Masked AutoEncoder (sMAE) and video surface Masked AutoEncoder\n(vsMAE) - for multivariate and spatio-temporal pre-training of cortical signals\nover regular icosahedral grids. These models are trained to reconstruct\ncortical feature maps from masked versions of the input by learning strong\nlatent representations of cortical structure and function. Such representations\ntranslate into better modelling of individual phenotypes and enhanced\nperformance in downstream tasks. The proposed approach was evaluated on\ncortical phenotype regression using data from the young adult Human Connectome\nProject (HCP) and developing HCP (dHCP). Results show that (v)sMAE pre-trained\nmodels improve phenotyping prediction performance on multiple tasks by $\\ge\n26\\%$, and offer faster convergence relative to models trained from scratch.\nFinally, we show that pre-training vision transformers on large datasets, such\nas the UK Biobank (UKB), supports transfer learning to low-data regimes. Our\ncode and pre-trained models are publicly available at\nhttps://github.com/metrics-lab/surface-masked-autoencoders .\n","authors":["Simon Dahan","Logan Z. J. Williams","Yourong Guo","Daniel Rueckert","Emma C. Robinson"],"pdf_url":"https://arxiv.org/pdf/2308.05474v2.pdf","comment":"Accepted for publications for MIDL 2024; 20 figures; 7 figures"},{"id":"http://arxiv.org/abs/2403.18791v2","updated":"2024-06-01T15:25:47Z","published":"2024-03-27T17:35:24Z","title":"Object Pose Estimation via the Aggregation of Diffusion Features","summary":"  Estimating the pose of objects from images is a crucial task of 3D scene\nunderstanding, and recent approaches have shown promising results on very large\nbenchmarks. However, these methods experience a significant performance drop\nwhen dealing with unseen objects. We believe that it results from the limited\ngeneralizability of image features. To address this problem, we have an\nin-depth analysis on the features of diffusion models, e.g. Stable Diffusion,\nwhich hold substantial potential for modeling unseen objects. Based on this\nanalysis, we then innovatively introduce these diffusion features for object\npose estimation. To achieve this, we propose three distinct architectures that\ncan effectively capture and aggregate diffusion features of different\ngranularity, greatly improving the generalizability of object pose estimation.\nOur approach outperforms the state-of-the-art methods by a considerable margin\non three popular benchmark datasets, LM, O-LM, and T-LESS. In particular, our\nmethod achieves higher accuracy than the previous best arts on unseen objects:\n98.2% vs. 93.5% on Unseen LM, 85.9% vs. 76.3% on Unseen O-LM, showing the\nstrong generalizability of our method. Our code is released at\nhttps://github.com/Tianfu18/diff-feats-pose.\n","authors":["Tianfu Wang","Guosheng Hu","Hongguang Wang"],"pdf_url":"https://arxiv.org/pdf/2403.18791v2.pdf","comment":"Accepted to CVPR2024"},{"id":"http://arxiv.org/abs/2405.15475v2","updated":"2024-06-01T14:39:30Z","published":"2024-05-24T11:53:27Z","title":"Efficient Degradation-aware Any Image Restoration","summary":"  Reconstructing missing details from degraded low-quality inputs poses a\nsignificant challenge. Recent progress in image restoration has demonstrated\nthe efficacy of learning large models capable of addressing various\ndegradations simultaneously. Nonetheless, these approaches introduce\nconsiderable computational overhead and complex learning paradigms, limiting\ntheir practical utility. In response, we propose \\textit{DaAIR}, an efficient\nAll-in-One image restorer employing a Degradation-aware Learner (DaLe) in the\nlow-rank regime to collaboratively mine shared aspects and subtle nuances\nacross diverse degradations, generating a degradation-aware embedding. By\ndynamically allocating model capacity to input degradations, we realize an\nefficient restorer integrating holistic and specific learning within a unified\nmodel. Furthermore, DaAIR introduces a cost-efficient parameter update\nmechanism that enhances degradation awareness while maintaining computational\nefficiency. Extensive comparisons across five image degradations demonstrate\nthat our DaAIR outperforms both state-of-the-art All-in-One models and\ndegradation-specific counterparts, affirming our efficacy and practicality. The\nsource will be publicly made available at https://eduardzamfir.github.io/daair/\n","authors":["Eduard Zamfir","Zongwei Wu","Nancy Mehta","Danda Pani Paudel","Yulun Zhang","Radu Timofte"],"pdf_url":"https://arxiv.org/pdf/2405.15475v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.11909v2","updated":"2024-06-01T14:34:15Z","published":"2023-03-21T15:00:17Z","title":"The Multiscale Surface Vision Transformer","summary":"  Surface meshes are a favoured domain for representing structural and\nfunctional information on the human cortex, but their complex topology and\ngeometry pose significant challenges for deep learning analysis. While\nTransformers have excelled as domain-agnostic architectures for\nsequence-to-sequence learning, the quadratic cost of the self-attention\noperation remains an obstacle for many dense prediction tasks. Inspired by some\nof the latest advances in hierarchical modelling with vision transformers, we\nintroduce the Multiscale Surface Vision Transformer (MS-SiT) as a backbone\narchitecture for surface deep learning. The self-attention mechanism is applied\nwithin local-mesh-windows to allow for high-resolution sampling of the\nunderlying data, while a shifted-window strategy improves the sharing of\ninformation between windows. Neighbouring patches are successively merged,\nallowing the MS-SiT to learn hierarchical representations suitable for any\nprediction task. Results demonstrate that the MS-SiT outperforms existing\nsurface deep learning methods for neonatal phenotyping prediction tasks using\nthe Developing Human Connectome Project (dHCP) dataset. Furthermore, building\nthe MS-SiT backbone into a U-shaped architecture for surface segmentation\ndemonstrates competitive results on cortical parcellation using the UK Biobank\n(UKB) and manually-annotated MindBoggle datasets. Code and trained models are\npublicly available at\nhttps://github.com/metrics-lab/surface-vision-transformers.\n","authors":["Simon Dahan","Logan Z. J. Williams","Daniel Rueckert","Emma C. Robinson"],"pdf_url":"https://arxiv.org/pdf/2303.11909v2.pdf","comment":"Accepted for publication at MIDL 2024, 17 pages, 6 figures"},{"id":"http://arxiv.org/abs/2402.08671v3","updated":"2024-06-01T11:34:13Z","published":"2024-02-13T18:53:13Z","title":"Are Semi-Dense Detector-Free Methods Good at Matching Local Features?","summary":"  Semi-dense detector-free approaches (SDF), such as LoFTR, are currently among\nthe most popular image matching methods. While SDF methods are trained to\nestablish correspondences between two images, their performances are almost\nexclusively evaluated using relative pose estimation metrics. Thus, the link\nbetween their ability to establish correspondences and the quality of the\nresulting estimated pose has thus far received little attention. This paper is\na first attempt to study this link. We start with proposing a novel structured\nattention-based image matching architecture (SAM). It allows us to show a\ncounter-intuitive result on two datasets (MegaDepth and HPatches): on the one\nhand SAM either outperforms or is on par with SDF methods in terms of\npose/homography estimation metrics, but on the other hand SDF approaches are\nsignificantly better than SAM in terms of matching accuracy. We then propose to\nlimit the computation of the matching accuracy to textured regions, and show\nthat in this case SAM often surpasses SDF methods. Our findings highlight a\nstrong correlation between the ability to establish accurate correspondences in\ntextured regions and the accuracy of the resulting estimated pose/homography.\nOur code will be made available.\n","authors":["Matthieu Vilain","Rémi Giraud","Hugo Germain","Guillaume Bourmaud"],"pdf_url":"https://arxiv.org/pdf/2402.08671v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.12063v2","updated":"2024-06-01T10:54:50Z","published":"2024-02-09T02:23:47Z","title":"Consistency Model is an Effective Posterior Sample Approximation for\n  Diffusion Inverse Solvers","summary":"  Diffusion Inverse Solvers (DIS) are designed to sample from the conditional\ndistribution $p_{\\theta}(X_0|y)$, with a predefined diffusion model\n$p_{\\theta}(X_0)$, an operator $f(\\cdot)$, and a measurement $y=f(x'_0)$\nderived from an unknown image $x'_0$. Existing DIS estimate the conditional\nscore function by evaluating $f(\\cdot)$ with an approximated posterior sample\ndrawn from $p_{\\theta}(X_0|X_t)$. However, most prior approximations rely on\nthe posterior means, which may not lie in the support of the image\ndistribution, thereby potentially diverge from the appearance of genuine\nimages. Such out-of-support samples may significantly degrade the performance\nof the operator $f(\\cdot)$, particularly when it is a neural network. In this\npaper, we introduces a novel approach for posterior approximation that\nguarantees to generate valid samples within the support of the image\ndistribution, and also enhances the compatibility with neural network-based\noperators $f(\\cdot)$. We first demonstrate that the solution of the Probability\nFlow Ordinary Differential Equation (PF-ODE) with an initial value $x_t$ yields\nan effective posterior sample $p_{\\theta}(X_0|X_t=x_t)$. Based on this\nobservation, we adopt the Consistency Model (CM), which is distilled from\nPF-ODE, for posterior sampling. Furthermore, we design a novel family of DIS\nusing only CM. Through extensive experiments, we show that our proposed method\nfor posterior sample approximation substantially enhance the effectiveness of\nDIS for neural network operators $f(\\cdot)$ (e.g., in semantic segmentation).\nAdditionally, our experiments demonstrate the effectiveness of the new CM-based\ninversion techniques. The source code is provided in the supplementary\nmaterial.\n","authors":["Tongda Xu","Ziran Zhu","Jian Li","Dailan He","Yuanyuan Wang","Ming Sun","Ling Li","Hongwei Qin","Yan Wang","Jingjing Liu","Ya-Qin Zhang"],"pdf_url":"https://arxiv.org/pdf/2403.12063v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.12940v2","updated":"2024-06-01T10:25:54Z","published":"2024-04-19T15:10:54Z","title":"Neural Flow Diffusion Models: Learnable Forward Process for Improved\n  Diffusion Modelling","summary":"  Conventional diffusion models typically relies on a fixed forward process,\nwhich implicitly defines complex marginal distributions over latent variables.\nThis can often complicate the reverse process' task in learning generative\ntrajectories, and results in costly inference for diffusion models. To address\nthese limitations, we introduce Neural Flow Diffusion Models (NFDM), a novel\nframework that enhances diffusion models by supporting a broader range of\nforward processes beyond the standard Gaussian. We also propose a novel\nparameterization technique for learning the forward process. Our framework\nprovides an end-to-end, simulation-free optimization objective, effectively\nminimizing a variational upper bound on the negative log-likelihood.\nExperimental results demonstrate NFDM's strong performance, evidenced by\nstate-of-the-art likelihood estimation. Furthermore, we investigate NFDM's\ncapacity for learning generative dynamics with specific characteristics, such\nas deterministic straight lines trajectories, and demonstrate how the framework\nmay be adopted for learning bridges between two distributions. The results\nunderscores NFDM's versatility and its potential for a wide range of\napplications.\n","authors":["Grigory Bartosh","Dmitry Vetrov","Christian A. Naesseth"],"pdf_url":"https://arxiv.org/pdf/2404.12940v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.10019v3","updated":"2024-06-01T09:59:01Z","published":"2023-09-18T17:50:56Z","title":"Long-Tail Learning with Foundation Model: Heavy Fine-Tuning Hurts","summary":"  The fine-tuning paradigm in addressing long-tail learning tasks has sparked\nsignificant interest since the emergence of foundation models. Nonetheless, how\nfine-tuning impacts performance in long-tail learning was not explicitly\nquantified. In this paper, we disclose that heavy fine-tuning may even lead to\nnon-negligible performance deterioration on tail classes, and lightweight\nfine-tuning is more effective. The reason is attributed to inconsistent class\nconditions caused by heavy fine-tuning. With the observation above, we develop\na low-complexity and accurate long-tail learning algorithms LIFT with the goal\nof facilitating fast prediction and compact models by adaptive lightweight\nfine-tuning. Experiments clearly verify that both the training time and the\nlearned parameters are significantly reduced with more accurate predictive\nperformance compared with state-of-the-art approaches. The implementation code\nis available at https://github.com/shijxcs/LIFT.\n","authors":["Jiang-Xin Shi","Tong Wei","Zhi Zhou","Jie-Jing Shao","Xin-Yan Han","Yu-Feng Li"],"pdf_url":"https://arxiv.org/pdf/2309.10019v3.pdf","comment":"Accepted by ICML 2024"},{"id":"http://arxiv.org/abs/2402.13505v3","updated":"2024-06-01T09:53:00Z","published":"2024-02-21T03:39:04Z","title":"SimPro: A Simple Probabilistic Framework Towards Realistic Long-Tailed\n  Semi-Supervised Learning","summary":"  Recent advancements in semi-supervised learning have focused on a more\nrealistic yet challenging task: addressing imbalances in labeled data while the\nclass distribution of unlabeled data remains both unknown and potentially\nmismatched. Current approaches in this sphere often presuppose rigid\nassumptions regarding the class distribution of unlabeled data, thereby\nlimiting the adaptability of models to only certain distribution ranges. In\nthis study, we propose a novel approach, introducing a highly adaptable\nframework, designated as SimPro, which does not rely on any predefined\nassumptions about the distribution of unlabeled data. Our framework, grounded\nin a probabilistic model, innovatively refines the expectation-maximization\n(EM) algorithm by explicitly decoupling the modeling of conditional and\nmarginal class distributions. This separation facilitates a closed-form\nsolution for class distribution estimation during the maximization phase,\nleading to the formulation of a Bayes classifier. The Bayes classifier, in\nturn, enhances the quality of pseudo-labels in the expectation phase.\nRemarkably, the SimPro framework not only comes with theoretical guarantees but\nalso is straightforward to implement. Moreover, we introduce two novel class\ndistributions broadening the scope of the evaluation. Our method showcases\nconsistent state-of-the-art performance across diverse benchmarks and data\ndistribution scenarios. Our code is available at\nhttps://github.com/LeapLabTHU/SimPro.\n","authors":["Chaoqun Du","Yizeng Han","Gao Huang"],"pdf_url":"https://arxiv.org/pdf/2402.13505v3.pdf","comment":"ICML2024 camera-ready version"},{"id":"http://arxiv.org/abs/2404.06207v3","updated":"2024-06-01T09:31:08Z","published":"2024-04-09T10:56:46Z","title":"Leveraging edge detection and neural networks for better UAV\n  localization","summary":"  We propose a novel method for geolocalizing Unmanned Aerial Vehicles (UAVs)\nin environments lacking Global Navigation Satellite Systems (GNSS). Current\nstate-of-the-art techniques employ an offline-trained encoder to generate a\nvector representation (embedding) of the UAV's current view, which is then\ncompared with pre-computed embeddings of geo-referenced images to determine the\nUAV's position. Here, we demonstrate that the performance of these methods can\nbe significantly enhanced by preprocessing the images to extract their edges,\nwhich exhibit robustness to seasonal and illumination variations. Furthermore,\nwe establish that utilizing edges enhances resilience to orientation and\naltitude inaccuracies. Additionally, we introduce a confidence criterion for\nlocalization. Our findings are substantiated through synthetic experiments.\n","authors":["Theo Di Piazza","Enric Meinhardt-Llopis","Gabriele Facciolo","Benedicte Bascle","Corentin Abgrall","Jean-Clement Devaux"],"pdf_url":"https://arxiv.org/pdf/2404.06207v3.pdf","comment":"Accepted for publication in IGARSS2024. 4 pages, 3 figures, 3 tables"},{"id":"http://arxiv.org/abs/2311.03967v2","updated":"2024-06-01T09:14:56Z","published":"2023-11-07T13:06:50Z","title":"CeCNN: Copula-enhanced convolutional neural networks in joint prediction\n  of refraction error and axial length based on ultra-widefield fundus images","summary":"  Ultra-widefield (UWF) fundus images are replacing traditional fundus images\nin screening, detection, prediction, and treatment of complications related to\nmyopia because their much broader visual range is advantageous for highly\nmyopic eyes. Spherical equivalent (SE) is extensively used as the main myopia\noutcome measure, and axial length (AL) has drawn increasing interest as an\nimportant ocular component for assessing myopia. Cutting-edge studies show that\nSE and AL are strongly correlated. Using the joint information from SE and AL\nis potentially better than using either separately. In the deep learning\ncommunity, though there is research on multiple-response tasks with a 3D image\nbiomarker, dependence among responses is only sporadically taken into\nconsideration. Inspired by the spirit that information extracted from the data\nby statistical methods can improve the prediction accuracy of deep learning\nmodels, we formulate a class of multivariate response regression models with a\nhigher-order tensor biomarker, for the bivariate tasks of\nregression-classification and regression-regression. Specifically, we propose a\ncopula-enhanced convolutional neural network (CeCNN) framework that\nincorporates the dependence between responses through a Gaussian copula (with\nparameters estimated from a warm-up CNN) and uses the induced copula-likelihood\nloss with the backbone CNNs. We establish the statistical framework and\nalgorithms for the aforementioned two bivariate tasks. We show that the CeCNN\nhas better prediction accuracy after adding the dependency information to the\nbackbone models. The modeling and the proposed CeCNN algorithm are applicable\nbeyond the UWF scenario and can be effective with other backbones beyond ResNet\nand LeNet.\n","authors":["Chong Zhong","Yang Li","Danjuan Yang","Meiyan Li","Xingyao Zhou","Bo Fu","Catherine C. Liu","A. H. Welsh"],"pdf_url":"https://arxiv.org/pdf/2311.03967v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.03601v3","updated":"2024-06-01T08:50:14Z","published":"2023-07-07T13:43:44Z","title":"GPT4RoI: Instruction Tuning Large Language Model on Region-of-Interest","summary":"  Visual instruction tuning large language model(LLM) on image-text pairs has\nachieved general-purpose vision-language abilities. However, the lack of\nregion-text pairs limits their advancements to fine-grained multimodal\nunderstanding. In this paper, we propose spatial instruction tuning, which\nintroduces the reference to the region-of-interest(RoI) in the instruction.\nBefore sending to LLM, the reference is replaced by RoI features and\ninterleaved with language embeddings as a sequence. Our model GPT4RoI, trained\non 7 region-text pair datasets, brings an unprecedented interactive and\nconversational experience compared to previous image-level models. (1)\nInteraction beyond language: Users can interact with our model by both language\nand drawing bounding boxes to flexibly adjust the referring granularity. (2)\nVersatile multimodal abilities: A variety of attribute information within each\nRoI can be mined by GPT4RoI, e.g., color, shape, material, action, etc.\nFurthermore, it can reason about multiple RoIs based on common sense. On the\nVisual Commonsense Reasoning(VCR) dataset, GPT4RoI achieves a remarkable\naccuracy of 81.6%, surpassing all existing models by a significant margin (the\nsecond place is 75.6%) and almost reaching human-level performance of 85.0%.\nThe code, dataset, and demo can be found at\nhttps://github.com/jshilong/GPT4RoI.\n","authors":["Shilong Zhang","Peize Sun","Shoufa Chen","Min Xiao","Wenqi Shao","Wenwei Zhang","Yu Liu","Kai Chen","Ping Luo"],"pdf_url":"https://arxiv.org/pdf/2307.03601v3.pdf","comment":"Code has been released at https://github.com/jshilong/GPT4RoI"},{"id":"http://arxiv.org/abs/2403.14781v2","updated":"2024-06-01T08:27:23Z","published":"2024-03-21T18:52:58Z","title":"Champ: Controllable and Consistent Human Image Animation with 3D\n  Parametric Guidance","summary":"  In this study, we introduce a methodology for human image animation by\nleveraging a 3D human parametric model within a latent diffusion framework to\nenhance shape alignment and motion guidance in curernt human generative\ntechniques. The methodology utilizes the SMPL(Skinned Multi-Person Linear)\nmodel as the 3D human parametric model to establish a unified representation of\nbody shape and pose. This facilitates the accurate capture of intricate human\ngeometry and motion characteristics from source videos. Specifically, we\nincorporate rendered depth images, normal maps, and semantic maps obtained from\nSMPL sequences, alongside skeleton-based motion guidance, to enrich the\nconditions to the latent diffusion model with comprehensive 3D shape and\ndetailed pose attributes. A multi-layer motion fusion module, integrating\nself-attention mechanisms, is employed to fuse the shape and motion latent\nrepresentations in the spatial domain. By representing the 3D human parametric\nmodel as the motion guidance, we can perform parametric shape alignment of the\nhuman body between the reference image and the source video motion.\nExperimental evaluations conducted on benchmark datasets demonstrate the\nmethodology's superior ability to generate high-quality human animations that\naccurately capture both pose and shape variations. Furthermore, our approach\nalso exhibits superior generalization capabilities on the proposed in-the-wild\ndataset. Project page: https://fudan-generative-vision.github.io/champ.\n","authors":["Shenhao Zhu","Junming Leo Chen","Zuozhuo Dai","Qingkun Su","Yinghui Xu","Xun Cao","Yao Yao","Hao Zhu","Siyu Zhu"],"pdf_url":"https://arxiv.org/pdf/2403.14781v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2104.00947v3","updated":"2024-06-01T07:55:27Z","published":"2021-04-02T08:55:04Z","title":"A Detector-oblivious Multi-arm Network for Keypoint Matching","summary":"  This paper presents a matching network to establish point correspondence\nbetween images. We propose a Multi-Arm Network (MAN) to learn region overlap\nand depth, which can greatly improve the keypoint matching robustness while\nbringing little computational cost during the inference stage. Another design\nthat makes this framework different from many existing learning based pipelines\nthat require re-training when a different keypoint detector is adopted, our\nnetwork can directly work with different keypoint detectors without such a\ntime-consuming re-training process. Comprehensive experiments conducted on\noutdoor and indoor datasets demonstrated that our proposed MAN outperforms\nstate-of-the-art methods.\n","authors":["Xuelun Shen","Qian Hu","Xin Li","Cheng Wang"],"pdf_url":"https://arxiv.org/pdf/2104.00947v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.09486v4","updated":"2024-06-01T07:28:05Z","published":"2024-03-14T15:29:09Z","title":"SpikeReveal: Unlocking Temporal Sequences from Real Blurry Inputs with\n  Spike Streams","summary":"  Reconstructing a sequence of sharp images from the blurry input is crucial\nfor enhancing our insights into the captured scene and poses a significant\nchallenge due to the limited temporal features embedded in the image. Spike\ncameras, sampling at rates up to 40,000 Hz, have proven effective in capturing\nmotion features and beneficial for solving this ill-posed problem. Nonetheless,\nexisting methods fall into the supervised learning paradigm, which suffers from\nnotable performance degradation when applied to real-world scenarios that\ndiverge from the synthetic training data domain. Moreover, the quality of\nreconstructed images is capped by the generated images based on motion analysis\ninterpolation, which inherently differs from the actual scene, affecting the\ngeneralization ability of these methods in real high-speed scenarios. To\naddress these challenges, we propose the first self-supervised framework for\nthe task of spike-guided motion deblurring. Our approach begins with the\nformulation of a spike-guided deblurring model that explores the theoretical\nrelationships among spike streams, blurry images, and their corresponding sharp\nsequences. We subsequently develop a self-supervised cascaded framework to\nalleviate the issues of spike noise and spatial-resolution mismatching\nencountered in the deblurring model. With knowledge distillation and\nre-blurring loss, we further design a lightweight deblur network to generate\nhigh-quality sequences with brightness and texture consistency with the\noriginal input. Quantitative and qualitative experiments conducted on our\nreal-world and synthetic datasets with spikes validate the superior\ngeneralization of the proposed framework. Our code, data and trained models\nwill be available at \\url{https://github.com/chenkang455/S-SDM}.\n","authors":["Kang Chen","Shiyan Chen","Jiyuan Zhang","Baoyue Zhang","Yajing Zheng","Tiejun Huang","Zhaofei Yu"],"pdf_url":"https://arxiv.org/pdf/2403.09486v4.pdf","comment":"14 pages"},{"id":"http://arxiv.org/abs/2312.17161v2","updated":"2024-06-01T07:03:52Z","published":"2023-12-28T17:50:54Z","title":"Restoration by Generation with Constrained Priors","summary":"  The inherent generative power of denoising diffusion models makes them\nwell-suited for image restoration tasks where the objective is to find the\noptimal high-quality image within the generative space that closely resembles\nthe input image. We propose a method to adapt a pretrained diffusion model for\nimage restoration by simply adding noise to the input image to be restored and\nthen denoise. Our method is based on the observation that the space of a\ngenerative model needs to be constrained. We impose this constraint by\nfinetuning the generative model with a set of anchor images that capture the\ncharacteristics of the input image. With the constrained space, we can then\nleverage the sampling strategy used for generation to do image restoration. We\nevaluate against previous methods and show superior performances on multiple\nreal-world restoration datasets in preserving identity and image quality. We\nalso demonstrate an important and practical application on personalized\nrestoration, where we use a personal album as the anchor images to constrain\nthe generative space. This approach allows us to produce results that\naccurately preserve high-frequency details, which previous works are unable to\ndo. Project webpage: https://gen2res.github.io.\n","authors":["Zheng Ding","Xuaner Zhang","Zhuowen Tu","Zhihao Xia"],"pdf_url":"https://arxiv.org/pdf/2312.17161v2.pdf","comment":"CVPR 2024 (Highlight)"},{"id":"http://arxiv.org/abs/2404.13611v2","updated":"2024-06-01T06:56:16Z","published":"2024-04-21T10:41:04Z","title":"Video sentence grounding with temporally global textual knowledge","summary":"  Temporal sentence grounding involves the retrieval of a video moment with a\nnatural language query. Many existing works directly incorporate the given\nvideo and temporally localized query for temporal grounding, overlooking the\ninherent domain gap between different modalities. In this paper, we utilize\npseudo-query features containing extensive temporally global textual knowledge\nsourced from the same video-query pair, to enhance the bridging of domain gaps\nand attain a heightened level of similarity between multi-modal features.\nSpecifically, we propose a Pseudo-query Intermediary Network (PIN) to achieve\nan improved alignment of visual and comprehensive pseudo-query features within\nthe feature space through contrastive learning. Subsequently, we utilize\nlearnable prompts to encapsulate the knowledge of pseudo-queries, propagating\nthem into the textual encoder and multi-modal fusion module, further enhancing\nthe feature alignment between visual and language for better temporal\ngrounding. Extensive experiments conducted on the Charades-STA and\nActivityNet-Captions datasets demonstrate the effectiveness of our method.\n","authors":["Cai Chen","Runzhong Zhang","Jianjun Gao","Kejun Wu","Kim-Hui Yap","Yi Wang"],"pdf_url":"https://arxiv.org/pdf/2404.13611v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.16307v2","updated":"2024-06-01T06:20:54Z","published":"2024-04-25T03:22:48Z","title":"Boosting Model Resilience via Implicit Adversarial Data Augmentation","summary":"  Data augmentation plays a pivotal role in enhancing and diversifying training\ndata. Nonetheless, consistently improving model performance in varied learning\nscenarios, especially those with inherent data biases, remains challenging. To\naddress this, we propose to augment the deep features of samples by\nincorporating their adversarial and anti-adversarial perturbation\ndistributions, enabling adaptive adjustment in the learning difficulty tailored\nto each sample's specific characteristics. We then theoretically reveal that\nour augmentation process approximates the optimization of a surrogate loss\nfunction as the number of augmented copies increases indefinitely. This insight\nleads us to develop a meta-learning-based framework for optimizing classifiers\nwith this novel loss, introducing the effects of augmentation while bypassing\nthe explicit augmentation process. We conduct extensive experiments across four\ncommon biased learning scenarios: long-tail learning, generalized long-tail\nlearning, noisy label learning, and subpopulation shift learning. The empirical\nresults demonstrate that our method consistently achieves state-of-the-art\nperformance, highlighting its broad adaptability.\n","authors":["Xiaoling Zhou","Wei Ye","Zhemg Lee","Rui Xie","Shikun Zhang"],"pdf_url":"https://arxiv.org/pdf/2404.16307v2.pdf","comment":"9 pages, 6 figures, accepted by IJCAI 2024"},{"id":"http://arxiv.org/abs/2401.07529v3","updated":"2024-06-01T06:14:37Z","published":"2024-01-15T08:19:22Z","title":"MM-SAP: A Comprehensive Benchmark for Assessing Self-Awareness of\n  Multimodal Large Language Models in Perception","summary":"  Recent advancements in Multimodal Large Language Models (MLLMs) have\ndemonstrated exceptional capabilities in visual perception and understanding.\nHowever, these models also suffer from hallucinations, which limit their\nreliability as AI systems. We believe that these hallucinations are partially\ndue to the models' struggle with understanding what they can and cannot\nperceive from images, a capability we refer to as self-awareness in perception.\nDespite its importance, this aspect of MLLMs has been overlooked in prior\nstudies. In this paper, we aim to define and evaluate the self-awareness of\nMLLMs in perception. To do this, we first introduce the knowledge quadrant in\nperception, which helps define what MLLMs know and do not know about images.\nUsing this framework, we propose a novel benchmark, the Self-Awareness in\nPerception for MLLMs (MM-SAP), specifically designed to assess this capability.\nWe apply MM-SAP to a variety of popular MLLMs, offering a comprehensive\nanalysis of their self-awareness and providing detailed insights. The\nexperiment results reveal that current MLLMs possess limited self-awareness\ncapabilities, pointing to a crucial area for future advancement in the\ndevelopment of trustworthy MLLMs. Code and data are available at\nhttps://github.com/YHWmz/MM-SAP.\n","authors":["Yuhao Wang","Yusheng Liao","Heyang Liu","Hongcheng Liu","Yu Wang","Yanfeng Wang"],"pdf_url":"https://arxiv.org/pdf/2401.07529v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.05634v3","updated":"2024-06-01T05:52:18Z","published":"2023-12-09T18:43:05Z","title":"PGDS: Pose-Guidance Deep Supervision for Mitigating Clothes-Changing in\n  Person Re-Identification","summary":"  Person Re-Identification (Re-ID) task seeks to enhance the tracking of\nmultiple individuals by surveillance cameras. It supports multimodal tasks,\nincluding text-based person retrieval and human matching. One of the most\nsignificant challenges faced in Re-ID is clothes-changing, where the same\nperson may appear in different outfits. While previous methods have made\nnotable progress in maintaining clothing data consistency and handling clothing\nchange data, they still rely excessively on clothing information, which can\nlimit performance due to the dynamic nature of human appearances. To mitigate\nthis challenge, we propose the Pose-Guidance Deep Supervision (PGDS), an\neffective framework for learning pose guidance within the Re-ID task. It\nconsists of three modules: a human encoder, a pose encoder, and a Pose-to-Human\nProjection module (PHP). Our framework guides the human encoder, i.e., the main\nre-identification model, with pose information from the pose encoder through\nmultiple layers via the knowledge transfer mechanism from the PHP module,\nhelping the human encoder learn body parts information without increasing\ncomputation resources in the inference stage. Through extensive experiments,\nour method surpasses the performance of current state-of-the-art methods,\ndemonstrating its robustness and effectiveness for real-world applications. Our\ncode is available at https://github.com/huyquoctrinh/PGDS.\n","authors":["Quoc-Huy Trinh","Nhat-Tan Bui","Dinh-Hieu Hoang","Phuoc-Thao Vo Thi","Hai-Dang Nguyen","Debesh Jha","Ulas Bagci","Ngan Le","Minh-Triet Tran"],"pdf_url":"https://arxiv.org/pdf/2312.05634v3.pdf","comment":"Accepted at AVSS 2024"},{"id":"http://arxiv.org/abs/2403.06681v3","updated":"2024-06-01T05:19:24Z","published":"2024-03-11T12:56:36Z","title":"Out-of-distribution Partial Label Learning","summary":"  Partial Label Learning (PLL) tackles model learning from the data with\ninexact labels under the assumption that training and test objects are in the\nsame distribution, i.e., closed-set scenario. Nevertheless, this assumption\ndoes not hold in real-world open-set scenarios where test data may come from\nOut-Of-Distribution (OOD), resulting in object detection failure and hence\nsignificantly compromising the PLL model's security and trustworthiness. This\nis a previously unexplored problem called Out-Of-Distribution Partial Label\nLearning (OODPLL) that our newly proposed PLOOD framework can effectively\nresolve. During the training phase, our framework leverages self-supervised\nlearning strategy to generate positive and negative samples for each object,\nemulating in and out-of-distributions respectively. Under these distributions,\nPLL methods can learn discriminative features for OOD objects. In the inference\nphase, a novel Partial Energy (PE) scoring technique is proposed which\nleverages the label confidence established during the above training phase to\nmine the actual labels. In this way, the issue of inexact labeling in PLL can\nbe effectively addressed for significantly better performance in OOD object\ndetection. PLOOD is compared with SOTA PLL models and OOD scores on CIFAR-10\nand CIFAR-100 datasets against various OOD datasets. The results demonstrate\nthe effectiveness of our PLOOD framework, significantly outperforming SOTA PLL\nmodels and marking a substantial advancement in addressing PLL problems in\nreal-world OOD scenarios.\n","authors":["Jintao Huang","Yiu-Ming Cheung","Chi-Man Vong"],"pdf_url":"https://arxiv.org/pdf/2403.06681v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.13215v2","updated":"2024-06-01T04:49:43Z","published":"2023-10-20T01:44:49Z","title":"Zone Evaluation: Revealing Spatial Bias in Object Detection","summary":"  A fundamental limitation of object detectors is that they suffer from\n\"spatial bias\", and in particular perform less satisfactorily when detecting\nobjects near image borders. For a long time, there has been a lack of effective\nways to measure and identify spatial bias, and little is known about where it\ncomes from and what degree it is. To this end, we present a new zone evaluation\nprotocol, extending from the traditional evaluation to a more generalized one,\nwhich measures the detection performance over zones, yielding a series of Zone\nPrecisions (ZPs). For the first time, we provide numerical results, showing\nthat the object detectors perform quite unevenly across the zones.\nSurprisingly, the detector's performance in the 96% border zone of the image\ndoes not reach the AP value (Average Precision, commonly regarded as the\naverage detection performance in the entire image zone). To better understand\nspatial bias, a series of heuristic experiments are conducted. Our\ninvestigation excludes two intuitive conjectures about spatial bias that the\nobject scale and the absolute positions of objects barely influence the spatial\nbias. We find that the key lies in the human-imperceptible divergence in data\npatterns between objects in different zones, thus eventually forming a visible\nperformance gap between the zones. With these findings, we finally discuss a\nfuture direction for object detection, namely, spatial disequilibrium problem,\naiming at pursuing a balanced detection ability over the entire image zone. By\nbroadly evaluating 10 popular object detectors and 5 detection datasets, we\nshed light on the spatial bias of object detectors. We hope this work could\nraise a focus on detection robustness. The source codes, evaluation protocols,\nand tutorials are publicly available at https://github.com/Zzh-tju/ZoneEval.\n","authors":["Zhaohui Zheng","Yuming Chen","Qibin Hou","Xiang Li","Ping Wang","Ming-Ming Cheng"],"pdf_url":"https://arxiv.org/pdf/2310.13215v2.pdf","comment":"Accepted by IEEE TPAMI"},{"id":"http://arxiv.org/abs/2405.14455v2","updated":"2024-06-01T04:44:15Z","published":"2024-05-23T11:37:17Z","title":"TIGER: Text-Instructed 3D Gaussian Retrieval and Coherent Editing","summary":"  Editing objects within a scene is a critical functionality required across a\nbroad spectrum of applications in computer vision and graphics. As 3D Gaussian\nSplatting (3DGS) emerges as a frontier in scene representation, the effective\nmodification of 3D Gaussian scenes has become increasingly vital. This process\nentails accurately retrieve the target objects and subsequently performing\nmodifications based on instructions. Though available in pieces, existing\ntechniques mainly embed sparse semantics into Gaussians for retrieval, and rely\non an iterative dataset update paradigm for editing, leading to over-smoothing\nor inconsistency issues. To this end, this paper proposes a systematic\napproach, namely TIGER, for coherent text-instructed 3D Gaussian retrieval and\nediting. In contrast to the top-down language grounding approach for 3D\nGaussians, we adopt a bottom-up language aggregation strategy to generate a\ndenser language embedded 3D Gaussians that supports open-vocabulary retrieval.\nTo overcome the over-smoothing and inconsistency issues in editing, we propose\na Coherent Score Distillation (CSD) that aggregates a 2D image editing\ndiffusion model and a multi-view diffusion model for score distillation,\nproducing multi-view consistent editing with much finer details. In various\nexperiments, we demonstrate that our TIGER is able to accomplish more\nconsistent and realistic edits than prior work.\n","authors":["Teng Xu","Jiamin Chen","Peng Chen","Youjia Zhang","Junqing Yu","Wei Yang"],"pdf_url":"https://arxiv.org/pdf/2405.14455v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.10300v2","updated":"2024-06-01T03:35:22Z","published":"2024-05-16T17:54:15Z","title":"Grounding DINO 1.5: Advance the \"Edge\" of Open-Set Object Detection","summary":"  This paper introduces Grounding DINO 1.5, a suite of advanced open-set object\ndetection models developed by IDEA Research, which aims to advance the \"Edge\"\nof open-set object detection. The suite encompasses two models: Grounding DINO\n1.5 Pro, a high-performance model designed for stronger generalization\ncapability across a wide range of scenarios, and Grounding DINO 1.5 Edge, an\nefficient model optimized for faster speed demanded in many applications\nrequiring edge deployment. The Grounding DINO 1.5 Pro model advances its\npredecessor by scaling up the model architecture, integrating an enhanced\nvision backbone, and expanding the training dataset to over 20 million images\nwith grounding annotations, thereby achieving a richer semantic understanding.\nThe Grounding DINO 1.5 Edge model, while designed for efficiency with reduced\nfeature scales, maintains robust detection capabilities by being trained on the\nsame comprehensive dataset. Empirical results demonstrate the effectiveness of\nGrounding DINO 1.5, with the Grounding DINO 1.5 Pro model attaining a 54.3 AP\non the COCO detection benchmark and a 55.7 AP on the LVIS-minival zero-shot\ntransfer benchmark, setting new records for open-set object detection.\nFurthermore, the Grounding DINO 1.5 Edge model, when optimized with TensorRT,\nachieves a speed of 75.2 FPS while attaining a zero-shot performance of 36.2 AP\non the LVIS-minival benchmark, making it more suitable for edge computing\nscenarios. Model examples and demos with API will be released at\nhttps://github.com/IDEA-Research/Grounding-DINO-1.5-API\n","authors":["Tianhe Ren","Qing Jiang","Shilong Liu","Zhaoyang Zeng","Wenlong Liu","Han Gao","Hongjie Huang","Zhengyu Ma","Xiaoke Jiang","Yihao Chen","Yuda Xiong","Hao Zhang","Feng Li","Peijun Tang","Kent Yu","Lei Zhang"],"pdf_url":"https://arxiv.org/pdf/2405.10300v2.pdf","comment":"homepage: https://deepdataspace.com/home"},{"id":"http://arxiv.org/abs/2307.03212v3","updated":"2024-06-01T03:00:16Z","published":"2023-07-06T16:38:43Z","title":"Attentive Graph Enhanced Region Representation Learning","summary":"  Representing urban regions accurately and comprehensively is essential for\nvarious urban planning and analysis tasks. Recently, with the expansion of the\ncity, modeling long-range spatial dependencies with multiple data sources plays\nan important role in urban region representation. In this paper, we propose the\nAttentive Graph Enhanced Region Representation Learning (ATGRL) model, which\naims to capture comprehensive dependencies from multiple graphs and learn rich\nsemantic representations of urban regions. Specifically, we propose a\ngraph-enhanced learning module to construct regional graphs by incorporating\nmobility flow patterns, point of interests (POIs) functions, and check-in\nsemantics with noise filtering. Then, we present a multi-graph aggregation\nmodule to capture both local and global spatial dependencies between regions by\nintegrating information from multiple graphs. In addition, we design a\ndual-stage fusion module to facilitate information sharing between different\nviews and efficiently fuse multi-view representations for urban region\nembedding using an improved linear attention mechanism. Finally, extensive\nexperiments on real-world datasets for three downstream tasks demonstrate the\nsuperior performance of our model compared to state-of-the-art methods.\n","authors":["Weiliang Chen","Qianqian Ren","Jinbao Li"],"pdf_url":"https://arxiv.org/pdf/2307.03212v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.12476v2","updated":"2024-06-01T02:21:22Z","published":"2024-05-21T03:36:13Z","title":"Benchmarking Fish Dataset and Evaluation Metric in Keypoint Detection --\n  Towards Precise Fish Morphological Assessment in Aquaculture Breeding","summary":"  Accurate phenotypic analysis in aquaculture breeding necessitates the\nquantification of subtle morphological phenotypes. Existing datasets suffer\nfrom limitations such as small scale, limited species coverage, and inadequate\nannotation of keypoints for measuring refined and complex morphological\nphenotypes of fish body parts. To address this gap, we introduce FishPhenoKey,\na comprehensive dataset comprising 23,331 high-resolution images spanning six\nfish species. Notably, FishPhenoKey includes 22 phenotype-oriented annotations,\nenabling the capture of intricate morphological phenotypes. Motivated by the\nnuanced evaluation of these subtle morphologies, we also propose a new\nevaluation metric, Percentage of Measured Phenotype (PMP). It is designed to\nassess the accuracy of individual keypoint positions and is highly sensitive to\nthe phenotypes measured using the corresponding keypoints. To enhance keypoint\ndetection accuracy, we further propose a novel loss, Anatomically-Calibrated\nRegularization (ACR), that can be integrated into keypoint detection models,\nleveraging biological insights to refine keypoint localization. Our\ncontributions set a new benchmark in fish phenotype analysis, addressing the\nchallenges of precise morphological quantification and opening new avenues for\nresearch in sustainable aquaculture and genetic studies. Our dataset and code\nare available at https://github.com/WeizhenLiuBioinform/Fish-Phenotype-Detect.\n","authors":["Weizhen Liu","Jiayu Tan","Guangyu Lan","Ao Li","Dongye Li","Le Zhao","Xiaohui Yuan","Nanqing Dong"],"pdf_url":"https://arxiv.org/pdf/2405.12476v2.pdf","comment":"Accepted by IJCAI2024, Code:\n  https://github.com/WeizhenLiuBioinform/Fish-Phenotype-Detect"},{"id":"http://arxiv.org/abs/2402.10896v2","updated":"2024-06-01T01:06:16Z","published":"2024-02-16T18:54:47Z","title":"PaLM2-VAdapter: Progressively Aligned Language Model Makes a Strong\n  Vision-language Adapter","summary":"  This paper demonstrates that a progressively aligned language model can\neffectively bridge frozen vision encoders and large language models (LLMs).\nWhile the fundamental architecture and pre-training methods of vision encoders\nand LLMs have been extensively studied, the architecture and training strategy\nof vision-language adapters vary significantly across recent works. Our\nresearch undertakes a thorough exploration of the state-of-the-art perceiver\nresampler architecture and builds a strong baseline. However, we observe that\nthe vision-language alignment with perceiver resampler exhibits slow\nconvergence and limited scalability with a lack of direct supervision. To\naddress this issue, we propose PaLM2-VAdapter, employing a progressively\naligned language model as the vision-language adapter. Compared to the strong\nbaseline with perceiver resampler, our method empirically shows faster\nconvergence, higher performance, and stronger scalability. Extensive\nexperiments across various Visual Question Answering (VQA) and captioning tasks\non both images and videos demonstrate that our model exhibits state-of-the-art\nvisual understanding and multi-modal reasoning capabilities. Notably, our\nmethod achieves these advancements with 30~70% fewer parameters than the\nstate-of-the-art large vision-language models, marking a significant efficiency\nimprovement.\n","authors":["Junfei Xiao","Zheng Xu","Alan Yuille","Shen Yan","Boyu Wang"],"pdf_url":"https://arxiv.org/pdf/2402.10896v2.pdf","comment":"Technical report, 15 pages; v2 fix typos, add additional results in\n  appendix"},{"id":"http://arxiv.org/abs/2405.12971v2","updated":"2024-06-01T00:28:58Z","published":"2024-05-21T17:54:06Z","title":"BiomedParse: a biomedical foundation model for image parsing of\n  everything everywhere all at once","summary":"  Biomedical image analysis is fundamental for biomedical discovery in cell\nbiology, pathology, radiology, and many other biomedical domains. Holistic\nimage analysis comprises interdependent subtasks such as segmentation,\ndetection, and recognition of relevant objects. Here, we propose BiomedParse, a\nbiomedical foundation model for imaging parsing that can jointly conduct\nsegmentation, detection, and recognition for 82 object types across 9 imaging\nmodalities. Through joint learning, we can improve accuracy for individual\ntasks and enable novel applications such as segmenting all relevant objects in\nan image through a text prompt, rather than requiring users to laboriously\nspecify the bounding box for each object. We leveraged readily available\nnatural-language labels or descriptions accompanying those datasets and use\nGPT-4 to harmonize the noisy, unstructured text information with established\nbiomedical object ontologies. We created a large dataset comprising over six\nmillion triples of image, segmentation mask, and textual description. On image\nsegmentation, we showed that BiomedParse is broadly applicable, outperforming\nstate-of-the-art methods on 102,855 test image-mask-label triples across 9\nimaging modalities (everything). On object detection, which aims to locate a\nspecific object of interest, BiomedParse again attained state-of-the-art\nperformance, especially on objects with irregular shapes (everywhere). On\nobject recognition, which aims to identify all objects in a given image along\nwith their semantic types, we showed that BiomedParse can simultaneously\nsegment and label all biomedical objects in an image (all at once). In summary,\nBiomedParse is an all-in-one tool for biomedical image analysis by jointly\nsolving segmentation, detection, and recognition for all major biomedical image\nmodalities, paving the path for efficient and accurate image-based biomedical\ndiscovery.\n","authors":["Theodore Zhao","Yu Gu","Jianwei Yang","Naoto Usuyama","Ho Hin Lee","Tristan Naumann","Jianfeng Gao","Angela Crabtree","Brian Piening","Carlo Bifulco","Mu Wei","Hoifung Poon","Sheng Wang"],"pdf_url":"https://arxiv.org/pdf/2405.12971v2.pdf","comment":"Project page: https://aka.ms/biomedparse-project"}],"Information Retrieval":[{"id":"http://arxiv.org/abs/2309.13333v2","updated":"2024-06-01T22:27:42Z","published":"2023-09-23T10:35:01Z","title":"mdendro: An R package for extended agglomerative hierarchical clustering","summary":"  \"mdendro\" is an R package that provides a comprehensive collection of linkage\nmethods for agglomerative hierarchical clustering on a matrix of proximity data\n(distances or similarities), returning a multifurcated dendrogram or\nmultidendrogram. Multidendrograms can group more than two clusters at the same\ntime, solving the nonuniqueness problem that arises when there are ties in the\ndata. This problem causes that different binary dendrograms are possible\ndepending both on the order of the input data and on the criterion used to\nbreak ties. Weighted and unweighted versions of the most common linkage methods\nare included in the package, which also implements two parametric linkage\nmethods. In addition, package \"mdendro\" provides five descriptive measures to\nanalyze the resulting dendrograms: cophenetic correlation coefficient, space\ndistortion ratio, agglomeration coefficient, chaining coefficient and tree\nbalance.\n","authors":["Alberto Fernández","Sergio Gómez"],"pdf_url":"https://arxiv.org/pdf/2309.13333v2.pdf","comment":"27 pages, 13 figures. Software available at CRAN\n  (https://cran.r-project.org/package=mdendro) and Github\n  (https://sergio-gomez.github.io/mdendro/)"},{"id":"http://arxiv.org/abs/2310.13848v2","updated":"2024-06-01T15:02:41Z","published":"2023-10-20T22:47:18Z","title":"FABULA: Intelligence Report Generation Using Retrieval-Augmented\n  Narrative Construction","summary":"  Narrative construction is the process of representing disparate event\ninformation into a logical plot structure that models an end to end story.\nIntelligence analysis is an example of a domain that can benefit tremendously\nfrom narrative construction techniques, particularly in aiding analysts during\nthe largely manual and costly process of synthesizing event information into\ncomprehensive intelligence reports. Manual intelligence report generation is\noften prone to challenges such as integrating dynamic event information,\nwriting fine-grained queries, and closing information gaps. This motivates the\ndevelopment of a system that retrieves and represents critical aspects of\nevents in a form that aids in automatic generation of intelligence reports.\n  We introduce a Retrieval Augmented Generation (RAG) approach to augment\nprompting of an autoregressive decoder by retrieving structured information\nasserted in a knowledge graph to generate targeted information based on a\nnarrative plot model. We apply our approach to the problem of neural\nintelligence report generation and introduce FABULA, framework to augment\nintelligence analysis workflows using RAG. An analyst can use FABULA to query\nan Event Plot Graph (EPG) to retrieve relevant event plot points, which can be\nused to augment prompting of a Large Language Model (LLM) during intelligence\nreport generation. Our evaluation studies show that the plot points included in\nthe generated intelligence reports have high semantic relevance, high\ncoherency, and low data redundancy.\n","authors":["Priyanka Ranade","Anupam Joshi"],"pdf_url":"https://arxiv.org/pdf/2310.13848v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.12774v2","updated":"2024-06-01T09:21:02Z","published":"2024-02-20T07:25:34Z","title":"Interpreting Conversational Dense Retrieval by Rewriting-Enhanced\n  Inversion of Session Embedding","summary":"  Conversational dense retrieval has shown to be effective in conversational\nsearch. However, a major limitation of conversational dense retrieval is their\nlack of interpretability, hindering intuitive understanding of model behaviors\nfor targeted improvements. This paper presents CONVINV, a simple yet effective\napproach to shed light on interpretable conversational dense retrieval models.\nCONVINV transforms opaque conversational session embeddings into explicitly\ninterpretable text while faithfully maintaining their original retrieval\nperformance as much as possible. Such transformation is achieved by training a\nrecently proposed Vec2Text model based on the ad-hoc query encoder, leveraging\nthe fact that the session and query embeddings share the same space in existing\nconversational dense retrieval. To further enhance interpretability, we propose\nto incorporate external interpretable query rewrites into the transformation\nprocess. Extensive evaluations on three conversational search benchmarks\ndemonstrate that CONVINV can yield more interpretable text and faithfully\npreserve original retrieval performance than baselines. Our work connects\nopaque session embeddings with transparent query rewriting, paving the way\ntoward trustworthy conversational search.\n","authors":["Yiruo Cheng","Kelong Mao","Zhicheng Dou"],"pdf_url":"https://arxiv.org/pdf/2402.12774v2.pdf","comment":"Accepted by ACL 2024. Repo: https://github.com/Ariya12138/ConvInv"},{"id":"http://arxiv.org/abs/2302.02592v3","updated":"2024-06-01T09:10:16Z","published":"2023-02-06T07:00:20Z","title":"RLTP: Reinforcement Learning to Pace for Delayed Impression Modeling in\n  Preloaded Ads","summary":"  To increase brand awareness, many advertisers conclude contracts with\nadvertising platforms to purchase traffic and then deliver advertisements to\ntarget audiences. In a whole delivery period, advertisers usually desire a\ncertain impression count for the ads, and they also expect that the delivery\nperformance is as good as possible (e.g., obtaining high click-through rate).\nAdvertising platforms employ pacing algorithms to satisfy the demands via\nadjusting the selection probabilities to traffic requests in real-time.\nHowever, the delivery procedure is also affected by the strategies from\npublishers, which cannot be controlled by advertising platforms. Preloading is\na widely used strategy for many types of ads (e.g., video ads) to make sure\nthat the response time for displaying after a traffic request is legitimate,\nwhich results in delayed impression phenomenon. Traditional pacing algorithms\ncannot handle the preloading nature well because they rely on immediate\nfeedback signals, and may fail to guarantee the demands from advertisers.\n  In this paper, we focus on a new research problem of impression pacing for\npreloaded ads, and propose a Reinforcement Learning To Pace framework RLTP. It\nlearns a pacing agent that sequentially produces selection probabilities in the\nwhole delivery period. To jointly optimize the two objectives of impression\ncount and delivery performance, RLTP employs tailored reward estimator to\nsatisfy the guaranteed impression count, penalize the over-delivery and\nmaximize the traffic value. Experiments on large-scale industrial datasets\nverify that RLTP outperforms baseline pacing algorithms by a large margin. We\nhave deployed the RLTP framework online to our advertising platform, and\nresults show that it achieves significant uplift to core metrics including\ndelivery completion rate and click-through rate.\n","authors":["Penghui Wei","Yongqiang Chen","Shaoguo Liu","Liang Wang","Bo Zheng"],"pdf_url":"https://arxiv.org/pdf/2302.02592v3.pdf","comment":"KDD 2023 (Applied Data Science Track). The first two authors\n  contributed equally"},{"id":"http://arxiv.org/abs/2311.16720v3","updated":"2024-06-01T08:15:58Z","published":"2023-11-28T12:04:19Z","title":"A Two-Stage Adaptation of Large Language Models for Text Ranking","summary":"  Text ranking is a critical task in information retrieval. Recent advances in\npre-trained language models (PLMs), especially large language models (LLMs),\npresent new opportunities for applying them to text ranking. While supervised\nfine-tuning (SFT) with ranking data has been widely explored to better align\nPLMs with text ranking goals, previous studies have focused primarily on\nencoder-only and encoder-decoder PLMs. Research on leveraging decoder-only LLMs\nfor text ranking remains scarce. An exception to this is RankLLaMA, which uses\ndirect SFT to explore LLaMA's potential for text ranking. In this work, we\npropose a two-stage progressive paradigm to better adapt LLMs to text ranking.\nFirst, we conduct continual pre-training (CPT) of LLMs on a large\nweakly-supervised corpus. Second, we perform SFT, and propose an improved\noptimization strategy building upon RankLLaMA. Our experimental results on\nmultiple benchmarks show that our approach outperforms previous methods in both\nin-domain and out-domain scenarios.\n","authors":["Longhui Zhang","Yanzhao Zhang","Dingkun Long","Pengjun Xie","Meishan Zhang","Min Zhang"],"pdf_url":"https://arxiv.org/pdf/2311.16720v3.pdf","comment":"Accepted to Findings of ACL 2024. Code and models available at\n  https://github.com/Alibaba-NLP/RankingGPT"},{"id":"http://arxiv.org/abs/2404.11343v2","updated":"2024-06-01T07:08:49Z","published":"2024-04-17T13:03:07Z","title":"Large Language Models meet Collaborative Filtering: An Efficient\n  All-round LLM-based Recommender System","summary":"  Collaborative filtering recommender systems (CF-RecSys) have shown successive\nresults in enhancing the user experience on social media and e-commerce\nplatforms. However, as CF-RecSys struggles under cold scenarios with sparse\nuser-item interactions, recent strategies have focused on leveraging modality\ninformation of user/items (e.g., text or images) based on pre-trained modality\nencoders and Large Language Models (LLMs). Despite their effectiveness under\ncold scenarios, we observe that they underperform simple traditional\ncollaborative filtering models under warm scenarios due to the lack of\ncollaborative knowledge. In this work, we propose an efficient All-round\nLLM-based Recommender system, called A-LLMRec, that excels not only in the cold\nscenario but also in the warm scenario. Our main idea is to enable an LLM to\ndirectly leverage the collaborative knowledge contained in a pre-trained\nstate-of-the-art CF-RecSys so that the emergent ability of the LLM as well as\nthe high-quality user/item embeddings that are already trained by the\nstate-of-the-art CF-RecSys can be jointly exploited. This approach yields two\nadvantages: (1) model-agnostic, allowing for integration with various existing\nCF-RecSys, and (2) efficiency, eliminating the extensive fine-tuning typically\nrequired for LLM-based recommenders. Our extensive experiments on various\nreal-world datasets demonstrate the superiority of A-LLMRec in various\nscenarios, including cold/warm, few-shot, cold user, and cross-domain\nscenarios. Beyond the recommendation task, we also show the potential of\nA-LLMRec in generating natural language outputs based on the understanding of\nthe collaborative knowledge by performing a favorite genre prediction task. Our\ncode is available at https://github.com/ghdtjr/A-LLMRec .\n","authors":["Sein Kim","Hongseok Kang","Seungyoon Choi","Donghyun Kim","Minchul Yang","Chanyoung Park"],"pdf_url":"https://arxiv.org/pdf/2404.11343v2.pdf","comment":"KDD 2024"},{"id":"http://arxiv.org/abs/2403.02630v3","updated":"2024-06-01T03:57:41Z","published":"2024-03-05T03:40:39Z","title":"FedHCDR: Federated Cross-Domain Recommendation with Hypergraph Signal\n  Decoupling","summary":"  In recent years, Cross-Domain Recommendation (CDR) has drawn significant\nattention, which utilizes user data from multiple domains to enhance the\nrecommendation performance. However, current CDR methods require sharing user\ndata across domains, thereby violating the General Data Protection Regulation\n(GDPR). Consequently, numerous approaches have been proposed for Federated\nCross-Domain Recommendation (FedCDR). Nevertheless, the data heterogeneity\nacross different domains inevitably influences the overall performance of\nfederated learning. In this study, we propose FedHCDR, a novel Federated\nCross-Domain Recommendation framework with Hypergraph signal decoupling.\nSpecifically, to address the data heterogeneity across domains, we introduce an\napproach called hypergraph signal decoupling (HSD) to decouple the user\nfeatures into domain-exclusive and domain-shared features. The approach employs\nhigh-pass and low-pass hypergraph filters to decouple domain-exclusive and\ndomain-shared user representations, which are trained by the local-global\nbi-directional transfer algorithm. In addition, a hypergraph contrastive\nlearning (HCL) module is devised to enhance the learning of domain-shared user\nrelationship information by perturbing the user hypergraph. Extensive\nexperiments conducted on three real-world scenarios demonstrate that FedHCDR\noutperforms existing baselines significantly.\n","authors":["Hongyu Zhang","Dongyi Zheng","Lin Zhong","Xu Yang","Jiyuan Feng","Yunqing Feng","Qing Liao"],"pdf_url":"https://arxiv.org/pdf/2403.02630v3.pdf","comment":"16 pages, 5 figures"},{"id":"http://arxiv.org/abs/2406.01633v1","updated":"2024-06-01T15:54:45Z","published":"2024-06-01T15:54:45Z","title":"On Overcoming Miscalibrated Conversational Priors in LLM-based Chatbots","summary":"  We explore the use of Large Language Model (LLM-based) chatbots to power\nrecommender systems. We observe that the chatbots respond poorly when they\nencounter under-specified requests (e.g., they make incorrect assumptions,\nhedge with a long response, or refuse to answer). We conjecture that such\nmiscalibrated response tendencies (i.e., conversational priors) can be\nattributed to LLM fine-tuning using annotators -- single-turn annotations may\nnot capture multi-turn conversation utility, and the annotators' preferences\nmay not even be representative of users interacting with a recommender system.\n  We first analyze public LLM chat logs to conclude that query\nunder-specification is common. Next, we study synthetic recommendation problems\nwith configurable latent item utilities and frame them as Partially Observed\nDecision Processes (PODP). We find that pre-trained LLMs can be sub-optimal for\nPODPs and derive better policies that clarify under-specified queries when\nappropriate. Then, we re-calibrate LLMs by prompting them with learned control\nmessages to approximate the improved policy. Finally, we show empirically that\nour lightweight learning approach effectively uses logged conversation data to\nre-calibrate the response strategies of LLM-based chatbots for recommendation\ntasks.\n","authors":["Christine Herlihy","Jennifer Neville","Tobias Schnabel","Adith Swaminathan"],"pdf_url":"https://arxiv.org/pdf/2406.01633v1.pdf","comment":"Preprint of UAI'24 conference publication"},{"id":"http://arxiv.org/abs/2406.01631v1","updated":"2024-06-01T11:56:08Z","published":"2024-06-01T11:56:08Z","title":"An LLM-based Recommender System Environment","summary":"  Reinforcement learning (RL) has gained popularity in the realm of recommender\nsystems due to its ability to optimize long-term rewards and guide users in\ndiscovering relevant content. However, the successful implementation of RL in\nrecommender systems is challenging because of several factors, including the\nlimited availability of online data for training on-policy methods. This\nscarcity requires expensive human interaction for online model training.\nFurthermore, the development of effective evaluation frameworks that accurately\nreflect the quality of models remains a fundamental challenge in recommender\nsystems. To address these challenges, we propose a comprehensive framework for\nsynthetic environments that simulate human behavior by harnessing the\ncapabilities of large language models (LLMs). We complement our framework with\nin-depth ablation studies and demonstrate its effectiveness with experiments on\nmovie and book recommendations. By utilizing LLMs as synthetic users, this work\nintroduces a modular and novel framework for training RL-based recommender\nsystems. The software, including the RL environment, is publicly available.\n","authors":["Nathan Corecco","Giorgio Piatti","Luca A. Lanzendörfer","Flint Xiaofeng Fan","Roger Wattenhofer"],"pdf_url":"https://arxiv.org/pdf/2406.01631v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.01629v1","updated":"2024-06-01T10:20:52Z","published":"2024-06-01T10:20:52Z","title":"RecDiff: Diffusion Model for Social Recommendation","summary":"  Social recommendation has emerged as a powerful approach to enhance\npersonalized recommendations by leveraging the social connections among users,\nsuch as following and friend relations observed in online social platforms. The\nfundamental assumption of social recommendation is that socially-connected\nusers exhibit homophily in their preference patterns. This means that users\nconnected by social ties tend to have similar tastes in user-item activities,\nsuch as rating and purchasing. However, this assumption is not always valid due\nto the presence of irrelevant and false social ties, which can contaminate user\nembeddings and adversely affect recommendation accuracy. To address this\nchallenge, we propose a novel diffusion-based social denoising framework for\nrecommendation (RecDiff). Our approach utilizes a simple yet effective\nhidden-space diffusion paradigm to alleivate the noisy effect in the compressed\nand dense representation space. By performing multi-step noise diffusion and\nremoval, RecDiff possesses a robust ability to identify and eliminate noise\nfrom the encoded user representations, even when the noise levels vary. The\ndiffusion module is optimized in a downstream task-aware manner, thereby\nmaximizing its ability to enhance the recommendation process. We conducted\nextensive experiments to evaluate the efficacy of our framework, and the\nresults demonstrate its superiority in terms of recommendation accuracy,\ntraining efficiency, and denoising effectiveness. The source code for the model\nimplementation is publicly available at: https://github.com/HKUDS/RecDiff.\n","authors":["Zongwei Li","Lianghao Xia","Chao Huang"],"pdf_url":"https://arxiv.org/pdf/2406.01629v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.00333v1","updated":"2024-06-01T07:18:56Z","published":"2024-06-01T07:18:56Z","title":"A Practice-Friendly Two-Stage LLM-Enhanced Paradigm in Sequential\n  Recommendation","summary":"  The training paradigm integrating large language models (LLM) is gradually\nreshaping sequential recommender systems (SRS) and has shown promising results.\nHowever, most existing LLM-enhanced methods rely on rich textual information on\nthe item side and instance-level supervised fine-tuning (SFT) to inject\ncollaborative information into LLM, which is inefficient and limited in many\napplications. To alleviate these problems, this paper proposes a novel\npractice-friendly two-stage LLM-enhanced paradigm (TSLRec) for SRS.\nSpecifically, in the information reconstruction stage, we design a new\nuser-level SFT task for collaborative information injection with the assistance\nof a pre-trained SRS model, which is more efficient and compatible with limited\ntext information. We aim to let LLM try to infer the latent category of each\nitem and reconstruct the corresponding user's preference distribution for all\ncategories from the user's interaction sequence. In the information\naugmentation stage, we feed each item into LLM to obtain a set of enhanced\nembeddings that combine collaborative information and LLM inference\ncapabilities. These embeddings can then be used to help train various future\nSRS models. Finally, we verify the effectiveness and efficiency of our TSLRec\non three SRS benchmark datasets.\n","authors":["Dugang Liu","Shenxian Xian","Xiaolin Lin","Xiaolian Zhang","Hong Zhu","Yuan Fang","Zhen Chen","Zhong Ming"],"pdf_url":"https://arxiv.org/pdf/2406.00333v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.00323v1","updated":"2024-06-01T06:53:03Z","published":"2024-06-01T06:53:03Z","title":"BeFA: A General Behavior-driven Feature Adapter for Multimedia\n  Recommendation","summary":"  Multimedia recommender systems focus on utilizing behavioral information and\ncontent information to model user preferences. Typically, it employs\npre-trained feature encoders to extract content features, then fuses them with\nbehavioral features. However, pre-trained feature encoders often extract\nfeatures from the entire content simultaneously, including excessive\npreference-irrelevant details. We speculate that it may result in the extracted\nfeatures not containing sufficient features to accurately reflect user\npreferences. To verify our hypothesis, we introduce an attribution analysis\nmethod for visually and intuitively analyzing the content features. The results\nindicate that certain products' content features exhibit the issues of\ninformation drift}and information omission,reducing the expressive ability of\nfeatures. Building upon this finding, we propose an effective and efficient\ngeneral Behavior-driven Feature Adapter (BeFA) to tackle these issues. This\nadapter reconstructs the content feature with the guidance of behavioral\ninformation, enabling content features accurately reflecting user preferences.\nExtensive experiments demonstrate the effectiveness of the adapter across all\nmultimedia recommendation methods. The code will be publicly available upon the\npaper's acceptance.\n","authors":["Qile Fan","Penghang Yu","Zhiyi Tan","Bing-Kun Bao","Guanming Lu"],"pdf_url":"https://arxiv.org/pdf/2406.00323v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.00318v1","updated":"2024-06-01T06:28:41Z","published":"2024-06-01T06:28:41Z","title":"KGLink: A column type annotation method that combines knowledge graph\n  and pre-trained language model","summary":"  The semantic annotation of tabular data plays a crucial role in various\ndownstream tasks. Previous research has proposed knowledge graph (KG)-based and\ndeep learning-based methods, each with its inherent limitations. KG-based\nmethods encounter difficulties annotating columns when there is no match for\ncolumn cells in the KG. Moreover, KG-based methods can provide multiple\npredictions for one column, making it challenging to determine the semantic\ntype with the most suitable granularity for the dataset. This type granularity\nissue limits their scalability.\n  On the other hand, deep learning-based methods face challenges related to the\nvaluable context missing issue. This occurs when the information within the\ntable is insufficient for determining the correct column type.\n  This paper presents KGLink, a method that combines WikiData KG information\nwith a pre-trained deep learning language model for table column annotation,\neffectively addressing both type granularity and valuable context missing\nissues. Through comprehensive experiments on widely used tabular datasets\nencompassing numeric and string columns with varying type granularity, we\nshowcase the effectiveness and efficiency of KGLink. By leveraging the\nstrengths of KGLink, we successfully surmount challenges related to type\ngranularity and valuable context issues, establishing it as a robust solution\nfor the semantic annotation of tabular data.\n","authors":["Yubo Wang","Hao Xin","Lei Chen"],"pdf_url":"https://arxiv.org/pdf/2406.00318v1.pdf","comment":"To be published in ICDE 2024"},{"id":"http://arxiv.org/abs/2406.00247v1","updated":"2024-06-01T00:52:41Z","published":"2024-06-01T00:52:41Z","title":"Large Language Models for Relevance Judgment in Product Search","summary":"  High relevance of retrieved and re-ranked items to the search query is the\ncornerstone of successful product search, yet measuring relevance of items to\nqueries is one of the most challenging tasks in product information retrieval,\nand quality of product search is highly influenced by the precision and scale\nof available relevance-labelled data. In this paper, we present an array of\ntechniques for leveraging Large Language Models (LLMs) for automating the\nrelevance judgment of query-item pairs (QIPs) at scale. Using a unique dataset\nof multi-million QIPs, annotated by human evaluators, we test and optimize\nhyper parameters for finetuning billion-parameter LLMs with and without Low\nRank Adaption (LoRA), as well as various modes of item attribute concatenation\nand prompting in LLM finetuning, and consider trade offs in item attribute\ninclusion for quality of relevance predictions. We demonstrate considerable\nimprovement over baselines of prior generations of LLMs, as well as\noff-the-shelf models, towards relevance annotations on par with the human\nrelevance evaluators. Our findings have immediate implications for the growing\nfield of relevance judgment automation in product search.\n","authors":["Navid Mehrdad","Hrushikesh Mohapatra","Mossaab Bagdouri","Prijith Chandran","Alessandro Magnani","Xunfan Cai","Ajit Puthenputhussery","Sachin Yadav","Tony Lee","ChengXiang Zhai","Ciya Liao"],"pdf_url":"https://arxiv.org/pdf/2406.00247v1.pdf","comment":"10 pages, 1 figure, 11 tables - SIGIR 2024, LLM4Eval"}],"Multimedia":[{"id":"http://arxiv.org/abs/2312.15583v3","updated":"2024-06-01T09:31:15Z","published":"2023-12-25T01:57:22Z","title":"ITEACH-Net: Inverted Teacher-studEnt seArCH Network for Emotion\n  Recognition in Conversation","summary":"  There remain two critical challenges that hinder the development of ERC.\nFirstly, there is a lack of exploration into mining deeper insights from the\ndata itself for conversational emotion tasks. Secondly, the systems exhibit\nvulnerability to random modality feature missing, which is a common occurrence\nin realistic settings. Focusing on these two key challenges, we propose a novel\nframework for incomplete multimodal learning in ERC, called \"Inverted\nTeacher-studEnt seArCH Network (ITEACH-Net).\" ITEACH-Net comprises two novel\ncomponents: the Emotion Context Changing Encoder (ECCE) and the Inverted\nTeacher-Student (ITS) framework. Specifically, leveraging the tendency for\nemotional states to exhibit local stability within conversational contexts,\nECCE captures these patterns and further perceives their evolution over time.\nRecognizing the varying challenges of handling incomplete versus complete data,\nITS employs a teacher-student framework to decouple the respective\ncomputations. Subsequently, through Neural Architecture Search, the student\nmodel develops enhanced computational capabilities for handling incomplete data\ncompared to the teacher model. During testing, we design a novel evaluation\nmethod, testing the model's performance under different missing rate conditions\nwithout altering the model weights. We conduct experiments on three benchmark\nERC datasets, and the results demonstrate that our ITEACH-Net outperforms\nexisting methods in incomplete multimodal ERC. We believe ITEACH-Net can\ninspire relevant research on the intrinsic nature of emotions within\nconversation scenarios and pave a more robust route for incomplete learning\ntechniques. Codes will be made available.\n","authors":["Haiyang Sun","Zheng Lian","Chenglong Wang","Kang Chen","Licai Sun","Bin Liu","Jianhua Tao"],"pdf_url":"https://arxiv.org/pdf/2312.15583v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.00409v1","updated":"2024-06-01T11:43:00Z","published":"2024-06-01T11:43:00Z","title":"Arabic Handwritten Text for Person Biometric Identification: A Deep\n  Learning Approach","summary":"  This study thoroughly investigates how well deep learning models can\nrecognize Arabic handwritten text for person biometric identification. It\ncompares three advanced architectures -- ResNet50, MobileNetV2, and\nEfficientNetB7 -- using three widely recognized datasets: AHAWP, Khatt, and\nLAMIS-MSHD. Results show that EfficientNetB7 outperforms the others, achieving\ntest accuracies of 98.57\\%, 99.15\\%, and 99.79\\% on AHAWP, Khatt, and\nLAMIS-MSHD datasets, respectively. EfficientNetB7's exceptional performance is\ncredited to its innovative techniques, including compound scaling, depth-wise\nseparable convolutions, and squeeze-and-excitation blocks. These features allow\nthe model to extract more abstract and distinctive features from handwritten\ntext images. The study's findings hold significant implications for enhancing\nidentity verification and authentication systems, highlighting the potential of\ndeep learning in Arabic handwritten text recognition for person biometric\nidentification.\n","authors":["Mazen Balat","Youssef Mohamed","Ahmed Heakl","Ahmed Zaky"],"pdf_url":"https://arxiv.org/pdf/2406.00409v1.pdf","comment":"6 pages, 11 figures, 4 tables, International IEEE Conference on the\n  Intelligent Methods, Systems, and Applications (IMSA)"},{"id":"http://arxiv.org/abs/2406.00323v1","updated":"2024-06-01T06:53:03Z","published":"2024-06-01T06:53:03Z","title":"BeFA: A General Behavior-driven Feature Adapter for Multimedia\n  Recommendation","summary":"  Multimedia recommender systems focus on utilizing behavioral information and\ncontent information to model user preferences. Typically, it employs\npre-trained feature encoders to extract content features, then fuses them with\nbehavioral features. However, pre-trained feature encoders often extract\nfeatures from the entire content simultaneously, including excessive\npreference-irrelevant details. We speculate that it may result in the extracted\nfeatures not containing sufficient features to accurately reflect user\npreferences. To verify our hypothesis, we introduce an attribution analysis\nmethod for visually and intuitively analyzing the content features. The results\nindicate that certain products' content features exhibit the issues of\ninformation drift}and information omission,reducing the expressive ability of\nfeatures. Building upon this finding, we propose an effective and efficient\ngeneral Behavior-driven Feature Adapter (BeFA) to tackle these issues. This\nadapter reconstructs the content feature with the guidance of behavioral\ninformation, enabling content features accurately reflecting user preferences.\nExtensive experiments demonstrate the effectiveness of the adapter across all\nmultimedia recommendation methods. The code will be publicly available upon the\npaper's acceptance.\n","authors":["Qile Fan","Penghang Yu","Zhiyi Tan","Bing-Kun Bao","Guanming Lu"],"pdf_url":"https://arxiv.org/pdf/2406.00323v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.00320v1","updated":"2024-06-01T06:40:22Z","published":"2024-06-01T06:40:22Z","title":"Frieren: Efficient Video-to-Audio Generation with Rectified Flow\n  Matching","summary":"  Video-to-audio (V2A) generation aims to synthesize content-matching audio\nfrom silent video, and it remains challenging to build V2A models with high\ngeneration quality, efficiency, and visual-audio temporal synchrony. We propose\nFrieren, a V2A model based on rectified flow matching. Frieren regresses the\nconditional transport vector field from noise to spectrogram latent with\nstraight paths and conducts sampling by solving ODE, outperforming\nautoregressive and score-based models in terms of audio quality. By employing a\nnon-autoregressive vector field estimator based on a feed-forward transformer\nand channel-level cross-modal feature fusion with strong temporal alignment,\nour model generates audio that is highly synchronized with the input video.\nFurthermore, through reflow and one-step distillation with guided vector field,\nour model can generate decent audio in a few, or even only one sampling step.\nExperiments indicate that Frieren achieves state-of-the-art performance in both\ngeneration quality and temporal alignment on VGGSound, with alignment accuracy\nreaching 97.22%, and 6.2% improvement in inception score over the strong\ndiffusion-based baseline. Audio samples are available at\nhttp://frieren-v2a.github.io .\n","authors":["Yongqi Wang","Wenxiang Guo","Rongjie Huang","Jiawei Huang","Zehan Wang","Fuming You","Ruiqi Li","Zhou Zhao"],"pdf_url":"https://arxiv.org/pdf/2406.00320v1.pdf","comment":null}]},"2024-06-04T00:00:00Z":{"Computation and Language":[{"id":"http://arxiv.org/abs/2406.02543v1","updated":"2024-06-04T17:58:18Z","published":"2024-06-04T17:58:18Z","title":"To Believe or Not to Believe Your LLM","summary":"  We explore uncertainty quantification in large language models (LLMs), with\nthe goal to identify when uncertainty in responses given a query is large. We\nsimultaneously consider both epistemic and aleatoric uncertainties, where the\nformer comes from the lack of knowledge about the ground truth (such as about\nfacts or the language), and the latter comes from irreducible randomness (such\nas multiple possible answers). In particular, we derive an\ninformation-theoretic metric that allows to reliably detect when only epistemic\nuncertainty is large, in which case the output of the model is unreliable. This\ncondition can be computed based solely on the output of the model obtained\nsimply by some special iterative prompting based on the previous responses.\nSuch quantification, for instance, allows to detect hallucinations (cases when\nepistemic uncertainty is high) in both single- and multi-answer responses. This\nis in contrast to many standard uncertainty quantification strategies (such as\nthresholding the log-likelihood of a response) where hallucinations in the\nmulti-answer case cannot be detected. We conduct a series of experiments which\ndemonstrate the advantage of our formulation. Further, our investigations shed\nsome light on how the probabilities assigned to a given output by an LLM can be\namplified by iterative prompting, which might be of independent interest.\n","authors":["Yasin Abbasi Yadkori","Ilja Kuzborskij","András György","Csaba Szepesvári"],"pdf_url":"https://arxiv.org/pdf/2406.02543v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.02539v1","updated":"2024-06-04T17:56:28Z","published":"2024-06-04T17:56:28Z","title":"Parrot: Multilingual Visual Instruction Tuning","summary":"  The rapid development of Multimodal Large Language Models (MLLMs) like GPT-4V\nhas marked a significant step towards artificial general intelligence. Existing\nmethods mainly focus on aligning vision encoders with LLMs through supervised\nfine-tuning (SFT) to endow LLMs with multimodal abilities, making MLLMs'\ninherent ability to react to multiple languages progressively deteriorate as\nthe training process evolves. We empirically find that the imbalanced SFT\ndatasets, primarily composed of English-centric image-text pairs, lead to\nsignificantly reduced performance in non-English languages. This is due to the\nfailure of aligning the vision encoder and LLM with multilingual tokens during\nthe SFT process. In this paper, we introduce Parrot, a novel method that\nutilizes textual guidance to drive visual token alignment at the language\nlevel. Parrot makes the visual tokens condition on diverse language inputs and\nuses Mixture-of-Experts (MoE) to promote the alignment of multilingual tokens.\nSpecifically, to enhance non-English visual tokens alignment, we compute the\ncross-attention using the initial visual features and textual embeddings, the\nresult of which is then fed into the MoE router to select the most relevant\nexperts. The selected experts subsequently convert the initial visual tokens\ninto language-specific visual tokens. Moreover, considering the current lack of\nbenchmarks for evaluating multilingual capabilities within the field, we\ncollect and make available a Massive Multilingual Multimodal Benchmark which\nincludes 6 languages, 15 categories, and 12,000 questions, named as MMMB. Our\nmethod not only demonstrates state-of-the-art performance on multilingual\nMMBench and MMMB, but also excels across a broad range of multimodal tasks.\nBoth the source code and the training dataset of Parrot will be made publicly\navailable.\n","authors":["Hai-Long Sun","Da-Wei Zhou","Yang Li","Shiyin Lu","Chao Yi","Qing-Guo Chen","Zhao Xu","Weihua Luo","Kaifu Zhang","De-Chuan Zhan","Han-Jia Ye"],"pdf_url":"https://arxiv.org/pdf/2406.02539v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.02537v1","updated":"2024-06-04T17:55:43Z","published":"2024-06-04T17:55:43Z","title":"TopViewRS: Vision-Language Models as Top-View Spatial Reasoners","summary":"  Top-view perspective denotes a typical way in which humans read and reason\nover different types of maps, and it is vital for localization and navigation\nof humans as well as of `non-human' agents, such as the ones backed by large\nVision-Language Models (VLMs). Nonetheless, spatial reasoning capabilities of\nmodern VLMs remain unattested and underexplored. In this work, we thus study\ntheir capability to understand and reason over spatial relations from the top\nview. The focus on top view also enables controlled evaluations at different\ngranularity of spatial reasoning; we clearly disentangle different abilities\n(e.g., recognizing particular objects versus understanding their relative\npositions). We introduce the TopViewRS (Top-View Reasoning in Space) dataset,\nconsisting of 11,384 multiple-choice questions with either realistic or\nsemantic top-view map as visual input. We then use it to study and evaluate\nVLMs across 4 perception and reasoning tasks with different levels of\ncomplexity. Evaluation of 10 representative open- and closed-source VLMs\nreveals the gap of more than 50% compared to average human performance, and it\nis even lower than the random baseline in some cases. Although additional\nexperiments show that Chain-of-Thought reasoning can boost model capabilities\nby 5.82% on average, the overall performance of VLMs remains limited. Our\nfindings underscore the critical need for enhanced model capability in top-view\nspatial reasoning and set a foundation for further research towards human-level\nproficiency of VLMs in real-world multimodal tasks.\n","authors":["Chengzu Li","Caiqi Zhang","Han Zhou","Nigel Collier","Anna Korhonen","Ivan Vulić"],"pdf_url":"https://arxiv.org/pdf/2406.02537v1.pdf","comment":"9 pages, 3 figures, 3 tables (21 pages, 4 figures, 15 tables\n  including references and appendices)"},{"id":"http://arxiv.org/abs/2406.02536v1","updated":"2024-06-04T17:55:38Z","published":"2024-06-04T17:55:38Z","title":"Mitigate Position Bias in Large Language Models via Scaling a Single\n  Dimension","summary":"  Large Language Models (LLMs) are increasingly applied in various real-world\nscenarios due to their excellent generalization capabilities and robust\ngenerative abilities. However, they exhibit position bias, also known as \"lost\nin the middle\", a phenomenon that is especially pronounced in long-context\nscenarios, which indicates the placement of the key information in different\npositions of a prompt can significantly affect accuracy. This paper first\nexplores the micro-level manifestations of position bias, concluding that\nattention weights are a micro-level expression of position bias. It further\nidentifies that, in addition to position embeddings, causal attention mask also\ncontributes to position bias by creating position-specific hidden states. Based\non these insights, we propose a method to mitigate position bias by scaling\nthis positional hidden states. Experiments on the NaturalQuestions\nMulti-document QA, KV retrieval, LongBench and timeline reorder tasks, using\nvarious models including RoPE models, context windowextended models, and Alibi\nmodels, demonstrate the effectiveness and generalizability of our approach. Our\nmethod can improve performance by up to 15.2% by modifying just one dimension\nof hidden states. Our code is available at https://aka.ms/PositionalHidden.\n","authors":["Yijiong Yu","Huiqiang Jiang","Xufang Luo","Qianhui Wu","Chin-Yew Lin","Dongsheng Li","Yuqing Yang","Yongfeng Huang","Lili Qiu"],"pdf_url":"https://arxiv.org/pdf/2406.02536v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.02532v1","updated":"2024-06-04T17:53:36Z","published":"2024-06-04T17:53:36Z","title":"SpecExec: Massively Parallel Speculative Decoding for Interactive LLM\n  Inference on Consumer Devices","summary":"  As large language models gain widespread adoption, running them efficiently\nbecomes crucial. Recent works on LLM inference use speculative decoding to\nachieve extreme speedups. However, most of these works implicitly design their\nalgorithms for high-end datacenter hardware. In this work, we ask the opposite\nquestion: how fast can we run LLMs on consumer machines? Consumer GPUs can no\nlonger fit the largest available models (50B+ parameters) and must offload them\nto RAM or SSD. When running with offloaded parameters, the inference engine can\nprocess batches of hundreds or thousands of tokens at the same time as just one\ntoken, making it a natural fit for speculative decoding. We propose SpecExec\n(Speculative Execution), a simple parallel decoding method that can generate up\nto 20 tokens per target model iteration for popular LLM families. It utilizes\nthe high spikiness of the token probabilities distribution in modern LLMs and a\nhigh degree of alignment between model output probabilities. SpecExec takes the\nmost probable tokens continuation from the draft model to build a \"cache\" tree\nfor the target model, which then gets validated in a single pass. Using\nSpecExec, we demonstrate inference of 50B+ parameter LLMs on consumer GPUs with\nRAM offloading at 4-6 tokens per second with 4-bit quantization or 2-3 tokens\nper second with 16-bit weights.\n","authors":["Ruslan Svirschevski","Avner May","Zhuoming Chen","Beidi Chen","Zhihao Jia","Max Ryabinin"],"pdf_url":"https://arxiv.org/pdf/2406.02532v1.pdf","comment":"preprint. arXiv admin note: text overlap with arXiv:2312.17238 by\n  other authors"},{"id":"http://arxiv.org/abs/2406.02528v1","updated":"2024-06-04T17:50:34Z","published":"2024-06-04T17:50:34Z","title":"Scalable MatMul-free Language Modeling","summary":"  Matrix multiplication (MatMul) typically dominates the overall computational\ncost of large language models (LLMs). This cost only grows as LLMs scale to\nlarger embedding dimensions and context lengths. In this work, we show that\nMatMul operations can be completely eliminated from LLMs while maintaining\nstrong performance at billion-parameter scales. Our experiments show that our\nproposed MatMul-free models achieve performance on-par with state-of-the-art\nTransformers that require far more memory during inference at a scale up to at\nleast 2.7B parameters. We investigate the scaling laws and find that the\nperformance gap between our MatMul-free models and full precision Transformers\nnarrows as the model size increases. We also provide a GPU-efficient\nimplementation of this model which reduces memory usage by up to 61% over an\nunoptimized baseline during training. By utilizing an optimized kernel during\ninference, our model's memory consumption can be reduced by more than 10x\ncompared to unoptimized models. To properly quantify the efficiency of our\narchitecture, we build a custom hardware solution on an FPGA which exploits\nlightweight operations beyond what GPUs are capable of. We processed\nbillion-parameter scale models at 13W beyond human readable throughput, moving\nLLMs closer to brain-like efficiency. This work not only shows how far LLMs can\nbe stripped back while still performing effectively, but also points at the\ntypes of operations future accelerators should be optimized for in processing\nthe next generation of lightweight LLMs. Our code implementation is available\nat \\url{https://github.com/ridgerchu/matmulfreellm}.\n","authors":["Rui-Jie Zhu","Yu Zhang","Ethan Sifferman","Tyler Sheaves","Yiqiao Wang","Dustin Richmond","Peng Zhou","Jason K. Eshraghian"],"pdf_url":"https://arxiv.org/pdf/2406.02528v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.02524v1","updated":"2024-06-04T17:42:21Z","published":"2024-06-04T17:42:21Z","title":"CheckEmbed: Effective Verification of LLM Solutions to Open-Ended Tasks","summary":"  Large Language Models (LLMs) are revolutionizing various domains, yet\nverifying their answers remains a significant challenge, especially for\nintricate open-ended tasks such as consolidation, summarization, and extraction\nof knowledge. In this work, we propose CheckEmbed: an accurate, scalable, and\nsimple LLM verification approach. CheckEmbed is driven by a straightforward yet\npowerful idea: in order to compare LLM solutions to one another or to the\nground-truth, compare their corresponding answer-level embeddings obtained with\na model such as GPT Text Embedding Large. This reduces a complex textual answer\nto a single embedding, facilitating straightforward, fast, and meaningful\nverification. We develop a comprehensive verification pipeline implementing the\nCheckEmbed methodology. The CheckEmbed pipeline also comes with metrics for\nassessing the truthfulness of the LLM answers, such as embedding heatmaps and\ntheir summaries. We show how to use these metrics for deploying practical\nengines that decide whether an LLM answer is satisfactory or not. We apply the\npipeline to real-world document analysis tasks, including term extraction and\ndocument summarization, showcasing significant improvements in accuracy,\ncost-effectiveness, and runtime performance compared to existing token-,\nsentence-, and fact-level schemes such as BERTScore or SelfCheckGPT.\n","authors":["Maciej Besta","Lorenzo Paleari","Ales Kubicek","Piotr Nyczyk","Robert Gerstenberger","Patrick Iff","Tomasz Lehmann","Hubert Niewiadomski","Torsten Hoefler"],"pdf_url":"https://arxiv.org/pdf/2406.02524v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.02517v1","updated":"2024-06-04T17:39:23Z","published":"2024-06-04T17:39:23Z","title":"Deterministic Reversible Data Augmentation for Neural Machine\n  Translation","summary":"  Data augmentation is an effective way to diversify corpora in machine\ntranslation, but previous methods may introduce semantic inconsistency between\noriginal and augmented data because of irreversible operations and random\nsubword sampling procedures. To generate both symbolically diverse and\nsemantically consistent augmentation data, we propose Deterministic Reversible\nData Augmentation (DRDA), a simple but effective data augmentation method for\nneural machine translation. DRDA adopts deterministic segmentations and\nreversible operations to generate multi-granularity subword representations and\npulls them closer together with multi-view techniques. With no extra corpora or\nmodel changes required, DRDA outperforms strong baselines on several\ntranslation tasks with a clear margin (up to 4.3 BLEU gain over Transformer)\nand exhibits good robustness in noisy, low-resource, and cross-domain datasets.\n","authors":["Jiashu Yao","Heyan Huang","Zeming Liu","Yuhang Guo"],"pdf_url":"https://arxiv.org/pdf/2406.02517v1.pdf","comment":"Findings of ACL 2024"},{"id":"http://arxiv.org/abs/2404.15485v2","updated":"2024-06-04T17:37:08Z","published":"2024-04-23T19:55:18Z","title":"Large Language Models Spot Phishing Emails with Surprising Accuracy: A\n  Comparative Analysis of Performance","summary":"  Phishing, a prevalent cybercrime tactic for decades, remains a significant\nthreat in today's digital world. By leveraging clever social engineering\nelements and modern technology, cybercrime targets many individuals,\nbusinesses, and organizations to exploit trust and security. These\ncyber-attackers are often disguised in many trustworthy forms to appear as\nlegitimate sources. By cleverly using psychological elements like urgency,\nfear, social proof, and other manipulative strategies, phishers can lure\nindividuals into revealing sensitive and personalized information. Building on\nthis pervasive issue within modern technology, this paper aims to analyze the\neffectiveness of 15 Large Language Models (LLMs) in detecting phishing\nattempts, specifically focusing on a randomized set of \"419 Scam\" emails. The\nobjective is to determine which LLMs can accurately detect phishing emails by\nanalyzing a text file containing email metadata based on predefined criteria.\nThe experiment concluded that the following models, ChatGPT 3.5,\nGPT-3.5-Turbo-Instruct, and ChatGPT, were the most effective in detecting\nphishing emails.\n","authors":["Het Patel","Umair Rehman","Farkhund Iqbal"],"pdf_url":"https://arxiv.org/pdf/2404.15485v2.pdf","comment":"7 pages, 3 figures"},{"id":"http://arxiv.org/abs/2402.04833v2","updated":"2024-06-04T17:20:01Z","published":"2024-02-07T13:32:11Z","title":"Long Is More for Alignment: A Simple but Tough-to-Beat Baseline for\n  Instruction Fine-Tuning","summary":"  There is a consensus that instruction fine-tuning of LLMs requires\nhigh-quality data, but what are they? LIMA (NeurIPS 2023) and AlpaGasus (ICLR\n2024) are state-of-the-art methods for selecting such high-quality examples,\neither via manual curation or using GPT-3.5-Turbo as a quality scorer. We show\nthat the extremely simple baseline of selecting the 1,000 instructions with\nlongest responses -- that intuitively contain more learnable information and\nare harder to overfit -- from standard datasets can consistently outperform\nthese sophisticated methods according to GPT-4 and PaLM-2 as judges, while\nremaining competitive on the Open LLM benchmarks that test factual knowledge.\nWe demonstrate this for several LLMs (Llama-2-7B, Llama-2-13B, Mistral-7B-v0.1)\nand datasets (Alpaca-52k, Evol-Instruct-70k). In addition, a lightweight\nrefinement of such long instructions can further improve the abilities of the\nfine-tuned LLMs, and allows us to obtain competitive results on MT-Bench and\nthe 2nd highest-ranked Llama-2-7B-based model on AlpacaEval 2.0, while training\non only 1,000 examples and no extra preference data. We also conduct a thorough\nanalysis of our models to ensure that their enhanced performance is not simply\ndue to GPT-4's preference for longer responses. Overall, our findings suggest\nthat fine-tuning on the longest responses should be the default baseline for\nany work on instruction fine-tuning. We provide our code at\nhttps://github.com/tml-epfl/long-is-more-for-alignment.\n","authors":["Hao Zhao","Maksym Andriushchenko","Francesco Croce","Nicolas Flammarion"],"pdf_url":"https://arxiv.org/pdf/2402.04833v2.pdf","comment":"Accepted at ICML 2024. This camera-ready version adds MT-Bench\n  evaluations, a human study, more thorough analysis of length bias. Code at\n  https://github.com/tml-epfl/long-is-more-for-alignment"}],"Computer Vision and Pattern Recognition":[{"id":"http://arxiv.org/abs/2406.02552v1","updated":"2024-06-04T17:59:57Z","published":"2024-06-04T17:59:57Z","title":"VHS: High-Resolution Iterative Stereo Matching with Visual Hull Priors","summary":"  We present a stereo-matching method for depth estimation from high-resolution\nimages using visual hulls as priors, and a memory-efficient technique for the\ncorrelation computation. Our method uses object masks extracted from\nsupplementary views of the scene to guide the disparity estimation, effectively\nreducing the search space for matches. This approach is specifically tailored\nto stereo rigs in volumetric capture systems, where an accurate depth plays a\nkey role in the downstream reconstruction task. To enable training and\nregression at high resolutions targeted by recent systems, our approach extends\na sparse correlation computation into a hybrid sparse-dense scheme suitable for\napplication in leading recurrent network architectures. We evaluate the\nperformance-efficiency trade-off of our method compared to state-of-the-art\nmethods, and demonstrate the efficacy of the visual hull guidance. In addition,\nwe propose a training scheme for a further reduction of memory requirements\nduring optimization, facilitating training on high-resolution data.\n","authors":["Markus Plack","Hannah Dröge","Leif Van Holland","Matthias B. Hullin"],"pdf_url":"https://arxiv.org/pdf/2406.02552v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.02549v1","updated":"2024-06-04T17:59:32Z","published":"2024-06-04T17:59:32Z","title":"Dreamguider: Improved Training free Diffusion-based Conditional\n  Generation","summary":"  Diffusion models have emerged as a formidable tool for training-free\nconditional generation.However, a key hurdle in inference-time guidance\ntechniques is the need for compute-heavy backpropagation through the diffusion\nnetwork for estimating the guidance direction. Moreover, these techniques often\nrequire handcrafted parameter tuning on a case-by-case basis. Although some\nrecent works have introduced minimal compute methods for linear inverse\nproblems, a generic lightweight guidance solution to both linear and non-linear\nguidance problems is still missing. To this end, we propose Dreamguider, a\nmethod that enables inference-time guidance without compute-heavy\nbackpropagation through the diffusion network. The key idea is to regulate the\ngradient flow through a time-varying factor. Moreover, we propose an empirical\nguidance scale that works for a wide variety of tasks, hence removing the need\nfor handcrafted parameter tuning. We further introduce an effective lightweight\naugmentation strategy that significantly boosts the performance during\ninference-time guidance. We present experiments using Dreamguider on multiple\ntasks across multiple datasets and models to show the effectiveness of the\nproposed modules. To facilitate further research, we will make the code public\nafter the review process.\n","authors":["Nithin Gopalakrishnan Nair","Vishal M Patel"],"pdf_url":"https://arxiv.org/pdf/2406.02549v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.02548v1","updated":"2024-06-04T17:59:31Z","published":"2024-06-04T17:59:31Z","title":"Open-YOLO 3D: Towards Fast and Accurate Open-Vocabulary 3D Instance\n  Segmentation","summary":"  Recent works on open-vocabulary 3D instance segmentation show strong promise,\nbut at the cost of slow inference speed and high computation requirements. This\nhigh computation cost is typically due to their heavy reliance on 3D clip\nfeatures, which require computationally expensive 2D foundation models like\nSegment Anything (SAM) and CLIP for multi-view aggregation into 3D. As a\nconsequence, this hampers their applicability in many real-world applications\nthat require both fast and accurate predictions. To this end, we propose a fast\nyet accurate open-vocabulary 3D instance segmentation approach, named Open-YOLO\n3D, that effectively leverages only 2D object detection from multi-view RGB\nimages for open-vocabulary 3D instance segmentation. We address this task by\ngenerating class-agnostic 3D masks for objects in the scene and associating\nthem with text prompts. We observe that the projection of class-agnostic 3D\npoint cloud instances already holds instance information; thus, using SAM might\nonly result in redundancy that unnecessarily increases the inference time. We\nempirically find that a better performance of matching text prompts to 3D masks\ncan be achieved in a faster fashion with a 2D object detector. We validate our\nOpen-YOLO 3D on two benchmarks, ScanNet200 and Replica, under two scenarios:\n(i) with ground truth masks, where labels are required for given object\nproposals, and (ii) with class-agnostic 3D proposals generated from a 3D\nproposal network. Our Open-YOLO 3D achieves state-of-the-art performance on\nboth datasets while obtaining up to $\\sim$16$\\times$ speedup compared to the\nbest existing method in literature. On ScanNet200 val. set, our Open-YOLO 3D\nachieves mean average precision (mAP) of 24.7\\% while operating at 22 seconds\nper scene. Code and model are available at github.com/aminebdj/OpenYOLO3D.\n","authors":["Mohamed El Amine Boudjoghra","Angela Dai","Jean Lahoud","Hisham Cholakkal","Rao Muhammad Anwer","Salman Khan","Fahad Shahbaz Khan"],"pdf_url":"https://arxiv.org/pdf/2406.02548v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.02547v1","updated":"2024-06-04T17:59:25Z","published":"2024-06-04T17:59:25Z","title":"Leveraging Visual Tokens for Extended Text Contexts in Multi-Modal\n  Learning","summary":"  Training models with longer in-context lengths is a significant challenge for\nmultimodal model due to substantial GPU memory and computational costs. This\nexploratory study does not present state-of-the-art models; rather, it\nintroduces an innovative method designed to increase in-context text length in\nmulti-modality large language models (MLLMs) efficiently. We present Visualized\nIn-Context Text Processing (VisInContext), which processes long in-context text\nusing visual tokens. This technique significantly reduces GPU memory usage and\nfloating point operations (FLOPs) for both training and inferenceing stage. For\ninstance, our method expands the pre-training in-context text length from 256\nto 2048 tokens with nearly same FLOPs for a 56 billion parameter MOE model.\nExperimental results demonstrate that model trained with VisInContext delivers\nsuperior performance on common downstream benchmarks for in-context few-shot\nevaluation. Additionally, VisInContext is complementary to existing methods for\nincreasing in-context text length and enhances document understanding\ncapabilities, showing great potential in document QA tasks and sequential\ndocument retrieval.\n","authors":["Alex Jinpeng Wang","Linjie Li","Yiqi Lin","Min Li","Lijuan Wang","Mike Zheng Shou"],"pdf_url":"https://arxiv.org/pdf/2406.02547v1.pdf","comment":"12 pages. The website is\n  \\url{https://fingerrec.github.io/visincontext}"},{"id":"http://arxiv.org/abs/2406.02541v1","updated":"2024-06-04T17:57:37Z","published":"2024-06-04T17:57:37Z","title":"Enhancing Temporal Consistency in Video Editing by Reconstructing Videos\n  with 3D Gaussian Splatting","summary":"  Recent advancements in zero-shot video diffusion models have shown promise\nfor text-driven video editing, but challenges remain in achieving high temporal\nconsistency. To address this, we introduce Video-3DGS, a 3D Gaussian Splatting\n(3DGS)-based video refiner designed to enhance temporal consistency in\nzero-shot video editors. Our approach utilizes a two-stage 3D Gaussian\noptimizing process tailored for editing dynamic monocular videos. In the first\nstage, Video-3DGS employs an improved version of COLMAP, referred to as\nMC-COLMAP, which processes original videos using a Masked and Clipped approach.\nFor each video clip, MC-COLMAP generates the point clouds for dynamic\nforeground objects and complex backgrounds. These point clouds are utilized to\ninitialize two sets of 3D Gaussians (Frg-3DGS and Bkg-3DGS) aiming to represent\nforeground and background views. Both foreground and background views are then\nmerged with a 2D learnable parameter map to reconstruct full views. In the\nsecond stage, we leverage the reconstruction ability developed in the first\nstage to impose the temporal constraints on the video diffusion model. To\ndemonstrate the efficacy of Video-3DGS on both stages, we conduct extensive\nexperiments across two related tasks: Video Reconstruction and Video Editing.\nVideo-3DGS trained with 3k iterations significantly improves video\nreconstruction quality (+3 PSNR, +7 PSNR increase) and training efficiency\n(x1.9, x4.5 times faster) over NeRF-based and 3DGS-based state-of-art methods\non DAVIS dataset, respectively. Moreover, it enhances video editing by ensuring\ntemporal consistency across 58 dynamic monocular videos.\n","authors":["Inkyu Shin","Qihang Yu","Xiaohui Shen","In So Kweon","Kuk-Jin Yoon","Liang-Chieh Chen"],"pdf_url":"https://arxiv.org/pdf/2406.02541v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.02540v1","updated":"2024-06-04T17:57:10Z","published":"2024-06-04T17:57:10Z","title":"ViDiT-Q: Efficient and Accurate Quantization of Diffusion Transformers\n  for Image and Video Generation","summary":"  Diffusion transformers (DiTs) have exhibited remarkable performance in visual\ngeneration tasks, such as generating realistic images or videos based on\ntextual instructions. However, larger model sizes and multi-frame processing\nfor video generation lead to increased computational and memory costs, posing\nchallenges for practical deployment on edge devices. Post-Training Quantization\n(PTQ) is an effective method for reducing memory costs and computational\ncomplexity. When quantizing diffusion transformers, we find that applying\nexisting diffusion quantization methods designed for U-Net faces challenges in\npreserving quality. After analyzing the major challenges for quantizing\ndiffusion transformers, we design an improved quantization scheme: \"ViDiT-Q\":\nVideo and Image Diffusion Transformer Quantization) to address these issues.\nFurthermore, we identify highly sensitive layers and timesteps hinder\nquantization for lower bit-widths. To tackle this, we improve ViDiT-Q with a\nnovel metric-decoupled mixed-precision quantization method (ViDiT-Q-MP). We\nvalidate the effectiveness of ViDiT-Q across a variety of text-to-image and\nvideo models. While baseline quantization methods fail at W8A8 and produce\nunreadable content at W4A8, ViDiT-Q achieves lossless W8A8 quantization.\nViDiTQ-MP achieves W4A8 with negligible visual quality degradation, resulting\nin a 2.5x memory optimization and a 1.5x latency speedup.\n","authors":["Tianchen Zhao","Tongcheng Fang","Enshu Liu","Wan Rui","Widyadewi Soedarmadji","Shiyao Li","Zinan Lin","Guohao Dai","Shengen Yan","Huazhong Yang","Xuefei Ning","Yu Wang"],"pdf_url":"https://arxiv.org/pdf/2406.02540v1.pdf","comment":"Project Page: https://a-suozhang.xyz/viditq.github.io/"},{"id":"http://arxiv.org/abs/2406.02539v1","updated":"2024-06-04T17:56:28Z","published":"2024-06-04T17:56:28Z","title":"Parrot: Multilingual Visual Instruction Tuning","summary":"  The rapid development of Multimodal Large Language Models (MLLMs) like GPT-4V\nhas marked a significant step towards artificial general intelligence. Existing\nmethods mainly focus on aligning vision encoders with LLMs through supervised\nfine-tuning (SFT) to endow LLMs with multimodal abilities, making MLLMs'\ninherent ability to react to multiple languages progressively deteriorate as\nthe training process evolves. We empirically find that the imbalanced SFT\ndatasets, primarily composed of English-centric image-text pairs, lead to\nsignificantly reduced performance in non-English languages. This is due to the\nfailure of aligning the vision encoder and LLM with multilingual tokens during\nthe SFT process. In this paper, we introduce Parrot, a novel method that\nutilizes textual guidance to drive visual token alignment at the language\nlevel. Parrot makes the visual tokens condition on diverse language inputs and\nuses Mixture-of-Experts (MoE) to promote the alignment of multilingual tokens.\nSpecifically, to enhance non-English visual tokens alignment, we compute the\ncross-attention using the initial visual features and textual embeddings, the\nresult of which is then fed into the MoE router to select the most relevant\nexperts. The selected experts subsequently convert the initial visual tokens\ninto language-specific visual tokens. Moreover, considering the current lack of\nbenchmarks for evaluating multilingual capabilities within the field, we\ncollect and make available a Massive Multilingual Multimodal Benchmark which\nincludes 6 languages, 15 categories, and 12,000 questions, named as MMMB. Our\nmethod not only demonstrates state-of-the-art performance on multilingual\nMMBench and MMMB, but also excels across a broad range of multimodal tasks.\nBoth the source code and the training dataset of Parrot will be made publicly\navailable.\n","authors":["Hai-Long Sun","Da-Wei Zhou","Yang Li","Shiyin Lu","Chao Yi","Qing-Guo Chen","Zhao Xu","Weihua Luo","Kaifu Zhang","De-Chuan Zhan","Han-Jia Ye"],"pdf_url":"https://arxiv.org/pdf/2406.02539v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.02537v1","updated":"2024-06-04T17:55:43Z","published":"2024-06-04T17:55:43Z","title":"TopViewRS: Vision-Language Models as Top-View Spatial Reasoners","summary":"  Top-view perspective denotes a typical way in which humans read and reason\nover different types of maps, and it is vital for localization and navigation\nof humans as well as of `non-human' agents, such as the ones backed by large\nVision-Language Models (VLMs). Nonetheless, spatial reasoning capabilities of\nmodern VLMs remain unattested and underexplored. In this work, we thus study\ntheir capability to understand and reason over spatial relations from the top\nview. The focus on top view also enables controlled evaluations at different\ngranularity of spatial reasoning; we clearly disentangle different abilities\n(e.g., recognizing particular objects versus understanding their relative\npositions). We introduce the TopViewRS (Top-View Reasoning in Space) dataset,\nconsisting of 11,384 multiple-choice questions with either realistic or\nsemantic top-view map as visual input. We then use it to study and evaluate\nVLMs across 4 perception and reasoning tasks with different levels of\ncomplexity. Evaluation of 10 representative open- and closed-source VLMs\nreveals the gap of more than 50% compared to average human performance, and it\nis even lower than the random baseline in some cases. Although additional\nexperiments show that Chain-of-Thought reasoning can boost model capabilities\nby 5.82% on average, the overall performance of VLMs remains limited. Our\nfindings underscore the critical need for enhanced model capability in top-view\nspatial reasoning and set a foundation for further research towards human-level\nproficiency of VLMs in real-world multimodal tasks.\n","authors":["Chengzu Li","Caiqi Zhang","Han Zhou","Nigel Collier","Anna Korhonen","Ivan Vulić"],"pdf_url":"https://arxiv.org/pdf/2406.02537v1.pdf","comment":"9 pages, 3 figures, 3 tables (21 pages, 4 figures, 15 tables\n  including references and appendices)"},{"id":"http://arxiv.org/abs/2406.02535v1","updated":"2024-06-04T17:55:22Z","published":"2024-06-04T17:55:22Z","title":"Enhancing 2D Representation Learning with a 3D Prior","summary":"  Learning robust and effective representations of visual data is a fundamental\ntask in computer vision. Traditionally, this is achieved by training models\nwith labeled data which can be expensive to obtain. Self-supervised learning\nattempts to circumvent the requirement for labeled data by learning\nrepresentations from raw unlabeled visual data alone. However, unlike humans\nwho obtain rich 3D information from their binocular vision and through motion,\nthe majority of current self-supervised methods are tasked with learning from\nmonocular 2D image collections. This is noteworthy as it has been demonstrated\nthat shape-centric visual processing is more robust compared to texture-biased\nautomated methods. Inspired by this, we propose a new approach for\nstrengthening existing self-supervised methods by explicitly enforcing a strong\n3D structural prior directly into the model during training. Through\nexperiments, across a range of datasets, we demonstrate that our 3D aware\nrepresentations are more robust compared to conventional self-supervised\nbaselines.\n","authors":["Mehmet Aygün","Prithviraj Dhar","Zhicheng Yan","Oisin Mac Aodha","Rakesh Ranjan"],"pdf_url":"https://arxiv.org/pdf/2406.02535v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2304.01101v2","updated":"2024-06-04T17:55:02Z","published":"2023-04-03T16:01:03Z","title":"Dsfer-Net: A Deep Supervision and Feature Retrieval Network for\n  Bitemporal Change Detection Using Modern Hopfield Networks","summary":"  Change detection, an essential application for high-resolution remote sensing\nimages, aims to monitor and analyze changes in the land surface over time. Due\nto the rapid increase in the quantity of high-resolution remote sensing data\nand the complexity of texture features, several quantitative deep\nlearning-based methods have been proposed. These methods outperform traditional\nchange detection methods by extracting deep features and combining\nspatial-temporal information. However, reasonable explanations for how deep\nfeatures improve detection performance are still lacking. In our\ninvestigations, we found that modern Hopfield network layers significantly\nenhance semantic understanding. In this paper, we propose a Deep Supervision\nand FEature Retrieval network (Dsfer-Net) for bitemporal change detection.\nSpecifically, the highly representative deep features of bitemporal images are\njointly extracted through a fully convolutional Siamese network. Based on the\nsequential geographical information of the bitemporal images, we designed a\nfeature retrieval module to extract difference features and leverage\ndiscriminative information in a deeply supervised manner. Additionally, we\nobserved that the deeply supervised feature retrieval module provides\nexplainable evidence of the semantic understanding of the proposed network in\nits deep layers. Finally, our end-to-end network establishes a novel framework\nby aggregating retrieved features and feature pairs from different layers.\nExperiments conducted on three public datasets (LEVIR-CD, WHU-CD, and CDD)\nconfirm the superiority of the proposed Dsfer-Net over other state-of-the-art\nmethods.\n","authors":["Shizhen Chang","Michael Kopp","Pedram Ghamisi","Bo Du"],"pdf_url":"https://arxiv.org/pdf/2304.01101v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.02534v1","updated":"2024-06-04T17:54:44Z","published":"2024-06-04T17:54:44Z","title":"Enhancing predictive imaging biomarker discovery through treatment\n  effect analysis","summary":"  Identifying predictive biomarkers, which forecast individual treatment\neffectiveness, is crucial for personalized medicine and informs decision-making\nacross diverse disciplines. These biomarkers are extracted from pre-treatment\ndata, often within randomized controlled trials, and have to be distinguished\nfrom prognostic biomarkers, which are independent of treatment assignment. Our\nstudy focuses on the discovery of predictive imaging biomarkers, aiming to\nleverage pre-treatment images to unveil new causal relationships. Previous\napproaches relied on labor-intensive handcrafted or manually derived features,\nwhich may introduce biases. In response, we present a new task of discovering\npredictive imaging biomarkers directly from the pre-treatment images to learn\nrelevant image features. We propose an evaluation protocol for this task to\nassess a model's ability to identify predictive imaging biomarkers and\ndifferentiate them from prognostic ones. It employs statistical testing and a\ncomprehensive analysis of image feature attribution. We explore the suitability\nof deep learning models originally designed for estimating the conditional\naverage treatment effect (CATE) for this task, which previously have been\nprimarily assessed for the precision of CATE estimation, overlooking the\nevaluation of imaging biomarker discovery. Our proof-of-concept analysis\ndemonstrates promising results in discovering and validating predictive imaging\nbiomarkers from synthetic outcomes and real-world image datasets.\n","authors":["Shuhan Xiao","Lukas Klein","Jens Petersen","Philipp Vollmuth","Paul F. Jaeger","Klaus H. Maier-Hein"],"pdf_url":"https://arxiv.org/pdf/2406.02534v1.pdf","comment":"19 pages, 12 figures"},{"id":"http://arxiv.org/abs/2406.02533v1","updated":"2024-06-04T17:54:20Z","published":"2024-06-04T17:54:20Z","title":"SatSplatYOLO: 3D Gaussian Splatting-based Virtual Object Detection\n  Ensembles for Satellite Feature Recognition","summary":"  On-orbit servicing (OOS), inspection of spacecraft, and active debris removal\n(ADR). Such missions require precise rendezvous and proximity operations in the\nvicinity of non-cooperative, possibly unknown, resident space objects. Safety\nconcerns with manned missions and lag times with ground-based control\nnecessitate complete autonomy. In this article, we present an approach for\nmapping geometries and high-confidence detection of components of unknown,\nnon-cooperative satellites on orbit. We implement accelerated 3D Gaussian\nsplatting to learn a 3D representation of the satellite, render virtual views\nof the target, and ensemble the YOLOv5 object detector over the virtual views,\nresulting in reliable, accurate, and precise satellite component detections.\nThe full pipeline capable of running on-board and stand to enable downstream\nmachine intelligence tasks necessary for autonomous guidance, navigation, and\ncontrol tasks.\n","authors":["Van Minh Nguyen","Emma Sandidge","Trupti Mahendrakar","Ryan T. White"],"pdf_url":"https://arxiv.org/pdf/2406.02533v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.02529v1","updated":"2024-06-04T17:51:08Z","published":"2024-06-04T17:51:08Z","title":"ReLUs Are Sufficient for Learning Implicit Neural Representations","summary":"  Motivated by the growing theoretical understanding of neural networks that\nemploy the Rectified Linear Unit (ReLU) as their activation function, we\nrevisit the use of ReLU activation functions for learning implicit neural\nrepresentations (INRs). Inspired by second order B-spline wavelets, we\nincorporate a set of simple constraints to the ReLU neurons in each layer of a\ndeep neural network (DNN) to remedy the spectral bias. This in turn enables its\nuse for various INR tasks. Empirically, we demonstrate that, contrary to\npopular belief, one can learn state-of-the-art INRs based on a DNN composed of\nonly ReLU neurons. Next, by leveraging recent theoretical works which\ncharacterize the kinds of functions ReLU neural networks learn, we provide a\nway to quantify the regularity of the learned function. This offers a\nprincipled approach to selecting the hyperparameters in INR architectures. We\nsubstantiate our claims through experiments in signal representation, super\nresolution, and computed tomography, demonstrating the versatility and\neffectiveness of our method. The code for all experiments can be found at\nhttps://github.com/joeshenouda/relu-inrs.\n","authors":["Joseph Shenouda","Yamin Zhou","Robert D. Nowak"],"pdf_url":"https://arxiv.org/pdf/2406.02529v1.pdf","comment":"Accepted to ICML 2024"},{"id":"http://arxiv.org/abs/2406.02518v1","updated":"2024-06-04T17:39:31Z","published":"2024-06-04T17:39:31Z","title":"DDGS-CT: Direction-Disentangled Gaussian Splatting for Realistic Volume\n  Rendering","summary":"  Digitally reconstructed radiographs (DRRs) are simulated 2D X-ray images\ngenerated from 3D CT volumes, widely used in preoperative settings but limited\nin intraoperative applications due to computational bottlenecks, especially for\naccurate but heavy physics-based Monte Carlo methods. While analytical DRR\nrenderers offer greater efficiency, they overlook anisotropic X-ray image\nformation phenomena, such as Compton scattering. We present a novel approach\nthat marries realistic physics-inspired X-ray simulation with efficient,\ndifferentiable DRR generation using 3D Gaussian splatting (3DGS). Our\ndirection-disentangled 3DGS (DDGS) method separates the radiosity contribution\ninto isotropic and direction-dependent components, approximating complex\nanisotropic interactions without intricate runtime simulations. Additionally,\nwe adapt the 3DGS initialization to account for tomography data properties,\nenhancing accuracy and efficiency. Our method outperforms state-of-the-art\ntechniques in image accuracy. Furthermore, our DDGS shows promise for\nintraoperative applications and inverse problems such as pose registration,\ndelivering superior registration accuracy and runtime performance compared to\nanalytical DRR methods.\n","authors":["Zhongpai Gao","Benjamin Planche","Meng Zheng","Xiao Chen","Terrence Chen","Ziyan Wu"],"pdf_url":"https://arxiv.org/pdf/2406.02518v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.02511v1","updated":"2024-06-04T17:32:52Z","published":"2024-06-04T17:32:52Z","title":"V-Express: Conditional Dropout for Progressive Training of Portrait\n  Video Generation","summary":"  In the field of portrait video generation, the use of single images to\ngenerate portrait videos has become increasingly prevalent. A common approach\ninvolves leveraging generative models to enhance adapters for controlled\ngeneration. However, control signals (e.g., text, audio, reference image, pose,\ndepth map, etc.) can vary in strength. Among these, weaker conditions often\nstruggle to be effective due to interference from stronger conditions, posing a\nchallenge in balancing these conditions. In our work on portrait video\ngeneration, we identified audio signals as particularly weak, often\novershadowed by stronger signals such as facial pose and reference image.\nHowever, direct training with weak signals often leads to difficulties in\nconvergence. To address this, we propose V-Express, a simple method that\nbalances different control signals through the progressive training and the\nconditional dropout operation. Our method gradually enables effective control\nby weak conditions, thereby achieving generation capabilities that\nsimultaneously take into account the facial pose, reference image, and audio.\nThe experimental results demonstrate that our method can effectively generate\nportrait videos controlled by audio. Furthermore, a potential solution is\nprovided for the simultaneous and effective use of conditions of varying\nstrengths.\n","authors":["Cong Wang","Kuan Tian","Jun Zhang","Yonghang Guan","Feng Luo","Fei Shen","Zhiwei Jiang","Qing Gu","Xiao Han","Wei Yang"],"pdf_url":"https://arxiv.org/pdf/2406.02511v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.02509v1","updated":"2024-06-04T17:27:19Z","published":"2024-06-04T17:27:19Z","title":"CamCo: Camera-Controllable 3D-Consistent Image-to-Video Generation","summary":"  Recently video diffusion models have emerged as expressive generative tools\nfor high-quality video content creation readily available to general users.\nHowever, these models often do not offer precise control over camera poses for\nvideo generation, limiting the expression of cinematic language and user\ncontrol. To address this issue, we introduce CamCo, which allows fine-grained\nCamera pose Control for image-to-video generation. We equip a pre-trained\nimage-to-video generator with accurately parameterized camera pose input using\nPl\\\"ucker coordinates. To enhance 3D consistency in the videos produced, we\nintegrate an epipolar attention module in each attention block that enforces\nepipolar constraints to the feature maps. Additionally, we fine-tune CamCo on\nreal-world videos with camera poses estimated through structure-from-motion\nalgorithms to better synthesize object motion. Our experiments show that CamCo\nsignificantly improves 3D consistency and camera control capabilities compared\nto previous models while effectively generating plausible object motion.\nProject page: https://ir1d.github.io/CamCo/\n","authors":["Dejia Xu","Weili Nie","Chao Liu","Sifei Liu","Jan Kautz","Zhangyang Wang","Arash Vahdat"],"pdf_url":"https://arxiv.org/pdf/2406.02509v1.pdf","comment":"Project page: https://ir1d.github.io/CamCo/"},{"id":"http://arxiv.org/abs/2406.02507v1","updated":"2024-06-04T17:25:59Z","published":"2024-06-04T17:25:59Z","title":"Guiding a Diffusion Model with a Bad Version of Itself","summary":"  The primary axes of interest in image-generating diffusion models are image\nquality, the amount of variation in the results, and how well the results align\nwith a given condition, e.g., a class label or a text prompt. The popular\nclassifier-free guidance approach uses an unconditional model to guide a\nconditional model, leading to simultaneously better prompt alignment and\nhigher-quality images at the cost of reduced variation. These effects seem\ninherently entangled, and thus hard to control. We make the surprising\nobservation that it is possible to obtain disentangled control over image\nquality without compromising the amount of variation by guiding generation\nusing a smaller, less-trained version of the model itself rather than an\nunconditional model. This leads to significant improvements in ImageNet\ngeneration, setting record FIDs of 1.01 for 64x64 and 1.25 for 512x512, using\npublicly available networks. Furthermore, the method is also applicable to\nunconditional diffusion models, drastically improving their quality.\n","authors":["Tero Karras","Miika Aittala","Tuomas Kynkäänniemi","Jaakko Lehtinen","Timo Aila","Samuli Laine"],"pdf_url":"https://arxiv.org/pdf/2406.02507v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.14125v4","updated":"2024-06-04T17:25:20Z","published":"2023-12-21T18:46:41Z","title":"VideoPoet: A Large Language Model for Zero-Shot Video Generation","summary":"  We present VideoPoet, a language model capable of synthesizing high-quality\nvideo, with matching audio, from a large variety of conditioning signals.\nVideoPoet employs a decoder-only transformer architecture that processes\nmultimodal inputs -- including images, videos, text, and audio. The training\nprotocol follows that of Large Language Models (LLMs), consisting of two\nstages: pretraining and task-specific adaptation. During pretraining, VideoPoet\nincorporates a mixture of multimodal generative objectives within an\nautoregressive Transformer framework. The pretrained LLM serves as a foundation\nthat can be adapted for a range of video generation tasks. We present empirical\nresults demonstrating the model's state-of-the-art capabilities in zero-shot\nvideo generation, specifically highlighting VideoPoet's ability to generate\nhigh-fidelity motions. Project page: http://sites.research.google/videopoet/\n","authors":["Dan Kondratyuk","Lijun Yu","Xiuye Gu","José Lezama","Jonathan Huang","Grant Schindler","Rachel Hornung","Vighnesh Birodkar","Jimmy Yan","Ming-Chang Chiu","Krishna Somandepalli","Hassan Akbari","Yair Alon","Yong Cheng","Josh Dillon","Agrim Gupta","Meera Hahn","Anja Hauth","David Hendon","Alonso Martinez","David Minnen","Mikhail Sirotenko","Kihyuk Sohn","Xuan Yang","Hartwig Adam","Ming-Hsuan Yang","Irfan Essa","Huisheng Wang","David A. Ross","Bryan Seybold","Lu Jiang"],"pdf_url":"https://arxiv.org/pdf/2312.14125v4.pdf","comment":"To appear at ICML 2024; Project page:\n  http://sites.research.google/videopoet/"},{"id":"http://arxiv.org/abs/2406.02506v1","updated":"2024-06-04T17:24:19Z","published":"2024-06-04T17:24:19Z","title":"An Open-Source Tool for Mapping War Destruction at Scale in Ukraine\n  using Sentinel-1 Time Series","summary":"  Access to detailed war impact assessments is crucial for humanitarian\norganizations to effectively assist populations most affected by armed\nconflicts. However, maintaining a comprehensive understanding of the situation\non the ground is challenging, especially in conflicts that cover vast\nterritories and extend over long periods. This study presents a scalable and\ntransferable method for estimating war-induced damage to buildings. We first\ntrain a machine learning model to output pixel-wise probability of destruction\nfrom Synthetic Aperture Radar (SAR) satellite image time series, leveraging\nexisting, manual damage assessments as ground truth and cloud-based geospatial\nanalysis tools for large-scale inference. We further post-process these\nassessments using open building footprints to obtain a final damage estimate\nper building. We introduce an accessible, open-source tool that allows users to\nadjust the confidence interval based on their specific requirements and use\ncases. Our approach enables humanitarian organizations and other actors to\nrapidly screen large geographic regions for war impacts. We provide two\npublicly accessible dashboards: a Ukraine Damage Explorer to dynamically view\nour pre-computed estimates, and a Rapid Damage Mapping Tool to easily run our\nmethod and produce custom maps.\n","authors":["Olivier Dietrich","Torben Peters","Vivien Sainte Fare Garnot","Valerie Sticher","Thao Ton-That Whelan","Konrad Schindler","Jan Dirk Wegner"],"pdf_url":"https://arxiv.org/pdf/2406.02506v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.02495v1","updated":"2024-06-04T17:13:10Z","published":"2024-06-04T17:13:10Z","title":"GenS: Generalizable Neural Surface Reconstruction from Multi-View Images","summary":"  Combining the signed distance function (SDF) and differentiable volume\nrendering has emerged as a powerful paradigm for surface reconstruction from\nmulti-view images without 3D supervision. However, current methods are impeded\nby requiring long-time per-scene optimizations and cannot generalize to new\nscenes. In this paper, we present GenS, an end-to-end generalizable neural\nsurface reconstruction model. Unlike coordinate-based methods that train a\nseparate network for each scene, we construct a generalized multi-scale volume\nto directly encode all scenes. Compared with existing solutions, our\nrepresentation is more powerful, which can recover high-frequency details while\nmaintaining global smoothness. Meanwhile, we introduce a multi-scale\nfeature-metric consistency to impose the multi-view consistency in a more\ndiscriminative multi-scale feature space, which is robust to the failures of\nthe photometric consistency. And the learnable feature can be self-enhanced to\ncontinuously improve the matching accuracy and mitigate aggregation ambiguity.\nFurthermore, we design a view contrast loss to force the model to be robust to\nthose regions covered by few viewpoints through distilling the geometric prior\nfrom dense input to sparse input. Extensive experiments on popular benchmarks\nshow that our model can generalize well to new scenes and outperform existing\nstate-of-the-art methods even those employing ground-truth depth supervision.\nCode is available at https://github.com/prstrive/GenS.\n","authors":["Rui Peng","Xiaodong Gu","Luyang Tang","Shihe Shen","Fanqi Yu","Ronggang Wang"],"pdf_url":"https://arxiv.org/pdf/2406.02495v1.pdf","comment":"NeurIPS 2023 Accepted"},{"id":"http://arxiv.org/abs/2403.07134v2","updated":"2024-06-04T16:57:16Z","published":"2024-03-11T20:04:03Z","title":"COMQ: A Backpropagation-Free Algorithm for Post-Training Quantization","summary":"  Post-training quantization (PTQ) has emerged as a practical approach to\ncompress large neural networks, making them highly efficient for deployment.\nHowever, effectively reducing these models to their low-bit counterparts\nwithout compromising the original accuracy remains a key challenge. In this\npaper, we propose an innovative PTQ algorithm termed COMQ, which sequentially\nconducts coordinate-wise minimization of the layer-wise reconstruction errors.\nWe consider the widely used integer quantization, where every quantized weight\ncan be decomposed into a shared floating-point scalar and an integer bit-code.\nWithin a fixed layer, COMQ treats all the scaling factor(s) and bit-codes as\nthe variables of the reconstruction error. Every iteration improves this error\nalong a single coordinate while keeping all other variables constant. COMQ is\neasy to use and requires no hyper-parameter tuning. It instead involves only\ndot products and rounding operations. We update these variables in a carefully\ndesigned greedy order, significantly enhancing the accuracy. COMQ achieves\nremarkable results in quantizing 4-bit Vision Transformers, with a negligible\nloss of less than 1% in Top-1 accuracy. In 4-bit INT quantization of\nconvolutional neural networks, COMQ maintains near-lossless accuracy with a\nminimal drop of merely 0.3% in Top-1 accuracy.\n","authors":["Aozhong Zhang","Zi Yang","Naigang Wang","Yingyong Qin","Jack Xin","Xin Li","Penghang Yin"],"pdf_url":"https://arxiv.org/pdf/2403.07134v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.02485v1","updated":"2024-06-04T16:54:28Z","published":"2024-06-04T16:54:28Z","title":"Stable-Pose: Leveraging Transformers for Pose-Guided Text-to-Image\n  Generation","summary":"  Controllable text-to-image (T2I) diffusion models have shown impressive\nperformance in generating high-quality visual content through the incorporation\nof various conditions. Current methods, however, exhibit limited performance\nwhen guided by skeleton human poses, especially in complex pose conditions such\nas side or rear perspectives of human figures. To address this issue, we\npresent Stable-Pose, a novel adapter model that introduces a coarse-to-fine\nattention masking strategy into a vision Transformer (ViT) to gain accurate\npose guidance for T2I models. Stable-Pose is designed to adeptly handle pose\nconditions within pre-trained Stable Diffusion, providing a refined and\nefficient way of aligning pose representation during image synthesis. We\nleverage the query-key self-attention mechanism of ViTs to explore the\ninterconnections among different anatomical parts in human pose skeletons.\nMasked pose images are used to smoothly refine the attention maps based on\ntarget pose-related features in a hierarchical manner, transitioning from\ncoarse to fine levels. Additionally, our loss function is formulated to\nallocate increased emphasis to the pose region, thereby augmenting the model's\nprecision in capturing intricate pose details. We assessed the performance of\nStable-Pose across five public datasets under a wide range of indoor and\noutdoor human pose scenarios. Stable-Pose achieved an AP score of 57.1 in the\nLAION-Human dataset, marking around 13% improvement over the established\ntechnique ControlNet. The project link and code is available at\nhttps://github.com/ai-med/StablePose.\n","authors":["Jiajun Wang","Morteza Ghahremani","Yitong Li","Björn Ommer","Christian Wachinger"],"pdf_url":"https://arxiv.org/pdf/2406.02485v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.02477v1","updated":"2024-06-04T16:47:47Z","published":"2024-06-04T16:47:47Z","title":"Inpainting Pathology in Lumbar Spine MRI with Latent Diffusion","summary":"  Data driven models for automated diagnosis in radiology suffer from\ninsufficient and imbalanced datasets due to low representation of pathology in\na population and the cost of expert annotations. Datasets can be bolstered\nthrough data augmentation. However, even when utilizing a full suite of\ntransformations during model training, typical data augmentations do not\naddress variations in human anatomy. An alternative direction is to synthesize\ndata using generative models, which can potentially craft datasets with\nspecific attributes. While this holds promise, commonly used generative models\nsuch as Generative Adversarial Networks may inadvertently produce anatomically\ninaccurate features. On the other hand, diffusion models, which offer greater\nstability, tend to memorize training data, raising concerns about privacy and\ngenerative diversity. Alternatively, inpainting has the potential to augment\ndata through directly inserting pathology in medical images. However, this\napproach introduces a new challenge: accurately merging the generated\npathological features with the surrounding anatomical context. While inpainting\nis a well established method for addressing simple lesions, its application to\npathologies that involve complex structural changes remains relatively\nunexplored. We propose an efficient method for inpainting pathological features\nonto healthy anatomy in MRI through voxelwise noise scheduling in a latent\ndiffusion model. We evaluate the method's ability to insert disc herniation and\ncentral canal stenosis in lumbar spine sagittal T2 MRI, and it achieves\nsuperior Frechet Inception Distance compared to state-of-the-art methods.\n","authors":["Colin Hansen","Simas Glinskis","Ashwin Raju","Micha Kornreich","JinHyeong Park","Jayashri Pawar","Richard Herzog","Li Zhang","Benjamin Odry"],"pdf_url":"https://arxiv.org/pdf/2406.02477v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.02468v1","updated":"2024-06-04T16:38:06Z","published":"2024-06-04T16:38:06Z","title":"DL-KDD: Dual-Light Knowledge Distillation for Action Recognition in the\n  Dark","summary":"  Human action recognition in dark videos is a challenging task for computer\nvision. Recent research focuses on applying dark enhancement methods to improve\nthe visibility of the video. However, such video processing results in the loss\nof critical information in the original (un-enhanced) video. Conversely,\ntraditional two-stream methods are capable of learning information from both\noriginal and processed videos, but it can lead to a significant increase in the\ncomputational cost during the inference phase in the task of video\nclassification. To address these challenges, we propose a novel teacher-student\nvideo classification framework, named Dual-Light KnowleDge Distillation for\nAction Recognition in the Dark (DL-KDD). This framework enables the model to\nlearn from both original and enhanced video without introducing additional\ncomputational cost during inference. Specifically, DL-KDD utilizes the strategy\nof knowledge distillation during training. The teacher model is trained with\nenhanced video, and the student model is trained with both the original video\nand the soft target generated by the teacher model. This teacher-student\nframework allows the student model to predict action using only the original\ninput video during inference. In our experiments, the proposed DL-KDD framework\noutperforms state-of-the-art methods on the ARID, ARID V1.5, and Dark-48\ndatasets. We achieve the best performance on each dataset and up to a 4.18%\nimprovement on Dark-48, using only original video inputs, thus avoiding the use\nof two-stream framework or enhancement modules for inference. We further\nvalidate the effectiveness of the distillation strategy in ablative\nexperiments. The results highlight the advantages of our knowledge distillation\nframework in dark human action recognition.\n","authors":["Chi-Jui Chang","Oscar Tai-Yuan Chen","Vincent S. Tseng"],"pdf_url":"https://arxiv.org/pdf/2406.02468v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.02465v1","updated":"2024-06-04T16:34:17Z","published":"2024-06-04T16:34:17Z","title":"An Empirical Study into Clustering of Unseen Datasets with\n  Self-Supervised Encoders","summary":"  Can pretrained models generalize to new datasets without any retraining? We\ndeploy pretrained image models on datasets they were not trained for, and\ninvestigate whether their embeddings form meaningful clusters. Our suite of\nbenchmarking experiments use encoders pretrained solely on ImageNet-1k with\neither supervised or self-supervised training techniques, deployed on image\ndatasets that were not seen during training, and clustered with conventional\nclustering algorithms. This evaluation provides new insights into the\nembeddings of self-supervised models, which prioritize different features to\nsupervised models. Supervised encoders typically offer more utility than SSL\nencoders within the training domain, and vice-versa far outside of it, however,\nfine-tuned encoders demonstrate the opposite trend. Clustering provides a way\nto evaluate the utility of self-supervised learned representations orthogonal\nto existing methods such as kNN. Additionally, we find the silhouette score\nwhen measured in a UMAP-reduced space is highly correlated with clustering\nperformance, and can therefore be used as a proxy for clustering performance on\ndata with no ground truth labels. Our code implementation is available at\n\\url{https://github.com/scottclowe/zs-ssl-clustering/}.\n","authors":["Scott C. Lowe","Joakim Bruslund Haurum","Sageev Oore","Thomas B. Moeslund","Graham W. Taylor"],"pdf_url":"https://arxiv.org/pdf/2406.02465v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.02462v1","updated":"2024-06-04T16:30:37Z","published":"2024-06-04T16:30:37Z","title":"Learning Image Priors through Patch-based Diffusion Models for Solving\n  Inverse Problems","summary":"  Diffusion models can learn strong image priors from underlying data\ndistribution and use them to solve inverse problems, but the training process\nis computationally expensive and requires lots of data. Such bottlenecks\nprevent most existing works from being feasible for high-dimensional and\nhigh-resolution data such as 3D images. This paper proposes a method to learn\nan efficient data prior for the entire image by training diffusion models only\non patches of images. Specifically, we propose a patch-based position-aware\ndiffusion inverse solver, called PaDIS, where we obtain the score function of\nthe whole image through scores of patches and their positional encoding and\nutilize this as the prior for solving inverse problems. First of all, we show\nthat this diffusion model achieves an improved memory efficiency and data\nefficiency while still maintaining the capability to generate entire images via\npositional encoding. Additionally, the proposed PaDIS model is highly flexible\nand can be plugged in with different diffusion inverse solvers (DIS). We\ndemonstrate that the proposed PaDIS approach enables solving various inverse\nproblems in both natural and medical image domains, including CT\nreconstruction, deblurring, and superresolution, given only patch-based priors.\nNotably, PaDIS outperforms previous DIS methods trained on entire image priors\nin the case of limited training data, demonstrating the data efficiency of our\nproposed approach by learning patch-based prior.\n","authors":["Jason Hu","Bowen Song","Xiaojian Xu","Liyue Shen","Jeffrey A. Fessler"],"pdf_url":"https://arxiv.org/pdf/2406.02462v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.02461v1","updated":"2024-06-04T16:27:09Z","published":"2024-06-04T16:27:09Z","title":"RoomTex: Texturing Compositional Indoor Scenes via Iterative Inpainting","summary":"  The advancement of diffusion models has pushed the boundary of text-to-3D\nobject generation. While it is straightforward to composite objects into a\nscene with reasonable geometry, it is nontrivial to texture such a scene\nperfectly due to style inconsistency and occlusions between objects. To tackle\nthese problems, we propose a coarse-to-fine 3D scene texturing framework,\nreferred to as RoomTex, to generate high-fidelity and style-consistent textures\nfor untextured compositional scene meshes. In the coarse stage, RoomTex first\nunwraps the scene mesh to a panoramic depth map and leverages ControlNet to\ngenerate a room panorama, which is regarded as the coarse reference to ensure\nthe global texture consistency. In the fine stage, based on the panoramic image\nand perspective depth maps, RoomTex will refine and texture every single object\nin the room iteratively along a series of selected camera views, until this\nobject is completely painted. Moreover, we propose to maintain superior\nalignment between RGB and depth spaces via subtle edge detection methods.\nExtensive experiments show our method is capable of generating high-quality and\ndiverse room textures, and more importantly, supporting interactive\nfine-grained texture control and flexible scene editing thanks to our\ninpainting-based framework and compositional mesh input. Our project page is\navailable at https://qwang666.github.io/RoomTex/.\n","authors":["Qi Wang","Ruijie Lu","Xudong Xu","Jingbo Wang","Michael Yu Wang","Bo Dai","Gang Zeng","Dan Xu"],"pdf_url":"https://arxiv.org/pdf/2406.02461v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.00687v2","updated":"2024-06-04T16:19:47Z","published":"2024-06-02T09:48:19Z","title":"Lay-A-Scene: Personalized 3D Object Arrangement Using Text-to-Image\n  Priors","summary":"  Generating 3D visual scenes is at the forefront of visual generative AI, but\ncurrent 3D generation techniques struggle with generating scenes with multiple\nhigh-resolution objects. Here we introduce Lay-A-Scene, which solves the task\nof Open-set 3D Object Arrangement, effectively arranging unseen objects. Given\na set of 3D objects, the task is to find a plausible arrangement of these\nobjects in a scene. We address this task by leveraging pre-trained\ntext-to-image models. We personalize the model and explain how to generate\nimages of a scene that contains multiple predefined objects without neglecting\nany of them. Then, we describe how to infer the 3D poses and arrangement of\nobjects from a 2D generated image by finding a consistent projection of objects\nonto the 2D scene. We evaluate the quality of Lay-A-Scene using 3D objects from\nObjaverse and human raters and find that it often generates coherent and\nfeasible 3D object arrangements.\n","authors":["Ohad Rahamim","Hilit Segev","Idan Achituve","Yuval Atzmon","Yoni Kasten","Gal Chechik"],"pdf_url":"https://arxiv.org/pdf/2406.00687v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.00783v2","updated":"2024-06-04T16:08:07Z","published":"2024-06-02T15:51:33Z","title":"AI-Face: A Million-Scale Demographically Annotated AI-Generated Face\n  Dataset and Fairness Benchmark","summary":"  AI-generated faces have enriched human life, such as entertainment,\neducation, and art. However, they also pose misuse risks. Therefore, detecting\nAI-generated faces becomes crucial, yet current detectors show biased\nperformance across different demographic groups. Mitigating biases can be done\nby designing algorithmic fairness methods, which usually require\ndemographically annotated face datasets for model training. However, no\nexisting dataset comprehensively encompasses both demographic attributes and\ndiverse generative methods, which hinders the development of fair detectors for\nAI-generated faces. In this work, we introduce the AI-Face dataset, the first\nmillion-scale demographically annotated AI-generated face image dataset,\nincluding real faces, faces from deepfake videos, and faces generated by\nGenerative Adversarial Networks and Diffusion Models. Based on this dataset, we\nconduct the first comprehensive fairness benchmark to assess various AI face\ndetectors and provide valuable insights and findings to promote the future fair\ndesign of AI face detectors. Our AI-Face dataset and benchmark code are\npublicly available at https://github.com/Purdue-M2/AI-Face-FairnessBench.\n","authors":["Li Lin"," Santosh","Xin Wang","Shu Hu"],"pdf_url":"https://arxiv.org/pdf/2406.00783v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.02435v1","updated":"2024-06-04T15:57:43Z","published":"2024-06-04T15:57:43Z","title":"Generative Active Learning for Long-tailed Instance Segmentation","summary":"  Recently, large-scale language-image generative models have gained widespread\nattention and many works have utilized generated data from these models to\nfurther enhance the performance of perception tasks. However, not all generated\ndata can positively impact downstream models, and these methods do not\nthoroughly explore how to better select and utilize generated data. On the\nother hand, there is still a lack of research oriented towards active learning\non generated data. In this paper, we explore how to perform active learning\nspecifically for generated data in the long-tailed instance segmentation task.\nSubsequently, we propose BSGAL, a new algorithm that online estimates the\ncontribution of the generated data based on gradient cache. BSGAL can handle\nunlimited generated data and complex downstream segmentation tasks effectively.\nExperiments show that BSGAL outperforms the baseline approach and effectually\nimproves the performance of long-tailed segmentation. Our code can be found at\nhttps://github.com/aim-uofa/DiverGen.\n","authors":["Muzhi Zhu","Chengxiang Fan","Hao Chen","Yang Liu","Weian Mao","Xiaogang Xu","Chunhua Shen"],"pdf_url":"https://arxiv.org/pdf/2406.02435v1.pdf","comment":"Accepted by ICML 2024"},{"id":"http://arxiv.org/abs/2406.02425v1","updated":"2024-06-04T15:44:25Z","published":"2024-06-04T15:44:25Z","title":"CoNav: A Benchmark for Human-Centered Collaborative Navigation","summary":"  Human-robot collaboration, in which the robot intelligently assists the human\nwith the upcoming task, is an appealing objective. To achieve this goal, the\nagent needs to be equipped with a fundamental collaborative navigation ability,\nwhere the agent should reason human intention by observing human activities and\nthen navigate to the human's intended destination in advance of the human.\nHowever, this vital ability has not been well studied in previous literature.\nTo fill this gap, we propose a collaborative navigation (CoNav) benchmark. Our\nCoNav tackles the critical challenge of constructing a 3D navigation\nenvironment with realistic and diverse human activities. To achieve this, we\ndesign a novel LLM-based humanoid animation generation framework, which is\nconditioned on both text descriptions and environmental context. The generated\nhumanoid trajectory obeys the environmental context and can be easily\nintegrated into popular simulators. We empirically find that the existing\nnavigation methods struggle in CoNav task since they neglect the perception of\nhuman intention. To solve this problem, we propose an intention-aware agent for\nreasoning both long-term and short-term human intention. The agent predicts\nnavigation action based on the predicted intention and panoramic observation.\nThe emergent agent behavior including observing humans, avoiding human\ncollision, and navigation reveals the efficiency of the proposed datasets and\nagents.\n","authors":["Changhao Li","Xinyu Sun","Peihao Chen","Jugang Fan","Zixu Wang","Yanxia Liu","Jinhui Zhu","Chuang Gan","Mingkui Tan"],"pdf_url":"https://arxiv.org/pdf/2406.02425v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.02422v1","updated":"2024-06-04T15:39:49Z","published":"2024-06-04T15:39:49Z","title":"IterMask2: Iterative Unsupervised Anomaly Segmentation via Spatial and\n  Frequency Masking for Brain Lesions in MRI","summary":"  Unsupervised anomaly segmentation approaches to pathology segmentation train\na model on images of healthy subjects, that they define as the 'normal' data\ndistribution. At inference, they aim to segment any pathologies in new images\nas 'anomalies', as they exhibit patterns that deviate from those in 'normal'\ntraining data. Prevailing methods follow the 'corrupt-and-reconstruct'\nparadigm. They intentionally corrupt an input image, reconstruct it to follow\nthe learned 'normal' distribution, and subsequently segment anomalies based on\nreconstruction error. Corrupting an input image, however, inevitably leads to\nsuboptimal reconstruction even of normal regions, causing false positives. To\nalleviate this, we propose a novel iterative spatial mask-refining strategy\nIterMask2. We iteratively mask areas of the image, reconstruct them, and update\nthe mask based on reconstruction error. This iterative process progressively\nadds information about areas that are confidently normal as per the model. The\nincreasing content guides reconstruction of nearby masked areas, improving\nreconstruction of normal tissue under these areas, reducing false positives. We\nalso use high-frequency image content as an auxiliary input to provide\nadditional structural information for masked areas. This further improves\nreconstruction error of normal in comparison to anomalous areas, facilitating\nsegmentation of the latter. We conduct experiments on several brain lesion\ndatasets and demonstrate effectiveness of our method. Code is available at:\nhttps://github.com/ZiyunLiang/IterMasks2\n","authors":["Ziyun Liang","Xiaoqing Guo","J. Alison Noble","Konstantinos Kamnitsas"],"pdf_url":"https://arxiv.org/pdf/2406.02422v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.13534v2","updated":"2024-06-04T15:23:51Z","published":"2024-04-21T05:09:56Z","title":"Motion-aware Latent Diffusion Models for Video Frame Interpolation","summary":"  With the advancement of AIGC, video frame interpolation (VFI) has become a\ncrucial component in existing video generation frameworks, attracting\nwidespread research interest. For the VFI task, the motion estimation between\nneighboring frames plays a crucial role in avoiding motion ambiguity. However,\nexisting VFI methods always struggle to accurately predict the motion\ninformation between consecutive frames, and this imprecise estimation leads to\nblurred and visually incoherent interpolated frames. In this paper, we propose\na novel diffusion framework, motion-aware latent diffusion models (MADiff),\nwhich is specifically designed for the VFI task. By incorporating motion priors\nbetween the conditional neighboring frames with the target interpolated frame\npredicted throughout the diffusion sampling procedure, MADiff progressively\nrefines the intermediate outcomes, culminating in generating both visually\nsmooth and realistic results. Extensive experiments conducted on benchmark\ndatasets demonstrate that our method achieves state-of-the-art performance\nsignificantly outperforming existing approaches, especially under challenging\nscenarios involving dynamic textures with complex motion.\n","authors":["Zhilin Huang","Yijie Yu","Ling Yang","Chujun Qin","Bing Zheng","Xiawu Zheng","Zikun Zhou","Yaowei Wang","Wenming Yang"],"pdf_url":"https://arxiv.org/pdf/2404.13534v2.pdf","comment":"17 pages, 4 figures. arXiv admin note: substantial text overlap with\n  arXiv:2303.09508 by other authors"},{"id":"http://arxiv.org/abs/2406.02411v1","updated":"2024-06-04T15:21:37Z","published":"2024-06-04T15:21:37Z","title":"Decoupling of neural network calibration measures","summary":"  A lot of effort is currently invested in safeguarding autonomous driving\nsystems, which heavily rely on deep neural networks for computer vision. We\ninvestigate the coupling of different neural network calibration measures with\na special focus on the Area Under the Sparsification Error curve (AUSE) metric.\nWe elaborate on the well-known inconsistency in determining optimal calibration\nusing the Expected Calibration Error (ECE) and we demonstrate similar issues\nfor the AUSE, the Uncertainty Calibration Score (UCS), as well as the\nUncertainty Calibration Error (UCE). We conclude that the current methodologies\nleave a degree of freedom, which prevents a unique model calibration for the\nhomologation of safety-critical functionalities. Furthermore, we propose the\nAUSE as an indirect measure for the residual uncertainty, which is irreducible\nfor a fixed network architecture and is driven by the stochasticity in the\nunderlying data generation process (aleatoric contribution) as well as the\nlimitation in the hypothesis space (epistemic contribution).\n","authors":["Dominik Werner Wolf","Prasannavenkatesh Balaji","Alexander Braun","Markus Ulrich"],"pdf_url":"https://arxiv.org/pdf/2406.02411v1.pdf","comment":"Submitted to the German Conference on Pattern Recognition (GCPR) 2024"},{"id":"http://arxiv.org/abs/2406.02407v1","updated":"2024-06-04T15:17:37Z","published":"2024-06-04T15:17:37Z","title":"WE-GS: An In-the-wild Efficient 3D Gaussian Representation for\n  Unconstrained Photo Collections","summary":"  Novel View Synthesis (NVS) from unconstrained photo collections is\nchallenging in computer graphics. Recently, 3D Gaussian Splatting (3DGS) has\nshown promise for photorealistic and real-time NVS of static scenes. Building\non 3DGS, we propose an efficient point-based differentiable rendering framework\nfor scene reconstruction from photo collections. Our key innovation is a\nresidual-based spherical harmonic coefficients transfer module that adapts 3DGS\nto varying lighting conditions and photometric post-processing. This\nlightweight module can be pre-computed and ensures efficient gradient\npropagation from rendered images to 3D Gaussian attributes. Additionally, we\nobserve that the appearance encoder and the transient mask predictor, the two\nmost critical parts of NVS from unconstrained photo collections, can be\nmutually beneficial. We introduce a plug-and-play lightweight spatial attention\nmodule to simultaneously predict transient occluders and latent appearance\nrepresentation for each image. After training and preprocessing, our method\naligns with the standard 3DGS format and rendering pipeline, facilitating\nseamlessly integration into various 3DGS applications. Extensive experiments on\ndiverse datasets show our approach outperforms existing approaches on the\nrendering quality of novel view and appearance synthesis with high converge and\nrendering speed.\n","authors":["Yuze Wang","Junyi Wang","Yue Qi"],"pdf_url":"https://arxiv.org/pdf/2406.02407v1.pdf","comment":"Our project page is available at\n  https://yuzewang1998.github.io/we-gs.github.io/"},{"id":"http://arxiv.org/abs/2406.02395v1","updated":"2024-06-04T15:09:29Z","published":"2024-06-04T15:09:29Z","title":"GrootVL: Tree Topology is All You Need in State Space Model","summary":"  The state space models, employing recursively propagated features,\ndemonstrate strong representation capabilities comparable to Transformer models\nand superior efficiency. However, constrained by the inherent geometric\nconstraints of sequences, it still falls short in modeling long-range\ndependencies. To address this issue, we propose the GrootVL network, which\nfirst dynamically generates a tree topology based on spatial relationships and\ninput features. Then, feature propagation is performed based on this graph,\nthereby breaking the original sequence constraints to achieve stronger\nrepresentation capabilities. Additionally, we introduce a linear complexity\ndynamic programming algorithm to enhance long-range interactions without\nincreasing computational cost. GrootVL is a versatile multimodal framework that\ncan be applied to both visual and textual tasks. Extensive experiments\ndemonstrate that our method significantly outperforms existing structured state\nspace models on image classification, object detection and segmentation.\nBesides, by fine-tuning large language models, our approach achieves consistent\nimprovements in multiple textual tasks at minor training cost.\n","authors":["Yicheng Xiao","Lin Song","Shaoli Huang","Jiangshan Wang","Siyu Song","Yixiao Ge","Xiu Li","Ying Shan"],"pdf_url":"https://arxiv.org/pdf/2406.02395v1.pdf","comment":"The code is available at https://github.com/EasonXiao-888/GrootVL"},{"id":"http://arxiv.org/abs/2406.02385v1","updated":"2024-06-04T15:00:49Z","published":"2024-06-04T15:00:49Z","title":"Low-Rank Adaption on Transformer-based Oriented Object Detector for\n  Satellite Onboard Processing of Remote Sensing Images","summary":"  Deep learning models in satellite onboard enable real-time interpretation of\nremote sensing images, reducing the need for data transmission to the ground\nand conserving communication resources. As satellite numbers and observation\nfrequencies increase, the demand for satellite onboard real-time image\ninterpretation grows, highlighting the expanding importance and development of\nthis technology. However, updating the extensive parameters of models deployed\non the satellites for spaceborne object detection model is challenging due to\nthe limitations of uplink bandwidth in wireless satellite communications. To\naddress this issue, this paper proposes a method based on parameter-efficient\nfine-tuning technology with low-rank adaptation (LoRA) module. It involves\ntraining low-rank matrix parameters and integrating them with the original\nmodel's weight matrix through multiplication and summation, thereby fine-tuning\nthe model parameters to adapt to new data distributions with minimal weight\nupdates. The proposed method combines parameter-efficient fine-tuning with full\nfine-tuning in the parameter update strategy of the oriented object detection\nalgorithm architecture. This strategy enables model performance improvements\nclose to full fine-tuning effects with minimal parameter updates. In addition,\nlow rank approximation is conducted to pick an optimal rank value for LoRA\nmatrices. Extensive experiments verify the effectiveness of the proposed\nmethod. By fine-tuning and updating only 12.4$\\%$ of the model's total\nparameters, it is able to achieve 97$\\%$ to 100$\\%$ of the performance of full\nfine-tuning models. Additionally, the reduced number of trainable parameters\naccelerates model training iterations and enhances the generalization and\nrobustness of the oriented object detection model. The source code is available\nat: \\url{https://github.com/fudanxu/LoRA-Det}.\n","authors":["Xinyang Pu","Feng Xu"],"pdf_url":"https://arxiv.org/pdf/2406.02385v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.02383v1","updated":"2024-06-04T14:59:38Z","published":"2024-06-04T14:59:38Z","title":"Learning to Edit Visual Programs with Self-Supervision","summary":"  We design a system that learns how to edit visual programs. Our edit network\nconsumes a complete input program and a visual target. From this input, we task\nour network with predicting a local edit operation that could be applied to the\ninput program to improve its similarity to the target. In order to apply this\nscheme for domains that lack program annotations, we develop a self-supervised\nlearning approach that integrates this edit network into a bootstrapped\nfinetuning loop along with a network that predicts entire programs in one-shot.\nOur joint finetuning scheme, when coupled with an inference procedure that\ninitializes a population from the one-shot model and evolves members of this\npopulation with the edit network, helps to infer more accurate visual programs.\nOver multiple domains, we experimentally compare our method against the\nalternative of using only the one-shot model, and find that even under equal\nsearch-time budgets, our editing-based paradigm provides significant\nadvantages.\n","authors":["R. Kenny Jones","Renhao Zhang","Aditya Ganeshan","Daniel Ritchie"],"pdf_url":"https://arxiv.org/pdf/2406.02383v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.02380v1","updated":"2024-06-04T14:57:56Z","published":"2024-06-04T14:57:56Z","title":"EUFCC-340K: A Faceted Hierarchical Dataset for Metadata Annotation in\n  GLAM Collections","summary":"  In this paper, we address the challenges of automatic metadata annotation in\nthe domain of Galleries, Libraries, Archives, and Museums (GLAMs) by\nintroducing a novel dataset, EUFCC340K, collected from the Europeana portal.\nComprising over 340,000 images, the EUFCC340K dataset is organized across\nmultiple facets: Materials, Object Types, Disciplines, and Subjects, following\na hierarchical structure based on the Art & Architecture Thesaurus (AAT). We\ndeveloped several baseline models, incorporating multiple heads on a ConvNeXT\nbackbone for multi-label image tagging on these facets, and fine-tuning a CLIP\nmodel with our image text pairs. Our experiments to evaluate model robustness\nand generalization capabilities in two different test scenarios demonstrate the\nutility of the dataset in improving multi-label classification tools that have\nthe potential to alleviate cataloging tasks in the cultural heritage sector.\n","authors":["Francesc Net","Marc Folia","Pep Casals","Andrew D. Bagdanov","Lluis Gomez"],"pdf_url":"https://arxiv.org/pdf/2406.02380v1.pdf","comment":"23 pages, 13 figures"},{"id":"http://arxiv.org/abs/2406.00609v2","updated":"2024-06-04T14:47:45Z","published":"2024-06-02T03:44:50Z","title":"SuperGaussian: Repurposing Video Models for 3D Super Resolution","summary":"  We present a simple, modular, and generic method that upsamples coarse 3D\nmodels by adding geometric and appearance details. While generative 3D models\nnow exist, they do not yet match the quality of their counterparts in image and\nvideo domains. We demonstrate that it is possible to directly repurpose\nexisting (pretrained) video models for 3D super-resolution and thus sidestep\nthe problem of the shortage of large repositories of high-quality 3D training\nmodels. We describe how to repurpose video upsampling models, which are not 3D\nconsistent, and combine them with 3D consolidation to produce 3D-consistent\nresults. As output, we produce high quality Gaussian Splat models, which are\nobject centric and effective. Our method is category agnostic and can be easily\nincorporated into existing 3D workflows. We evaluate our proposed SuperGaussian\non a variety of 3D inputs, which are diverse both in terms of complexity and\nrepresentation (e.g., Gaussian Splats or NeRFs), and demonstrate that our\nsimple method significantly improves the fidelity of the final 3D models. Check\nour project website for details: supergaussian.github.io\n","authors":["Yuan Shen","Duygu Ceylan","Paul Guerrero","Zexiang Xu","Niloy J. Mitra","Shenlong Wang","Anna Frühstück"],"pdf_url":"https://arxiv.org/pdf/2406.00609v2.pdf","comment":"Check our project website for details:\n  https://supergaussian.github.io"},{"id":"http://arxiv.org/abs/2406.02355v1","updated":"2024-06-04T14:34:13Z","published":"2024-06-04T14:34:13Z","title":"FedDr+: Stabilizing Dot-regression with Global Feature Distillation for\n  Federated Learning","summary":"  Federated Learning (FL) has emerged as a pivotal framework for the\ndevelopment of effective global models (global FL) or personalized models\n(personalized FL) across clients with heterogeneous, non-iid data distribution.\nA key challenge in FL is client drift, where data heterogeneity impedes the\naggregation of scattered knowledge. Recent studies have tackled the client\ndrift issue by identifying significant divergence in the last classifier layer.\nTo mitigate this divergence, strategies such as freezing the classifier weights\nand aligning the feature extractor accordingly have proven effective. Although\nthe local alignment between classifier and feature extractor has been studied\nas a crucial factor in FL, we observe that it may lead the model to\noveremphasize the observed classes within each client. Thus, our objectives are\ntwofold: (1) enhancing local alignment while (2) preserving the representation\nof unseen class samples. This approach aims to effectively integrate knowledge\nfrom individual clients, thereby improving performance for both global and\npersonalized FL. To achieve this, we introduce a novel algorithm named FedDr+,\nwhich empowers local model alignment using dot-regression loss. FedDr+ freezes\nthe classifier as a simplex ETF to align the features and improves aggregated\nglobal models by employing a feature distillation mechanism to retain\ninformation about unseen/missing classes. Consequently, we provide empirical\nevidence demonstrating that our algorithm surpasses existing methods that use a\nfrozen classifier to boost alignment across the diverse distribution.\n","authors":["Seongyoon Kim","Minchan Jeong","Sungnyun Kim","Sungwoo Cho","Sumyeong Ahn","Se-Young Yun"],"pdf_url":"https://arxiv.org/pdf/2406.02355v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.00210v2","updated":"2024-06-04T14:26:06Z","published":"2024-05-31T21:47:05Z","title":"A-SDM: Accelerating Stable Diffusion through Model Assembly and Feature\n  Inheritance Strategies","summary":"  The Stable Diffusion Model (SDM) is a prevalent and effective model for\ntext-to-image (T2I) and image-to-image (I2I) generation. Despite various\nattempts at sampler optimization, model distillation, and network\nquantification, these approaches typically maintain the original network\narchitecture. The extensive parameter scale and substantial computational\ndemands have limited research into adjusting the model architecture. This study\nfocuses on reducing redundant computation in SDM and optimizes the model\nthrough both tuning and tuning-free methods. 1) For the tuning method, we\ndesign a model assembly strategy to reconstruct a lightweight model while\npreserving performance through distillation. Second, to mitigate performance\nloss due to pruning, we incorporate multi-expert conditional convolution\n(ME-CondConv) into compressed UNets to enhance network performance by\nincreasing capacity without sacrificing speed. Third, we validate the\neffectiveness of the multi-UNet switching method for improving network speed.\n2) For the tuning-free method, we propose a feature inheritance strategy to\naccelerate inference by skipping local computations at the block, layer, or\nunit level within the network structure. We also examine multiple sampling\nmodes for feature inheritance at the time-step level. Experiments demonstrate\nthat both the proposed tuning and the tuning-free methods can improve the speed\nand performance of the SDM. The lightweight model reconstructed by the model\nassembly strategy increases generation speed by $22.4%$, while the feature\ninheritance strategy enhances the SDM generation speed by $40.0%$.\n","authors":["Jinchao Zhu","Yuxuan Wang","Siyuan Pan","Pengfei Wan","Di Zhang","Gao Huang"],"pdf_url":"https://arxiv.org/pdf/2406.00210v2.pdf","comment":"19 pages, 16 figures, submitted to IEEE Transactions on Neural\n  Networks and Learning Systems"},{"id":"http://arxiv.org/abs/2406.02349v1","updated":"2024-06-04T14:24:35Z","published":"2024-06-04T14:24:35Z","title":"CADE: Cosine Annealing Differential Evolution for Spiking Neural Network","summary":"  Spiking neural networks (SNNs) have gained prominence for their potential in\nneuromorphic computing and energy-efficient artificial intelligence, yet\noptimizing them remains a formidable challenge for gradient-based methods due\nto their discrete, spike-based computation. This paper attempts to tackle the\nchallenges by introducing Cosine Annealing Differential Evolution (CADE),\ndesigned to modulate the mutation factor (F) and crossover rate (CR) of\ndifferential evolution (DE) for the SNN model, i.e., Spiking Element Wise (SEW)\nResNet. Extensive empirical evaluations were conducted to analyze CADE. CADE\nshowed a balance in exploring and exploiting the search space, resulting in\naccelerated convergence and improved accuracy compared to existing\ngradient-based and DE-based methods. Moreover, an initialization method based\non a transfer learning setting was developed, pretraining on a source dataset\n(i.e., CIFAR-10) and fine-tuning the target dataset (i.e., CIFAR-100), to\nimprove population diversity. It was found to further enhance CADE for SNN.\nRemarkably, CADE elevates the performance of the highest accuracy SEW model by\nan additional 0.52 percentage points, underscoring its effectiveness in\nfine-tuning and enhancing SNNs. These findings emphasize the pivotal role of a\nscheduler for F and CR adjustment, especially for DE-based SNN. Source Code on\nGithub: https://github.com/Tank-Jiang/CADE4SNN.\n","authors":["Runhua Jiang","Guodong Du","Shuyang Yu","Yifei Guo","Sim Kuan Goh","Ho-Kin Tang"],"pdf_url":"https://arxiv.org/pdf/2406.02349v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.02347v1","updated":"2024-06-04T14:23:27Z","published":"2024-06-04T14:23:27Z","title":"Flash Diffusion: Accelerating Any Conditional Diffusion Model for Few\n  Steps Image Generation","summary":"  In this paper, we propose an efficient, fast, and versatile distillation\nmethod to accelerate the generation of pre-trained diffusion models: Flash\nDiffusion. The method reaches state-of-the-art performances in terms of FID and\nCLIP-Score for few steps image generation on the COCO2014 and COCO2017\ndatasets, while requiring only several GPU hours of training and fewer\ntrainable parameters than existing methods. In addition to its efficiency, the\nversatility of the method is also exposed across several tasks such as\ntext-to-image, inpainting, face-swapping, super-resolution and using different\nbackbones such as UNet-based denoisers (SD1.5, SDXL) or DiT (Pixart-$\\alpha$),\nas well as adapters. In all cases, the method allowed to reduce drastically the\nnumber of sampling steps while maintaining very high-quality image generation.\nThe official implementation is available at\nhttps://github.com/gojasper/flash-diffusion.\n","authors":["Clement Chadebec","Onur Tasar","Eyal Benaroche","Benjamin Aubin"],"pdf_url":"https://arxiv.org/pdf/2406.02347v1.pdf","comment":"16 pages + 16 pages appendices"},{"id":"http://arxiv.org/abs/2406.02345v1","updated":"2024-06-04T14:21:41Z","published":"2024-06-04T14:21:41Z","title":"Progressive Confident Masking Attention Network for Audio-Visual\n  Segmentation","summary":"  Audio and visual signals typically occur simultaneously, and humans possess\nan innate ability to correlate and synchronize information from these two\nmodalities. Recently, a challenging problem known as Audio-Visual Segmentation\n(AVS) has emerged, intending to produce segmentation maps for sounding objects\nwithin a scene. However, the methods proposed so far have not sufficiently\nintegrated audio and visual information, and the computational costs have been\nextremely high. Additionally, the outputs of different stages have not been\nfully utilized. To facilitate this research, we introduce a novel Progressive\nConfident Masking Attention Network (PMCANet). It leverages attention\nmechanisms to uncover the intrinsic correlations between audio signals and\nvisual frames. Furthermore, we design an efficient and effective\ncross-attention module to enhance semantic perception by selecting query\ntokens. This selection is determined through confidence-driven units based on\nthe network's multi-stage predictive outputs. Experiments demonstrate that our\nnetwork outperforms other AVS methods while requiring less computational\nresources.\n","authors":["Yuxuan Wang","Feng Dong","Jinchao Zhu"],"pdf_url":"https://arxiv.org/pdf/2406.02345v1.pdf","comment":"10 pages, 9 figures, submitted to IEEE TRANSACTIONS ON CIRCUITS AND\n  SYSTEMS FOR VIDEO TECHNOLOGY"},{"id":"http://arxiv.org/abs/2406.02343v1","updated":"2024-06-04T14:19:50Z","published":"2024-06-04T14:19:50Z","title":"Cluster-Aware Similarity Diffusion for Instance Retrieval","summary":"  Diffusion-based re-ranking is a common method used for retrieving instances\nby performing similarity propagation in a nearest neighbor graph. However,\nexisting techniques that construct the affinity graph based on pairwise\ninstances can lead to the propagation of misinformation from outliers and other\nmanifolds, resulting in inaccurate results. To overcome this issue, we propose\na novel Cluster-Aware Similarity (CAS) diffusion for instance retrieval. The\nprimary concept of CAS is to conduct similarity diffusion within local\nclusters, which can reduce the influence from other manifolds explicitly. To\nobtain a symmetrical and smooth similarity matrix, our Bidirectional Similarity\nDiffusion strategy introduces an inverse constraint term to the optimization\nobjective of local cluster diffusion. Additionally, we have optimized a\nNeighbor-guided Similarity Smoothing approach to ensure similarity consistency\namong the local neighbors of each instance. Evaluations in instance retrieval\nand object re-identification validate the effectiveness of the proposed CAS,\nour code is publicly available.\n","authors":["Jifei Luo","Hantao Yao","Changsheng Xu"],"pdf_url":"https://arxiv.org/pdf/2406.02343v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.02327v1","updated":"2024-06-04T13:57:34Z","published":"2024-06-04T13:57:34Z","title":"Continual Unsupervised Out-of-Distribution Detection","summary":"  Deep learning models excel when the data distribution during training aligns\nwith testing data. Yet, their performance diminishes when faced with\nout-of-distribution (OOD) samples, leading to great interest in the field of\nOOD detection. Current approaches typically assume that OOD samples originate\nfrom an unconcentrated distribution complementary to the training distribution.\nWhile this assumption is appropriate in the traditional unsupervised OOD\n(U-OOD) setting, it proves inadequate when considering the place of deployment\nof the underlying deep learning model. To better reflect this real-world\nscenario, we introduce the novel setting of continual U-OOD detection. To\ntackle this new setting, we propose a method that starts from a U-OOD detector,\nwhich is agnostic to the OOD distribution, and slowly updates during deployment\nto account for the actual OOD distribution. Our method uses a new U-OOD scoring\nfunction that combines the Mahalanobis distance with a nearest-neighbor\napproach. Furthermore, we design a confidence-scaled few-shot OOD detector that\noutperforms previous methods. We show our method greatly improves upon strong\nbaselines from related fields.\n","authors":["Lars Doorenbos","Raphael Sznitman","Pablo Márquez-Neila"],"pdf_url":"https://arxiv.org/pdf/2406.02327v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.10820v2","updated":"2024-06-04T13:15:16Z","published":"2024-03-16T06:10:22Z","title":"Active Label Correction for Semantic Segmentation with Foundation Models","summary":"  Training and validating models for semantic segmentation require datasets\nwith pixel-wise annotations, which are notoriously labor-intensive. Although\nuseful priors such as foundation models or crowdsourced datasets are available,\nthey are error-prone. We hence propose an effective framework of active label\ncorrection (ALC) based on a design of correction query to rectify pseudo labels\nof pixels, which in turn is more annotator-friendly than the standard one\ninquiring to classify a pixel directly according to our theoretical analysis\nand user study. Specifically, leveraging foundation models providing useful\nzero-shot predictions on pseudo labels and superpixels, our method comprises\ntwo key techniques: (i) an annotator-friendly design of correction query with\nthe pseudo labels, and (ii) an acquisition function looking ahead label\nexpansions based on the superpixels. Experimental results on PASCAL,\nCityscapes, and Kvasir-SEG datasets demonstrate the effectiveness of our ALC\nframework, outperforming prior methods for active semantic segmentation and\nlabel correction. Notably, utilizing our method, we obtained a revised dataset\nof PASCAL by rectifying errors in 2.6 million pixels in PASCAL dataset.\n","authors":["Hoyoung Kim","Sehyun Hwang","Suha Kwak","Jungseul Ok"],"pdf_url":"https://arxiv.org/pdf/2403.10820v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.02287v1","updated":"2024-06-04T13:00:22Z","published":"2024-06-04T13:00:22Z","title":"Optimised ProPainter for Video Diminished Reality Inpainting","summary":"  In this paper, part of the DREAMING Challenge - Diminished Reality for\nEmerging Applications in Medicine through Inpainting, we introduce a refined\nvideo inpainting technique optimised from the ProPainter method to meet the\nspecialised demands of medical imaging, specifically in the context of oral and\nmaxillofacial surgery. Our enhanced algorithm employs the zero-shot ProPainter,\nfeaturing optimized parameters and pre-processing, to adeptly manage the\ncomplex task of inpainting surgical video sequences, without requiring any\ntraining process. It aims to produce temporally coherent and detail-rich\nreconstructions of occluded regions, facilitating clearer views of operative\nfields. The efficacy of our approach is evaluated using comprehensive metrics,\npositioning it as a significant advancement in the application of diminished\nreality for medical purposes.\n","authors":["Pengze Li","Lihao Liu","Carola-Bibiane Schönlieb","Angelica I Aviles-Rivero"],"pdf_url":"https://arxiv.org/pdf/2406.02287v1.pdf","comment":"Accepted to ISBI 2024"},{"id":"http://arxiv.org/abs/2406.02265v1","updated":"2024-06-04T12:41:54Z","published":"2024-06-04T12:41:54Z","title":"Understanding Retrieval Robustness for Retrieval-Augmented Image\n  Captioning","summary":"  Recent advancements in retrieval-augmented models for image captioning\nhighlight the significance of retrieving related captions for efficient,\nlightweight models with strong domain-transfer capabilities. While these models\ndemonstrate the success of retrieval augmentation, retrieval models are still\nfar from perfect in practice. Retrieved information can sometimes mislead the\nmodel generation, negatively impacting performance. In this paper, we analyze\nthe robustness of the SmallCap retrieval-augmented captioning model. Our\nanalysis shows that SmallCap is sensitive to tokens that appear in the majority\nof the retrieved captions, and integrated gradients attribution shows that\nthose tokens are likely copied into the final caption. Given these findings, we\npropose to train the model by sampling retrieved captions from more diverse\nsets. This reduces the probability that the model learns to copy majority\ntokens and improves both in-domain and cross-domain performance effectively.\n","authors":["Wenyan Li","Jiaang Li","Rita Ramos","Raphael Tang","Desmond Elliott"],"pdf_url":"https://arxiv.org/pdf/2406.02265v1.pdf","comment":"9 pages, long paper at ACL 2024"},{"id":"http://arxiv.org/abs/2406.02264v1","updated":"2024-06-04T12:37:11Z","published":"2024-06-04T12:37:11Z","title":"Image contrast enhancement based on the Schrödinger operator spectrum","summary":"  This study proposes a novel image contrast enhancement method based on image\nprojection onto the squared eigenfunctions of the two dimensional Schr\\\"odinger\noperator. This projection depends on a design parameter\n\\texorpdfstring{\\(\\gamma\\)}{gamma} which is proposed to control the pixel\nintensity during image reconstruction. The performance of the proposed method\nis investigated through its application to color images. The selection of\n\\texorpdfstring{\\(\\gamma\\)}{gamma} values is performed using k-means, which\nhelps preserve the image spatial adjacency information. Furthermore,\nmulti-objective optimization using the Non dominated Sorting Genetic Algorithm\nII (NSAG2) algorithm is proposed to select the optimal values of\n\\texorpdfstring{\\(\\gamma\\)}{gamma} and the semi-classical parameter h from the\n2DSCSA. The results demonstrate the effectiveness of the proposed method for\nenhancing image contrast while preserving the inherent characteristics of the\noriginal image, producing the desired enhancement with almost no artifacts.\n","authors":["Juan M. Vargas","Taous-Meriem Laleg-Kirati"],"pdf_url":"https://arxiv.org/pdf/2406.02264v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.02263v1","updated":"2024-06-04T12:33:02Z","published":"2024-06-04T12:33:02Z","title":"M3DM-NR: RGB-3D Noisy-Resistant Industrial Anomaly Detection via\n  Multimodal Denoising","summary":"  Existing industrial anomaly detection methods primarily concentrate on\nunsupervised learning with pristine RGB images. Yet, both RGB and 3D data are\ncrucial for anomaly detection, and the datasets are seldom completely clean in\npractical scenarios. To address above challenges, this paper initially delves\ninto the RGB-3D multi-modal noisy anomaly detection, proposing a novel\nnoise-resistant M3DM-NR framework to leveraging strong multi-modal\ndiscriminative capabilities of CLIP. M3DM-NR consists of three stages: Stage-I\nintroduces the Suspected References Selection module to filter a few normal\nsamples from the training dataset, using the multimodal features extracted by\nthe Initial Feature Extraction, and a Suspected Anomaly Map Computation module\nto generate a suspected anomaly map to focus on abnormal regions as reference.\nStage-II uses the suspected anomaly maps of the reference samples as reference,\nand inputs image, point cloud, and text information to achieve denoising of the\ntraining samples through intra-modal comparison and multi-scale aggregation\noperations. Finally, Stage-III proposes the Point Feature Alignment,\nUnsupervised Feature Fusion, Noise Discriminative Coreset Selection, and\nDecision Layer Fusion modules to learn the pattern of the training dataset,\nenabling anomaly detection and segmentation while filtering out noise.\nExtensive experiments show that M3DM-NR outperforms state-of-the-art methods in\n3D-RGB multi-modal noisy anomaly detection.\n","authors":["Chengjie Wang","Haokun Zhu","Jinlong Peng","Yue Wang","Ran Yi","Yunsheng Wu","Lizhuang Ma","Jiangning Zhang"],"pdf_url":"https://arxiv.org/pdf/2406.02263v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.08078v5","updated":"2024-06-04T12:27:38Z","published":"2023-12-13T11:47:28Z","title":"Fine-Grained Image-Text Alignment in Medical Imaging Enables Explainable\n  Cyclic Image-Report Generation","summary":"  To address these issues, we propose a novel Adaptive patch-word Matching\n(AdaMatch) model to correlate chest X-ray (CXR) image regions with words in\nmedical reports and apply it to CXR-report generation to provide explainability\nfor the generation process. AdaMatch exploits the fine-grained relation between\nadaptive patches and words to provide explanations of specific image regions\nwith corresponding words. To capture the abnormal regions of varying sizes and\npositions, we introduce the Adaptive Patch extraction (AdaPatch) module to\nacquire the adaptive patches for these regions adaptively. In order to provide\nexplicit explainability for CXR-report generation task, we propose an\nAdaMatch-based bidirectional large language model for Cyclic CXR-report\ngeneration (AdaMatch-Cyclic). It employs the AdaMatch to obtain the keywords\nfor CXR images and `keypatches' for medical reports as hints to guide\nCXR-report generation. Extensive experiments on two publicly available CXR\ndatasets prove the effectiveness of our method and its superior performance to\nexisting methods.\n","authors":["Wenting Chen","Linlin Shen","Jingyang Lin","Jiebo Luo","Xiang Li","Yixuan Yuan"],"pdf_url":"https://arxiv.org/pdf/2312.08078v5.pdf","comment":"Accepted by ACL 2024"},{"id":"http://arxiv.org/abs/2406.02253v1","updated":"2024-06-04T12:19:09Z","published":"2024-06-04T12:19:09Z","title":"PuFace: Defending against Facial Cloaking Attacks for Facial Recognition\n  Models","summary":"  The recently proposed facial cloaking attacks add invisible perturbation\n(cloaks) to facial images to protect users from being recognized by\nunauthorized facial recognition models. However, we show that the \"cloaks\" are\nnot robust enough and can be removed from images.\n  This paper introduces PuFace, an image purification system leveraging the\ngeneralization ability of neural networks to diminish the impact of cloaks by\npushing the cloaked images towards the manifold of natural (uncloaked) images\nbefore the training process of facial recognition models. Specifically, we\ndevise a purifier that takes all the training images including both cloaked and\nnatural images as input and generates the purified facial images close to the\nmanifold where natural images lie. To meet the defense goal, we propose to\ntrain the purifier on particularly amplified cloaked images with a loss\nfunction that combines image loss and feature loss. Our empirical experiment\nshows PuFace can effectively defend against two state-of-the-art facial\ncloaking attacks and reduces the attack success rate from 69.84\\% to 7.61\\% on\naverage without degrading the normal accuracy for various facial recognition\nmodels. Moreover, PuFace is a model-agnostic defense mechanism that can be\napplied to any facial recognition model without modifying the model structure.\n","authors":["Jing Wen"],"pdf_url":"https://arxiv.org/pdf/2406.02253v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.18454v2","updated":"2024-06-04T11:59:54Z","published":"2024-04-29T06:24:32Z","title":"3D Gaussian Splatting with Deferred Reflection","summary":"  The advent of neural and Gaussian-based radiance field methods have achieved\ngreat success in the field of novel view synthesis. However, specular\nreflection remains non-trivial, as the high frequency radiance field is\nnotoriously difficult to fit stably and accurately. We present a deferred\nshading method to effectively render specular reflection with Gaussian\nsplatting. The key challenge comes from the environment map reflection model,\nwhich requires accurate surface normal while simultaneously bottlenecks normal\nestimation with discontinuous gradients. We leverage the per-pixel reflection\ngradients generated by deferred shading to bridge the optimization process of\nneighboring Gaussians, allowing nearly correct normal estimations to gradually\npropagate and eventually spread over all reflective objects. Our method\nsignificantly outperforms state-of-the-art techniques and concurrent work in\nsynthesizing high-quality specular reflection effects, demonstrating a\nconsistent improvement of peak signal-to-noise ratio (PSNR) for both synthetic\nand real-world scenes, while running at a frame rate almost identical to\nvanilla Gaussian splatting.\n","authors":["Keyang Ye","Qiming Hou","Kun Zhou"],"pdf_url":"https://arxiv.org/pdf/2404.18454v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.16475v2","updated":"2024-06-04T11:53:44Z","published":"2024-05-26T07:58:51Z","title":"Looks Too Good To Be True: An Information-Theoretic Analysis of\n  Hallucinations in Generative Restoration Models","summary":"  The pursuit of high perceptual quality in image restoration has driven the\ndevelopment of revolutionary generative models, capable of producing results\noften visually indistinguishable from real data. However, as their perceptual\nquality continues to improve, these models also exhibit a growing tendency to\ngenerate hallucinations - realistic-looking details that do not exist in the\nground truth images. The presence of hallucinations introduces uncertainty\nregarding the reliability of the models' predictions, raising major concerns\nabout their practical application. In this paper, we employ information-theory\ntools to investigate this phenomenon, revealing a fundamental tradeoff between\nuncertainty and perception. We rigorously analyze the relationship between\nthese two factors, proving that the global minimal uncertainty in generative\nmodels grows in tandem with perception. In particular, we define the inherent\nuncertainty of the restoration problem and show that attaining perfect\nperceptual quality entails at least twice this uncertainty. Additionally, we\nestablish a relation between mean squared-error distortion, uncertainty and\nperception, through which we prove the aforementioned uncertainly-perception\ntradeoff induces the well-known perception-distortion tradeoff. This work\nuncovers fundamental limitations of generative models in achieving both high\nperceptual quality and reliable predictions for image restoration. We\ndemonstrate our theoretical findings through an analysis of single image\nsuper-resolution algorithms. Our work aims to raise awareness among\npractitioners about this inherent tradeoff, empowering them to make informed\ndecisions and potentially prioritize safety over perceptual performance.\n","authors":["Regev Cohen","Idan Kligvasser","Ehud Rivlin","Daniel Freedman"],"pdf_url":"https://arxiv.org/pdf/2405.16475v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.02230v1","updated":"2024-06-04T11:48:44Z","published":"2024-06-04T11:48:44Z","title":"I4VGen: Image as Stepping Stone for Text-to-Video Generation","summary":"  Text-to-video generation has lagged behind text-to-image synthesis in quality\nand diversity due to the complexity of spatio-temporal modeling and limited\nvideo-text datasets. This paper presents I4VGen, a training-free and\nplug-and-play video diffusion inference framework, which enhances text-to-video\ngeneration by leveraging robust image techniques. Specifically, following\ntext-to-image-to-video, I4VGen decomposes the text-to-video generation into two\nstages: anchor image synthesis and anchor image-guided video synthesis.\nCorrespondingly, a well-designed generation-selection pipeline is employed to\nachieve visually-realistic and semantically-faithful anchor image, and an\ninnovative Noise-Invariant Video Score Distillation Sampling is incorporated to\nanimate the image to a dynamic video, followed by a video regeneration process\nto refine the video. This inference strategy effectively mitigates the\nprevalent issue of non-zero terminal signal-to-noise ratio. Extensive\nevaluations show that I4VGen not only produces videos with higher visual\nrealism and textual fidelity but also integrates seamlessly into existing\nimage-to-video diffusion models, thereby improving overall video quality.\n","authors":["Xiefan Guo","Jinlin Liu","Miaomiao Cui","Di Huang"],"pdf_url":"https://arxiv.org/pdf/2406.02230v1.pdf","comment":"Project page: https://xiefan-guo.github.io/i4vgen"},{"id":"http://arxiv.org/abs/2401.15578v2","updated":"2024-06-04T11:47:15Z","published":"2024-01-28T06:23:55Z","title":"ASCNet: Asymmetric Sampling Correction Network for Infrared Image\n  Destriping","summary":"  In a real-world infrared imaging system, effectively learning a consistent\nstripe noise removal model is essential. Most existing destriping methods\ncannot precisely reconstruct images due to cross-level semantic gaps and\ninsufficient characterization of the global column features. To tackle this\nproblem, we propose a novel infrared image destriping method, called Asymmetric\nSampling Correction Network (ASCNet), that can effectively capture global\ncolumn relationships and embed them into a U-shaped framework, providing\ncomprehensive discriminative representation and seamless semantic connectivity.\nOur ASCNet consists of three core elements: Residual Haar Discrete Wavelet\nTransform (RHDWT), Pixel Shuffle (PS), and Column Non-uniformity Correction\nModule (CNCM). Specifically, RHDWT is a novel downsampler that employs\ndouble-branch modeling to effectively integrate stripe-directional prior\nknowledge and data-driven semantic interaction to enrich the feature\nrepresentation. Observing the semantic patterns crosstalk of stripe noise, PS\nis introduced as an upsampler to prevent excessive apriori decoding and\nperforming semantic-bias-free image reconstruction. After each sampling, CNCM\ncaptures the column relationships in long-range dependencies. By incorporating\ncolumn, spatial, and self-dependence information, CNCM well establishes a\nglobal context to distinguish stripes from the scene's vertical structures.\nExtensive experiments on synthetic data, real data, and infrared small target\ndetection tasks demonstrate that the proposed method outperforms\nstate-of-the-art single-image destriping methods both visually and\nquantitatively. Our code will be made publicly available at\nhttps://github.com/xdFai/ASCNet.\n","authors":["Shuai Yuan","Hanlin Qin","Xiang Yan","Shiqi Yang","Shuowen Yang","Naveed Akhtar"],"pdf_url":"https://arxiv.org/pdf/2401.15578v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.18576v3","updated":"2024-06-04T11:39:34Z","published":"2023-11-30T14:15:39Z","title":"Fingerprint Matching with Localized Deep Representation","summary":"  Compared to minutia-based fingerprint representations, fixed-length\nrepresentations are attractive due to simple and efficient matching. However,\nfixed-length fingerprint representations are limited in accuracy when matching\nfingerprints with different visible areas, which can occur due to different\nfinger poses or acquisition methods. To address this issue, we propose a\nlocalized deep representation of fingerprint, named LDRF. By focusing on the\ndiscriminative characteristics within local regions, LDRF provides a more\nrobust and accurate fixed-length representation for fingerprints with variable\nvisible areas. LDRF can be adapted to retain information within any valid area,\nmaking it highly flexible. The matching scores produced by LDRF also exhibit\nintuitive statistical characteristics, which led us to propose a matching score\nnormalization technique to mitigate the uncertainty in the cases of very small\noverlapping area. With this new technique, we can maintain a high level of\naccuracy and reliability in our fingerprint matching, even as the size of the\ndatabase grows rapidly. Our experimental results on 21 datasets containing over\n140K fingerprints of various finger poses and impression types show that LDRF\noutperforms other fixed-length representations and is robust to sensing\ntechnologies and impression types. Besides, the proposed matching score\nnormalization effectively reduces the false match rate (FMR) in large-scale\nidentification experiments comprising over 5.11 million fingerprints.\nSpecifically, this technique results in a reduction of two orders of magnitude\ncompared to matching without matching score normalization and five orders of\nmagnitude compared to prior works.\n","authors":["Yongjie Duan","Zhiyu Pan","Jianjiang Feng","Jie Zhou"],"pdf_url":"https://arxiv.org/pdf/2311.18576v3.pdf","comment":"The paper requires major revision"},{"id":"http://arxiv.org/abs/2406.02223v1","updated":"2024-06-04T11:33:40Z","published":"2024-06-04T11:33:40Z","title":"SMCL: Saliency Masked Contrastive Learning for Long-tailed Recognition","summary":"  Real-world data often follow a long-tailed distribution with a high imbalance\nin the number of samples between classes. The problem with training from\nimbalanced data is that some background features, common to all classes, can be\nunobserved in classes with scarce samples. As a result, this background\ncorrelates to biased predictions into ``major\" classes. In this paper, we\npropose saliency masked contrastive learning, a new method that uses saliency\nmasking and contrastive learning to mitigate the problem and improve the\ngeneralizability of a model. Our key idea is to mask the important part of an\nimage using saliency detection and use contrastive learning to move the masked\nimage towards minor classes in the feature space, so that background features\npresent in the masked image are no longer correlated with the original class.\nExperiment results show that our method achieves state-of-the-art level\nperformance on benchmark long-tailed datasets.\n","authors":["Sanglee Park","Seung-won Hwang","Jungmin So"],"pdf_url":"https://arxiv.org/pdf/2406.02223v1.pdf","comment":"accepted at ICASSP 2023"},{"id":"http://arxiv.org/abs/2405.09550v3","updated":"2024-06-04T11:28:42Z","published":"2024-03-20T12:27:30Z","title":"Mask-based Invisible Backdoor Attacks on Object Detection","summary":"  Deep learning models have achieved unprecedented performance in the domain of\nobject detection, resulting in breakthroughs in areas such as autonomous\ndriving and security. However, deep learning models are vulnerable to backdoor\nattacks. These attacks prompt models to behave similarly to standard models\nwithout a trigger; however, they act maliciously upon detecting a predefined\ntrigger. Despite extensive research on backdoor attacks in image\nclassification, their application to object detection remains relatively\nunderexplored. Given the widespread application of object detection in critical\nreal-world scenarios, the sensitivity and potential impact of these\nvulnerabilities cannot be overstated. In this study, we propose an effective\ninvisible backdoor attack on object detection utilizing a mask-based approach.\nThree distinct attack scenarios were explored for object detection: object\ndisappearance, object misclassification, and object generation attack. Through\nextensive experiments, we comprehensively examined the effectiveness of these\nattacks and tested certain defense methods to determine effective\ncountermeasures. Code will be available at\nhttps://github.com/jeongjin0/invisible-backdoor-object-detection\n","authors":["Jeongjin Shin"],"pdf_url":"https://arxiv.org/pdf/2405.09550v3.pdf","comment":"7 pages, 3 figures"},{"id":"http://arxiv.org/abs/2403.01306v2","updated":"2024-06-04T11:08:42Z","published":"2024-03-02T20:36:10Z","title":"ICC: Quantifying Image Caption Concreteness for Multimodal Dataset\n  Curation","summary":"  Web-scale training on paired text-image data is becoming increasingly central\nto multimodal learning, but is challenged by the highly noisy nature of\ndatasets in the wild. Standard data filtering approaches succeed in removing\nmismatched text-image pairs, but permit semantically related but highly\nabstract or subjective text. These approaches lack the fine-grained ability to\nisolate the most concrete samples that provide the strongest signal for\nlearning in a noisy dataset. In this work, we propose a new metric, image\ncaption concreteness, that evaluates caption text without an image reference to\nmeasure its concreteness and relevancy for use in multimodal learning. Our\napproach leverages strong foundation models for measuring visual-semantic\ninformation loss in multimodal representations. We demonstrate that this\nstrongly correlates with human evaluation of concreteness in both single-word\nand sentence-level texts. Moreover, we show that curation using ICC complements\nexisting approaches: It succeeds in selecting the highest quality samples from\nmultimodal web-scale datasets to allow for efficient training in\nresource-constrained settings.\n","authors":["Moran Yanuka","Morris Alper","Hadar Averbuch-Elor","Raja Giryes"],"pdf_url":"https://arxiv.org/pdf/2403.01306v2.pdf","comment":"Accepted to ACL 2024 (Finding). For Project webpage, see\n  https://moranyanuka.github.io/icc/"},{"id":"http://arxiv.org/abs/2312.04465v2","updated":"2024-06-04T11:08:25Z","published":"2023-12-07T17:35:49Z","title":"FitDiff: Robust monocular 3D facial shape and reflectance estimation\n  using Diffusion Models","summary":"  The remarkable progress in 3D face reconstruction has resulted in high-detail\nand photorealistic facial representations. Recently, Diffusion Models have\nrevolutionized the capabilities of generative methods by surpassing the\nperformance of GANs. In this work, we present FitDiff, a diffusion-based 3D\nfacial avatar generative model. Leveraging diffusion principles, our model\naccurately generates relightable facial avatars, utilizing an identity\nembedding extracted from an \"in-the-wild\" 2D facial image. The introduced\nmulti-modal diffusion model is the first to concurrently output facial\nreflectance maps (diffuse and specular albedo and normals) and shapes,\nshowcasing great generalization capabilities. It is solely trained on an\nannotated subset of a public facial dataset, paired with 3D reconstructions. We\nrevisit the typical 3D facial fitting approach by guiding a reverse diffusion\nprocess using perceptual and face recognition losses. Being the first 3D LDM\nconditioned on face recognition embeddings, FitDiff reconstructs relightable\nhuman avatars, that can be used as-is in common rendering engines, starting\nonly from an unconstrained facial image, and achieving state-of-the-art\nperformance.\n","authors":["Stathis Galanakis","Alexandros Lattas","Stylianos Moschoglou","Stefanos Zafeiriou"],"pdf_url":"https://arxiv.org/pdf/2312.04465v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.02208v1","updated":"2024-06-04T11:06:13Z","published":"2024-06-04T11:06:13Z","title":"Why Only Text: Empowering Vision-and-Language Navigation with\n  Multi-modal Prompts","summary":"  Current Vision-and-Language Navigation (VLN) tasks mainly employ textual\ninstructions to guide agents. However, being inherently abstract, the same\ntextual instruction can be associated with different visual signals, causing\nsevere ambiguity and limiting the transfer of prior knowledge in the vision\ndomain from the user to the agent. To fill this gap, we propose\nVision-and-Language Navigation with Multi-modal Prompts (VLN-MP), a novel task\naugmenting traditional VLN by integrating both natural language and images in\ninstructions. VLN-MP not only maintains backward compatibility by effectively\nhandling text-only prompts but also consistently shows advantages with\ndifferent quantities and relevance of visual prompts. Possible forms of visual\nprompts include both exact and similar object images, providing adaptability\nand versatility in diverse navigation scenarios. To evaluate VLN-MP under a\nunified framework, we implement a new benchmark that offers: (1) a\ntraining-free pipeline to transform textual instructions into multi-modal forms\nwith landmark images; (2) diverse datasets with multi-modal instructions for\ndifferent downstream tasks; (3) a novel module designed to process various\nimage prompts for seamless integration with state-of-the-art VLN models.\nExtensive experiments on four VLN benchmarks (R2R, RxR, REVERIE, CVDN) show\nthat incorporating visual prompts significantly boosts navigation performance.\nWhile maintaining efficiency with text-only prompts, VLN-MP enables agents to\nnavigate in the pre-explore setting and outperform text-based models, showing\nits broader applicability.\n","authors":["Haodong Hong","Sen Wang","Zi Huang","Qi Wu","Jiajun Liu"],"pdf_url":"https://arxiv.org/pdf/2406.02208v1.pdf","comment":"IJCAI 2024"},{"id":"http://arxiv.org/abs/2406.02202v1","updated":"2024-06-04T10:57:59Z","published":"2024-06-04T10:57:59Z","title":"Can CLIP help CLIP in learning 3D?","summary":"  In this study, we explore an alternative approach to enhance contrastive\ntext-image-3D alignment in the absence of textual descriptions for 3D objects.\nWe introduce two unsupervised methods, $I2I$ and $(I2L)^2$, which leverage CLIP\nknowledge about textual and 2D data to compute the neural perceived similarity\nbetween two 3D samples. We employ the proposed methods to mine 3D hard\nnegatives, establishing a multimodal contrastive pipeline with hard negative\nweighting via a custom loss function. We train on different configurations of\nthe proposed hard negative mining approach, and we evaluate the accuracy of our\nmodels in 3D classification and on the cross-modal retrieval benchmark, testing\nimage-to-shape and shape-to-image retrieval. Results demonstrate that our\napproach, even without explicit text alignment, achieves comparable or superior\nperformance on zero-shot and standard 3D classification, while significantly\nimproving both image-to-shape and shape-to-image retrieval compared to previous\nmethods.\n","authors":["Cristian Sbrolli","Matteo Matteucci"],"pdf_url":"https://arxiv.org/pdf/2406.02202v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.10223v4","updated":"2024-06-04T10:52:16Z","published":"2023-05-17T13:56:48Z","title":"Advancing Unsupervised Low-light Image Enhancement: Noise Estimation,\n  Illumination Interpolation, and Self-Regulation","summary":"  Contemporary Low-Light Image Enhancement (LLIE) techniques have made notable\nadvancements in preserving image details and enhancing contrast, achieving\ncommendable results on specific datasets. Nevertheless, these approaches\nencounter persistent challenges in efficiently mitigating dynamic noise and\naccommodating diverse low-light scenarios. Insufficient constraints on complex\npixel-wise mapping learning lead to overfitting to specific types of noise and\nartifacts associated with low-light conditions, reducing effectiveness in\nvariable lighting scenarios. To this end, we first propose a method for\nestimating the noise level in low light images in a quick and accurate way.\nThis facilitates precise denoising, prevents over-smoothing, and adapts to\ndynamic noise patterns. Subsequently, we devise a Learnable Illumination\nInterpolator (LII), which employs learnlable interpolation operations between\nthe input and unit vector to satisfy general constraints between illumination\nand input. Finally, we introduce a self-regularization loss that incorporates\nintrinsic image properties and essential visual attributes to guide the output\ntowards meeting human visual expectations. Comprehensive experiments validate\nthe competitiveness of our proposed algorithm in both qualitative and\nquantitative assessments. Notably, our noise estimation method, with linear\ntime complexity and suitable for various denoisers, significantly improves both\ndenoising and enhancement performance. Benefiting from this, our approach\nachieves a 0.675dB PSNR improvement on the LOL dataset and 0.818dB on the MIT\ndataset on LLIE task, even compared to supervised methods. The source code is\navailable at \\href{https://doi.org/10.5281/zenodo.11463142}{this DOI\nrepository} and the specific code for noise estimation can be found at\n\\href{https://github.com/GoogolplexGoodenough/noise_estimate}{this separate\nGitHub link}.\n","authors":["Xiaofeng Liu","Jiaxin Gao","Xin Fan","Risheng Liu"],"pdf_url":"https://arxiv.org/pdf/2305.10223v4.pdf","comment":"Image processing, low-light image enhancement, noise estimation,\n  illumination learning"},{"id":"http://arxiv.org/abs/2403.05196v2","updated":"2024-06-04T10:47:02Z","published":"2024-03-08T10:19:00Z","title":"Denoising Autoregressive Representation Learning","summary":"  In this paper, we explore a new generative approach for learning visual\nrepresentations. Our method, DARL, employs a decoder-only Transformer to\npredict image patches autoregressively. We find that training with Mean Squared\nError (MSE) alone leads to strong representations. To enhance the image\ngeneration ability, we replace the MSE loss with the diffusion objective by\nusing a denoising patch decoder. We show that the learned representation can be\nimproved by using tailored noise schedules and longer training in larger\nmodels. Notably, the optimal schedule differs significantly from the typical\nones used in standard image diffusion models. Overall, despite its simple\narchitecture, DARL delivers performance remarkably close to state-of-the-art\nmasked prediction models under the fine-tuning protocol. This marks an\nimportant step towards a unified model capable of both visual perception and\ngeneration, effectively combining the strengths of autoregressive and denoising\ndiffusion models.\n","authors":["Yazhe Li","Jorg Bornschein","Ting Chen"],"pdf_url":"https://arxiv.org/pdf/2403.05196v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.16518v2","updated":"2024-06-04T10:30:58Z","published":"2023-11-27T18:11:19Z","title":"SeeSR: Towards Semantics-Aware Real-World Image Super-Resolution","summary":"  Owe to the powerful generative priors, the pre-trained text-to-image (T2I)\ndiffusion models have become increasingly popular in solving the real-world\nimage super-resolution problem. However, as a consequence of the heavy quality\ndegradation of input low-resolution (LR) images, the destruction of local\nstructures can lead to ambiguous image semantics. As a result, the content of\nreproduced high-resolution image may have semantic errors, deteriorating the\nsuper-resolution performance. To address this issue, we present a\nsemantics-aware approach to better preserve the semantic fidelity of generative\nreal-world image super-resolution. First, we train a degradation-aware prompt\nextractor, which can generate accurate soft and hard semantic prompts even\nunder strong degradation. The hard semantic prompts refer to the image tags,\naiming to enhance the local perception ability of the T2I model, while the soft\nsemantic prompts compensate for the hard ones to provide additional\nrepresentation information. These semantic prompts encourage the T2I model to\ngenerate detailed and semantically accurate results. Furthermore, during the\ninference process, we integrate the LR images into the initial sampling noise\nto mitigate the diffusion model's tendency to generate excessive random\ndetails. The experiments show that our method can reproduce more realistic\nimage details and hold better the semantics. The source code of our method can\nbe found at https://github.com/cswry/SeeSR.\n","authors":["Rongyuan Wu","Tao Yang","Lingchen Sun","Zhengqiang Zhang","Shuai Li","Lei Zhang"],"pdf_url":"https://arxiv.org/pdf/2311.16518v2.pdf","comment":"Accepted by CVPR2024"},{"id":"http://arxiv.org/abs/2406.02184v1","updated":"2024-06-04T10:29:18Z","published":"2024-06-04T10:29:18Z","title":"GraVITON: Graph based garment warping with attention guided inversion\n  for Virtual-tryon","summary":"  Virtual try-on, a rapidly evolving field in computer vision, is transforming\ne-commerce by improving customer experiences through precise garment warping\nand seamless integration onto the human body. While existing methods such as\nTPS and flow address the garment warping but overlook the finer contextual\ndetails. In this paper, we introduce a novel graph based warping technique\nwhich emphasizes the value of context in garment flow. Our graph based warping\nmodule generates warped garment as well as a coarse person image, which is\nutilised by a simple refinement network to give a coarse virtual tryon image.\nThe proposed work exploits latent diffusion model to generate the final tryon,\ntreating garment transfer as an inpainting task. The diffusion model is\nconditioned with decoupled cross attention based inversion of visual and\ntextual information. We introduce an occlusion aware warping constraint that\ngenerates dense warped garment, without any holes and occlusion. Our method,\nvalidated on VITON-HD and Dresscode datasets, showcases substantial\nstate-of-the-art qualitative and quantitative results showing considerable\nimprovement in garment warping, texture preservation, and overall realism.\n","authors":["Sanhita Pathak","Vinay Kaushik","Brejesh Lall"],"pdf_url":"https://arxiv.org/pdf/2406.02184v1.pdf","comment":"18 pages, 7 Figures and 6 Tables"},{"id":"http://arxiv.org/abs/2204.09389v2","updated":"2024-06-04T10:24:11Z","published":"2022-04-20T11:01:51Z","title":"Epistemic Uncertainty-Weighted Loss for Visual Bias Mitigation","summary":"  Deep neural networks are highly susceptible to learning biases in visual\ndata. While various methods have been proposed to mitigate such bias, the\nmajority require explicit knowledge of the biases present in the training data\nin order to mitigate. We argue the relevance of exploring methods which are\ncompletely ignorant of the presence of any bias, but are capable of identifying\nand mitigating them. Furthermore, we propose using Bayesian neural networks\nwith a predictive uncertainty-weighted loss function to dynamically identify\npotential bias in individual training samples and to weight them during\ntraining. We find a positive correlation between samples subject to bias and\nhigher epistemic uncertainties. Finally, we show the method has potential to\nmitigate visual bias on a bias benchmark dataset and on a real-world face\ndetection problem, and we consider the merits and weaknesses of our approach.\n","authors":["Rebecca S Stone","Nishant Ravikumar","Andrew J Bulpitt","David C Hogg"],"pdf_url":"https://arxiv.org/pdf/2204.09389v2.pdf","comment":"Published in 2022 IEEE CVPR Workshop on Fair, Data Efficient and\n  Trusted Computer Vision"},{"id":"http://arxiv.org/abs/2312.15271v2","updated":"2024-06-04T09:59:48Z","published":"2023-12-23T14:43:52Z","title":"SSFlowNet: Semi-supervised Scene Flow Estimation On Point Clouds With\n  Pseudo Label","summary":"  In the domain of supervised scene flow estimation, the process of manual\nlabeling is both time-intensive and financially demanding. This paper\nintroduces SSFlowNet, a semi-supervised approach for scene flow estimation,\nthat utilizes a blend of labeled and unlabeled data, optimizing the balance\nbetween the cost of labeling and the precision of model training. SSFlowNet\nstands out through its innovative use of pseudo-labels, mainly reducing the\ndependency on extensively labeled datasets while maintaining high model\naccuracy. The core of our model is its emphasis on the intricate geometric\nstructures of point clouds, both locally and globally, coupled with a novel\nspatial memory feature. This feature is adept at learning the geometric\nrelationships between points over sequential time frames. By identifying\nsimilarities between labeled and unlabeled points, SSFlowNet dynamically\nconstructs a correlation matrix to evaluate scene flow dependencies at\nindividual point level. Furthermore, the integration of a flow consistency\nmodule within SSFlowNet enhances its capability to consistently estimate flow,\nan essential aspect for analyzing dynamic scenes. Empirical results demonstrate\nthat SSFlowNet surpasses existing methods in pseudo-label generation and shows\nadaptability across varying data volumes. Moreover, our semi-supervised\ntraining technique yields promising outcomes even with different smaller ratio\nlabeled data, marking a substantial advancement in the field of scene flow\nestimation.\n","authors":["Jingze Chen","Junfeng Yao","Qiqin Lin","Rongzhou Zhou","Lei Li"],"pdf_url":"https://arxiv.org/pdf/2312.15271v2.pdf","comment":"Accepted by 33rd International Conference on Artificial Neural\n  Networks (ICANN 2024)"},{"id":"http://arxiv.org/abs/2406.02158v1","updated":"2024-06-04T09:45:04Z","published":"2024-06-04T09:45:04Z","title":"Radar Spectra-Language Model for Automotive Scene Parsing","summary":"  Radar sensors are low cost, long-range, and weather-resilient. Therefore,\nthey are widely used for driver assistance functions, and are expected to be\ncrucial for the success of autonomous driving in the future. In many perception\ntasks only pre-processed radar point clouds are considered. In contrast, radar\nspectra are a raw form of radar measurements and contain more information than\nradar point clouds. However, radar spectra are rather difficult to interpret.\nIn this work, we aim to explore the semantic information contained in spectra\nin the context of automated driving, thereby moving towards better\ninterpretability of radar spectra. To this end, we create a radar\nspectra-language model, allowing us to query radar spectra measurements for the\npresence of scene elements using free text. We overcome the scarcity of radar\nspectra data by matching the embedding space of an existing vision-language\nmodel (VLM). Finally, we explore the benefit of the learned representation for\nscene parsing, and obtain improvements in free space segmentation and object\ndetection merely by injecting the spectra embedding into a baseline model.\n","authors":["Mariia Pushkareva","Yuri Feldman","Csaba Domokos","Kilian Rambach","Dotan Di Castro"],"pdf_url":"https://arxiv.org/pdf/2406.02158v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.02153v1","updated":"2024-06-04T09:41:40Z","published":"2024-06-04T09:41:40Z","title":"Analyzing the Feature Extractor Networks for Face Image Synthesis","summary":"  Advancements like Generative Adversarial Networks have attracted the\nattention of researchers toward face image synthesis to generate ever more\nrealistic images. Thereby, the need for the evaluation criteria to assess the\nrealism of the generated images has become apparent. While FID utilized with\nInceptionV3 is one of the primary choices for benchmarking, concerns about\nInceptionV3's limitations for face images have emerged. This study investigates\nthe behavior of diverse feature extractors -- InceptionV3, CLIP, DINOv2, and\nArcFace -- considering a variety of metrics -- FID, KID, Precision\\&Recall.\nWhile the FFHQ dataset is used as the target domain, as the source domains, the\nCelebA-HQ dataset and the synthetic datasets generated using StyleGAN2 and\nProjected FastGAN are used. Experiments include deep-down analysis of the\nfeatures: $L_2$ normalization, model attention during extraction, and domain\ndistributions in the feature space. We aim to give valuable insights into the\nbehavior of feature extractors for evaluating face image synthesis\nmethodologies. The code is publicly available at\nhttps://github.com/ThEnded32/AnalyzingFeatureExtractors.\n","authors":["Erdi Sarıtaş","Hazım Kemal Ekenel"],"pdf_url":"https://arxiv.org/pdf/2406.02153v1.pdf","comment":"Accepted at 18th International Conference on Automatic Face and\n  Gesture Recognition (FG) on 1st SD-FGA Workshop 2024"},{"id":"http://arxiv.org/abs/2406.02147v1","updated":"2024-06-04T09:34:46Z","published":"2024-06-04T09:34:46Z","title":"UA-Track: Uncertainty-Aware End-to-End 3D Multi-Object Tracking","summary":"  3D multiple object tracking (MOT) plays a crucial role in autonomous driving\nperception. Recent end-to-end query-based trackers simultaneously detect and\ntrack objects, which have shown promising potential for the 3D MOT task.\nHowever, existing methods overlook the uncertainty issue, which refers to the\nlack of precise confidence about the state and location of tracked objects.\nUncertainty arises owing to various factors during motion observation by\ncameras, especially occlusions and the small size of target objects, resulting\nin an inaccurate estimation of the object's position, label, and identity. To\nthis end, we propose an Uncertainty-Aware 3D MOT framework, UA-Track, which\ntackles the uncertainty problem from multiple aspects. Specifically, we first\nintroduce an Uncertainty-aware Probabilistic Decoder to capture the uncertainty\nin object prediction with probabilistic attention. Secondly, we propose an\nUncertainty-guided Query Denoising strategy to further enhance the training\nprocess. We also utilize Uncertainty-reduced Query Initialization, which\nleverages predicted 2D object location and depth information to reduce query\nuncertainty. As a result, our UA-Track achieves state-of-the-art performance on\nthe nuScenes benchmark, i.e., 66.3% AMOTA on the test split, surpassing the\nprevious best end-to-end solution by a significant margin of 8.9% AMOTA.\n","authors":["Lijun Zhou","Tao Tang","Pengkun Hao","Zihang He","Kalok Ho","Shuo Gu","Wenbo Hou","Zhihui Hao","Haiyang Sun","Kun Zhan","Peng Jia","Xianpeng Lang","Xiaodan Liang"],"pdf_url":"https://arxiv.org/pdf/2406.02147v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.02142v1","updated":"2024-06-04T09:29:59Z","published":"2024-06-04T09:29:59Z","title":"Analyzing the Effect of Combined Degradations on Face Recognition","summary":"  A face recognition model is typically trained on large datasets of images\nthat may be collected from controlled environments. This results in performance\ndiscrepancies when applied to real-world scenarios due to the domain gap\nbetween clean and in-the-wild images. Therefore, some researchers have\ninvestigated the robustness of these models by analyzing synthetic\ndegradations. Yet, existing studies have mostly focused on single degradation\nfactors, which may not fully capture the complexity of real-world degradations.\nThis work addresses this problem by analyzing the impact of both single and\ncombined degradations using a real-world degradation pipeline extended with\nunder/over-exposure conditions. We use the LFW dataset for our experiments and\nassess the model's performance based on verification accuracy. Results reveal\nthat single and combined degradations show dissimilar model behavior. The\ncombined effect of degradation significantly lowers performance even if its\nsingle effect is negligible. This work emphasizes the importance of accounting\nfor real-world complexity to assess the robustness of face recognition models\nin real-world settings. The code is publicly available at\nhttps://github.com/ThEnded32/AnalyzingCombinedDegradations.\n","authors":["Erdi Sarıtaş","Hazım Kemal Ekenel"],"pdf_url":"https://arxiv.org/pdf/2406.02142v1.pdf","comment":"Accepted at 18th International Conference on Automatic Face and\n  Gesture Recognition (FG) on 2nd PrivAAL Workshop 2024"},{"id":"http://arxiv.org/abs/2405.21013v3","updated":"2024-06-04T09:14:39Z","published":"2024-05-31T16:55:04Z","title":"StrucTexTv3: An Efficient Vision-Language Model for Text-rich Image\n  Perception, Comprehension, and Beyond","summary":"  Text-rich images have significant and extensive value, deeply integrated into\nvarious aspects of human life. Notably, both visual cues and linguistic symbols\nin text-rich images play crucial roles in information transmission but are\naccompanied by diverse challenges. Therefore, the efficient and effective\nunderstanding of text-rich images is a crucial litmus test for the capability\nof Vision-Language Models. We have crafted an efficient vision-language model,\nStrucTexTv3, tailored to tackle various intelligent tasks for text-rich images.\nThe significant design of StrucTexTv3 is presented in the following aspects:\nFirstly, we adopt a combination of an effective multi-scale reduced visual\ntransformer and a multi-granularity token sampler (MG-Sampler) as a visual\ntoken generator, successfully solving the challenges of high-resolution input\nand complex representation learning for text-rich images. Secondly, we enhance\nthe perception and comprehension abilities of StrucTexTv3 through instruction\nlearning, seamlessly integrating various text-oriented tasks into a unified\nframework. Thirdly, we have curated a comprehensive collection of high-quality\ntext-rich images, abbreviated as TIM-30M, encompassing diverse scenarios like\nincidental scenes, office documents, web pages, and screenshots, thereby\nimproving the robustness of our model. Our method achieved SOTA results in\ntext-rich image perception tasks, and significantly improved performance in\ncomprehension tasks. Among multimodal models with LLM decoder of approximately\n1.8B parameters, it stands out as a leader, which also makes the deployment of\nedge devices feasible. In summary, the StrucTexTv3 model, featuring efficient\nstructural design, outstanding performance, and broad adaptability, offers\nrobust support for diverse intelligent application tasks involving text-rich\nimages, thus exhibiting immense potential for widespread application.\n","authors":["Pengyuan Lyu","Yulin Li","Hao Zhou","Weihong Ma","Xingyu Wan","Qunyi Xie","Liang Wu","Chengquan Zhang","Kun Yao","Errui Ding","Jingdong Wang"],"pdf_url":"https://arxiv.org/pdf/2405.21013v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.11180v5","updated":"2024-06-04T09:11:21Z","published":"2023-06-19T22:07:20Z","title":"Hyperbolic Active Learning for Semantic Segmentation under Domain Shift","summary":"  We introduce a hyperbolic neural network approach to pixel-level active\nlearning for semantic segmentation. Analysis of the data statistics leads to a\nnovel interpretation of the hyperbolic radius as an indicator of data scarcity.\nIn HALO (Hyperbolic Active Learning Optimization), for the first time, we\npropose the use of epistemic uncertainty as a data acquisition strategy,\nfollowing the intuition of selecting data points that are the least known. The\nhyperbolic radius, complemented by the widely-adopted prediction entropy,\neffectively approximates epistemic uncertainty. We perform extensive\nexperimental analysis based on two established synthetic-to-real benchmarks,\ni.e. GTAV $\\rightarrow$ Cityscapes and SYNTHIA $\\rightarrow$ Cityscapes.\nAdditionally, we test HALO on Cityscape $\\rightarrow$ ACDC for domain\nadaptation under adverse weather conditions, and we benchmark both\nconvolutional and attention-based backbones. HALO sets a new state-of-the-art\nin active learning for semantic segmentation under domain shift and it is the\nfirst active learning approach that surpasses the performance of supervised\ndomain adaptation while using only a small portion of labels (i.e., 1%).\n","authors":["Luca Franco","Paolo Mandica","Konstantinos Kallidromitis","Devin Guillory","Yu-Teng Li","Trevor Darrell","Fabio Galasso"],"pdf_url":"https://arxiv.org/pdf/2306.11180v5.pdf","comment":"ICML 2024. Project repository: https://github.com/paolomandica/HALO"},{"id":"http://arxiv.org/abs/2406.02125v1","updated":"2024-06-04T09:10:02Z","published":"2024-06-04T09:10:02Z","title":"Domain Game: Disentangle Anatomical Feature for Single Domain\n  Generalized Segmentation","summary":"  Single domain generalization aims to address the challenge of\nout-of-distribution generalization problem with only one source domain\navailable. Feature distanglement is a classic solution to this purpose, where\nthe extracted task-related feature is presumed to be resilient to domain shift.\nHowever, the absence of references from other domains in a single-domain\nscenario poses significant uncertainty in feature disentanglement\n(ill-posedness). In this paper, we propose a new framework, named\n\\textit{Domain Game}, to perform better feature distangling for medical image\nsegmentation, based on the observation that diagnostic relevant features are\nmore sensitive to geometric transformations, whilist domain-specific features\nprobably will remain invariant to such operations. In domain game, a set of\nrandomly transformed images derived from a singular source image is\nstrategically encoded into two separate feature sets to represent diagnostic\nfeatures and domain-specific features, respectively, and we apply forces to\npull or repel them in the feature space, accordingly. Results from cross-site\ntest domain evaluation showcase approximately an ~11.8% performance boost in\nprostate segmentation and around ~10.5% in brain tumor segmentation compared to\nthe second-best method.\n","authors":["Hao Chen","Hongrun Zhang","U Wang Chan","Rui Yin","Xiaofei Wang","Chao Li"],"pdf_url":"https://arxiv.org/pdf/2406.02125v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.01136v2","updated":"2024-06-04T09:02:14Z","published":"2024-06-03T09:27:57Z","title":"Towards Practical Single-shot Motion Synthesis","summary":"  Despite the recent advances in the so-called \"cold start\" generation from\ntext prompts, their needs in data and computing resources, as well as the\nambiguities around intellectual property and privacy concerns pose certain\ncounterarguments for their utility. An interesting and relatively unexplored\nalternative has been the introduction of unconditional synthesis from a single\nsample, which has led to interesting generative applications. In this paper we\nfocus on single-shot motion generation and more specifically on accelerating\nthe training time of a Generative Adversarial Network (GAN). In particular, we\ntackle the challenge of GAN's equilibrium collapse when using mini-batch\ntraining by carefully annealing the weights of the loss functions that prevent\nmode collapse. Additionally, we perform statistical analysis in the generator\nand discriminator models to identify correlations between training stages and\nenable transfer learning. Our improved GAN achieves competitive quality and\ndiversity on the Mixamo benchmark when compared to the original GAN\narchitecture and a single-shot diffusion model, while being up to x6.8 faster\nin training time from the former and x1.75 from the latter. Finally, we\ndemonstrate the ability of our improved GAN to mix and compose motion with a\nsingle forward pass. Project page available at\nhttps://moverseai.github.io/single-shot.\n","authors":["Konstantinos Roditakis","Spyridon Thermos","Nikolaos Zioulis"],"pdf_url":"https://arxiv.org/pdf/2406.01136v2.pdf","comment":"CVPR 2024, AI for 3D Generation Workshop, Project page:\n  https://moverseai.github.io/single-shot"},{"id":"http://arxiv.org/abs/2303.10559v2","updated":"2024-06-04T08:57:38Z","published":"2023-03-19T04:00:05Z","title":"Deep Learning for Camera Calibration and Beyond: A Survey","summary":"  Camera calibration involves estimating camera parameters to infer geometric\nfeatures from captured sequences, which is crucial for computer vision and\nrobotics. However, conventional calibration is laborious and requires dedicated\ncollection. Recent efforts show that learning-based solutions have the\npotential to be used in place of the repeatability works of manual\ncalibrations. Among these solutions, various learning strategies, networks,\ngeometric priors, and datasets have been investigated. In this paper, we\nprovide a comprehensive survey of learning-based camera calibration techniques,\nby analyzing their strengths and limitations. Our main calibration categories\ninclude the standard pinhole camera model, distortion camera model, cross-view\nmodel, and cross-sensor model, following the research trend and extended\napplications. As there is no benchmark in this community, we collect a holistic\ncalibration dataset that can serve as a public platform to evaluate the\ngeneralization of existing methods. It comprises both synthetic and real-world\ndata, with images and videos captured by different cameras in diverse scenes.\nToward the end of this paper, we discuss the challenges and provide further\nresearch directions. To our knowledge, this is the first survey for the\nlearning-based camera calibration (spanned 8 years). The summarized methods,\ndatasets, and benchmarks are available and will be regularly updated at\nhttps://github.com/KangLiao929/Awesome-Deep-Camera-Calibration.\n","authors":["Kang Liao","Lang Nie","Shujuan Huang","Chunyu Lin","Jing Zhang","Yao Zhao","Moncef Gabbouj","Dacheng Tao"],"pdf_url":"https://arxiv.org/pdf/2303.10559v2.pdf","comment":"Github repository:\n  https://github.com/KangLiao929/Awesome-Deep-Camera-Calibration"},{"id":"http://arxiv.org/abs/2405.07857v2","updated":"2024-06-04T08:56:57Z","published":"2024-05-13T15:42:46Z","title":"Synergistic Integration of Coordinate Network and Tensorial Feature for\n  Improving Neural Radiance Fields from Sparse Inputs","summary":"  The multi-plane representation has been highlighted for its fast training and\ninference across static and dynamic neural radiance fields. This approach\nconstructs relevant features via projection onto learnable grids and\ninterpolating adjacent vertices. However, it has limitations in capturing\nlow-frequency details and tends to overuse parameters for low-frequency\nfeatures due to its bias toward fine details, despite its multi-resolution\nconcept. This phenomenon leads to instability and inefficiency when training\nposes are sparse. In this work, we propose a method that synergistically\nintegrates multi-plane representation with a coordinate-based MLP network known\nfor strong bias toward low-frequency signals. The coordinate-based network is\nresponsible for capturing low-frequency details, while the multi-plane\nrepresentation focuses on capturing fine-grained details. We demonstrate that\nusing residual connections between them seamlessly preserves their own inherent\nproperties. Additionally, the proposed progressive training scheme accelerates\nthe disentanglement of these two features. We demonstrate empirically that our\nproposed method outperforms baseline models for both static and dynamic NeRFs\nwith sparse inputs, achieving comparable results with fewer parameters.\n","authors":["Mingyu Kim","Jun-Seong Kim","Se-Young Yun","Jin-Hwa Kim"],"pdf_url":"https://arxiv.org/pdf/2405.07857v2.pdf","comment":"ICML2024 ; Project page is accessible at\n  https://mingyukim87.github.io/SynergyNeRF ; Code is available at\n  https://github.com/MingyuKim87/SynergyNeRF"},{"id":"http://arxiv.org/abs/2211.13984v2","updated":"2024-06-04T08:54:51Z","published":"2022-11-25T09:47:34Z","title":"Aggregated Text Transformer for Scene Text Detection","summary":"  This paper explores the multi-scale aggregation strategy for scene text\ndetection in natural images. We present the Aggregated Text TRansformer(ATTR),\nwhich is designed to represent texts in scene images with a multi-scale\nself-attention mechanism. Starting from the image pyramid with multiple\nresolutions, the features are first extracted at different scales with shared\nweight and then fed into an encoder-decoder architecture of Transformer. The\nmulti-scale image representations are robust and contain rich information on\ntext contents of various sizes. The text Transformer aggregates these features\nto learn the interaction across different scales and improve text\nrepresentation. The proposed method detects scene texts by representing each\ntext instance as an individual binary mask, which is tolerant of curve texts\nand regions with dense instances. Extensive experiments on public scene text\ndetection datasets demonstrate the effectiveness of the proposed framework.\n","authors":["Zhao Zhou","Xiangcheng Du","Yingbin Zheng","Cheng Jin"],"pdf_url":"https://arxiv.org/pdf/2211.13984v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.00772v2","updated":"2024-06-04T08:53:24Z","published":"2024-06-02T15:19:07Z","title":"Unsupervised Contrastive Analysis for Salient Pattern Detection using\n  Conditional Diffusion Models","summary":"  Contrastive Analysis (CA) regards the problem of identifying patterns in\nimages that allow distinguishing between a background (BG) dataset (i.e.\nhealthy subjects) and a target (TG) dataset (i.e. unhealthy subjects). Recent\nworks on this topic rely on variational autoencoders (VAE) or contrastive\nlearning strategies to learn the patterns that separate TG samples from BG\nsamples in a supervised manner. However, the dependency on target (unhealthy)\nsamples can be challenging in medical scenarios due to their limited\navailability. Also, the blurred reconstructions of VAEs lack utility and\ninterpretability. In this work, we redefine the CA task by employing a\nself-supervised contrastive encoder to learn a latent representation encoding\nonly common patterns from input images, using samples exclusively from the BG\ndataset during training, and approximating the distribution of the target\npatterns by leveraging data augmentation techniques. Subsequently, we exploit\nstate-of-the-art generative methods, i.e. diffusion models, conditioned on the\nlearned latent representation to produce a realistic (healthy) version of the\ninput image encoding solely the common patterns. Thorough validation on a\nfacial image dataset and experiments across three brain MRI datasets\ndemonstrate that conditioning the generative process of state-of-the-art\ngenerative methods with the latent representation from our self-supervised\ncontrastive encoder yields improvements in the generated image quality and in\nthe accuracy of image classification. The code is available at\nhttps://github.com/CristianoPatricio/unsupervised-contrastive-cond-diff.\n","authors":["Cristiano Patrício","Carlo Alberto Barbano","Attilio Fiandrotti","Riccardo Renzulli","Marco Grangetto","Luis F. Teixeira","João C. Neves"],"pdf_url":"https://arxiv.org/pdf/2406.00772v2.pdf","comment":"18 pages, 11 figures"},{"id":"http://arxiv.org/abs/2405.15477v2","updated":"2024-06-04T08:35:14Z","published":"2024-05-24T11:58:02Z","title":"MagicBathyNet: A Multimodal Remote Sensing Dataset for Bathymetry\n  Prediction and Pixel-based Classification in Shallow Waters","summary":"  Accurate, detailed, and high-frequent bathymetry, coupled with complex\nsemantic content, is crucial for the undermapped shallow seabed areas facing\nintense climatological and anthropogenic pressures. Current methods exploiting\nremote sensing images to derive bathymetry or seabed classes mainly exploit\nnon-open data. This lack of openly accessible benchmark archives prevents the\nwider use of deep learning methods in such applications. To address this issue,\nin this paper we present the MagicBathyNet, which is a benchmark dataset made\nup of image patches of Sentinel2, SPOT-6 and aerial imagery, bathymetry in\nraster format and annotations of seabed classes. MagicBathyNet is then\nexploited to benchmark state-of-the-art methods in learning-based bathymetry\nand pixel-based classification. Dataset, pre-trained weights, and code are\npublicly available at www.magicbathy.eu/magicbathynet.html.\n","authors":["Panagiotis Agrafiotis","Łukasz Janowski","Dimitrios Skarlatos","Begüm Demir"],"pdf_url":"https://arxiv.org/pdf/2405.15477v2.pdf","comment":"5 pages, 3 figures, 5 tables. Accepted at IEEE International\n  Geoscience and Remote Sensing Symposium (IGARSS) 2024"},{"id":"http://arxiv.org/abs/2406.01425v2","updated":"2024-06-04T08:20:27Z","published":"2024-06-03T15:25:45Z","title":"Sensitivity-Informed Augmentation for Robust Segmentation","summary":"  Segmentation is an integral module in many visual computing applications such\nas virtual try-on, medical imaging, autonomous driving, and agricultural\nautomation. These applications often involve either widespread consumer use or\nhighly variable environments, both of which can degrade the quality of visual\nsensor data, whether from a common mobile phone or an expensive satellite\nimaging camera. In addition to external noises like user difference or weather\nconditions, internal noises such as variations in camera quality or lens\ndistortion can affect the performance of segmentation models during both\ndevelopment and deployment. In this work, we present an efficient, adaptable,\nand gradient-free method to enhance the robustness of learning-based\nsegmentation models across training. First, we introduce a novel adaptive\nsensitivity analysis (ASA) using Kernel Inception Distance (KID) on basis\nperturbations to benchmark perturbation sensitivity of pre-trained segmentation\nmodels. Then, we model the sensitivity curve using the adaptive SA and sample\nperturbation hyperparameter values accordingly. Finally, we conduct adversarial\ntraining with the selected perturbation values and dynamically re-evaluate\nrobustness during online training. Our method, implemented end-to-end with\nminimal fine-tuning required, consistently outperforms state-of-the-art data\naugmentation techniques for segmentation. It shows significant improvement in\nboth clean data evaluation and real-world adverse scenario evaluation across\nvarious segmentation datasets used in visual computing and computer graphics\napplications.\n","authors":["Laura Zheng","Wenjie Wei","Tony Wu","Jacob Clements","Shreelekha Revankar","Andre Harrison","Yu Shen","Ming C. Lin"],"pdf_url":"https://arxiv.org/pdf/2406.01425v2.pdf","comment":"10 pages"},{"id":"http://arxiv.org/abs/2406.02077v1","updated":"2024-06-04T07:57:34Z","published":"2024-06-04T07:57:34Z","title":"Multi-target stain normalization for histology slides","summary":"  Traditional staining normalization approaches, e.g. Macenko, typically rely\non the choice of a single representative reference image, which may not\nadequately account for the diverse staining patterns of datasets collected in\npractical scenarios. In this study, we introduce a novel approach that\nleverages multiple reference images to enhance robustness against stain\nvariation. Our method is parameter-free and can be adopted in existing\ncomputational pathology pipelines with no significant changes. We evaluate the\neffectiveness of our method through experiments using a deep-learning pipeline\nfor automatic nuclei segmentation on colorectal images. Our results show that\nby leveraging multiple reference images, better results can be achieved when\ngeneralizing to external data, where the staining can widely differ from the\ntraining set.\n","authors":["Desislav Ivanov","Carlo Alberto Barbano","Marco Grangetto"],"pdf_url":"https://arxiv.org/pdf/2406.02077v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.02074v1","updated":"2024-06-04T07:54:10Z","published":"2024-06-04T07:54:10Z","title":"FaceCom: Towards High-fidelity 3D Facial Shape Completion via\n  Optimization and Inpainting Guidance","summary":"  We propose FaceCom, a method for 3D facial shape completion, which delivers\nhigh-fidelity results for incomplete facial inputs of arbitrary forms. Unlike\nend-to-end shape completion methods based on point clouds or voxels, our\napproach relies on a mesh-based generative network that is easy to optimize,\nenabling it to handle shape completion for irregular facial scans. We first\ntrain a shape generator on a mixed 3D facial dataset containing 2405\nidentities. Based on the incomplete facial input, we fit complete faces using\nan optimization approach under image inpainting guidance. The completion\nresults are refined through a post-processing step. FaceCom demonstrates the\nability to effectively and naturally complete facial scan data with varying\nmissing regions and degrees of missing areas. Our method can be used in medical\nprosthetic fabrication and the registration of deficient scanning data. Our\nexperimental results demonstrate that FaceCom achieves exceptional performance\nin fitting and shape completion tasks. The code is available at\nhttps://github.com/dragonylee/FaceCom.git.\n","authors":["Yinglong Li","Hongyu Wu","Xiaogang Wang","Qingzhao Qin","Yijiao Zhao","Yong wang","Aimin Hao"],"pdf_url":"https://arxiv.org/pdf/2406.02074v1.pdf","comment":"accepted to CVPR2024"},{"id":"http://arxiv.org/abs/2406.02064v1","updated":"2024-06-04T07:45:27Z","published":"2024-06-04T07:45:27Z","title":"Advancing Generalized Transfer Attack with Initialization Derived\n  Bilevel Optimization and Dynamic Sequence Truncation","summary":"  Transfer attacks generate significant interest for real-world black-box\napplications by crafting transferable adversarial examples through surrogate\nmodels. Whereas, existing works essentially directly optimize the single-level\nobjective w.r.t. the surrogate model, which always leads to poor\ninterpretability of attack mechanism and limited generalization performance\nover unknown victim models. In this work, we propose the\n\\textbf{B}il\\textbf{E}vel \\textbf{T}ransfer \\textbf{A}ttac\\textbf{K} (BETAK)\nframework by establishing an initialization derived bilevel optimization\nparadigm, which explicitly reformulates the nested constraint relationship\nbetween the Upper-Level (UL) pseudo-victim attacker and the Lower-Level (LL)\nsurrogate attacker. Algorithmically, we introduce the Hyper Gradient Response\n(HGR) estimation as an effective feedback for the transferability over\npseudo-victim attackers, and propose the Dynamic Sequence Truncation (DST)\ntechnique to dynamically adjust the back-propagation path for HGR and reduce\ncomputational overhead simultaneously. Meanwhile, we conduct detailed\nalgorithmic analysis and provide convergence guarantee to support non-convexity\nof the LL surrogate attacker. Extensive evaluations demonstrate substantial\nimprovement of BETAK (e.g., $\\mathbf{53.41}$\\% increase of attack success rates\nagainst IncRes-v$2_{ens}$) against different victims and defense methods in\ntargeted and untargeted attack scenarios. The source code is available at\nhttps://github.com/callous-youth/BETAK.\n","authors":["Yaohua Liu","Jiaxin Gao","Xuan Liu","Xianghao Jiao","Xin Fan","Risheng Liu"],"pdf_url":"https://arxiv.org/pdf/2406.02064v1.pdf","comment":"Accepted by IJCAI 2024. 10 pages"},{"id":"http://arxiv.org/abs/2406.02058v1","updated":"2024-06-04T07:42:33Z","published":"2024-06-04T07:42:33Z","title":"OpenGaussian: Towards Point-Level 3D Gaussian-based Open Vocabulary\n  Understanding","summary":"  This paper introduces OpenGaussian, a method based on 3D Gaussian Splatting\n(3DGS) capable of 3D point-level open vocabulary understanding. Our primary\nmotivation stems from observing that existing 3DGS-based open vocabulary\nmethods mainly focus on 2D pixel-level parsing. These methods struggle with 3D\npoint-level tasks due to weak feature expressiveness and inaccurate 2D-3D\nfeature associations. To ensure robust feature presentation and 3D point-level\nunderstanding, we first employ SAM masks without cross-frame associations to\ntrain instance features with 3D consistency. These features exhibit both\nintra-object consistency and inter-object distinction. Then, we propose a\ntwo-stage codebook to discretize these features from coarse to fine levels. At\nthe coarse level, we consider the positional information of 3D points to\nachieve location-based clustering, which is then refined at the fine level.\nFinally, we introduce an instance-level 3D-2D feature association method that\nlinks 3D points to 2D masks, which are further associated with 2D CLIP\nfeatures. Extensive experiments, including open vocabulary-based 3D object\nselection, 3D point cloud understanding, click-based 3D object selection, and\nablation studies, demonstrate the effectiveness of our proposed method. Project\npage: https://3d-aigc.github.io/OpenGaussian\n","authors":["Yanmin Wu","Jiarui Meng","Haijie Li","Chenming Wu","Yahao Shi","Xinhua Cheng","Chen Zhao","Haocheng Feng","Errui Ding","Jingdong Wang","Jian Zhang"],"pdf_url":"https://arxiv.org/pdf/2406.02058v1.pdf","comment":"technical report, 15 pages"},{"id":"http://arxiv.org/abs/2406.01489v2","updated":"2024-06-04T07:39:20Z","published":"2024-06-03T16:13:33Z","title":"DA-HFNet: Progressive Fine-Grained Forgery Image Detection and\n  Localization Based on Dual Attention","summary":"  The increasing difficulty in accurately detecting forged images generated by\nAIGC(Artificial Intelligence Generative Content) poses many risks,\nnecessitating the development of effective methods to identify and further\nlocate forged areas. In this paper, to facilitate research efforts, we\nconstruct a DA-HFNet forged image dataset guided by text or image-assisted GAN\nand Diffusion model. Our goal is to utilize a hierarchical progressive network\nto capture forged artifacts at different scales for detection and localization.\nSpecifically, it relies on a dual-attention mechanism to adaptively fuse\nmulti-modal image features in depth, followed by a multi-branch interaction\nnetwork to thoroughly interact image features at different scales and improve\ndetector performance by leveraging dependencies between layers. Additionally,\nwe extract more sensitive noise fingerprints to obtain more prominent forged\nartifact features in the forged areas. Extensive experiments validate the\neffectiveness of our approach, demonstrating significant performance\nimprovements compared to state-of-the-art methods for forged image detection\nand localization.The code and dataset will be released in the future.\n","authors":["Yang Liu","Xiaofei Li","Jun Zhang","Shengze Hu","Jun Lei"],"pdf_url":"https://arxiv.org/pdf/2406.01489v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.01460v2","updated":"2024-06-04T07:36:57Z","published":"2024-06-03T15:49:11Z","title":"MLIP: Efficient Multi-Perspective Language-Image Pretraining with\n  Exhaustive Data Utilization","summary":"  Contrastive Language-Image Pretraining (CLIP) has achieved remarkable\nsuccess, leading to rapid advancements in multimodal studies. However, CLIP\nfaces a notable challenge in terms of inefficient data utilization. It relies\non a single contrastive supervision for each image-text pair during\nrepresentation learning, disregarding a substantial amount of valuable\ninformation that could offer richer supervision. Additionally, the retention of\nnon-informative tokens leads to increased computational demands and time costs,\nparticularly in CLIP's ViT image encoder. To address these issues, we propose\nMulti-Perspective Language-Image Pretraining (MLIP). In MLIP, we leverage the\nfrequency transform's sensitivity to both high and low-frequency variations,\nwhich complements the spatial domain's sensitivity limited to low-frequency\nvariations only. By incorporating frequency transforms and token-level\nalignment, we expand CILP's single supervision into multi-domain and\nmulti-level supervision, enabling a more thorough exploration of informative\nimage features. Additionally, we introduce a token merging method guided by\ncomprehensive semantics from the frequency and spatial domains. This allows us\nto merge tokens to multi-granularity tokens with a controllable compression\nrate to accelerate CLIP. Extensive experiments validate the effectiveness of\nour design.\n","authors":["Yu Zhang","Qi Zhang","Zixuan Gong","Yiwei Shi","Yepeng Liu","Duoqian Miao","Yang Liu","Ke Liu","Kun Yi","Wei Fan","Liang Hu","Changwei Wang"],"pdf_url":"https://arxiv.org/pdf/2406.01460v2.pdf","comment":"ICML 2024"},{"id":"http://arxiv.org/abs/2312.00851v2","updated":"2024-06-04T07:34:05Z","published":"2023-12-01T13:25:16Z","title":"Physics Inspired Criterion for Pruning-Quantization Joint Learning","summary":"  Pruning-quantization joint learning always facilitates the deployment of deep\nneural networks (DNNs) on resource-constrained edge devices. However, most\nexisting methods do not jointly learn a global criterion for pruning and\nquantization in an interpretable way. In this paper, we propose a novel physics\ninspired criterion for pruning-quantization joint learning (PIC-PQ), which is\nexplored from an analogy we first draw between elasticity dynamics (ED) and\nmodel compression (MC). Specifically, derived from Hooke's law in ED, we\nestablish a linear relationship between the filters' importance distribution\nand the filter property (FP) by a learnable deformation scale in the physics\ninspired criterion (PIC). Furthermore, we extend PIC with a relative shift\nvariable for a global view. To ensure feasibility and flexibility, available\nmaximum bitwidth and penalty factor are introduced in quantization bitwidth\nassignment. Experiments on benchmarks of image classification demonstrate that\nPIC-PQ yields a good trade-off between accuracy and bit-operations (BOPs)\ncompression ratio e.g., 54.96X BOPs compression ratio in ResNet56 on CIFAR10\nwith 0.10% accuracy drop and 53.24X in ResNet18 on ImageNet with 0.61% accuracy\ndrop). The code will be available at https://github.com/fanxxxxyi/PIC-PQ.\n","authors":["Weiying Xie","Xiaoyi Fan","Xin Zhang","Yunsong Li","Jie Lei","Leyuan Fang"],"pdf_url":"https://arxiv.org/pdf/2312.00851v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.02038v1","updated":"2024-06-04T07:23:41Z","published":"2024-06-04T07:23:41Z","title":"Leveraging Predicate and Triplet Learning for Scene Graph Generation","summary":"  Scene Graph Generation (SGG) aims to identify entities and predict the\nrelationship triplets \\textit{\\textless subject, predicate, object\\textgreater\n} in visual scenes. Given the prevalence of large visual variations of\nsubject-object pairs even in the same predicate, it can be quite challenging to\nmodel and refine predicate representations directly across such pairs, which is\nhowever a common strategy adopted by most existing SGG methods. We observe that\nvisual variations within the identical triplet are relatively small and certain\nrelation cues are shared in the same type of triplet, which can potentially\nfacilitate the relation learning in SGG. Moreover, for the long-tail problem\nwidely studied in SGG task, it is also crucial to deal with the limited types\nand quantity of triplets in tail predicates. Accordingly, in this paper, we\npropose a Dual-granularity Relation Modeling (DRM) network to leverage\nfine-grained triplet cues besides the coarse-grained predicate ones. DRM\nutilizes contexts and semantics of predicate and triplet with Dual-granularity\nConstraints, generating compact and balanced representations from two\nperspectives to facilitate relation recognition. Furthermore, a\nDual-granularity Knowledge Transfer (DKT) strategy is introduced to transfer\nvariation from head predicates/triplets to tail ones, aiming to enrich the\npattern diversity of tail classes to alleviate the long-tail problem. Extensive\nexperiments demonstrate the effectiveness of our method, which establishes new\nstate-of-the-art performance on Visual Genome, Open Image, and GQA datasets.\nOur code is available at \\url{https://github.com/jkli1998/DRM}\n","authors":["Jiankai Li","Yunhong Wang","Xiefan Guo","Ruijie Yang","Weixin Li"],"pdf_url":"https://arxiv.org/pdf/2406.02038v1.pdf","comment":"CVPR 2024"},{"id":"http://arxiv.org/abs/2406.02037v1","updated":"2024-06-04T07:23:09Z","published":"2024-06-04T07:23:09Z","title":"Multi-Scale Direction-Aware Network for Infrared Small Target Detection","summary":"  Infrared small target detection faces the problem that it is difficult to\neffectively separate the background and the target. Existing deep\nlearning-based methods focus on appearance features and ignore high-frequency\ndirectional features. Therefore, we propose a multi-scale direction-aware\nnetwork (MSDA-Net), which is the first attempt to integrate the high-frequency\ndirectional features of infrared small targets as domain prior knowledge into\nneural networks. Specifically, an innovative multi-directional feature\nawareness (MDFA) module is constructed, which fully utilizes the prior\nknowledge of targets and emphasizes the focus on high-frequency directional\nfeatures. On this basis, combined with the multi-scale local relation learning\n(MLRL) module, a multi-scale direction-aware (MSDA) module is further\nconstructed. The MSDA module promotes the full extraction of local relations at\ndifferent scales and the full perception of key features in different\ndirections. Meanwhile, a high-frequency direction injection (HFDI) module\nwithout training parameters is constructed to inject the high-frequency\ndirectional information of the original image into the network. This helps\nguide the network to pay attention to detailed information such as target edges\nand shapes. In addition, we propose a feature aggregation (FA) structure that\naggregates multi-level features to solve the problem of small targets\ndisappearing in deep feature maps. Furthermore, a lightweight feature alignment\nfusion (FAF) module is constructed, which can effectively alleviate the pixel\noffset existing in multi-level feature map fusion. Extensive experimental\nresults show that our MSDA-Net achieves state-of-the-art (SOTA) results on the\npublic NUDT-SIRST, SIRST and IRSTD-1k datasets.\n","authors":["Jinmiao Zhao","Zelin Shi","Chuang Yu","Yunpeng Liu"],"pdf_url":"https://arxiv.org/pdf/2406.02037v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.16358v2","updated":"2024-06-04T07:22:48Z","published":"2024-03-25T01:44:34Z","title":"ChebMixer: Efficient Graph Representation Learning with MLP Mixer","summary":"  Graph neural networks have achieved remarkable success in learning graph\nrepresentations, especially graph Transformer, which has recently shown\nsuperior performance on various graph mining tasks. However, graph Transformer\ngenerally treats nodes as tokens, which results in quadratic complexity\nregarding the number of nodes during self-attention computation. The graph MLP\nMixer addresses this challenge by using the efficient MLP Mixer technique from\ncomputer vision. However, the time-consuming process of extracting graph tokens\nlimits its performance. In this paper, we present a novel architecture named\nChebMixer, a newly graph MLP Mixer that uses fast Chebyshev polynomials-based\nspectral filtering to extract a sequence of tokens. Firstly, we produce\nmultiscale representations of graph nodes via fast Chebyshev polynomial-based\nspectral filtering. Next, we consider each node's multiscale representations as\na sequence of tokens and refine the node representation with an effective MLP\nMixer. Finally, we aggregate the multiscale representations of nodes through\nChebyshev interpolation. Owing to the powerful representation capabilities and\nfast computational properties of MLP Mixer, we can quickly extract more\ninformative node representations to improve the performance of downstream\ntasks. The experimental results prove our significant improvements in a variety\nof scenarios ranging from graph node classification to medical image\nsegmentation.\n","authors":["Xiaoyan Kui","Haonan Yan","Qinsong Li","Liming Chen","Beiji Zou"],"pdf_url":"https://arxiv.org/pdf/2403.16358v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.02027v1","updated":"2024-06-04T07:06:06Z","published":"2024-06-04T07:06:06Z","title":"Inference Attacks in Machine Learning as a Service: A Taxonomy, Review,\n  and Promising Directions","summary":"  The prosperity of machine learning has also brought people's concerns about\ndata privacy. Among them, inference attacks can implement privacy breaches in\nvarious MLaaS scenarios and model training/prediction phases. Specifically,\ninference attacks can perform privacy inference on undisclosed target training\nsets based on outputs of the target model, including but not limited to\nstatistics, membership, semantics, data representation, etc. For instance,\ninfer whether the target data has the characteristics of AIDS. In addition, the\nrapid development of the machine learning community in recent years, especially\nthe surge of model types and application scenarios, has further stimulated the\ninference attacks' research. Thus, studying inference attacks and analyzing\nthem in depth is urgent and significant. However, there is still a gap in the\nsystematic discussion of inference attacks from taxonomy, global perspective,\nattack, and defense perspectives. This survey provides an in-depth and\ncomprehensive inference of attacks and corresponding countermeasures in\nML-as-a-service based on taxonomy and the latest researches. Without\ncompromising researchers' intuition, we first propose the 3MP taxonomy based on\nthe community research status, trying to normalize the confusing naming system\nof inference attacks. Also, we analyze the pros and cons of each type of\ninference attack, their workflow, countermeasure, and how they interact with\nother attacks. In the end, we point out several promising directions for\nresearchers from a more comprehensive and novel perspective.\n","authors":["Feng Wu","Lei Cui","Shaowen Yao","Shui Yu"],"pdf_url":"https://arxiv.org/pdf/2406.02027v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.02021v1","updated":"2024-06-04T07:00:14Z","published":"2024-06-04T07:00:14Z","title":"MetaMixer Is All You Need","summary":"  Transformer, composed of self-attention and Feed-Forward Network, has\nrevolutionized the landscape of network design across various vision tasks. FFN\nis a versatile operator seamlessly integrated into nearly all AI models to\neffectively harness rich representations. Recent works also show that FFN\nfunctions like key-value memories. Thus, akin to the query-key-value mechanism\nwithin self-attention, FFN can be viewed as a memory network, where the input\nserves as query and the two projection weights operate as keys and values,\nrespectively. We hypothesize that the importance lies in query-key-value\nframework itself rather than in self-attention. To verify this, we propose\nconverting self-attention into a more FFN-like efficient token mixer with only\nconvolutions while retaining query-key-value framework, namely FFNification.\nSpecifically, FFNification replaces query-key and attention coefficient-value\ninteractions with large kernel convolutions and adopts GELU activation function\ninstead of softmax. The derived token mixer, FFNified attention, serves as\nkey-value memories for detecting locally distributed spatial patterns, and\noperates in the opposite dimension to the ConvNeXt block within each\ncorresponding sub-operation of the query-key-value framework. Building upon the\nabove two modules, we present a family of Fast-Forward Networks. Our FFNet\nachieves remarkable performance improvements over previous state-of-the-art\nmethods across a wide range of tasks. The strong and general performance of our\nproposed method validates our hypothesis and leads us to introduce MetaMixer, a\ngeneral mixer architecture that does not specify sub-operations within the\nquery-key-value framework. We show that using only simple operations like\nconvolution and GELU in the MetaMixer can achieve superior performance.\n","authors":["Seokju Yun","Dongheon Lee","Youngmin Ro"],"pdf_url":"https://arxiv.org/pdf/2406.02021v1.pdf","comment":"Code: https://github.com/ysj9909/FFNet"},{"id":"http://arxiv.org/abs/2402.11874v4","updated":"2024-06-04T06:56:43Z","published":"2024-02-19T06:32:23Z","title":"Language-guided Image Reflection Separation","summary":"  This paper studies the problem of language-guided reflection separation,\nwhich aims at addressing the ill-posed reflection separation problem by\nintroducing language descriptions to provide layer content. We propose a\nunified framework to solve this problem, which leverages the cross-attention\nmechanism with contrastive learning strategies to construct the correspondence\nbetween language descriptions and image layers. A gated network design and a\nrandomized training strategy are employed to tackle the recognizable layer\nambiguity. The effectiveness of the proposed method is validated by the\nsignificant performance advantage over existing reflection separation methods\non both quantitative and qualitative comparisons.\n","authors":["Haofeng Zhong","Yuchen Hong","Shuchen Weng","Jinxiu Liang","Boxin Shi"],"pdf_url":"https://arxiv.org/pdf/2402.11874v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.19957v2","updated":"2024-06-04T06:56:39Z","published":"2024-05-30T11:23:01Z","title":"PLA4D: Pixel-Level Alignments for Text-to-4D Gaussian Splatting","summary":"  As text-conditioned diffusion models (DMs) achieve breakthroughs in image,\nvideo, and 3D generation, the research community's focus has shifted to the\nmore challenging task of text-to-4D synthesis, which introduces a temporal\ndimension to generate dynamic 3D objects. In this context, we identify Score\nDistillation Sampling (SDS), a widely used technique for text-to-3D synthesis,\nas a significant hindrance to text-to-4D performance due to its Janus-faced and\ntexture-unrealistic problems coupled with high computational costs. In this\npaper, we propose \\textbf{P}ixel-\\textbf{L}evel \\textbf{A}lignments for\nText-to-\\textbf{4D} Gaussian Splatting (\\textbf{PLA4D}), a novel method that\nutilizes text-to-video frames as explicit pixel alignment targets to generate\nstatic 3D objects and inject motion into them. Specifically, we introduce Focal\nAlignment to calibrate camera poses for rendering and GS-Mesh Contrastive\nLearning to distill geometry priors from rendered image contrasts at the pixel\nlevel. Additionally, we develop Motion Alignment using a deformation network to\ndrive changes in Gaussians and implement Reference Refinement for smooth 4D\nobject surfaces. These techniques enable 4D Gaussian Splatting to align\ngeometry, texture, and motion with generated videos at the pixel level.\nCompared to previous methods, PLA4D produces synthesized outputs with better\ntexture details in less time and effectively mitigates the Janus-faced problem.\nPLA4D is fully implemented using open-source models, offering an accessible,\nuser-friendly, and promising direction for 4D digital content creation. Our\nproject page: https://github.com/MiaoQiaowei/PLA4D.github.io.\n","authors":["Qiaowei Miao","Yawei Luo","Yi Yang"],"pdf_url":"https://arxiv.org/pdf/2405.19957v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.01996v1","updated":"2024-06-04T06:27:48Z","published":"2024-06-04T06:27:48Z","title":"Bayesian Mesh Optimization for Graph Neural Networks to Enhance\n  Engineering Performance Prediction","summary":"  In engineering design, surrogate models are widely employed to replace\ncomputationally expensive simulations by leveraging design variables and\ngeometric parameters from computer-aided design (CAD) models. However, these\nmodels often lose critical information when simplified to lower dimensions and\nface challenges in parameter definition, especially with the complex 3D shapes\ncommonly found in industrial datasets. To address these limitations, we propose\na Bayesian graph neural network (GNN) framework for a 3D deep-learning-based\nsurrogate model that predicts engineering performance by directly learning\ngeometric features from CAD using mesh representation. Our framework determines\nthe optimal size of mesh elements through Bayesian optimization, resulting in a\nhigh-accuracy surrogate model. Additionally, it effectively handles the\nirregular and complex structures of 3D CADs, which differ significantly from\nthe regular and uniform pixel structures of 2D images typically used in deep\nlearning. Experimental results demonstrate that the quality of the mesh\nsignificantly impacts the prediction accuracy of the surrogate model, with an\noptimally sized mesh achieving superior performance. We compare the performance\nof models based on various 3D representations such as voxel, point cloud, and\ngraph, and evaluate the computational costs of Monte Carlo simulation and\nBayesian optimization methods to find the optimal mesh size. We anticipate that\nour proposed framework has the potential to be applied to mesh-based\nsimulations across various engineering fields, leveraging physics-based\ninformation commonly used in computer-aided engineering.\n","authors":["Jangseop Park","Namwoo Kang"],"pdf_url":"https://arxiv.org/pdf/2406.01996v1.pdf","comment":"17 pages, 8 figures, 3 tables"},{"id":"http://arxiv.org/abs/2406.01994v1","updated":"2024-06-04T06:24:07Z","published":"2024-06-04T06:24:07Z","title":"3D Imaging of Complex Specular Surfaces by Fusing Polarimetric and\n  Deflectometric Information","summary":"  Accurate and fast 3D imaging of specular surfaces still poses major\nchallenges for state-of-the-art optical measurement principles. Frequently used\nmethods, such as phase-measuring deflectometry (PMD) or shape-from-polarization\n(SfP), rely on strong assumptions about the measured objects, limiting their\ngeneralizability in broader application areas like medical imaging, industrial\ninspection, virtual reality, or cultural heritage analysis. In this paper, we\nintroduce a measurement principle that utilizes a novel technique to\neffectively encode and decode the information contained in a light field\nreflected off a specular surface. We combine polarization cues from SfP with\ngeometric information obtained from PMD to resolve all arising ambiguities in\nthe 3D measurement. Moreover, our approach removes the unrealistic orthographic\nimaging assumption for SfP, which significantly improves the respective\nresults. We showcase our new technique by demonstrating single-shot and\nmulti-shot measurements on complex-shaped specular surfaces, displaying an\nevaluated accuracy of surface normals below $0.6^\\circ$.\n","authors":["Jiazhang Wang","Oliver Cossairt","Florian Willomitzer"],"pdf_url":"https://arxiv.org/pdf/2406.01994v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.01993v1","updated":"2024-06-04T06:23:27Z","published":"2024-06-04T06:23:27Z","title":"Choroidal Vessel Segmentation on Indocyanine Green Angiography Images\n  via Human-in-the-Loop Labeling","summary":"  Human-in-the-loop (HITL) strategy has been recently introduced into the field\nof medical image processing. Indocyanine green angiography (ICGA) stands as a\nwell-established examination for visualizing choroidal vasculature and\ndetecting chorioretinal diseases. However, the intricate nature of choroidal\nvascular networks makes large-scale manual segmentation of ICGA images\nchallenging. Thus, the study aims to develop a high-precision choroidal vessel\nsegmentation model with limited labor using HITL framework. We utilized a\nmulti-source ICGA dataset, including 55 degree view and ultra-widefield ICGA\n(UWF-ICGA) images for model development. The choroidal vessel network was\npre-segmented by a pre-trained vessel segmentation model, and then manually\nmodified by two ophthalmologists. Choroidal vascular diameter, density,\ncomplexity, tortuosity, and branching angle were automatically quantified based\non the segmentation. We finally conducted four cycles of HITL. One hundred and\nfifty 55 degree view ICGA images were used for the first three cycles (50\nimages per cycle), and twenty UWF-ICGA images for the last cycle. The average\ntime needed to manually correct a pre-segmented ICGA image per cycle reduced\nfrom 20 minutes to 1 minute. High segmentation accuracy has been achieved on\nboth 55 degree view ICGA and UWF-ICGA images. Additionally, the\nmulti-dimensional choroidal vascular parameters were significantly associated\nwith various chorioretinal diseases. Our study not only demonstrated the\nfeasibility of the HITL strategy in improving segmentation performance with\nreduced manual labeling, but also innovatively introduced several risk\npredictors for choroidal abnormalities.\n","authors":["Ruoyu Chen","Ziwei Zhao","Mayinuer Yusufu","Xianwen Shang","Danli Shi","Mingguang He"],"pdf_url":"https://arxiv.org/pdf/2406.01993v1.pdf","comment":"25 pages,4 figures"},{"id":"http://arxiv.org/abs/2406.01987v1","updated":"2024-06-04T06:07:24Z","published":"2024-06-04T06:07:24Z","title":"Dealing with All-stage Missing Modality: Towards A Universal Model with\n  Robust Reconstruction and Personalization","summary":"  Addressing missing modalities presents a critical challenge in multimodal\nlearning. Current approaches focus on developing models that can handle\nmodality-incomplete inputs during inference, assuming that the full set of\nmodalities are available for all the data during training. This reliance on\nfull-modality data for training limits the use of abundant modality-incomplete\nsamples that are often encountered in practical settings. In this paper, we\npropose a robust universal model with modality reconstruction and model\npersonalization, which can effectively tackle the missing modality at both\ntraining and testing stages. Our method leverages a multimodal masked\nautoencoder to reconstruct the missing modality and masked patches\nsimultaneously, incorporating an innovative distribution approximation\nmechanism to fully utilize both modality-complete and modality-incomplete data.\nThe reconstructed modalities then contributes to our designed data-model\nco-distillation scheme to guide the model learning in the presence of missing\nmodalities. Moreover, we propose a CLIP-driven hyper-network to personalize\npartial model parameters, enabling the model to adapt to each distinct missing\nmodality scenario. Our method has been extensively validated on two brain tumor\nsegmentation benchmarks. Experimental results demonstrate the promising\nperformance of our method, which consistently exceeds previous state-of-the-art\napproaches under the all-stage missing modality settings with different missing\nratios. Code will be available.\n","authors":["Yunpeng Zhao","Cheng Chen","Qing You Pang","Quanzheng Li","Carol Tang","Beng-Ti Ang","Yueming Jin"],"pdf_url":"https://arxiv.org/pdf/2406.01987v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.01428v2","updated":"2024-06-04T05:39:15Z","published":"2024-06-03T15:26:06Z","title":"Superhuman performance in urology board questions by an explainable\n  large language model enabled for context integration of the European\n  Association of Urology guidelines: the UroBot study","summary":"  Large Language Models (LLMs) are revolutionizing medical Question-Answering\n(medQA) through extensive use of medical literature. However, their performance\nis often hampered by outdated training data and a lack of explainability, which\nlimits clinical applicability. This study aimed to create and assess UroBot, a\nurology-specialized chatbot, by comparing it with state-of-the-art models and\nthe performance of urologists on urological board questions, ensuring full\nclinician-verifiability. UroBot was developed using OpenAI's GPT-3.5, GPT-4,\nand GPT-4o models, employing retrieval-augmented generation (RAG) and the\nlatest 2023 guidelines from the European Association of Urology (EAU). The\nevaluation included ten runs of 200 European Board of Urology (EBU) In-Service\nAssessment (ISA) questions, with performance assessed by the mean Rate of\nCorrect Answers (RoCA). UroBot-4o achieved an average RoCA of 88.4%, surpassing\nGPT-4o by 10.8%, with a score of 77.6%. It was also clinician-verifiable and\nexhibited the highest run agreement as indicated by Fleiss' Kappa (k = 0.979).\nBy comparison, the average performance of urologists on board questions, as\nreported in the literature, is 68.7%. UroBot's clinician-verifiable nature and\nsuperior accuracy compared to both existing models and urologists on board\nquestions highlight its potential for clinical integration. The study also\nprovides the necessary code and instructions for further development of UroBot.\n","authors":["Martin J. Hetz","Nicolas Carl","Sarah Haggenmüller","Christoph Wies","Maurice Stephan Michel","Frederik Wessels","Titus J. Brinker"],"pdf_url":"https://arxiv.org/pdf/2406.01428v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.01975v1","updated":"2024-06-04T05:19:32Z","published":"2024-06-04T05:19:32Z","title":"Can Dense Connectivity Benefit Outlier Detection? An Odyssey with NAS","summary":"  Recent advances in Out-of-Distribution (OOD) Detection is the driving force\nbehind safe and reliable deployment of Convolutional Neural Networks (CNNs) in\nreal world applications. However, existing studies focus on OOD detection\nthrough confidence score and deep generative model-based methods, without\nconsidering the impact of DNN structures, especially dense connectivity in\narchitecture fabrications. In addition, existing outlier detection approaches\nexhibit high variance in generalization performance, lacking stability and\nconfidence in evaluating and ranking different outlier detectors. In this work,\nwe propose a novel paradigm, Dense Connectivity Search of Outlier Detector\n(DCSOD), that automatically explore the dense connectivity of CNN architectures\non near-OOD detection task using Neural Architecture Search (NAS). We introduce\na hierarchical search space containing versatile convolution operators and\ndense connectivity, allowing a flexible exploration of CNN architectures with\ndiverse connectivity patterns. To improve the quality of evaluation on OOD\ndetection during search, we propose evolving distillation based on our\nmulti-view feature learning explanation. Evolving distillation stabilizes\ntraining for OOD detection evaluation, thus improves the quality of search. We\nthoroughly examine DCSOD on CIFAR benchmarks under OOD detection protocol.\nExperimental results show that DCSOD achieve remarkable performance over widely\nused architectures and previous NAS baselines. Notably, DCSOD achieves\nstate-of-the-art (SOTA) performance on CIFAR benchmark, with AUROC improvement\nof $\\sim$1.0%.\n","authors":["Hao Fu","Tunhou Zhang","Hai Li","Yiran Chen"],"pdf_url":"https://arxiv.org/pdf/2406.01975v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.16451v2","updated":"2024-06-04T05:15:50Z","published":"2024-05-26T06:42:06Z","title":"From Macro to Micro: Boosting micro-expression recognition via\n  pre-training on macro-expression videos","summary":"  Micro-expression recognition (MER) has drawn increasing attention in recent\nyears due to its potential applications in intelligent medical and lie\ndetection. However, the shortage of annotated data has been the major obstacle\nto further improve deep-learning based MER methods. Intuitively, utilizing\nsufficient macro-expression data to promote MER performance seems to be a\nfeasible solution. However, the facial patterns of macro-expressions and\nmicro-expressions are significantly different, which makes naive transfer\nlearning methods difficult to deploy directly. To tacle this issue, we propose\na generalized transfer learning paradigm, called \\textbf{MA}cro-expression\n\\textbf{TO} \\textbf{MI}cro-expression (MA2MI). Under our paradigm, networks can\nlearns the ability to represent subtle facial movement by reconstructing future\nframes. In addition, we also propose a two-branch micro-action network\n(MIACNet) to decouple facial position features and facial action features,\nwhich can help the network more accurately locate facial action locations.\nExtensive experiments on three popular MER benchmarks demonstrate the\nsuperiority of our method.\n","authors":["Hanting Li","Hongjing Niu","Feng Zhao"],"pdf_url":"https://arxiv.org/pdf/2405.16451v2.pdf","comment":"18 pages"},{"id":"http://arxiv.org/abs/2406.01970v1","updated":"2024-06-04T05:06:00Z","published":"2024-06-04T05:06:00Z","title":"The Crystal Ball Hypothesis in diffusion models: Anticipating object\n  positions from initial noise","summary":"  Diffusion models have achieved remarkable success in text-to-image generation\ntasks; however, the role of initial noise has been rarely explored. In this\nstudy, we identify specific regions within the initial noise image, termed\ntrigger patches, that play a key role for object generation in the resulting\nimages. Notably, these patches are ``universal'' and can be generalized across\nvarious positions, seeds, and prompts. To be specific, extracting these patches\nfrom one noise and injecting them into another noise leads to object generation\nin targeted areas. We identify these patches by analyzing the dispersion of\nobject bounding boxes across generated images, leading to the development of a\nposterior analysis technique. Furthermore, we create a dataset consisting of\nGaussian noises labeled with bounding boxes corresponding to the objects\nappearing in the generated images and train a detector that identifies these\npatches from the initial noise. To explain the formation of these patches, we\nreveal that they are outliers in Gaussian noise, and follow distinct\ndistributions through two-sample tests. Finally, we find the misalignment\nbetween prompts and the trigger patch patterns can result in unsuccessful image\ngenerations. The study proposes a reject-sampling strategy to obtain optimal\nnoise, aiming to improve prompt adherence and positional diversity in image\ngeneration.\n","authors":["Yuanhao Ban","Ruochen Wang","Tianyi Zhou","Boqing Gong","Cho-Jui Hsieh","Minhao Cheng"],"pdf_url":"https://arxiv.org/pdf/2406.01970v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.01961v1","updated":"2024-06-04T04:43:58Z","published":"2024-06-04T04:43:58Z","title":"Exploring Real World Map Change Generalization of Prior-Informed HD Map\n  Prediction Models","summary":"  Building and maintaining High-Definition (HD) maps represents a large barrier\nto autonomous vehicle deployment. This, along with advances in modern online\nmap detection models, has sparked renewed interest in the online mapping\nproblem. However, effectively predicting online maps at a high enough quality\nto enable safe, driverless deployments remains a significant challenge. Recent\nwork on these models proposes training robust online mapping systems using low\nquality map priors with synthetic perturbations in an attempt to simulate\nout-of-date HD map priors. In this paper, we investigate how models trained on\nthese synthetically perturbed map priors generalize to performance on\ndeployment-scale, real world map changes. We present a large-scale experimental\nstudy to determine which synthetic perturbations are most useful in\ngeneralizing to real world HD map changes, evaluated using multiple years of\nreal-world autonomous driving data. We show there is still a substantial\nsim2real gap between synthetic prior perturbations and observed real-world\nchanges, which limits the utility of current prior-informed HD map prediction\nmodels.\n","authors":["Samuel M. Bateman","Ning Xu","H. Charles Zhao","Yael Ben Shalom","Vince Gong","Greg Long","Will Maddern"],"pdf_url":"https://arxiv.org/pdf/2406.01961v1.pdf","comment":"Accepted to CVPR 2024, Workshop on Autonomous Driving"},{"id":"http://arxiv.org/abs/2406.00571v2","updated":"2024-06-04T04:36:22Z","published":"2024-06-01T22:58:08Z","title":"An Image Segmentation Model with Transformed Total Variation","summary":"  Based on transformed $\\ell_1$ regularization, transformed total variation\n(TTV) has robust image recovery that is competitive with other nonconvex total\nvariation (TV) regularizers, such as TV$^p$, $0<p<1$. Inspired by its\nperformance, we propose a TTV-regularized Mumford--Shah model with fuzzy\nmembership function for image segmentation. To solve it, we design an\nalternating direction method of multipliers (ADMM) algorithm that utilizes the\ntransformed $\\ell_1$ proximal operator. Numerical experiments demonstrate that\nusing TTV is more effective than classical TV and other nonconvex TV variants\nin image segmentation.\n","authors":["Elisha Dayag","Kevin Bui","Fredrick Park","Jack Xin"],"pdf_url":"https://arxiv.org/pdf/2406.00571v2.pdf","comment":"Accepted to EUSIPCO'24"},{"id":"http://arxiv.org/abs/2406.01956v1","updated":"2024-06-04T04:31:39Z","published":"2024-06-04T04:31:39Z","title":"Enhance Image-to-Image Generation with LLaVA Prompt and Negative Prompt","summary":"  This paper presents a novel approach to enhance image-to-image generation by\nleveraging the multimodal capabilities of the Large Language and Vision\nAssistant (LLaVA). We propose a framework where LLaVA analyzes input images and\ngenerates textual descriptions, hereinafter LLaVA-generated prompts. These\nprompts, along with the original image, are fed into the image-to-image\ngeneration pipeline. This enriched representation guides the generation process\ntowards outputs that exhibit a stronger resemblance to the input image.\nExtensive experiments demonstrate the effectiveness of LLaVA-generated prompts\nin promoting image similarity. We observe a significant improvement in the\nvisual coherence between the generated and input images compared to traditional\nmethods. Future work will explore fine-tuning LLaVA prompts for increased\ncontrol over the creative process. By providing more specific details within\nthe prompts, we aim to achieve a delicate balance between faithfulness to the\noriginal image and artistic expression in the generated outputs.\n","authors":["Zhicheng Ding","Panfeng Li","Qikai Yang","Siyang Li"],"pdf_url":"https://arxiv.org/pdf/2406.01956v1.pdf","comment":"Accepted by 2024 5th International Conference on Information Science,\n  Parallel and Distributed Systems"},{"id":"http://arxiv.org/abs/2406.01954v1","updated":"2024-06-04T04:22:47Z","published":"2024-06-04T04:22:47Z","title":"Plug-and-Play Diffusion Distillation","summary":"  Diffusion models have shown tremendous results in image generation. However,\ndue to the iterative nature of the diffusion process and its reliance on\nclassifier-free guidance, inference times are slow. In this paper, we propose a\nnew distillation approach for guided diffusion models in which an external\nlightweight guide model is trained while the original text-to-image model\nremains frozen. We show that our method reduces the inference computation of\nclassifier-free guided latent-space diffusion models by almost half, and only\nrequires 1\\% trainable parameters of the base model. Furthermore, once trained,\nour guide model can be applied to various fine-tuned, domain-specific versions\nof the base diffusion model without the need for additional training: this\n\"plug-and-play\" functionality drastically improves inference computation while\nmaintaining the visual fidelity of generated images. Empirically, we show that\nour approach is able to produce visually appealing results and achieve a\ncomparable FID score to the teacher with as few as 8 to 16 steps.\n","authors":["Yi-Ting Hsiao","Siavash Khodadadeh","Kevin Duarte","Wei-An Lin","Hui Qu","Mingi Kwon","Ratheesh Kalarot"],"pdf_url":"https://arxiv.org/pdf/2406.01954v1.pdf","comment":"IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)\n  2024"},{"id":"http://arxiv.org/abs/2406.01938v1","updated":"2024-06-04T03:45:08Z","published":"2024-06-04T03:45:08Z","title":"Nutrition Estimation for Dietary Management: A Transformer Approach with\n  Depth Sensing","summary":"  Nutrition estimation is crucial for effective dietary management and overall\nhealth and well-being. Existing methods often struggle with sub-optimal\naccuracy and can be time-consuming. In this paper, we propose NuNet, a\ntransformer-based network designed for nutrition estimation that utilizes both\nRGB and depth information from food images. We have designed and implemented a\nmulti-scale encoder and decoder, along with two types of feature fusion\nmodules, specialized for estimating five nutritional factors. These modules\neffectively balance the efficiency and effectiveness of feature extraction with\nflexible usage of our customized attention mechanisms and fusion strategies.\nOur experimental study shows that NuNet outperforms its variants and existing\nsolutions significantly for nutrition estimation. It achieves an error rate of\n15.65%, the lowest known to us, largely due to our multi-scale architecture and\nfusion modules. This research holds practical values for dietary management\nwith huge potential for transnational research and deployment and could inspire\nother applications involving multiple data types with varying degrees of\nimportance.\n","authors":["Zhengyi Kwan","Wei Zhang","Zhengkui Wang","Aik Beng Ng","Simon See"],"pdf_url":"https://arxiv.org/pdf/2406.01938v1.pdf","comment":"10 pages"},{"id":"http://arxiv.org/abs/2406.01493v2","updated":"2024-06-04T03:33:52Z","published":"2024-06-03T16:20:24Z","title":"Learning Temporally Consistent Video Depth from Video Diffusion Priors","summary":"  This work addresses the challenge of video depth estimation, which expects\nnot only per-frame accuracy but, more importantly, cross-frame consistency.\nInstead of directly developing a depth estimator from scratch, we reformulate\nthe prediction task into a conditional generation problem. This allows us to\nleverage the prior knowledge embedded in existing video generation models,\nthereby reducing learning difficulty and enhancing generalizability.\nConcretely, we study how to tame the public Stable Video Diffusion (SVD) to\npredict reliable depth from input videos using a mixture of image depth and\nvideo depth datasets. We empirically confirm that a procedural training\nstrategy -- first optimizing the spatial layers of SVD and then optimizing the\ntemporal layers while keeping the spatial layers frozen -- yields the best\nresults in terms of both spatial accuracy and temporal consistency. We further\nexamine the sliding window strategy for inference on arbitrarily long videos.\nOur observations indicate a trade-off between efficiency and performance, with\na one-frame overlap already producing favorable results. Extensive experimental\nresults demonstrate the superiority of our approach, termed ChronoDepth, over\nexisting alternatives, particularly in terms of the temporal consistency of the\nestimated depth. Additionally, we highlight the benefits of more consistent\nvideo depth in two practical applications: depth-conditioned video generation\nand novel view synthesis. Our project page is available at\nhttps://jhaoshao.github.io/ChronoDepth/.\n","authors":["Jiahao Shao","Yuanbo Yang","Hongyu Zhou","Youmin Zhang","Yujun Shen","Matteo Poggi","Yiyi Liao"],"pdf_url":"https://arxiv.org/pdf/2406.01493v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.20729v2","updated":"2024-06-04T03:31:49Z","published":"2024-05-31T09:37:39Z","title":"Extreme Point Supervised Instance Segmentation","summary":"  This paper introduces a novel approach to learning instance segmentation\nusing extreme points, i.e., the topmost, leftmost, bottommost, and rightmost\npoints, of each object. These points are readily available in the modern\nbounding box annotation process while offering strong clues for precise\nsegmentation, and thus allows to improve performance at the same annotation\ncost with box-supervised methods. Our work considers extreme points as a part\nof the true instance mask and propagates them to identify potential foreground\nand background points, which are all together used for training a pseudo label\ngenerator. Then pseudo labels given by the generator are in turn used for\nsupervised learning of our final model. On three public benchmarks, our method\nsignificantly outperforms existing box-supervised methods, further narrowing\nthe gap with its fully supervised counterpart. In particular, our model\ngenerates high-quality masks when a target object is separated into multiple\nparts, where previous box-supervised methods often fail.\n","authors":["Hyeonjun Lee","Sehyun Hwang","Suha Kwak"],"pdf_url":"https://arxiv.org/pdf/2405.20729v2.pdf","comment":"Accepted to CVPR 2024"},{"id":"http://arxiv.org/abs/2406.01932v1","updated":"2024-06-04T03:31:42Z","published":"2024-06-04T03:31:42Z","title":"Detecting Endangered Marine Species in Autonomous Underwater Vehicle\n  Imagery Using Point Annotations and Few-Shot Learning","summary":"  One use of Autonomous Underwater Vehicles (AUVs) is the monitoring of\nhabitats associated with threatened, endangered and protected marine species,\nsuch as the handfish of Tasmania, Australia. Seafloor imagery collected by AUVs\ncan be used to identify individuals within their broader habitat context, but\nthe sheer volume of imagery collected can overwhelm efforts to locate rare or\ncryptic individuals. Machine learning models can be used to identify the\npresence of a particular species in images using a trained object detector, but\nthe lack of training examples reduces detection performance, particularly for\nrare species that may only have a small number of examples in the wild. In this\npaper, inspired by recent work in few-shot learning, images and annotations of\ncommon marine species are exploited to enhance the ability of the detector to\nidentify rare and cryptic species. Annotated images of six common marine\nspecies are used in two ways. Firstly, the common species are used in a\npre-training step to allow the backbone to create rich features for marine\nspecies. Secondly, a copy-paste operation is used with the common species\nimages to augment the training data. While annotations for more common marine\nspecies are available in public datasets, they are often in point format, which\nis unsuitable for training an object detector. A popular semantic segmentation\nmodel efficiently generates bounding box annotations for training from the\navailable point annotations. Our proposed framework is applied to AUV images of\nhandfish, increasing average precision by up to 48\\% compared to baseline\nobject detection training. This approach can be applied to other objects with\nlow numbers of annotations and promises to increase the ability to actively\nmonitor threatened, endangered and protected species.\n","authors":["Heather Doig","Oscar Pizarro","Jacquomo Monk","Stefan Williams"],"pdf_url":"https://arxiv.org/pdf/2406.01932v1.pdf","comment":"7 pages, 5 figures. Submitted to the 2024 IEEE/RSJ International\n  Conference on Intelligent Robots and Systems (IROS 2024)"},{"id":"http://arxiv.org/abs/2402.18331v3","updated":"2024-06-04T03:29:24Z","published":"2024-02-28T13:50:46Z","title":"FineDiffusion: Scaling up Diffusion Models for Fine-grained Image\n  Generation with 10,000 Classes","summary":"  The class-conditional image generation based on diffusion models is renowned\nfor generating high-quality and diverse images. However, most prior efforts\nfocus on generating images for general categories, e.g., 1000 classes in\nImageNet-1k. A more challenging task, large-scale fine-grained image\ngeneration, remains the boundary to explore. In this work, we present a\nparameter-efficient strategy, called FineDiffusion, to fine-tune large\npre-trained diffusion models scaling to large-scale fine-grained image\ngeneration with 10,000 categories. FineDiffusion significantly accelerates\ntraining and reduces storage overhead by only fine-tuning tiered class\nembedder, bias terms, and normalization layers' parameters. To further improve\nthe image generation quality of fine-grained categories, we propose a novel\nsampling method for fine-grained image generation, which utilizes\nsuperclass-conditioned guidance, specifically tailored for fine-grained\ncategories, to replace the conventional classifier-free guidance sampling.\nCompared to full fine-tuning, FineDiffusion achieves a remarkable 1.56x\ntraining speed-up and requires storing merely 1.77% of the total model\nparameters, while achieving state-of-the-art FID of 9.776 on image generation\nof 10,000 classes. Extensive qualitative and quantitative experiments\ndemonstrate the superiority of our method compared to other parameter-efficient\nfine-tuning methods. The code and more generated results are available at our\nproject website: https://finediffusion.github.io/.\n","authors":["Ziying Pan","Kun Wang","Gang Li","Feihong He","Yongxuan Lai"],"pdf_url":"https://arxiv.org/pdf/2402.18331v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.03887v4","updated":"2024-06-04T03:25:20Z","published":"2023-07-08T03:42:54Z","title":"Improving Prototypical Visual Explanations with Reward Reweighing,\n  Reselection, and Retraining","summary":"  In recent years, work has gone into developing deep interpretable methods for\nimage classification that clearly attributes a model's output to specific\nfeatures of the data. One such of these methods is the Prototypical Part\nNetwork (ProtoPNet), which attempts to classify images based on meaningful\nparts of the input. While this architecture is able to produce visually\ninterpretable classifications, it often learns to classify based on parts of\nthe image that are not semantically meaningful. To address this problem, we\npropose the Reward Reweighing, Reselecting, and Retraining (R3) post-processing\nframework, which performs three additional corrective updates to a pretrained\nProtoPNet in an offline and efficient manner. The first two steps involve\nlearning a reward model based on collected human feedback and then aligning the\nprototypes with human preferences. The final step is retraining, which realigns\nthe base features and the classifier layer of the original model with the\nupdated prototypes. We find that our R3 framework consistently improves both\nthe interpretability and the predictive accuracy of ProtoPNet and its variants.\n","authors":["Aaron J. Li","Robin Netzorg","Zhihan Cheng","Zhuoqin Zhang","Bin Yu"],"pdf_url":"https://arxiv.org/pdf/2307.03887v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.07448v2","updated":"2024-06-04T03:15:51Z","published":"2024-04-11T03:08:53Z","title":"Transferable and Principled Efficiency for Open-Vocabulary Segmentation","summary":"  Recent success of pre-trained foundation vision-language models makes\nOpen-Vocabulary Segmentation (OVS) possible. Despite the promising performance,\nthis approach introduces heavy computational overheads for two challenges: 1)\nlarge model sizes of the backbone; 2) expensive costs during the fine-tuning.\nThese challenges hinder this OVS strategy from being widely applicable and\naffordable in real-world scenarios. Although traditional methods such as model\ncompression and efficient fine-tuning can address these challenges, they often\nrely on heuristics. This means that their solutions cannot be easily\ntransferred and necessitate re-training on different models, which comes at a\ncost. In the context of efficient OVS, we target achieving performance that is\ncomparable to or even better than prior OVS works based on large\nvision-language foundation models, by utilizing smaller models that incur lower\ntraining costs. The core strategy is to make our efficiency principled and thus\nseamlessly transferable from one OVS framework to others without further\ncustomization. Comprehensive experiments on diverse OVS benchmarks demonstrate\nour superior trade-off between segmentation accuracy and computation costs over\nprevious works. Our code is available on https://github.com/Xujxyang/OpenTrans\n","authors":["Jingxuan Xu","Wuyang Chen","Yao Zhao","Yunchao Wei"],"pdf_url":"https://arxiv.org/pdf/2404.07448v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.00490v2","updated":"2024-06-04T03:15:41Z","published":"2024-06-01T16:41:24Z","title":"Research on the Application of Computer Vision Based on Deep Learning in\n  Autonomous Driving Technology","summary":"  This research aims to explore the application of deep learning in autonomous\ndriving computer vision technology and its impact on improving system\nperformance. By using advanced technologies such as convolutional neural\nnetworks (CNN), multi-task joint learning methods, and deep reinforcement\nlearning, this article analyzes in detail the application of deep learning in\nimage recognition, real-time target tracking and classification, environment\nperception and decision support, and path planning and navigation. Application\nprocess in key areas. Research results show that the proposed system has an\naccuracy of over 98% in image recognition, target tracking and classification,\nand also demonstrates efficient performance and practicality in environmental\nperception and decision support, path planning and navigation. The conclusion\npoints out that deep learning technology can significantly improve the accuracy\nand real-time response capabilities of autonomous driving systems. Although\nthere are still challenges in environmental perception and decision support,\nwith the advancement of technology, it is expected to achieve wider\napplications and greater capabilities in the future. potential.\n","authors":["Jingyu Zhang","Jin Cao","Jinghao Chang","Xinjin Li","Houze Liu","Zhenglin Li"],"pdf_url":"https://arxiv.org/pdf/2406.00490v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.01210v2","updated":"2024-06-04T03:14:19Z","published":"2024-06-03T11:24:15Z","title":"GeminiFusion: Efficient Pixel-wise Multimodal Fusion for Vision\n  Transformer","summary":"  Cross-modal transformers have demonstrated superiority in various vision\ntasks by effectively integrating different modalities. This paper first\ncritiques prior token exchange methods which replace less informative tokens\nwith inter-modal features, and demonstrate exchange based methods underperform\ncross-attention mechanisms, while the computational demand of the latter\ninevitably restricts its use with longer sequences. To surmount the\ncomputational challenges, we propose GeminiFusion, a pixel-wise fusion approach\nthat capitalizes on aligned cross-modal representations. GeminiFusion elegantly\ncombines intra-modal and inter-modal attentions, dynamically integrating\ncomplementary information across modalities. We employ a layer-adaptive noise\nto adaptively control their interplay on a per-layer basis, thereby achieving a\nharmonized fusion process. Notably, GeminiFusion maintains linear complexity\nwith respect to the number of input tokens, ensuring this multimodal framework\noperates with efficiency comparable to unimodal networks. Comprehensive\nevaluations across multimodal image-to-image translation, 3D object detection\nand arbitrary-modal semantic segmentation tasks, including RGB, depth, LiDAR,\nevent data, etc. demonstrate the superior performance of our GeminiFusion\nagainst leading-edge techniques. The PyTorch code is available at\nhttps://github.com/JiaDingCN/GeminiFusion\n","authors":["Ding Jia","Jianyuan Guo","Kai Han","Han Wu","Chao Zhang","Chang Xu","Xinghao Chen"],"pdf_url":"https://arxiv.org/pdf/2406.01210v2.pdf","comment":"Accepted by ICML 2024, code and models are available at\n  https://github.com/JiaDingCN/GeminiFusion"},{"id":"http://arxiv.org/abs/2405.05553v2","updated":"2024-06-04T03:13:03Z","published":"2024-05-09T05:23:34Z","title":"Towards Robust Physical-world Backdoor Attacks on Lane Detection","summary":"  Deep learning-based lane detection (LD) plays a critical role in autonomous\ndriving systems, such as adaptive cruise control. However, it is vulnerable to\nbackdoor attacks. Existing backdoor attack methods on LD exhibit limited\neffectiveness in dynamic real-world scenarios, primarily because they fail to\nconsider dynamic scene factors, including changes in driving perspectives\n(e.g., viewpoint transformations) and environmental conditions (e.g., weather\nor lighting changes). To tackle this issue, this paper introduces BadLANE, a\ndynamic scene adaptation backdoor attack for LD designed to withstand changes\nin real-world dynamic scene factors. To address the challenges posed by\nchanging driving perspectives, we propose an amorphous trigger pattern composed\nof shapeless pixels. This trigger design allows the backdoor to be activated by\nvarious forms or shapes of mud spots or pollution on the road or lens, enabling\nadaptation to changes in vehicle observation viewpoints during driving. To\nmitigate the effects of environmental changes, we design a meta-learning\nframework to train meta-generators tailored to different environmental\nconditions. These generators produce meta-triggers that incorporate diverse\nenvironmental information, such as weather or lighting conditions, as the\ninitialization of the trigger patterns for backdoor implantation, thus enabling\nadaptation to dynamic environments. Extensive experiments on various commonly\nused LD models in both digital and physical domains validate the effectiveness\nof our attacks, outperforming other baselines significantly (+25.15% on average\nin Attack Success Rate). Our codes will be available upon paper publication.\n","authors":["Xinwei Zhang","Aishan Liu","Tianyuan Zhang","Siyuan Liang","Xianglong Liu"],"pdf_url":"https://arxiv.org/pdf/2405.05553v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.01920v1","updated":"2024-06-04T03:04:21Z","published":"2024-06-04T03:04:21Z","title":"CODE: Contrasting Self-generated Description to Combat Hallucination in\n  Large Multi-modal Models","summary":"  Large Multi-modal Models (LMMs) have recently demonstrated remarkable\nabilities in visual context understanding and coherent response generation.\nHowever, alongside these advancements, the issue of hallucinations has emerged\nas a significant challenge, producing erroneous responses that are unrelated to\nthe visual contents. In this paper, we introduce a novel contrastive-based\ndecoding method, COuntering DEscription Contrastive Decoding (CODE), which\nleverages self-generated descriptions as contrasting references during the\ndecoding phase of LMMs to address hallucination issues. CODE utilizes the\ncomprehensive descriptions from model itself as visual counterpart to correct\nand improve response alignment with actual visual content. By dynamically\nadjusting the information flow and distribution of next-token predictions in\nthe LMM's vocabulary, CODE enhances the coherence and informativeness of\ngenerated responses. Extensive experiments demonstrate that our method\nsignificantly reduces hallucinations and improves cross-modal consistency\nacross various benchmarks and cutting-edge LMMs. Our method provides a simple\nyet effective decoding strategy that can be integrated to existing LMM\nframeworks without additional training.\n","authors":["Junho Kim","Hyunjun Kim","Yeonju Kim","Yong Man Ro"],"pdf_url":"https://arxiv.org/pdf/2406.01920v1.pdf","comment":"Project page: https://ivy-lvlm.github.io/CODE/"},{"id":"http://arxiv.org/abs/2405.01016v3","updated":"2024-06-04T03:03:39Z","published":"2024-05-02T05:35:10Z","title":"Addressing Diverging Training Costs using Local Restoration for Precise\n  Bird's Eye View Map Construction","summary":"  Recent advancements in Bird's Eye View (BEV) fusion for map construction have\ndemonstrated remarkable mapping of urban environments. However, their deep and\nbulky architecture incurs substantial amounts of backpropagation memory and\ncomputing latency. Consequently, the problem poses an unavoidable bottleneck in\nconstructing high-resolution (HR) BEV maps, as their large-sized features cause\nsignificant increases in costs including GPU memory consumption and computing\nlatency, named diverging training costs issue. Affected by the problem, most\nexisting methods adopt low-resolution (LR) BEV and struggle to estimate the\nprecise locations of urban scene components like road lanes, and sidewalks. As\nthe imprecision leads to risky self-driving, the diverging training costs issue\nhas to be resolved. In this paper, we address the issue with our novel Trumpet\nNeural Network (TNN) mechanism. The framework utilizes LR BEV space and outputs\nan up-sampled semantic BEV map to create a memory-efficient pipeline. To this\nend, we introduce Local Restoration of BEV representation. Specifically, the\nup-sampled BEV representation has severely aliased, blocky signals, and thick\nsemantic labels. Our proposed Local Restoration restores the signals and thins\n(or narrows down) the width of the labels. Our extensive experiments show that\nthe TNN mechanism provides a plug-and-play memory-efficient pipeline, thereby\nenabling the effective estimation of real-sized (or precise) semantic labels\nfor BEV map construction.\n","authors":["Minsu Kim","Giseop Kim","Sunwook Choi"],"pdf_url":"https://arxiv.org/pdf/2405.01016v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.12793v2","updated":"2024-06-04T03:01:05Z","published":"2023-10-19T14:50:46Z","title":"OODRobustBench: a Benchmark and Large-Scale Analysis of Adversarial\n  Robustness under Distribution Shift","summary":"  Existing works have made great progress in improving adversarial robustness,\nbut typically test their method only on data from the same distribution as the\ntraining data, i.e. in-distribution (ID) testing. As a result, it is unclear\nhow such robustness generalizes under input distribution shifts, i.e.\nout-of-distribution (OOD) testing. This omission is concerning as such\ndistribution shifts are unavoidable when methods are deployed in the wild. To\naddress this issue we propose a benchmark named OODRobustBench to\ncomprehensively assess OOD adversarial robustness using 23 dataset-wise shifts\n(i.e. naturalistic shifts in input distribution) and 6 threat-wise shifts\n(i.e., unforeseen adversarial threat models). OODRobustBench is used to assess\n706 robust models using 60.7K adversarial evaluations. This large-scale\nanalysis shows that: 1) adversarial robustness suffers from a severe OOD\ngeneralization issue; 2) ID robustness correlates strongly with OOD robustness\nin a positive linear way. The latter enables the prediction of OOD robustness\nfrom ID robustness. We then predict and verify that existing methods are\nunlikely to achieve high OOD robustness. Novel methods are therefore required\nto achieve OOD robustness beyond our prediction. To facilitate the development\nof these methods, we investigate a wide range of techniques and identify\nseveral promising directions. Code and models are available at:\nhttps://github.com/OODRobustBench/OODRobustBench.\n","authors":["Lin Li","Yifei Wang","Chawin Sitawarin","Michael Spratling"],"pdf_url":"https://arxiv.org/pdf/2310.12793v2.pdf","comment":"ICML2024, and ICLR2024 DMLR workshop"},{"id":"http://arxiv.org/abs/2406.01917v1","updated":"2024-06-04T02:59:36Z","published":"2024-06-04T02:59:36Z","title":"GOMAA-Geo: GOal Modality Agnostic Active Geo-localization","summary":"  We consider the task of active geo-localization (AGL) in which an agent uses\na sequence of visual cues observed during aerial navigation to find a target\nspecified through multiple possible modalities. This could emulate a UAV\ninvolved in a search-and-rescue operation navigating through an area, observing\na stream of aerial images as it goes. The AGL task is associated with two\nimportant challenges. Firstly, an agent must deal with a goal specification in\none of multiple modalities (e.g., through a natural language description) while\nthe search cues are provided in other modalities (aerial imagery). The second\nchallenge is limited localization time (e.g., limited battery life, urgency) so\nthat the goal must be localized as efficiently as possible, i.e. the agent must\neffectively leverage its sequentially observed aerial views when searching for\nthe goal. To address these challenges, we propose GOMAA-Geo - a goal modality\nagnostic active geo-localization agent - for zero-shot generalization between\ndifferent goal modalities. Our approach combines cross-modality contrastive\nlearning to align representations across modalities with supervised foundation\nmodel pretraining and reinforcement learning to obtain highly effective\nnavigation and localization policies. Through extensive evaluations, we show\nthat GOMAA-Geo outperforms alternative learnable approaches and that it\ngeneralizes across datasets - e.g., to disaster-hit areas without seeing a\nsingle disaster scenario during training - and goal modalities - e.g., to\nground-level imagery or textual descriptions, despite only being trained with\ngoals specified as aerial views. Code and models are publicly available at\nhttps://github.com/mvrl/GOMAA-Geo/tree/main.\n","authors":["Anindya Sarkar","Srikumar Sastry","Aleksis Pirinen","Chongjie Zhang","Nathan Jacobs","Yevgeniy Vorobeychik"],"pdf_url":"https://arxiv.org/pdf/2406.01917v1.pdf","comment":"23 pages, 17 figures"},{"id":"http://arxiv.org/abs/2406.01916v1","updated":"2024-06-04T02:57:09Z","published":"2024-06-04T02:57:09Z","title":"FastLGS: Speeding up Language Embedded Gaussians with Feature Grid\n  Mapping","summary":"  The semantically interactive radiance field has always been an appealing task\nfor its potential to facilitate user-friendly and automated real-world 3D scene\nunderstanding applications. However, it is a challenging task to achieve high\nquality, efficiency and zero-shot ability at the same time with semantics in\nradiance fields. In this work, we present FastLGS, an approach that supports\nreal-time open-vocabulary query within 3D Gaussian Splatting (3DGS) under high\nresolution. We propose the semantic feature grid to save multi-view CLIP\nfeatures which are extracted based on Segment Anything Model (SAM) masks, and\nmap the grids to low dimensional features for semantic field training through\n3DGS. Once trained, we can restore pixel-aligned CLIP embeddings through\nfeature grids from rendered features for open-vocabulary queries. Comparisons\nwith other state-of-the-art methods prove that FastLGS can achieve the first\nplace performance concerning both speed and accuracy, where FastLGS is 98x\nfaster than LERF and 4x faster than LangSplat. Meanwhile, experiments show that\nFastLGS is adaptive and compatible with many downstream tasks, such as 3D\nsegmentation and 3D object inpainting, which can be easily applied to other 3D\nmanipulation systems.\n","authors":["Yuzhou Ji","He Zhu","Junshu Tang","Wuyi Liu","Zhizhong Zhang","Yuan Xie","Lizhuang Ma","Xin Tan"],"pdf_url":"https://arxiv.org/pdf/2406.01916v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.01914v1","updated":"2024-06-04T02:51:26Z","published":"2024-06-04T02:51:26Z","title":"HPE-CogVLM: New Head Pose Grounding Task Exploration on Vision Language\n  Model","summary":"  Head pose estimation (HPE) task requires a sophisticated understanding of 3D\nspatial relationships and precise numerical output of yaw, pitch, and roll\nEuler angles. Previous HPE studies are mainly based on Non-large language\nmodels (Non-LLMs), which rely on close-up human heads cropped from the full\nimage as inputs and lack robustness in real-world scenario. In this paper, we\npresent a novel framework to enhance the HPE prediction task by leveraging the\nvisual grounding capability of CogVLM. CogVLM is a vision language model (VLM)\nwith grounding capability of predicting object bounding boxes (BBoxes), which\nenables HPE training and prediction using full image information input. To\nintegrate the HPE task into the VLM, we first cop with the catastrophic\nforgetting problem in large language models (LLMs) by investigating the\nrehearsal ratio in the data rehearsal method. Then, we propose and validate a\nLoRA layer-based model merging method, which keeps the integrity of parameters,\nto enhance the HPE performance in the framework. The results show our\nHPE-CogVLM achieves a 31.5\\% reduction in Mean Absolute Error for HPE\nprediction over the current Non-LLM based state-of-the-art in cross-dataset\nevaluation. Furthermore, we compare our LoRA layer-based model merging method\nwith LoRA fine-tuning only and other merging methods in CogVLM. The results\ndemonstrate our framework outperforms them in all HPE metrics.\n","authors":["Yu Tian","Tianqi Shao","Tsukasa Demizu","Xuyang Wu","Hsin-Tai Wu"],"pdf_url":"https://arxiv.org/pdf/2406.01914v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.02611v3","updated":"2024-06-04T02:47:19Z","published":"2024-03-05T02:59:35Z","title":"A Unified Framework for Microscopy Defocus Deblur with Multi-Pyramid\n  Transformer and Contrastive Learning","summary":"  Defocus blur is a persistent problem in microscope imaging that poses harm to\npathology interpretation and medical intervention in cell microscopy and\nmicroscope surgery. To address this problem, a unified framework including the\nmulti-pyramid transformer (MPT) and extended frequency contrastive\nregularization (EFCR) is proposed to tackle two outstanding challenges in\nmicroscopy deblur: longer attention span and data deficiency. The MPT employs\nan explicit pyramid structure at each network stage that integrates the\ncross-scale window attention (CSWA), the intra-scale channel attention (ISCA),\nand the feature-enhancing feed-forward network (FEFN) to capture long-range\ncross-scale spatial interaction and global channel context. The EFCR addresses\nthe data deficiency problem by exploring latent deblur signals from different\nfrequency bands. It also enables deblur knowledge transfer to learn\ncross-domain information from extra data, improving deblur performance for\nlabeled and unlabeled data. Extensive experiments and downstream task\nvalidation show the framework achieves state-of-the-art performance across\nmultiple datasets. Project page: https://github.com/PieceZhang/MPT-CataBlur.\n","authors":["Yuelin Zhang","Pengyu Zheng","Wanquan Yan","Chengyu Fang","Shing Shin Cheng"],"pdf_url":"https://arxiv.org/pdf/2403.02611v3.pdf","comment":"CVPR 2024"},{"id":"http://arxiv.org/abs/2406.01906v1","updated":"2024-06-04T02:28:51Z","published":"2024-06-04T02:28:51Z","title":"ProGEO: Generating Prompts through Image-Text Contrastive Learning for\n  Visual Geo-localization","summary":"  Visual Geo-localization (VG) refers to the process to identify the location\ndescribed in query images, which is widely applied in robotics field and\ncomputer vision tasks, such as autonomous driving, metaverse, augmented\nreality, and SLAM. In fine-grained images lacking specific text descriptions,\ndirectly applying pure visual methods to represent neighborhood features often\nleads to the model focusing on overly fine-grained features, unable to fully\nmine the semantic information in the images. Therefore, we propose a two-stage\ntraining method to enhance visual performance and use contrastive learning to\nmine challenging samples. We first leverage the multi-modal description\ncapability of CLIP (Contrastive Language-Image Pretraining) to create a set of\nlearnable text prompts for each geographic image feature to form vague\ndescriptions. Then, by utilizing dynamic text prompts to assist the training of\nthe image encoder, we enable the image encoder to learn better and more\ngeneralizable visual features. This strategy of applying text to purely visual\ntasks addresses the challenge of using multi-modal models for geographic\nimages, which often suffer from a lack of precise descriptions, making them\ndifficult to utilize widely. We validate the effectiveness of the proposed\nstrategy on several large-scale visual geo-localization datasets, and our\nmethod achieves competitive results on multiple visual geo-localization\ndatasets. Our code and model are available at\nhttps://github.com/Chain-Mao/ProGEO.\n","authors":["Chen Mao","Jingqi Hu"],"pdf_url":"https://arxiv.org/pdf/2406.01906v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2212.10888v2","updated":"2024-06-04T02:11:25Z","published":"2022-12-21T09:58:14Z","title":"A Survey of Mix-based Data Augmentation: Taxonomy, Methods,\n  Applications, and Explainability","summary":"  Data augmentation (DA) is indispensable in modern machine learning and deep\nneural networks. The basic idea of DA is to construct new training data to\nimprove the model's generalization by adding slightly disturbed versions of\nexisting data or synthesizing new data. This survey comprehensively reviews a\ncrucial subset of DA techniques, namely Mix-based Data Augmentation (MixDA),\nwhich generates novel samples by combining multiple examples. In contrast to\ntraditional DA approaches that operate on single samples or entire datasets,\nMixDA stands out due to its effectiveness, simplicity, flexibility,\ncomputational efficiency, theoretical foundation, and broad applicability. We\nbegin by introducing a novel taxonomy that categorizes MixDA into Mixup-based,\nCutmix-based, and mixture approaches based on a hierarchical perspective of the\ndata mixing operation. Subsequently, we provide an in-depth review of various\nMixDA techniques, focusing on their underlying motivations. Owing to its\nversatility, MixDA has penetrated a wide range of applications, which we also\nthoroughly investigate in this survey. Moreover, we delve into the underlying\nmechanisms of MixDA's effectiveness by examining its impact on model\ngeneralization and calibration while providing insights into the model's\nbehavior by analyzing the inherent properties of MixDA. Finally, we\nrecapitulate the critical findings and fundamental challenges of current MixDA\nstudies while outlining the potential directions for future works. Different\nfrom previous related surveys that focus on DA approaches in specific domains\n(e.g., CV and NLP) or only review a limited subset of MixDA studies, we are the\nfirst to provide a systematical survey of MixDA, covering its taxonomy,\nmethodology, application, and explainability. Furthermore, we provide promising\ndirections for researchers interested in this exciting area.\n","authors":["Chengtai Cao","Fan Zhou","Yurou Dai","Jianping Wang","Kunpeng Zhang"],"pdf_url":"https://arxiv.org/pdf/2212.10888v2.pdf","comment":"41 pages, 4 figures, and 5 tables"},{"id":"http://arxiv.org/abs/2406.01900v1","updated":"2024-06-04T02:05:57Z","published":"2024-06-04T02:05:57Z","title":"Follow-Your-Emoji: Fine-Controllable and Expressive Freestyle Portrait\n  Animation","summary":"  We present Follow-Your-Emoji, a diffusion-based framework for portrait\nanimation, which animates a reference portrait with target landmark sequences.\nThe main challenge of portrait animation is to preserve the identity of the\nreference portrait and transfer the target expression to this portrait while\nmaintaining temporal consistency and fidelity. To address these challenges,\nFollow-Your-Emoji equipped the powerful Stable Diffusion model with two\nwell-designed technologies. Specifically, we first adopt a new explicit motion\nsignal, namely expression-aware landmark, to guide the animation process. We\ndiscover this landmark can not only ensure the accurate motion alignment\nbetween the reference portrait and target motion during inference but also\nincrease the ability to portray exaggerated expressions (i.e., large pupil\nmovements) and avoid identity leakage. Then, we propose a facial fine-grained\nloss to improve the model's ability of subtle expression perception and\nreference portrait appearance reconstruction by using both expression and\nfacial masks. Accordingly, our method demonstrates significant performance in\ncontrolling the expression of freestyle portraits, including real humans,\ncartoons, sculptures, and even animals. By leveraging a simple and effective\nprogressive generation strategy, we extend our model to stable long-term\nanimation, thus increasing its potential application value. To address the lack\nof a benchmark for this field, we introduce EmojiBench, a comprehensive\nbenchmark comprising diverse portrait images, driving videos, and landmarks. We\nshow extensive evaluations on EmojiBench to verify the superiority of\nFollow-Your-Emoji.\n","authors":["Yue Ma","Hongyu Liu","Hongfa Wang","Heng Pan","Yingqing He","Junkun Yuan","Ailing Zeng","Chengfei Cai","Heung-Yeung Shum","Wei Liu","Qifeng Chen"],"pdf_url":"https://arxiv.org/pdf/2406.01900v1.pdf","comment":"Project Page: https://follow-your-emoji.github.io/"},{"id":"http://arxiv.org/abs/2406.01894v1","updated":"2024-06-04T01:58:32Z","published":"2024-06-04T01:58:32Z","title":"SVASTIN: Sparse Video Adversarial Attack via Spatio-Temporal Invertible\n  Neural Networks","summary":"  Robust and imperceptible adversarial video attack is challenging due to the\nspatial and temporal characteristics of videos. The existing video adversarial\nattack methods mainly take a gradient-based approach and generate adversarial\nvideos with noticeable perturbations. In this paper, we propose a novel Sparse\nAdversarial Video Attack via Spatio-Temporal Invertible Neural Networks\n(SVASTIN) to generate adversarial videos through spatio-temporal feature space\ninformation exchanging. It consists of a Guided Target Video Learning (GTVL)\nmodule to balance the perturbation budget and optimization speed and a\nSpatio-Temporal Invertible Neural Network (STIN) module to perform\nspatio-temporal feature space information exchanging between a source video and\nthe target feature tensor learned by GTVL module. Extensive experiments on\nUCF-101 and Kinetics-400 demonstrate that our proposed SVASTIN can generate\nadversarial examples with higher imperceptibility than the state-of-the-art\nmethods with the higher fooling rate. Code is available at\n\\href{https://github.com/Brittany-Chen/SVASTIN}{https://github.com/Brittany-Chen/SVASTIN}.\n","authors":["Yi Pan","Jun-Jie Huang","Zihan Chen","Wentao Zhao","Ziyue Wang"],"pdf_url":"https://arxiv.org/pdf/2406.01894v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.01884v1","updated":"2024-06-04T01:36:29Z","published":"2024-06-04T01:36:29Z","title":"Rank-based No-reference Quality Assessment for Face Swapping","summary":"  Face swapping has become a prominent research area in computer vision and\nimage processing due to rapid technological advancements. The metric of\nmeasuring the quality in most face swapping methods relies on several distances\nbetween the manipulated images and the source image, or the target image, i.e.,\nthere are suitable known reference face images. Therefore, there is still a gap\nin accurately assessing the quality of face interchange in reference-free\nscenarios. In this study, we present a novel no-reference image quality\nassessment (NR-IQA) method specifically designed for face swapping, addressing\nthis issue by constructing a comprehensive large-scale dataset, implementing a\nmethod for ranking image quality based on multiple facial attributes, and\nincorporating a Siamese network based on interpretable qualitative comparisons.\nOur model demonstrates the state-of-the-art performance in the quality\nassessment of swapped faces, providing coarse- and fine-grained. Enhanced by\nthis metric, an improved face-swapping model achieved a more advanced level\nwith respect to expressions and poses. Extensive experiments confirm the\nsuperiority of our method over existing general no-reference image quality\nassessment metrics and the latest metric of facial image quality assessment,\nmaking it well suited for evaluating face swapping images in real-world\nscenarios.\n","authors":["Xinghui Zhou","Wenbo Zhou","Tianyi Wei","Shen Chen","Taiping Yao","Shouhong Ding","Weiming Zhang","Nenghai Yu"],"pdf_url":"https://arxiv.org/pdf/2406.01884v1.pdf","comment":"8 pages, 5 figures"},{"id":"http://arxiv.org/abs/2312.09066v2","updated":"2024-06-04T01:27:35Z","published":"2023-12-14T16:04:14Z","title":"CMOSE: Comprehensive Multi-Modality Online Student Engagement Dataset\n  with High-Quality Labels","summary":"  Online learning is a rapidly growing industry. However, a major doubt about\nonline learning is whether students are as engaged as they are in face-to-face\nclasses. An engagement recognition system can notify the instructors about the\nstudents condition and improve the learning experience. Current challenges in\nengagement detection involve poor label quality, extreme data imbalance, and\nintra-class variety - the variety of behaviors at a certain engagement level.\nTo address these problems, we present the CMOSE dataset, which contains a large\nnumber of data from different engagement levels and high-quality labels\nannotated according to psychological advice. We also propose a training\nmechanism MocoRank to handle the intra-class variety and the ordinal pattern of\ndifferent degrees of engagement classes. MocoRank outperforms prior engagement\ndetection frameworks, achieving a 1.32% increase in overall accuracy and 5.05%\nimprovement in average accuracy. Further, we demonstrate the effectiveness of\nmulti-modality in engagement detection by combining video features with speech\nand audio features. The data transferability experiments also state that the\nproposed CMOSE dataset provides superior label quality and behavior diversity.\n","authors":["Chi-hsuan Wu","Shih-yang Liu","Xijie Huang","Xingbo Wang","Rong Zhang","Luca Minciullo","Wong Kai Yiu","Kenny Kwan","Kwang-Ting Cheng"],"pdf_url":"https://arxiv.org/pdf/2312.09066v2.pdf","comment":"11 pages"},{"id":"http://arxiv.org/abs/2406.00934v2","updated":"2024-06-04T01:26:10Z","published":"2024-06-03T02:12:27Z","title":"LanEvil: Benchmarking the Robustness of Lane Detection to Environmental\n  Illusions","summary":"  Lane detection (LD) is an essential component of autonomous driving systems,\nproviding fundamental functionalities like adaptive cruise control and\nautomated lane centering. Existing LD benchmarks primarily focus on evaluating\ncommon cases, neglecting the robustness of LD models against environmental\nillusions such as shadows and tire marks on the road. This research gap poses\nsignificant safety challenges since these illusions exist naturally in\nreal-world traffic situations. For the first time, this paper studies the\npotential threats caused by these environmental illusions to LD and establishes\nthe first comprehensive benchmark LanEvil for evaluating the robustness of LD\nagainst this natural corruption. We systematically design 14 prevalent yet\ncritical types of environmental illusions (e.g., shadow, reflection) that cover\na wide spectrum of real-world influencing factors in LD tasks. Based on\nreal-world environments, we create 94 realistic and customizable 3D cases using\nthe widely used CARLA simulator, resulting in a dataset comprising 90,292\nsampled images. Through extensive experiments, we benchmark the robustness of\npopular LD methods using LanEvil, revealing substantial performance degradation\n(-5.37% Accuracy and -10.70% F1-Score on average), with shadow effects posing\nthe greatest risk (-7.39% Accuracy). Additionally, we assess the performance of\ncommercial auto-driving systems OpenPilot and Apollo through collaborative\nsimulations, demonstrating that proposed environmental illusions can lead to\nincorrect decisions and potential traffic accidents. To defend against\nenvironmental illusions, we propose the Attention Area Mixing (AAM) approach\nusing hard examples, which witness significant robustness improvement (+3.76%)\nunder illumination effects. We hope our paper can contribute to advancing more\nrobust auto-driving systems in the future. Website: https://lanevil.github.io/.\n","authors":["Tianyuan Zhang","Lu Wang","Hainan Li","Yisong Xiao","Siyuan Liang","Aishan Liu","Xianglong Liu","Dacheng Tao"],"pdf_url":"https://arxiv.org/pdf/2406.00934v2.pdf","comment":"Submitted to ACM MM 2024"},{"id":"http://arxiv.org/abs/2304.03807v3","updated":"2024-06-04T01:18:21Z","published":"2023-04-07T18:21:30Z","title":"Privacy-Preserving CNN Training with Transfer Learning: Multiclass\n  Logistic Regression","summary":"  In this paper, we present a practical solution to implement\nprivacy-preserving CNN training based on mere Homomorphic Encryption (HE)\ntechnique. To our best knowledge, this is the first attempt successfully to\ncrack this nut and no work ever before has achieved this goal. Several\ntechniques combine to accomplish the task:: (1) with transfer learning,\nprivacy-preserving CNN training can be reduced to homomorphic neural network\ntraining, or even multiclass logistic regression (MLR) training; (2) via a\nfaster gradient variant called $\\texttt{Quadratic Gradient}$, an enhanced\ngradient method for MLR with a state-of-the-art performance in convergence\nspeed is applied in this work to achieve high performance; (3) we employ the\nthought of transformation in mathematics to transform approximating Softmax\nfunction in the encryption domain to the approximation of the Sigmoid function.\nA new type of loss function termed $\\texttt{Squared Likelihood Error}$ has been\ndeveloped alongside to align with this change.; and (4) we use a simple but\nflexible matrix-encoding method named $\\texttt{Volley Revolver}$ to manage the\ndata flow in the ciphertexts, which is the key factor to complete the whole\nhomomorphic CNN training. The complete, runnable C++ code to implement our work\ncan be found at:\n\\href{https://github.com/petitioner/HE.CNNtraining}{$\\texttt{https://github.com/petitioner/HE.CNNtraining}$}.\nWe select $\\texttt{REGNET\\_X\\_400MF}$ as our pre-trained model for transfer\nlearning. We use the first 128 MNIST training images as training data and the\nwhole MNIST testing dataset as the testing data. The client only needs to\nupload 6 ciphertexts to the cloud and it takes $\\sim 21$ mins to perform 2\niterations on a cloud with 64 vCPUs, resulting in a precision of $21.49\\%$.\n","authors":["John Chiang"],"pdf_url":"https://arxiv.org/pdf/2304.03807v3.pdf","comment":"In this work, we initiated to implement privacy-persevering CNN\n  training based on mere HE techniques by presenting a faster HE-friendly\n  algorithm"},{"id":"http://arxiv.org/abs/2402.16627v3","updated":"2024-06-04T01:08:56Z","published":"2024-02-26T15:01:16Z","title":"Contextualized Diffusion Models for Text-Guided Image and Video\n  Generation","summary":"  Conditional diffusion models have exhibited superior performance in\nhigh-fidelity text-guided visual generation and editing. Nevertheless,\nprevailing text-guided visual diffusion models primarily focus on incorporating\ntext-visual relationships exclusively into the reverse process, often\ndisregarding their relevance in the forward process. This inconsistency between\nforward and reverse processes may limit the precise conveyance of textual\nsemantics in visual synthesis results. To address this issue, we propose a\nnovel and general contextualized diffusion model (ContextDiff) by incorporating\nthe cross-modal context encompassing interactions and alignments between text\ncondition and visual sample into forward and reverse processes. We propagate\nthis context to all timesteps in the two processes to adapt their trajectories,\nthereby facilitating cross-modal conditional modeling. We generalize our\ncontextualized diffusion to both DDPMs and DDIMs with theoretical derivations,\nand demonstrate the effectiveness of our model in evaluations with two\nchallenging tasks: text-to-image generation, and text-to-video editing. In each\ntask, our ContextDiff achieves new state-of-the-art performance, significantly\nenhancing the semantic alignment between text condition and generated samples,\nas evidenced by quantitative and qualitative evaluations. Our code is available\nat https://github.com/YangLing0818/ContextDiff\n","authors":["Ling Yang","Zhilong Zhang","Zhaochen Yu","Jingwei Liu","Minkai Xu","Stefano Ermon","Bin Cui"],"pdf_url":"https://arxiv.org/pdf/2402.16627v3.pdf","comment":"ICLR 2024. Project: https://github.com/YangLing0818/ContextDiff"},{"id":"http://arxiv.org/abs/2406.01869v1","updated":"2024-06-04T00:41:47Z","published":"2024-06-04T00:41:47Z","title":"Fruit Classification System with Deep Learning and Neural Architecture\n  Search","summary":"  The fruit identification process involves analyzing and categorizing\ndifferent types of fruits based on their visual characteristics. This activity\ncan be achieved using a range of methodologies, encompassing manual\nexamination, conventional computer vision methodologies, and more sophisticated\nmethodologies employing machine learning and deep learning. Our study\nidentified a total of 15 distinct categories of fruit, consisting of class\nAvocado, Banana, Cherry, Apple Braeburn, Apple golden 1, Apricot, Grape, Kiwi,\nMango, Orange, Papaya, Peach, Pineapple, Pomegranate and Strawberry. Neural\nArchitecture Search (NAS) is a technological advancement employed within the\nrealm of deep learning and artificial intelligence, to automate conceptualizing\nand refining neural network topologies. NAS aims to identify neural network\nstructures that are highly suitable for tasks, such as the detection of fruits.\nOur suggested model with 99.98% mAP increased the detection performance of the\npreceding research study that used Fruit datasets. In addition, after the\ncompletion of the study, a comparative analysis was carried out to assess the\nfindings in conjunction with those of another research that is connected to the\ntopic. When compared to the findings of earlier studies, the detector that was\nproposed exhibited higher performance in terms of both its accuracy and its\nprecision.\n","authors":["Christine Dewi","Dhananjay Thiruvady","Nayyar Zaidi"],"pdf_url":"https://arxiv.org/pdf/2406.01869v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.01867v1","updated":"2024-06-04T00:38:44Z","published":"2024-06-04T00:38:44Z","title":"MoLA: Motion Generation and Editing with Latent Diffusion Enhanced by\n  Adversarial Training","summary":"  In motion generation, controllability as well as generation quality and speed\nis becoming more and more important. There are various motion editing tasks,\nsuch as in-betweening, upper body editing, and path-following, but existing\nmethods perform motion editing with a data-space diffusion model, which is slow\nin inference compared to a latent diffusion model. In this paper, we propose\nMoLA, which provides fast and high-quality motion generation and also can deal\nwith multiple editing tasks in a single framework. For high-quality and fast\ngeneration, we employ a variational autoencoder and latent diffusion model, and\nimprove the performance with adversarial training. In addition, we apply a\ntraining-free guided generation framework to achieve various editing tasks with\nmotion control inputs. We quantitatively show the effectiveness of adversarial\nlearning in text-to-motion generation, and demonstrate the applicability of our\nediting framework to multiple editing tasks in the motion domain.\n","authors":["Kengo Uchida","Takashi Shibuya","Yuhta Takida","Naoki Murata","Shusuke Takahashi","Yuki Mitsufuji"],"pdf_url":"https://arxiv.org/pdf/2406.01867v1.pdf","comment":"12 pages, 6 figures"},{"id":"http://arxiv.org/abs/2404.03118v2","updated":"2024-06-04T23:55:39Z","published":"2024-04-03T23:57:34Z","title":"LVLM-Intrepret: An Interpretability Tool for Large Vision-Language\n  Models","summary":"  In the rapidly evolving landscape of artificial intelligence, multi-modal\nlarge language models are emerging as a significant area of interest. These\nmodels, which combine various forms of data input, are becoming increasingly\npopular. However, understanding their internal mechanisms remains a complex\ntask. Numerous advancements have been made in the field of explainability tools\nand mechanisms, yet there is still much to explore. In this work, we present a\nnovel interactive application aimed towards understanding the internal\nmechanisms of large vision-language models. Our interface is designed to\nenhance the interpretability of the image patches, which are instrumental in\ngenerating an answer, and assess the efficacy of the language model in\ngrounding its output in the image. With our application, a user can\nsystematically investigate the model and uncover system limitations, paving the\nway for enhancements in system capabilities. Finally, we present a case study\nof how our application can aid in understanding failure mechanisms in a popular\nlarge multi-modal model: LLaVA.\n","authors":["Gabriela Ben Melech Stan","Estelle Aflalo","Raanan Yehezkel Rohekar","Anahita Bhiwandiwalla","Shao-Yen Tseng","Matthew Lyle Olson","Yaniv Gurwicz","Chenfei Wu","Nan Duan","Vasudev Lal"],"pdf_url":"https://arxiv.org/pdf/2404.03118v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.02820v1","updated":"2024-06-04T23:39:08Z","published":"2024-06-04T23:39:08Z","title":"ORACLE: Leveraging Mutual Information for Consistent Character\n  Generation with LoRAs in Diffusion Models","summary":"  Text-to-image diffusion models have recently taken center stage as pivotal\ntools in promoting visual creativity across an array of domains such as comic\nbook artistry, children's literature, game development, and web design. These\nmodels harness the power of artificial intelligence to convert textual\ndescriptions into vivid images, thereby enabling artists and creators to bring\ntheir imaginative concepts to life with unprecedented ease. However, one of the\nsignificant hurdles that persist is the challenge of maintaining consistency in\ncharacter generation across diverse contexts. Variations in textual prompts,\neven if minor, can yield vastly different visual outputs, posing a considerable\nproblem in projects that require a uniform representation of characters\nthroughout. In this paper, we introduce a novel framework designed to produce\nconsistent character representations from a single text prompt across diverse\nsettings. Through both quantitative and qualitative analyses, we demonstrate\nthat our framework outperforms existing methods in generating characters with\nconsistent visual identities, underscoring its potential to transform creative\nindustries. By addressing the critical challenge of character consistency, we\nnot only enhance the practical utility of these models but also broaden the\nhorizons for artistic and creative expression.\n","authors":["Kiymet Akdemir","Pinar Yanardag"],"pdf_url":"https://arxiv.org/pdf/2406.02820v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.08110v2","updated":"2024-06-04T23:06:36Z","published":"2023-11-14T12:14:54Z","title":"Improving Hateful Meme Detection through Retrieval-Guided Contrastive\n  Learning","summary":"  Hateful memes have emerged as a significant concern on the Internet.\nDetecting hateful memes requires the system to jointly understand the visual\nand textual modalities. Our investigation reveals that the embedding space of\nexisting CLIP-based systems lacks sensitivity to subtle differences in memes\nthat are vital for correct hatefulness classification. We propose constructing\na hatefulness-aware embedding space through retrieval-guided contrastive\ntraining. Our approach achieves state-of-the-art performance on the\nHatefulMemes dataset with an AUROC of 87.0, outperforming much larger\nfine-tuned large multimodal models. We demonstrate a retrieval-based hateful\nmemes detection system, which is capable of identifying hatefulness based on\ndata unseen in training. This allows developers to update the hateful memes\ndetection system by simply adding new examples without retraining, a desirable\nfeature for real services in the constantly evolving landscape of hateful memes\non the Internet.\n","authors":["Jingbiao Mei","Jinghong Chen","Weizhe Lin","Bill Byrne","Marcus Tomalin"],"pdf_url":"https://arxiv.org/pdf/2311.08110v2.pdf","comment":"Accepted by ACL 2024 Main Conference. This is the camera-ready\n  version. We added more experiments to address reviewers' comments"},{"id":"http://arxiv.org/abs/2211.04325v2","updated":"2024-06-04T22:09:46Z","published":"2022-10-26T00:28:40Z","title":"Will we run out of data? Limits of LLM scaling based on human-generated\n  data","summary":"  We investigate the potential constraints on LLM scaling posed by the\navailability of public human-generated text data. We forecast the growing\ndemand for training data based on current trends and estimate the total stock\nof public human text data. Our findings indicate that if current LLM\ndevelopment trends continue, models will be trained on datasets roughly equal\nin size to the available stock of public human text data between 2026 and 2032,\nor slightly earlier if models are overtrained. We explore how progress in\nlanguage modeling can continue when human-generated text datasets cannot be\nscaled any further. We argue that synthetic data generation, transfer learning\nfrom data-rich domains, and data efficiency improvements might support further\nprogress.\n","authors":["Pablo Villalobos","Anson Ho","Jaime Sevilla","Tamay Besiroglu","Lennart Heim","Marius Hobbhahn"],"pdf_url":"https://arxiv.org/pdf/2211.04325v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.18570v2","updated":"2024-06-04T20:53:32Z","published":"2024-05-28T20:28:07Z","title":"Its Not a Modality Gap: Characterizing and Addressing the Contrastive\n  Gap","summary":"  Multi-modal contrastive models such as CLIP achieve state-of-the-art\nperformance in zero-shot classification by embedding input images and texts on\na joint representational space. Recently, a modality gap has been reported in\ntwo-encoder contrastive models like CLIP, meaning that the image and text\nembeddings reside in disjoint areas of the latent space. Previous studies\nsuggest that this gap exists due to 1) the cone effect, 2) mismatched pairs in\nthe dataset, and 3) insufficient training. We show that, even when accounting\nfor all these factors, and even when using the same modality, the contrastive\nloss actually creates a gap during training. As a result, We propose that the\nmodality gap is inherent to the two-encoder contrastive loss and rename it the\ncontrastive gap. We present evidence that attributes this contrastive gap to\nlow uniformity in CLIP space, resulting in embeddings that occupy only a small\nportion of the latent space. To close the gap, we adapt the uniformity and\nalignment properties of unimodal contrastive loss to the multi-modal setting\nand show that simply adding these terms to the CLIP loss distributes the\nembeddings more uniformly in the representational space, closing the gap. In\nour experiments, we show that the modified representational space achieves\nbetter performance than default CLIP loss in downstream tasks such as zero-shot\nimage classification and multi-modal arithmetic.\n","authors":["Abrar Fahim","Alex Murphy","Alona Fyshe"],"pdf_url":"https://arxiv.org/pdf/2405.18570v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.02780v1","updated":"2024-06-04T20:51:04Z","published":"2024-06-04T20:51:04Z","title":"LADI v2: Multi-label Dataset and Classifiers for Low-Altitude Disaster\n  Imagery","summary":"  ML-based computer vision models are promising tools for supporting emergency\nmanagement operations following natural disasters. Arial photographs taken from\nsmall manned and unmanned aircraft can be available soon after a disaster and\nprovide valuable information from multiple perspectives for situational\nawareness and damage assessment applications. However, emergency managers often\nface challenges finding the most relevant photos among the tens of thousands\nthat may be taken after an incident. While ML-based solutions could enable more\neffective use of aerial photographs, there is still a lack of training data for\nimagery of this type from multiple perspectives and for multiple hazard types.\nTo address this, we present the LADI v2 (Low Altitude Disaster Imagery version\n2) dataset, a curated set of about 10,000 disaster images captured in the\nUnited States by the Civil Air Patrol (CAP) in response to federally-declared\nemergencies (2015-2023) and annotated for multi-label classification by trained\nCAP volunteers. We also provide two pretrained baseline classifiers and compare\ntheir performance to state-of-the-art vision-language models in multi-label\nclassification. The data and code are released publicly to support the\ndevelopment of computer vision models for emergency management research and\napplications.\n","authors":["Samuel Scheele","Katherine Picchione","Jeffrey Liu"],"pdf_url":"https://arxiv.org/pdf/2406.02780v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.02776v1","updated":"2024-06-04T20:45:53Z","published":"2024-06-04T20:45:53Z","title":"MeshVPR: Citywide Visual Place Recognition Using 3D Meshes","summary":"  Mesh-based scene representation offers a promising direction for simplifying\nlarge-scale hierarchical visual localization pipelines, combining a visual\nplace recognition step based on global features (retrieval) and a visual\nlocalization step based on local features. While existing work demonstrates the\nviability of meshes for visual localization, the impact of using synthetic\ndatabases rendered from them in visual place recognition remains largely\nunexplored. In this work we investigate using dense 3D textured meshes for\nlarge-scale Visual Place Recognition (VPR) and identify a significant\nperformance drop when using synthetic mesh-based databases compared to\nreal-world images for retrieval. To address this, we propose MeshVPR, a novel\nVPR pipeline that utilizes a lightweight features alignment framework to bridge\nthe gap between real-world and synthetic domains. MeshVPR leverages pre-trained\nVPR models and it is efficient and scalable for city-wide deployments. We\nintroduce novel datasets with freely available 3D meshes and manually collected\nqueries from Berlin, Paris, and Melbourne. Extensive evaluations demonstrate\nthat MeshVPR achieves competitive performance with standard VPR pipelines,\npaving the way for mesh-based localization systems. Our contributions include\nthe new task of citywide mesh-based VPR, the new benchmark datasets, MeshVPR,\nand a thorough analysis of open challenges. Data, code, and interactive\nvisualizations are available at https://mesh-vpr.github.io\n","authors":["Gabriele Berton","Lorenz Junglas","Riccardo Zaccone","Thomas Pollok","Barbara Caputo","Carlo Masone"],"pdf_url":"https://arxiv.org/pdf/2406.02776v1.pdf","comment":"Website: https://mesh-vpr.github.io/"},{"id":"http://arxiv.org/abs/2406.02774v1","updated":"2024-06-04T20:43:26Z","published":"2024-06-04T20:43:26Z","title":"Diffusion-Refined VQA Annotations for Semi-Supervised Gaze Following","summary":"  Training gaze following models requires a large number of images with gaze\ntarget coordinates annotated by human annotators, which is a laborious and\ninherently ambiguous process. We propose the first semi-supervised method for\ngaze following by introducing two novel priors to the task. We obtain the first\nprior using a large pretrained Visual Question Answering (VQA) model, where we\ncompute Grad-CAM heatmaps by `prompting' the VQA model with a gaze following\nquestion. These heatmaps can be noisy and not suited for use in training. The\nneed to refine these noisy annotations leads us to incorporate a second prior.\nWe utilize a diffusion model trained on limited human annotations and modify\nthe reverse sampling process to refine the Grad-CAM heatmaps. By tuning the\ndiffusion process we achieve a trade-off between the human annotation prior and\nthe VQA heatmap prior, which retains the useful VQA prior information while\nexhibiting similar properties to the training data distribution. Our method\noutperforms simple pseudo-annotation generation baselines on the GazeFollow\nimage dataset. More importantly, our pseudo-annotation strategy, applied to a\nwidely used supervised gaze following model (VAT), reduces the annotation need\nby 50%. Our method also performs the best on the VideoAttentionTarget dataset.\n","authors":["Qiaomu Miao","Alexandros Graikos","Jingwei Zhang","Sounak Mondal","Minh Hoai","Dimitris Samaras"],"pdf_url":"https://arxiv.org/pdf/2406.02774v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.02773v1","updated":"2024-06-04T20:40:27Z","published":"2024-06-04T20:40:27Z","title":"Cyclic Sparse Training: Is it Enough?","summary":"  The success of iterative pruning methods in achieving state-of-the-art sparse\nnetworks has largely been attributed to improved mask identification and an\nimplicit regularization induced by pruning. We challenge this hypothesis and\ninstead posit that their repeated cyclic training schedules enable improved\noptimization. To verify this, we show that pruning at initialization is\nsignificantly boosted by repeated cyclic training, even outperforming standard\niterative pruning methods. The dominant mechanism how this is achieved, as we\nconjecture, can be attributed to a better exploration of the loss landscape\nleading to a lower training loss. However, at high sparsity, repeated cyclic\ntraining alone is not enough for competitive performance. A strong coupling\nbetween learnt parameter initialization and mask seems to be required. Standard\nmethods obtain this coupling via expensive pruning-training iterations,\nstarting from a dense network. To achieve this with sparse training instead, we\npropose SCULPT-ing, i.e., repeated cyclic training of any sparse mask followed\nby a single pruning step to couple the parameters and the mask, which is able\nto match the performance of state-of-the-art iterative pruning methods in the\nhigh sparsity regime at reduced computational cost.\n","authors":["Advait Gadhikar","Sree Harsha Nelaturu","Rebekka Burkholz"],"pdf_url":"https://arxiv.org/pdf/2406.02773v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.13236v2","updated":"2024-06-04T20:38:49Z","published":"2023-12-20T18:00:16Z","title":"Diffusion Models With Learned Adaptive Noise","summary":"  Diffusion models have gained traction as powerful algorithms for synthesizing\nhigh-quality images. Central to these algorithms is the diffusion process, a\nset of equations which maps data to noise in a way that can significantly\naffect performance. In this paper, we explore whether the diffusion process can\nbe learned from data. Our work is grounded in Bayesian inference and seeks to\nimprove log-likelihood estimation by casting the learned diffusion process as\nan approximate variational posterior that yields a tighter lower bound (ELBO)\non the likelihood. A widely held assumption is that the ELBO is invariant to\nthe noise process: our work dispels this assumption and proposes multivariate\nlearned adaptive noise (MULAN), a learned diffusion process that applies noise\nat different rates across an image. Specifically, our method relies on a\nmultivariate noise schedule that is a function of the data to ensure that the\nELBO is no longer invariant to the choice of the noise schedule as in previous\nworks. Empirically, MULAN sets a new state-of-the-art in density estimation on\nCIFAR-10 and ImageNet and reduces the number of training steps by 50%. Code is\navailable at https://github.com/s-sahoo/MuLAN\n","authors":["Subham Sekhar Sahoo","Aaron Gokaslan","Chris De Sa","Volodymyr Kuleshov"],"pdf_url":"https://arxiv.org/pdf/2312.13236v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.02761v1","updated":"2024-06-04T20:28:02Z","published":"2024-06-04T20:28:02Z","title":"Multi-layer Learnable Attention Mask for Multimodal Tasks","summary":"  While the Self-Attention mechanism in the Transformer model has proven to be\neffective in many domains, we observe that it is less effective in more diverse\nsettings (e.g. multimodality) due to the varying granularity of each token and\nthe high computational demands of lengthy sequences. To address the challenges,\nwe introduce the Learnable Attention Mask (LAM), strategically designed to\nglobally regulate attention maps and prioritize critical tokens within the\nsequence. Leveraging the Self-Attention module in a BERT-like transformer\nnetwork, our approach adeptly captures associations between tokens. The\nextension of the LAM to a multi-layer version accommodates the varied\ninformation aspects embedded at each layer of the Transformer network.\nComprehensive experimental validation on various datasets, such as MADv2,\nQVHighlights, ImageNet 1K, and MSRVTT, demonstrates the efficacy of the LAM,\nexemplifying its ability to enhance model performance while mitigating\nredundant computations. This pioneering approach presents a significant\nadvancement in enhancing the understanding of complex scenarios, such as in\nmovie understanding.\n","authors":["Wayner Barrios","SouYoung Jin"],"pdf_url":"https://arxiv.org/pdf/2406.02761v1.pdf","comment":null}],"Information Retrieval":[{"id":"http://arxiv.org/abs/2401.17197v2","updated":"2024-06-04T15:52:52Z","published":"2024-01-30T17:31:19Z","title":"Data-efficient Fine-tuning for LLM-based Recommendation","summary":"  Leveraging Large Language Models (LLMs) for recommendation has recently\ngarnered considerable attention, where fine-tuning plays a key role in LLMs'\nadaptation. However, the cost of fine-tuning LLMs on rapidly expanding\nrecommendation data limits their practical application. To address this\nchallenge, few-shot fine-tuning offers a promising approach to quickly adapt\nLLMs to new recommendation data. We propose the task of data pruning for\nefficient LLM-based recommendation, aimed at identifying representative samples\ntailored for LLMs' few-shot fine-tuning. While coreset selection is closely\nrelated to the proposed task, existing coreset selection methods often rely on\nsuboptimal heuristic metrics or entail costly optimization on large-scale\nrecommendation data.\n  To tackle these issues, we introduce two objectives for the data pruning task\nin the context of LLM-based recommendation: 1) high accuracy aims to identify\nthe influential samples that can lead to high overall performance; and 2) high\nefficiency underlines the low costs of the data pruning process. To pursue the\ntwo objectives, we propose a novel data pruning method based on two scores,\ni.e., influence score and effort score, to efficiently identify the influential\nsamples. Particularly, the influence score is introduced to accurately estimate\nthe influence of sample removal on the overall performance. To achieve low\ncosts of the data pruning process, we use a small-sized surrogate model to\nreplace LLMs to obtain the influence score. Considering the potential gap\nbetween the surrogate model and LLMs, we further propose an effort score to\nprioritize some hard samples specifically for LLMs. Empirical results on three\nreal-world datasets validate the effectiveness of our proposed method. In\nparticular, the proposed method uses only 2% samples to surpass the full data\nfine-tuning, reducing time costs by 97%.\n","authors":["Xinyu Lin","Wenjie Wang","Yongqi Li","Shuo Yang","Fuli Feng","Yinwei Wei","Tat-Seng Chua"],"pdf_url":"https://arxiv.org/pdf/2401.17197v2.pdf","comment":"Accepted by SIGIR 2024"},{"id":"http://arxiv.org/abs/2402.00943v2","updated":"2024-06-04T15:38:08Z","published":"2024-02-01T19:00:40Z","title":"Approximate Nearest Neighbor Search with Window Filters","summary":"  We define and investigate the problem of $\\textit{c-approximate window\nsearch}$: approximate nearest neighbor search where each point in the dataset\nhas a numeric label, and the goal is to find nearest neighbors to queries\nwithin arbitrary label ranges. Many semantic search problems, such as image and\ndocument search with timestamp filters, or product search with cost filters,\nare natural examples of this problem. We propose and theoretically analyze a\nmodular tree-based framework for transforming an index that solves the\ntraditional c-approximate nearest neighbor problem into a data structure that\nsolves window search. On standard nearest neighbor benchmark datasets equipped\nwith random label values, adversarially constructed embeddings, and image\nsearch embeddings with real timestamps, we obtain up to a $75\\times$ speedup\nover existing solutions at the same level of recall.\n","authors":["Joshua Engels","Benjamin Landrum","Shangdi Yu","Laxman Dhulipala","Julian Shun"],"pdf_url":"https://arxiv.org/pdf/2402.00943v2.pdf","comment":"Code available: https://github.com/JoshEngels/RangeFilteredANN"},{"id":"http://arxiv.org/abs/2406.02377v1","updated":"2024-06-04T14:55:14Z","published":"2024-06-04T14:55:14Z","title":"XRec: Large Language Models for Explainable Recommendation","summary":"  Recommender systems help users navigate information overload by providing\npersonalized recommendations aligned with their preferences. Collaborative\nFiltering (CF) is a widely adopted approach, but while advanced techniques like\ngraph neural networks (GNNs) and self-supervised learning (SSL) have enhanced\nCF models for better user representations, they often lack the ability to\nprovide explanations for the recommended items. Explainable recommendations aim\nto address this gap by offering transparency and insights into the\nrecommendation decision-making process, enhancing users' understanding. This\nwork leverages the language capabilities of Large Language Models (LLMs) to\npush the boundaries of explainable recommender systems. We introduce a\nmodel-agnostic framework called XRec, which enables LLMs to provide\ncomprehensive explanations for user behaviors in recommender systems. By\nintegrating collaborative signals and designing a lightweight collaborative\nadaptor, the framework empowers LLMs to understand complex patterns in\nuser-item interactions and gain a deeper understanding of user preferences. Our\nextensive experiments demonstrate the effectiveness of XRec, showcasing its\nability to generate comprehensive and meaningful explanations that outperform\nbaseline approaches in explainable recommender systems. We open-source our\nmodel implementation at https://github.com/HKUDS/XRec.\n","authors":["Qiyao Ma","Xubin Ren","Chao Huang"],"pdf_url":"https://arxiv.org/pdf/2406.02377v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.02368v1","updated":"2024-06-04T14:46:25Z","published":"2024-06-04T14:46:25Z","title":"Large Language Models Make Sample-Efficient Recommender Systems","summary":"  Large language models (LLMs) have achieved remarkable progress in the field\nof natural language processing (NLP), demonstrating remarkable abilities in\nproducing text that resembles human language for various tasks. This opens up\nnew opportunities for employing them in recommender systems (RSs). In this\npaper, we specifically examine the sample efficiency of LLM-enhanced\nrecommender systems, which pertains to the model's capacity to attain superior\nperformance with a limited quantity of training data. Conventional\nrecommendation models (CRMs) often need a large amount of training data because\nof the sparsity of features and interactions. Hence, we propose and verify our\ncore viewpoint: Large Language Models Make Sample-Efficient Recommender\nSystems. We propose a simple yet effective framework (i.e., Laser) to validate\nthe viewpoint from two aspects: (1) LLMs themselves are sample-efficient\nrecommenders; and (2) LLMs, as feature generators and encoders, make CRMs more\nsample-efficient. Extensive experiments on two public datasets show that Laser\nrequires only a small fraction of training samples to match or even surpass\nCRMs that are trained on the entire training set, demonstrating superior sample\nefficiency.\n","authors":["Jianghao Lin","Xinyi Dai","Rong Shan","Bo Chen","Ruiming Tang","Yong Yu","Weinan Zhang"],"pdf_url":"https://arxiv.org/pdf/2406.02368v1.pdf","comment":"Accepted by Frontier of Computer Science"},{"id":"http://arxiv.org/abs/2309.03613v2","updated":"2024-06-04T14:25:45Z","published":"2023-09-07T10:13:09Z","title":"Evaluating ChatGPT as a Recommender System: A Rigorous Approach","summary":"  Large Language Models (LLMs) have recently shown impressive abilities in\nhandling various natural language-related tasks. Among different LLMs, current\nstudies have assessed ChatGPT's superior performance across manifold tasks,\nespecially under the zero/few-shot prompting conditions. Given such successes,\nthe Recommender Systems (RSs) research community have started investigating its\npotential applications within the recommendation scenario. However, although\nvarious methods have been proposed to integrate ChatGPT's capabilities into\nRSs, current research struggles to comprehensively evaluate such models while\nconsidering the peculiarities of generative models. Often, evaluations do not\nconsider hallucinations, duplications, and out-of-the-closed domain\nrecommendations and solely focus on accuracy metrics, neglecting the impact on\nbeyond-accuracy facets. To bridge this gap, we propose a robust evaluation\npipeline to assess ChatGPT's ability as an RS and post-process ChatGPT\nrecommendations to account for these aspects. Through this pipeline, we\ninvestigate ChatGPT-3.5 and ChatGPT-4 performance in the recommendation task\nunder the zero-shot condition employing the role-playing prompt. We analyze the\nmodel's functionality in three settings: the Top-N Recommendation, the\ncold-start recommendation, and the re-ranking of a list of recommendations, and\nin three domains: movies, music, and books. The experiments reveal that ChatGPT\nexhibits higher accuracy than the baselines on books domain. It also excels in\nre-ranking and cold-start scenarios while maintaining reasonable\nbeyond-accuracy metrics. Furthermore, we measure the similarity between the\nChatGPT recommendations and the other recommenders, providing insights about\nhow ChatGPT could be categorized in the realm of recommender systems. The\nevaluation pipeline is publicly released for future research.\n","authors":["Dario Di Palma","Giovanni Maria Biancofiore","Vito Walter Anelli","Fedelucio Narducci","Tommaso Di Noia","Eugenio Di Sciascio"],"pdf_url":"https://arxiv.org/pdf/2309.03613v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.02245v1","updated":"2024-06-04T12:09:44Z","published":"2024-06-04T12:09:44Z","title":"Description Boosting for Zero-Shot Entity and Relation Classification","summary":"  Zero-shot entity and relation classification models leverage available\nexternal information of unseen classes -- e.g., textual descriptions -- to\nannotate input text data. Thanks to the minimum data requirement, Zero-Shot\nLearning (ZSL) methods have high value in practice, especially in applications\nwhere labeled data is scarce. Even though recent research in ZSL has\ndemonstrated significant results, our analysis reveals that those methods are\nsensitive to provided textual descriptions of entities (or relations). Even a\nminor modification of descriptions can lead to a change in the decision\nboundary between entity (or relation) classes. In this paper, we formally\ndefine the problem of identifying effective descriptions for zero shot\ninference. We propose a strategy for generating variations of an initial\ndescription, a heuristic for ranking them and an ensemble method capable of\nboosting the predictions of zero-shot models through description enhancement.\nEmpirical results on four different entity and relation classification datasets\nshow that our proposed method outperform existing approaches and achieve new\nSOTA results on these datasets under the ZSL settings. The source code of the\nproposed solutions and the evaluation framework are open-sourced.\n","authors":["Gabriele Picco","Leopold Fuchs","Marcos Martínez Galindo","Alberto Purpura","Vanessa López","Hoang Thanh Lam"],"pdf_url":"https://arxiv.org/pdf/2406.02245v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.02163v1","updated":"2024-06-04T09:52:41Z","published":"2024-06-04T09:52:41Z","title":"Pairwise Ranking Loss for Multi-Task Learning in Recommender Systems","summary":"  Multi-Task Learning (MTL) plays a crucial role in real-world advertising\napplications such as recommender systems, aiming to achieve robust\nrepresentations while minimizing resource consumption. MTL endeavors to\nsimultaneously optimize multiple tasks to construct a unified model serving\ndiverse objectives. In online advertising systems, tasks like Click-Through\nRate (CTR) and Conversion Rate (CVR) are often treated as MTL problems\nconcurrently. However, it has been overlooked that a conversion ($y_{cvr}=1$)\nnecessitates a preceding click ($y_{ctr}=1$). In other words, while certain CTR\ntasks are associated with corresponding conversions, others lack such\nassociations. Moreover, the likelihood of noise is significantly higher in CTR\ntasks where conversions do not occur compared to those where they do, and\nexisting methods lack the ability to differentiate between these two scenarios.\nIn this study, exposure labels corresponding to conversions are regarded as\ndefinitive indicators, and a novel task-specific loss is introduced by\ncalculating a \\textbf{p}air\\textbf{wise} \\textbf{r}anking (PWiseR) loss between\nmodel predictions, manifesting as pairwise ranking loss, to encourage the model\nto rely more on them. To demonstrate the effect of the proposed loss function,\nexperiments were conducted on different MTL and Single-Task Learning (STL)\nmodels using four distinct public MTL datasets, namely Alibaba FR, NL, US, and\nCCP, along with a proprietary industrial dataset. The results indicate that our\nproposed loss function outperforms the BCE loss function in most cases in terms\nof the AUC metric.\n","authors":["Furkan Durmus","Hasan Saribas","Said Aldemir","Yang Junyan","Hakan Cevikalp"],"pdf_url":"https://arxiv.org/pdf/2406.02163v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.02135v1","updated":"2024-06-04T09:24:04Z","published":"2024-06-04T09:24:04Z","title":"Robust Interaction-based Relevance Modeling for Online E-Commerce and\n  LLM-based Retrieval","summary":"  Semantic relevance calculation is crucial for e-commerce search engines, as\nit ensures that the items selected closely align with customer intent.\nInadequate attention to this aspect can detrimentally affect user experience\nand engagement. Traditional text-matching techniques are prevalent but often\nfail to capture the nuances of search intent accurately, so neural networks now\nhave become a preferred solution to processing such complex text matching.\nExisting methods predominantly employ representation-based architectures, which\nstrike a balance between high traffic capacity and low latency. However, they\nexhibit significant shortcomings in generalization and robustness when compared\nto interaction-based architectures. In this work, we introduce a robust\ninteraction-based modeling paradigm to address these shortcomings. It\nencompasses 1) a dynamic length representation scheme for expedited inference,\n2) a professional terms recognition method to identify subjects and core\nattributes from complex sentence structures, and 3) a contrastive adversarial\ntraining protocol to bolster the model's robustness and matching capabilities.\nExtensive offline evaluations demonstrate the superior robustness and\neffectiveness of our approach, and online A/B testing confirms its ability to\nimprove relevance in the same exposure position, resulting in more clicks and\nconversions. To the best of our knowledge, this method is the first\ninteraction-based approach for large e-commerce search relevance calculation.\nNotably, we have deployed it for the entire search traffic on alibaba.com, the\nlargest B2B e-commerce platform in the world.\n","authors":["Ben Chen","Huangyu Dai","Xiang Ma","Wen Jiang","Wei Ning"],"pdf_url":"https://arxiv.org/pdf/2406.02135v1.pdf","comment":"Accepted by ECML-PKDD'24 as Outstanding Paper. 8 pages, 2 figures, 7\n  tables"},{"id":"http://arxiv.org/abs/2405.19749v2","updated":"2024-06-04T07:45:06Z","published":"2024-05-30T06:52:01Z","title":"Generating Query Recommendations via LLMs","summary":"  Query recommendation systems are ubiquitous in modern search engines,\nassisting users in producing effective queries to meet their information needs.\nHowever, these systems require a large amount of data to produce good\nrecommendations, such as a large collection of documents to index and query\nlogs. In particular, query logs and user data are not available in cold start\nscenarios. Query logs are expensive to collect and maintain and require complex\nand time-consuming cascading pipelines for creating, combining, and ranking\nrecommendations. To address these issues, we frame the query recommendation\nproblem as a generative task, proposing a novel approach called Generative\nQuery Recommendation (GQR). GQR uses an LLM as its foundation and does not\nrequire to be trained or fine-tuned to tackle the query recommendation problem.\nWe design a prompt that enables the LLM to understand the specific\nrecommendation task, even using a single example. We then improved our system\nby proposing a version that exploits query logs called Retriever-Augmented GQR\n(RA-GQR). RA-GQr dynamically composes its prompt by retrieving similar queries\nfrom query logs. GQR approaches reuses a pre-existing neural architecture\nresulting in a simpler and more ready-to-market approach, even in a cold start\nscenario. Our proposed GQR obtains state-of-the-art performance in terms of\nNDCG@10 and clarity score against two commercial search engines and the\nprevious state-of-the-art approach on the Robust04 and ClueWeb09B collections,\nimproving on average the NDCG@10 performance up to ~4% on Robust04 and\nClueWeb09B w.r.t the previous best competitor. RA-GQR further improve the\nNDCG@10 obtaining an increase of ~11%, ~6\\% on Robust04 and ClueWeb09B w.r.t\nthe best competitor. Furthermore, our system obtained ~59% of user preferences\nin a blind user study, proving that our method produces the most engaging\nqueries.\n","authors":["Andrea Bacciu","Enrico Palumbo","Andreas Damianou","Nicola Tonellotto","Fabrizio Silvestri"],"pdf_url":"https://arxiv.org/pdf/2405.19749v2.pdf","comment":"Generating Query Recommendations via LLMs"},{"id":"http://arxiv.org/abs/2406.02048v1","updated":"2024-06-04T07:29:59Z","published":"2024-06-04T07:29:59Z","title":"Auto-Encoding or Auto-Regression? A Reality Check on Causality of\n  Self-Attention-Based Sequential Recommenders","summary":"  The comparison between Auto-Encoding (AE) and Auto-Regression (AR) has become\nan increasingly important topic with recent advances in sequential\nrecommendation. At the heart of this discussion lies the comparison of BERT4Rec\nand SASRec, which serve as representative AE and AR models for self-attentive\nsequential recommenders. Yet the conclusion of this debate remains uncertain\ndue to: (1) the lack of fair and controlled environments for experiments and\nevaluations; and (2) the presence of numerous confounding factors w.r.t.\nfeature selection, modeling choices and optimization algorithms. In this work,\nwe aim to answer this question by conducting a series of controlled\nexperiments. We start by tracing the AE/AR debate back to its origin through a\nsystematic re-evaluation of SASRec and BERT4Rec, discovering that AR models\ngenerally surpass AE models in sequential recommendation. In addition, we find\nthat AR models further outperforms AE models when using a customized design\nspace that includes additional features, modeling approaches and optimization\ntechniques. Furthermore, the performance advantage of AR models persists in the\nbroader HuggingFace transformer ecosystems. Lastly, we provide potential\nexplanations and insights into AE/AR performance from two key perspectives:\nlow-rank approximation and inductive bias. We make our code and data available\nat https://github.com/yueqirex/ModSAR\n","authors":["Yueqi Wang","Zhankui He","Zhenrui Yue","Julian McAuley","Dong Wang"],"pdf_url":"https://arxiv.org/pdf/2406.02048v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.00011v2","updated":"2024-06-04T07:17:46Z","published":"2024-05-20T08:07:27Z","title":"DisCo: Towards Harmonious Disentanglement and Collaboration between\n  Tabular and Semantic Space for Recommendation","summary":"  Recommender systems play important roles in various applications such as\ne-commerce, social media, etc. Conventional recommendation methods usually\nmodel the collaborative signals within the tabular representation space.\nDespite the personalization modeling and the efficiency, the latent semantic\ndependencies are omitted. Methods that introduce semantics into recommendation\nthen emerge, injecting knowledge from the semantic representation space where\nthe general language understanding are compressed. However, existing\nsemantic-enhanced recommendation methods focus on aligning the two spaces,\nduring which the representations of the two spaces tend to get close while the\nunique patterns are discarded and not well explored. In this paper, we propose\nDisCo to Disentangle the unique patterns from the two representation spaces and\nCollaborate the two spaces for recommendation enhancement, where both the\nspecificity and the consistency of the two spaces are captured. Concretely, we\npropose 1) a dual-side attentive network to capture the intra-domain patterns\nand the inter-domain patterns, 2) a sufficiency constraint to preserve the\ntask-relevant information of each representation space and filter out the\nnoise, and 3) a disentanglement constraint to avoid the model from discarding\nthe unique information. These modules strike a balance between disentanglement\nand collaboration of the two representation spaces to produce informative\npattern vectors, which could serve as extra features and be appended to\narbitrary recommendation backbones for enhancement. Experiment results validate\nthe superiority of our method against different models and the compatibility of\nDisCo over different backbones. Various ablation studies and efficiency\nanalysis are also conducted to justify each model component.\n","authors":["Kounianhua Du","Jizheng Chen","Jianghao Lin","Yunjia Xi","Hangyu Wang","Xinyi Dai","Bo Chen","Ruiming Tang","Weinan Zhang"],"pdf_url":"https://arxiv.org/pdf/2406.00011v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.01034v2","updated":"2024-06-04T06:10:11Z","published":"2024-06-03T06:36:04Z","title":"FourierKAN-GCF: Fourier Kolmogorov-Arnold Network -- An Effective and\n  Efficient Feature Transformation for Graph Collaborative Filtering","summary":"  Graph Collaborative Filtering (GCF) has achieved state-of-the-art performance\nfor recommendation tasks. However, most GCF structures simplify the feature\ntransformation and nonlinear operation during message passing in the graph\nconvolution network (GCN). We revisit these two components and discover that a\npart of feature transformation and nonlinear operation during message passing\nin GCN can improve the representation of GCF, but increase the difficulty of\ntraining.\n  In this work, we propose a simple and effective graph-based recommendation\nmodel called FourierKAN-GCF. Specifically, it utilizes a novel Fourier\nKolmogorov-Arnold Network (KAN) to replace the multilayer perceptron (MLP) as a\npart of the feature transformation during message passing in GCN, which\nimproves the representation power of GCF and is easy to train. We further\nemploy message dropout and node dropout strategies to improve the\nrepresentation power and robustness of the model. Extensive experiments on two\npublic datasets demonstrate the superiority of FourierKAN-GCF over most\nstate-of-the-art methods. The implementation code is available at\nhttps://github.com/Jinfeng-Xu/FKAN-GCF.\n","authors":["Jinfeng Xu","Zheyu Chen","Jinze Li","Shuo Yang","Wei Wang","Xiping Hu","Edith C. -H. Ngai"],"pdf_url":"https://arxiv.org/pdf/2406.01034v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.13542v2","updated":"2024-06-04T05:17:24Z","published":"2024-02-21T05:41:34Z","title":"ARL2: Aligning Retrievers for Black-box Large Language Models via\n  Self-guided Adaptive Relevance Labeling","summary":"  Retrieval-augmented generation enhances large language models (LLMs) by\nincorporating relevant information from external knowledge sources. This\nenables LLMs to adapt to specific domains and mitigate hallucinations in\nknowledge-intensive tasks. However, existing retrievers are often misaligned\nwith LLMs due to their separate training processes and the black-box nature of\nLLMs. To address this challenge, we propose ARL2, a retriever learning\ntechnique that harnesses LLMs as labelers. ARL2 leverages LLMs to annotate and\nscore relevant evidence, enabling learning the retriever from robust LLM\nsupervision. Furthermore, ARL2 uses an adaptive self-training strategy for\ncurating high-quality and diverse relevance data, which can effectively reduce\nthe annotation cost. Extensive experiments demonstrate the effectiveness of\nARL2, achieving accuracy improvements of 5.4% on NQ and 4.6% on MMLU compared\nto the state-of-the-art methods. Additionally, ARL2 exhibits robust transfer\nlearning capabilities and strong zero-shot generalization abilities. Our code\nwill be published at \\url{https://github.com/zhanglingxi-cs/ARL2}.\n","authors":["Lingxi Zhang","Yue Yu","Kuan Wang","Chao Zhang"],"pdf_url":"https://arxiv.org/pdf/2402.13542v2.pdf","comment":"ACL 2024"},{"id":"http://arxiv.org/abs/2403.00815v2","updated":"2024-06-04T05:11:19Z","published":"2024-02-25T23:10:20Z","title":"RAM-EHR: Retrieval Augmentation Meets Clinical Predictions on Electronic\n  Health Records","summary":"  We present RAM-EHR, a Retrieval AugMentation pipeline to improve clinical\npredictions on Electronic Health Records (EHRs). RAM-EHR first collects\nmultiple knowledge sources, converts them into text format, and uses dense\nretrieval to obtain information related to medical concepts. This strategy\naddresses the difficulties associated with complex names for the concepts.\nRAM-EHR then augments the local EHR predictive model co-trained with\nconsistency regularization to capture complementary information from patient\nvisits and summarized knowledge. Experiments on two EHR datasets show the\nefficacy of RAM-EHR over previous knowledge-enhanced baselines (3.4% gain in\nAUROC and 7.2% gain in AUPR), emphasizing the effectiveness of the summarized\nknowledge from RAM-EHR for clinical prediction tasks. The code will be\npublished at \\url{https://github.com/ritaranx/RAM-EHR}.\n","authors":["Ran Xu","Wenqi Shi","Yue Yu","Yuchen Zhuang","Bowen Jin","May D. Wang","Joyce C. Ho","Carl Yang"],"pdf_url":"https://arxiv.org/pdf/2403.00815v2.pdf","comment":"ACL 2024"},{"id":"http://arxiv.org/abs/2402.03181v4","updated":"2024-06-04T04:51:08Z","published":"2024-02-05T16:46:16Z","title":"C-RAG: Certified Generation Risks for Retrieval-Augmented Language\n  Models","summary":"  Despite the impressive capabilities of large language models (LLMs) across\ndiverse applications, they still suffer from trustworthiness issues, such as\nhallucinations and misalignments. Retrieval-augmented language models (RAG)\nhave been proposed to enhance the credibility of generations by grounding\nexternal knowledge, but the theoretical understandings of their generation\nrisks remains unexplored. In this paper, we answer: 1) whether RAG can indeed\nlead to low generation risks, 2) how to provide provable guarantees on the\ngeneration risks of RAG and vanilla LLMs, and 3) what sufficient conditions\nenable RAG models to reduce generation risks. We propose C-RAG, the first\nframework to certify generation risks for RAG models. Specifically, we provide\nconformal risk analysis for RAG models and certify an upper confidence bound of\ngeneration risks, which we refer to as conformal generation risk. We also\nprovide theoretical guarantees on conformal generation risks for general\nbounded risk functions under test distribution shifts. We prove that RAG\nachieves a lower conformal generation risk than that of a single LLM when the\nquality of the retrieval model and transformer is non-trivial. Our intensive\nempirical results demonstrate the soundness and tightness of our conformal\ngeneration risk guarantees across four widely-used NLP datasets on four\nstate-of-the-art retrieval models.\n","authors":["Mintong Kang","Nezihe Merve Gürel","Ning Yu","Dawn Song","Bo Li"],"pdf_url":"https://arxiv.org/pdf/2402.03181v4.pdf","comment":"Accepted to ICML 2024"},{"id":"http://arxiv.org/abs/2406.01197v2","updated":"2024-06-04T04:12:39Z","published":"2024-06-03T10:59:33Z","title":"A Survey of Generative Information Retrieval","summary":"  Generative Retrieval (GR) is an emerging paradigm in information retrieval\nthat leverages generative models to directly map queries to relevant document\nidentifiers (DocIDs) without the need for traditional query processing or\ndocument reranking. This survey provides a comprehensive overview of GR,\nhighlighting key developments, indexing and retrieval strategies, and\nchallenges. We discuss various document identifier strategies, including\nnumerical and string-based identifiers, and explore different document\nrepresentation methods. Our primary contribution lies in outlining future\nresearch directions that could profoundly impact the field: improving the\nquality of query generation, exploring learnable document identifiers,\nenhancing scalability, and integrating GR with multi-task learning frameworks.\nBy examining state-of-the-art GR techniques and their applications, this survey\naims to provide a foundational understanding of GR and inspire further\ninnovations in this transformative approach to information retrieval. We also\nmake the complementary materials such as paper collection publicly available at\nhttps://github.com/MiuLab/GenIR-Survey/\n","authors":["Tzu-Lin Kuo","Tzu-Wei Chiu","Tzung-Sheng Lin","Sheng-Yang Wu","Chao-Wei Huang","Yun-Nung Chen"],"pdf_url":"https://arxiv.org/pdf/2406.01197v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.00004v2","updated":"2024-06-04T03:10:54Z","published":"2024-05-12T04:15:05Z","title":"Navigating the Future of Federated Recommendation Systems with\n  Foundation Models","summary":"  In recent years, the integration of federated learning (FL) and\nrecommendation systems (RS), known as Federated Recommendation Systems (FRS),\nhas attracted attention for preserving user privacy by keeping private data on\nclient devices. However, FRS faces inherent limitations such as data\nheterogeneity and scarcity, due to the privacy requirements of FL and the\ntypical data sparsity issues of RSs. Models like ChatGPT are empowered by the\nconcept of transfer learning and self-supervised learning, so they can be\neasily applied to the downstream tasks after fine-tuning or prompting. These\nmodels, so-called Foundation Models (FM), fouce on understanding the human's\nintent and perform following their designed roles in the specific tasks, which\nare widely recognized for producing high-quality content in the image and\nlanguage domains. Thus, the achievements of FMs inspire the design of FRS and\nsuggest a promising research direction: integrating foundation models to\naddress the above limitations. In this study, we conduct a comprehensive review\nof FRSs with FMs. Specifically, we: 1) summarise the common approaches of\ncurrent FRSs and FMs; 2) review the challenges posed by FRSs and FMs; 3)\ndiscuss potential future research directions; and 4) introduce some common\nbenchmarks and evaluation metrics in the FRS field. We hope that this position\npaper provides the necessary background and guidance to explore this\ninteresting and emerging topic.\n","authors":["Zhiwei Li","Guodong Long"],"pdf_url":"https://arxiv.org/pdf/2406.00004v2.pdf","comment":"20 pages, position paper"},{"id":"http://arxiv.org/abs/2405.12038v2","updated":"2024-06-04T02:30:49Z","published":"2024-05-20T14:05:35Z","title":"Adaptive Convolutional Forecasting Network Based on Time Series\n  Feature-Driven","summary":"  Time series data in real-world scenarios contain a substantial amount of\nnonlinear information, which significantly interferes with the training process\nof models, leading to decreased prediction performance. Therefore, during the\ntime series forecasting process, extracting the local and global time series\npatterns and understanding the potential nonlinear features among different\ntime observations are highly significant. To address this challenge, we\nintroduce multi-resolution convolution and deformable convolution operations.\nBy enlarging the receptive field using convolution kernels with different\ndilation factors to capture temporal correlation information at different\nresolutions, and adaptively adjusting the sampling positions through additional\noffset vectors, we enhance the network's ability to capture potential nonlinear\nfeatures among time observations. Building upon this, we propose ACNet, an\nadaptive convolutional network designed to effectively model the local and\nglobal temporal dependencies and the nonlinear features between observations in\nmultivariate time series. Specifically, by extracting and fusing time series\nfeatures at different resolutions, we capture both local contextual information\nand global patterns in the time series. The designed nonlinear feature adaptive\nextraction module captures the nonlinear features among different time\nobservations in the time series. We evaluated the performance of ACNet across\ntwelve real-world datasets. The results indicate that ACNet consistently\nachieves state-of-the-art performance in both short-term and long-term\nforecasting tasks with favorable runtime efficiency.\n","authors":["Dandan Zhang","Zhiqiang Zhang","Nanguang Chen","Yun Wang"],"pdf_url":"https://arxiv.org/pdf/2405.12038v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.01906v1","updated":"2024-06-04T02:28:51Z","published":"2024-06-04T02:28:51Z","title":"ProGEO: Generating Prompts through Image-Text Contrastive Learning for\n  Visual Geo-localization","summary":"  Visual Geo-localization (VG) refers to the process to identify the location\ndescribed in query images, which is widely applied in robotics field and\ncomputer vision tasks, such as autonomous driving, metaverse, augmented\nreality, and SLAM. In fine-grained images lacking specific text descriptions,\ndirectly applying pure visual methods to represent neighborhood features often\nleads to the model focusing on overly fine-grained features, unable to fully\nmine the semantic information in the images. Therefore, we propose a two-stage\ntraining method to enhance visual performance and use contrastive learning to\nmine challenging samples. We first leverage the multi-modal description\ncapability of CLIP (Contrastive Language-Image Pretraining) to create a set of\nlearnable text prompts for each geographic image feature to form vague\ndescriptions. Then, by utilizing dynamic text prompts to assist the training of\nthe image encoder, we enable the image encoder to learn better and more\ngeneralizable visual features. This strategy of applying text to purely visual\ntasks addresses the challenge of using multi-modal models for geographic\nimages, which often suffer from a lack of precise descriptions, making them\ndifficult to utilize widely. We validate the effectiveness of the proposed\nstrategy on several large-scale visual geo-localization datasets, and our\nmethod achieves competitive results on multiple visual geo-localization\ndatasets. Our code and model are available at\nhttps://github.com/Chain-Mao/ProGEO.\n","authors":["Chen Mao","Jingqi Hu"],"pdf_url":"https://arxiv.org/pdf/2406.01906v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.07092v3","updated":"2024-06-04T02:09:15Z","published":"2024-02-11T03:27:22Z","title":"Generalizing Conversational Dense Retrieval via LLM-Cognition Data\n  Augmentation","summary":"  Conversational search utilizes muli-turn natural language contexts to\nretrieve relevant passages. Existing conversational dense retrieval models\nmostly view a conversation as a fixed sequence of questions and responses,\noverlooking the severe data sparsity problem -- that is, users can perform a\nconversation in various ways, and these alternate conversations are unrecorded.\nConsequently, they often struggle to generalize to diverse conversations in\nreal-world scenarios. In this work, we propose a framework for generalizing\nConversational dense retrieval via LLM-cognition data Augmentation (ConvAug).\nConvAug first generates multi-level augmented conversations to capture the\ndiverse nature of conversational contexts. Inspired by human cognition, we\ndevise a cognition-aware process to mitigate the generation of false positives,\nfalse negatives, and hallucinations. Moreover, we develop a difficulty-adaptive\nsample filter that selects challenging samples for complex conversations,\nthereby giving the model a larger learning space. A contrastive learning\nobjective is then employed to train a better conversational context encoder.\nExtensive experiments conducted on four public datasets, under both normal and\nzero-shot settings, demonstrate the effectiveness, generalizability, and\napplicability of ConvAug. The code is released at\nhttps://github.com/haon-chen/ConvAug.\n","authors":["Haonan Chen","Zhicheng Dou","Kelong Mao","Jiongnan Liu","Ziliang Zhao"],"pdf_url":"https://arxiv.org/pdf/2402.07092v3.pdf","comment":"ACL 2024"},{"id":"http://arxiv.org/abs/2406.01876v1","updated":"2024-06-04T01:08:00Z","published":"2024-06-04T01:08:00Z","title":"GRAM: Generative Retrieval Augmented Matching of Data Schemas in the\n  Context of Data Security","summary":"  Schema matching constitutes a pivotal phase in the data ingestion process for\ncontemporary database systems. Its objective is to discern pairwise\nsimilarities between two sets of attributes, each associated with a distinct\ndata table. This challenge emerges at the initial stages of data analytics,\nsuch as when incorporating a third-party table into existing databases to\ninform business insights. Given its significance in the realm of database\nsystems, schema matching has been under investigation since the 2000s. This\nstudy revisits this foundational problem within the context of large language\nmodels. Adhering to increasingly stringent data security policies, our focus\nlies on the zero-shot and few-shot scenarios: the model should analyze only a\nminimal amount of customer data to execute the matching task, contrasting with\nthe conventional approach of scrutinizing the entire data table. We emphasize\nthat the zero-shot or few-shot assumption is imperative to safeguard the\nidentity and privacy of customer data, even at the potential cost of accuracy.\nThe capability to accurately match attributes under such stringent requirements\ndistinguishes our work from previous literature in this domain.\n","authors":["Xuanqing Liu","Luyang Kong","Runhui Wang","Patrick Song","Austin Nevins","Henrik Johnson","Nimish Amlathe","Davor Golac"],"pdf_url":"https://arxiv.org/pdf/2406.01876v1.pdf","comment":"KDD 2024 Camera Ready; 11 pages, 8 figures"},{"id":"http://arxiv.org/abs/2405.18570v2","updated":"2024-06-04T20:53:32Z","published":"2024-05-28T20:28:07Z","title":"Its Not a Modality Gap: Characterizing and Addressing the Contrastive\n  Gap","summary":"  Multi-modal contrastive models such as CLIP achieve state-of-the-art\nperformance in zero-shot classification by embedding input images and texts on\na joint representational space. Recently, a modality gap has been reported in\ntwo-encoder contrastive models like CLIP, meaning that the image and text\nembeddings reside in disjoint areas of the latent space. Previous studies\nsuggest that this gap exists due to 1) the cone effect, 2) mismatched pairs in\nthe dataset, and 3) insufficient training. We show that, even when accounting\nfor all these factors, and even when using the same modality, the contrastive\nloss actually creates a gap during training. As a result, We propose that the\nmodality gap is inherent to the two-encoder contrastive loss and rename it the\ncontrastive gap. We present evidence that attributes this contrastive gap to\nlow uniformity in CLIP space, resulting in embeddings that occupy only a small\nportion of the latent space. To close the gap, we adapt the uniformity and\nalignment properties of unimodal contrastive loss to the multi-modal setting\nand show that simply adding these terms to the CLIP loss distributes the\nembeddings more uniformly in the representational space, closing the gap. In\nour experiments, we show that the modified representational space achieves\nbetter performance than default CLIP loss in downstream tasks such as zero-shot\nimage classification and multi-modal arithmetic.\n","authors":["Abrar Fahim","Alex Murphy","Alona Fyshe"],"pdf_url":"https://arxiv.org/pdf/2405.18570v2.pdf","comment":null}],"Machine Learning":[{"id":"http://arxiv.org/abs/2406.02550v1","updated":"2024-06-04T17:59:36Z","published":"2024-06-04T17:59:36Z","title":"Learning to grok: Emergence of in-context learning and skill composition\n  in modular arithmetic tasks","summary":"  Large language models can solve tasks that were not present in the training\nset. This capability is believed to be due to in-context learning and skill\ncomposition. In this work, we study the emergence of in-context learning and\nskill composition in a collection of modular arithmetic tasks. Specifically, we\nconsider a finite collection of linear modular functions $z = a \\, x + b \\, y\n\\;\\mathrm{mod}\\; p$ labeled by the vector $(a, b) \\in \\mathbb{Z}_p^2$. We use\nsome of these tasks for pre-training and the rest for out-of-distribution\ntesting. We empirically show that a GPT-style transformer exhibits a transition\nfrom in-distribution to out-of-distribution generalization as the number of\npre-training tasks increases. We find that the smallest model capable of\nout-of-distribution generalization requires two transformer blocks, while for\ndeeper models, the out-of-distribution generalization phase is\n\\emph{transient}, necessitating early stopping. Finally, we perform an\ninterpretability study of the pre-trained models, revealing the highly\nstructured representations in both phases; and discuss the learnt algorithm.\n","authors":["Tianyu He","Darshil Doshi","Aritra Das","Andrey Gromov"],"pdf_url":"https://arxiv.org/pdf/2406.02550v1.pdf","comment":"21 pages, 19 figures"},{"id":"http://arxiv.org/abs/2406.02545v1","updated":"2024-06-04T17:58:33Z","published":"2024-06-04T17:58:33Z","title":"Robust and highly scalable estimation of directional couplings from\n  time-shifted signals","summary":"  The estimation of directed couplings between the nodes of a network from\nindirect measurements is a central methodological challenge in scientific\nfields such as neuroscience, systems biology and economics. Unfortunately, the\nproblem is generally ill-posed due to the possible presence of unknown delays\nin the measurements. In this paper, we offer a solution of this problem by\nusing a variational Bayes framework, where the uncertainty over the delays is\nmarginalized in order to obtain conservative coupling estimates. To overcome\nthe well-known overconfidence of classical variational methods, we use a\nhybrid-VI scheme where the (possibly flat or multimodal) posterior over the\nmeasurement parameters is estimated using a forward KL loss while the (nearly\nconvex) conditional posterior over the couplings is estimated using the highly\nscalable gradient-based VI. In our ground-truth experiments, we show that the\nnetwork provides reliable and conservative estimates of the couplings, greatly\noutperforming similar methods such as regression DCM.\n","authors":["Luca Ambrogioni","Louis Rouillard","Demian Wassermann"],"pdf_url":"https://arxiv.org/pdf/2406.02545v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.02543v1","updated":"2024-06-04T17:58:18Z","published":"2024-06-04T17:58:18Z","title":"To Believe or Not to Believe Your LLM","summary":"  We explore uncertainty quantification in large language models (LLMs), with\nthe goal to identify when uncertainty in responses given a query is large. We\nsimultaneously consider both epistemic and aleatoric uncertainties, where the\nformer comes from the lack of knowledge about the ground truth (such as about\nfacts or the language), and the latter comes from irreducible randomness (such\nas multiple possible answers). In particular, we derive an\ninformation-theoretic metric that allows to reliably detect when only epistemic\nuncertainty is large, in which case the output of the model is unreliable. This\ncondition can be computed based solely on the output of the model obtained\nsimply by some special iterative prompting based on the previous responses.\nSuch quantification, for instance, allows to detect hallucinations (cases when\nepistemic uncertainty is high) in both single- and multi-answer responses. This\nis in contrast to many standard uncertainty quantification strategies (such as\nthresholding the log-likelihood of a response) where hallucinations in the\nmulti-answer case cannot be detected. We conduct a series of experiments which\ndemonstrate the advantage of our formulation. Further, our investigations shed\nsome light on how the probabilities assigned to a given output by an LLM can be\namplified by iterative prompting, which might be of independent interest.\n","authors":["Yasin Abbasi Yadkori","Ilja Kuzborskij","András György","Csaba Szepesvári"],"pdf_url":"https://arxiv.org/pdf/2406.02543v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.02542v1","updated":"2024-06-04T17:58:03Z","published":"2024-06-04T17:58:03Z","title":"Loki: Low-Rank Keys for Efficient Sparse Attention","summary":"  Inference on large language models can be expensive in terms of the compute\nand memory costs involved, especially when long sequence lengths are used. In\nparticular, the self-attention mechanism used in such models contributes\nsignificantly to these costs, which has resulted in several recent works that\npropose sparse attention approximations for inference. In this work, we propose\nto approximate the self-attention computation by focusing on the dimensionality\nof key vectors computed in the attention block. Our analysis reveals that the\nkey vectors lie in a significantly lower-dimensional space, consistently across\nseveral datasets and models. Exploiting this observation, we propose Loki, a\nnovel sparse attention method that ranks and selects tokens in the KV-cache\nbased on attention scores computed in low-dimensional space. Our evaluations\nshow that Loki is able to maintain the efficacy of the models better than other\npopular approximation methods, while speeding up the attention computation due\nto reduced data movement (load/store) and compute costs.\n","authors":["Prajwal Singhania","Siddharth Singh","Shwai He","Soheil Feizi","Abhinav Bhatele"],"pdf_url":"https://arxiv.org/pdf/2406.02542v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.02539v1","updated":"2024-06-04T17:56:28Z","published":"2024-06-04T17:56:28Z","title":"Parrot: Multilingual Visual Instruction Tuning","summary":"  The rapid development of Multimodal Large Language Models (MLLMs) like GPT-4V\nhas marked a significant step towards artificial general intelligence. Existing\nmethods mainly focus on aligning vision encoders with LLMs through supervised\nfine-tuning (SFT) to endow LLMs with multimodal abilities, making MLLMs'\ninherent ability to react to multiple languages progressively deteriorate as\nthe training process evolves. We empirically find that the imbalanced SFT\ndatasets, primarily composed of English-centric image-text pairs, lead to\nsignificantly reduced performance in non-English languages. This is due to the\nfailure of aligning the vision encoder and LLM with multilingual tokens during\nthe SFT process. In this paper, we introduce Parrot, a novel method that\nutilizes textual guidance to drive visual token alignment at the language\nlevel. Parrot makes the visual tokens condition on diverse language inputs and\nuses Mixture-of-Experts (MoE) to promote the alignment of multilingual tokens.\nSpecifically, to enhance non-English visual tokens alignment, we compute the\ncross-attention using the initial visual features and textual embeddings, the\nresult of which is then fed into the MoE router to select the most relevant\nexperts. The selected experts subsequently convert the initial visual tokens\ninto language-specific visual tokens. Moreover, considering the current lack of\nbenchmarks for evaluating multilingual capabilities within the field, we\ncollect and make available a Massive Multilingual Multimodal Benchmark which\nincludes 6 languages, 15 categories, and 12,000 questions, named as MMMB. Our\nmethod not only demonstrates state-of-the-art performance on multilingual\nMMBench and MMMB, but also excels across a broad range of multimodal tasks.\nBoth the source code and the training dataset of Parrot will be made publicly\navailable.\n","authors":["Hai-Long Sun","Da-Wei Zhou","Yang Li","Shiyin Lu","Chao Yi","Qing-Guo Chen","Zhao Xu","Weihua Luo","Kaifu Zhang","De-Chuan Zhan","Han-Jia Ye"],"pdf_url":"https://arxiv.org/pdf/2406.02539v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.02537v1","updated":"2024-06-04T17:55:43Z","published":"2024-06-04T17:55:43Z","title":"TopViewRS: Vision-Language Models as Top-View Spatial Reasoners","summary":"  Top-view perspective denotes a typical way in which humans read and reason\nover different types of maps, and it is vital for localization and navigation\nof humans as well as of `non-human' agents, such as the ones backed by large\nVision-Language Models (VLMs). Nonetheless, spatial reasoning capabilities of\nmodern VLMs remain unattested and underexplored. In this work, we thus study\ntheir capability to understand and reason over spatial relations from the top\nview. The focus on top view also enables controlled evaluations at different\ngranularity of spatial reasoning; we clearly disentangle different abilities\n(e.g., recognizing particular objects versus understanding their relative\npositions). We introduce the TopViewRS (Top-View Reasoning in Space) dataset,\nconsisting of 11,384 multiple-choice questions with either realistic or\nsemantic top-view map as visual input. We then use it to study and evaluate\nVLMs across 4 perception and reasoning tasks with different levels of\ncomplexity. Evaluation of 10 representative open- and closed-source VLMs\nreveals the gap of more than 50% compared to average human performance, and it\nis even lower than the random baseline in some cases. Although additional\nexperiments show that Chain-of-Thought reasoning can boost model capabilities\nby 5.82% on average, the overall performance of VLMs remains limited. Our\nfindings underscore the critical need for enhanced model capability in top-view\nspatial reasoning and set a foundation for further research towards human-level\nproficiency of VLMs in real-world multimodal tasks.\n","authors":["Chengzu Li","Caiqi Zhang","Han Zhou","Nigel Collier","Anna Korhonen","Ivan Vulić"],"pdf_url":"https://arxiv.org/pdf/2406.02537v1.pdf","comment":"9 pages, 3 figures, 3 tables (21 pages, 4 figures, 15 tables\n  including references and appendices)"},{"id":"http://arxiv.org/abs/2406.02536v1","updated":"2024-06-04T17:55:38Z","published":"2024-06-04T17:55:38Z","title":"Mitigate Position Bias in Large Language Models via Scaling a Single\n  Dimension","summary":"  Large Language Models (LLMs) are increasingly applied in various real-world\nscenarios due to their excellent generalization capabilities and robust\ngenerative abilities. However, they exhibit position bias, also known as \"lost\nin the middle\", a phenomenon that is especially pronounced in long-context\nscenarios, which indicates the placement of the key information in different\npositions of a prompt can significantly affect accuracy. This paper first\nexplores the micro-level manifestations of position bias, concluding that\nattention weights are a micro-level expression of position bias. It further\nidentifies that, in addition to position embeddings, causal attention mask also\ncontributes to position bias by creating position-specific hidden states. Based\non these insights, we propose a method to mitigate position bias by scaling\nthis positional hidden states. Experiments on the NaturalQuestions\nMulti-document QA, KV retrieval, LongBench and timeline reorder tasks, using\nvarious models including RoPE models, context windowextended models, and Alibi\nmodels, demonstrate the effectiveness and generalizability of our approach. Our\nmethod can improve performance by up to 15.2% by modifying just one dimension\nof hidden states. Our code is available at https://aka.ms/PositionalHidden.\n","authors":["Yijiong Yu","Huiqiang Jiang","Xufang Luo","Qianhui Wu","Chin-Yew Lin","Dongsheng Li","Yuqing Yang","Yongfeng Huang","Lili Qiu"],"pdf_url":"https://arxiv.org/pdf/2406.02536v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.02534v1","updated":"2024-06-04T17:54:44Z","published":"2024-06-04T17:54:44Z","title":"Enhancing predictive imaging biomarker discovery through treatment\n  effect analysis","summary":"  Identifying predictive biomarkers, which forecast individual treatment\neffectiveness, is crucial for personalized medicine and informs decision-making\nacross diverse disciplines. These biomarkers are extracted from pre-treatment\ndata, often within randomized controlled trials, and have to be distinguished\nfrom prognostic biomarkers, which are independent of treatment assignment. Our\nstudy focuses on the discovery of predictive imaging biomarkers, aiming to\nleverage pre-treatment images to unveil new causal relationships. Previous\napproaches relied on labor-intensive handcrafted or manually derived features,\nwhich may introduce biases. In response, we present a new task of discovering\npredictive imaging biomarkers directly from the pre-treatment images to learn\nrelevant image features. We propose an evaluation protocol for this task to\nassess a model's ability to identify predictive imaging biomarkers and\ndifferentiate them from prognostic ones. It employs statistical testing and a\ncomprehensive analysis of image feature attribution. We explore the suitability\nof deep learning models originally designed for estimating the conditional\naverage treatment effect (CATE) for this task, which previously have been\nprimarily assessed for the precision of CATE estimation, overlooking the\nevaluation of imaging biomarker discovery. Our proof-of-concept analysis\ndemonstrates promising results in discovering and validating predictive imaging\nbiomarkers from synthetic outcomes and real-world image datasets.\n","authors":["Shuhan Xiao","Lukas Klein","Jens Petersen","Philipp Vollmuth","Paul F. Jaeger","Klaus H. Maier-Hein"],"pdf_url":"https://arxiv.org/pdf/2406.02534v1.pdf","comment":"19 pages, 12 figures"},{"id":"http://arxiv.org/abs/2406.02529v1","updated":"2024-06-04T17:51:08Z","published":"2024-06-04T17:51:08Z","title":"ReLUs Are Sufficient for Learning Implicit Neural Representations","summary":"  Motivated by the growing theoretical understanding of neural networks that\nemploy the Rectified Linear Unit (ReLU) as their activation function, we\nrevisit the use of ReLU activation functions for learning implicit neural\nrepresentations (INRs). Inspired by second order B-spline wavelets, we\nincorporate a set of simple constraints to the ReLU neurons in each layer of a\ndeep neural network (DNN) to remedy the spectral bias. This in turn enables its\nuse for various INR tasks. Empirically, we demonstrate that, contrary to\npopular belief, one can learn state-of-the-art INRs based on a DNN composed of\nonly ReLU neurons. Next, by leveraging recent theoretical works which\ncharacterize the kinds of functions ReLU neural networks learn, we provide a\nway to quantify the regularity of the learned function. This offers a\nprincipled approach to selecting the hyperparameters in INR architectures. We\nsubstantiate our claims through experiments in signal representation, super\nresolution, and computed tomography, demonstrating the versatility and\neffectiveness of our method. The code for all experiments can be found at\nhttps://github.com/joeshenouda/relu-inrs.\n","authors":["Joseph Shenouda","Yamin Zhou","Robert D. Nowak"],"pdf_url":"https://arxiv.org/pdf/2406.02529v1.pdf","comment":"Accepted to ICML 2024"},{"id":"http://arxiv.org/abs/2402.03496v4","updated":"2024-06-04T17:47:41Z","published":"2024-02-05T20:15:19Z","title":"Can We Remove the Square-Root in Adaptive Gradient Methods? A\n  Second-Order Perspective","summary":"  Adaptive gradient optimizers like Adam(W) are the default training algorithms\nfor many deep learning architectures, such as transformers. Their diagonal\npreconditioner is based on the gradient outer product which is incorporated\ninto the parameter update via a square root. While these methods are often\nmotivated as approximate second-order methods, the square root represents a\nfundamental difference. In this work, we investigate how the behavior of\nadaptive methods changes when we remove the root, i.e. strengthen their\nsecond-order motivation. Surprisingly, we find that such square-root-free\nadaptive methods close the generalization gap to SGD on convolutional\narchitectures, while maintaining their root-based counterpart's performance on\ntransformers. The second-order perspective also has practical benefits for the\ndevelopment of non-diagonal adaptive methods through the concept of\npreconditioner invariance. In contrast to root-based methods like Shampoo, the\nroot-free counterparts do not require numerically unstable matrix root\ndecompositions and inversions, thus work well in half precision. Our findings\nprovide new insights into the development of adaptive methods and raise\nimportant questions regarding the currently overlooked role of adaptivity for\ntheir success.\n","authors":["Wu Lin","Felix Dangel","Runa Eschenhagen","Juhan Bae","Richard E. Turner","Alireza Makhzani"],"pdf_url":"https://arxiv.org/pdf/2402.03496v4.pdf","comment":"A long version of the ICML 2024 paper. Updated Sec 4 to emphasize the\n  concept of preconditioner invariance"},{"id":"http://arxiv.org/abs/2406.02523v1","updated":"2024-06-04T17:41:31Z","published":"2024-06-04T17:41:31Z","title":"RoboCasa: Large-Scale Simulation of Everyday Tasks for Generalist Robots","summary":"  Recent advancements in Artificial Intelligence (AI) have largely been\npropelled by scaling. In Robotics, scaling is hindered by the lack of access to\nmassive robot datasets. We advocate using realistic physical simulation as a\nmeans to scale environments, tasks, and datasets for robot learning methods. We\npresent RoboCasa, a large-scale simulation framework for training generalist\nrobots in everyday environments. RoboCasa features realistic and diverse scenes\nfocusing on kitchen environments. We provide thousands of 3D assets across over\n150 object categories and dozens of interactable furniture and appliances. We\nenrich the realism and diversity of our simulation with generative AI tools,\nsuch as object assets from text-to-3D models and environment textures from\ntext-to-image models. We design a set of 100 tasks for systematic evaluation,\nincluding composite tasks generated by the guidance of large language models.\nTo facilitate learning, we provide high-quality human demonstrations and\nintegrate automated trajectory generation methods to substantially enlarge our\ndatasets with minimal human burden. Our experiments show a clear scaling trend\nin using synthetically generated robot data for large-scale imitation learning\nand show great promise in harnessing simulation data in real-world tasks.\nVideos and open-source code are available at https://robocasa.ai/\n","authors":["Soroush Nasiriany","Abhiram Maddukuri","Lance Zhang","Adeet Parikh","Aaron Lo","Abhishek Joshi","Ajay Mandlekar","Yuke Zhu"],"pdf_url":"https://arxiv.org/pdf/2406.02523v1.pdf","comment":"RSS 2024"},{"id":"http://arxiv.org/abs/2406.02515v1","updated":"2024-06-04T17:38:24Z","published":"2024-06-04T17:38:24Z","title":"Uncertainty of Joint Neural Contextual Bandit","summary":"  Contextual bandit learning is increasingly favored in modern large-scale\nrecommendation systems. To better utlize the contextual information and\navailable user or item features, the integration of neural networks have been\nintroduced to enhance contextual bandit learning and has triggered significant\ninterest from both academia and industry. However, a major challenge arises\nwhen implementing a disjoint neural contextual bandit solution in large-scale\nrecommendation systems, where each item or user may correspond to a separate\nbandit arm. The huge number of items to recommend poses a significant hurdle\nfor real world production deployment. This paper focuses on a joint neural\ncontextual bandit solution which serves all recommending items in one single\nmodel. The output consists of a predicted reward $\\mu$, an uncertainty $\\sigma$\nand a hyper-parameter $\\alpha$ which balances exploitation and exploration,\ne.g., $\\mu + \\alpha \\sigma$.\n  The tuning of the parameter $\\alpha$ is typically heuristic and complex in\npractice due to its stochastic nature. To address this challenge, we provide\nboth theoretical analysis and experimental findings regarding the uncertainty\n$\\sigma$ of the joint neural contextual bandit model. Our analysis reveals that\n$\\alpha$ demonstrates an approximate square root relationship with the size of\nthe last hidden layer $F$ and inverse square root relationship with the amount\nof training data $N$, i.e., $\\sigma \\propto \\sqrt{\\frac{F}{N}}$. The\nexperiments, conducted with real industrial data, align with the theoretical\nanalysis, help understanding model behaviors and assist the hyper-parameter\ntuning during both offline training and online deployment.\n","authors":["Hongbo Guo","Zheqing Zhu"],"pdf_url":"https://arxiv.org/pdf/2406.02515v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.02510v1","updated":"2024-06-04T17:29:21Z","published":"2024-06-04T17:29:21Z","title":"Fairness-Optimized Synthetic EHR Generation for Arbitrary Downstream\n  Predictive Tasks","summary":"  Among various aspects of ensuring the responsible design of AI tools for\nhealthcare applications, addressing fairness concerns has been a key focus\narea. Specifically, given the wide spread of electronic health record (EHR)\ndata and their huge potential to inform a wide range of clinical decision\nsupport tasks, improving fairness in this category of health AI tools is of key\nimportance. While such a broad problem (that is, mitigating fairness in\nEHR-based AI models) has been tackled using various methods, task- and\nmodel-agnostic methods are noticeably rare. In this study, we aimed to target\nthis gap by presenting a new pipeline that generates synthetic EHR data, which\nis not only consistent with (faithful to) the real EHR data but also can reduce\nthe fairness concerns (defined by the end-user) in the downstream tasks, when\ncombined with the real data. We demonstrate the effectiveness of our proposed\npipeline across various downstream tasks and two different EHR datasets. Our\nproposed pipeline can add a widely applicable and complementary tool to the\nexisting toolbox of methods to address fairness in health AI applications such\nas those modifying the design of a downstream model. The codebase for our\nproject is available at https://github.com/healthylaife/FairSynth\n","authors":["Mirza Farhan Bin Tarek","Raphael Poulain","Rahmatollah Beheshti"],"pdf_url":"https://arxiv.org/pdf/2406.02510v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.02741v2","updated":"2024-06-04T17:26:30Z","published":"2024-03-05T07:51:38Z","title":"State-Constrained Zero-Sum Differential Games with One-Sided Information","summary":"  We study zero-sum differential games with state constraints and one-sided\ninformation, where the informed player (Player 1) has a categorical payoff type\nunknown to the uninformed player (Player 2). The goal of Player 1 is to\nminimize his payoff without violating the constraints, while that of Player 2\nis to violate the state constraints if possible, or to maximize the payoff\notherwise. One example of the game is a man-to-man matchup in football. Without\nstate constraints, Cardaliaguet (2007) showed that the value of such a game\nexists and is convex to the common belief of players. Our theoretical\ncontribution is an extension of this result to games with state constraints and\nthe derivation of the primal and dual subdynamic principles necessary for\ncomputing behavioral strategies. Different from existing works that are\nconcerned about the scalability of no-regret learning in games with discrete\ndynamics, our study reveals the underlying structure of strategies for belief\nmanipulation resulting from information asymmetry and state constraints. This\nstructure will be necessary for scalable learning on games with continuous\nactions and long time windows. We use a simplified football game to demonstrate\nthe utility of this work, where we reveal player positions and belief states in\nwhich the attacker should (or should not) play specific random deceptive moves\nto take advantage of information asymmetry, and compute how the defender should\nrespond.\n","authors":["Mukesh Ghimire","Lei Zhang","Zhe Xu","Yi Ren"],"pdf_url":"https://arxiv.org/pdf/2403.02741v2.pdf","comment":"Accepted to ICML 2024"},{"id":"http://arxiv.org/abs/2406.02507v1","updated":"2024-06-04T17:25:59Z","published":"2024-06-04T17:25:59Z","title":"Guiding a Diffusion Model with a Bad Version of Itself","summary":"  The primary axes of interest in image-generating diffusion models are image\nquality, the amount of variation in the results, and how well the results align\nwith a given condition, e.g., a class label or a text prompt. The popular\nclassifier-free guidance approach uses an unconditional model to guide a\nconditional model, leading to simultaneously better prompt alignment and\nhigher-quality images at the cost of reduced variation. These effects seem\ninherently entangled, and thus hard to control. We make the surprising\nobservation that it is possible to obtain disentangled control over image\nquality without compromising the amount of variation by guiding generation\nusing a smaller, less-trained version of the model itself rather than an\nunconditional model. This leads to significant improvements in ImageNet\ngeneration, setting record FIDs of 1.01 for 64x64 and 1.25 for 512x512, using\npublicly available networks. Furthermore, the method is also applicable to\nunconditional diffusion models, drastically improving their quality.\n","authors":["Tero Karras","Miika Aittala","Tuomas Kynkäänniemi","Jaakko Lehtinen","Timo Aila","Samuli Laine"],"pdf_url":"https://arxiv.org/pdf/2406.02507v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.02500v1","updated":"2024-06-04T17:18:40Z","published":"2024-06-04T17:18:40Z","title":"Demystifying the Compression of Mixture-of-Experts Through a Unified\n  Framework","summary":"  Scaling large language models has revolutionized the performance across\ndiverse domains, yet the continual growth in model size poses significant\nchallenges for real-world deployment. The Mixture of Experts (MoE) approach\naddresses this by dynamically selecting and activating only a subset of\nexperts, significantly reducing computational costs while maintaining high\nperformance. However, MoE introduces potential redundancy (e.g., parameters)\nand extra costs (e.g., communication overhead). Despite numerous compression\ntechniques developed for mitigating the redundancy in dense models, the\ncompression of MoE remains under-explored. We first bridge this gap with a\ncutting-edge unified framework that not only seamlessly integrates mainstream\ncompression methods but also helps systematically understand MoE compression.\nThis framework approaches compression from two perspectives: Expert Slimming\nwhich compresses individual experts and Expert Trimming which removes\nstructured modules. Within this framework, we explore the optimization space\nunexplored by existing methods,and further introduce aggressive Expert Trimming\ntechniques, i.e., Layer Drop and Block Drop, to eliminate redundancy at larger\nscales. Based on these insights,we present a comprehensive recipe to guide\npractitioners in compressing MoE effectively. Extensive experimental results\ndemonstrate the effectiveness of the compression methods under our framework\nand the proposed recipe, achieving a 6.05x speedup and only 20.0GB memory usage\nwhile maintaining over 92% of performance on Mixtral-8x7B.\n","authors":["Shwai He","Daize Dong","Liang Ding","Ang Li"],"pdf_url":"https://arxiv.org/pdf/2406.02500v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.07335v3","updated":"2024-06-04T17:17:53Z","published":"2023-12-12T14:53:18Z","title":"Momentum Particle Maximum Likelihood","summary":"  Maximum likelihood estimation (MLE) of latent variable models is often recast\nas the minimization of a free energy functional over an extended space of\nparameters and probability distributions. This perspective was recently\ncombined with insights from optimal transport to obtain novel particle-based\nalgorithms for fitting latent variable models to data. Drawing inspiration from\nprior works which interpret `momentum-enriched' optimization algorithms as\ndiscretizations of ordinary differential equations, we propose an analogous\ndynamical-systems-inspired approach to minimizing the free energy functional.\nThe result is a dynamical system that blends elements of Nesterov's Accelerated\nGradient method, the underdamped Langevin diffusion, and particle methods.\nUnder suitable assumptions, we prove that the continuous-time system minimizes\nthe functional. By discretizing the system, we obtain a practical algorithm for\nMLE in latent variable models. The algorithm outperforms existing particle\nmethods in numerical experiments and compares favourably with other MLE\nalgorithms.\n","authors":["Jen Ning Lim","Juan Kuntz","Samuel Power","Adam M. Johansen"],"pdf_url":"https://arxiv.org/pdf/2312.07335v3.pdf","comment":"ICML 2024 camera ready"},{"id":"http://arxiv.org/abs/2406.02497v1","updated":"2024-06-04T17:15:25Z","published":"2024-06-04T17:15:25Z","title":"Dropout MPC: An Ensemble Neural MPC Approach for Systems with Learned\n  Dynamics","summary":"  Neural networks are lately more and more often being used in the context of\ndata-driven control, as an approximate model of the true system dynamics. Model\nPredictive Control (MPC) adopts this practise leading to neural MPC strategies.\nThis raises a question of whether the trained neural network has converged and\ngeneralized in a way that the learned model encapsulates an accurate\napproximation of the true dynamic model of the system, thus making it a\nreliable choice for model-based control, especially for disturbed and uncertain\nsystems. To tackle that, we propose Dropout MPC, a novel sampling-based\nensemble neural MPC algorithm that employs the Monte-Carlo dropout technique on\nthe learned system model. The closed loop is based on an ensemble of predictive\ncontrollers, that are used simultaneously at each time-step for trajectory\noptimization. Each member of the ensemble influences the control input, based\non a weighted voting scheme, thus by employing different realizations of the\nlearned system dynamics, neural control becomes more reliable by design. An\nadditional strength of the method is that it offers by design a way to estimate\nfuture uncertainty, leading to cautious control. While the method aims in\ngeneral at uncertain systems with complex dynamics, where models derived from\nfirst principles are hard to infer, to showcase the application we utilize data\ngathered in the laboratory from a real mobile manipulator and employ the\nproposed algorithm for the navigation of the robot in simulation.\n","authors":["Spyridon Syntakas","Kostas Vlachos"],"pdf_url":"https://arxiv.org/pdf/2406.02497v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.02496v1","updated":"2024-06-04T17:14:31Z","published":"2024-06-04T17:14:31Z","title":"Kolmogorov-Arnold Networks for Time Series: Bridging Predictive Power\n  and Interpretability","summary":"  Kolmogorov-Arnold Networks (KAN) is a groundbreaking model recently proposed\nby the MIT team, representing a revolutionary approach with the potential to be\na game-changer in the field. This innovative concept has rapidly garnered\nworldwide interest within the AI community. Inspired by the Kolmogorov-Arnold\nrepresentation theorem, KAN utilizes spline-parametrized univariate functions\nin place of traditional linear weights, enabling them to dynamically learn\nactivation patterns and significantly enhancing interpretability. In this\npaper, we explore the application of KAN to time series forecasting and propose\ntwo variants: T-KAN and MT-KAN. T-KAN is designed to detect concept drift\nwithin time series and can explain the nonlinear relationships between\npredictions and previous time steps through symbolic regression, making it\nhighly interpretable in dynamically changing environments. MT-KAN, on the other\nhand, improves predictive performance by effectively uncovering and leveraging\nthe complex relationships among variables in multivariate time series.\nExperiments validate the effectiveness of these approaches, demonstrating that\nT-KAN and MT-KAN significantly outperform traditional methods in time series\nforecasting tasks, not only enhancing predictive accuracy but also improving\nmodel interpretability. This research opens new avenues for adaptive\nforecasting models, highlighting the potential of KAN as a powerful and\ninterpretable tool in predictive analytics.\n","authors":["Kunpeng Xu","Lifei Chen","Shengrui Wang"],"pdf_url":"https://arxiv.org/pdf/2406.02496v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.14202v2","updated":"2024-06-04T17:11:02Z","published":"2024-02-22T01:07:48Z","title":"Comparing Graph Transformers via Positional Encodings","summary":"  The distinguishing power of graph transformers is closely tied to the choice\nof positional encoding: features used to augment the base transformer with\ninformation about the graph. There are two primary types of positional\nencoding: absolute positional encodings (APEs) and relative positional\nencodings (RPEs). APEs assign features to each node and are given as input to\nthe transformer. RPEs instead assign a feature to each pair of nodes, e.g.,\ngraph distance, and are used to augment the attention block. A priori, it is\nunclear which method is better for maximizing the power of the resulting graph\ntransformer. In this paper, we aim to understand the relationship between these\ndifferent types of positional encodings. Interestingly, we show that graph\ntransformers using APEs and RPEs are equivalent in terms of distinguishing\npower. In particular, we demonstrate how to interchange APEs and RPEs while\nmaintaining their distinguishing power in terms of graph transformers. Based on\nour theoretical results, we provide a study on several APEs and RPEs (including\nthe resistance distance and the recently introduced stable and expressive\npositional encoding (SPE)) and compare their distinguishing power in terms of\ntransformers. We believe our work will help navigate the huge number of choices\nof positional encoding and will provide guidance on the future design of\npositional encodings for graph transformers.\n","authors":["Mitchell Black","Zhengchao Wan","Gal Mishne","Amir Nayyeri","Yusu Wang"],"pdf_url":"https://arxiv.org/pdf/2402.14202v2.pdf","comment":"accepted to ICML 2024"},{"id":"http://arxiv.org/abs/2402.02328v3","updated":"2024-06-04T17:05:20Z","published":"2024-02-04T03:03:27Z","title":"Sample Complexity of Algorithm Selection Using Neural Networks and Its\n  Applications to Branch-and-Cut","summary":"  Data-driven algorithm design is a paradigm that uses statistical and machine\nlearning techniques to select from a class of algorithms for a computational\nproblem an algorithm that has the best expected performance with respect to\nsome (unknown) distribution on the instances of the problem. We build upon\nrecent work in this line of research by considering the setup where, instead of\nselecting a single algorithm that has the best performance, we allow the\npossibility of selecting an algorithm based on the instance to be solved, using\nneural networks. In particular, given a representative sample of instances, we\nlearn a neural network that maps an instance of the problem to the most\nappropriate algorithm for that instance. We formalize this idea and derive\nrigorous sample complexity bounds for this learning problem, in the spirit of\nrecent work in data-driven algorithm design. We then apply this approach to the\nproblem of making good decisions in the branch-and-cut framework for\nmixed-integer optimization (e.g., which cut to add?). In other words, the\nneural network will take as input a mixed-integer optimization instance and\noutput a decision that will result in a small branch-and-cut tree for that\ninstance. Our computational results provide evidence that our particular way of\nusing neural networks for cut selection can make a significant impact in\nreducing branch-and-cut tree sizes, compared to previous data-driven\napproaches.\n","authors":["Hongyu Cheng","Sammy Khalife","Barbara Fiedorowicz","Amitabh Basu"],"pdf_url":"https://arxiv.org/pdf/2402.02328v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.02490v1","updated":"2024-06-04T17:00:14Z","published":"2024-06-04T17:00:14Z","title":"Ai-Sampler: Adversarial Learning of Markov kernels with involutive maps","summary":"  Markov chain Monte Carlo methods have become popular in statistics as\nversatile techniques to sample from complicated probability distributions. In\nthis work, we propose a method to parameterize and train transition kernels of\nMarkov chains to achieve efficient sampling and good mixing. This training\nprocedure minimizes the total variation distance between the stationary\ndistribution of the chain and the empirical distribution of the data. Our\napproach leverages involutive Metropolis-Hastings kernels constructed from\nreversible neural networks that ensure detailed balance by construction. We\nfind that reversibility also implies $C_2$-equivariance of the discriminator\nfunction which can be used to restrict its function space.\n","authors":["Evgenii Egorov","Ricardo Valperga","Efstratios Gavves"],"pdf_url":"https://arxiv.org/pdf/2406.02490v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.07134v2","updated":"2024-06-04T16:57:16Z","published":"2024-03-11T20:04:03Z","title":"COMQ: A Backpropagation-Free Algorithm for Post-Training Quantization","summary":"  Post-training quantization (PTQ) has emerged as a practical approach to\ncompress large neural networks, making them highly efficient for deployment.\nHowever, effectively reducing these models to their low-bit counterparts\nwithout compromising the original accuracy remains a key challenge. In this\npaper, we propose an innovative PTQ algorithm termed COMQ, which sequentially\nconducts coordinate-wise minimization of the layer-wise reconstruction errors.\nWe consider the widely used integer quantization, where every quantized weight\ncan be decomposed into a shared floating-point scalar and an integer bit-code.\nWithin a fixed layer, COMQ treats all the scaling factor(s) and bit-codes as\nthe variables of the reconstruction error. Every iteration improves this error\nalong a single coordinate while keeping all other variables constant. COMQ is\neasy to use and requires no hyper-parameter tuning. It instead involves only\ndot products and rounding operations. We update these variables in a carefully\ndesigned greedy order, significantly enhancing the accuracy. COMQ achieves\nremarkable results in quantizing 4-bit Vision Transformers, with a negligible\nloss of less than 1% in Top-1 accuracy. In 4-bit INT quantization of\nconvolutional neural networks, COMQ maintains near-lossless accuracy with a\nminimal drop of merely 0.3% in Top-1 accuracy.\n","authors":["Aozhong Zhang","Zi Yang","Naigang Wang","Yingyong Qin","Jack Xin","Xin Li","Penghang Yin"],"pdf_url":"https://arxiv.org/pdf/2403.07134v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.02486v1","updated":"2024-06-04T16:55:42Z","published":"2024-06-04T16:55:42Z","title":"A Temporal Kolmogorov-Arnold Transformer for Time Series Forecasting","summary":"  Capturing complex temporal patterns and relationships within multivariate\ndata streams is a difficult task. We propose the Temporal Kolmogorov-Arnold\nTransformer (TKAT), a novel attention-based architecture designed to address\nthis task using Temporal Kolmogorov-Arnold Networks (TKANs). Inspired by the\nTemporal Fusion Transformer (TFT), TKAT emerges as a powerful encoder-decoder\nmodel tailored to handle tasks in which the observed part of the features is\nmore important than the a priori known part. This new architecture combined the\ntheoretical foundation of the Kolmogorov-Arnold representation with the power\nof transformers. TKAT aims to simplify the complex dependencies inherent in\ntime series, making them more \"interpretable\". The use of transformer\narchitecture in this framework allows us to capture long-range dependencies\nthrough self-attention mechanisms.\n","authors":["Remi Genet","Hugo Inzirillo"],"pdf_url":"https://arxiv.org/pdf/2406.02486v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.02477v1","updated":"2024-06-04T16:47:47Z","published":"2024-06-04T16:47:47Z","title":"Inpainting Pathology in Lumbar Spine MRI with Latent Diffusion","summary":"  Data driven models for automated diagnosis in radiology suffer from\ninsufficient and imbalanced datasets due to low representation of pathology in\na population and the cost of expert annotations. Datasets can be bolstered\nthrough data augmentation. However, even when utilizing a full suite of\ntransformations during model training, typical data augmentations do not\naddress variations in human anatomy. An alternative direction is to synthesize\ndata using generative models, which can potentially craft datasets with\nspecific attributes. While this holds promise, commonly used generative models\nsuch as Generative Adversarial Networks may inadvertently produce anatomically\ninaccurate features. On the other hand, diffusion models, which offer greater\nstability, tend to memorize training data, raising concerns about privacy and\ngenerative diversity. Alternatively, inpainting has the potential to augment\ndata through directly inserting pathology in medical images. However, this\napproach introduces a new challenge: accurately merging the generated\npathological features with the surrounding anatomical context. While inpainting\nis a well established method for addressing simple lesions, its application to\npathologies that involve complex structural changes remains relatively\nunexplored. We propose an efficient method for inpainting pathological features\nonto healthy anatomy in MRI through voxelwise noise scheduling in a latent\ndiffusion model. We evaluate the method's ability to insert disc herniation and\ncentral canal stenosis in lumbar spine sagittal T2 MRI, and it achieves\nsuperior Frechet Inception Distance compared to state-of-the-art methods.\n","authors":["Colin Hansen","Simas Glinskis","Ashwin Raju","Micha Kornreich","JinHyeong Park","Jayashri Pawar","Richard Herzog","Li Zhang","Benjamin Odry"],"pdf_url":"https://arxiv.org/pdf/2406.02477v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.02470v1","updated":"2024-06-04T16:40:55Z","published":"2024-06-04T16:40:55Z","title":"Meta-Designing Quantum Experiments with Language Models","summary":"  Artificial Intelligence (AI) has the potential to significantly advance\nscientific discovery by finding solutions beyond human capabilities. However,\nthese super-human solutions are often unintuitive and require considerable\neffort to uncover underlying principles, if possible at all. Here, we show how\na code-generating language model trained on synthetic data can not only find\nsolutions to specific problems but can create meta-solutions, which solve an\nentire class of problems in one shot and simultaneously offer insight into the\nunderlying design principles. Specifically, for the design of new quantum\nphysics experiments, our sequence-to-sequence transformer architecture\ngenerates interpretable Python code that describes experimental blueprints for\na whole class of quantum systems. We discover general and previously unknown\ndesign rules for infinitely large classes of quantum states. The ability to\nautomatically generate generalized patterns in readable computer code is a\ncrucial step toward machines that help discover new scientific understanding --\none of the central aims of physics.\n","authors":["Sören Arlt","Haonan Duan","Felix Li","Sang Michael Xie","Yuhuai Wu","Mario Krenn"],"pdf_url":"https://arxiv.org/pdf/2406.02470v1.pdf","comment":"10+3 pages, 5 figures"},{"id":"http://arxiv.org/abs/2406.02469v1","updated":"2024-06-04T16:38:57Z","published":"2024-06-04T16:38:57Z","title":"Landscape-Aware Growing: The Power of a Little LAG","summary":"  Recently, there has been increasing interest in efficient pretraining\nparadigms for training Transformer-based models. Several recent approaches use\nsmaller models to initialize larger models in order to save computation (e.g.,\nstacking and fusion). In this work, we study the fundamental question of how to\nselect the best growing strategy from a given pool of growing strategies. Prior\nworks have extensively focused on loss- and/or function-preserving behavior at\ninitialization or simply performance at the end of training. Instead, we\nidentify that behavior at initialization can be misleading as a predictor of\nfinal performance and present an alternative perspective based on early\ntraining dynamics, which we call \"landscape-aware growing (LAG)\". We perform\nextensive analysis of correlation of the final performance with performance in\nthe initial steps of training and find early and more accurate predictions of\nthe optimal growing strategy (i.e., with only a small \"lag\" after\ninitialization). This perspective also motivates an adaptive strategy for\ngradual stacking.\n","authors":["Stefani Karp","Nikunj Saunshi","Sobhan Miryoosefi","Sashank J. Reddi","Sanjiv Kumar"],"pdf_url":"https://arxiv.org/pdf/2406.02469v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.02465v1","updated":"2024-06-04T16:34:17Z","published":"2024-06-04T16:34:17Z","title":"An Empirical Study into Clustering of Unseen Datasets with\n  Self-Supervised Encoders","summary":"  Can pretrained models generalize to new datasets without any retraining? We\ndeploy pretrained image models on datasets they were not trained for, and\ninvestigate whether their embeddings form meaningful clusters. Our suite of\nbenchmarking experiments use encoders pretrained solely on ImageNet-1k with\neither supervised or self-supervised training techniques, deployed on image\ndatasets that were not seen during training, and clustered with conventional\nclustering algorithms. This evaluation provides new insights into the\nembeddings of self-supervised models, which prioritize different features to\nsupervised models. Supervised encoders typically offer more utility than SSL\nencoders within the training domain, and vice-versa far outside of it, however,\nfine-tuned encoders demonstrate the opposite trend. Clustering provides a way\nto evaluate the utility of self-supervised learned representations orthogonal\nto existing methods such as kNN. Additionally, we find the silhouette score\nwhen measured in a UMAP-reduced space is highly correlated with clustering\nperformance, and can therefore be used as a proxy for clustering performance on\ndata with no ground truth labels. Our code implementation is available at\n\\url{https://github.com/scottclowe/zs-ssl-clustering/}.\n","authors":["Scott C. Lowe","Joakim Bruslund Haurum","Sageev Oore","Thomas B. Moeslund","Graham W. Taylor"],"pdf_url":"https://arxiv.org/pdf/2406.02465v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.02464v1","updated":"2024-06-04T16:31:43Z","published":"2024-06-04T16:31:43Z","title":"Meta-Learners for Partially-Identified Treatment Effects Across Multiple\n  Environments","summary":"  Estimating the conditional average treatment effect (CATE) from observational\ndata is relevant for many applications such as personalized medicine. Here, we\nfocus on the widespread setting where the observational data come from multiple\nenvironments, such as different hospitals, physicians, or countries.\nFurthermore, we allow for violations of standard causal assumptions, namely,\noverlap within the environments and unconfoundedness. To this end, we move away\nfrom point identification and focus on partial identification. Specifically, we\nshow that current assumptions from the literature on multiple environments\nallow us to interpret the environment as an instrumental variable (IV). This\nallows us to adapt bounds from the IV literature for partial identification of\nCATE by leveraging treatment assignment mechanisms across environments. Then,\nwe propose different model-agnostic learners (so-called meta-learners) to\nestimate the bounds that can be used in combination with arbitrary machine\nlearning models. We further demonstrate the effectiveness of our meta-learners\nacross various experiments using both simulated and real-world data. Finally,\nwe discuss the applicability of our meta-learners to partial identification in\ninstrumental variable settings, such as randomized controlled trials with\nnon-compliance.\n","authors":["Jonas Schweisthal","Dennis Frauen","Mihaela van der Schaar","Stefan Feuerriegel"],"pdf_url":"https://arxiv.org/pdf/2406.02464v1.pdf","comment":"Accepted at ICML 2024"},{"id":"http://arxiv.org/abs/2406.02457v1","updated":"2024-06-04T16:21:24Z","published":"2024-06-04T16:21:24Z","title":"Machine learning Hubbard parameters with equivariant neural networks","summary":"  Density-functional theory with extended Hubbard functionals (DFT+$U$+$V$)\nprovides a robust framework to accurately describe complex materials containing\ntransition-metal or rare-earth elements. It does so by mitigating\nself-interaction errors inherent to semi-local functionals which are\nparticularly pronounced in systems with partially-filled $d$ and $f$ electronic\nstates. However, achieving accuracy in this approach hinges upon the accurate\ndetermination of the on-site $U$ and inter-site $V$ Hubbard parameters. In\npractice, these are obtained either by semi-empirical tuning, requiring prior\nknowledge, or, more correctly, by using predictive but expensive\nfirst-principles calculations. Here, we present a machine learning model based\non equivariant neural networks which uses atomic occupation matrices as\ndescriptors, directly capturing the electronic structure, local chemical\nenvironment, and oxidation states of the system at hand. We target here the\nprediction of Hubbard parameters computed self-consistently with iterative\nlinear-response calculations, as implemented in density-functional perturbation\ntheory (DFPT), and structural relaxations. Remarkably, when trained on data\nfrom 11 materials spanning various crystal structures and compositions, our\nmodel achieves mean absolute relative errors of 3% and 5% for Hubbard $U$ and\n$V$ parameters, respectively. By circumventing computationally expensive DFT or\nDFPT self-consistent protocols, our model significantly expedites the\nprediction of Hubbard parameters with negligible computational overhead, while\napproaching the accuracy of DFPT. Moreover, owing to its robust\ntransferability, the model facilitates accelerated materials discovery and\ndesign via high-throughput calculations, with relevance for various\ntechnological applications.\n","authors":["Martin Uhrin","Austin Zadoks","Luca Binci","Nicola Marzari","Iurii Timrov"],"pdf_url":"https://arxiv.org/pdf/2406.02457v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.02456v1","updated":"2024-06-04T16:21:14Z","published":"2024-06-04T16:21:14Z","title":"Offline Bayesian Aleatoric and Epistemic Uncertainty Quantification and\n  Posterior Value Optimisation in Finite-State MDPs","summary":"  We address the challenge of quantifying Bayesian uncertainty and\nincorporating it in offline use cases of finite-state Markov Decision Processes\n(MDPs) with unknown dynamics. Our approach provides a principled method to\ndisentangle epistemic and aleatoric uncertainty, and a novel technique to find\npolicies that optimise Bayesian posterior expected value without relying on\nstrong assumptions about the MDP's posterior distribution. First, we utilise\nstandard Bayesian reinforcement learning methods to capture the posterior\nuncertainty in MDP parameters based on available data. We then analytically\ncompute the first two moments of the return distribution across posterior\nsamples and apply the law of total variance to disentangle aleatoric and\nepistemic uncertainties. To find policies that maximise posterior expected\nvalue, we leverage the closed-form expression for value as a function of\npolicy. This allows us to propose a stochastic gradient-based approach for\nsolving the problem. We illustrate the uncertainty quantification and Bayesian\nposterior value optimisation performance of our agent in simple, interpretable\ngridworlds and validate it through ground-truth evaluations on synthetic MDPs.\nFinally, we highlight the real-world impact and computational scalability of\nour method by applying it to the AI Clinician problem, which recommends\ntreatment for patients in intensive care units and has emerged as a key use\ncase of finite-state MDPs with offline data. We discuss the challenges that\narise with Bayesian modelling of larger scale MDPs while demonstrating the\npotential to apply our methods rooted in Bayesian decision theory into the real\nworld. We make our code available at\nhttps://github.com/filippovaldettaro/finite-state-mdps .\n","authors":["Filippo Valdettaro","A. Aldo Faisal"],"pdf_url":"https://arxiv.org/pdf/2406.02456v1.pdf","comment":"19 pages, 13 figures, 40th Conference on Uncertainty in Artificial\n  Intelligence (UAI 2024)"},{"id":"http://arxiv.org/abs/2406.02450v1","updated":"2024-06-04T16:14:55Z","published":"2024-06-04T16:14:55Z","title":"A Generalized Apprenticeship Learning Framework for Modeling\n  Heterogeneous Student Pedagogical Strategies","summary":"  A key challenge in e-learning environments like Intelligent Tutoring Systems\n(ITSs) is to induce effective pedagogical policies efficiently. While Deep\nReinforcement Learning (DRL) often suffers from sample inefficiency and reward\nfunction design difficulty, Apprenticeship Learning(AL) algorithms can overcome\nthem. However, most AL algorithms can not handle heterogeneity as they assume\nall demonstrations are generated with a homogeneous policy driven by a single\nreward function. Still, some AL algorithms which consider heterogeneity, often\ncan not generalize to large continuous state space and only work with discrete\nstates. In this paper, we propose an expectation-maximization(EM)-EDM, a\ngeneral AL framework to induce effective pedagogical policies from given\noptimal or near-optimal demonstrations, which are assumed to be driven by\nheterogeneous reward functions. We compare the effectiveness of the policies\ninduced by our proposed EM-EDM against four AL-based baselines and two policies\ninduced by DRL on two different but related tasks that involve pedagogical\naction prediction. Our overall results showed that, for both tasks, EM-EDM\noutperforms the four AL baselines across all performance metrics and the two\nDRL baselines. This suggests that EM-EDM can effectively model complex student\npedagogical decision-making processes through the ability to manage a large,\ncontinuous state space and adapt to handle diverse and heterogeneous reward\nfunctions with very few given demonstrations.\n","authors":["Md Mirajul Islam","Xi Yang","John Hostetter","Adittya Soukarjya Saha","Min Chi"],"pdf_url":"https://arxiv.org/pdf/2406.02450v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.02447v1","updated":"2024-06-04T16:12:27Z","published":"2024-06-04T16:12:27Z","title":"Reducing Bias in Federated Class-Incremental Learning with Hierarchical\n  Generative Prototypes","summary":"  Federated Learning (FL) aims at unburdening the training of deep models by\ndistributing computation across multiple devices (clients) while safeguarding\ndata privacy. On top of that, Federated Continual Learning (FCL) also accounts\nfor data distribution evolving over time, mirroring the dynamic nature of\nreal-world environments. In this work, we shed light on the Incremental and\nFederated biases that naturally emerge in FCL. While the former is a known\nproblem in Continual Learning, stemming from the prioritization of recently\nintroduced classes, the latter (i.e., the bias towards local distributions)\nremains relatively unexplored. Our proposal constrains both biases in the last\nlayer by efficiently fine-tuning a pre-trained backbone using learnable\nprompts, resulting in clients that produce less biased representations and more\nbiased classifiers. Therefore, instead of solely relying on parameter\naggregation, we also leverage generative prototypes to effectively balance the\npredictions of the global model. Our method improves on the current State Of\nThe Art, providing an average increase of +7.9% in accuracy.\n","authors":["Riccardo Salami","Pietro Buzzega","Matteo Mosconi","Mattia Verasani","Simone Calderara"],"pdf_url":"https://arxiv.org/pdf/2406.02447v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.11536v2","updated":"2024-06-04T16:01:27Z","published":"2023-12-15T19:50:32Z","title":"Fast Decision Boundary based Out-of-Distribution Detector","summary":"  Efficient and effective Out-of-Distribution (OOD) detection is essential for\nthe safe deployment of AI systems. Existing feature space methods, while\neffective, often incur significant computational overhead due to their reliance\non auxiliary models built from training features. In this paper, we propose a\ncomputationally-efficient OOD detector without using auxiliary models while\nstill leveraging the rich information embedded in the feature space.\nSpecifically, we detect OOD samples based on their feature distances to\ndecision boundaries. To minimize computational cost, we introduce an efficient\nclosed-form estimation, analytically proven to tightly lower bound the\ndistance. Based on our estimation, we discover that In-Distribution (ID)\nfeatures tend to be further from decision boundaries than OOD features.\nAdditionally, ID and OOD samples are better separated when compared at equal\ndeviation levels from the mean of training features. By regularizing the\ndistances to decision boundaries based on feature deviation from the mean, we\ndevelop a hyperparameter-free, auxiliary model-free OOD detector. Our method\nmatches or surpasses the effectiveness of state-of-the-art methods in extensive\nexperiments while incurring negligible overhead in inference latency. Overall,\nour approach significantly improves the efficiency-effectiveness trade-off in\nOOD detection. Code is available at: https://github.com/litianliu/fDBD-OOD.\n","authors":["Litian Liu","Yao Qin"],"pdf_url":"https://arxiv.org/pdf/2312.11536v2.pdf","comment":"ICML 2024 main conference paper"},{"id":"http://arxiv.org/abs/2312.14820v2","updated":"2024-06-04T15:51:36Z","published":"2023-12-22T16:47:10Z","title":"How Smooth Is Attention?","summary":"  Self-attention and masked self-attention are at the heart of Transformers'\noutstanding success. Still, our mathematical understanding of attention, in\nparticular of its Lipschitz properties - which are key when it comes to\nanalyzing robustness and expressive power - is incomplete. We provide a\ndetailed study of the Lipschitz constant of self-attention in several practical\nscenarios, discussing the impact of the sequence length $n$ and layer\nnormalization on the local Lipschitz constant of both unmasked and masked\nself-attention. In particular, we show that for inputs of length $n$ in any\ncompact set, the Lipschitz constant of self-attention is bounded by $\\sqrt{n}$\nup to a constant factor and that this bound is tight for reasonable sequence\nlengths. When the sequence length $n$ is too large for the previous bound to be\ntight, which we refer to as the mean-field regime, we provide an upper bound\nand a matching lower bound which are independent of $n$. Our mean-field\nframework for masked self-attention is novel and of independent interest. Our\nexperiments on pretrained and randomly initialized BERT and GPT-2 support our\ntheoretical findings.\n","authors":["Valérie Castin","Pierre Ablin","Gabriel Peyré"],"pdf_url":"https://arxiv.org/pdf/2312.14820v2.pdf","comment":"Accepted at ICML 2024"},{"id":"http://arxiv.org/abs/2406.02432v1","updated":"2024-06-04T15:50:42Z","published":"2024-06-04T15:50:42Z","title":"Coresets for Multiple $\\ell_p$ Regression","summary":"  A coreset of a dataset with $n$ examples and $d$ features is a weighted\nsubset of examples that is sufficient for solving downstream data analytic\ntasks. Nearly optimal constructions of coresets for least squares and $\\ell_p$\nlinear regression with a single response are known in prior work. However, for\nmultiple $\\ell_p$ regression where there can be $m$ responses, there are no\nknown constructions with size sublinear in $m$. In this work, we construct\ncoresets of size $\\tilde O(\\varepsilon^{-2}d)$ for $p<2$ and $\\tilde\nO(\\varepsilon^{-p}d^{p/2})$ for $p>2$ independently of $m$ (i.e.,\ndimension-free) that approximate the multiple $\\ell_p$ regression objective at\nevery point in the domain up to $(1\\pm\\varepsilon)$ relative error. If we only\nneed to preserve the minimizer subject to a subspace constraint, we improve\nthese bounds by an $\\varepsilon$ factor for all $p>1$. All of our bounds are\nnearly tight.\n  We give two application of our results. First, we settle the number of\nuniform samples needed to approximate $\\ell_p$ Euclidean power means up to a\n$(1+\\varepsilon)$ factor, showing that $\\tilde\\Theta(\\varepsilon^{-2})$ samples\nfor $p = 1$, $\\tilde\\Theta(\\varepsilon^{-1})$ samples for $1 < p < 2$, and\n$\\tilde\\Theta(\\varepsilon^{1-p})$ samples for $p>2$ is tight, answering a\nquestion of Cohen-Addad, Saulpic, and Schwiegelshohn. Second, we show that for\n$1<p<2$, every matrix has a subset of $\\tilde O(\\varepsilon^{-1}k)$ rows which\nspans a $(1+\\varepsilon)$-approximately optimal $k$-dimensional subspace for\n$\\ell_p$ subspace approximation, which is also nearly optimal.\n","authors":["David P. Woodruff","Taisuke Yasuda"],"pdf_url":"https://arxiv.org/pdf/2406.02432v1.pdf","comment":"ICML 2024"},{"id":"http://arxiv.org/abs/2406.02431v1","updated":"2024-06-04T15:50:35Z","published":"2024-06-04T15:50:35Z","title":"Reweighted Solutions for Weighted Low Rank Approximation","summary":"  Weighted low rank approximation (WLRA) is an important yet computationally\nchallenging primitive with applications ranging from statistical analysis,\nmodel compression, and signal processing. To cope with the NP-hardness of this\nproblem, prior work considers heuristics, bicriteria, or fixed parameter\ntractable algorithms to solve this problem. In this work, we introduce a new\nrelaxed solution to WLRA which outputs a matrix that is not necessarily low\nrank, but can be stored using very few parameters and gives provable\napproximation guarantees when the weight matrix has low rank. Our central idea\nis to use the weight matrix itself to reweight a low rank solution, which gives\nan extremely simple algorithm with remarkable empirical performance in\napplications to model compression and on synthetic datasets. Our algorithm also\ngives nearly optimal communication complexity bounds for a natural distributed\nproblem associated with this problem, for which we show matching communication\nlower bounds. Together, our communication complexity bounds show that the rank\nof the weight matrix provably parameterizes the communication complexity of\nWLRA. We also obtain the first relative error guarantees for feature selection\nwith a weighted objective.\n","authors":["David P. Woodruff","Taisuke Yasuda"],"pdf_url":"https://arxiv.org/pdf/2406.02431v1.pdf","comment":"ICML 2024"},{"id":"http://arxiv.org/abs/2406.02428v1","updated":"2024-06-04T15:47:03Z","published":"2024-06-04T15:47:03Z","title":"Harnessing Neural Unit Dynamics for Effective and Scalable\n  Class-Incremental Learning","summary":"  Class-incremental learning (CIL) aims to train a model to learn new classes\nfrom non-stationary data streams without forgetting old ones. In this paper, we\npropose a new kind of connectionist model by tailoring neural unit dynamics\nthat adapt the behavior of neural networks for CIL. In each training session,\nit introduces a supervisory mechanism to guide network expansion whose growth\nsize is compactly commensurate with the intrinsic complexity of a newly\narriving task. This constructs a near-minimal network while allowing the model\nto expand its capacity when cannot sufficiently hold new classes. At inference\ntime, it automatically reactivates the required neural units to retrieve\nknowledge and leaves the remaining inactivated to prevent interference. We name\nour model AutoActivator, which is effective and scalable. To gain insights into\nthe neural unit dynamics, we theoretically analyze the model's convergence\nproperty via a universal approximation theorem on learning sequential mappings,\nwhich is under-explored in the CIL community. Experiments show that our method\nachieves strong CIL performance in rehearsal-free and minimal-expansion\nsettings with different backbones.\n","authors":["Depeng Li","Tianqi Wang","Junwei Chen","Wei Dai","Zhigang Zeng"],"pdf_url":"https://arxiv.org/pdf/2406.02428v1.pdf","comment":"Accepted to ICML 2024"},{"id":"http://arxiv.org/abs/2406.02426v1","updated":"2024-06-04T15:46:41Z","published":"2024-06-04T15:46:41Z","title":"Contextual Optimization under Covariate Shift: A Robust Approach by\n  Intersecting Wasserstein Balls","summary":"  In contextual optimization, a decision-maker observes historical samples of\nuncertain variables and associated concurrent covariates, without knowing their\njoint distribution. Given an additional covariate observation, the goal is to\nchoose a decision that minimizes some operational costs. A prevalent issue here\nis covariate shift, where the marginal distribution of the new covariate\ndiffers from historical samples, leading to decision performance variations\nwith nonparametric or parametric estimators. To address this, we propose a\ndistributionally robust approach that uses an ambiguity set by the intersection\nof two Wasserstein balls, each centered on typical nonparametric or parametric\ndistribution estimators. Computationally, we establish the tractable\nreformulation of this distributionally robust optimization problem.\nStatistically, we provide guarantees for our Wasserstein ball intersection\napproach under covariate shift by analyzing the measure concentration of the\nestimators. Furthermore, to reduce computational complexity, we employ a\nsurrogate objective that maintains similar generalization guarantees. Through\nsynthetic and empirical case studies on income prediction and portfolio\noptimization, we demonstrate the strong empirical performance of our proposed\nmodels.\n","authors":["Tianyu Wang","Ningyuan Chen","Chun Wang"],"pdf_url":"https://arxiv.org/pdf/2406.02426v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.07471v2","updated":"2024-06-04T15:46:22Z","published":"2024-02-12T08:16:58Z","title":"Differentially Private Decentralized Learning with Random Walks","summary":"  The popularity of federated learning comes from the possibility of better\nscalability and the ability for participants to keep control of their data,\nimproving data security and sovereignty. Unfortunately, sharing model updates\nalso creates a new privacy attack surface. In this work, we characterize the\nprivacy guarantees of decentralized learning with random walk algorithms, where\na model is updated by traveling from one node to another along the edges of a\ncommunication graph. Using a recent variant of differential privacy tailored to\nthe study of decentralized algorithms, namely Pairwise Network Differential\nPrivacy, we derive closed-form expressions for the privacy loss between each\npair of nodes where the impact of the communication topology is captured by\ngraph theoretic quantities. Our results further reveal that random walk\nalgorithms tends to yield better privacy guarantees than gossip algorithms for\nnodes close from each other. We supplement our theoretical results with\nempirical evaluation on synthetic and real-world graphs and datasets.\n","authors":["Edwige Cyffers","Aurélien Bellet","Jalaj Upadhyay"],"pdf_url":"https://arxiv.org/pdf/2402.07471v2.pdf","comment":"Accepted to ICML 2024"},{"id":"http://arxiv.org/abs/2406.02424v1","updated":"2024-06-04T15:44:10Z","published":"2024-06-04T15:44:10Z","title":"Contextual Dynamic Pricing: Algorithms, Optimality, and Local\n  Differential Privacy Constraints","summary":"  We study the contextual dynamic pricing problem where a firm sells products\nto $T$ sequentially arriving consumers that behave according to an unknown\ndemand model. The firm aims to maximize its revenue, i.e. minimize its regret\nover a clairvoyant that knows the model in advance. The demand model is a\ngeneralized linear model (GLM), allowing for a stochastic feature vector in\n$\\mathbb R^d$ that encodes product and consumer information. We first show that\nthe optimal regret upper bound is of order $\\sqrt{dT}$, up to a logarithmic\nfactor, improving upon existing upper bounds in the literature by a $\\sqrt{d}$\nfactor. This sharper rate is materialised by two algorithms: a confidence\nbound-type (supCB) algorithm and an explore-then-commit (ETC) algorithm. A key\ninsight of our theoretical result is an intrinsic connection between dynamic\npricing and the contextual multi-armed bandit problem with many arms based on a\ncareful discretization. We further study contextual dynamic pricing under the\nlocal differential privacy (LDP) constraints. In particular, we propose a\nstochastic gradient descent based ETC algorithm that achieves an optimal regret\nupper bound of order $d\\sqrt{T}/\\epsilon$, up to a logarithmic factor, where\n$\\epsilon>0$ is the privacy parameter. The regret upper bounds with and without\nLDP constraints are accompanied by newly constructed minimax lower bounds,\nwhich further characterize the cost of privacy. Extensive numerical experiments\nand a real data application on online lending are conducted to illustrate the\nefficiency and practical value of the proposed algorithms in dynamic pricing.\n","authors":["Zifeng Zhao","Feiyu Jiang","Yi Yu"],"pdf_url":"https://arxiv.org/pdf/2406.02424v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.03777v2","updated":"2024-06-04T15:41:11Z","published":"2024-03-06T15:15:42Z","title":"ENOT: Expectile Regularization for Fast and Accurate Training of Neural\n  Optimal Transport","summary":"  We present a new approach for Neural Optimal Transport (NOT) training\nprocedure, capable of accurately and efficiently estimating optimal\ntransportation plan via specific regularization on dual Kantorovich potentials.\nThe main bottleneck of existing NOT solvers is associated with the procedure of\nfinding a near-exact approximation of the conjugate operator (i.e., the\nc-transform), which is done either by optimizing over non-convex max-min\nobjectives or by the computationally intensive fine-tuning of the initial\napproximated prediction. We resolve both issues by proposing a new,\ntheoretically justified loss in the form of expectile regularisation which\nenforces binding conditions on the learning process of dual potentials. Such a\nregularization provides the upper bound estimation over the distribution of\npossible conjugate potentials and makes the learning stable, completely\neliminating the need for additional extensive fine-tuning. Proposed method,\ncalled Expectile-Regularised Neural Optimal Transport (ENOT), outperforms\nprevious state-of-the-art approaches on the established Wasserstein-2 benchmark\ntasks by a large margin (up to a 3-fold improvement in quality and up to a\n10-fold improvement in runtime). Moreover, we showcase performance of ENOT for\nvarying cost functions on different tasks such as image generation, showing\nrobustness of proposed algorithm.\n","authors":["Nazar Buzun","Maksim Bobrin","Dmitry V. Dylov"],"pdf_url":"https://arxiv.org/pdf/2403.03777v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2301.00944v3","updated":"2024-06-04T15:40:42Z","published":"2023-01-03T04:09:38Z","title":"Temporal Difference Learning with Compressed Updates: Error-Feedback\n  meets Reinforcement Learning","summary":"  In large-scale distributed machine learning, recent works have studied the\neffects of compressing gradients in stochastic optimization to alleviate the\ncommunication bottleneck. These works have collectively revealed that\nstochastic gradient descent (SGD) is robust to structured perturbations such as\nquantization, sparsification, and delays. Perhaps surprisingly, despite the\nsurge of interest in multi-agent reinforcement learning, almost nothing is\nknown about the analogous question: Are common reinforcement learning (RL)\nalgorithms also robust to similar perturbations? We investigate this question\nby studying a variant of the classical temporal difference (TD) learning\nalgorithm with a perturbed update direction, where a general compression\noperator is used to model the perturbation. Our work makes three important\ntechnical contributions. First, we prove that compressed TD algorithms, coupled\nwith an error-feedback mechanism used widely in optimization, exhibit the same\nnon-asymptotic theoretical guarantees as their SGD counterparts. Second, we\nshow that our analysis framework extends seamlessly to nonlinear stochastic\napproximation schemes that subsume Q-learning. Third, we prove that for\nmulti-agent TD learning, one can achieve linear convergence speedups with\nrespect to the number of agents while communicating just $\\tilde{O}(1)$ bits\nper iteration. Notably, these are the first finite-time results in RL that\naccount for general compression operators and error-feedback in tandem with\nlinear function approximation and Markovian sampling. Our proofs hinge on the\nconstruction of novel Lyapunov functions that capture the dynamics of a memory\nvariable introduced by error-feedback.\n","authors":["Aritra Mitra","George J. Pappas","Hamed Hassani"],"pdf_url":"https://arxiv.org/pdf/2301.00944v3.pdf","comment":"Accepted to Transactions on Machine Learning Research"},{"id":"http://arxiv.org/abs/2406.02422v1","updated":"2024-06-04T15:39:49Z","published":"2024-06-04T15:39:49Z","title":"IterMask2: Iterative Unsupervised Anomaly Segmentation via Spatial and\n  Frequency Masking for Brain Lesions in MRI","summary":"  Unsupervised anomaly segmentation approaches to pathology segmentation train\na model on images of healthy subjects, that they define as the 'normal' data\ndistribution. At inference, they aim to segment any pathologies in new images\nas 'anomalies', as they exhibit patterns that deviate from those in 'normal'\ntraining data. Prevailing methods follow the 'corrupt-and-reconstruct'\nparadigm. They intentionally corrupt an input image, reconstruct it to follow\nthe learned 'normal' distribution, and subsequently segment anomalies based on\nreconstruction error. Corrupting an input image, however, inevitably leads to\nsuboptimal reconstruction even of normal regions, causing false positives. To\nalleviate this, we propose a novel iterative spatial mask-refining strategy\nIterMask2. We iteratively mask areas of the image, reconstruct them, and update\nthe mask based on reconstruction error. This iterative process progressively\nadds information about areas that are confidently normal as per the model. The\nincreasing content guides reconstruction of nearby masked areas, improving\nreconstruction of normal tissue under these areas, reducing false positives. We\nalso use high-frequency image content as an auxiliary input to provide\nadditional structural information for masked areas. This further improves\nreconstruction error of normal in comparison to anomalous areas, facilitating\nsegmentation of the latter. We conduct experiments on several brain lesion\ndatasets and demonstrate effectiveness of our method. Code is available at:\nhttps://github.com/ZiyunLiang/IterMasks2\n","authors":["Ziyun Liang","Xiaoqing Guo","J. Alison Noble","Konstantinos Kamnitsas"],"pdf_url":"https://arxiv.org/pdf/2406.02422v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.02421v1","updated":"2024-06-04T15:39:08Z","published":"2024-06-04T15:39:08Z","title":"Representing Piecewise-Linear Functions by Functions with Minimal Arity","summary":"  Any continuous piecewise-linear function $F\\colon \\mathbb{R}^{n}\\to\n\\mathbb{R}$ can be represented as a linear combination of $\\max$ functions of\nat most $n+1$ affine-linear functions. In our previous paper [``Representing\npiecewise linear functions by functions with small arity'', AAECC, 2023], we\nshowed that this upper bound of $n+1$ arguments is tight. In the present paper,\nwe extend this result by establishing a correspondence between the function $F$\nand the minimal number of arguments that are needed in any such decomposition.\nWe show that the tessellation of the input space $\\mathbb{R}^{n}$ induced by\nthe function $F$ has a direct connection to the number of arguments in the\n$\\max$ functions.\n","authors":["Christoph Koutschan","Anton Ponomarchuk","Josef Schicho"],"pdf_url":"https://arxiv.org/pdf/2406.02421v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.00943v2","updated":"2024-06-04T15:38:08Z","published":"2024-02-01T19:00:40Z","title":"Approximate Nearest Neighbor Search with Window Filters","summary":"  We define and investigate the problem of $\\textit{c-approximate window\nsearch}$: approximate nearest neighbor search where each point in the dataset\nhas a numeric label, and the goal is to find nearest neighbors to queries\nwithin arbitrary label ranges. Many semantic search problems, such as image and\ndocument search with timestamp filters, or product search with cost filters,\nare natural examples of this problem. We propose and theoretically analyze a\nmodular tree-based framework for transforming an index that solves the\ntraditional c-approximate nearest neighbor problem into a data structure that\nsolves window search. On standard nearest neighbor benchmark datasets equipped\nwith random label values, adversarially constructed embeddings, and image\nsearch embeddings with real timestamps, we obtain up to a $75\\times$ speedup\nover existing solutions at the same level of recall.\n","authors":["Joshua Engels","Benjamin Landrum","Shangdi Yu","Laxman Dhulipala","Julian Shun"],"pdf_url":"https://arxiv.org/pdf/2402.00943v2.pdf","comment":"Code available: https://github.com/JoshEngels/RangeFilteredANN"},{"id":"http://arxiv.org/abs/2210.01672v4","updated":"2024-06-04T15:34:57Z","published":"2022-10-04T15:19:24Z","title":"Bringing motion taxonomies to continuous domains via GPLVM on hyperbolic\n  manifolds","summary":"  Human motion taxonomies serve as high-level hierarchical abstractions that\nclassify how humans move and interact with their environment. They have proven\nuseful to analyse grasps, manipulation skills, and whole-body support poses.\nDespite substantial efforts devoted to design their hierarchy and underlying\ncategories, their use remains limited. This may be attributed to the lack of\ncomputational models that fill the gap between the discrete hierarchical\nstructure of the taxonomy and the high-dimensional heterogeneous data\nassociated to its categories. To overcome this problem, we propose to model\ntaxonomy data via hyperbolic embeddings that capture the associated\nhierarchical structure. We achieve this by formulating a novel Gaussian process\nhyperbolic latent variable model that incorporates the taxonomy structure\nthrough graph-based priors on the latent space and distance-preserving back\nconstraints. We validate our model on three different human motion taxonomies\nto learn hyperbolic embeddings that faithfully preserve the original graph\nstructure. We show that our model properly encodes unseen data from existing or\nnew taxonomy categories, and outperforms its Euclidean and VAE-based\ncounterparts. Finally, through proof-of-concept experiments, we show that our\nmodel may be used to generate realistic trajectories between the learned\nembeddings.\n","authors":["Noémie Jaquier","Leonel Rozo","Miguel González-Duque","Viacheslav Borovitskiy","Tamim Asfour"],"pdf_url":"https://arxiv.org/pdf/2210.01672v4.pdf","comment":"Intl. Conference on Machine Learning (ICML), 2024"},{"id":"http://arxiv.org/abs/2405.09831v3","updated":"2024-06-04T15:34:55Z","published":"2024-05-16T06:07:31Z","title":"Nearly Minimax Optimal Regret for Multinomial Logistic Bandit","summary":"  In this paper, we study the contextual multinomial logit (MNL) bandit problem\nin which a learning agent sequentially selects an assortment based on\ncontextual information, and user feedback follows an MNL choice model. There\nhas been a significant discrepancy between lower and upper regret bounds,\nparticularly regarding the feature dimension $d$ and the maximum assortment\nsize $K$. Additionally, the variation in reward structures between these bounds\ncomplicates the quest for optimality. Under uniform rewards, where all items\nhave the same expected reward, we establish a regret lower bound of\n$\\Omega(d\\sqrt{\\smash[b]{T/K}})$ and propose a constant-time algorithm,\nOFU-MNL+, that achieves a matching upper bound of\n$\\tilde{O}(d\\sqrt{\\smash[b]{T/K}})$. Under non-uniform rewards, we prove a\nlower bound of $\\Omega(d\\sqrt{T})$ and an upper bound of\n$\\tilde{O}(d\\sqrt{T})$, also achievable by OFU-MNL+. Our empirical studies\nsupport these theoretical findings. To the best of our knowledge, this is the\nfirst work in the contextual MNL bandit literature to prove minimax optimality\n-- for either uniform or non-uniform reward setting -- and to propose a\ncomputationally efficient algorithm that achieves this optimality up to\nlogarithmic factors.\n","authors":["Joongkyu Lee","Min-hwan Oh"],"pdf_url":"https://arxiv.org/pdf/2405.09831v3.pdf","comment":"Preprint. Under review"},{"id":"http://arxiv.org/abs/2402.06737v2","updated":"2024-06-04T15:30:15Z","published":"2024-02-09T19:16:04Z","title":"ExGRG: Explicitly-Generated Relation Graph for Self-Supervised\n  Representation Learning","summary":"  Self-supervised Learning (SSL) has emerged as a powerful technique in\npre-training deep learning models without relying on expensive annotated\nlabels, instead leveraging embedded signals in unlabeled data. While SSL has\nshown remarkable success in computer vision tasks through intuitive data\naugmentation, its application to graph-structured data poses challenges due to\nthe semantic-altering and counter-intuitive nature of graph augmentations.\nAddressing this limitation, this paper introduces a novel non-contrastive SSL\napproach to Explicitly Generate a compositional Relation Graph (ExGRG) instead\nof relying solely on the conventional augmentation-based implicit relation\ngraph. ExGRG offers a framework for incorporating prior domain knowledge and\nonline extracted information into the SSL invariance objective, drawing\ninspiration from the Laplacian Eigenmap and Expectation-Maximization (EM).\nEmploying an EM perspective on SSL, our E-step involves relation graph\ngeneration to identify candidates to guide the SSL invariance objective, and\nM-step updates the model parameters by integrating the derived relational\ninformation. Extensive experimentation on diverse node classification datasets\ndemonstrates the superiority of our method over state-of-the-art techniques,\naffirming ExGRG as an effective adoption of SSL for graph representation\nlearning.\n","authors":["Mahdi Naseri","Mahdi Biparva"],"pdf_url":"https://arxiv.org/pdf/2402.06737v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.15294v2","updated":"2024-06-04T15:28:34Z","published":"2024-05-24T07:30:45Z","title":"Semi-Supervised Learning guided by the Generalized Bayes Rule under Soft\n  Revision","summary":"  We provide a theoretical and computational investigation of the Gamma-Maximin\nmethod with soft revision, which was recently proposed as a robust criterion\nfor pseudo-label selection (PLS) in semi-supervised learning. Opposed to\ntraditional methods for PLS we use credal sets of priors (\"generalized Bayes\")\nto represent the epistemic modeling uncertainty. These latter are then updated\nby the Gamma-Maximin method with soft revision. We eventually select\npseudo-labeled data that are most likely in light of the least favorable\ndistribution from the so updated credal set. We formalize the task of finding\noptimal pseudo-labeled data w.r.t. the Gamma-Maximin method with soft revision\nas an optimization problem. A concrete implementation for the class of logistic\nmodels then allows us to compare the predictive power of the method with\ncompeting approaches. It is observed that the Gamma-Maximin method with soft\nrevision can achieve very promising results, especially when the proportion of\nlabeled data is low.\n","authors":["Stefan Dietrich","Julian Rodemann","Christoph Jansen"],"pdf_url":"https://arxiv.org/pdf/2405.15294v2.pdf","comment":"Accepted at the 11th International Conference on Soft Methods in\n  Probability and Statistics (SMPS) 2024"},{"id":"http://arxiv.org/abs/2406.02416v1","updated":"2024-06-04T15:27:53Z","published":"2024-06-04T15:27:53Z","title":"Improved Modelling of Federated Datasets using\n  Mixtures-of-Dirichlet-Multinomials","summary":"  In practice, training using federated learning can be orders of magnitude\nslower than standard centralized training. This severely limits the amount of\nexperimentation and tuning that can be done, making it challenging to obtain\ngood performance on a given task. Server-side proxy data can be used to run\ntraining simulations, for instance for hyperparameter tuning. This can greatly\nspeed up the training pipeline by reducing the number of tuning runs to be\nperformed overall on the true clients. However, it is challenging to ensure\nthat these simulations accurately reflect the dynamics of the real federated\ntraining. In particular, the proxy data used for simulations often comes as a\nsingle centralized dataset without a partition into distinct clients, and\npartitioning this data in a naive way can lead to simulations that poorly\nreflect real federated training. In this paper we address the challenge of how\nto partition centralized data in a way that reflects the statistical\nheterogeneity of the true federated clients. We propose a fully federated,\ntheoretically justified, algorithm that efficiently learns the distribution of\nthe true clients and observe improved server-side simulations when using the\ninferred distribution to create simulated clients from the centralized data.\n","authors":["Jonathan Scott","Áine Cahill"],"pdf_url":"https://arxiv.org/pdf/2406.02416v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.05436v3","updated":"2024-06-04T15:21:50Z","published":"2023-11-09T15:21:56Z","title":"Fair Wasserstein Coresets","summary":"  Data distillation and coresets have emerged as popular approaches to generate\na smaller representative set of samples for downstream learning tasks to handle\nlarge-scale datasets. At the same time, machine learning is being increasingly\napplied to decision-making processes at a societal level, making it imperative\nfor modelers to address inherent biases towards subgroups present in the data.\nWhile current approaches focus on creating fair synthetic representative\nsamples by optimizing local properties relative to the original samples, their\nimpact on downstream learning processes has yet to be explored. In this work,\nwe present fair Wasserstein coresets (FWC), a novel coreset approach which\ngenerates fair synthetic representative samples along with sample-level weights\nto be used in downstream learning tasks. FWC uses an efficient majority\nminimization algorithm to minimize the Wasserstein distance between the\noriginal dataset and the weighted synthetic samples while enforcing demographic\nparity. We show that an unconstrained version of FWC is equivalent to Lloyd's\nalgorithm for k-medians and k-means clustering. Experiments conducted on both\nsynthetic and real datasets show that FWC: (i) achieves a competitive\nfairness-utility tradeoff in downstream models compared to existing approaches,\n(ii) improves downstream fairness when added to the existing training data and\n(iii) can be used to reduce biases in predictions from large language models\n(GPT-3.5 and GPT-4).\n","authors":["Zikai Xiong","Niccolò Dalmasso","Shubham Sharma","Freddy Lecue","Daniele Magazzeni","Vamsi K. Potluru","Tucker Balch","Manuela Veloso"],"pdf_url":"https://arxiv.org/pdf/2311.05436v3.pdf","comment":"28 pages, 7 figures, 7 tables"},{"id":"http://arxiv.org/abs/2302.09160v2","updated":"2024-06-04T15:20:15Z","published":"2023-02-17T22:15:20Z","title":"Identifying Equivalent Training Dynamics","summary":"  Study of the nonlinear evolution deep neural network (DNN) parameters undergo\nduring training has uncovered regimes of distinct dynamical behavior. While a\ndetailed understanding of these phenomena has the potential to advance\nimprovements in training efficiency and robustness, the lack of methods for\nidentifying when DNN models have equivalent dynamics limits the insight that\ncan be gained from prior work. Topological conjugacy, a notion from dynamical\nsystems theory, provides a precise definition of dynamical equivalence,\noffering a possible route to address this need. However, topological\nconjugacies have historically been challenging to compute. By leveraging\nadvances in Koopman operator theory, we develop a framework for identifying\nconjugate and non-conjugate training dynamics. To validate our approach, we\ndemonstrate that it can correctly identify a known equivalence between online\nmirror descent and online gradient descent. We then utilize it to: identify\nnon-conjugate training dynamics between shallow and wide fully connected neural\nnetworks; characterize the early phase of training dynamics in convolutional\nneural networks; uncover non-conjugate training dynamics in Transformers that\ndo and do not undergo grokking. Our results, across a range of DNN\narchitectures, illustrate the flexibility of our framework and highlight its\npotential for shedding new light on training dynamics.\n","authors":["William T. Redman","Juan M. Bello-Rivas","Maria Fonoberova","Ryan Mohr","Ioannis G. Kevrekidis","Igor Mezić"],"pdf_url":"https://arxiv.org/pdf/2302.09160v2.pdf","comment":"18 pages, 6 figures, 3 supplemental figures"},{"id":"http://arxiv.org/abs/2406.00371v2","updated":"2024-06-04T15:16:28Z","published":"2024-06-01T09:10:57Z","title":"Alternative Methods to SHAP Derived from Properties of Kernels: A Note\n  on Theoretical Analysis","summary":"  This study first derives a general and analytical expression of AFA (Additive\nFeature Attribution) in terms of the kernel in LIME (Local Interpretable\nModel-agnostic Explanations). Then, we propose some new AFAs that have\nappropriate properties of kernels or that coincide with the LS prenucleolus in\ncooperative game theory. We also revisit existing AFAs such as SHAP (SHapley\nAdditive exPlanations) and re-examine the properties of their kernels.\n","authors":["Kazuhiro Hiraki","Shinichi Ishihara","Junnosuke Shino"],"pdf_url":"https://arxiv.org/pdf/2406.00371v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2304.05223v3","updated":"2024-06-04T15:13:12Z","published":"2023-04-11T13:46:59Z","title":"Inhomogeneous graph trend filtering via a l2,0 cardinality penalty","summary":"  We study estimation of piecewise smooth signals over a graph. We propose a\n$\\ell_{2,0}$-norm penalized Graph Trend Filtering (GTF) model to estimate\npiecewise smooth graph signals that exhibit inhomogeneous levels of smoothness\nacross the nodes. We prove that the proposed GTF model is simultaneously a\nk-means clustering on the signal over the nodes and a minimum graph cut on the\nedges of the graph, where the clustering and the cut share the same assignment\nmatrix. We propose two methods to solve the proposed GTF model: a spectral\ndecomposition method and a method based on simulated annealing. In the\nexperiment on synthetic and real-world datasets, we show that the proposed GTF\nmodel has a better performances compared with existing approaches on the tasks\nof denoising, support recovery and semi-supervised classification. We also show\nthat the proposed GTF model can be solved more efficiently than existing models\nfor the dataset with a large edge set.\n","authors":["Xiaoqing Huang","Andersen Ang","Kun Huang","Jie Zhang","Yijie Wang"],"pdf_url":"https://arxiv.org/pdf/2304.05223v3.pdf","comment":"13 pages, 3 figures, 4 tables"},{"id":"http://arxiv.org/abs/2209.04419v2","updated":"2024-06-04T15:11:25Z","published":"2022-09-08T15:48:24Z","title":"Majority Vote for Distributed Differentially Private Sign Selection","summary":"  Privacy-preserving data analysis has become more prevalent in recent years.\nIn this study, we propose a distributed group differentially private Majority\nVote mechanism, for the sign selection problem in a distributed setup. To\nachieve this, we apply the iterative peeling to the stability function and use\nthe exponential mechanism to recover the signs. For enhanced applicability, we\nstudy the private sign selection for mean estimation and linear regression\nproblems, in distributed systems. Our method recovers the support and signs\nwith the optimal signal-to-noise ratio as in the non-private scenario, which is\nbetter than contemporary works of private variable selections. Moreover, the\nsign selection consistency is justified by theoretical guarantees. Simulation\nstudies are conducted to demonstrate the effectiveness of the proposed method.\n","authors":["Weidong Liu","Jiyuan Tu","Xiaojun Mao","Xi Chen"],"pdf_url":"https://arxiv.org/pdf/2209.04419v2.pdf","comment":"41 pages, 5 figures"},{"id":"http://arxiv.org/abs/2406.02395v1","updated":"2024-06-04T15:09:29Z","published":"2024-06-04T15:09:29Z","title":"GrootVL: Tree Topology is All You Need in State Space Model","summary":"  The state space models, employing recursively propagated features,\ndemonstrate strong representation capabilities comparable to Transformer models\nand superior efficiency. However, constrained by the inherent geometric\nconstraints of sequences, it still falls short in modeling long-range\ndependencies. To address this issue, we propose the GrootVL network, which\nfirst dynamically generates a tree topology based on spatial relationships and\ninput features. Then, feature propagation is performed based on this graph,\nthereby breaking the original sequence constraints to achieve stronger\nrepresentation capabilities. Additionally, we introduce a linear complexity\ndynamic programming algorithm to enhance long-range interactions without\nincreasing computational cost. GrootVL is a versatile multimodal framework that\ncan be applied to both visual and textual tasks. Extensive experiments\ndemonstrate that our method significantly outperforms existing structured state\nspace models on image classification, object detection and segmentation.\nBesides, by fine-tuning large language models, our approach achieves consistent\nimprovements in multiple textual tasks at minor training cost.\n","authors":["Yicheng Xiao","Lin Song","Shaoli Huang","Jiangshan Wang","Siyu Song","Yixiao Ge","Xiu Li","Ying Shan"],"pdf_url":"https://arxiv.org/pdf/2406.02395v1.pdf","comment":"The code is available at https://github.com/EasonXiao-888/GrootVL"},{"id":"http://arxiv.org/abs/2406.02394v1","updated":"2024-06-04T15:08:56Z","published":"2024-06-04T15:08:56Z","title":"Multiple Choice Questions and Large Languages Models: A Case Study with\n  Fictional Medical Data","summary":"  Large Language Models (LLMs) like ChatGPT demonstrate significant potential\nin the medical field, often evaluated using multiple-choice questions (MCQs)\nsimilar to those found on the USMLE. Despite their prevalence in medical\neducation, MCQs have limitations that might be exacerbated when assessing LLMs.\nTo evaluate the effectiveness of MCQs in assessing the performance of LLMs, we\ndeveloped a fictional medical benchmark focused on a non-existent gland, the\nGlianorex. This approach allowed us to isolate the knowledge of the LLM from\nits test-taking abilities. We used GPT-4 to generate a comprehensive textbook\non the Glianorex in both English and French and developed corresponding\nmultiple-choice questions in both languages. We evaluated various open-source,\nproprietary, and domain-specific LLMs using these questions in a zero-shot\nsetting. The models achieved average scores around 67%, with minor performance\ndifferences between larger and smaller models. Performance was slightly higher\nin English than in French. Fine-tuned medical models showed some improvement\nover their base versions in English but not in French. The uniformly high\nperformance across models suggests that traditional MCQ-based benchmarks may\nnot accurately measure LLMs' clinical knowledge and reasoning abilities,\ninstead highlighting their pattern recognition skills. This study underscores\nthe need for more robust evaluation methods to better assess the true\ncapabilities of LLMs in medical contexts.\n","authors":["Maxime Griot","Jean Vanderdonckt","Demet Yuksel","Coralie Hemptinne"],"pdf_url":"https://arxiv.org/pdf/2406.02394v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.03506v5","updated":"2024-06-04T15:08:15Z","published":"2024-01-07T14:54:57Z","title":"DiarizationLM: Speaker Diarization Post-Processing with Large Language\n  Models","summary":"  In this paper, we introduce DiarizationLM, a framework to leverage large\nlanguage models (LLM) to post-process the outputs from a speaker diarization\nsystem. Various goals can be achieved with the proposed framework, such as\nimproving the readability of the diarized transcript, or reducing the word\ndiarization error rate (WDER). In this framework, the outputs of the automatic\nspeech recognition (ASR) and speaker diarization systems are represented as a\ncompact textual format, which is included in the prompt to an optionally\nfinetuned LLM. The outputs of the LLM can be used as the refined diarization\nresults with the desired enhancement. As a post-processing step, this framework\ncan be easily applied to any off-the-shelf ASR and speaker diarization systems\nwithout retraining existing components. Our experiments show that a finetuned\nPaLM 2-S model can reduce the WDER by rel. 55.5% on the Fisher telephone\nconversation dataset, and rel. 44.9% on the Callhome English dataset.\n","authors":["Quan Wang","Yiling Huang","Guanlong Zhao","Evan Clark","Wei Xia","Hank Liao"],"pdf_url":"https://arxiv.org/pdf/2401.03506v5.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.07160v2","updated":"2024-06-04T15:01:50Z","published":"2024-02-11T11:11:39Z","title":"PASOA- PArticle baSed Bayesian Optimal Adaptive design","summary":"  We propose a new procedure named PASOA, for Bayesian experimental design,\nthat performs sequential design optimization by simultaneously providing\naccurate estimates of successive posterior distributions for parameter\ninference. The sequential design process is carried out via a contrastive\nestimation principle, using stochastic optimization and Sequential Monte Carlo\n(SMC) samplers to maximise the Expected Information Gain (EIG). As larger\ninformation gains are obtained for larger distances between successive\nposterior distributions, this EIG objective may worsen classical SMC\nperformance. To handle this issue, tempering is proposed to have both a large\ninformation gain and an accurate SMC sampling, that we show is crucial for\nperformance. This novel combination of stochastic optimization and tempered SMC\nallows to jointly handle design optimization and parameter inference. We\nprovide a proof that the obtained optimal design estimators benefit from some\nconsistency property. Numerical experiments confirm the potential of the\napproach, which outperforms other recent existing procedures.\n","authors":["Jacopo Iollo","Christophe Heinkelé","Pierre Alliez","Florence Forbes"],"pdf_url":"https://arxiv.org/pdf/2402.07160v2.pdf","comment":"ICML 2024"},{"id":"http://arxiv.org/abs/2406.02383v1","updated":"2024-06-04T14:59:38Z","published":"2024-06-04T14:59:38Z","title":"Learning to Edit Visual Programs with Self-Supervision","summary":"  We design a system that learns how to edit visual programs. Our edit network\nconsumes a complete input program and a visual target. From this input, we task\nour network with predicting a local edit operation that could be applied to the\ninput program to improve its similarity to the target. In order to apply this\nscheme for domains that lack program annotations, we develop a self-supervised\nlearning approach that integrates this edit network into a bootstrapped\nfinetuning loop along with a network that predicts entire programs in one-shot.\nOur joint finetuning scheme, when coupled with an inference procedure that\ninitializes a population from the one-shot model and evolves members of this\npopulation with the edit network, helps to infer more accurate visual programs.\nOver multiple domains, we experimentally compare our method against the\nalternative of using only the one-shot model, and find that even under equal\nsearch-time budgets, our editing-based paradigm provides significant\nadvantages.\n","authors":["R. Kenny Jones","Renhao Zhang","Aditya Ganeshan","Daniel Ritchie"],"pdf_url":"https://arxiv.org/pdf/2406.02383v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.17077v2","updated":"2024-06-04T14:57:58Z","published":"2024-01-30T14:57:32Z","title":"Dynamical Survival Analysis with Controlled Latent States","summary":"  We consider the task of learning individual-specific intensities of counting\nprocesses from a set of static variables and irregularly sampled time series.\nWe introduce a novel modelization approach in which the intensity is the\nsolution to a controlled differential equation. We first design a neural\nestimator by building on neural controlled differential equations. In a second\ntime, we show that our model can be linearized in the signature space under\nsufficient regularity conditions, yielding a signature-based estimator which we\ncall CoxSig. We provide theoretical learning guarantees for both estimators,\nbefore showcasing the performance of our models on a vast array of simulated\nand real-world datasets from finance, predictive maintenance and food supply\nchain management.\n","authors":["Linus Bleistein","Van-Tuan Nguyen","Adeline Fermanian","Agathe Guilloux"],"pdf_url":"https://arxiv.org/pdf/2401.17077v2.pdf","comment":"ICML 2024"},{"id":"http://arxiv.org/abs/2405.08553v2","updated":"2024-06-04T14:49:36Z","published":"2024-05-14T12:41:11Z","title":"Improving Transformers with Dynamically Composable Multi-Head Attention","summary":"  Multi-Head Attention (MHA) is a key component of Transformer. In MHA,\nattention heads work independently, causing problems such as low-rank\nbottleneck of attention score matrices and head redundancy. We propose\nDynamically Composable Multi-Head Attention (DCMHA), a parameter and\ncomputation efficient attention architecture that tackles the shortcomings of\nMHA and increases the expressive power of the model by dynamically composing\nattention heads. At the core of DCMHA is a $\\it{Compose}$ function that\ntransforms the attention score and weight matrices in an input-dependent way.\nDCMHA can be used as a drop-in replacement of MHA in any transformer\narchitecture to obtain the corresponding DCFormer. DCFormer significantly\noutperforms Transformer on different architectures and model scales in language\nmodeling, matching the performance of models with ~1.7x-2.0x compute. For\nexample, DCPythia-6.9B outperforms open source Pythia-12B on both pretraining\nperplexity and downstream task evaluation. The code and models are available at\nhttps://github.com/Caiyun-AI/DCFormer.\n","authors":["Da Xiao","Qingye Meng","Shengping Li","Xingyuan Yuan"],"pdf_url":"https://arxiv.org/pdf/2405.08553v2.pdf","comment":"Accepted to the 41st International Conference on Machine Learning\n  (ICML'24 oral)"},{"id":"http://arxiv.org/abs/2406.02366v1","updated":"2024-06-04T14:45:47Z","published":"2024-06-04T14:45:47Z","title":"Finding NeMo: Localizing Neurons Responsible For Memorization in\n  Diffusion Models","summary":"  Diffusion models (DMs) produce very detailed and high-quality images. Their\npower results from extensive training on large amounts of data, usually scraped\nfrom the internet without proper attribution or consent from content creators.\nUnfortunately, this practice raises privacy and intellectual property concerns,\nas DMs can memorize and later reproduce their potentially sensitive or\ncopyrighted training images at inference time. Prior efforts prevent this issue\nby either changing the input to the diffusion process, thereby preventing the\nDM from generating memorized samples during inference, or removing the\nmemorized data from training altogether. While those are viable solutions when\nthe DM is developed and deployed in a secure and constantly monitored\nenvironment, they hold the risk of adversaries circumventing the safeguards and\nare not effective when the DM itself is publicly released. To solve the\nproblem, we introduce NeMo, the first method to localize memorization of\nindividual data samples down to the level of neurons in DMs' cross-attention\nlayers. Through our experiments, we make the intriguing finding that in many\ncases, single neurons are responsible for memorizing particular training\nsamples. By deactivating these memorization neurons, we can avoid the\nreplication of training data at inference time, increase the diversity in the\ngenerated outputs, and mitigate the leakage of private and copyrighted data. In\nthis way, our NeMo contributes to a more responsible deployment of DMs.\n","authors":["Dominik Hintersdorf","Lukas Struppek","Kristian Kersting","Adam Dziedzic","Franziska Boenisch"],"pdf_url":"https://arxiv.org/pdf/2406.02366v1.pdf","comment":"Preprint"},{"id":"http://arxiv.org/abs/2406.02362v1","updated":"2024-06-04T14:39:51Z","published":"2024-06-04T14:39:51Z","title":"Temporal Graph Rewiring with Expander Graphs","summary":"  Evolving relations in real-world networks are often modelled by temporal\ngraphs. Graph rewiring techniques have been utilised on Graph Neural Networks\n(GNNs) to improve expressiveness and increase model performance. In this work,\nwe propose Temporal Graph Rewiring (TGR), the first approach for graph rewiring\non temporal graphs. TGR enables communication between temporally distant nodes\nin a continuous time dynamic graph by utilising expander graph propagation to\nconstruct a message passing highway for message passing between distant nodes.\nExpander graphs are suitable candidates for rewiring as they help overcome the\noversquashing problem often observed in GNNs. On the public tgbl-wiki\nbenchmark, we show that TGR improves the performance of a widely used TGN model\nby a significant margin. Our code repository is accessible at\nhttps://anonymous.4open.science/r/TGR-254C.\n","authors":["Katarina Petrović","Shenyang Huang","Farimah Poursafaei","Petar Veličković"],"pdf_url":"https://arxiv.org/pdf/2406.02362v1.pdf","comment":"10 pages, 2 figures"},{"id":"http://arxiv.org/abs/2406.02361v1","updated":"2024-06-04T14:38:30Z","published":"2024-06-04T14:38:30Z","title":"Using Self-supervised Learning Can Improve Model Fairness","summary":"  Self-supervised learning (SSL) has become the de facto training paradigm of\nlarge models, where pre-training is followed by supervised fine-tuning using\ndomain-specific data and labels. Despite demonstrating comparable performance\nwith supervised methods, comprehensive efforts to assess SSL's impact on\nmachine learning fairness (i.e., performing equally on different demographic\nbreakdowns) are lacking. Hypothesizing that SSL models would learn more\ngeneric, hence less biased representations, this study explores the impact of\npre-training and fine-tuning strategies on fairness. We introduce a fairness\nassessment framework for SSL, comprising five stages: defining dataset\nrequirements, pre-training, fine-tuning with gradual unfreezing, assessing\nrepresentation similarity conditioned on demographics, and establishing\ndomain-specific evaluation processes. We evaluate our method's generalizability\non three real-world human-centric datasets (i.e., MIMIC, MESA, and GLOBEM) by\nsystematically comparing hundreds of SSL and fine-tuned models on various\ndimensions spanning from the intermediate representations to appropriate\nevaluation metrics. Our findings demonstrate that SSL can significantly improve\nmodel fairness, while maintaining performance on par with supervised\nmethods-exhibiting up to a 30% increase in fairness with minimal loss in\nperformance through self-supervision. We posit that such differences can be\nattributed to representation dissimilarities found between the best- and the\nworst-performing demographics across models-up to x13 greater for protected\nattributes with larger performance discrepancies between segments.\n","authors":["Sofia Yfantidou","Dimitris Spathis","Marios Constantinides","Athena Vakali","Daniele Quercia","Fahim Kawsar"],"pdf_url":"https://arxiv.org/pdf/2406.02361v1.pdf","comment":"arXiv admin note: text overlap with arXiv:2401.01640"},{"id":"http://arxiv.org/abs/2406.02357v1","updated":"2024-06-04T14:35:27Z","published":"2024-06-04T14:35:27Z","title":"The complexity of approximate (coarse) correlated equilibrium for\n  incomplete information games","summary":"  We study the iteration complexity of decentralized learning of approximate\ncorrelated equilibria in incomplete information games.\n  On the negative side, we prove that in $\\mathit{extensive}$-$\\mathit{form}$\n$\\mathit{games}$, assuming $\\mathsf{PPAD} \\not\\subset\n\\mathsf{TIME}(n^{\\mathsf{polylog}(n)})$, any polynomial-time learning\nalgorithms must take at least $2^{\\log_2^{1-o(1)}(|\\mathcal{I}|)}$ iterations\nto converge to the set of $\\epsilon$-approximate correlated equilibrium, where\n$|\\mathcal{I}|$ is the number of nodes in the game and $\\epsilon > 0$ is an\nabsolute constant. This nearly matches, up to the $o(1)$ term, the algorithms\nof [PR'24, DDFG'24] for learning $\\epsilon$-approximate correlated equilibrium,\nand resolves an open question of Anagnostides, Kalavasis, Sandholm, and\nZampetakis [AKSZ'24]. Our lower bound holds even for the easier solution\nconcept of $\\epsilon$-approximate $\\mathit{coarse}$ correlated equilibrium\n  On the positive side, we give uncoupled dynamics that reach\n$\\epsilon$-approximate correlated equilibria of a $\\mathit{Bayesian}$\n$\\mathit{game}$ in polylogarithmic iterations, without any dependence of the\nnumber of types. This demonstrates a separation between Bayesian games and\nextensive-form games.\n","authors":["Binghui Peng","Aviad Rubinstein"],"pdf_url":"https://arxiv.org/pdf/2406.02357v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.02356v1","updated":"2024-06-04T14:34:39Z","published":"2024-06-04T14:34:39Z","title":"Language Models Do Hard Arithmetic Tasks Easily and Hardly Do Easy\n  Arithmetic Tasks","summary":"  The ability (and inability) of large language models (LLMs) to perform\narithmetic tasks has been the subject of much theoretical and practical debate.\nWe show that LLMs are frequently able to correctly and confidently predict the\nfirst digit of n-digit by m-digit multiplication tasks without using chain of\nthought reasoning, despite these tasks require compounding operations to solve.\nSimultaneously, LLMs in practice often fail to correctly or confidently predict\nthe last digit of an n-digit by m-digit multiplication, a task equivalent to\n1-digit by 1-digit multiplication which can be easily learned or memorized. We\nshow that the latter task can be solved more robustly when the LLM is\nconditioned on all of the correct higher-order digits, which on average\nincreases the confidence of the correct last digit on 5-digit by 5-digit\nmultiplication tasks using Llama 2-13B by over 230% (0.13 to 0.43) and\nMistral-7B by 150% (0.22 to 0.55).\n","authors":["Andrew Gambardella","Yusuke Iwasawa","Yutaka Matsuo"],"pdf_url":"https://arxiv.org/pdf/2406.02356v1.pdf","comment":"In Proceedings of the 62nd Annual Meeting of the Association for\n  Computational Linguistics (Volume 2: Short Papers)"},{"id":"http://arxiv.org/abs/2404.10271v2","updated":"2024-06-04T14:34:38Z","published":"2024-04-16T03:59:33Z","title":"Social Choice Should Guide AI Alignment in Dealing with Diverse Human\n  Feedback","summary":"  Foundation models such as GPT-4 are fine-tuned to avoid unsafe or otherwise\nproblematic behavior, such as helping to commit crimes or producing racist\ntext. One approach to fine-tuning, called reinforcement learning from human\nfeedback, learns from humans' expressed preferences over multiple outputs.\nAnother approach is constitutional AI, in which the input from humans is a list\nof high-level principles. But how do we deal with potentially diverging input\nfrom humans? How can we aggregate the input into consistent data about\n\"collective\" preferences or otherwise use it to make collective choices about\nmodel behavior? In this paper, we argue that the field of social choice is well\npositioned to address these questions, and we discuss ways forward for this\nagenda, drawing on discussions in a recent workshop on Social Choice for AI\nEthics and Safety held in Berkeley, CA, USA in December 2023.\n","authors":["Vincent Conitzer","Rachel Freedman","Jobst Heitzig","Wesley H. Holliday","Bob M. Jacobs","Nathan Lambert","Milan Mossé","Eric Pacuit","Stuart Russell","Hailey Schoelkopf","Emanuel Tewolde","William S. Zwicker"],"pdf_url":"https://arxiv.org/pdf/2404.10271v2.pdf","comment":"15 pages, 4 figures"},{"id":"http://arxiv.org/abs/2406.02355v1","updated":"2024-06-04T14:34:13Z","published":"2024-06-04T14:34:13Z","title":"FedDr+: Stabilizing Dot-regression with Global Feature Distillation for\n  Federated Learning","summary":"  Federated Learning (FL) has emerged as a pivotal framework for the\ndevelopment of effective global models (global FL) or personalized models\n(personalized FL) across clients with heterogeneous, non-iid data distribution.\nA key challenge in FL is client drift, where data heterogeneity impedes the\naggregation of scattered knowledge. Recent studies have tackled the client\ndrift issue by identifying significant divergence in the last classifier layer.\nTo mitigate this divergence, strategies such as freezing the classifier weights\nand aligning the feature extractor accordingly have proven effective. Although\nthe local alignment between classifier and feature extractor has been studied\nas a crucial factor in FL, we observe that it may lead the model to\noveremphasize the observed classes within each client. Thus, our objectives are\ntwofold: (1) enhancing local alignment while (2) preserving the representation\nof unseen class samples. This approach aims to effectively integrate knowledge\nfrom individual clients, thereby improving performance for both global and\npersonalized FL. To achieve this, we introduce a novel algorithm named FedDr+,\nwhich empowers local model alignment using dot-regression loss. FedDr+ freezes\nthe classifier as a simplex ETF to align the features and improves aggregated\nglobal models by employing a feature distillation mechanism to retain\ninformation about unseen/missing classes. Consequently, we provide empirical\nevidence demonstrating that our algorithm surpasses existing methods that use a\nfrozen classifier to boost alignment across the diverse distribution.\n","authors":["Seongyoon Kim","Minchan Jeong","Sungnyun Kim","Sungwoo Cho","Sumyeong Ahn","Se-Young Yun"],"pdf_url":"https://arxiv.org/pdf/2406.02355v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.02354v1","updated":"2024-06-04T14:33:23Z","published":"2024-06-04T14:33:23Z","title":"Label-wise Aleatoric and Epistemic Uncertainty Quantification","summary":"  We present a novel approach to uncertainty quantification in classification\ntasks based on label-wise decomposition of uncertainty measures. This\nlabel-wise perspective allows uncertainty to be quantified at the individual\nclass level, thereby improving cost-sensitive decision-making and helping\nunderstand the sources of uncertainty. Furthermore, it allows to define total,\naleatoric, and epistemic uncertainty on the basis of non-categorical measures\nsuch as variance, going beyond common entropy-based measures. In particular,\nvariance-based measures address some of the limitations associated with\nestablished methods that have recently been discussed in the literature. We\nshow that our proposed measures adhere to a number of desirable properties.\nThrough empirical evaluation on a variety of benchmark data sets -- including\napplications in the medical domain where accurate uncertainty quantification is\ncrucial -- we establish the effectiveness of label-wise uncertainty\nquantification.\n","authors":["Yusuf Sale","Paul Hofman","Timo Löhr","Lisa Wimmer","Thomas Nagler","Eyke Hüllermeier"],"pdf_url":"https://arxiv.org/pdf/2406.02354v1.pdf","comment":"Uncertainty in Artificial Intelligence. arXiv admin note: substantial\n  text overlap with arXiv:2401.00276"},{"id":"http://arxiv.org/abs/2406.02352v1","updated":"2024-06-04T14:28:36Z","published":"2024-06-04T14:28:36Z","title":"System-Aware Neural ODE Processes for Few-Shot Bayesian Optimization","summary":"  We consider the problem of optimizing initial conditions and timing in\ndynamical systems governed by unknown ordinary differential equations (ODEs),\nwhere evaluating different initial conditions is costly and there are\nconstraints on observation times. To identify the optimal conditions within\nseveral trials, we introduce a few-shot Bayesian Optimization (BO) framework\nbased on the system's prior information. At the core of our approach is the\nSystem-Aware Neural ODE Processes (SANODEP), an extension of Neural ODE\nProcesses (NODEP) designed to meta-learn ODE systems from multiple trajectories\nusing a novel context embedding block. Additionally, we propose a\nmulti-scenario loss function specifically for optimization purposes. Our\ntwo-stage BO framework effectively incorporates search space constraints,\nenabling efficient optimization of both initial conditions and observation\ntimings. We conduct extensive experiments showcasing SANODEP's potential for\nfew-shot BO. We also explore SANODEP's adaptability to varying levels of prior\ninformation, highlighting the trade-off between prior flexibility and model\nfitting accuracy.\n","authors":["Jixiang Qing","Becky D Langdon","Robert M Lee","Behrang Shafei","Mark van der Wilk","Calvin Tsay","Ruth Misener"],"pdf_url":"https://arxiv.org/pdf/2406.02352v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.15925v4","updated":"2024-06-04T14:28:29Z","published":"2023-05-25T10:53:18Z","title":"On the Identifiability of Switching Dynamical Systems","summary":"  The identifiability of latent variable models has received increasing\nattention due to its relevance in interpretability and out-of-distribution\ngeneralisation. In this work, we study the identifiability of Switching\nDynamical Systems, taking an initial step toward extending identifiability\nanalysis to sequential latent variable models. We first prove the\nidentifiability of Markov Switching Models, which commonly serve as the prior\ndistribution for the continuous latent variables in Switching Dynamical\nSystems. We present identification conditions for first-order Markov dependency\nstructures, whose transition distribution is parametrised via non-linear\nGaussians. We then establish the identifiability of the latent variables and\nnon-linear mappings in Switching Dynamical Systems up to affine\ntransformations, by leveraging identifiability analysis techniques from\nidentifiable deep latent variable models. We finally develop estimation\nalgorithms for identifiable Switching Dynamical Systems. Throughout empirical\nstudies, we demonstrate the practicality of identifiable Switching Dynamical\nSystems for segmenting high-dimensional time series such as videos, and\nshowcase the use of identifiable Markov Switching Models for regime-dependent\ncausal discovery in climate data.\n","authors":["Carles Balsells-Rodas","Yixin Wang","Yingzhen Li"],"pdf_url":"https://arxiv.org/pdf/2305.15925v4.pdf","comment":"ICML 2024"},{"id":"http://arxiv.org/abs/2406.02348v1","updated":"2024-06-04T14:24:30Z","published":"2024-06-04T14:24:30Z","title":"AMOSL: Adaptive Modality-wise Structure Learning in Multi-view Graph\n  Neural Networks For Enhanced Unified Representation","summary":"  While Multi-view Graph Neural Networks (MVGNNs) excel at leveraging diverse\nmodalities for learning object representation, existing methods assume\nidentical local topology structures across modalities that overlook real-world\ndiscrepancies. This leads MVGNNs straggles in modality fusion and\nrepresentations denoising. To address these issues, we propose adaptive\nmodality-wise structure learning (AMoSL). AMoSL captures node correspondences\nbetween modalities via optimal transport, and jointly learning with graph\nembedding. To enable efficient end-to-end training, we employ an efficient\nsolution for the resulting complex bilevel optimization problem. Furthermore,\nAMoSL adapts to downstream tasks through unsupervised learning on\ninter-modality distances. The effectiveness of AMoSL is demonstrated by its\nability to train more accurate graph classifiers on six benchmark datasets.\n","authors":["Peiyu Liang","Hongchang Gao","Xubin He"],"pdf_url":"https://arxiv.org/pdf/2406.02348v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.02347v1","updated":"2024-06-04T14:23:27Z","published":"2024-06-04T14:23:27Z","title":"Flash Diffusion: Accelerating Any Conditional Diffusion Model for Few\n  Steps Image Generation","summary":"  In this paper, we propose an efficient, fast, and versatile distillation\nmethod to accelerate the generation of pre-trained diffusion models: Flash\nDiffusion. The method reaches state-of-the-art performances in terms of FID and\nCLIP-Score for few steps image generation on the COCO2014 and COCO2017\ndatasets, while requiring only several GPU hours of training and fewer\ntrainable parameters than existing methods. In addition to its efficiency, the\nversatility of the method is also exposed across several tasks such as\ntext-to-image, inpainting, face-swapping, super-resolution and using different\nbackbones such as UNet-based denoisers (SD1.5, SDXL) or DiT (Pixart-$\\alpha$),\nas well as adapters. In all cases, the method allowed to reduce drastically the\nnumber of sampling steps while maintaining very high-quality image generation.\nThe official implementation is available at\nhttps://github.com/gojasper/flash-diffusion.\n","authors":["Clement Chadebec","Onur Tasar","Eyal Benaroche","Benjamin Aubin"],"pdf_url":"https://arxiv.org/pdf/2406.02347v1.pdf","comment":"16 pages + 16 pages appendices"},{"id":"http://arxiv.org/abs/2406.02345v1","updated":"2024-06-04T14:21:41Z","published":"2024-06-04T14:21:41Z","title":"Progressive Confident Masking Attention Network for Audio-Visual\n  Segmentation","summary":"  Audio and visual signals typically occur simultaneously, and humans possess\nan innate ability to correlate and synchronize information from these two\nmodalities. Recently, a challenging problem known as Audio-Visual Segmentation\n(AVS) has emerged, intending to produce segmentation maps for sounding objects\nwithin a scene. However, the methods proposed so far have not sufficiently\nintegrated audio and visual information, and the computational costs have been\nextremely high. Additionally, the outputs of different stages have not been\nfully utilized. To facilitate this research, we introduce a novel Progressive\nConfident Masking Attention Network (PMCANet). It leverages attention\nmechanisms to uncover the intrinsic correlations between audio signals and\nvisual frames. Furthermore, we design an efficient and effective\ncross-attention module to enhance semantic perception by selecting query\ntokens. This selection is determined through confidence-driven units based on\nthe network's multi-stage predictive outputs. Experiments demonstrate that our\nnetwork outperforms other AVS methods while requiring less computational\nresources.\n","authors":["Yuxuan Wang","Feng Dong","Jinchao Zhu"],"pdf_url":"https://arxiv.org/pdf/2406.02345v1.pdf","comment":"10 pages, 9 figures, submitted to IEEE TRANSACTIONS ON CIRCUITS AND\n  SYSTEMS FOR VIDEO TECHNOLOGY"},{"id":"http://arxiv.org/abs/2406.02344v1","updated":"2024-06-04T14:20:10Z","published":"2024-06-04T14:20:10Z","title":"Incorporating Navigation Context into Inland Vessel Trajectory\n  Prediction: A Gaussian Mixture Model and Transformer Approach","summary":"  Using data sources beyond the Automatic Identification System to represent\nthe context a vessel is navigating in and consequently improve situation\nawareness is still rare in machine learning approaches to vessel trajectory\nprediction (VTP). In inland shipping, where vessel movement is constrained\nwithin fairways, navigational context information is indispensable. In this\ncontribution targeting inland VTP, Gaussian Mixture Models (GMMs) are applied,\non a fused dataset of AIS and discharge measurements, to generate multi-modal\ndistribution curves, capturing typical lateral vessel positioning in the\nfairway and dislocation speeds along the waterway. By sampling the probability\ndensity curves of the GMMs, feature vectors are derived which are used,\ntogether with spatio-temporal vessel features and fairway geometries, as input\nto a VTP transformer model. The incorporation of these distribution features of\nboth the current and forthcoming navigation context improves prediction\naccuracy. The superiority of the model over a previously proposed transformer\nmodel for inland VTP is shown. The novelty lies in the provision of\npreprocessed, statistics-based features representing the conditioned spatial\ncontext, rather than relying on the model to extract relevant features for the\nVTP task from contextual data. Oversimplification of the complexity of inland\nnavigation patterns by assuming a single typical route or selecting specific\nclusters prior to model application is avoided by giving the model access to\nthe entire distribution information. The methodology's generalizability is\ndemonstrated through the usage of data of 3 distinct river sections. It can be\nintegrated into an interaction-aware prediction framework, where insights into\nthe positioning of the actual vessel behavior in the overall distribution at\nthe current location and discharge can enhance trajectory prediction accuracy.\n","authors":["Kathrin Donandt","Dirk Söffker"],"pdf_url":"https://arxiv.org/pdf/2406.02344v1.pdf","comment":"To be published in Proceedings of the 27th International Conference\n  on Information Fusion (FUSION 2024)"},{"id":"http://arxiv.org/abs/2406.02343v1","updated":"2024-06-04T14:19:50Z","published":"2024-06-04T14:19:50Z","title":"Cluster-Aware Similarity Diffusion for Instance Retrieval","summary":"  Diffusion-based re-ranking is a common method used for retrieving instances\nby performing similarity propagation in a nearest neighbor graph. However,\nexisting techniques that construct the affinity graph based on pairwise\ninstances can lead to the propagation of misinformation from outliers and other\nmanifolds, resulting in inaccurate results. To overcome this issue, we propose\na novel Cluster-Aware Similarity (CAS) diffusion for instance retrieval. The\nprimary concept of CAS is to conduct similarity diffusion within local\nclusters, which can reduce the influence from other manifolds explicitly. To\nobtain a symmetrical and smooth similarity matrix, our Bidirectional Similarity\nDiffusion strategy introduces an inverse constraint term to the optimization\nobjective of local cluster diffusion. Additionally, we have optimized a\nNeighbor-guided Similarity Smoothing approach to ensure similarity consistency\namong the local neighbors of each instance. Evaluations in instance retrieval\nand object re-identification validate the effectiveness of the proposed CAS,\nour code is publicly available.\n","authors":["Jifei Luo","Hantao Yao","Changsheng Xu"],"pdf_url":"https://arxiv.org/pdf/2406.02343v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.09125v2","updated":"2024-06-04T14:15:55Z","published":"2024-01-17T11:01:28Z","title":"Understanding Heterophily for Graph Neural Networks","summary":"  Graphs with heterophily have been regarded as challenging scenarios for Graph\nNeural Networks (GNNs), where nodes are connected with dissimilar neighbors\nthrough various patterns. In this paper, we present theoretical understandings\nof the impacts of different heterophily patterns for GNNs by incorporating the\ngraph convolution (GC) operations into fully connected networks via the\nproposed Heterophilous Stochastic Block Models (HSBM), a general random graph\nmodel that can accommodate diverse heterophily patterns. Firstly, we show that\nby applying a GC operation, the separability gains are determined by two\nfactors, i.e., the Euclidean distance of the neighborhood distributions and\n$\\sqrt{\\mathbb{E}\\left[\\operatorname{deg}\\right]}$, where\n$\\mathbb{E}\\left[\\operatorname{deg}\\right]$ is the averaged node degree. It\nreveals that the impact of heterophily on classification needs to be evaluated\nalongside the averaged node degree. Secondly, we show that the topological\nnoise has a detrimental impact on separability, which is equivalent to\ndegrading $\\mathbb{E}\\left[\\operatorname{deg}\\right]$. Finally, when applying\nmultiple GC operations, we show that the separability gains are determined by\nthe normalized distance of the $l$-powered neighborhood distributions. It\nindicates that the nodes still possess separability as $l$ goes to infinity in\na wide range of regimes. Extensive experiments on both synthetic and real-world\ndata verify the effectiveness of our theory.\n","authors":["Junfu Wang","Yuanfang Guo","Liang Yang","Yunhong Wang"],"pdf_url":"https://arxiv.org/pdf/2401.09125v2.pdf","comment":"ICML 2024"},{"id":"http://arxiv.org/abs/2306.12214v4","updated":"2024-06-04T14:09:44Z","published":"2023-06-21T12:13:46Z","title":"More PAC-Bayes bounds: From bounded losses, to losses with general tail\n  behaviors, to anytime validity","summary":"  In this paper, we present new high-probability PAC-Bayes bounds for different\ntypes of losses. Firstly, for losses with a bounded range, we recover a\nstrengthened version of Catoni's bound that holds uniformly for all parameter\nvalues. This leads to new fast-rate and mixed-rate bounds that are\ninterpretable and tighter than previous bounds in the literature. In\nparticular, the fast-rate bound is equivalent to the Seeger--Langford bound.\nSecondly, for losses with more general tail behaviors, we introduce two new\nparameter-free bounds: a PAC-Bayes Chernoff analogue when the loss' cumulative\ngenerating function is bounded, and a bound when the loss' second moment is\nbounded. These two bounds are obtained using a new technique based on a\ndiscretization of the space of possible events for the ``in probability''\nparameter optimization problem. This technique is both simpler and more general\nthan previous approaches optimizing over a grid on the parameters' space.\nFinally, using a simple technique that is applicable to any existing bound, we\nextend all previous results to anytime-valid bounds.\n","authors":["Borja Rodríguez-Gálvez","Ragnar Thobaben","Mikael Skoglund"],"pdf_url":"https://arxiv.org/pdf/2306.12214v4.pdf","comment":"43 pages: ~20 of main text, ~6.5 of references, and ~17.5 of\n  appendices. Published at JMLR"},{"id":"http://arxiv.org/abs/2406.02336v1","updated":"2024-06-04T14:06:15Z","published":"2024-06-04T14:06:15Z","title":"Polynomial-Augmented Neural Networks (PANNs) with Weak Orthogonality\n  Constraints for Enhanced Function and PDE Approximation","summary":"  We present polynomial-augmented neural networks (PANNs), a novel machine\nlearning architecture that combines deep neural networks (DNNs) with a\npolynomial approximant. PANNs combine the strengths of DNNs (flexibility and\nefficiency in higher-dimensional approximation) with those of polynomial\napproximation (rapid convergence rates for smooth functions). To aid in both\nstable training and enhanced accuracy over a variety of problems, we present\n(1) a family of orthogonality constraints that impose mutual orthogonality\nbetween the polynomial and the DNN within a PANN; (2) a simple basis pruning\napproach to combat the curse of dimensionality introduced by the polynomial\ncomponent; and (3) an adaptation of a polynomial preconditioning strategy to\nboth DNNs and polynomials. We test the resulting architecture for its\npolynomial reproduction properties, ability to approximate both smooth\nfunctions and functions of limited smoothness, and as a method for the solution\nof partial differential equations (PDEs). Through these experiments, we\ndemonstrate that PANNs offer superior approximation properties to DNNs for both\nregression and the numerical solution of PDEs, while also offering enhanced\naccuracy over both polynomial and DNN-based regression (each) when regressing\nfunctions with limited smoothness.\n","authors":["Madison Cooley","Shandian Zhe","Robert M. Kirby","Varun Shankar"],"pdf_url":"https://arxiv.org/pdf/2406.02336v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.02333v1","updated":"2024-06-04T14:01:03Z","published":"2024-06-04T14:01:03Z","title":"Towards Neural Architecture Search for Transfer Learning in 6G Networks","summary":"  The future 6G network is envisioned to be AI-native, and as such, ML models\nwill be pervasive in support of optimizing performance, reducing energy\nconsumption, and in coping with increasing complexity and heterogeneity. A key\nchallenge is automating the process of finding optimal model architectures\nsatisfying stringent requirements stemming from varying tasks, dynamicity and\navailable resources in the infrastructure and deployment positions. In this\npaper, we describe and review the state-of-the-art in Neural Architecture\nSearch and Transfer Learning and their applicability in networking. Further, we\nidentify open research challenges and set directions with a specific focus on\nthree main requirements with elements unique to the future network, namely\ncombining NAS and TL, multi-objective search, and tabular data. Finally, we\noutline and discuss both near-term and long-term work ahead.\n","authors":["Adam Orucu","Farnaz Moradi","Masoumeh Ebrahimi","Andreas Johnsson"],"pdf_url":"https://arxiv.org/pdf/2406.02333v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.02332v1","updated":"2024-06-04T14:00:25Z","published":"2024-06-04T14:00:25Z","title":"Extended Mind Transformers","summary":"  Pre-trained language models demonstrate general intelligence and common\nsense, but long inputs quickly become a bottleneck for memorizing information\nat inference time. We resurface a simple method, Memorizing Transformers (Wu et\nal., 2022), that gives the model access to a bank of pre-computed memories. We\nshow that it is possible to fix many of the shortcomings of the original\nmethod, such as the need for fine-tuning, by critically assessing how\npositional encodings should be updated for the keys and values retrieved. This\nintuitive method uses the model's own key/query system to select and attend to\nthe most relevant memories at each generation step, rather than using external\nembeddings. We demonstrate the importance of external information being\nretrieved in a majority of decoder layers, contrary to previous work. We open\nsource a new counterfactual long-range retrieval benchmark, and show that\nExtended Mind Transformers outperform today's state of the art by 6% on\naverage.\n","authors":["Phoebe Klett","Thomas Ahle"],"pdf_url":"https://arxiv.org/pdf/2406.02332v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.03703v3","updated":"2024-06-04T13:58:30Z","published":"2023-11-07T03:55:39Z","title":"Practical Performance Guarantees for Pipelined DNN Inference","summary":"  We optimize pipeline parallelism for deep neural network (DNN) inference by\npartitioning model graphs into $k$ stages and minimizing the running time of\nthe bottleneck stage, including communication. We give practical and effective\nalgorithms for this NP-hard problem, but our emphasis is on tackling the\npractitioner's dilemma of deciding when a solution is good enough. To this end,\nwe design novel mixed-integer programming (MIP) relaxations for proving lower\nbounds. Applying these methods to a diverse testbed of 369 production models,\nfor $k \\in \\{2, 4, 8, 16, 32, 64\\}$, we empirically show that these lower\nbounds are strong enough to be useful in practice. Our lower bounds are\nsubstantially stronger than standard combinatorial bounds. For example,\nevaluated via geometric means across a production testbed with $k = 16$\npipeline stages, our MIP formulations raise the lower bound from 0.4598 to\n0.9452, expressed as a fraction of the best partition found. In other words,\nour improved lower bounds close the optimality gap by a factor of 9.855x.\n","authors":["Aaron Archer","Matthew Fahrbach","Kuikui Liu","Prakash Prabhu"],"pdf_url":"https://arxiv.org/pdf/2311.03703v3.pdf","comment":"17 pages, 5 figures"},{"id":"http://arxiv.org/abs/2406.02329v1","updated":"2024-06-04T13:58:28Z","published":"2024-06-04T13:58:28Z","title":"On Affine Homotopy between Language Encoders","summary":"  Pre-trained language encoders -- functions that represent text as vectors --\nare an integral component of many NLP tasks. We tackle a natural question in\nlanguage encoder analysis: What does it mean for two encoders to be similar? We\ncontend that a faithful measure of similarity needs to be \\emph{intrinsic},\nthat is, task-independent, yet still be informative of \\emph{extrinsic}\nsimilarity -- the performance on downstream tasks. It is common to consider two\nencoders similar if they are \\emph{homotopic}, i.e., if they can be aligned\nthrough some transformation. In this spirit, we study the properties of\n\\emph{affine} alignment of language encoders and its implications on extrinsic\nsimilarity. We find that while affine alignment is fundamentally an asymmetric\nnotion of similarity, it is still informative of extrinsic similarity. We\nconfirm this on datasets of natural language representations. Beyond providing\nuseful bounds on extrinsic similarity, affine intrinsic similarity also allows\nus to begin uncovering the structure of the space of pre-trained encoders by\ndefining an order over them.\n","authors":["Robin SM Chan","Reda Boumasmoud","Anej Svete","Yuxin Ren","Qipeng Guo","Zhijing Jin","Shauli Ravfogel","Mrinmaya Sachan","Bernhard Schölkopf","Mennatallah El-Assady","Ryan Cotterell"],"pdf_url":"https://arxiv.org/pdf/2406.02329v1.pdf","comment":"10 pages"},{"id":"http://arxiv.org/abs/2406.02327v1","updated":"2024-06-04T13:57:34Z","published":"2024-06-04T13:57:34Z","title":"Continual Unsupervised Out-of-Distribution Detection","summary":"  Deep learning models excel when the data distribution during training aligns\nwith testing data. Yet, their performance diminishes when faced with\nout-of-distribution (OOD) samples, leading to great interest in the field of\nOOD detection. Current approaches typically assume that OOD samples originate\nfrom an unconcentrated distribution complementary to the training distribution.\nWhile this assumption is appropriate in the traditional unsupervised OOD\n(U-OOD) setting, it proves inadequate when considering the place of deployment\nof the underlying deep learning model. To better reflect this real-world\nscenario, we introduce the novel setting of continual U-OOD detection. To\ntackle this new setting, we propose a method that starts from a U-OOD detector,\nwhich is agnostic to the OOD distribution, and slowly updates during deployment\nto account for the actual OOD distribution. Our method uses a new U-OOD scoring\nfunction that combines the Mahalanobis distance with a nearest-neighbor\napproach. Furthermore, we design a confidence-scaled few-shot OOD detector that\noutperforms previous methods. We show our method greatly improves upon strong\nbaselines from related fields.\n","authors":["Lars Doorenbos","Raphael Sznitman","Pablo Márquez-Neila"],"pdf_url":"https://arxiv.org/pdf/2406.02327v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.02322v1","updated":"2024-06-04T13:52:42Z","published":"2024-06-04T13:52:42Z","title":"A Survey of Transformer Enabled Time Series Synthesis","summary":"  Generative AI has received much attention in the image and language domains,\nwith the transformer neural network continuing to dominate the state of the\nart. Application of these models to time series generation is less explored,\nhowever, and is of great utility to machine learning, privacy preservation, and\nexplainability research. The present survey identifies this gap at the\nintersection of the transformer, generative AI, and time series data, and\nreviews works in this sparsely populated subdomain. The reviewed works show\ngreat variety in approach, and have not yet converged on a conclusive answer to\nthe problems the domain poses. GANs, diffusion models, state space models, and\nautoencoders were all encountered alongside or surrounding the transformers\nwhich originally motivated the survey. While too open a domain to offer\nconclusive insights, the works surveyed are quite suggestive, and several\nrecommendations for best practice, and suggestions of valuable future work, are\nprovided.\n","authors":["Alexander Sommers","Logan Cummins","Sudip Mittal","Shahram Rahimi","Maria Seale","Joseph Jaboure","Thomas Arnold"],"pdf_url":"https://arxiv.org/pdf/2406.02322v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.19014v2","updated":"2024-06-04T13:51:10Z","published":"2024-05-29T11:53:07Z","title":"Trust the Model Where It Trusts Itself -- Model-Based Actor-Critic with\n  Uncertainty-Aware Rollout Adaption","summary":"  Dyna-style model-based reinforcement learning (MBRL) combines model-free\nagents with predictive transition models through model-based rollouts. This\ncombination raises a critical question: 'When to trust your model?'; i.e.,\nwhich rollout length results in the model providing useful data? Janner et al.\n(2019) address this question by gradually increasing rollout lengths throughout\nthe training. While theoretically tempting, uniform model accuracy is a fallacy\nthat collapses at the latest when extrapolating. Instead, we propose asking the\nquestion 'Where to trust your model?'. Using inherent model uncertainty to\nconsider local accuracy, we obtain the Model-Based Actor-Critic with\nUncertainty-Aware Rollout Adaption (MACURA) algorithm. We propose an\neasy-to-tune rollout mechanism and demonstrate substantial improvements in data\nefficiency and performance compared to state-of-the-art deep MBRL methods on\nthe MuJoCo benchmark.\n","authors":["Bernd Frauenknecht","Artur Eisele","Devdutt Subhasish","Friedrich Solowjow","Sebastian Trimpe"],"pdf_url":"https://arxiv.org/pdf/2405.19014v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.02318v1","updated":"2024-06-04T13:51:08Z","published":"2024-06-04T13:51:08Z","title":"PeFAD: A Parameter-Efficient Federated Framework for Time Series Anomaly\n  Detection","summary":"  With the proliferation of mobile sensing techniques, huge amounts of time\nseries data are generated and accumulated in various domains, fueling plenty of\nreal-world applications. In this setting, time series anomaly detection is\npractically important. It endeavors to identify deviant samples from the normal\nsample distribution in time series. Existing approaches generally assume that\nall the time series is available at a central location. However, we are\nwitnessing the decentralized collection of time series due to the deployment of\nvarious edge devices. To bridge the gap between the decentralized time series\ndata and the centralized anomaly detection algorithms, we propose a\nParameter-efficient Federated Anomaly Detection framework named PeFAD with the\nincreasing privacy concerns. PeFAD for the first time employs the pre-trained\nlanguage model (PLM) as the body of the client's local model, which can benefit\nfrom its cross-modality knowledge transfer capability. To reduce the\ncommunication overhead and local model adaptation cost, we propose a\nparameter-efficient federated training module such that clients only need to\nfine-tune small-scale parameters and transmit them to the server for update.\nPeFAD utilizes a novel anomaly-driven mask selection strategy to mitigate the\nimpact of neglected anomalies during training. A knowledge distillation\noperation on a synthetic privacy-preserving dataset that is shared by all the\nclients is also proposed to address the data heterogeneity issue across\nclients. We conduct extensive evaluations on four real datasets, where PeFAD\noutperforms existing state-of-the-art baselines by up to 28.74\\%.\n","authors":["Ronghui Xu","Hao Miao","Senzhang Wang","Philip S. Yu","Jianxin Wang"],"pdf_url":"https://arxiv.org/pdf/2406.02318v1.pdf","comment":"Accepted by SIGKDD 2024 (Research Track)"},{"id":"http://arxiv.org/abs/2406.02317v1","updated":"2024-06-04T13:45:35Z","published":"2024-06-04T13:45:35Z","title":"Generative Conditional Distributions by Neural (Entropic) Optimal\n  Transport","summary":"  Learning conditional distributions is challenging because the desired outcome\nis not a single distribution but multiple distributions that correspond to\nmultiple instances of the covariates. We introduce a novel neural entropic\noptimal transport method designed to effectively learn generative models of\nconditional distributions, particularly in scenarios characterized by limited\nsample sizes. Our method relies on the minimax training of two neural networks:\na generative network parametrizing the inverse cumulative distribution\nfunctions of the conditional distributions and another network parametrizing\nthe conditional Kantorovich potential. To prevent overfitting, we regularize\nthe objective function by penalizing the Lipschitz constant of the network\noutput. Our experiments on real-world datasets show the effectiveness of our\nalgorithm compared to state-of-the-art conditional distribution learning\ntechniques. Our implementation can be found at\nhttps://github.com/nguyenngocbaocmt02/GENTLE.\n","authors":["Bao Nguyen","Binh Nguyen","Hieu Trung Nguyen","Viet Anh Nguyen"],"pdf_url":"https://arxiv.org/pdf/2406.02317v1.pdf","comment":"15 pages, 8 figures"},{"id":"http://arxiv.org/abs/2406.02315v1","updated":"2024-06-04T13:44:39Z","published":"2024-06-04T13:44:39Z","title":"An Independence-promoting Loss for Music Generation with Language Models","summary":"  Music generation schemes using language modeling rely on a vocabulary of\naudio tokens, generally provided as codes in a discrete latent space learnt by\nan auto-encoder. Multi-stage quantizers are often employed to produce these\ntokens, therefore the decoding strategy used for token prediction must be\nadapted to account for multiple codebooks: either it should model the joint\ndistribution over all codebooks, or fit the product of the codebook marginal\ndistributions. Modelling the joint distribution requires a costly increase in\nthe number of auto-regressive steps, while fitting the product of the marginals\nyields an inexact model unless the codebooks are mutually independent. In this\nwork, we introduce an independence-promoting loss to regularize the\nauto-encoder used as the tokenizer in language models for music generation. The\nproposed loss is a proxy for mutual information based on the maximum mean\ndiscrepancy principle, applied in reproducible kernel Hilbert spaces. Our\ncriterion is simple to implement and train, and it is generalizable to other\nmulti-stream codecs. We show that it reduces the statistical dependence between\ncodebooks during auto-encoding. This leads to an increase in the generated\nmusic quality when modelling the product of the marginal distributions, while\ngenerating audio much faster than the joint distribution model.\n","authors":["Jean-Marie Lemercier","Simon Rouard","Jade Copet","Yossi Adi","Alexandre Déffosez"],"pdf_url":"https://arxiv.org/pdf/2406.02315v1.pdf","comment":"Accepted to ICML 2024"},{"id":"http://arxiv.org/abs/2406.02313v1","updated":"2024-06-04T13:42:42Z","published":"2024-06-04T13:42:42Z","title":"Neural Thermodynamic Integration: Free Energies from Energy-based\n  Diffusion Models","summary":"  Thermodynamic integration (TI) offers a rigorous method for estimating\nfree-energy differences by integrating over a sequence of interpolating\nconformational ensembles. However, TI calculations are computationally\nexpensive and typically limited to coupling a small number of degrees of\nfreedom due to the need to sample numerous intermediate ensembles with\nsufficient conformational-space overlap. In this work, we propose to perform TI\nalong an alchemical pathway represented by a trainable neural network, which we\nterm Neural TI. Critically, we parametrize a time-dependent Hamiltonian\ninterpolating between the interacting and non-interacting systems, and optimize\nits gradient using a denoising-diffusion objective. The ability of the\nresulting energy-based diffusion model to sample all intermediate ensembles,\nallows us to perform TI from a single reference calculation. We apply our\nmethod to Lennard-Jones fluids, where we report accurate calculations of the\nexcess chemical potential, demonstrating that Neural TI is capable of coupling\nhundreds of degrees of freedom at once.\n","authors":["Bálint Máté","François Fleuret","Tristan Bereau"],"pdf_url":"https://arxiv.org/pdf/2406.02313v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.12479v2","updated":"2024-06-04T13:42:06Z","published":"2024-02-19T19:34:07Z","title":"In value-based deep reinforcement learning, a pruned network is a good\n  network","summary":"  Recent work has shown that deep reinforcement learning agents have difficulty\nin effectively using their network parameters. We leverage prior insights into\nthe advantages of sparse training techniques and demonstrate that gradual\nmagnitude pruning enables value-based agents to maximize parameter\neffectiveness. This results in networks that yield dramatic performance\nimprovements over traditional networks, using only a small fraction of the full\nnetwork parameters.\n","authors":["Johan Obando-Ceron","Aaron Courville","Pablo Samuel Castro"],"pdf_url":"https://arxiv.org/pdf/2402.12479v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.02310v1","updated":"2024-06-04T13:41:07Z","published":"2024-06-04T13:41:07Z","title":"Disentangled Representation via Variational AutoEncoder for Continuous\n  Treatment Effect Estimation","summary":"  Continuous treatment effect estimation holds significant practical importance\nacross various decision-making and assessment domains, such as healthcare and\nthe military. However, current methods for estimating dose-response curves\nhinge on balancing the entire representation by treating all covariates as\nconfounding variables. Although various approaches disentangle covariates into\ndifferent factors for treatment effect estimation, they are confined to binary\ntreatment settings. Moreover, observational data are often tainted with\nnon-causal noise information that is imperceptible to the human. Hence, in this\npaper, we propose a novel Dose-Response curve estimator via Variational\nAutoEncoder (DRVAE) disentangled covariates representation. Our model is\ndedicated to disentangling covariates into instrumental factors, confounding\nfactors, adjustment factors, and external noise factors, thereby facilitating\nthe estimation of treatment effects under continuous treatment settings by\nbalancing the disentangled confounding factors. Extensive results on synthetic\nand semi-synthetic datasets demonstrate that our model outperforms the current\nstate-of-the-art methods.\n","authors":["Ruijing Cui","Jianbin Sun","Bingyu He","Kewei Yang","Bingfeng Ge"],"pdf_url":"https://arxiv.org/pdf/2406.02310v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.02309v1","updated":"2024-06-04T13:41:00Z","published":"2024-06-04T13:41:00Z","title":"Effects of Exponential Gaussian Distribution on (Double Sampling)\n  Randomized Smoothing","summary":"  Randomized Smoothing (RS) is currently a scalable certified defense method\nproviding robustness certification against adversarial examples. Although\nsignificant progress has been achieved in providing defenses against $\\ell_p$\nadversaries, the interaction between the smoothing distribution and the\nrobustness certification still remains vague. In this work, we comprehensively\nstudy the effect of two families of distributions, named Exponential Standard\nGaussian (ESG) and Exponential General Gaussian (EGG) distributions, on\nRandomized Smoothing and Double Sampling Randomized Smoothing (DSRS). We derive\nan analytic formula for ESG's certified radius, which converges to the origin\nformula of RS as the dimension $d$ increases. Additionally, we prove that EGG\ncan provide tighter constant factors than DSRS in providing $\\Omega(\\sqrt{d})$\nlower bounds of $\\ell_2$ certified radius, and thus further addresses the curse\nof dimensionality in RS. Our experiments on real-world datasets confirm our\ntheoretical analysis of the ESG distributions, that they provide almost the\nsame certification under different exponents $\\eta$ for both RS and DSRS. In\naddition, EGG\n","authors":["Youwei Shu","Xi Xiao","Derui Wang","Yuxin Cao","Siji Chen","Jason Xue","Linyi Li","Bo Li"],"pdf_url":"https://arxiv.org/pdf/2406.02309v1.pdf","comment":"ICML 2024 Poster"},{"id":"http://arxiv.org/abs/2312.07457v3","updated":"2024-06-04T13:39:17Z","published":"2023-12-12T17:34:42Z","title":"Dynamics Harmonic Analysis of Robotic Systems: Application in\n  Data-Driven Koopman Modelling","summary":"  We introduce the use of harmonic analysis to decompose the state space of\nsymmetric robotic systems into orthogonal isotypic subspaces. These are\nlower-dimensional spaces that capture distinct, symmetric, and synergistic\nmotions. For linear dynamics, we characterize how this decomposition leads to a\nsubdivision of the dynamics into independent linear systems on each subspace, a\nproperty we term dynamics harmonic analysis (DHA). To exploit this property, we\nuse Koopman operator theory to propose an equivariant deep-learning\narchitecture that leverages the properties of DHA to learn a global linear\nmodel of the system dynamics. Our architecture, validated on synthetic systems\nand the dynamics of locomotion of a quadrupedal robot, exhibits enhanced\ngeneralization, sample efficiency, and interpretability, with fewer trainable\nparameters and computational costs.\n","authors":["Daniel Ordoñez-Apraez","Vladimir Kostic","Giulio Turrisi","Pietro Novelli","Carlos Mastalli","Claudio Semini","Massimiliano Pontil"],"pdf_url":"https://arxiv.org/pdf/2312.07457v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.08609v2","updated":"2024-06-04T13:36:10Z","published":"2024-02-13T17:18:56Z","title":"Mixtures of Experts Unlock Parameter Scaling for Deep RL","summary":"  The recent rapid progress in (self) supervised learning models is in large\npart predicted by empirical scaling laws: a model's performance scales\nproportionally to its size. Analogous scaling laws remain elusive for\nreinforcement learning domains, however, where increasing the parameter count\nof a model often hurts its final performance. In this paper, we demonstrate\nthat incorporating Mixture-of-Expert (MoE) modules, and in particular Soft MoEs\n(Puigcerver et al., 2023), into value-based networks results in more\nparameter-scalable models, evidenced by substantial performance increases\nacross a variety of training regimes and model sizes. This work thus provides\nstrong empirical evidence towards developing scaling laws for reinforcement\nlearning.\n","authors":["Johan Obando-Ceron","Ghada Sokar","Timon Willi","Clare Lyle","Jesse Farebrother","Jakob Foerster","Gintare Karolina Dziugaite","Doina Precup","Pablo Samuel Castro"],"pdf_url":"https://arxiv.org/pdf/2402.08609v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.02300v1","updated":"2024-06-04T13:29:12Z","published":"2024-06-04T13:29:12Z","title":"Node-Level Topological Representation Learning on Point Clouds","summary":"  Topological Data Analysis (TDA) allows us to extract powerful topological and\nhigher-order information on the global shape of a data set or point cloud.\nTools like Persistent Homology or the Euler Transform give a single complex\ndescription of the global structure of the point cloud. However, common machine\nlearning applications like classification require point-level information and\nfeatures to be available. In this paper, we bridge this gap and propose a novel\nmethod to extract node-level topological features from complex point clouds\nusing discrete variants of concepts from algebraic topology and differential\ngeometry. We verify the effectiveness of these topological point features\n(TOPF) on both synthetic and real-world data and study their robustness under\nnoise.\n","authors":["Vincent P. Grande","Michael T. Schaub"],"pdf_url":"https://arxiv.org/pdf/2406.02300v1.pdf","comment":"30 pages, 10 figures, comments welcome"},{"id":"http://arxiv.org/abs/2406.02298v1","updated":"2024-06-04T13:19:06Z","published":"2024-06-04T13:19:06Z","title":"Solving Partial Differential Equations in Different Domains by Operator\n  Learning method Based on Boundary Integral Equations","summary":"  This article explores operator learning models that can deduce solutions to\npartial differential equations (PDEs) on arbitrary domains without requiring\nretraining. We introduce two innovative models rooted in boundary integral\nequations (BIEs): the Boundary Integral Type Deep Operator Network\n(BI-DeepONet) and the Boundary Integral Trigonometric Deep Operator Neural\nNetwork (BI-TDONet), which are crafted to address PDEs across diverse domains.\nOnce fully trained, these BIE-based models adeptly predict the solutions of\nPDEs in any domain without the need for additional training. BI-TDONet notably\nenhances its performance by employing the singular value decomposition (SVD) of\nbounded linear operators, allowing for the efficient distribution of input\nfunctions across its modules. Furthermore, to tackle the issue of function\nsampling values that do not effectively capture oscillatory and impulse signal\ncharacteristics, trigonometric coefficients are utilized as both inputs and\noutputs in BI-TDONet. Our numerical experiments robustly support and confirm\nthe efficacy of this theoretical framework.\n","authors":["Bin Meng","Yutong Lu","Ying Jiang"],"pdf_url":"https://arxiv.org/pdf/2406.02298v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.02296v1","updated":"2024-06-04T13:17:24Z","published":"2024-06-04T13:17:24Z","title":"Learning-Rate-Free Stochastic Optimization over Riemannian Manifolds","summary":"  In recent years, interest in gradient-based optimization over Riemannian\nmanifolds has surged. However, a significant challenge lies in the reliance on\nhyperparameters, especially the learning rate, which requires meticulous tuning\nby practitioners to ensure convergence at a suitable rate. In this work, we\nintroduce innovative learning-rate-free algorithms for stochastic optimization\nover Riemannian manifolds, eliminating the need for hand-tuning and providing a\nmore robust and user-friendly approach. We establish high probability\nconvergence guarantees that are optimal, up to logarithmic factors, compared to\nthe best-known optimally tuned rate in the deterministic setting. Our approach\nis validated through numerical experiments, demonstrating competitive\nperformance against learning-rate-dependent algorithms.\n","authors":["Daniel Dodd","Louis Sharrock","Christopher Nemeth"],"pdf_url":"https://arxiv.org/pdf/2406.02296v1.pdf","comment":"ICML 2024"},{"id":"http://arxiv.org/abs/2406.02295v1","updated":"2024-06-04T13:16:34Z","published":"2024-06-04T13:16:34Z","title":"How to Explore with Belief: State Entropy Maximization in POMDPs","summary":"  Recent works have studied *state entropy maximization* in reinforcement\nlearning, in which the agent's objective is to learn a policy inducing high\nentropy over states visitation (Hazan et al., 2019). They typically assume full\nobservability of the state of the system, so that the entropy of the\nobservations is maximized. In practice, the agent may only get *partial*\nobservations, e.g., a robot perceiving the state of a physical space through\nproximity sensors and cameras. A significant mismatch between the entropy over\nobservations and true states of the system can arise in those settings. In this\npaper, we address the problem of entropy maximization over the *true states*\nwith a decision policy conditioned on partial observations *only*. The latter\nis a generalization of POMDPs, which is intractable in general. We develop a\nmemory and computationally efficient *policy gradient* method to address a\nfirst-order relaxation of the objective defined on *belief* states, providing\nvarious formal characterizations of approximation gaps, the optimization\nlandscape, and the *hallucination* problem. This paper aims to generalize state\nentropy maximization to more realistic domains that meet the challenges of\napplications.\n","authors":["Riccardo Zamboni","Duilio Cirino","Marcello Restelli","Mirco Mutti"],"pdf_url":"https://arxiv.org/pdf/2406.02295v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.02294v1","updated":"2024-06-04T13:16:08Z","published":"2024-06-04T13:16:08Z","title":"Smaller Batches, Bigger Gains? Investigating the Impact of Batch Sizes\n  on Reinforcement Learning Based Real-World Production Scheduling","summary":"  Production scheduling is an essential task in manufacturing, with\nReinforcement Learning (RL) emerging as a key solution. In a previous work, RL\nwas utilized to solve an extended permutation flow shop scheduling problem\n(PFSSP) for a real-world production line with two stages, linked by a central\nbuffer. The RL agent was trained to sequence equallysized product batches to\nminimize setup efforts and idle times. However, the substantial impact caused\nby varying the size of these product batches has not yet been explored. In this\nfollow-up study, we investigate the effects of varying batch sizes, exploring\nboth the quality of solutions and the training dynamics of the RL agent. The\nresults demonstrate that it is possible to methodically identify reasonable\nboundaries for the batch size. These boundaries are determined on one side by\nthe increasing sample complexity associated with smaller batch sizes, and on\nthe other side by the decreasing flexibility of the agent when dealing with\nlarger batch sizes. This provides the practitioner the ability to make an\ninformed decision regarding the selection of an appropriate batch size.\nMoreover, we introduce and investigate two new curriculum learning strategies\nto enable the training with small batch sizes. The findings of this work offer\nthe potential for application in several industrial use cases with comparable\nscheduling problems.\n","authors":["Arthur Müller","Felix Grumbach","Matthia Sabatelli"],"pdf_url":"https://arxiv.org/pdf/2406.02294v1.pdf","comment":"This paper was accepted at the ETFA 2024 conference"},{"id":"http://arxiv.org/abs/2406.02293v1","updated":"2024-06-04T13:13:29Z","published":"2024-06-04T13:13:29Z","title":"Composite Quantile Regression With XGBoost Using the Novel Arctan\n  Pinball Loss","summary":"  This paper explores the use of XGBoost for composite quantile regression.\nXGBoost is a highly popular model renowned for its flexibility, efficiency, and\ncapability to deal with missing data. The optimization uses a second order\napproximation of the loss function, complicating the use of loss functions with\na zero or vanishing second derivative. Quantile regression -- a popular\napproach to obtain conditional quantiles when point estimates alone are\ninsufficient -- unfortunately uses such a loss function, the pinball loss.\nExisting workarounds are typically inefficient and can result in severe\nquantile crossings. In this paper, we present a smooth approximation of the\npinball loss, the arctan pinball loss, that is tailored to the needs of\nXGBoost. Specifically, contrary to other smooth approximations, the arctan\npinball loss has a relatively large second derivative, which makes it more\nsuitable to use in the second order approximation. Using this loss function\nenables the simultaneous prediction of multiple quantiles, which is more\nefficient and results in far fewer quantile crossings.\n","authors":["Laurens Sluijterman","Frank Kreuwel","Eric Cator","Tom Heskes"],"pdf_url":"https://arxiv.org/pdf/2406.02293v1.pdf","comment":"24 pages, 9 figures"},{"id":"http://arxiv.org/abs/2406.02292v1","updated":"2024-06-04T13:11:01Z","published":"2024-06-04T13:11:01Z","title":"An Axiomatic Approach to Loss Aggregation and an Adapted Aggregating\n  Algorithm","summary":"  Supervised learning has gone beyond the expected risk minimization framework.\nCentral to most of these developments is the introduction of more general\naggregation functions for losses incurred by the learner. In this paper, we\nturn towards online learning under expert advice. Via easily justified\nassumptions we characterize a set of reasonable loss aggregation functions as\nquasi-sums. Based upon this insight, we suggest a variant of the Aggregating\nAlgorithm tailored to these more general aggregation functions. This variant\ninherits most of the nice theoretical properties of the AA, such as recovery of\nBayes' updating and a time-independent bound on quasi-sum regret. Finally, we\nargue that generalized aggregations express the attitude of the learner towards\nlosses.\n","authors":["Armando J. Cabrera Pacheco","Rabanus Derr","Robert C. Williamson"],"pdf_url":"https://arxiv.org/pdf/2406.02292v1.pdf","comment":"31 pages"},{"id":"http://arxiv.org/abs/2406.02290v1","updated":"2024-06-04T13:05:47Z","published":"2024-06-04T13:05:47Z","title":"A Study of Optimizations for Fine-tuning Large Language Models","summary":"  Fine-tuning large language models is a popular choice among users trying to\nadapt them for specific applications. However, fine-tuning these models is a\ndemanding task because the user has to examine several factors, such as\nresource budget, runtime, model size and context length among others. A\nspecific challenge is that fine-tuning is memory intensive, imposing\nconstraints on the required hardware memory and context length of training data\nthat can be handled. In this work, we share a detailed study on a variety of\nfine-tuning optimizations across different fine-tuning scenarios. In\nparticular, we assess Gradient Checkpointing, Low Rank Adaptation, DeepSpeed's\nZeRO Redundancy Optimizer and Flash Attention. With a focus on memory and\nruntime, we examine the impact of different optimization combinations on GPU\nmemory usage and execution runtime during fine-tuning phase. We provide\nrecommendation on best default optimization for balancing memory and runtime\nacross diverse model sizes. We share effective strategies for fine-tuning very\nlarge models with tens or hundreds of billions of parameters and enabling large\ncontext lengths during fine-tuning. Furthermore, we propose the appropriate\noptimization mixtures for fine-tuning under GPU resource limitations.\n","authors":["Arjun Singh","Nikhil Pandey","Anup Shirgaonkar","Pavan Manoj","Vijay Aski"],"pdf_url":"https://arxiv.org/pdf/2406.02290v1.pdf","comment":"10 pages, 4 figures"},{"id":"http://arxiv.org/abs/2405.17495v2","updated":"2024-06-04T13:04:53Z","published":"2024-05-25T16:05:06Z","title":"Vertical Federated Learning for Effectiveness, Security, Applicability:\n  A Survey","summary":"  Vertical Federated Learning (VFL) is a privacy-preserving distributed\nlearning paradigm where different parties collaboratively learn models using\npartitioned features of shared samples, without leaking private data. Recent\nresearch has shown promising results addressing various challenges in VFL,\nhighlighting its potential for practical applications in cross-domain\ncollaboration. However, the corresponding research is scattered and lacks\norganization. To advance VFL research, this survey offers a systematic overview\nof recent developments. First, we provide a history and background\nintroduction, along with a summary of the general training protocol of VFL. We\nthen revisit the taxonomy in recent reviews and analyze limitations in-depth.\nFor a comprehensive and structured discussion, we synthesize recent research\nfrom three fundamental perspectives: effectiveness, security, and\napplicability. Finally, we discuss several critical future research directions\nin VFL, which will facilitate the developments in this field. We provide a\ncollection of research lists and periodically update them at\nhttps://github.com/shentt67/VFL_Survey.\n","authors":["Mang Ye","Wei Shen","Bo Du","Eduard Snezhko","Vassili Kovalev","Pong C. Yuen"],"pdf_url":"https://arxiv.org/pdf/2405.17495v2.pdf","comment":"31 pages, 9 figures, 10 tables"},{"id":"http://arxiv.org/abs/2406.02285v1","updated":"2024-06-04T12:58:19Z","published":"2024-06-04T12:58:19Z","title":"Towards Supervised Performance on Speaker Verification with\n  Self-Supervised Learning by Leveraging Large-Scale ASR Models","summary":"  Recent advancements in Self-Supervised Learning (SSL) have shown promising\nresults in Speaker Verification (SV). However, narrowing the performance gap\nwith supervised systems remains an ongoing challenge. Several studies have\nobserved that speech representations from large-scale ASR models contain\nvaluable speaker information. This work explores the limitations of fine-tuning\nthese models for SV using an SSL contrastive objective in an end-to-end\napproach. Then, we propose a framework to learn speaker representations in an\nSSL context by fine-tuning a pre-trained WavLM with a supervised loss using\npseudo-labels. Initial pseudo-labels are derived from an SSL DINO-based model\nand are iteratively refined by clustering the model embeddings. Our method\nachieves 0.99% EER on VoxCeleb1-O, establishing the new state-of-the-art on\nself-supervised SV. As this performance is close to our supervised baseline of\n0.94% EER, this contribution is a step towards supervised performance on SV\nwith SSL.\n","authors":["Victor Miara","Theo Lepage","Reda Dehak"],"pdf_url":"https://arxiv.org/pdf/2406.02285v1.pdf","comment":"accepted at INTERSPEECH 2024"},{"id":"http://arxiv.org/abs/2307.10167v2","updated":"2024-06-04T12:57:53Z","published":"2023-07-19T17:53:22Z","title":"VITS : Variational Inference Thomson Sampling for contextual bandits","summary":"  In this paper, we introduce and analyze a variant of the Thompson sampling\n(TS) algorithm for contextual bandits. At each round, traditional TS requires\nsamples from the current posterior distribution, which is usually intractable.\nTo circumvent this issue, approximate inference techniques can be used and\nprovide samples with distribution close to the posteriors. However, current\napproximate techniques yield to either poor estimation (Laplace approximation)\nor can be computationally expensive (MCMC methods, Ensemble sampling...). In\nthis paper, we propose a new algorithm, Varational Inference Thompson sampling\nVITS, based on Gaussian Variational Inference. This scheme provides powerful\nposterior approximations which are easy to sample from, and is computationally\nefficient, making it an ideal choice for TS. In addition, we show that VITS\nachieves a sub-linear regret bound of the same order in the dimension and\nnumber of round as traditional TS for linear contextual bandit. Finally, we\ndemonstrate experimentally the effectiveness of VITS on both synthetic and real\nworld datasets.\n","authors":["Pierre Clavier","Tom Huix","Alain Durmus"],"pdf_url":"https://arxiv.org/pdf/2307.10167v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.03760v3","updated":"2024-06-04T12:56:46Z","published":"2023-11-07T06:54:40Z","title":"Posterior Sampling-Based Bayesian Optimization with Tighter Bayesian\n  Regret Bounds","summary":"  Among various acquisition functions (AFs) in Bayesian optimization (BO),\nGaussian process upper confidence bound (GP-UCB) and Thompson sampling (TS) are\nwell-known options with established theoretical properties regarding Bayesian\ncumulative regret (BCR). Recently, it has been shown that a randomized variant\nof GP-UCB achieves a tighter BCR bound compared with GP-UCB, which we call the\ntighter BCR bound for brevity. Inspired by this study, this paper first shows\nthat TS achieves the tighter BCR bound. On the other hand, GP-UCB and TS often\npractically suffer from manual hyperparameter tuning and over-exploration\nissues, respectively. Therefore, we analyze yet another AF called a probability\nof improvement from the maximum of a sample path (PIMS). We show that PIMS\nachieves the tighter BCR bound and avoids the hyperparameter tuning, unlike\nGP-UCB. Furthermore, we demonstrate a wide range of experiments, focusing on\nthe effectiveness of PIMS that mitigates the practical issues of GP-UCB and TS.\n","authors":["Shion Takeno","Yu Inatsu","Masayuki Karasuyama","Ichiro Takeuchi"],"pdf_url":"https://arxiv.org/pdf/2311.03760v3.pdf","comment":"28 pages, 3 figures, 2 tables, Accepted to ICML2024"},{"id":"http://arxiv.org/abs/2406.02282v1","updated":"2024-06-04T12:56:10Z","published":"2024-06-04T12:56:10Z","title":"Test-Time Regret Minimization in Meta Reinforcement Learning","summary":"  Meta reinforcement learning sets a distribution over a set of tasks on which\nthe agent can train at will, then is asked to learn an optimal policy for any\ntest task efficiently. In this paper, we consider a finite set of tasks modeled\nthrough Markov decision processes with various dynamics. We assume to have\nendured a long training phase, from which the set of tasks is perfectly\nrecovered, and we focus on regret minimization against the optimal policy in\nthe unknown test task. Under a separation condition that states the existence\nof a state-action pair revealing a task against another, Chen et al. (2022)\nshow that $O(M^2 \\log(H))$ regret can be achieved, where $M, H$ are the number\nof tasks in the set and test episodes, respectively. In our first contribution,\nwe demonstrate that the latter rate is nearly optimal by developing a novel\nlower bound for test-time regret minimization under separation, showing that a\nlinear dependence with $M$ is unavoidable. Then, we present a family of\nstronger yet reasonable assumptions beyond separation, which we call strong\nidentifiability, enabling algorithms achieving fast rates $\\log (H)$ and\nsublinear dependence with $M$ simultaneously. Our paper provides a new\nunderstanding of the statistical barriers of test-time regret minimization and\nwhen fast rates can be achieved.\n","authors":["Mirco Mutti","Aviv Tamar"],"pdf_url":"https://arxiv.org/pdf/2406.02282v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.02273v1","updated":"2024-06-04T12:49:46Z","published":"2024-06-04T12:49:46Z","title":"A KL-based Analysis Framework with Applications to Non-Descent\n  Optimization Methods","summary":"  We propose a novel analysis framework for non-descent-type optimization\nmethodologies in nonconvex scenarios based on the Kurdyka-Lojasiewicz property.\nOur framework allows covering a broad class of algorithms, including those\ncommonly employed in stochastic and distributed optimization. Specifically, it\nenables the analysis of first-order methods that lack a sufficient descent\nproperty and do not require access to full (deterministic) gradient\ninformation. We leverage this framework to establish, for the first time,\niterate convergence and the corresponding rates for the decentralized gradient\nmethod and federated averaging under mild assumptions. Furthermore, based on\nthe new analysis techniques, we show the convergence of the random reshuffling\nand stochastic gradient descent method without necessitating typical a priori\nbounded iterates assumptions.\n","authors":["Junwen Qiu","Bohao Ma","Xiao Li","Andre Milzarek"],"pdf_url":"https://arxiv.org/pdf/2406.02273v1.pdf","comment":"29 pages"},{"id":"http://arxiv.org/abs/2406.02269v1","updated":"2024-06-04T12:47:13Z","published":"2024-06-04T12:47:13Z","title":"Graph Neural Networks Do Not Always Oversmooth","summary":"  Graph neural networks (GNNs) have emerged as powerful tools for processing\nrelational data in applications. However, GNNs suffer from the problem of\noversmoothing, the property that the features of all nodes exponentially\nconverge to the same vector over layers, prohibiting the design of deep GNNs.\nIn this work we study oversmoothing in graph convolutional networks (GCNs) by\nusing their Gaussian process (GP) equivalence in the limit of infinitely many\nhidden features. By generalizing methods from conventional deep neural networks\n(DNNs), we can describe the distribution of features at the output layer of\ndeep GCNs in terms of a GP: as expected, we find that typical parameter choices\nfrom the literature lead to oversmoothing. The theory, however, allows us to\nidentify a new, nonoversmoothing phase: if the initial weights of the network\nhave sufficiently large variance, GCNs do not oversmooth, and node features\nremain informative even at large depth. We demonstrate the validity of this\nprediction in finite-size GCNs by training a linear classifier on their output.\nMoreover, using the linearization of the GCN GP, we generalize the concept of\npropagation depth of information from DNNs to GCNs. This propagation depth\ndiverges at the transition between the oversmoothing and non-oversmoothing\nphase. We test the predictions of our approach and find good agreement with\nfinite-size GCNs. Initializing GCNs near the transition to the\nnon-oversmoothing phase, we obtain networks which are both deep and expressive.\n","authors":["Bastian Epping","Alexandre René","Moritz Helias","Michael T. Schaub"],"pdf_url":"https://arxiv.org/pdf/2406.02269v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.02268v1","updated":"2024-06-04T12:47:11Z","published":"2024-06-04T12:47:11Z","title":"Analyzing the Benefits of Prototypes for Semi-Supervised Category\n  Learning","summary":"  Categories can be represented at different levels of abstraction, from\nprototypes focused on the most typical members to remembering all observed\nexemplars of the category. These representations have been explored in the\ncontext of supervised learning, where stimuli are presented with known category\nlabels. We examine the benefits of prototype-based representations in a\nless-studied domain: semi-supervised learning, where agents must form\nunsupervised representations of stimuli before receiving category labels. We\nstudy this problem in a Bayesian unsupervised learning model called a\nvariational auto-encoder, and we draw on recent advances in machine learning to\nimplement a prior that encourages the model to use abstract prototypes to\nrepresent data. We apply this approach to image datasets and show that forming\nprototypes can improve semi-supervised category learning. Additionally, we\nstudy the latent embeddings of the models and show that these prototypes allow\nthe models to form clustered representations without supervision, contributing\nto their success in downstream categorization performance.\n","authors":["Liyi Zhang","Logan Nelson","Thomas L. Griffiths"],"pdf_url":"https://arxiv.org/pdf/2406.02268v1.pdf","comment":"7 pages, 3 figures"},{"id":"http://arxiv.org/abs/2310.07579v3","updated":"2024-06-04T12:35:56Z","published":"2023-10-11T15:19:31Z","title":"In-Context Unlearning: Language Models as Few Shot Unlearners","summary":"  Machine unlearning, the study of efficiently removing the impact of specific\ntraining instances on a model, has garnered increased attention in recent years\ndue to regulatory guidelines such as the \\emph{Right to be Forgotten}.\nAchieving precise unlearning typically involves fully retraining the model and\nis computationally infeasible in case of very large models such as Large\nLanguage Models (LLMs). To this end, recent work has proposed several\nalgorithms which approximate the removal of training data without retraining\nthe model. These algorithms crucially rely on access to the model parameters in\norder to update them, an assumption that may not hold in practice due to\ncomputational constraints or having only query access to the LLMs. In this\nwork, we propose a new class of unlearning methods for LLMs called ``In-Context\nUnlearning.'' This method unlearns instances from the model by simply providing\nspecific kinds of inputs in context, without the need to update model\nparameters. To unlearn specific training instances, we present these instances\nto the LLMs at inference time along with labels that differ from their ground\ntruth. Our experimental results demonstrate that in-context unlearning performs\non par with, or in some cases outperforms other state-of-the-art methods that\nrequire access to model parameters, effectively removing the influence of\nspecific instances on the model while preserving test accuracy.\n","authors":["Martin Pawelczyk","Seth Neel","Himabindu Lakkaraju"],"pdf_url":"https://arxiv.org/pdf/2310.07579v3.pdf","comment":"Accepted at ICML 2024"},{"id":"http://arxiv.org/abs/2402.10001v2","updated":"2024-06-04T12:34:25Z","published":"2024-02-15T15:06:33Z","title":"Privacy Attacks in Decentralized Learning","summary":"  Decentralized Gradient Descent (D-GD) allows a set of users to perform\ncollaborative learning without sharing their data by iteratively averaging\nlocal model updates with their neighbors in a network graph. The absence of\ndirect communication between non-neighbor nodes might lead to the belief that\nusers cannot infer precise information about the data of others. In this work,\nwe demonstrate the opposite, by proposing the first attack against D-GD that\nenables a user (or set of users) to reconstruct the private data of other users\noutside their immediate neighborhood. Our approach is based on a reconstruction\nattack against the gossip averaging protocol, which we then extend to handle\nthe additional challenges raised by D-GD. We validate the effectiveness of our\nattack on real graphs and datasets, showing that the number of users\ncompromised by a single or a handful of attackers is often surprisingly large.\nWe empirically investigate some of the factors that affect the performance of\nthe attack, namely the graph topology, the number of attackers, and their\nposition in the graph.\n","authors":["Abdellah El Mrini","Edwige Cyffers","Aurélien Bellet"],"pdf_url":"https://arxiv.org/pdf/2402.10001v2.pdf","comment":"accepted to ICML 2024"},{"id":"http://arxiv.org/abs/2406.02258v1","updated":"2024-06-04T12:29:51Z","published":"2024-06-04T12:29:51Z","title":"Reinforcement Learning with Lookahead Information","summary":"  We study reinforcement learning (RL) problems in which agents observe the\nreward or transition realizations at their current state before deciding which\naction to take. Such observations are available in many applications, including\ntransactions, navigation and more. When the environment is known, previous work\nshows that this lookahead information can drastically increase the collected\nreward. However, outside of specific applications, existing approaches for\ninteracting with unknown environments are not well-adapted to these\nobservations. In this work, we close this gap and design provably-efficient\nlearning algorithms able to incorporate lookahead information. To achieve this,\nwe perform planning using the empirical distribution of the reward and\ntransition observations, in contrast to vanilla approaches that only rely on\nestimated expectations. We prove that our algorithms achieve tight regret\nversus a baseline that also has access to lookahead information - linearly\nincreasing the amount of collected reward compared to agents that cannot handle\nlookahead information.\n","authors":["Nadav Merlis"],"pdf_url":"https://arxiv.org/pdf/2406.02258v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.02255v1","updated":"2024-06-04T12:21:55Z","published":"2024-06-04T12:21:55Z","title":"MidiCaps -- A large-scale MIDI dataset with text captions","summary":"  Generative models guided by text prompts are increasingly becoming more\npopular. However, no text-to-MIDI models currently exist, mostly due to the\nlack of a captioned MIDI dataset. This work aims to enable research that\ncombines LLMs with symbolic music by presenting the first large-scale MIDI\ndataset with text captions that is openly available: MidiCaps. MIDI (Musical\nInstrument Digital Interface) files are a widely used format for encoding\nmusical information. Their structured format captures the nuances of musical\ncomposition and has practical applications by music producers, composers,\nmusicologists, as well as performers. Inspired by recent advancements in\ncaptioning techniques applied to various domains, we present a large-scale\ncurated dataset of over 168k MIDI files accompanied by textual descriptions.\nEach MIDI caption succinctly describes the musical content, encompassing tempo,\nchord progression, time signature, instruments present, genre and mood; thereby\nfacilitating multi-modal exploration and analysis. The dataset contains a mix\nof various genres, styles, and complexities, offering a rich source for\ntraining and evaluating models for tasks such as music information retrieval,\nmusic understanding and cross-modal translation. We provide detailed statistics\nabout the dataset and have assessed the quality of the captions in an extensive\nlistening study. We anticipate that this resource will stimulate further\nresearch in the intersection of music and natural language processing,\nfostering advancements in both fields.\n","authors":["Jan Melechovsky","Abhinaba Roy","Dorien Herremans"],"pdf_url":"https://arxiv.org/pdf/2406.02255v1.pdf","comment":"Under review"},{"id":"http://arxiv.org/abs/2308.12000v4","updated":"2024-06-04T12:14:46Z","published":"2023-08-23T08:38:53Z","title":"On Universally Optimal Algorithms for A/B Testing","summary":"  We study the problem of best-arm identification with fixed budget in\nstochastic multi-armed bandits with Bernoulli rewards. For the problem with two\narms, also known as the A/B testing problem, we prove that there is no\nalgorithm that (i) performs as well as the algorithm sampling each arm equally\n(referred to as the {\\it uniform sampling} algorithm) in all instances, and\nthat (ii) strictly outperforms uniform sampling on at least one instance. In\nshort, there is no algorithm better than the uniform sampling algorithm. To\nestablish this result, we first introduce the natural class of {\\it consistent}\nand {\\it stable} algorithms, and show that any algorithm that performs as well\nas the uniform sampling algorithm in all instances belongs to this class. The\nproof then proceeds by deriving a lower bound on the error rate satisfied by\nany consistent and stable algorithm, and by showing that the uniform sampling\nalgorithm matches this lower bound. Our results provide a solution to the two\nopen problems presented in \\citep{qin2022open}. For the general problem with\nmore than two arms, we provide a first set of results. We characterize the\nasymptotic error rate of the celebrated Successive Rejects (SR) algorithm\n\\citep{audibert2010best} and show that, surprisingly, the uniform sampling\nalgorithm outperforms the SR algorithm in some instances.\n","authors":["Po-An Wang","Kaito Ariu","Alexandre Proutiere"],"pdf_url":"https://arxiv.org/pdf/2308.12000v4.pdf","comment":"Accepted at ICML 2024"},{"id":"http://arxiv.org/abs/2406.02245v1","updated":"2024-06-04T12:09:44Z","published":"2024-06-04T12:09:44Z","title":"Description Boosting for Zero-Shot Entity and Relation Classification","summary":"  Zero-shot entity and relation classification models leverage available\nexternal information of unseen classes -- e.g., textual descriptions -- to\nannotate input text data. Thanks to the minimum data requirement, Zero-Shot\nLearning (ZSL) methods have high value in practice, especially in applications\nwhere labeled data is scarce. Even though recent research in ZSL has\ndemonstrated significant results, our analysis reveals that those methods are\nsensitive to provided textual descriptions of entities (or relations). Even a\nminor modification of descriptions can lead to a change in the decision\nboundary between entity (or relation) classes. In this paper, we formally\ndefine the problem of identifying effective descriptions for zero shot\ninference. We propose a strategy for generating variations of an initial\ndescription, a heuristic for ranking them and an ensemble method capable of\nboosting the predictions of zero-shot models through description enhancement.\nEmpirical results on four different entity and relation classification datasets\nshow that our proposed method outperform existing approaches and achieve new\nSOTA results on these datasets under the ZSL settings. The source code of the\nproposed solutions and the evaluation framework are open-sourced.\n","authors":["Gabriele Picco","Leopold Fuchs","Marcos Martínez Galindo","Alberto Purpura","Vanessa López","Hoang Thanh Lam"],"pdf_url":"https://arxiv.org/pdf/2406.02245v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.00438v3","updated":"2024-06-04T12:03:28Z","published":"2023-03-01T11:54:22Z","title":"A Framework for Neurosymbolic Robot Action Planning using Large Language\n  Models","summary":"  Symbolic task planning is a widely used approach to enforce robot autonomy\ndue to its ease of understanding and deployment in robot architectures.\nHowever, techniques for symbolic task planning are difficult to scale in\nreal-world, human-robot collaboration scenarios because of the poor performance\nin complex planning domains or when frequent re-planning is needed. We present\na framework, Teriyaki, specifically aimed at bridging the gap between symbolic\ntask planning and machine learning approaches. The rationale is training Large\nLanguage Models (LLMs), namely GPT-3, into a neurosymbolic task planner\ncompatible with the Planning Domain Definition Language (PDDL), and then\nleveraging its generative capabilities to overcome a number of limitations\ninherent to symbolic task planners. Potential benefits include (i) a better\nscalability in so far as the planning domain complexity increases, since LLMs'\nresponse time linearly scales with the combined length of the input and the\noutput, and (ii) the ability to synthesize a plan action-by-action instead of\nend-to-end, making each action available for execution as soon as it is\ngenerated instead of waiting for the whole plan to be available, which in turn\nenables concurrent planning and execution. Recently, significant efforts have\nbeen devoted by the research community to evaluate the cognitive capabilities\nof LLMs, with alternate successes. Instead, with Teriyaki we aim to provide an\noverall planning performance comparable to traditional planners in specific\nplanning domains, while leveraging LLMs capabilities to build a look-ahead\npredictive planning model. Preliminary results in selected domains show that\nour method can: (i) solve 95.5% of problems in a test data set of 1,000\nsamples; (ii) produce plans up to 13.5% shorter than a traditional symbolic\nplanner; (iii) reduce average overall waiting times for a plan availability by\nup to 61.4%\n","authors":["Alessio Capitanelli","Fulvio Mastrogiovanni"],"pdf_url":"https://arxiv.org/pdf/2303.00438v3.pdf","comment":"36 pages, 7 figures, 2 tables. Updated according to reviewers'\n  comments. Previous title: A Framework to Generate Neurosymbolic\n  PDDL-compliant Planners"},{"id":"http://arxiv.org/abs/2402.01399v2","updated":"2024-06-04T11:59:20Z","published":"2024-02-02T13:31:17Z","title":"A Probabilistic Model behind Self-Supervised Learning","summary":"  In self-supervised learning (SSL), representations are learned via an\nauxiliary task without annotated labels. A common task is to classify\naugmentations or different modalities of the data, which share semantic content\n(e.g. an object in an image) but differ in style (e.g. the object's location).\nMany approaches to self-supervised learning have been proposed, e.g. SimCLR,\nCLIP, and VicREG, which have recently gained much attention for their\nrepresentations achieving downstream performance comparable to supervised\nlearning. However, a theoretical understanding of self-supervised methods\neludes. Addressing this, we present a generative latent variable model for\nself-supervised learning and show that several families of discriminative SSL,\nincluding contrastive methods, induce a comparable distribution over\nrepresentations, providing a unifying theoretical framework for these methods.\nThe proposed model also justifies connections drawn to mutual information and\nthe use of a \"projection head\". Learning representations by fitting the model\ngeneratively (termed SimVAE) improves performance over discriminative and other\nVAE-based methods on simple image benchmarks and significantly narrows the gap\nbetween generative and discriminative representation learning in more complex\nsettings. Importantly, as our analysis predicts, SimVAE outperforms\nself-supervised learning where style information is required, taking an\nimportant step toward understanding self-supervised methods and achieving\ntask-agnostic representations.\n","authors":["Alice Bizeul","Bernhard Schölkopf","Carl Allen"],"pdf_url":"https://arxiv.org/pdf/2402.01399v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.02234v1","updated":"2024-06-04T11:56:19Z","published":"2024-06-04T11:56:19Z","title":"On the Limitations of Fractal Dimension as a Measure of Generalization","summary":"  Bounding and predicting the generalization gap of overparameterized neural\nnetworks remains a central open problem in theoretical machine learning. Neural\nnetwork optimization trajectories have been proposed to possess fractal\nstructure, leading to bounds and generalization measures based on notions of\nfractal dimension on these trajectories. Prominently, both the Hausdorff\ndimension and the persistent homology dimension have been proposed to correlate\nwith generalization gap, thus serving as a measure of generalization. This work\nperforms an extended evaluation of these topological generalization measures.\nWe demonstrate that fractal dimension fails to predict generalization of models\ntrained from poor initializations. We further identify that the $\\ell^2$ norm\nof the final parameter iterate, one of the simplest complexity measures in\nlearning theory, correlates more strongly with the generalization gap than\nthese notions of fractal dimension. Finally, our study reveals the intriguing\nmanifestation of model-wise double descent in persistent homology-based\ngeneralization measures. This work lays the ground for a deeper investigation\nof the causal relationships between fractal geometry, topological data\nanalysis, and neural network optimization.\n","authors":["Charlie Tan","Inés García-Redondo","Qiquan Wang","Michael M. Bronstein","Anthea Monod"],"pdf_url":"https://arxiv.org/pdf/2406.02234v1.pdf","comment":"17 pages, 6 figures"},{"id":"http://arxiv.org/abs/2307.03565v2","updated":"2024-06-04T11:54:09Z","published":"2023-07-07T12:57:10Z","title":"MALIBO: Meta-learning for Likelihood-free Bayesian Optimization","summary":"  Bayesian optimization (BO) is a popular method to optimize costly black-box\nfunctions. While traditional BO optimizes each new target task from scratch,\nmeta-learning has emerged as a way to leverage knowledge from related tasks to\noptimize new tasks faster. However, existing meta-learning BO methods rely on\nsurrogate models that suffer from scalability issues and are sensitive to\nobservations with different scales and noise types across tasks. Moreover, they\noften overlook the uncertainty associated with task similarity. This leads to\nunreliable task adaptation when only limited observations are obtained or when\nthe new tasks differ significantly from the related tasks. To address these\nlimitations, we propose a novel meta-learning BO approach that bypasses the\nsurrogate model and directly learns the utility of queries across tasks. Our\nmethod explicitly models task uncertainty and includes an auxiliary model to\nenable robust adaptation to new tasks. Extensive experiments show that our\nmethod demonstrates strong anytime performance and outperforms state-of-the-art\nmeta-learning BO methods in various benchmarks.\n","authors":["Jiarong Pan","Stefan Falkner","Felix Berkenkamp","Joaquin Vanschoren"],"pdf_url":"https://arxiv.org/pdf/2307.03565v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.16475v2","updated":"2024-06-04T11:53:44Z","published":"2024-05-26T07:58:51Z","title":"Looks Too Good To Be True: An Information-Theoretic Analysis of\n  Hallucinations in Generative Restoration Models","summary":"  The pursuit of high perceptual quality in image restoration has driven the\ndevelopment of revolutionary generative models, capable of producing results\noften visually indistinguishable from real data. However, as their perceptual\nquality continues to improve, these models also exhibit a growing tendency to\ngenerate hallucinations - realistic-looking details that do not exist in the\nground truth images. The presence of hallucinations introduces uncertainty\nregarding the reliability of the models' predictions, raising major concerns\nabout their practical application. In this paper, we employ information-theory\ntools to investigate this phenomenon, revealing a fundamental tradeoff between\nuncertainty and perception. We rigorously analyze the relationship between\nthese two factors, proving that the global minimal uncertainty in generative\nmodels grows in tandem with perception. In particular, we define the inherent\nuncertainty of the restoration problem and show that attaining perfect\nperceptual quality entails at least twice this uncertainty. Additionally, we\nestablish a relation between mean squared-error distortion, uncertainty and\nperception, through which we prove the aforementioned uncertainly-perception\ntradeoff induces the well-known perception-distortion tradeoff. This work\nuncovers fundamental limitations of generative models in achieving both high\nperceptual quality and reliable predictions for image restoration. We\ndemonstrate our theoretical findings through an analysis of single image\nsuper-resolution algorithms. Our work aims to raise awareness among\npractitioners about this inherent tradeoff, empowering them to make informed\ndecisions and potentially prioritize safety over perceptual performance.\n","authors":["Regev Cohen","Idan Kligvasser","Ehud Rivlin","Daniel Freedman"],"pdf_url":"https://arxiv.org/pdf/2405.16475v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.02372v2","updated":"2024-06-04T11:40:50Z","published":"2024-05-03T10:10:11Z","title":"Triadic-OCD: Asynchronous Online Change Detection with Provable\n  Robustness, Optimality, and Convergence","summary":"  The primary goal of online change detection (OCD) is to promptly identify\nchanges in the data stream. OCD problem find a wide variety of applications in\ndiverse areas, e.g., security detection in smart grids and intrusion detection\nin communication networks. Prior research usually assumes precise knowledge of\nthe system parameters. Nevertheless, this presumption often proves unattainable\nin practical scenarios due to factors such as estimation errors, system\nupdates, etc. This paper aims to take the first attempt to develop a\ntriadic-OCD framework with certifiable robustness, provable optimality, and\nguaranteed convergence. In addition, the proposed triadic-OCD algorithm can be\nrealized in a fully asynchronous distributed manner, easing the necessity of\ntransmitting the data to a single server. This asynchronous mechanism could\nalso mitigate the straggler issue that faced by traditional synchronous\nalgorithm. Moreover, the non-asymptotic convergence property of Triadic-OCD is\ntheoretically analyzed, and its iteration complexity to achieve an\n$\\epsilon$-optimal point is derived. Extensive experiments have been conducted\nto elucidate the effectiveness of the proposed method.\n","authors":["Yancheng Huang","Kai Yang","Zelin Zhu","Leian Chen"],"pdf_url":"https://arxiv.org/pdf/2405.02372v2.pdf","comment":"Accepted at ICML2024"},{"id":"http://arxiv.org/abs/2406.02225v1","updated":"2024-06-04T11:37:11Z","published":"2024-06-04T11:37:11Z","title":"Riemannian coordinate descent algorithms on matrix manifolds","summary":"  Many machine learning applications are naturally formulated as optimization\nproblems on Riemannian manifolds. The main idea behind Riemannian optimization\nis to maintain the feasibility of the variables while moving along a descent\ndirection on the manifold. This results in updating all the variables at every\niteration. In this work, we provide a general framework for developing\ncomputationally efficient coordinate descent (CD) algorithms on matrix\nmanifolds that allows updating only a few variables at every iteration while\nadhering to the manifold constraint. In particular, we propose CD algorithms\nfor various manifolds such as Stiefel, Grassmann, (generalized) hyperbolic,\nsymplectic, and symmetric positive (semi)definite. While the cost per iteration\nof the proposed CD algorithms is low, we further develop a more efficient\nvariant via a first-order approximation of the objective function. We analyze\ntheir convergence and complexity, and empirically illustrate their efficacy in\nseveral applications.\n","authors":["Andi Han","Pratik Jawanpuria","Bamdev Mishra"],"pdf_url":"https://arxiv.org/pdf/2406.02225v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.02223v1","updated":"2024-06-04T11:33:40Z","published":"2024-06-04T11:33:40Z","title":"SMCL: Saliency Masked Contrastive Learning for Long-tailed Recognition","summary":"  Real-world data often follow a long-tailed distribution with a high imbalance\nin the number of samples between classes. The problem with training from\nimbalanced data is that some background features, common to all classes, can be\nunobserved in classes with scarce samples. As a result, this background\ncorrelates to biased predictions into ``major\" classes. In this paper, we\npropose saliency masked contrastive learning, a new method that uses saliency\nmasking and contrastive learning to mitigate the problem and improve the\ngeneralizability of a model. Our key idea is to mask the important part of an\nimage using saliency detection and use contrastive learning to move the masked\nimage towards minor classes in the feature space, so that background features\npresent in the masked image are no longer correlated with the original class.\nExperiment results show that our method achieves state-of-the-art level\nperformance on benchmark long-tailed datasets.\n","authors":["Sanglee Park","Seung-won Hwang","Jungmin So"],"pdf_url":"https://arxiv.org/pdf/2406.02223v1.pdf","comment":"accepted at ICASSP 2023"},{"id":"http://arxiv.org/abs/2402.10693v3","updated":"2024-06-04T11:33:27Z","published":"2024-02-16T13:53:26Z","title":"Exploring Precision and Recall to assess the quality and diversity of\n  LLMs","summary":"  We introduce a novel evaluation framework for Large Language Models (LLMs)\nsuch as \\textsc{Llama-2} and \\textsc{Mistral}, focusing on importing Precision\nand Recall metrics from image generation to text generation. This approach\nallows for a nuanced assessment of the quality and diversity of generated text\nwithout the need for aligned corpora. By conducting a comprehensive evaluation\nof state-of-the-art language models, the study reveals new insights into their\nperformance on open-ended generation tasks, which are not adequately captured\nby traditional benchmarks. The findings highlight a trade-off between the\nquality and diversity of generated samples, particularly when models are\nfine-tuned on instruction dataset or with human feedback. This work extends the\ntoolkit for distribution-based NLP evaluation, offering insights into the\npractical capabilities and challenges that current LLMs face in generating\ndiverse and high-quality text. We release our code and data.\n","authors":["Florian Le Bronnec","Alexandre Verine","Benjamin Negrevergne","Yann Chevaleyre","Alexandre Allauzen"],"pdf_url":"https://arxiv.org/pdf/2402.10693v3.pdf","comment":"21 pages, 15 figures, ACL 2024 Main"},{"id":"http://arxiv.org/abs/2402.13006v2","updated":"2024-06-04T11:18:03Z","published":"2024-02-20T13:41:21Z","title":"Investigating the Impact of Model Instability on Explanations and\n  Uncertainty","summary":"  Explainable AI methods facilitate the understanding of model behaviour, yet,\nsmall, imperceptible perturbations to inputs can vastly distort explanations.\nAs these explanations are typically evaluated holistically, before model\ndeployment, it is difficult to assess when a particular explanation is\ntrustworthy. Some studies have tried to create confidence estimators for\nexplanations, but none have investigated an existing link between uncertainty\nand explanation quality. We artificially simulate epistemic uncertainty in text\ninput by introducing noise at inference time. In this large-scale empirical\nstudy, we insert different levels of noise perturbations and measure the effect\non the output of pre-trained language models and different uncertainty metrics.\nRealistic perturbations have minimal effect on performance and explanations,\nyet masking has a drastic effect. We find that high uncertainty doesn't\nnecessarily imply low explanation plausibility; the correlation between the two\nmetrics can be moderately positive when noise is exposed during the training\nprocess. This suggests that noise-augmented models may be better at identifying\nsalient tokens when uncertain. Furthermore, when predictive and epistemic\nuncertainty measures are over-confident, the robustness of a saliency map to\nperturbation can indicate model stability issues. Integrated Gradients shows\nthe overall greatest robustness to perturbation, while still showing\nmodel-specific patterns in performance; however, this phenomenon is limited to\nsmaller Transformer-based language models.\n","authors":["Sara Vera Marjanović","Isabelle Augenstein","Christina Lioma"],"pdf_url":"https://arxiv.org/pdf/2402.13006v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.02214v1","updated":"2024-06-04T11:14:21Z","published":"2024-06-04T11:14:21Z","title":"SLTrain: a sparse plus low-rank approach for parameter and memory\n  efficient pretraining","summary":"  Large language models (LLMs) have shown impressive capabilities across\nvarious tasks. However, training LLMs from scratch requires significant\ncomputational power and extensive memory capacity. Recent studies have explored\nlow-rank structures on weights for efficient fine-tuning in terms of parameters\nand memory, either through low-rank adaptation or factorization. While\neffective for fine-tuning, low-rank structures are generally less suitable for\npretraining because they restrict parameters to a low-dimensional subspace. In\nthis work, we propose to parameterize the weights as a sum of low-rank and\nsparse matrices for pretraining, which we call SLTrain. The low-rank component\nis learned via matrix factorization, while for the sparse component, we employ\na simple strategy of uniformly selecting the sparsity support at random and\nlearning only the non-zero entries with the fixed support. While being simple,\nthe random fixed-support sparse learning strategy significantly enhances\npretraining when combined with low-rank learning. Our results show that SLTrain\nadds minimal extra parameters and memory costs compared to pretraining with\nlow-rank parameterization, yet achieves substantially better performance, which\nis comparable to full-rank training. Remarkably, when combined with\nquantization and per-layer updates, SLTrain can reduce memory requirements by\nup to 73% when pretraining the LLaMA 7B model.\n","authors":["Andi Han","Jiaxiang Li","Wei Huang","Mingyi Hong","Akiko Takeda","Pratik Jawanpuria","Bamdev Mishra"],"pdf_url":"https://arxiv.org/pdf/2406.02214v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.02213v1","updated":"2024-06-04T11:11:53Z","published":"2024-06-04T11:11:53Z","title":"Rectifying Reinforcement Learning for Reward Matching","summary":"  The Generative Flow Network (GFlowNet) is a probabilistic framework in which\nan agent learns a stochastic policy and flow functions to sample objects with\nprobability proportional to an unnormalized reward function. GFlowNets share a\nstrong resemblance to reinforcement learning (RL), that typically aims to\nmaximize reward, due to their sequential decision-making processes. Recent\nworks have studied connections between GFlowNets and maximum entropy (MaxEnt)\nRL, which modifies the standard objective of RL agents by learning an\nentropy-regularized objective. However, a critical theoretical gap persists:\ndespite the apparent similarities in their sequential decision-making nature, a\ndirect link between GFlowNets and standard RL has yet to be discovered, while\nbridging this gap could further unlock the potential of both fields. In this\npaper, we establish a new connection between GFlowNets and policy evaluation\nfor a uniform policy. Surprisingly, we find that the resulting value function\nfor the uniform policy has a close relationship to the flows in GFlowNets.\nLeveraging these insights, we further propose a novel rectified policy\nevaluation (RPE) algorithm, which achieves the same reward-matching effect as\nGFlowNets, offering a new perspective. We compare RPE, MaxEnt RL, and GFlowNets\nin a number of benchmarks, and show that RPE achieves competitive results\ncompared to previous approaches. This work sheds light on the previously\nunexplored connection between (non-MaxEnt) RL and GFlowNets, potentially\nopening new avenues for future research in both fields.\n","authors":["Haoran He","Emmanuel Bengio","Qingpeng Cai","Ling Pan"],"pdf_url":"https://arxiv.org/pdf/2406.02213v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.01306v2","updated":"2024-06-04T11:08:42Z","published":"2024-03-02T20:36:10Z","title":"ICC: Quantifying Image Caption Concreteness for Multimodal Dataset\n  Curation","summary":"  Web-scale training on paired text-image data is becoming increasingly central\nto multimodal learning, but is challenged by the highly noisy nature of\ndatasets in the wild. Standard data filtering approaches succeed in removing\nmismatched text-image pairs, but permit semantically related but highly\nabstract or subjective text. These approaches lack the fine-grained ability to\nisolate the most concrete samples that provide the strongest signal for\nlearning in a noisy dataset. In this work, we propose a new metric, image\ncaption concreteness, that evaluates caption text without an image reference to\nmeasure its concreteness and relevancy for use in multimodal learning. Our\napproach leverages strong foundation models for measuring visual-semantic\ninformation loss in multimodal representations. We demonstrate that this\nstrongly correlates with human evaluation of concreteness in both single-word\nand sentence-level texts. Moreover, we show that curation using ICC complements\nexisting approaches: It succeeds in selecting the highest quality samples from\nmultimodal web-scale datasets to allow for efficient training in\nresource-constrained settings.\n","authors":["Moran Yanuka","Morris Alper","Hadar Averbuch-Elor","Raja Giryes"],"pdf_url":"https://arxiv.org/pdf/2403.01306v2.pdf","comment":"Accepted to ACL 2024 (Finding). For Project webpage, see\n  https://moranyanuka.github.io/icc/"},{"id":"http://arxiv.org/abs/2402.12423v2","updated":"2024-06-04T11:03:57Z","published":"2024-02-19T16:22:21Z","title":"On the Semantic Latent Space of Diffusion-Based Text-to-Speech Models","summary":"  The incorporation of Denoising Diffusion Models (DDMs) in the Text-to-Speech\n(TTS) domain is rising, providing great value in synthesizing high quality\nspeech. Although they exhibit impressive audio quality, the extent of their\nsemantic capabilities is unknown, and controlling their synthesized speech's\nvocal properties remains a challenge. Inspired by recent advances in image\nsynthesis, we explore the latent space of frozen TTS models, which is composed\nof the latent bottleneck activations of the DDM's denoiser. We identify that\nthis space contains rich semantic information, and outline several novel\nmethods for finding semantic directions within it, both supervised and\nunsupervised. We then demonstrate how these enable off-the-shelf audio editing,\nwithout any further training, architectural changes or data requirements. We\npresent evidence of the semantic and acoustic qualities of the edited audio,\nand provide supplemental samples:\nhttps://latent-analysis-grad-tts.github.io/speech-samples/.\n","authors":["Miri Varshavsky-Hassid","Roy Hirsch","Regev Cohen","Tomer Golany","Daniel Freedman","Ehud Rivlin"],"pdf_url":"https://arxiv.org/pdf/2402.12423v2.pdf","comment":"Accepted to ACL 2024"},{"id":"http://arxiv.org/abs/2406.02204v1","updated":"2024-06-04T10:59:54Z","published":"2024-06-04T10:59:54Z","title":"The Deep Latent Space Particle Filter for Real-Time Data Assimilation\n  with Uncertainty Quantification","summary":"  In Data Assimilation, observations are fused with simulations to obtain an\naccurate estimate of the state and parameters for a given physical system.\nCombining data with a model, however, while accurately estimating uncertainty,\nis computationally expensive and infeasible to run in real-time for complex\nsystems. Here, we present a novel particle filter methodology, the Deep Latent\nSpace Particle filter or D-LSPF, that uses neural network-based surrogate\nmodels to overcome this computational challenge. The D-LSPF enables filtering\nin the low-dimensional latent space obtained using Wasserstein AEs with\nmodified vision transformer layers for dimensionality reduction and\ntransformers for parameterized latent space time stepping. As we demonstrate on\nthree test cases, including leak localization in multi-phase pipe flow and\nseabed identification for fully nonlinear water waves, the D-LSPF runs orders\nof magnitude faster than a high-fidelity particle filter and 3-5 times faster\nthan alternative methods while being up to an order of magnitude more accurate.\nThe D-LSPF thus enables real-time data assimilation with uncertainty\nquantification for physical systems.\n","authors":["Nikolaj T. Mücke","Sander M. Bohté","Cornelis W. Oosterlee"],"pdf_url":"https://arxiv.org/pdf/2406.02204v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.05196v2","updated":"2024-06-04T10:47:02Z","published":"2024-03-08T10:19:00Z","title":"Denoising Autoregressive Representation Learning","summary":"  In this paper, we explore a new generative approach for learning visual\nrepresentations. Our method, DARL, employs a decoder-only Transformer to\npredict image patches autoregressively. We find that training with Mean Squared\nError (MSE) alone leads to strong representations. To enhance the image\ngeneration ability, we replace the MSE loss with the diffusion objective by\nusing a denoising patch decoder. We show that the learned representation can be\nimproved by using tailored noise schedules and longer training in larger\nmodels. Notably, the optimal schedule differs significantly from the typical\nones used in standard image diffusion models. Overall, despite its simple\narchitecture, DARL delivers performance remarkably close to state-of-the-art\nmasked prediction models under the fine-tuning protocol. This marks an\nimportant step towards a unified model capable of both visual perception and\ngeneration, effectively combining the strengths of autoregressive and denoising\ndiffusion models.\n","authors":["Yazhe Li","Jorg Bornschein","Ting Chen"],"pdf_url":"https://arxiv.org/pdf/2403.05196v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.18938v3","updated":"2024-06-04T10:42:46Z","published":"2024-05-29T09:46:44Z","title":"HLOB -- Information Persistence and Structure in Limit Order Books","summary":"  We introduce a novel large-scale deep learning model for Limit Order Book\nmid-price changes forecasting, and we name it `HLOB'. This architecture (i)\nexploits the information encoded by an Information Filtering Network, namely\nthe Triangulated Maximally Filtered Graph, to unveil deeper and non-trivial\ndependency structures among volume levels; and (ii) guarantees deterministic\ndesign choices to handle the complexity of the underlying system by drawing\ninspiration from the groundbreaking class of Homological Convolutional Neural\nNetworks. We test our model against 9 state-of-the-art deep learning\nalternatives on 3 real-world Limit Order Book datasets, each including 15\nstocks traded on the NASDAQ exchange, and we systematically characterize the\nscenarios where HLOB outperforms state-of-the-art architectures. Our approach\nsheds new light on the spatial distribution of information in Limit Order Books\nand on its degradation over increasing prediction horizons, narrowing the gap\nbetween microstructural modeling and deep learning-based forecasting in\nhigh-frequency financial markets.\n","authors":["Antonio Briola","Silvia Bartolucci","Tomaso Aste"],"pdf_url":"https://arxiv.org/pdf/2405.18938v3.pdf","comment":"34 pages, 7 figures, 7 tables, 3 equations"},{"id":"http://arxiv.org/abs/2406.02191v1","updated":"2024-06-04T10:35:16Z","published":"2024-06-04T10:35:16Z","title":"On the Recoverability of Causal Relations from Temporally Aggregated\n  I.I.D. Data","summary":"  We consider the effect of temporal aggregation on instantaneous\n(non-temporal) causal discovery in general setting. This is motivated by the\nobservation that the true causal time lag is often considerably shorter than\nthe observational interval. This discrepancy leads to high aggregation, causing\ntime-delay causality to vanish and instantaneous dependence to manifest.\nAlthough we expect such instantaneous dependence has consistency with the true\ncausal relation in certain sense to make the discovery results meaningful, it\nremains unclear what type of consistency we need and when will such consistency\nbe satisfied. We proposed functional consistency and conditional independence\nconsistency in formal way correspond functional causal model-based methods and\nconditional independence-based methods respectively and provide the conditions\nunder which these consistencies will hold. We show theoretically and\nexperimentally that causal discovery results may be seriously distorted by\naggregation especially in complete nonlinear case and we also find causal\nrelationship still recoverable from aggregated data if we have partial\nlinearity or appropriate prior. Our findings suggest community should take a\ncautious and meticulous approach when interpreting causal discovery results\nfrom such data and show why and when aggregation will distort the performance\nof causal discovery methods.\n","authors":["Shunxing Fan","Mingming Gong","Kun Zhang"],"pdf_url":"https://arxiv.org/pdf/2406.02191v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.02189v1","updated":"2024-06-04T10:34:40Z","published":"2024-06-04T10:34:40Z","title":"Fast and Scalable Multi-Kernel Encoder Classifier","summary":"  This paper introduces a new kernel-based classifier by viewing kernel\nmatrices as generalized graphs and leveraging recent progress in graph\nembedding techniques. The proposed method facilitates fast and scalable kernel\nmatrix embedding, and seamlessly integrates multiple kernels to enhance the\nlearning process. Our theoretical analysis offers a population-level\ncharacterization of this approach using random variables. Empirically, our\nmethod demonstrates superior running time compared to standard approaches such\nas support vector machines and two-layer neural network, while achieving\ncomparable classification accuracy across various simulated and real datasets.\n","authors":["Cencheng Shen"],"pdf_url":"https://arxiv.org/pdf/2406.02189v1.pdf","comment":"12 pages main + 3 pages appendix"},{"id":"http://arxiv.org/abs/2406.02187v1","updated":"2024-06-04T10:31:03Z","published":"2024-06-04T10:31:03Z","title":"DNCs Require More Planning Steps","summary":"  Many recent works use machine learning models to solve various complex\nalgorithmic problems. However, these models attempt to reach a solution without\nconsidering the problem's required computational complexity, which can be\ndetrimental to their ability to solve it correctly. In this work we investigate\nthe effect of computational time and memory on generalization of implicit\nalgorithmic solvers. To do so, we focus on the Differentiable Neural Computer\n(DNC), a general problem solver that also lets us reason directly about its\nusage of time and memory. In this work, we argue that the number of planning\nsteps the model is allowed to take, which we call \"planning budget\", is a\nconstraint that can cause the model to generalize poorly and hurt its ability\nto fully utilize its external memory. We evaluate our method on Graph Shortest\nPath, Convex Hull, Graph MinCut and Associative Recall, and show how the\nplanning budget can drastically change the behavior of the learned algorithm,\nin terms of learned time complexity, training time, stability and\ngeneralization to inputs larger than those seen during training.\n","authors":["Yara Shamshoum","Nitzan Hodos","Yuval Sieradzki","Assaf Schuster"],"pdf_url":"https://arxiv.org/pdf/2406.02187v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.08516v3","updated":"2024-06-04T10:25:13Z","published":"2023-11-14T20:12:38Z","title":"LLMs cannot find reasoning errors, but can correct them given the error\n  location","summary":"  While self-correction has shown promise in improving LLM outputs in terms of\nstyle and quality (e.g. Chen et al., 2023b; Madaan et al., 2023), recent\nattempts to self-correct logical or reasoning errors often cause correct\nanswers to become incorrect, resulting in worse performances overall (Huang et\nal., 2023). In this paper, we show that poor self-correction performance stems\nfrom LLMs' inability to find logical mistakes, rather than their ability to\ncorrect a known mistake. Firstly, we benchmark several state-of-the-art LLMs on\ntheir mistake-finding ability and demonstrate that they generally struggle with\nthe task, even in highly objective, unambiguous cases. Secondly, we test the\ncorrection abilities of LLMs -- separately from mistake finding -- using a\nbacktracking setup that feeds ground truth mistake location information to the\nmodel. We show that this boosts downstream task performance across our 5\nreasoning tasks, indicating that LLMs' correction abilities are robust.\nFinally, we show that it is possible to obtain mistake location information\nwithout ground truth labels or in-domain training data. We train a small\nclassifier with out-of-domain data, which exhibits stronger mistake-finding\nperformance than prompting a large model. We release our dataset of\nLLM-generated logical mistakes, BIG-Bench Mistake, to enable further research\ninto locating LLM reasoning mistakes.\n","authors":["Gladys Tyen","Hassan Mansoor","Victor Cărbune","Peter Chen","Tony Mak"],"pdf_url":"https://arxiv.org/pdf/2311.08516v3.pdf","comment":"ACL 2024 Findings"},{"id":"http://arxiv.org/abs/2204.09389v2","updated":"2024-06-04T10:24:11Z","published":"2022-04-20T11:01:51Z","title":"Epistemic Uncertainty-Weighted Loss for Visual Bias Mitigation","summary":"  Deep neural networks are highly susceptible to learning biases in visual\ndata. While various methods have been proposed to mitigate such bias, the\nmajority require explicit knowledge of the biases present in the training data\nin order to mitigate. We argue the relevance of exploring methods which are\ncompletely ignorant of the presence of any bias, but are capable of identifying\nand mitigating them. Furthermore, we propose using Bayesian neural networks\nwith a predictive uncertainty-weighted loss function to dynamically identify\npotential bias in individual training samples and to weight them during\ntraining. We find a positive correlation between samples subject to bias and\nhigher epistemic uncertainties. Finally, we show the method has potential to\nmitigate visual bias on a bias benchmark dataset and on a real-world face\ndetection problem, and we consider the merits and weaknesses of our approach.\n","authors":["Rebecca S Stone","Nishant Ravikumar","Andrew J Bulpitt","David C Hogg"],"pdf_url":"https://arxiv.org/pdf/2204.09389v2.pdf","comment":"Published in 2022 IEEE CVPR Workshop on Fair, Data Efficient and\n  Trusted Computer Vision"},{"id":"http://arxiv.org/abs/2406.02180v1","updated":"2024-06-04T10:22:12Z","published":"2024-06-04T10:22:12Z","title":"On The Statistical Representation Properties Of The Perturb-Softmax And\n  The Perturb-Argmax Probability Distributions","summary":"  The Gumbel-Softmax probability distribution allows learning discrete tokens\nin generative learning, while the Gumbel-Argmax probability distribution is\nuseful in learning discrete structures in discriminative learning. Despite the\nefforts invested in optimizing these probability models, their statistical\nproperties are under-explored. In this work, we investigate their\nrepresentation properties and determine for which families of parameters these\nprobability distributions are complete, i.e., can represent any probability\ndistribution, and minimal, i.e., can represent a probability distribution\nuniquely. We rely on convexity and differentiability to determine these\nstatistical conditions and extend this framework to general probability models,\nsuch as Gaussian-Softmax and Gaussian-Argmax. We experimentally validate the\nqualities of these extensions, which enjoy a faster convergence rate. We\nconclude the analysis by identifying two sets of parameters that satisfy these\nassumptions and thus admit a complete and minimal representation. Our\ncontribution is theoretical with supporting practical evaluation.\n","authors":["Hedda Cohen Indelman","Tamir Hazan"],"pdf_url":"https://arxiv.org/pdf/2406.02180v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.20085v2","updated":"2024-06-04T10:15:42Z","published":"2024-05-30T14:16:19Z","title":"Soft Partitioning of Latent Space for Semantic Channel Equalization","summary":"  Semantic channel equalization has emerged as a solution to address language\nmismatch in multi-user semantic communications. This approach aims to align the\nlatent spaces of an encoder and a decoder which were not jointly trained and it\nrelies on a partition of the semantic (latent) space into atoms based on the\nthe semantic meaning. In this work we explore the role of the semantic space\npartition in scenarios where the task structure involves a one-to-many mapping\nbetween the semantic space and the action space. In such scenarios,\npartitioning based on hard inference results results in loss of information\nwhich degrades the equalization performance. We propose a soft criterion to\nderive the atoms of the partition which leverages the soft decoder's output and\noffers a more comprehensive understanding of the semantic space's structure.\nThrough empirical validation, we demonstrate that soft partitioning yields a\nmore descriptive and regular partition of the space, consequently enhancing the\nperformance of the equalization algorithm.\n","authors":["Tomás Hüttebräucker","Mohamed Sana","Emilio Calvanese Strinati"],"pdf_url":"https://arxiv.org/pdf/2405.20085v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.02177v1","updated":"2024-06-04T10:14:39Z","published":"2024-06-04T10:14:39Z","title":"One-Shot Federated Learning with Bayesian Pseudocoresets","summary":"  Optimization-based techniques for federated learning (FL) often come with\nprohibitive communication cost, as high dimensional model parameters need to be\ncommunicated repeatedly between server and clients. In this paper, we follow a\nBayesian approach allowing to perform FL with one-shot communication, by\nsolving the global inference problem as a product of local client posteriors.\nFor models with multi-modal likelihoods, such as neural networks, a naive\napplication of this scheme is hampered, since clients will capture different\nposterior modes, causing a destructive collapse of the posterior on the server\nside. Consequently, we explore approximate inference in the function-space\nrepresentation of client posteriors, hence suffering less or not at all from\nmulti-modality. We show that distributed function-space inference is tightly\nrelated to learning Bayesian pseudocoresets and develop a tractable Bayesian FL\nalgorithm on this insight. We show that this approach achieves prediction\nperformance competitive to state-of-the-art while showing a striking reduction\nin communication cost of up to two orders of magnitude. Moreover, due to its\nBayesian nature, our method also delivers well-calibrated uncertainty\nestimates.\n","authors":["Tim d'Hondt","Mykola Pechenizkiy","Robert Peharz"],"pdf_url":"https://arxiv.org/pdf/2406.02177v1.pdf","comment":"10 pages"},{"id":"http://arxiv.org/abs/2405.13511v2","updated":"2024-06-04T10:13:13Z","published":"2024-05-22T10:12:32Z","title":"Latent Space Alignment for Semantic Channel Equalization","summary":"  We relax the constraint of a shared language between agents in a semantic and\ngoal-oriented communication system to explore the effect of language mismatch\nin distributed task solving. We propose a mathematical framework, which\nprovides a modelling and a measure of the semantic distortion introduced in the\ncommunication when agents use distinct languages. We then propose a new\napproach to semantic channel equalization with proven effectiveness through\nnumerical evaluations.\n","authors":["Tomás Hüttebräucker","Mohamed Sana","Emilio Calvanese Strinati"],"pdf_url":"https://arxiv.org/pdf/2405.13511v2.pdf","comment":"Accepted for publication at 2024 IEEE ICMLCN"},{"id":"http://arxiv.org/abs/2406.02176v1","updated":"2024-06-04T10:12:09Z","published":"2024-06-04T10:12:09Z","title":"AROMA: Preserving Spatial Structure for Latent PDE Modeling with Local\n  Neural Fields","summary":"  We present AROMA (Attentive Reduced Order Model with Attention), a framework\ndesigned to enhance the modeling of partial differential equations (PDEs) using\nlocal neural fields. Our flexible encoder-decoder architecture can obtain\nsmooth latent representations of spatial physical fields from a variety of data\ntypes, including irregular-grid inputs and point clouds. This versatility\neliminates the need for patching and allows efficient processing of diverse\ngeometries. The sequential nature of our latent representation can be\ninterpreted spatially and permits the use of a conditional transformer for\nmodeling the temporal dynamics of PDEs. By employing a diffusion-based\nformulation, we achieve greater stability and enable longer rollouts compared\nto conventional MSE training. AROMA's superior performance in simulating 1D and\n2D equations underscores the efficacy of our approach in capturing complex\ndynamical behaviors.\n","authors":["Louis Serrano","Thomas X Wang","Etienne Le Naour","Jean-Noël Vittaut","Patrick Gallinari"],"pdf_url":"https://arxiv.org/pdf/2406.02176v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.02175v1","updated":"2024-06-04T10:11:46Z","published":"2024-06-04T10:11:46Z","title":"Branches: A Fast Dynamic Programming and Branch & Bound Algorithm for\n  Optimal Decision Trees","summary":"  Decision Tree Learning is a fundamental problem for Interpretable Machine\nLearning, yet it poses a formidable optimization challenge. Despite numerous\nefforts dating back to the early 1990's, practical algorithms have only\nrecently emerged, primarily leveraging Dynamic Programming (DP) and Branch &\nBound (B&B) techniques. These breakthroughs led to the development of two\ndistinct approaches. Algorithms like DL8.5 and MurTree operate on the space of\nnodes (or branches), they are very fast, but do not penalise complex Decision\nTrees, i.e. they do not solve for sparsity. On the other hand, algorithms like\nOSDT and GOSDT operate on the space of Decision Trees, they solve for sparsity\nbut at the detriment of speed. In this work, we introduce Branches, a novel\nalgorithm that integrates the strengths of both paradigms. Leveraging DP and\nB&B, Branches achieves exceptional speed while also solving for sparsity.\nCentral to its efficiency is a novel analytical bound enabling substantial\npruning of the search space. Theoretical analysis demonstrates that Branches\nhas lower complexity compared to state-of-the-art methods, a claim validated\nthrough extensive empirical evaluation. Our results illustrate that Branches\nnot only greatly outperforms existing approaches in terms of speed and number\nof iterations, it also consistently yields optimal Decision Trees.\n","authors":["Ayman Chaouki","Jesse Read","Albert Bifet"],"pdf_url":"https://arxiv.org/pdf/2406.02175v1.pdf","comment":"This preprint is currently under review"},{"id":"http://arxiv.org/abs/2402.16858v2","updated":"2024-06-04T10:10:22Z","published":"2024-01-19T16:43:47Z","title":"Pragmatic Goal-Oriented Communications under Semantic-Effectiveness\n  Channel Errors","summary":"  In forthcoming AI-assisted 6G networks, integrating semantic, pragmatic, and\ngoal-oriented communication strategies becomes imperative. This integration\nwill enable sensing, transmission, and processing of exclusively pertinent task\ndata, ensuring conveyed information possesses understandable, pragmatic\nsemantic significance, aligning with destination needs and goals. Without\ndoubt, no communication is error free. Within this context, besides errors\nstemming from typical wireless communication dynamics, potential distortions\nbetween transmitter-intended and receiver-interpreted meanings can emerge due\nto limitations in semantic processing capabilities, as well as language and\nknowledge representation disparities between transmitters and receivers. The\nmain contribution of this paper is two-fold. First, it proposes and details a\nnovel mathematical modeling of errors stemming from language mismatches at both\nsemantic and effectiveness levels. Second, it provides a novel algorithmic\nsolution to counteract these types of errors which leverages optimal transport\ntheory. Our numerical results show the potential of the proposed mechanism to\ncompensate for language mismatches, thereby enhancing the attainability of\nreliable communication under noisy communication environments.\n","authors":["Tomás Hüttebräucker","Mohamed Sana","Emilio Calvanese Strinati"],"pdf_url":"https://arxiv.org/pdf/2402.16858v2.pdf","comment":"Accepted for publication in 2024 IEEE Consumer Communications and\n  Networking Conference"},{"id":"http://arxiv.org/abs/2306.06777v5","updated":"2024-06-04T10:09:10Z","published":"2023-06-11T21:14:29Z","title":"Improving the Validity of Decision Trees as Explanations","summary":"  In classification and forecasting with tabular data, one often utilizes\ntree-based models. Those can be competitive with deep neural networks on\ntabular data and, under some conditions, explainable. The explainability\ndepends on the depth of the tree and the accuracy in each leaf of the tree. We\npoint out that decision trees containing leaves with unbalanced accuracy can\nprovide misleading explanations. Low-accuracy leaves give less valid\nexplanations, which could be interpreted as unfairness among subgroups\nutilizing these explanations. Here, we train a shallow tree with the objective\nof minimizing the maximum misclassification error across all leaf nodes. The\nshallow tree provides a global explanation, while the overall statistical\nperformance of the shallow tree can become comparable to state-of-the-art\nmethods (e.g., well-tuned XGBoost) by extending the leaves with further models.\n","authors":["Jiri Nemecek","Tomas Pevny","Jakub Marecek"],"pdf_url":"https://arxiv.org/pdf/2306.06777v5.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.10248v4","updated":"2024-06-04T10:08:39Z","published":"2023-08-20T12:21:05Z","title":"Activation Addition: Steering Language Models Without Optimization","summary":"  Reliably controlling the behavior of large language models is a pressing open\nproblem. Existing methods include supervised finetuning, reinforcement learning\nfrom human feedback, prompt engineering and guided decoding. We instead\ninvestigate activation engineering: modifying activations at inference-time to\npredictably alter model behavior. We bias the forward pass with a 'steering\nvector' implicitly specified through natural language. Past work learned these\nsteering vectors; our Activation Addition (ActAdd) method instead computes them\nby taking activation differences resulting from pairs of prompts. We\ndemonstrate ActAdd on a range of LLMs (LLaMA-3, OPT, GPT-2, and GPT-J),\nobtaining SOTA on detoxification and negative-to-positive sentiment control.\nOur approach yields inference-time control over high-level properties of output\nlike topic and sentiment while preserving performance on off-target tasks.\nActAdd takes far less compute and implementation effort than finetuning or\nRLHF, allows users control through natural language, and its computational\noverhead (as a fraction of inference time) appears stable or improving over\nincreasing model size.\n","authors":["Alexander Matt Turner","Lisa Thiergart","Gavin Leech","David Udell","Juan J. Vazquez","Ulisse Mini","Monte MacDiarmid"],"pdf_url":"https://arxiv.org/pdf/2308.10248v4.pdf","comment":null}],"Multimedia":[{"id":"http://arxiv.org/abs/2406.02438v1","updated":"2024-06-04T16:00:18Z","published":"2024-06-04T16:00:18Z","title":"CtrSVDD: A Benchmark Dataset and Baseline Analysis for Controlled\n  Singing Voice Deepfake Detection","summary":"  Recent singing voice synthesis and conversion advancements necessitate robust\nsinging voice deepfake detection (SVDD) models. Current SVDD datasets face\nchallenges due to limited controllability, diversity in deepfake methods, and\nlicensing restrictions. Addressing these gaps, we introduce CtrSVDD, a\nlarge-scale, diverse collection of bonafide and deepfake singing vocals. These\nvocals are synthesized using state-of-the-art methods from publicly accessible\nsinging voice datasets. CtrSVDD includes 47.64 hours of bonafide and 260.34\nhours of deepfake singing vocals, spanning 14 deepfake methods and involving\n164 singer identities. We also present a baseline system with flexible\nfront-end features, evaluated against a structured train/dev/eval split. The\nexperiments show the importance of feature selection and highlight a need for\ngeneralization towards deepfake methods that deviate further from training\ndistribution. The CtrSVDD dataset and baselines are publicly accessible.\n","authors":["Yongyi Zang","Jiatong Shi","You Zhang","Ryuichi Yamamoto","Jionghao Han","Yuxun Tang","Shengyuan Xu","Wenxiao Zhao","Jing Guo","Tomoki Toda","Zhiyao Duan"],"pdf_url":"https://arxiv.org/pdf/2406.02438v1.pdf","comment":"Accepted by Interspeech 2024"},{"id":"http://arxiv.org/abs/2406.02345v1","updated":"2024-06-04T14:21:41Z","published":"2024-06-04T14:21:41Z","title":"Progressive Confident Masking Attention Network for Audio-Visual\n  Segmentation","summary":"  Audio and visual signals typically occur simultaneously, and humans possess\nan innate ability to correlate and synchronize information from these two\nmodalities. Recently, a challenging problem known as Audio-Visual Segmentation\n(AVS) has emerged, intending to produce segmentation maps for sounding objects\nwithin a scene. However, the methods proposed so far have not sufficiently\nintegrated audio and visual information, and the computational costs have been\nextremely high. Additionally, the outputs of different stages have not been\nfully utilized. To facilitate this research, we introduce a novel Progressive\nConfident Masking Attention Network (PMCANet). It leverages attention\nmechanisms to uncover the intrinsic correlations between audio signals and\nvisual frames. Furthermore, we design an efficient and effective\ncross-attention module to enhance semantic perception by selecting query\ntokens. This selection is determined through confidence-driven units based on\nthe network's multi-stage predictive outputs. Experiments demonstrate that our\nnetwork outperforms other AVS methods while requiring less computational\nresources.\n","authors":["Yuxuan Wang","Feng Dong","Jinchao Zhu"],"pdf_url":"https://arxiv.org/pdf/2406.02345v1.pdf","comment":"10 pages, 9 figures, submitted to IEEE TRANSACTIONS ON CIRCUITS AND\n  SYSTEMS FOR VIDEO TECHNOLOGY"},{"id":"http://arxiv.org/abs/2406.02302v1","updated":"2024-06-04T13:31:57Z","published":"2024-06-04T13:31:57Z","title":"Towards AI-Assisted Sustainable Adaptive Video Streaming Systems:\n  Tutorial and Survey","summary":"  Improvements in networking technologies and the steadily increasing numbers\nof users, as well as the shift from traditional broadcasting to streaming\ncontent over the Internet, have made video applications (e.g., live and\nVideo-on-Demand (VoD)) predominant sources of traffic. Recent advances in\nArtificial Intelligence (AI) and its widespread application in various academic\nand industrial fields have focused on designing and implementing a variety of\nvideo compression and content delivery techniques to improve user Quality of\nExperience (QoE). However, providing high QoE services results in more energy\nconsumption and carbon footprint across the service delivery path, extending\nfrom the end user's device through the network and service infrastructure\n(e.g., cloud providers). Despite the importance of energy efficiency in video\nstreaming, there is a lack of comprehensive surveys covering state-of-the-art\nAI techniques and their applications throughout the video streaming lifecycle.\nExisting surveys typically focus on specific parts, such as video encoding,\ndelivery networks, playback, or quality assessment, without providing a\nholistic view of the entire lifecycle and its impact on energy consumption and\nQoE. Motivated by this research gap, this survey provides a comprehensive\noverview of the video streaming lifecycle, content delivery, energy and Video\nQuality Assessment (VQA) metrics and models, and AI techniques employed in\nvideo streaming. In addition, it conducts an in-depth state-of-the-art analysis\nfocused on AI-driven approaches to enhance the energy efficiency of end-to-end\naspects of video streaming systems (i.e., encoding, delivery network, playback,\nand VQA approaches). Finally, it discusses prospective research directions for\ndeveloping AI-assisted energy-aware video streaming systems.\n","authors":["Reza Farahani","Zoha Azimi","Christian Timmerer","Radu Prodan"],"pdf_url":"https://arxiv.org/pdf/2406.02302v1.pdf","comment":"33 pages, 7 figures, 6 Tables, Journal paper"},{"id":"http://arxiv.org/abs/2406.02255v1","updated":"2024-06-04T12:21:55Z","published":"2024-06-04T12:21:55Z","title":"MidiCaps -- A large-scale MIDI dataset with text captions","summary":"  Generative models guided by text prompts are increasingly becoming more\npopular. However, no text-to-MIDI models currently exist, mostly due to the\nlack of a captioned MIDI dataset. This work aims to enable research that\ncombines LLMs with symbolic music by presenting the first large-scale MIDI\ndataset with text captions that is openly available: MidiCaps. MIDI (Musical\nInstrument Digital Interface) files are a widely used format for encoding\nmusical information. Their structured format captures the nuances of musical\ncomposition and has practical applications by music producers, composers,\nmusicologists, as well as performers. Inspired by recent advancements in\ncaptioning techniques applied to various domains, we present a large-scale\ncurated dataset of over 168k MIDI files accompanied by textual descriptions.\nEach MIDI caption succinctly describes the musical content, encompassing tempo,\nchord progression, time signature, instruments present, genre and mood; thereby\nfacilitating multi-modal exploration and analysis. The dataset contains a mix\nof various genres, styles, and complexities, offering a rich source for\ntraining and evaluating models for tasks such as music information retrieval,\nmusic understanding and cross-modal translation. We provide detailed statistics\nabout the dataset and have assessed the quality of the captions in an extensive\nlistening study. We anticipate that this resource will stimulate further\nresearch in the intersection of music and natural language processing,\nfostering advancements in both fields.\n","authors":["Jan Melechovsky","Abhinaba Roy","Dorien Herremans"],"pdf_url":"https://arxiv.org/pdf/2406.02255v1.pdf","comment":"Under review"},{"id":"http://arxiv.org/abs/2305.10223v4","updated":"2024-06-04T10:52:16Z","published":"2023-05-17T13:56:48Z","title":"Advancing Unsupervised Low-light Image Enhancement: Noise Estimation,\n  Illumination Interpolation, and Self-Regulation","summary":"  Contemporary Low-Light Image Enhancement (LLIE) techniques have made notable\nadvancements in preserving image details and enhancing contrast, achieving\ncommendable results on specific datasets. Nevertheless, these approaches\nencounter persistent challenges in efficiently mitigating dynamic noise and\naccommodating diverse low-light scenarios. Insufficient constraints on complex\npixel-wise mapping learning lead to overfitting to specific types of noise and\nartifacts associated with low-light conditions, reducing effectiveness in\nvariable lighting scenarios. To this end, we first propose a method for\nestimating the noise level in low light images in a quick and accurate way.\nThis facilitates precise denoising, prevents over-smoothing, and adapts to\ndynamic noise patterns. Subsequently, we devise a Learnable Illumination\nInterpolator (LII), which employs learnlable interpolation operations between\nthe input and unit vector to satisfy general constraints between illumination\nand input. Finally, we introduce a self-regularization loss that incorporates\nintrinsic image properties and essential visual attributes to guide the output\ntowards meeting human visual expectations. Comprehensive experiments validate\nthe competitiveness of our proposed algorithm in both qualitative and\nquantitative assessments. Notably, our noise estimation method, with linear\ntime complexity and suitable for various denoisers, significantly improves both\ndenoising and enhancement performance. Benefiting from this, our approach\nachieves a 0.675dB PSNR improvement on the LOL dataset and 0.818dB on the MIT\ndataset on LLIE task, even compared to supervised methods. The source code is\navailable at \\href{https://doi.org/10.5281/zenodo.11463142}{this DOI\nrepository} and the specific code for noise estimation can be found at\n\\href{https://github.com/GoogolplexGoodenough/noise_estimate}{this separate\nGitHub link}.\n","authors":["Xiaofeng Liu","Jiaxin Gao","Xin Fan","Risheng Liu"],"pdf_url":"https://arxiv.org/pdf/2305.10223v4.pdf","comment":"Image processing, low-light image enhancement, noise estimation,\n  illumination learning"},{"id":"http://arxiv.org/abs/2406.02062v1","updated":"2024-06-04T07:44:18Z","published":"2024-06-04T07:44:18Z","title":"Towards Railways Remote Driving: Analysis of Video Streaming Latency and\n  Adaptive Rate Control","summary":"  Remote driving aims to improve transport systems by promoting efficiency,\nsustainability, and accessibility. In the railway sector, remote driving makes\nit possible to increase flexibility, as the driver no longer has to be in the\ncab. However, this brings several challenges, as it has to provide at least the\nsame level of safety obtained when the driver is in the cab. To achieve it,\nwireless networks and video streaming technologies gain importance as they\nshould provide real-time track visualization and obstacle detection\ncapabilities to the remote driver. Low latency camera capture, onboard media\nprocessing devices, and streaming protocols adapted for wireless links are the\nnecessary enablers to be developed and integrated into the railway\ninfrastructure. This paper compares video streaming protocols such as Real-Time\nStreaming Protocol (RTSP) and Web Real-Time Communication (WebRTC), as they are\nthe main alternatives based on Real-time Transport Protocol (RTP) protocol to\nenable low latency. As latency is the main performance metric, this paper also\nprovides a solution to calculate the End-to-End video streaming latency\nanalytically. Finally, the paper proposes a rate control algorithm to adapt the\nvideo stream depending on the network capacity. The objective is to keep the\nlatency as low as possible while avoiding any visual artifacts. The proposed\nsolutions are tested in different setups and scenarios to prove their\neffectiveness before the planned field testing.\n","authors":["Daniel Mejias","Zaloa Fernandez","Roberto Viola","Ander Aramburu","Igor Lopez","Andoni Diaz"],"pdf_url":"https://arxiv.org/pdf/2406.02062v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.02032v1","updated":"2024-06-04T07:17:44Z","published":"2024-06-04T07:17:44Z","title":"M2D-CLAP: Masked Modeling Duo Meets CLAP for Learning General-purpose\n  Audio-Language Representation","summary":"  Contrastive language-audio pre-training (CLAP) enables zero-shot (ZS)\ninference of audio and exhibits promising performance in several classification\ntasks. However, conventional audio representations are still crucial for many\ntasks where ZS is not applicable (e.g., regression problems). Here, we explore\na new representation, a general-purpose audio-language representation, that\nperforms well in both ZS and transfer learning. To do so, we propose a new\nmethod, M2D-CLAP, which combines self-supervised learning Masked Modeling Duo\n(M2D) and CLAP. M2D learns an effective representation to model audio signals,\nand CLAP aligns the representation with text embedding. As a result, M2D-CLAP\nlearns a versatile representation that allows for both ZS and transfer\nlearning. Experiments show that M2D-CLAP performs well on linear evaluation,\nfine-tuning, and ZS classification with a GTZAN state-of-the-art of 75.17%,\nthus achieving a general-purpose audio-language representation.\n","authors":["Daisuke Niizumi","Daiki Takeuchi","Yasunori Ohishi","Noboru Harada","Masahiro Yasuda","Shunsuke Tsubaki","Keisuke Imoto"],"pdf_url":"https://arxiv.org/pdf/2406.02032v1.pdf","comment":"5 pages, 1 figure, 5 tables. Accepted by Interspeech 2024"},{"id":"http://arxiv.org/abs/2406.01938v1","updated":"2024-06-04T03:45:08Z","published":"2024-06-04T03:45:08Z","title":"Nutrition Estimation for Dietary Management: A Transformer Approach with\n  Depth Sensing","summary":"  Nutrition estimation is crucial for effective dietary management and overall\nhealth and well-being. Existing methods often struggle with sub-optimal\naccuracy and can be time-consuming. In this paper, we propose NuNet, a\ntransformer-based network designed for nutrition estimation that utilizes both\nRGB and depth information from food images. We have designed and implemented a\nmulti-scale encoder and decoder, along with two types of feature fusion\nmodules, specialized for estimating five nutritional factors. These modules\neffectively balance the efficiency and effectiveness of feature extraction with\nflexible usage of our customized attention mechanisms and fusion strategies.\nOur experimental study shows that NuNet outperforms its variants and existing\nsolutions significantly for nutrition estimation. It achieves an error rate of\n15.65%, the lowest known to us, largely due to our multi-scale architecture and\nfusion modules. This research holds practical values for dietary management\nwith huge potential for transnational research and deployment and could inspire\nother applications involving multiple data types with varying degrees of\nimportance.\n","authors":["Zhengyi Kwan","Wei Zhang","Zhengkui Wang","Aik Beng Ng","Simon See"],"pdf_url":"https://arxiv.org/pdf/2406.01938v1.pdf","comment":"10 pages"},{"id":"http://arxiv.org/abs/2406.02761v1","updated":"2024-06-04T20:28:02Z","published":"2024-06-04T20:28:02Z","title":"Multi-layer Learnable Attention Mask for Multimodal Tasks","summary":"  While the Self-Attention mechanism in the Transformer model has proven to be\neffective in many domains, we observe that it is less effective in more diverse\nsettings (e.g. multimodality) due to the varying granularity of each token and\nthe high computational demands of lengthy sequences. To address the challenges,\nwe introduce the Learnable Attention Mask (LAM), strategically designed to\nglobally regulate attention maps and prioritize critical tokens within the\nsequence. Leveraging the Self-Attention module in a BERT-like transformer\nnetwork, our approach adeptly captures associations between tokens. The\nextension of the LAM to a multi-layer version accommodates the varied\ninformation aspects embedded at each layer of the Transformer network.\nComprehensive experimental validation on various datasets, such as MADv2,\nQVHighlights, ImageNet 1K, and MSRVTT, demonstrates the efficacy of the LAM,\nexemplifying its ability to enhance model performance while mitigating\nredundant computations. This pioneering approach presents a significant\nadvancement in enhancing the understanding of complex scenarios, such as in\nmovie understanding.\n","authors":["Wayner Barrios","SouYoung Jin"],"pdf_url":"https://arxiv.org/pdf/2406.02761v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.16807v2","updated":"2024-06-04T18:42:01Z","published":"2024-05-27T03:54:09Z","title":"Extreme Compression of Adaptive Neural Images","summary":"  Implicit Neural Representations (INRs) and Neural Fields are a novel paradigm\nfor signal representation, from images and audio to 3D scenes and videos. The\nfundamental idea is to represent a signal as a continuous and differentiable\nneural network. This idea offers unprecedented benefits such as continuous\nresolution and memory efficiency, enabling new compression techniques. However,\nrepresenting data as neural networks poses new challenges. For instance, given\na 2D image as a neural network, how can we further compress such a neural\nimage?. In this work, we present a novel analysis on compressing neural fields,\nwith the focus on images. We also introduce Adaptive Neural Images (ANI), an\nefficient neural representation that enables adaptation to different inference\nor transmission requirements. Our proposed method allows to reduce the\nbits-per-pixel (bpp) of the neural image by 4x, without losing sensitive\ndetails or harming fidelity. We achieve this thanks to our successful\nimplementation of 4-bit neural representations. Our work offers a new framework\nfor developing compressed neural fields.\n","authors":["Leo Hoshikawa","Marcos V. Conde","Takeshi Ohashi","Atsushi Irie"],"pdf_url":"https://arxiv.org/pdf/2405.16807v2.pdf","comment":"Technical Report. Work in progress"}]},"2024-06-05T00:00:00Z":{"Computation and Language":[{"id":"http://arxiv.org/abs/2406.03496v1","updated":"2024-06-05T17:59:40Z","published":"2024-06-05T17:59:40Z","title":"Wings: Learning Multimodal LLMs without Text-only Forgetting","summary":"  Multimodal large language models (MLLMs), initiated with a trained LLM, first\nalign images with text and then fine-tune on multimodal mixed inputs. However,\nthe MLLM catastrophically forgets the text-only instructions, which do not\ninclude images and can be addressed within the initial LLM. In this paper, we\npresent Wings, a novel MLLM that excels in both text-only dialogues and\nmultimodal comprehension. Analyzing MLLM attention in multimodal instructions\nreveals that text-only forgetting is related to the attention shifts from\npre-image to post-image text. From that, we construct extra modules that act as\nthe boosted learner to compensate for the attention shift. The complementary\nvisual and textual learners, like \"wings\" on either side, are connected in\nparallel within each layer's attention block. Initially, image and text inputs\nare aligned with visual learners operating alongside the main attention,\nbalancing focus on visual elements. Textual learners are later collaboratively\nintegrated with attention-based routing to blend the outputs of the visual and\ntextual learners. We design the Low-Rank Residual Attention (LoRRA) to\nguarantee high efficiency for learners. Our experimental results demonstrate\nthat Wings outperforms equally-scaled MLLMs in both text-only and visual\nquestion-answering tasks. On a newly constructed Interleaved Image-Text (IIT)\nbenchmark, Wings exhibits superior performance from text-only-rich to\nmultimodal-rich question-answering tasks.\n","authors":["Yi-Kai Zhang","Shiyin Lu","Yang Li","Yanqing Ma","Qing-Guo Chen","Zhao Xu","Weihua Luo","Kaifu Zhang","De-Chuan Zhan","Han-Jia Ye"],"pdf_url":"https://arxiv.org/pdf/2406.03496v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.03487v1","updated":"2024-06-05T17:49:47Z","published":"2024-06-05T17:49:47Z","title":"Analyzing LLM Behavior in Dialogue Summarization: Unveiling\n  Circumstantial Hallucination Trends","summary":"  Recent advancements in large language models (LLMs) have considerably\nadvanced the capabilities of summarization systems. However, they continue to\nface concerns about hallucinations. While prior work has evaluated LLMs\nextensively in news domains, most evaluation of dialogue summarization has\nfocused on BART-based models, leaving a gap in our understanding of their\nfaithfulness. Our work benchmarks the faithfulness of LLMs for dialogue\nsummarization, using human annotations and focusing on identifying and\ncategorizing span-level inconsistencies. Specifically, we focus on two\nprominent LLMs: GPT-4 and Alpaca-13B. Our evaluation reveals subtleties as to\nwhat constitutes a hallucination: LLMs often generate plausible inferences,\nsupported by circumstantial evidence in the conversation, that lack direct\nevidence, a pattern that is less prevalent in older models. We propose a\nrefined taxonomy of errors, coining the category of \"Circumstantial Inference\"\nto bucket these LLM behaviors and release the dataset. Using our taxonomy, we\ncompare the behavioral differences between LLMs and older fine-tuned models.\nAdditionally, we systematically assess the efficacy of automatic error\ndetection methods on LLM summaries and find that they struggle to detect these\nnuanced errors. To address this, we introduce two prompt-based approaches for\nfine-grained error detection that outperform existing metrics, particularly for\nidentifying \"Circumstantial Inference.\"\n","authors":["Sanjana Ramprasad","Elisa Ferracane","Zachary C. Lipton"],"pdf_url":"https://arxiv.org/pdf/2406.03487v1.pdf","comment":"Accepted at ACL 2024"},{"id":"http://arxiv.org/abs/2406.03486v1","updated":"2024-06-05T17:49:24Z","published":"2024-06-05T17:49:24Z","title":"BIPED: Pedagogically Informed Tutoring System for ESL Education","summary":"  Large Language Models (LLMs) have a great potential to serve as readily\navailable and cost-efficient Conversational Intelligent Tutoring Systems (CITS)\nfor teaching L2 learners of English. Existing CITS, however, are designed to\nteach only simple concepts or lack the pedagogical depth necessary to address\ndiverse learning strategies. To develop a more pedagogically informed CITS\ncapable of teaching complex concepts, we construct a BIlingual\nPEDagogically-informed Tutoring Dataset (BIPED) of one-on-one, human-to-human\nEnglish tutoring interactions. Through post-hoc analysis of the tutoring\ninteractions, we come up with a lexicon of dialogue acts (34 tutor acts and 9\nstudent acts), which we use to further annotate the collected dataset. Based on\na two-step framework of first predicting the appropriate tutor act then\ngenerating the corresponding response, we implemented two CITS models using\nGPT-4 and SOLAR-KO, respectively. We experimentally demonstrate that the\nimplemented models not only replicate the style of human teachers but also\nemploy diverse and contextually appropriate pedagogical strategies.\n","authors":["Soonwoo Kwon","Sojung Kim","Minju Park","Seunghyun Lee","Kyuseok Kim"],"pdf_url":"https://arxiv.org/pdf/2406.03486v1.pdf","comment":"ACL 2024"},{"id":"http://arxiv.org/abs/2403.03942v2","updated":"2024-06-05T17:44:03Z","published":"2024-03-06T18:50:14Z","title":"The Heuristic Core: Understanding Subnetwork Generalization in\n  Pretrained Language Models","summary":"  Prior work has found that pretrained language models (LMs) fine-tuned with\ndifferent random seeds can achieve similar in-domain performance but generalize\ndifferently on tests of syntactic generalization. In this work, we show that,\neven within a single model, we can find multiple subnetworks that perform\nsimilarly in-domain, but generalize vastly differently. To better understand\nthese phenomena, we investigate if they can be understood in terms of\n\"competing subnetworks\": the model initially represents a variety of distinct\nalgorithms, corresponding to different subnetworks, and generalization occurs\nwhen it ultimately converges to one. This explanation has been used to account\nfor generalization in simple algorithmic tasks (\"grokking\"). Instead of finding\ncompeting subnetworks, we find that all subnetworks -- whether they generalize\nor not -- share a set of attention heads, which we refer to as the heuristic\ncore. Further analysis suggests that these attention heads emerge early in\ntraining and compute shallow, non-generalizing features. The model learns to\ngeneralize by incorporating additional attention heads, which depend on the\noutputs of the \"heuristic\" heads to compute higher-level features. Overall, our\nresults offer a more detailed picture of the mechanisms for syntactic\ngeneralization in pretrained LMs.\n","authors":["Adithya Bhaskar","Dan Friedman","Danqi Chen"],"pdf_url":"https://arxiv.org/pdf/2403.03942v2.pdf","comment":"Accepted to ACL 2024"},{"id":"http://arxiv.org/abs/2406.03482v1","updated":"2024-06-05T17:42:05Z","published":"2024-06-05T17:42:05Z","title":"QJL: 1-Bit Quantized JL Transform for KV Cache Quantization with Zero\n  Overhead","summary":"  Serving LLMs requires substantial memory due to the storage requirements of\nKey-Value (KV) embeddings in the KV cache, which grows with sequence length. An\neffective approach to compress KV cache is quantization. However, traditional\nquantization methods face significant memory overhead due to the need to store\nquantization constants (at least a zero point and a scale) in full precision\nper data block. Depending on the block size, this overhead can add 1 or 2 bits\nper quantized number. We introduce QJL, a new quantization approach that\nconsists of a Johnson-Lindenstrauss (JL) transform followed by sign-bit\nquantization. In contrast to existing methods, QJL eliminates memory overheads\nby removing the need for storing quantization constants. We propose an\nasymmetric estimator for the inner product of two vectors and demonstrate that\napplying QJL to one vector and a standard JL transform without quantization to\nthe other provides an unbiased estimator with minimal distortion. We have\ndeveloped an efficient implementation of the QJL sketch and its corresponding\ninner product estimator, incorporating a lightweight CUDA kernel for optimized\ncomputation. When applied across various LLMs and NLP tasks to quantize the KV\ncache to only 3 bits, QJL demonstrates a more than fivefold reduction in KV\ncache memory usage without compromising accuracy, all while achieving faster\nruntime. Codes are available at \\url{https://github.com/amirzandieh/QJL}.\n","authors":["Amir Zandieh","Majid Daliri","Insu Han"],"pdf_url":"https://arxiv.org/pdf/2406.03482v1.pdf","comment":"13 pages"},{"id":"http://arxiv.org/abs/2406.03479v1","updated":"2024-06-05T17:32:28Z","published":"2024-06-05T17:32:28Z","title":"MODABS: Multi-Objective Learning for Dynamic Aspect-Based Summarization","summary":"  The rapid proliferation of online content necessitates effective\nsummarization methods, among which dynamic aspect-based summarization stands\nout. Unlike its traditional counterpart, which assumes a fixed set of known\naspects, this approach adapts to the varied aspects of the input text. We\nintroduce a novel multi-objective learning framework employing a\nLongformer-Encoder-Decoder for this task. The framework optimizes aspect number\nprediction, minimizes disparity between generated and reference summaries for\neach aspect, and maximizes dissimilarity across aspect-specific summaries.\nExtensive experiments show our method significantly outperforms baselines on\nthree diverse datasets, largely due to the effective alignment of generated and\nreference aspect counts without sacrificing single-aspect summarization\nquality.\n","authors":["Xiaobo Guo","Soroush Vosoughi"],"pdf_url":"https://arxiv.org/pdf/2406.03479v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.03476v1","updated":"2024-06-05T17:29:15Z","published":"2024-06-05T17:29:15Z","title":"Does your data spark joy? Performance gains from domain upsampling at\n  the end of training","summary":"  Pretraining datasets for large language models (LLMs) have grown to trillions\nof tokens composed of large amounts of CommonCrawl (CC) web scrape along with\nsmaller, domain-specific datasets. It is expensive to understand the impact of\nthese domain-specific datasets on model capabilities as training at large FLOP\nscales is required to reveal significant changes to difficult and emergent\nbenchmarks. Given the increasing cost of experimenting with pretraining data,\nhow does one determine the optimal balance between the diversity in general web\nscrapes and the information density of domain specific data? In this work, we\nshow how to leverage the smaller domain specific datasets by upsampling them\nrelative to CC at the end of training to drive performance improvements on\ndifficult benchmarks. This simple technique allows us to improve up to 6.90 pp\non MMLU, 8.26 pp on GSM8K, and 6.17 pp on HumanEval relative to the base data\nmix for a 7B model trained for 1 trillion (T) tokens, thus rivaling Llama-2\n(7B)$\\unicode{x2014}$a model trained for twice as long. We experiment with\nablating the duration of domain upsampling from 5% to 30% of training and find\nthat 10% to 20% percent is optimal for navigating the tradeoff between general\nlanguage modeling capabilities and targeted benchmarks. We also use domain\nupsampling to characterize at scale the utility of individual datasets for\nimproving various benchmarks by removing them during this final phase of\ntraining. This tool opens up the ability to experiment with the impact of\ndifferent pretraining datasets at scale, but at an order of magnitude lower\ncost compared to full pretraining runs.\n","authors":["Cody Blakeney","Mansheej Paul","Brett W. Larsen","Sean Owen","Jonathan Frankle"],"pdf_url":"https://arxiv.org/pdf/2406.03476v1.pdf","comment":"The first three authors contributed equally"},{"id":"http://arxiv.org/abs/2403.10131v2","updated":"2024-06-05T17:27:51Z","published":"2024-03-15T09:26:02Z","title":"RAFT: Adapting Language Model to Domain Specific RAG","summary":"  Pretraining Large Language Models (LLMs) on large corpora of textual data is\nnow a standard paradigm. When using these LLMs for many downstream\napplications, it is common to additionally bake in new knowledge (e.g.,\ntime-critical news, or private domain knowledge) into the pretrained model\neither through RAG-based-prompting, or fine-tuning. However, the optimal\nmethodology for the model to gain such new knowledge remains an open question.\nIn this paper, we present Retrieval Augmented FineTuning (RAFT), a training\nrecipe that improves the model's ability to answer questions in a \"open-book\"\nin-domain settings. In RAFT, given a question, and a set of retrieved\ndocuments, we train the model to ignore those documents that don't help in\nanswering the question, which we call, distractor documents. RAFT accomplishes\nthis by citing verbatim the right sequence from the relevant document that\nwould help answer the question. This coupled with RAFT's chain-of-thought-style\nresponse helps improve the model's ability to reason. In domain-specific RAG,\nRAFT consistently improves the model's performance across PubMed, HotpotQA, and\nGorilla datasets, presenting a post-training recipe to improve pre-trained LLMs\nto in-domain RAG. RAFT's code and demo are open-sourced at\ngithub.com/ShishirPatil/gorilla.\n","authors":["Tianjun Zhang","Shishir G. Patil","Naman Jain","Sheng Shen","Matei Zaharia","Ion Stoica","Joseph E. Gonzalez"],"pdf_url":"https://arxiv.org/pdf/2403.10131v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.15983v2","updated":"2024-06-05T17:15:47Z","published":"2023-11-27T16:28:20Z","title":"SPIN: Sparsifying and Integrating Internal Neurons in Large Language\n  Models for Text Classification","summary":"  Among the many tasks that Large Language Models (LLMs) have revolutionized is\ntext classification. Current text classification paradigms, however, rely\nsolely on the output of the final layer in the LLM, with the rich information\ncontained in internal neurons largely untapped. In this study, we present SPIN:\na model-agnostic framework that sparsifies and integrates internal neurons of\nintermediate layers of LLMs for text classification. Specifically, SPIN\nsparsifies internal neurons by linear probing-based salient neuron selection\nlayer by layer, avoiding noise from unrelated neurons and ensuring efficiency.\nThe cross-layer salient neurons are then integrated to serve as multi-layered\nfeatures for the classification head. Extensive experimental results show our\nproposed SPIN significantly improves text classification accuracy, efficiency,\nand interpretability.\n","authors":["Difan Jiao","Yilun Liu","Zhenwei Tang","Daniel Matter","Jürgen Pfeffer","Ashton Anderson"],"pdf_url":"https://arxiv.org/pdf/2311.15983v2.pdf","comment":"17 pages, 7 figures, 12 tables Code available at\n  https://github.com/difanj0713/SPIN"},{"id":"http://arxiv.org/abs/2402.12261v3","updated":"2024-06-05T17:12:23Z","published":"2024-02-19T16:19:15Z","title":"NEO-BENCH: Evaluating Robustness of Large Language Models with\n  Neologisms","summary":"  The performance of Large Language Models (LLMs) degrades from the temporal\ndrift between data used for model training and newer text seen during\ninference. One understudied avenue of language change causing data drift is the\nemergence of neologisms -- new word forms -- over time. We create a diverse\nresource of recent English neologisms by using several popular collection\nmethods. We analyze temporal drift using neologisms by comparing sentences\ncontaining new words with near-identical sentences that replace neologisms with\nexisting substitute words. Model performance is nearly halved in machine\ntranslation when a single neologism is introduced in a sentence. Motivated by\nthese results, we construct a benchmark to evaluate LLMs' ability to generalize\nto neologisms with various natural language understanding tasks and model\nperplexity. Models with later knowledge cutoff dates yield lower perplexities\nand perform better in downstream tasks. LLMs are also affected differently\nbased on the linguistic origins of words, indicating that neologisms are\ncomplex for static LLMs to address. We will release our benchmark and code for\nreproducing our experiments.\n","authors":["Jonathan Zheng","Alan Ritter","Wei Xu"],"pdf_url":"https://arxiv.org/pdf/2402.12261v3.pdf","comment":"accepted to ACL 2024 main conference, 9 pages"},{"id":"http://arxiv.org/abs/2310.14921v2","updated":"2024-06-05T17:12:04Z","published":"2023-10-23T13:25:54Z","title":"PartialFormer: Modeling Part Instead of Whole for Machine Translation","summary":"  The design choices in Transformer feed-forward neural networks have resulted\nin significant computational and parameter overhead. In this work, we emphasize\nthe importance of hidden dimensions in designing lightweight FFNs, a factor\noften overlooked in previous architectures. Guided by this principle, we\nintroduce PartialFormer, a parameter-efficient Transformer architecture\nutilizing multiple smaller FFNs to reduce parameters and computation while\nmaintaining essential hidden dimensions. These smaller FFNs are integrated into\na multi-head attention mechanism for effective collaboration. We also propose a\ntailored head scaling strategy to enhance PartialFormer's capabilities.\nFurthermore, we present a residual-like attention calculation to improve depth\nscaling within PartialFormer. Extensive experiments on 9 translation tasks and\n1 abstractive summarization task validate the effectiveness of our\nPartialFormer approach on machine translation and summarization tasks. Our code\nwould be available at: https://github.com/zhengkid/PartialFormer.\n","authors":["Tong Zheng","Bei Li","Huiwen Bao","Jiale Wang","Weiqiao Shan","Tong Xiao","Jingbo Zhu"],"pdf_url":"https://arxiv.org/pdf/2310.14921v2.pdf","comment":"Accepted by ACL2024 Findings"},{"id":"http://arxiv.org/abs/2405.20974v2","updated":"2024-06-05T17:04:01Z","published":"2024-05-31T16:21:16Z","title":"SaySelf: Teaching LLMs to Express Confidence with Self-Reflective\n  Rationales","summary":"  Large language models (LLMs) often generate inaccurate or fabricated\ninformation and generally fail to indicate their confidence, which limits their\nbroader applications. Previous work elicits confidence from LLMs by direct or\nself-consistency prompting, or constructing specific datasets for supervised\nfinetuning. The prompting-based approaches have inferior performance, and the\ntraining-based approaches are limited to binary or inaccurate group-level\nconfidence estimates. In this work, we present the advanced SaySelf, a training\nframework that teaches LLMs to express more accurate fine-grained confidence\nestimates. In addition, beyond the confidence scores, SaySelf initiates the\nprocess of directing LLMs to produce self-reflective rationales that clearly\nidentify gaps in their parametric knowledge and explain their uncertainty. This\nis achieved by using an LLM to automatically summarize the uncertainties in\nspecific knowledge via natural language. The summarization is based on the\nanalysis of the inconsistency in multiple sampled reasoning chains, and the\nresulting data is utilized for supervised fine-tuning. Moreover, we utilize\nreinforcement learning with a meticulously crafted reward function to calibrate\nthe confidence estimates, motivating LLMs to deliver accurate, high-confidence\npredictions and to penalize overconfidence in erroneous outputs. Experimental\nresults in both in-distribution and out-of-distribution datasets demonstrate\nthe effectiveness of SaySelf in reducing the confidence calibration error and\nmaintaining the task performance. We show that the generated self-reflective\nrationales are reasonable and can further contribute to the calibration. The\ncode is made public at https://github.com/xu1868/SaySelf.\n","authors":["Tianyang Xu","Shujin Wu","Shizhe Diao","Xiaoze Liu","Xingyao Wang","Yangyi Chen","Jing Gao"],"pdf_url":"https://arxiv.org/pdf/2405.20974v2.pdf","comment":"The code is available at https://github.com/xu1868/SaySelf"},{"id":"http://arxiv.org/abs/2402.14836v2","updated":"2024-06-05T17:02:47Z","published":"2024-02-18T16:51:02Z","title":"Stealthy Attack on Large Language Model based Recommendation","summary":"  Recently, the powerful large language models (LLMs) have been instrumental in\npropelling the progress of recommender systems (RS). However, while these\nsystems have flourished, their susceptibility to security threats has been\nlargely overlooked. In this work, we reveal that the introduction of LLMs into\nrecommendation models presents new security vulnerabilities due to their\nemphasis on the textual content of items. We demonstrate that attackers can\nsignificantly boost an item's exposure by merely altering its textual content\nduring the testing phase, without requiring direct interference with the\nmodel's training process. Additionally, the attack is notably stealthy, as it\ndoes not affect the overall recommendation performance and the modifications to\nthe text are subtle, making it difficult for users and platforms to detect. Our\ncomprehensive experiments across four mainstream LLM-based recommendation\nmodels demonstrate the superior efficacy and stealthiness of our approach. Our\nwork unveils a significant security gap in LLM-based recommendation systems and\npaves the way for future research on protecting these systems.\n","authors":["Jinghao Zhang","Yuting Liu","Qiang Liu","Shu Wu","Guibing Guo","Liang Wang"],"pdf_url":"https://arxiv.org/pdf/2402.14836v2.pdf","comment":"ACL 2024 Main"},{"id":"http://arxiv.org/abs/2406.03452v1","updated":"2024-06-05T16:52:21Z","published":"2024-06-05T16:52:21Z","title":"Using Synchronic Definitions and Semantic Relations to Classify Semantic\n  Change Types","summary":"  There is abundant evidence of the fact that the way words change their\nmeaning can be classified in different types of change, highlighting the\nrelationship between the old and new meanings (among which generalization,\nspecialization and co-hyponymy transfer). In this paper, we present a way of\ndetecting these types of change by constructing a model that leverages\ninformation both from synchronic lexical relations and definitions of word\nmeanings. Specifically, we use synset definitions and hierarchy information\nfrom WordNet and test it on a digitized version of Blank's (1997) dataset of\nsemantic change types. Finally, we show how the sense relationships can improve\nmodels for both approximation of human judgments of semantic relatedness as\nwell as binary Lexical Semantic Change Detection.\n","authors":["Pierluigi Cassotti","Stefano De Pascale","Nina Tahmasebi"],"pdf_url":"https://arxiv.org/pdf/2406.03452v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.03450v1","updated":"2024-06-05T16:48:26Z","published":"2024-06-05T16:48:26Z","title":"What is the Best Way for ChatGPT to Translate Poetry?","summary":"  Machine translation (MT) has historically faced significant challenges when\napplied to literary works, particularly in the domain of poetry translation.\nThe advent of Large Language Models such as ChatGPT holds potential for\ninnovation in this field. This study examines ChatGPT's capabilities in\nEnglish-Chinese poetry translation tasks, utilizing targeted prompts and small\nsample scenarios to ascertain optimal performance. Despite promising outcomes,\nour analysis reveals persistent issues in the translations generated by ChatGPT\nthat warrant attention. To address these shortcomings, we propose an\nExplanation-Assisted Poetry Machine Translation (EAPMT) method, which leverages\nmonolingual poetry explanation as a guiding information for the translation\nprocess. Furthermore, we refine existing evaluation criteria to better suit the\nnuances of modern poetry translation. We engaged a panel of professional poets\nfor assessments, complemented evaluations by using GPT-4. The results from both\nhuman and machine evaluations demonstrate that our EAPMT method outperforms\ntraditional translation methods of ChatGPT and the existing online systems.\nThis paper validates the efficacy of our method and contributes a novel\nperspective to machine-assisted literary translation.\n","authors":["Shanshan Wang","Derek F. Wong","Jingming Yao","Lidia S. Chao"],"pdf_url":"https://arxiv.org/pdf/2406.03450v1.pdf","comment":"19 pages, 1 figure. The paper has been accepted by ACL 2024(Main\n  Conference)"},{"id":"http://arxiv.org/abs/2404.04633v2","updated":"2024-06-05T16:42:38Z","published":"2024-04-06T13:46:53Z","title":"Context versus Prior Knowledge in Language Models","summary":"  To answer a question, language models often need to integrate prior knowledge\nlearned during pretraining and new information presented in context. We\nhypothesize that models perform this integration in a predictable way across\ndifferent questions and contexts: models will rely more on prior knowledge for\nquestions about entities (e.g., persons, places, etc.) that they are more\nfamiliar with due to higher exposure in the training corpus, and be more easily\npersuaded by some contexts than others. To formalize this problem, we propose\ntwo mutual information-based metrics to measure a model's dependency on a\ncontext and on its prior about an entity: first, the persuasion score of a\ngiven context represents how much a model depends on the context in its\ndecision, and second, the susceptibility score of a given entity represents how\nmuch the model can be swayed away from its original answer distribution about\nan entity. We empirically test our metrics for their validity and reliability.\nFinally, we explore and find a relationship between the scores and the model's\nexpected familiarity with an entity, and provide two use cases to illustrate\ntheir benefits.\n","authors":["Kevin Du","Vésteinn Snæbjarnarson","Niklas Stoehr","Jennifer C. White","Aaron Schein","Ryan Cotterell"],"pdf_url":"https://arxiv.org/pdf/2404.04633v2.pdf","comment":"Long paper accepted at ACL 2024"},{"id":"http://arxiv.org/abs/2406.03445v1","updated":"2024-06-05T16:40:53Z","published":"2024-06-05T16:40:53Z","title":"Pre-trained Large Language Models Use Fourier Features to Compute\n  Addition","summary":"  Pre-trained large language models (LLMs) exhibit impressive mathematical\nreasoning capabilities, yet how they compute basic arithmetic, such as\naddition, remains unclear. This paper shows that pre-trained LLMs add numbers\nusing Fourier features -- dimensions in the hidden state that represent numbers\nvia a set of features sparse in the frequency domain. Within the model, MLP and\nattention layers use Fourier features in complementary ways: MLP layers\nprimarily approximate the magnitude of the answer using low-frequency features,\nwhile attention layers primarily perform modular addition (e.g., computing\nwhether the answer is even or odd) using high-frequency features. Pre-training\nis crucial for this mechanism: models trained from scratch to add numbers only\nexploit low-frequency features, leading to lower accuracy. Introducing\npre-trained token embeddings to a randomly initialized model rescues its\nperformance. Overall, our analysis demonstrates that appropriate pre-trained\nrepresentations (e.g., Fourier features) can unlock the ability of Transformers\nto learn precise mechanisms for algorithmic tasks.\n","authors":["Tianyi Zhou","Deqing Fu","Vatsal Sharan","Robin Jia"],"pdf_url":"https://arxiv.org/pdf/2406.03445v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.03442v1","updated":"2024-06-05T16:36:21Z","published":"2024-06-05T16:36:21Z","title":"Are language models rational? The case of coherence norms and belief\n  revision","summary":"  Do norms of rationality apply to machine learning models, in particular\nlanguage models? In this paper we investigate this question by focusing on a\nspecial subset of rational norms: coherence norms. We consider both logical\ncoherence norms as well as coherence norms tied to the strength of belief. To\nmake sense of the latter, we introduce the Minimal Assent Connection (MAC) and\npropose a new account of credence, which captures the strength of belief in\nlanguage models. This proposal uniformly assigns strength of belief simply on\nthe basis of model internal next token probabilities. We argue that rational\nnorms tied to coherence do apply to some language models, but not to others.\nThis issue is significant since rationality is closely tied to predicting and\nexplaining behavior, and thus it is connected to considerations about AI safety\nand alignment, as well as understanding model behavior more generally.\n","authors":["Thomas Hofweber","Peter Hase","Elias Stengel-Eskin","Mohit Bansal"],"pdf_url":"https://arxiv.org/pdf/2406.03442v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.21018v2","updated":"2024-06-05T16:35:49Z","published":"2024-05-31T17:07:15Z","title":"Improved Techniques for Optimization-Based Jailbreaking on Large\n  Language Models","summary":"  Large language models (LLMs) are being rapidly developed, and a key component\nof their widespread deployment is their safety-related alignment. Many\nred-teaming efforts aim to jailbreak LLMs, where among these efforts, the\nGreedy Coordinate Gradient (GCG) attack's success has led to a growing interest\nin the study of optimization-based jailbreaking techniques. Although GCG is a\nsignificant milestone, its attacking efficiency remains unsatisfactory. In this\npaper, we present several improved (empirical) techniques for\noptimization-based jailbreaks like GCG. We first observe that the single target\ntemplate of \"Sure\" largely limits the attacking performance of GCG; given this,\nwe propose to apply diverse target templates containing harmful self-suggestion\nand/or guidance to mislead LLMs. Besides, from the optimization aspects, we\npropose an automatic multi-coordinate updating strategy in GCG (i.e.,\nadaptively deciding how many tokens to replace in each step) to accelerate\nconvergence, as well as tricks like easy-to-hard initialisation. Then, we\ncombine these improved technologies to develop an efficient jailbreak method,\ndubbed I-GCG. In our experiments, we evaluate on a series of benchmarks (such\nas NeurIPS 2023 Red Teaming Track). The results demonstrate that our improved\ntechniques can help GCG outperform state-of-the-art jailbreaking attacks and\nachieve nearly 100% attack success rate. The code is released at\nhttps://github.com/jiaxiaojunQAQ/I-GCG.\n","authors":["Xiaojun Jia","Tianyu Pang","Chao Du","Yihao Huang","Jindong Gu","Yang Liu","Xiaochun Cao","Min Lin"],"pdf_url":"https://arxiv.org/pdf/2405.21018v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.03441v1","updated":"2024-06-05T16:35:30Z","published":"2024-06-05T16:35:30Z","title":"Cycles of Thought: Measuring LLM Confidence through Stable Explanations","summary":"  In many high-risk machine learning applications it is essential for a model\nto indicate when it is uncertain about a prediction. While large language\nmodels (LLMs) can reach and even surpass human-level accuracy on a variety of\nbenchmarks, their overconfidence in incorrect responses is still a\nwell-documented failure mode. Traditional methods for ML uncertainty\nquantification can be difficult to directly adapt to LLMs due to the\ncomputational cost of implementation and closed-source nature of many models. A\nvariety of black-box methods have recently been proposed, but these often rely\non heuristics such as self-verbalized confidence. We instead propose a\nframework for measuring an LLM's uncertainty with respect to the distribution\nof generated explanations for an answer. While utilizing explanations is not a\nnew idea in and of itself, by interpreting each possible model+explanation pair\nas a test-time classifier we can calculate a posterior answer distribution over\nthe most likely of these classifiers. We demonstrate how a specific instance of\nthis framework using explanation entailment as our classifier likelihood\nimproves confidence score metrics (in particular AURC and AUROC) over baselines\nacross five different datasets. We believe these results indicate that our\nframework is both a well-principled and effective way of quantifying\nuncertainty in LLMs.\n","authors":["Evan Becker","Stefano Soatto"],"pdf_url":"https://arxiv.org/pdf/2406.03441v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.01306v3","updated":"2024-06-05T16:07:13Z","published":"2024-02-28T22:21:47Z","title":"NeuroPrune: A Neuro-inspired Topological Sparse Training Algorithm for\n  Large Language Models","summary":"  Transformer-based Language Models have become ubiquitous in Natural Language\nProcessing (NLP) due to their impressive performance on various tasks. However,\nexpensive training as well as inference remains a significant impediment to\ntheir widespread applicability. While enforcing sparsity at various levels of\nthe model architecture has found promise in addressing scaling and efficiency\nissues, there remains a disconnect between how sparsity affects network\ntopology. Inspired by brain neuronal networks, we explore sparsity approaches\nthrough the lens of network topology. Specifically, we exploit mechanisms seen\nin biological networks, such as preferential attachment and redundant synapse\npruning, and show that principled, model-agnostic sparsity approaches are\nperformant and efficient across diverse NLP tasks, spanning both classification\n(such as natural language inference) and generation (summarization, machine\ntranslation), despite our sole objective not being optimizing performance.\nNeuroPrune is competitive with (or sometimes superior to) baselines on\nperformance and can be up to $10$x faster in terms of training time for a given\nlevel of sparsity, simultaneously exhibiting measurable improvements in\ninference time in many cases.\n","authors":["Amit Dhurandhar","Tejaswini Pedapati","Ronny Luss","Soham Dan","Aurelie Lozano","Payel Das","Georgios Kollias"],"pdf_url":"https://arxiv.org/pdf/2404.01306v3.pdf","comment":"Accepted at ACL 2024"},{"id":"http://arxiv.org/abs/2402.14860v3","updated":"2024-06-05T15:56:49Z","published":"2024-02-21T00:49:43Z","title":"Ranking Large Language Models without Ground Truth","summary":"  Evaluation and ranking of large language models (LLMs) has become an\nimportant problem with the proliferation of these models and their impact.\nEvaluation methods either require human responses which are expensive to\nacquire or use pairs of LLMs to evaluate each other which can be unreliable. In\nthis paper, we provide a novel perspective where, given a dataset of prompts\n(viz. questions, instructions, etc.) and a set of LLMs, we rank them without\naccess to any ground truth or reference responses. Inspired by real life where\nboth an expert and a knowledgeable person can identify a novice our main idea\nis to consider triplets of models, where each one of them evaluates the other\ntwo, correctly identifying the worst model in the triplet with high\nprobability. We also analyze our idea and provide sufficient conditions for it\nto succeed. Applying this idea repeatedly, we propose two methods to rank LLMs.\nIn experiments on different generative tasks (summarization, multiple-choice,\nand dialog), our methods reliably recover close to true rankings without\nreference data. This points to a viable low-resource mechanism for practical\nuse.\n","authors":["Amit Dhurandhar","Rahul Nair","Moninder Singh","Elizabeth Daly","Karthikeyan Natesan Ramamurthy"],"pdf_url":"https://arxiv.org/pdf/2402.14860v3.pdf","comment":"Accepted to ACL 2024"},{"id":"http://arxiv.org/abs/2404.19328v2","updated":"2024-06-05T15:56:26Z","published":"2024-04-30T07:52:26Z","title":"Computational Approaches for Integrating out Subjectivity in Cognate\n  Synonym Selection","summary":"  Working with cognate data involves handling synonyms, that is, multiple words\nthat describe the same concept in a language. In the early days of language\nphylogenetics it was recommended to select one synonym only. However, as we\nshow here, binary character matrices, which are used as input for computational\nmethods, do allow for representing the entire dataset including all synonyms.\nHere we address the question how one can and if one should include all synonyms\nor whether it is preferable to select synonyms a priori. To this end, we\nperform maximum likelihood tree inferences with the widely used RAxML-NG tool\nand show that it yields plausible trees when all synonyms are used as input.\nFurthermore, we show that a priori synonym selection can yield topologically\nsubstantially different trees and we therefore advise against doing so. To\nrepresent cognate data including all synonyms, we introduce two types of\ncharacter matrices beyond the standard binary ones: probabilistic binary and\nprobabilistic multi-valued character matrices. We further show that it is\ndataset-dependent for which character matrix type the inferred RAxML-NG tree is\ntopologically closest to the gold standard. We also make available a Python\ninterface for generating all of the above character matrix types for cognate\ndata provided in CLDF format.\n","authors":["Luise Häuser","Gerhard Jäger","Alexandros Stamatakis"],"pdf_url":"https://arxiv.org/pdf/2404.19328v2.pdf","comment":"Experiments available on GitHub\n  (https://github.com/luisevonderwiese/synonyms,\n  https://github.com/luisevonderwiese/lingdata)"},{"id":"http://arxiv.org/abs/2406.03397v1","updated":"2024-06-05T15:54:50Z","published":"2024-06-05T15:54:50Z","title":"Automating Turkish Educational Quiz Generation Using Large Language\n  Models","summary":"  Crafting quizzes from educational content is a pivotal activity that benefits\nboth teachers and students by reinforcing learning and evaluating\nunderstanding. In this study, we introduce a novel approach to generate quizzes\nfrom Turkish educational texts, marking a pioneering endeavor in educational\ntechnology specifically tailored to the Turkish educational context. We present\na specialized dataset, named the Turkish-Quiz-Instruct, comprising an extensive\ncollection of Turkish educational texts accompanied by multiple-choice and\nshort-answer quizzes. This research leverages the capabilities of Large\nLanguage Models (LLMs), including GPT-4-Turbo, GPT-3.5-Turbo,\nLlama-2-7b-chat-hf, and Llama-2-13b-chat-hf, to automatically generate quiz\nquestions and answers from the Turkish educational content. Our work delineates\nthe methodology for employing these LLMs in the context of Turkish educational\nmaterial, thereby opening new avenues for automated Turkish quiz generation.\nThe study not only demonstrates the efficacy of using such models for\ngenerating coherent and relevant quiz content but also sets a precedent for\nfuture research in the domain of automated educational content creation for\nlanguages other than English. The Turkish-Quiz-Instruct dataset is introduced\nas a valuable resource for researchers and practitioners aiming to explore the\nboundaries of educational technology and language-specific applications of LLMs\nin Turkish. By addressing the challenges of quiz generation in a non-English\ncontext specifically Turkish, this study contributes significantly to the field\nof Turkish educational technology, providing insights into the potential of\nleveraging LLMs for educational purposes across diverse linguistic landscapes.\n","authors":["Kamyar Zeinalipour","Yusuf Gökberk Keptiğ","Marco Maggini","Marco Gori"],"pdf_url":"https://arxiv.org/pdf/2406.03397v1.pdf","comment":"Accepted Paper for ISPR 2024"},{"id":"http://arxiv.org/abs/2406.01873v2","updated":"2024-06-05T15:53:01Z","published":"2024-06-04T01:02:22Z","title":"CR-UTP: Certified Robustness against Universal Text Perturbations on\n  Large Language Models","summary":"  It is imperative to ensure the stability of every prediction made by a\nlanguage model; that is, a language's prediction should remain consistent\ndespite minor input variations, like word substitutions. In this paper, we\ninvestigate the problem of certifying a language model's robustness against\nUniversal Text Perturbations (UTPs), which have been widely used in universal\nadversarial attacks and backdoor attacks. Existing certified robustness based\non random smoothing has shown considerable promise in certifying the\ninput-specific text perturbations (ISTPs), operating under the assumption that\nany random alteration of a sample's clean or adversarial words would negate the\nimpact of sample-wise perturbations. However, with UTPs, masking only the\nadversarial words can eliminate the attack. A naive method is to simply\nincrease the masking ratio and the likelihood of masking attack tokens, but it\nleads to a significant reduction in both certified accuracy and the certified\nradius due to input corruption by extensive masking. To solve this challenge,\nwe introduce a novel approach, the superior prompt search method, designed to\nidentify a superior prompt that maintains higher certified accuracy under\nextensive masking. Additionally, we theoretically motivate why ensembles are a\nparticularly suitable choice as base prompts for random smoothing. The method\nis denoted by superior prompt ensembling technique. We also empirically confirm\nthis technique, obtaining state-of-the-art results in multiple settings. These\nmethodologies, for the first time, enable high certified accuracy against both\nUTPs and ISTPs. The source code of CR-UTP is available at \\url\n{https://github.com/UCFML-Research/CR-UTP}.\n","authors":["Qian Lou","Xin Liang","Jiaqi Xue","Yancheng Zhang","Rui Xie","Mengxin Zheng"],"pdf_url":"https://arxiv.org/pdf/2406.01873v2.pdf","comment":"Accepted by ACL Findings 2024"},{"id":"http://arxiv.org/abs/2403.02902v2","updated":"2024-06-05T15:47:38Z","published":"2024-03-05T12:11:32Z","title":"Demonstrating Mutual Reinforcement Effect through Information Flow","summary":"  The Mutual Reinforcement Effect (MRE) investigates the synergistic\nrelationship between word-level and text-level classifications in text\nclassification tasks. It posits that the performance of both classification\nlevels can be mutually enhanced. However, this mechanism has not been\nadequately demonstrated or explained in prior research. To address this gap, we\nemploy information flow analysis to observe and substantiate the MRE theory.\nOur experiments on six MRE hybrid datasets revealed the presence of MRE in the\nmodel and its impact. Additionally, we conducted fine-tuning experiments, whose\nresults were consistent with those of the information flow experiments. The\nconvergence of findings from both experiments corroborates the existence of\nMRE. Furthermore, we extended the application of MRE to prompt learning,\nutilizing word-level information as a verbalizer to bolster the model's\nprediction of text-level classification labels. In our final experiment, the\nF1-score significantly surpassed the baseline in five out of six datasets,\nfurther validating the notion that word-level information enhances the language\nmodel's comprehension of the text as a whole.\n","authors":["Chengguang Gan","Xuzheng He","Qinghao Zhang","Tatsunori Mori"],"pdf_url":"https://arxiv.org/pdf/2403.02902v2.pdf","comment":"The co-authors have requested that the manuscript be withdrawn. And\n  the paper has major flaws"},{"id":"http://arxiv.org/abs/2402.11905v2","updated":"2024-06-05T15:46:03Z","published":"2024-02-19T07:45:17Z","title":"Learning to Edit: Aligning LLMs with Knowledge Editing","summary":"  Knowledge editing techniques, aiming to efficiently modify a minor proportion\nof knowledge in large language models (LLMs) without negatively impacting\nperformance across other inputs, have garnered widespread attention. However,\nexisting methods predominantly rely on memorizing the updated knowledge,\nimpeding LLMs from effectively combining the new knowledge with their inherent\nknowledge when answering questions. To this end, we propose a Learning to Edit\n(LTE) framework, focusing on teaching LLMs to apply updated knowledge into\ninput questions, inspired by the philosophy of \"Teach a man to fish.\" LTE\nfeatures a two-phase process: (i) the Alignment Phase, which fine-tunes LLMs on\na meticulously curated parallel dataset to make reliable, in-scope edits while\npreserving out-of-scope information and linguistic proficiency; and (ii) the\nInference Phase, which employs a retrieval-based mechanism for real-time and\nmass knowledge editing. By comparing our approach with seven advanced baselines\nacross four popular knowledge editing benchmarks and two LLM architectures, we\ndemonstrate LTE's superiority in knowledge editing performance, robustness in\nboth batch and sequential editing, minimal interference on general tasks, and\nrapid editing speeds. The data and code are available at\nhttps://github.com/YJiangcm/LTE.\n","authors":["Yuxin Jiang","Yufei Wang","Chuhan Wu","Wanjun Zhong","Xingshan Zeng","Jiahui Gao","Liangyou Li","Xin Jiang","Lifeng Shang","Ruiming Tang","Qun Liu","Wei Wang"],"pdf_url":"https://arxiv.org/pdf/2402.11905v2.pdf","comment":"17 pages, 8 figures, 9 tables. ACL 2024 main camera-ready version"},{"id":"http://arxiv.org/abs/2406.00975v2","updated":"2024-06-05T15:45:04Z","published":"2024-06-03T04:14:21Z","title":"Luna: An Evaluation Foundation Model to Catch Language Model\n  Hallucinations with High Accuracy and Low Cost","summary":"  Retriever Augmented Generation (RAG) systems have become pivotal in enhancing\nthe capabilities of language models by incorporating external knowledge\nretrieval mechanisms. However, a significant challenge in deploying these\nsystems in industry applications is the detection and mitigation of\nhallucinations: instances where the model generates information that is not\ngrounded in the retrieved context. Addressing this issue is crucial for\nensuring the reliability and accuracy of responses generated by large language\nmodels (LLMs) in diverse industry settings. Current hallucination detection\ntechniques fail to deliver accuracy, low latency, and low cost simultaneously.\nWe introduce Luna: a DeBERTA-large (440M) encoder, finetuned for hallucination\ndetection in RAG settings. We demonstrate that Luna outperforms GPT-3.5 and\ncommercial evaluation frameworks on the hallucination detection task, with 97%\nand 91% reduction in cost and latency, respectively. Luna is lightweight and\ngeneralizes across multiple industry verticals and out-of-domain data, making\nit an ideal candidate for industry LLM applications.\n","authors":["Masha Belyi","Robert Friel","Shuai Shao","Atindriyo Sanyal"],"pdf_url":"https://arxiv.org/pdf/2406.00975v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.20410v3","updated":"2024-06-05T15:39:26Z","published":"2023-10-31T12:32:38Z","title":"FollowBench: A Multi-level Fine-grained Constraints Following Benchmark\n  for Large Language Models","summary":"  The ability to follow instructions is crucial for Large Language Models\n(LLMs) to handle various real-world applications. Existing benchmarks primarily\nfocus on evaluating pure response quality, rather than assessing whether the\nresponse follows constraints stated in the instruction. To fill this research\ngap, in this paper, we propose FollowBench, a Multi-level Fine-grained\nConstraints Following Benchmark for LLMs. FollowBench comprehensively includes\nfive different types (i.e., Content, Situation, Style, Format, and Example) of\nfine-grained constraints. To enable a precise constraint following estimation\non diverse difficulties, we introduce a Multi-level mechanism that\nincrementally adds a single constraint to the initial instruction at each\nincreased level. To assess whether LLMs' outputs have satisfied every\nindividual constraint, we propose to prompt strong LLMs with\nconstraint-evolution paths to handle challenging open-ended instructions. By\nevaluating 13 closed-source and open-source popular LLMs on FollowBench, we\nhighlight the weaknesses of LLMs in instruction following and point towards\npotential avenues for future work. The data and code are publicly available at\nhttps://github.com/YJiangcm/FollowBench.\n","authors":["Yuxin Jiang","Yufei Wang","Xingshan Zeng","Wanjun Zhong","Liangyou Li","Fei Mi","Lifeng Shang","Xin Jiang","Qun Liu","Wei Wang"],"pdf_url":"https://arxiv.org/pdf/2310.20410v3.pdf","comment":"22 pages, 11 figures, 16 tables. ACL 2024 main camera-ready version"},{"id":"http://arxiv.org/abs/2310.11244v3","updated":"2024-06-05T15:33:25Z","published":"2023-10-17T13:12:32Z","title":"Entity Matching using Large Language Models","summary":"  Entity Matching is the task of deciding whether two entity descriptions refer\nto the same real-world entity and is a central step in most data integration\npipelines. Many state-of-the-art entity matching methods rely on pre-trained\nlanguage models (PLMs) such as BERT or RoBERTa. Two major drawbacks of these\nmodels for entity matching are that (i) the models require significant amounts\nof task-specific training data and (ii) the fine-tuned models are not robust\nconcerning out-of-distribution entities. This paper investigates using\ngenerative large language models (LLMs) as a less task-specific training\ndata-dependent and more robust alternative to PLM-based matchers. Our study\ncovers hosted and open-source LLMs, which can be run locally. We evaluate these\nmodels in a zero-shot scenario and a scenario where task-specific training data\nis available. We compare different prompt designs and the prompt sensitivity of\nthe models and show that there is no single best prompt but needs to be tuned\nfor each model/dataset combination. We further investigate (i) the selection of\nin-context demonstrations, (ii) the generation of matching rules, as well as\n(iii) fine-tuning a hosted LLM using the same pool of training data. Our\nexperiments show that the best LLMs require no or only a few training examples\nto perform similarly to PLMs that were fine-tuned using thousands of examples.\nLLM-based matchers further exhibit higher robustness to unseen entities. We\nshow that GPT4 can generate structured explanations for matching decisions. The\nmodel can automatically identify potential causes of matching errors by\nanalyzing explanations of wrong decisions. We demonstrate that the model can\ngenerate meaningful textual descriptions of the identified error classes, which\ncan help data engineers improve entity matching pipelines.\n","authors":["Ralph Peeters","Christian Bizer"],"pdf_url":"https://arxiv.org/pdf/2310.11244v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.17849v2","updated":"2024-06-05T15:26:58Z","published":"2024-05-28T05:56:11Z","title":"I-LLM: Efficient Integer-Only Inference for Fully-Quantized Low-Bit\n  Large Language Models","summary":"  Post-training quantization (PTQ) serves as a potent technique to accelerate\nthe inference of large language models (LLMs). Nonetheless, existing works\nstill necessitate a considerable number of floating-point (FP) operations\nduring inference, including additional quantization and de-quantization, as\nwell as non-linear operators such as RMSNorm and Softmax. This limitation\nhinders the deployment of LLMs on the edge and cloud devices. In this paper, we\nidentify the primary obstacle to integer-only quantization for LLMs lies in the\nlarge fluctuation of activations across channels and tokens in both linear and\nnon-linear operations. To address this issue, we propose I-LLM, a novel\ninteger-only fully-quantized PTQ framework tailored for LLMs. Specifically, (1)\nwe develop Fully-Smooth Block-Reconstruction (FSBR) to aggressively smooth\ninter-channel variations of all activations and weights. (2) to alleviate\ndegradation caused by inter-token variations, we introduce a novel approach\ncalled Dynamic Integer-only MatMul (DI-MatMul). This method enables dynamic\nquantization in full-integer matrix multiplication by dynamically quantizing\nthe input and outputs with integer-only operations. (3) we design\nDI-ClippedSoftmax, DI-Exp, and DI-Normalization, which utilize bit shift to\nexecute non-linear operators efficiently while maintaining accuracy. The\nexperiment shows that our I-LLM achieves comparable accuracy to the FP baseline\nand outperforms non-integer quantization methods. For example, I-LLM can\noperate at W4A4 with negligible loss of accuracy. To our knowledge, we are the\nfirst to bridge the gap between integer-only quantization and LLMs. We've\npublished our code on anonymous.4open.science, aiming to contribute to the\nadvancement of this field.\n","authors":["Xing Hu","Yuan Cheng","Dawei Yang","Zhihang Yuan","Jiangyong Yu","Chen Xu","Sifan Zhou"],"pdf_url":"https://arxiv.org/pdf/2405.17849v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.08631v2","updated":"2024-06-05T15:25:06Z","published":"2023-09-13T01:27:48Z","title":"Large Language Models Can Infer Psychological Dispositions of Social\n  Media Users","summary":"  Large Language Models (LLMs) demonstrate increasingly human-like abilities\nacross a wide variety of tasks. In this paper, we investigate whether LLMs like\nChatGPT can accurately infer the psychological dispositions of social media\nusers and whether their ability to do so varies across socio-demographic\ngroups. Specifically, we test whether GPT-3.5 and GPT-4 can derive the Big Five\npersonality traits from users' Facebook status updates in a zero-shot learning\nscenario. Our results show an average correlation of r = .29 (range = [.22,\n.33]) between LLM-inferred and self-reported trait scores - a level of accuracy\nthat is similar to that of supervised machine learning models specifically\ntrained to infer personality. Our findings also highlight heterogeneity in the\naccuracy of personality inferences across different age groups and gender\ncategories: predictions were found to be more accurate for women and younger\nindividuals on several traits, suggesting a potential bias stemming from the\nunderlying training data or differences in online self-expression. The ability\nof LLMs to infer psychological dispositions from user-generated text has the\npotential to democratize access to cheap and scalable psychometric assessments\nfor both researchers and practitioners. On the one hand, this democratization\nmight facilitate large-scale research of high ecological validity and spark\ninnovation in personalized services. On the other hand, it also raises ethical\nconcerns regarding user privacy and self-determination, highlighting the need\nfor stringent ethical frameworks and regulation.\n","authors":["Heinrich Peters","Sandra Matz"],"pdf_url":"https://arxiv.org/pdf/2309.08631v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.03368v1","updated":"2024-06-05T15:23:08Z","published":"2024-06-05T15:23:08Z","title":"IrokoBench: A New Benchmark for African Languages in the Age of Large\n  Language Models","summary":"  Despite the widespread adoption of Large language models (LLMs), their\nremarkable capabilities remain limited to a few high-resource languages.\nAdditionally, many low-resource languages (e.g. African languages) are often\nevaluated only on basic text classification tasks due to the lack of\nappropriate or comprehensive benchmarks outside of high-resource languages. In\nthis paper, we introduce IrokoBench -- a human-translated benchmark dataset for\n16 typologically-diverse low-resource African languages covering three tasks:\nnatural language inference~(AfriXNLI), mathematical reasoning~(AfriMGSM), and\nmulti-choice knowledge-based QA~(AfriMMLU). We use IrokoBench to evaluate\nzero-shot, few-shot, and translate-test settings~(where test sets are\ntranslated into English) across 10 open and four proprietary LLMs. Our\nevaluation reveals a significant performance gap between high-resource\nlanguages~(such as English and French) and low-resource African languages. We\nobserve a significant performance gap between open and proprietary models, with\nthe highest performing open model, Aya-101 only at 58\\% of the best-performing\nproprietary model GPT-4o performance. Machine translating the test set to\nEnglish before evaluation helped to close the gap for larger models that are\nEnglish-centric, like LLaMa 3 70B. These findings suggest that more efforts are\nneeded to develop and adapt LLMs for African languages.\n","authors":["David Ifeoluwa Adelani","Jessica Ojo","Israel Abebe Azime","Jian Yun Zhuang","Jesujoba O. Alabi","Xuanli He","Millicent Ochieng","Sara Hooker","Andiswa Bukula","En-Shiun Annie Lee","Chiamaka Chukwuneke","Happy Buzaaba","Blessing Sibanda","Godson Kalipe","Jonathan Mukiibi","Salomon Kabongo","Foutse Yuehgoh","Mmasibidi Setaka","Lolwethu Ndolela","Nkiruka Odu","Rooweither Mabuya","Shamsuddeen Hassan Muhammad","Salomey Osei","Sokhar Samb","Tadesse Kebede Guge","Pontus Stenetorp"],"pdf_url":"https://arxiv.org/pdf/2406.03368v1.pdf","comment":"Under review"},{"id":"http://arxiv.org/abs/2406.03363v1","updated":"2024-06-05T15:18:08Z","published":"2024-06-05T15:18:08Z","title":"LLM-based Rewriting of Inappropriate Argumentation using Reinforcement\n  Learning from Machine Feedback","summary":"  Ensuring that online discussions are civil and productive is a major\nchallenge for social media platforms. Such platforms usually rely both on users\nand on automated detection tools to flag inappropriate arguments of other\nusers, which moderators then review. However, this kind of post-hoc moderation\nis expensive and time-consuming, and moderators are often overwhelmed by the\namount and severity of flagged content. Instead, a promising alternative is to\nprevent negative behavior during content creation. This paper studies how\ninappropriate language in arguments can be computationally mitigated. We\npropose a reinforcement learning-based rewriting approach that balances content\npreservation and appropriateness based on existing classifiers, prompting an\ninstruction-finetuned large language model (LLM) as our initial policy. Unlike\nrelated style transfer tasks, rewriting inappropriate arguments allows deleting\nand adding content permanently. It is therefore tackled on document level\nrather than sentence level. We evaluate different weighting schemes for the\nreward function in both absolute and relative human assessment studies.\nSystematic experiments on non-parallel data provide evidence that our approach\ncan mitigate the inappropriateness of arguments while largely preserving their\ncontent. It significantly outperforms competitive baselines, including few-shot\nlearning, prompting, and humans.\n","authors":["Timon Ziegenbein","Gabriella Skitalinskaya","Alireza Bayat Makou","Henning Wachsmuth"],"pdf_url":"https://arxiv.org/pdf/2406.03363v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.10500v2","updated":"2024-06-05T15:10:08Z","published":"2024-02-16T08:19:34Z","title":"Active Preference Optimization for Sample Efficient RLHF","summary":"  Reinforcement Learning from Human Feedback (RLHF) is pivotal in aligning\nLarge Language Models (LLMs) with human preferences. Although aligned\ngenerative models have shown remarkable abilities in various tasks, their\nreliance on high-quality human preference data creates a costly bottleneck in\nthe practical application of RLHF. One primary reason is that current methods\nrely on uniformly picking prompt-generation pairs from a dataset of\nprompt-generations, to collect human feedback, resulting in sub-optimal\nalignment under a constrained budget, which highlights the criticality of\nadaptive strategies in efficient alignment. Recent works [Mehta et al., 2023,\nMuldrew et al., 2024] have tried to address this problem by designing various\nheuristics based on generation uncertainty. However, either the assumptions in\n[Mehta et al., 2023] are restrictive, or [Muldrew et al., 2024] do not provide\nany rigorous theoretical guarantee. To address these, we reformulate RLHF\nwithin contextual preference bandit framework, treating prompts as contexts,\nand develop an active-learning algorithm, $\\textit{Active Preference\nOptimization}$ ($\\texttt{APO}$), which enhances model alignment by querying\npreference data from the most important samples, achieving superior performance\nfor small sample budget. We analyze the theoretical performance guarantees of\n$\\texttt{APO}$ under the BTL preference model showing that the suboptimality\ngap of the policy learned via $\\texttt{APO}$ scales as $O(1/\\sqrt{T})$ for a\nbudget of $T$. We also show that collecting preference data by choosing prompts\nrandomly leads to a policy that suffers a constant sub-optimality. We perform\ndetailed experimental evaluations on practical preference datasets to validate\n$\\texttt{APO}$'s efficacy over the existing methods, establishing it as a\nsample-efficient and practical solution of alignment in a cost-effective and\nscalable manner.\n","authors":["Nirjhar Das","Souradip Chakraborty","Aldo Pacchiano","Sayak Ray Chowdhury"],"pdf_url":"https://arxiv.org/pdf/2402.10500v2.pdf","comment":"New experimental results added. Some reorganization"},{"id":"http://arxiv.org/abs/2406.02350v2","updated":"2024-06-05T15:08:42Z","published":"2024-06-04T14:24:53Z","title":"LlamaCare: A Large Medical Language Model for Enhancing Healthcare\n  Knowledge Sharing","summary":"  Large language models (LLMs) have shown amazing capabilities in knowledge\nmemorization and the present. However, when it comes to domain-specific\nknowledge and downstream tasks like medical, general LLMs are often unable to\ngive precise answers. In addition, when people want LLMs to answer\nclassification questions, they usually go through instruction tuning first.\nHowever, LLMs do not always give a direct index of the categorization after\ninstruction tuning. In this paper, we proposed LlamaCare, a fine-tuned medical\nlanguage model, and Extended Classification Integration(ECI), a module to\nhandle classification problems of LLMs. Our contributions are : (i) We\nfine-tuned a large language model of medical knowledge with very low carbon\nemissions and achieved similar performance with ChatGPT by a 24G GPU. (ii) We\nsolved the problem of redundant categorical answers and improved the\nperformance of LLMs by proposing a new module called Extended Classification\nIntegration. (iii) We released our processed data for one-shot and few-shot\ntraining for some benchmarks such as PubMedQA and USMLE 1-3 step. Our method\nachieves a close performance comparable to some state-of-the-art models with\nthe same quantity of parameters on benchmarks, while being more environmentally\nfriendly by using less GPU computation time. Our models, codes, and datasets\ncan be found at \\url{https://github.com/Stephen-SMJ/LLamaCare}.\n","authors":["Maojun Sun"],"pdf_url":"https://arxiv.org/pdf/2406.02350v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.12061v2","updated":"2024-06-05T15:08:28Z","published":"2024-02-19T11:28:20Z","title":"All Language Models Large and Small","summary":"  Many leading language models (LMs) use high-intensity computational resources\nboth during training and execution. This poses the challenge of lowering\nresource costs for deployment and faster execution of decision-making tasks\namong others. We introduce a novel plug-and-play LM framework named Language\nOptimising Network Distribution (LONDI) framework. LONDI learns to selectively\nemploy large LMs only where complex decision-making and reasoning are required\nwhile using low-resource LMs (i.e. LMs require less GPU usage, but may not be\nable to solve the problem alone) everywhere else. LONDI consists of a system of\ntwo (off-)policy networks, an LM, a large LM (LLM), and a reinforcement\nlearning module that uses switching controls to quickly learn which system\nstates to call the LLM. We then introduce a variant of LONDI that maintains\nbudget constraints on LLM calls and hence its resource usage. Theoretically, we\nprove LONDI learns the subset of system states to activate the LLM required to\nsolve the task. We then prove that LONDI converges to optimal solutions while\nalso preserving budgetary constraints on LLM calls almost surely enabling it to\nsolve various tasks while significantly lowering computational costs. We test\nLONDI's performance in a range of tasks in ScienceWorld and BabyAI-Text and\ndemonstrate that LONDI can solve tasks only solvable by resource-intensive LLMs\nwhile reducing GPU usage by up to 30%.\n","authors":["Zhixun Chen","Yali Du","David Mguni"],"pdf_url":"https://arxiv.org/pdf/2402.12061v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.08726v2","updated":"2024-06-05T15:04:29Z","published":"2023-11-15T06:36:29Z","title":"Uncertainty Estimation on Sequential Labeling via Uncertainty\n  Transmission","summary":"  Sequential labeling is a task predicting labels for each token in a sequence,\nsuch as Named Entity Recognition (NER). NER tasks aim to extract entities and\npredict their labels given a text, which is important in information\nextraction. Although previous works have shown great progress in improving NER\nperformance, uncertainty estimation on NER (UE-NER) is still underexplored but\nessential. This work focuses on UE-NER, which aims to estimate uncertainty\nscores for the NER predictions. Previous uncertainty estimation models often\noverlook two unique characteristics of NER: the connection between entities\n(i.e., one entity embedding is learned based on the other ones) and wrong span\ncases in the entity extraction subtask. Therefore, we propose a Sequential\nLabeling Posterior Network (SLPN) to estimate uncertainty scores for the\nextracted entities, considering uncertainty transmitted from other tokens.\nMoreover, we have defined an evaluation strategy to address the specificity of\nwrong-span cases. Our SLPN has achieved significant improvements on three\ndatasets, such as a 5.54-point improvement in AUPR on the MIT-Restaurant\ndataset. Our code is available at\n\\url{https://github.com/he159ok/UncSeqLabeling_SLPN}.\n","authors":["Jianfeng He","Linlin Yu","Shuo Lei","Chang-Tien Lu","Feng Chen"],"pdf_url":"https://arxiv.org/pdf/2311.08726v2.pdf","comment":"13 pages, 2 figures"},{"id":"http://arxiv.org/abs/2312.03656v2","updated":"2024-06-05T15:03:37Z","published":"2023-12-06T18:25:53Z","title":"Interpretability Illusions in the Generalization of Simplified Models","summary":"  A common method to study deep learning systems is to use simplified model\nrepresentations--for example, using singular value decomposition to visualize\nthe model's hidden states in a lower dimensional space. This approach assumes\nthat the results of these simplifications are faithful to the original model.\nHere, we illustrate an important caveat to this assumption: even if the\nsimplified representations can accurately approximate the full model on the\ntraining set, they may fail to accurately capture the model's behavior out of\ndistribution. We illustrate this by training Transformer models on controlled\ndatasets with systematic generalization splits, including the Dyck\nbalanced-parenthesis languages and a code completion task. We simplify these\nmodels using tools like dimensionality reduction and clustering, and then\nexplicitly test how these simplified proxies match the behavior of the original\nmodel. We find consistent generalization gaps: cases in which the simplified\nproxies are more faithful to the original model on the in-distribution\nevaluations and less faithful on various tests of systematic generalization.\nThis includes cases where the original model generalizes systematically but the\nsimplified proxies fail, and cases where the simplified proxies generalize\nbetter. Together, our results raise questions about the extent to which\nmechanistic interpretations derived using tools like SVD can reliably predict\nwhat a model will do in novel situations.\n","authors":["Dan Friedman","Andrew Lampinen","Lucas Dixon","Danqi Chen","Asma Ghandeharioun"],"pdf_url":"https://arxiv.org/pdf/2312.03656v2.pdf","comment":"ICML 2024"},{"id":"http://arxiv.org/abs/2406.03339v1","updated":"2024-06-05T14:55:10Z","published":"2024-06-05T14:55:10Z","title":"The Challenges of Evaluating LLM Applications: An Analysis of Automated,\n  Human, and LLM-Based Approaches","summary":"  Chatbots have been an interesting application of natural language generation\nsince its inception. With novel transformer based Generative AI methods,\nbuilding chatbots have become trivial. Chatbots which are targeted at specific\ndomains such as medicine, psychology, and general information retrieval are\nimplemented rapidly. This, however, should not distract from the need to\nevaluate the chatbot responses. Especially because the natural language\ngeneration community does not entirely agree upon how to effectively evaluate\nsuch applications. With this work we discuss the issue further with the\nincreasingly popular LLM based evaluations and how they correlate with human\nevaluations. Additionally, we introduce a comprehensive factored evaluation\nmechanism that can be utilized in conjunction with both human and LLM-based\nevaluations.\n  We present the results of an experimental evaluation conducted using this\nscheme in one of our chatbot implementations, and subsequently compare\nautomated, traditional human evaluation, factored human evaluation, and\nfactored LLM evaluation. Results show that factor based evaluation produces\nbetter insights on which aspects need to be improved in LLM applications and\nfurther strengthens the argument to use human evaluation in critical spaces\nwhere main functionality is not direct retrieval.\n","authors":["Bhashithe Abeysinghe","Ruhan Circi"],"pdf_url":"https://arxiv.org/pdf/2406.03339v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.02796v3","updated":"2024-06-05T14:38:56Z","published":"2023-06-05T11:46:36Z","title":"MCTS: A Multi-Reference Chinese Text Simplification Dataset","summary":"  Text simplification aims to make the text easier to understand by applying\nrewriting transformations. There has been very little research on Chinese text\nsimplification for a long time. The lack of generic evaluation data is an\nessential reason for this phenomenon. In this paper, we introduce MCTS, a\nmulti-reference Chinese text simplification dataset. We describe the annotation\nprocess of the dataset and provide a detailed analysis. Furthermore, we\nevaluate the performance of several unsupervised methods and advanced large\nlanguage models. We additionally provide Chinese text simplification parallel\ndata that can be used for training, acquired by utilizing machine translation\nand English text simplification. We hope to build a basic understanding of\nChinese text simplification through the foundational work and provide\nreferences for future research. All of the code and data are released at\nhttps://github.com/blcuicall/mcts/.\n","authors":["Ruining Chong","Luming Lu","Liner Yang","Jinran Nie","Zhenghao Liu","Shuo Wang","Shuhan Zhou","Yaoxin Li","Erhong Yang"],"pdf_url":"https://arxiv.org/pdf/2306.02796v3.pdf","comment":"Accepted to COLING 2024"},{"id":"http://arxiv.org/abs/2402.10644v2","updated":"2024-06-05T14:13:22Z","published":"2024-02-16T12:44:15Z","title":"Linear Transformers with Learnable Kernel Functions are Better\n  In-Context Models","summary":"  Advancing the frontier of subquadratic architectures for Language Models\n(LMs) is crucial in the rapidly evolving field of natural language processing.\nCurrent innovations, including State Space Models, were initially celebrated\nfor surpassing Transformer performance on language modeling tasks. However,\nthese models have revealed deficiencies in essential In-Context Learning\ncapabilities - a domain where the Transformer traditionally shines. The Based\nmodel emerged as a hybrid solution, blending a Linear Transformer with a kernel\ninspired by the Taylor expansion of exponential functions, augmented by\nconvolutional networks. Mirroring the Transformer's in-context adeptness, it\nbecame a strong contender in the field. In our work, we present a singular,\nelegant alteration to the Based kernel that amplifies its In-Context Learning\nabilities evaluated with the Multi-Query Associative Recall task and overall\nlanguage modeling process, as demonstrated on the Pile dataset.\n","authors":["Yaroslav Aksenov","Nikita Balagansky","Sofia Maria Lo Cicero Vaina","Boris Shaposhnikov","Alexey Gorbatovski","Daniil Gavrilov"],"pdf_url":"https://arxiv.org/pdf/2402.10644v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.07870v2","updated":"2024-06-05T14:12:03Z","published":"2024-01-15T18:04:29Z","title":"JumpCoder: Go Beyond Autoregressive Coder via Online Modification","summary":"  While existing code large language models (code LLMs) exhibit impressive\ncapabilities in code generation, their autoregressive sequential generation\ninherently lacks reversibility. This limitation hinders them from timely\ncorrecting previous missing statements during coding as humans do, often\nleading to error propagation and suboptimal performance. We introduce\nJumpCoder, a novel model-agnostic framework that enables human-like online\nmodification and non-sequential generation to augment code LLMs. The key idea\nbehind JumpCoder is to insert new code into the currently generated code when\nnecessary during generation, which is achieved through an auxiliary infilling\nmodel that works in tandem with the code LLM. Since identifying the best infill\nposition beforehand is intractable, we adopt an \\textit{infill-first,\njudge-later} strategy, which experiments with filling at the $k$ most critical\npositions following the generation of each line, and uses an Abstract Syntax\nTree (AST) parser alongside the Generation Model Scoring to effectively judge\nthe validity of each potential infill. Extensive experiments using six\nstate-of-the-art code LLMs across multiple and multilingual benchmarks\nconsistently indicate significant improvements over all baselines. Our code is\npublic at https://github.com/Keytoyze/JumpCoder.\n","authors":["Mouxiang Chen","Hao Tian","Zhongxin Liu","Xiaoxue Ren","Jianling Sun"],"pdf_url":"https://arxiv.org/pdf/2401.07870v2.pdf","comment":"ACL 2024 (main)"},{"id":"http://arxiv.org/abs/2401.06751v2","updated":"2024-06-05T14:10:11Z","published":"2024-01-12T18:36:29Z","title":"The Unreasonable Effectiveness of Easy Training Data for Hard Tasks","summary":"  How can we train models to perform well on hard test data when hard training\ndata is by definition difficult to label correctly? This question has been\ntermed the scalable oversight problem and has drawn increasing attention as\nlanguage models have continually improved. In this paper, we present the\nsurprising conclusion that current pretrained language models often generalize\nrelatively well from easy to hard data, even performing as well as oracle\nmodels finetuned on hard data. We demonstrate this kind of easy-to-hard\ngeneralization using simple finetuning methods like in-context learning, linear\nclassifier heads, and QLoRA for seven different measures of datapoint hardness,\nincluding six empirically diverse human hardness measures (like grade level)\nand one model-based measure (loss-based). Furthermore, we show that even if one\ncares most about model performance on hard data, it can be better to collect\neasy data rather than hard data for finetuning, since hard data is generally\nnoisier and costlier to collect. Our experiments use open models up to 70b in\nsize and four publicly available question-answering datasets with questions\nranging in difficulty from 3rd grade science questions to college level STEM\nquestions and general-knowledge trivia. We conclude that easy-to-hard\ngeneralization in LMs is surprisingly strong for the tasks studied. Our code is\navailable at: https://github.com/allenai/easy-to-hard-generalization\n","authors":["Peter Hase","Mohit Bansal","Peter Clark","Sarah Wiegreffe"],"pdf_url":"https://arxiv.org/pdf/2401.06751v2.pdf","comment":"ACL 2024. 23 pages, 20 figures"},{"id":"http://arxiv.org/abs/2406.03299v1","updated":"2024-06-05T14:08:54Z","published":"2024-06-05T14:08:54Z","title":"The Good, the Bad, and the Hulk-like GPT: Analyzing Emotional Decisions\n  of Large Language Models in Cooperation and Bargaining Games","summary":"  Behavior study experiments are an important part of society modeling and\nunderstanding human interactions. In practice, many behavioral experiments\nencounter challenges related to internal and external validity,\nreproducibility, and social bias due to the complexity of social interactions\nand cooperation in human user studies. Recent advances in Large Language Models\n(LLMs) have provided researchers with a new promising tool for the simulation\nof human behavior. However, existing LLM-based simulations operate under the\nunproven hypothesis that LLM agents behave similarly to humans as well as\nignore a crucial factor in human decision-making: emotions.\n  In this paper, we introduce a novel methodology and the framework to study\nboth, the decision-making of LLMs and their alignment with human behavior under\nemotional states. Experiments with GPT-3.5 and GPT-4 on four games from two\ndifferent classes of behavioral game theory showed that emotions profoundly\nimpact the performance of LLMs, leading to the development of more optimal\nstrategies. While there is a strong alignment between the behavioral responses\nof GPT-3.5 and human participants, particularly evident in bargaining games,\nGPT-4 exhibits consistent behavior, ignoring induced emotions for rationality\ndecisions. Surprisingly, emotional prompting, particularly with `anger'\nemotion, can disrupt the \"superhuman\" alignment of GPT-4, resembling human\nemotional responses.\n","authors":["Mikhail Mozikov","Nikita Severin","Valeria Bodishtianu","Maria Glushanina","Mikhail Baklashkin","Andrey V. Savchenko","Ilya Makarov"],"pdf_url":"https://arxiv.org/pdf/2406.03299v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.18281v2","updated":"2024-06-05T14:07:50Z","published":"2024-02-28T12:17:40Z","title":"Towards Better Understanding of Contrastive Sentence Representation\n  Learning: A Unified Paradigm for Gradient","summary":"  Sentence Representation Learning (SRL) is a crucial task in Natural Language\nProcessing (NLP), where contrastive Self-Supervised Learning (SSL) is currently\na mainstream approach. However, the reasons behind its remarkable effectiveness\nremain unclear. Specifically, many studies have investigated the similarities\nbetween contrastive and non-contrastive SSL from a theoretical perspective.\nSuch similarities can be verified in classification tasks, where the two\napproaches achieve comparable performance. But in ranking tasks (i.e., Semantic\nTextual Similarity (STS) in SRL), contrastive SSL significantly outperforms\nnon-contrastive SSL. Therefore, two questions arise: First, *what commonalities\nenable various contrastive losses to achieve superior performance in STS?*\nSecond, *how can we make non-contrastive SSL also effective in STS?* To address\nthese questions, we start from the perspective of gradients and discover that\nfour effective contrastive losses can be integrated into a unified paradigm,\nwhich depends on three components: the **Gradient Dissipation**, the\n**Weight**, and the **Ratio**. Then, we conduct an in-depth analysis of the\nroles these components play in optimization and experimentally demonstrate\ntheir significance for model performance. Finally, by adjusting these\ncomponents, we enable non-contrastive SSL to achieve outstanding performance in\nSTS.\n","authors":["Mingxin Li","Richong Zhang","Zhijie Nie"],"pdf_url":"https://arxiv.org/pdf/2402.18281v2.pdf","comment":"Accepted at ACL 2024 Main Conference"},{"id":"http://arxiv.org/abs/2406.03287v1","updated":"2024-06-05T13:59:03Z","published":"2024-06-05T13:59:03Z","title":"SpikeLM: Towards General Spike-Driven Language Modeling via Elastic\n  Bi-Spiking Mechanisms","summary":"  Towards energy-efficient artificial intelligence similar to the human brain,\nthe bio-inspired spiking neural networks (SNNs) have advantages of biological\nplausibility, event-driven sparsity, and binary activation. Recently,\nlarge-scale language models exhibit promising generalization capability, making\nit a valuable issue to explore more general spike-driven models. However, the\nbinary spikes in existing SNNs fail to encode adequate semantic information,\nplacing technological challenges for generalization. This work proposes the\nfirst fully spiking mechanism for general language tasks, including both\ndiscriminative and generative ones. Different from previous spikes with {0,1}\nlevels, we propose a more general spike formulation with bi-directional,\nelastic amplitude, and elastic frequency encoding, while still maintaining the\naddition nature of SNNs. In a single time step, the spike is enhanced by\ndirection and amplitude information; in spike frequency, a strategy to control\nspike firing rate is well designed. We plug this elastic bi-spiking mechanism\nin language modeling, named SpikeLM. It is the first time to handle general\nlanguage tasks with fully spike-driven models, which achieve much higher\naccuracy than previously possible. SpikeLM also greatly bridges the performance\ngap between SNNs and ANNs in language modeling. Our code is available at\nhttps://github.com/Xingrun-Xing/SpikeLM.\n","authors":["Xingrun Xing","Zheng Zhang","Ziyi Ni","Shitao Xiao","Yiming Ju","Siqi Fan","Yequan Wang","Jiajun Zhang","Guoqi Li"],"pdf_url":"https://arxiv.org/pdf/2406.03287v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.03280v1","updated":"2024-06-05T13:54:28Z","published":"2024-06-05T13:54:28Z","title":"FusionBench: A Comprehensive Benchmark of Deep Model Fusion","summary":"  Deep model fusion is an emerging technique that unifies the predictions or\nparameters of several deep neural networks into a single model in a\ncost-effective and data-efficient manner. This enables the unified model to\ntake advantage of the original models' strengths, potentially exceeding their\nperformance. Although a variety of deep model fusion techniques have been\nintroduced, their evaluations tend to be inconsistent and often inadequate to\nvalidate their effectiveness and robustness against distribution shifts. To\naddress this issue, we introduce FusionBench, which is the first comprehensive\nbenchmark dedicated to deep model fusion. FusionBench covers a wide range of\ntasks, including open-vocabulary image classification, text classification, and\ntext-to-text generation. Each category includes up to eight tasks with\ncorresponding task-specific models, featuring both full fine-tuning and LoRA\nfine-tuning, as well as models of different sizes, to ensure fair and balanced\ncomparisons of various multi-task model fusion techniques across different\ntasks, model scales, and fine-tuning strategies. We implement and evaluate a\nbroad spectrum of deep model fusion techniques. These techniques range from\nmodel ensemble methods, which combine the predictions to improve the overall\nperformance, to model merging, which integrates different models into a single\none, and model mixing methods, which upscale or recombine the components of the\noriginal models. FusionBench now contains 26 distinct tasks, 74 fine-tuned\nmodels, and 16 fusion techniques, and we are committed to consistently\nexpanding the benchmark with more tasks, models, and fusion techniques. In\naddition, we offer a well-documented set of resources and guidelines to aid\nresearchers in understanding and replicating the benchmark results. Homepage\nhttps://tanganke.github.io/fusion_bench/\n","authors":["Anke Tang","Li Shen","Yong Luo","Han Hu","Bo Do","Dacheng Tao"],"pdf_url":"https://arxiv.org/pdf/2406.03280v1.pdf","comment":"Project homepage: https://tanganke.github.io/fusion_bench/"},{"id":"http://arxiv.org/abs/2403.18715v2","updated":"2024-06-05T13:53:42Z","published":"2024-03-27T16:04:47Z","title":"Mitigating Hallucinations in Large Vision-Language Models with\n  Instruction Contrastive Decoding","summary":"  Large Vision-Language Models (LVLMs) are increasingly adept at generating\ncontextually detailed and coherent responses from visual inputs. However, their\napplication in multimodal decision-making and open-ended generation is hindered\nby a notable rate of hallucinations, where generated text inaccurately\nrepresents the visual contents. To address this issue, this paper introduces\nthe Instruction Contrastive Decoding (ICD) method, a novel approach designed to\nreduce hallucinations during LVLM inference. Our method is inspired by our\nobservation that what we call disturbance instructions significantly exacerbate\nhallucinations in multimodal fusion modules. ICD contrasts distributions from\nstandard and instruction disturbance, thereby increasing alignment uncertainty\nand effectively subtracting hallucinated concepts from the original\ndistribution. Through comprehensive experiments on discriminative benchmarks\n(POPE and MME) and a generative benchmark (LLaVa-Bench), we demonstrate that\nICD significantly mitigates both object-level and attribute-level\nhallucinations. Moreover, our method not only addresses hallucinations but also\nsignificantly enhances the general perception and recognition capabilities of\nLVLMs.\n","authors":["Xintong Wang","Jingheng Pan","Liang Ding","Chris Biemann"],"pdf_url":"https://arxiv.org/pdf/2403.18715v2.pdf","comment":"Accepted to Findings of ACL 2024"},{"id":"http://arxiv.org/abs/2402.13211v2","updated":"2024-06-05T13:39:59Z","published":"2024-02-20T18:21:32Z","title":"Can Large Language Models be Good Emotional Supporter? Mitigating\n  Preference Bias on Emotional Support Conversation","summary":"  Emotional Support Conversation (ESC) is a task aimed at alleviating\nindividuals' emotional distress through daily conversation. Given its inherent\ncomplexity and non-intuitive nature, ESConv dataset incorporates support\nstrategies to facilitate the generation of appropriate responses. Recently,\ndespite the remarkable conversational ability of large language models (LLMs),\nprevious studies have suggested that they often struggle with providing useful\nemotional support. Hence, this work initially analyzes the results of LLMs on\nESConv, revealing challenges in selecting the correct strategy and a notable\npreference for a specific strategy. Motivated by these, we explore the impact\nof the inherent preference in LLMs on providing emotional support, and\nconsequently, we observe that exhibiting high preference for specific\nstrategies hinders effective emotional support, aggravating its robustness in\npredicting the appropriate strategy. Moreover, we conduct a methodological\nstudy to offer insights into the necessary approaches for LLMs to serve as\nproficient emotional supporters. Our findings emphasize that (1) low preference\nfor specific strategies hinders the progress of emotional support, (2) external\nassistance helps reduce preference bias, and (3) existing LLMs alone cannot\nbecome good emotional supporters. These insights suggest promising avenues for\nfuture research to enhance the emotional intelligence of LLMs.\n","authors":["Dongjin Kang","Sunghwan Kim","Taeyoon Kwon","Seungjun Moon","Hyunsouk Cho","Youngjae Yu","Dongha Lee","Jinyoung Yeo"],"pdf_url":"https://arxiv.org/pdf/2402.13211v2.pdf","comment":"Accepted to ACL 2024"},{"id":"http://arxiv.org/abs/2404.03528v3","updated":"2024-06-05T13:39:56Z","published":"2024-04-04T15:31:21Z","title":"BanglaAutoKG: Automatic Bangla Knowledge Graph Construction with\n  Semantic Neural Graph Filtering","summary":"  Knowledge Graphs (KGs) have proven essential in information processing and\nreasoning applications because they link related entities and give context-rich\ninformation, supporting efficient information retrieval and knowledge\ndiscovery; presenting information flow in a very effective manner. Despite\nbeing widely used globally, Bangla is relatively underrepresented in KGs due to\na lack of comprehensive datasets, encoders, NER (named entity recognition)\nmodels, POS (part-of-speech) taggers, and lemmatizers, hindering efficient\ninformation processing and reasoning applications in the language. Addressing\nthe KG scarcity in Bengali, we propose BanglaAutoKG, a pioneering framework\nthat is able to automatically construct Bengali KGs from any Bangla text. We\nutilize multilingual LLMs to understand various languages and correlate\nentities and relations universally. By employing a translation dictionary to\nidentify English equivalents and extracting word features from pre-trained BERT\nmodels, we construct the foundational KG. To reduce noise and align word\nembeddings with our goal, we employ graph-based polynomial filters. Lastly, we\nimplement a GNN-based semantic filter, which elevates contextual understanding\nand trims unnecessary edges, culminating in the formation of the definitive KG.\nEmpirical findings and case studies demonstrate the universal effectiveness of\nour model, capable of autonomously constructing semantically enriched KGs from\nany text.\n","authors":["Azmine Toushik Wasi","Taki Hasan Rafi","Raima Islam","Dong-Kyu Chae"],"pdf_url":"https://arxiv.org/pdf/2404.03528v3.pdf","comment":"7 pages, 3 figures. Accepted to LREC-COLING 2024. Read in ACL\n  Anthology: https://aclanthology.org/2024.lrec-main.189/"},{"id":"http://arxiv.org/abs/2312.17025v3","updated":"2024-06-05T13:39:20Z","published":"2023-12-28T13:50:42Z","title":"Experiential Co-Learning of Software-Developing Agents","summary":"  Recent advancements in large language models (LLMs) have brought significant\nchanges to various domains, especially through LLM-driven autonomous agents. A\nrepresentative scenario is in software development, where LLM agents\ndemonstrate efficient collaboration, task division, and assurance of software\nquality, markedly reducing the need for manual involvement. However, these\nagents frequently perform a variety of tasks independently, without benefiting\nfrom past experiences, which leads to repeated mistakes and inefficient\nattempts in multi-step task execution. To this end, we introduce Experiential\nCo-Learning, a novel LLM-agent learning framework in which instructor and\nassistant agents gather shortcut-oriented experiences from their historical\ntrajectories and use these past experiences for future task execution. The\nextensive experiments demonstrate that the framework enables agents to tackle\nunseen software-developing tasks more effectively. We anticipate that our\ninsights will guide LLM agents towards enhanced autonomy and contribute to\ntheir evolutionary growth in cooperative learning. The code and data are\navailable at https://github.com/OpenBMB/ChatDev.\n","authors":["Chen Qian","Yufan Dang","Jiahao Li","Wei Liu","Zihao Xie","Yifei Wang","Weize Chen","Cheng Yang","Xin Cong","Xiaoyin Che","Zhiyuan Liu","Maosong Sun"],"pdf_url":"https://arxiv.org/pdf/2312.17025v3.pdf","comment":"Accepted to ACL 2024, https://github.com/OpenBMB/ChatDev"},{"id":"http://arxiv.org/abs/2402.10024v2","updated":"2024-06-05T13:38:42Z","published":"2024-02-15T15:43:05Z","title":"Self-Augmented In-Context Learning for Unsupervised Word Translation","summary":"  Recent work has shown that, while large language models (LLMs) demonstrate\nstrong word translation or bilingual lexicon induction (BLI) capabilities in\nfew-shot setups, they still cannot match the performance of 'traditional'\nmapping-based approaches in the unsupervised scenario where no seed translation\npairs are available, especially for lower-resource languages. To address this\nchallenge with LLMs, we propose self-augmented in-context learning (SAIL) for\nunsupervised BLI: starting from a zero-shot prompt, SAIL iteratively induces a\nset of high-confidence word translation pairs for in-context learning (ICL)\nfrom an LLM, which it then reapplies to the same LLM in the ICL fashion. Our\nmethod shows substantial gains over zero-shot prompting of LLMs on two\nestablished BLI benchmarks spanning a wide range of language pairs, also\noutperforming mapping-based baselines across the board. In addition to\nachieving state-of-the-art unsupervised BLI performance, we also conduct\ncomprehensive analyses on SAIL and discuss its limitations.\n","authors":["Yaoyiran Li","Anna Korhonen","Ivan Vulić"],"pdf_url":"https://arxiv.org/pdf/2402.10024v2.pdf","comment":"ACL 2024 Main Conference; 11 Pages, 3 Figures, 9 Tables"},{"id":"http://arxiv.org/abs/2404.01753v2","updated":"2024-06-05T13:34:55Z","published":"2024-04-02T09:11:58Z","title":"M2SA: Multimodal and Multilingual Model for Sentiment Analysis of Tweets","summary":"  In recent years, multimodal natural language processing, aimed at learning\nfrom diverse data types, has garnered significant attention. However, there\nneeds to be more clarity when it comes to analysing multimodal tasks in\nmulti-lingual contexts. While prior studies on sentiment analysis of tweets\nhave predominantly focused on the English language, this paper addresses this\ngap by transforming an existing textual Twitter sentiment dataset into a\nmultimodal format through a straightforward curation process. Our work opens up\nnew avenues for sentiment-related research within the research community.\nAdditionally, we conduct baseline experiments utilising this augmented dataset\nand report the findings. Notably, our evaluations reveal that when comparing\nunimodal and multimodal configurations, using a sentiment-tuned large language\nmodel as a text encoder performs exceptionally well.\n","authors":["Gaurish Thakkar","Sherzod Hakimov","Marko Tadić"],"pdf_url":"https://arxiv.org/pdf/2404.01753v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.07924v5","updated":"2024-06-05T13:23:49Z","published":"2023-07-16T02:11:34Z","title":"ChatDev: Communicative Agents for Software Development","summary":"  Software development is a complex task that necessitates cooperation among\nmultiple members with diverse skills. Numerous studies used deep learning to\nimprove specific phases in a waterfall model, such as design, coding, and\ntesting. However, the deep learning model in each phase requires unique\ndesigns, leading to technical inconsistencies across various phases, which\nresults in a fragmented and ineffective development process. In this paper, we\nintroduce ChatDev, a chat-powered software development framework in which\nspecialized agents driven by large language models (LLMs) are guided in what to\ncommunicate (via chat chain) and how to communicate (via communicative\ndehallucination). These agents actively contribute to the design, coding, and\ntesting phases through unified language-based communication, with solutions\nderived from their multi-turn dialogues. We found their utilization of natural\nlanguage is advantageous for system design, and communicating in programming\nlanguage proves helpful in debugging. This paradigm demonstrates how linguistic\ncommunication facilitates multi-agent collaboration, establishing language as a\nunifying bridge for autonomous task-solving among LLM agents. The code and data\nare available at https://github.com/OpenBMB/ChatDev.\n","authors":["Chen Qian","Wei Liu","Hongzhang Liu","Nuo Chen","Yufan Dang","Jiahao Li","Cheng Yang","Weize Chen","Yusheng Su","Xin Cong","Juyuan Xu","Dahai Li","Zhiyuan Liu","Maosong Sun"],"pdf_url":"https://arxiv.org/pdf/2307.07924v5.pdf","comment":"Accepted to ACL 2024; https://github.com/OpenBMB/ChatDev"},{"id":"http://arxiv.org/abs/2406.03248v1","updated":"2024-06-05T13:23:23Z","published":"2024-06-05T13:23:23Z","title":"Large Language Models as Evaluators for Recommendation Explanations","summary":"  The explainability of recommender systems has attracted significant attention\nin academia and industry. Many efforts have been made for explainable\nrecommendations, yet evaluating the quality of the explanations remains a\nchallenging and unresolved issue. In recent years, leveraging LLMs as\nevaluators presents a promising avenue in Natural Language Processing tasks\n(e.g., sentiment classification, information extraction), as they perform\nstrong capabilities in instruction following and common-sense reasoning.\nHowever, evaluating recommendation explanatory texts is different from these\nNLG tasks, as its criteria are related to human perceptions and are usually\nsubjective. In this paper, we investigate whether LLMs can serve as evaluators\nof recommendation explanations. To answer the question, we utilize real user\nfeedback on explanations given from previous work and additionally collect\nthird-party annotations and LLM evaluations. We design and apply a 3-level meta\nevaluation strategy to measure the correlation between evaluator labels and the\nground truth provided by users. Our experiments reveal that LLMs, such as GPT4,\ncan provide comparable evaluations with appropriate prompts and settings. We\nalso provide further insights into combining human labels with the LLM\nevaluation process and utilizing ensembles of multiple heterogeneous LLM\nevaluators to enhance the accuracy and stability of evaluations. Our study\nverifies that utilizing LLMs as evaluators can be an accurate, reproducible and\ncost-effective solution for evaluating recommendation explanation texts. Our\ncode is available at https://github.com/Xiaoyu-SZ/LLMasEvaluator.\n","authors":["Xiaoyu Zhang","Yishan Li","Jiayin Wang","Bowen Sun","Weizhi Ma","Peijie Sun","Min Zhang"],"pdf_url":"https://arxiv.org/pdf/2406.03248v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.03239v1","updated":"2024-06-05T13:16:46Z","published":"2024-06-05T13:16:46Z","title":"Document-level Claim Extraction and Decontextualisation for\n  Fact-Checking","summary":"  Selecting which claims to check is a time-consuming task for human\nfact-checkers, especially from documents consisting of multiple sentences and\ncontaining multiple claims. However, existing claim extraction approaches focus\nmore on identifying and extracting claims from individual sentences, e.g.,\nidentifying whether a sentence contains a claim or the exact boundaries of the\nclaim within a sentence. In this paper, we propose a method for document-level\nclaim extraction for fact-checking, which aims to extract check-worthy claims\nfrom documents and decontextualise them so that they can be understood out of\ncontext. Specifically, we first recast claim extraction as extractive\nsummarization in order to identify central sentences from documents, then\nrewrite them to include necessary context from the originating document through\nsentence decontextualisation. Evaluation with both automatic metrics and a\nfact-checking professional shows that our method is able to extract\ncheck-worthy claims from documents more accurately than previous work, while\nalso improving evidence retrieval.\n","authors":["Zhenyun Deng","Michael Schlichtkrul","Andreas Vlachos"],"pdf_url":"https://arxiv.org/pdf/2406.03239v1.pdf","comment":"Accepted to ACL 2024"},{"id":"http://arxiv.org/abs/2406.03235v1","updated":"2024-06-05T13:15:37Z","published":"2024-06-05T13:15:37Z","title":"Error-preserving Automatic Speech Recognition of Young English Learners'\n  Language","summary":"  One of the central skills that language learners need to practice is speaking\nthe language. Currently, students in school do not get enough speaking\nopportunities and lack conversational practice. Recent advances in speech\ntechnology and natural language processing allow for the creation of novel\ntools to practice their speaking skills. In this work, we tackle the first\ncomponent of such a pipeline, namely, the automated speech recognition module\n(ASR), which faces a number of challenges: first, state-of-the-art ASR models\nare often trained on adult read-aloud data by native speakers and do not\ntransfer well to young language learners' speech. Second, most ASR systems\ncontain a powerful language model, which smooths out errors made by the\nspeakers. To give corrective feedback, which is a crucial part of language\nlearning, the ASR systems in our setting need to preserve the errors made by\nthe language learners. In this work, we build an ASR system that satisfies\nthese requirements: it works on spontaneous speech by young language learners\nand preserves their errors. For this, we collected a corpus containing around\n85 hours of English audio spoken by learners in Switzerland from grades 4 to 6\non different language learning tasks, which we used to train an ASR model. Our\nexperiments show that our model benefits from direct fine-tuning on children's\nvoices and has a much higher error preservation rate than other models.\n","authors":["Janick Michot","Manuela Hürlimann","Jan Deriu","Luzia Sauer","Katsiaryna Mlynchyk","Mark Cieliebak"],"pdf_url":"https://arxiv.org/pdf/2406.03235v1.pdf","comment":"Accepted at ACL 2024 Main Conference"},{"id":"http://arxiv.org/abs/2406.03221v1","updated":"2024-06-05T13:00:04Z","published":"2024-06-05T13:00:04Z","title":"Linking Named Entities in Diderot's \\textit{Encyclopédie} to Wikidata","summary":"  Diderot's \\textit{Encyclop\\'edie} is a reference work from XVIIIth century in\nEurope that aimed at collecting the knowledge of its era. \\textit{Wikipedia}\nhas the same ambition with a much greater scope. However, the lack of digital\nconnection between the two encyclopedias may hinder their comparison and the\nstudy of how knowledge has evolved. A key element of \\textit{Wikipedia} is\nWikidata that backs the articles with a graph of structured data. In this\npaper, we describe the annotation of more than 10,300 of the\n\\textit{Encyclop\\'edie} entries with Wikidata identifiers enabling us to\nconnect these entries to the graph. We considered geographic and human\nentities. The \\textit{Encyclop\\'edie} does not contain biographic entries as\nthey mostly appear as subentries of locations. We extracted all the geographic\nentries and we completely annotated all the entries containing a description of\nhuman entities. This represents more than 2,600 links referring to locations or\nhuman entities. In addition, we annotated more than 9,500 entries having a\ngeographic content only. We describe the annotation process as well as\napplication examples. This resource is available at\nhttps://github.com/pnugues/encyclopedie_1751\n","authors":["Pierre Nugues"],"pdf_url":"https://arxiv.org/pdf/2406.03221v1.pdf","comment":"6 pages, 3 figures"},{"id":"http://arxiv.org/abs/2305.12599v5","updated":"2024-06-05T12:54:51Z","published":"2023-05-21T23:16:26Z","title":"Abstract Meaning Representation-Based Logic-Driven Data Augmentation for\n  Logical Reasoning","summary":"  Combining large language models with logical reasoning enhances their\ncapacity to address problems in a robust and reliable manner. Nevertheless, the\nintricate nature of logical reasoning poses challenges when gathering reliable\ndata from the web to build comprehensive training datasets, subsequently\naffecting performance on downstream tasks. To address this, we introduce a\nnovel logic-driven data augmentation approach, AMR-LDA. AMR-LDA converts the\noriginal text into an Abstract Meaning Representation (AMR) graph, a structured\nsemantic representation that encapsulates the logical structure of the\nsentence, upon which operations are performed to generate logically modified\nAMR graphs. The modified AMR graphs are subsequently converted back into text\nto create augmented data. Notably, our methodology is architecture-agnostic and\nenhances both generative large language models, such as GPT-3.5 and GPT-4,\nthrough prompt augmentation, and discriminative large language models through\ncontrastive learning with logic-driven data augmentation. Empirical evidence\nunderscores the efficacy of our proposed method with improvement in performance\nacross seven downstream tasks, such as reading comprehension requiring logical\nreasoning, textual entailment, and natural language inference. Furthermore, our\nmethod leads on the ReClor\nleaderboard\\footnote{\\url{https://eval.ai/web/challenges/challenge-page/503/leaderboard/1347}}.\nThe source code and data are publicly\navailable\\footnote{\\href{https://github.com/Strong-AI-Lab/Logical-Equivalence-driven-AMR-Data-Augmentation-for-Representation-Learning}{AMR-LDA\nGitHub Repository}}.\n","authors":["Qiming Bao","Alex Yuxuan Peng","Zhenyun Deng","Wanjun Zhong","Gael Gendron","Timothy Pistotti","Neset Tan","Nathan Young","Yang Chen","Yonghua Zhu","Paul Denny","Michael Witbrock","Jiamou Liu"],"pdf_url":"https://arxiv.org/pdf/2305.12599v5.pdf","comment":"21 pages, 8 figures, the Findings of ACL 2024"},{"id":"http://arxiv.org/abs/2406.03202v1","updated":"2024-06-05T12:35:00Z","published":"2024-06-05T12:35:00Z","title":"ChatLang-8: An LLM-Based Synthetic Data Generation Framework for\n  Grammatical Error Correction","summary":"  We explore and improve the capabilities of LLMs to generate data for\ngrammatical error correction (GEC). When merely producing parallel sentences,\ntheir patterns are too simplistic to be valuable as a corpus. To address this\nissue, we propose an automated framework that includes a Subject Selector,\nGrammar Selector, Prompt Manager, and Evaluator. Additionally, we introduce a\nnew dataset for GEC tasks, named \\textbf{ChatLang-8}, which encompasses eight\ntypes of subject nouns and 23 types of grammar. It consists of 1 million pairs\nfeaturing human-like grammatical errors. Our experiments reveal that ChatLang-8\nexhibits a more uniform pattern composition compared to existing GEC datasets.\nFurthermore, we observe improved model performance when using ChatLang-8\ninstead of existing GEC datasets. The experimental results suggest that our\nframework and ChatLang-8 are valuable resources for enhancing ChatGPT's data\ngeneration capabilities.\n","authors":["Jeiyoon Park","Chanjun Park","Heuiseok Lim"],"pdf_url":"https://arxiv.org/pdf/2406.03202v1.pdf","comment":"preprint"},{"id":"http://arxiv.org/abs/2406.01934v2","updated":"2024-06-05T12:13:56Z","published":"2024-06-04T03:35:25Z","title":"Optimal Transport Guided Correlation Assignment for Multimodal Entity\n  Linking","summary":"  Multimodal Entity Linking (MEL) aims to link ambiguous mentions in multimodal\ncontexts to entities in a multimodal knowledge graph. A pivotal challenge is to\nfully leverage multi-element correlations between mentions and entities to\nbridge modality gap and enable fine-grained semantic matching. Existing methods\nattempt several local correlative mechanisms, relying heavily on the\nautomatically learned attention weights, which may over-concentrate on partial\ncorrelations. To mitigate this issue, we formulate the correlation assignment\nproblem as an optimal transport (OT) problem, and propose a novel MEL\nframework, namely OT-MEL, with OT-guided correlation assignment. Thereby, we\nexploit the correlation between multimodal features to enhance multimodal\nfusion, and the correlation between mentions and entities to enhance\nfine-grained matching. To accelerate model prediction, we further leverage\nknowledge distillation to transfer OT assignment knowledge to attention\nmechanism. Experimental results show that our model significantly outperforms\nprevious state-of-the-art baselines and confirm the effectiveness of the\nOT-guided correlation assignment.\n","authors":["Zefeng Zhang","Jiawei Sheng","Chuang Zhang","Yunzhi Liang","Wenyuan Zhang","Siqi Wang","Tingwen Liu"],"pdf_url":"https://arxiv.org/pdf/2406.01934v2.pdf","comment":"Findings of ACL 2024"},{"id":"http://arxiv.org/abs/2406.03181v1","updated":"2024-06-05T12:11:10Z","published":"2024-06-05T12:11:10Z","title":"Missci: Reconstructing Fallacies in Misrepresented Science","summary":"  Health-related misinformation on social networks can lead to poor\ndecision-making and real-world dangers. Such misinformation often misrepresents\nscientific publications and cites them as \"proof\" to gain perceived\ncredibility. To effectively counter such claims automatically, a system must\nexplain how the claim was falsely derived from the cited publication. Current\nmethods for automated fact-checking or fallacy detection neglect to assess the\n(mis)used evidence in relation to misinformation claims, which is required to\ndetect the mismatch between them. To address this gap, we introduce Missci, a\nnovel argumentation theoretical model for fallacious reasoning together with a\nnew dataset for real-world misinformation detection that misrepresents\nbiomedical publications. Unlike previous fallacy detection datasets, Missci (i)\nfocuses on implicit fallacies between the relevant content of the cited\npublication and the inaccurate claim, and (ii) requires models to verbalize the\nfallacious reasoning in addition to classifying it. We present Missci as a\ndataset to test the critical reasoning abilities of large language models\n(LLMs), that are required to reconstruct real-world fallacious arguments, in a\nzero-shot setting. We evaluate two representative LLMs and the impact of\ndifferent levels of detail about the fallacy classes provided to the LLM via\nprompts. Our experiments and human evaluation show promising results for GPT 4,\nwhile also demonstrating the difficulty of this task.\n","authors":["Max Glockner","Yufang Hou","Preslav Nakov","Iryna Gurevych"],"pdf_url":"https://arxiv.org/pdf/2406.03181v1.pdf","comment":"ACL 2024 (main)"},{"id":"http://arxiv.org/abs/2406.03170v1","updated":"2024-06-05T12:03:19Z","published":"2024-06-05T12:03:19Z","title":"StatBot.Swiss: Bilingual Open Data Exploration in Natural Language","summary":"  The potential for improvements brought by Large Language Models (LLMs) in\nText-to-SQL systems is mostly assessed on monolingual English datasets.\nHowever, LLMs' performance for other languages remains vastly unexplored. In\nthis work, we release the StatBot.Swiss dataset, the first bilingual benchmark\nfor evaluating Text-to-SQL systems based on real-world applications. The\nStatBot.Swiss dataset contains 455 natural language/SQL-pairs over 35 big\ndatabases with varying level of complexity for both English and German.\n  We evaluate the performance of state-of-the-art LLMs such as GPT-3.5-Turbo\nand mixtral-8x7b-instruct for the Text-to-SQL translation task using an\nin-context learning approach. Our experimental analysis illustrates that\ncurrent LLMs struggle to generalize well in generating SQL queries on our novel\nbilingual dataset.\n","authors":["Farhad Nooralahzadeh","Yi Zhang","Ellery Smith","Sabine Maennel","Cyril Matthey-Doret","Raphaël de Fondville","Kurt Stockinger"],"pdf_url":"https://arxiv.org/pdf/2406.03170v1.pdf","comment":"This work is accepted at ACL Findings 2024"},{"id":"http://arxiv.org/abs/2402.08327v2","updated":"2024-06-05T11:46:23Z","published":"2024-02-13T09:47:07Z","title":"PreFLMR: Scaling Up Fine-Grained Late-Interaction Multi-modal Retrievers","summary":"  Large Multimodal Models (LMMs) excel in natural language and visual\nunderstanding but are challenged by exacting tasks such as Knowledge-based\nVisual Question Answering (KB-VQA) which involve the retrieval of relevant\ninformation from document collections to use in shaping answers to questions.\nWe present an extensive training and evaluation framework, M2KR, for KB-VQA.\nM2KR contains a collection of vision and language tasks which we have\nincorporated into a single suite of benchmark tasks for training and evaluating\ngeneral-purpose multi-modal retrievers. We use M2KR to develop PreFLMR, a\npre-trained version of the recently developed Fine-grained Late-interaction\nMulti-modal Retriever (FLMR) approach to KB-VQA, and we report new\nstate-of-the-art results across a range of tasks. We also present\ninvestigations into the scaling behaviors of PreFLMR intended to be useful in\nfuture developments in general-purpose multi-modal retrievers.\n","authors":["Weizhe Lin","Jingbiao Mei","Jinghong Chen","Bill Byrne"],"pdf_url":"https://arxiv.org/pdf/2402.08327v2.pdf","comment":"ACL 2024; Project page: https://preflmr.github.io/"},{"id":"http://arxiv.org/abs/2406.03158v1","updated":"2024-06-05T11:35:44Z","published":"2024-06-05T11:35:44Z","title":"CSS: Contrastive Semantic Similarity for Uncertainty Quantification of\n  LLMs","summary":"  Despite the impressive capability of large language models (LLMs), knowing\nwhen to trust their generations remains an open challenge. The recent\nliterature on uncertainty quantification of natural language generation (NLG)\nutilises a conventional natural language inference (NLI) classifier to measure\nthe semantic dispersion of LLMs responses. These studies employ logits of NLI\nclassifier for semantic clustering to estimate uncertainty. However, logits\nrepresent the probability of the predicted class and barely contain feature\ninformation for potential clustering. Alternatively, CLIP (Contrastive\nLanguage-Image Pre-training) performs impressively in extracting image-text\npair features and measuring their similarity. To extend its usability, we\npropose Contrastive Semantic Similarity, the CLIP-based feature extraction\nmodule to obtain similarity features for measuring uncertainty for text pairs.\nWe apply this method to selective NLG, which detects and rejects unreliable\ngenerations for better trustworthiness of LLMs. We conduct extensive\nexperiments with three LLMs on several benchmark question-answering datasets\nwith comprehensive evaluation metrics. Results show that our proposed method\nperforms better in estimating reliable responses of LLMs than comparable\nbaselines. Results show that our proposed method performs better in estimating\nreliable responses of LLMs than comparable baselines. The code are available at\n\\url{https://github.com/AoShuang92/css_uq_llms}.\n","authors":["Shuang Ao","Stefan Rueger","Advaith Siddharthan"],"pdf_url":"https://arxiv.org/pdf/2406.03158v1.pdf","comment":"The paper is accepted by The Conference on Uncertainty in Artificial\n  Intelligence (UAI), 2024"},{"id":"http://arxiv.org/abs/2406.03151v1","updated":"2024-06-05T11:15:45Z","published":"2024-06-05T11:15:45Z","title":"Which Side Are You On? A Multi-task Dataset for End-to-End Argument\n  Summarisation and Evaluation","summary":"  With the recent advances of large language models (LLMs), it is no longer\ninfeasible to build an automated debate system that helps people to synthesise\npersuasive arguments. Previous work attempted this task by integrating multiple\ncomponents. In our work, we introduce an argument mining dataset that captures\nthe end-to-end process of preparing an argumentative essay for a debate, which\ncovers the tasks of claim and evidence identification (Task 1 ED), evidence\nconvincingness ranking (Task 2 ECR), argumentative essay summarisation and\nhuman preference ranking (Task 3 ASR) and metric learning for automated\nevaluation of resulting essays, based on human feedback along argument quality\ndimensions (Task 4 SQE). Our dataset contains 14k examples of claims that are\nfully annotated with the various properties supporting the aforementioned\ntasks. We evaluate multiple generative baselines for each of these tasks,\nincluding representative LLMs. We find, that while they show promising results\non individual tasks in our benchmark, their end-to-end performance on all four\ntasks in succession deteriorates significantly, both in automated measures as\nwell as in human-centred evaluation. This challenge presented by our proposed\ndataset motivates future research on end-to-end argument mining and\nsummarisation. The repository of this project is available at\nhttps://github.com/HarrywillDr/ArgSum-Datatset\n","authors":["Hao Li","Yuping Wu","Viktor Schlegel","Riza Batista-Navarro","Tharindu Madusanka","Iqra Zahid","Jiayan Zeng","Xiaochi Wang","Xinran He","Yizhi Li","Goran Nenadic"],"pdf_url":"https://arxiv.org/pdf/2406.03151v1.pdf","comment":"Published on ACL 2024 Findings"},{"id":"http://arxiv.org/abs/2402.17811v2","updated":"2024-06-05T11:15:04Z","published":"2024-02-27T14:45:04Z","title":"TruthX: Alleviating Hallucinations by Editing Large Language Models in\n  Truthful Space","summary":"  Large Language Models (LLMs) sometimes suffer from producing hallucinations,\nespecially LLMs may generate untruthful responses despite knowing the correct\nknowledge. Activating the truthfulness within LLM is the key to fully unlocking\nLLM's knowledge potential. In this paper, we propose TruthX, an inference-time\nintervention method to activate the truthfulness of LLM by identifying and\nediting the features within LLM's internal representations that govern the\ntruthfulness. TruthX employs an auto-encoder to map LLM's representations into\nsemantic and truthful latent spaces respectively, and applies contrastive\nlearning to identify a truthful editing direction within the truthful space.\nDuring inference, by editing LLM's internal representations in truthful space,\nTruthX effectively enhances the truthfulness of LLM. Experiments show that\nTruthX improves the truthfulness of 13 advanced LLMs by an average of 20% on\nTruthfulQA benchmark. Further analyses suggest that TruthX can control LLM to\nproduce truthful or hallucinatory responses via editing only one vector in\nLLM's internal representations.\n","authors":["Shaolei Zhang","Tian Yu","Yang Feng"],"pdf_url":"https://arxiv.org/pdf/2402.17811v2.pdf","comment":"Accepted to ACL 2024 main conference, Project Page:\n  https://ictnlp.github.io/TruthX-site/"},{"id":"http://arxiv.org/abs/2402.15337v2","updated":"2024-06-05T10:42:40Z","published":"2024-02-23T14:17:01Z","title":"Ranking Entities along Conceptual Space Dimensions with LLMs: An\n  Analysis of Fine-Tuning Strategies","summary":"  Conceptual spaces represent entities in terms of their primitive semantic\nfeatures. Such representations are highly valuable but they are notoriously\ndifficult to learn, especially when it comes to modelling perceptual and\nsubjective features. Distilling conceptual spaces from Large Language Models\n(LLMs) has recently emerged as a promising strategy, but existing work has been\nlimited to probing pre-trained LLMs using relatively simple zero-shot\nstrategies. We focus in particular on the task of ranking entities according to\na given conceptual space dimension. Unfortunately, we cannot directly fine-tune\nLLMs on this task, because ground truth rankings for conceptual space\ndimensions are rare. We therefore use more readily available features as\ntraining data and analyse whether the ranking capabilities of the resulting\nmodels transfer to perceptual and subjective features. We find that this is\nindeed the case, to some extent, but having at least some perceptual and\nsubjective features in the training data seems essential for achieving the best\nresults.\n","authors":["Nitesh Kumar","Usashi Chatterjee","Steven Schockaert"],"pdf_url":"https://arxiv.org/pdf/2402.15337v2.pdf","comment":"Accepted in ACL 2024 (Findings)"},{"id":"http://arxiv.org/abs/2311.05297v2","updated":"2024-06-05T10:33:18Z","published":"2023-11-09T11:54:01Z","title":"Challenging the Validity of Personality Tests for Large Language Models","summary":"  With large language models (LLMs) like GPT-4 appearing to behave increasingly\nhuman-like in text-based interactions, it has become popular to attempt to\nevaluate personality traits of LLMs using questionnaires originally developed\nfor humans. While reusing measures is a resource-efficient way to evaluate\nLLMs, careful adaptations are usually required to ensure that assessment\nresults are valid even across human subpopulations. In this work, we provide\nevidence that LLMs' responses to personality tests systematically deviate from\nhuman responses, implying that the results of these tests cannot be interpreted\nin the same way. Concretely, reverse-coded items (\"I am introverted\" vs. \"I am\nextraverted\") are often both answered affirmatively. Furthermore, variation\nacross prompts designed to \"steer\" LLMs to simulate particular personality\ntypes does not follow the clear separation into five independent personality\nfactors from human samples. In light of these results, we believe that it is\nimportant to investigate tests' validity for LLMs before drawing strong\nconclusions about potentially ill-defined concepts like LLMs' \"personality\".\n","authors":["Tom Sühr","Florian E. Dorner","Samira Samadi","Augustin Kelava"],"pdf_url":"https://arxiv.org/pdf/2311.05297v2.pdf","comment":"A less extensive and shorter version of this work has been accepted\n  at Socially Responsible Language Modelling Research (SoLaR) 2023 Workshop at\n  NeurIPS 2023"},{"id":"http://arxiv.org/abs/2402.07891v2","updated":"2024-06-05T10:26:18Z","published":"2024-02-12T18:54:02Z","title":"Label-Efficient Model Selection for Text Generation","summary":"  Model selection for a given target task can be costly, as it may entail\nextensive annotation of the quality of outputs of different models. We\nintroduce DiffUse, an efficient method to make an informed decision between\ncandidate text generation models based on preference annotations. DiffUse\nreduces the required amount of annotations, thus saving valuable time and\nresources in performing evaluation. DiffUse intelligently selects instances by\nclustering embeddings that represent the semantic differences between model\noutputs. Thus, it is able to identify a subset of examples that are more\ninformative for preference decisions. Our method is model-agnostic, and can be\napplied to any text generation model for selecting between models, prompts and\nconfigurations. Moreover, we propose a practical iterative approach for\ndynamically determining how many instances to annotate. In a series of\nexperiments over hundreds of model pairs, we demonstrate that DiffUse can\ndramatically reduce the required number of annotations -- by up to 75% -- while\nmaintaining high evaluation reliability.\n","authors":["Shir Ashury-Tahan","Ariel Gera","Benjamin Sznajder","Leshem Choshen","Liat Ein-Dor","Eyal Shnarch"],"pdf_url":"https://arxiv.org/pdf/2402.07891v2.pdf","comment":"Accepted to ACL (main conference)"},{"id":"http://arxiv.org/abs/2406.03127v1","updated":"2024-06-05T10:22:27Z","published":"2024-06-05T10:22:27Z","title":"Towards Real-world Scenario: Imbalanced New Intent Discovery","summary":"  New Intent Discovery (NID) aims at detecting known and previously undefined\ncategories of user intent by utilizing limited labeled and massive unlabeled\ndata. Most prior works often operate under the unrealistic assumption that the\ndistribution of both familiar and new intent classes is uniform, overlooking\nthe skewed and long-tailed distributions frequently encountered in real-world\nscenarios. To bridge the gap, our work introduces the imbalanced new intent\ndiscovery (i-NID) task, which seeks to identify familiar and novel intent\ncategories within long-tailed distributions. A new benchmark (ImbaNID-Bench)\ncomprised of three datasets is created to simulate the real-world long-tail\ndistributions. ImbaNID-Bench ranges from broad cross-domain to specific\nsingle-domain intent categories, providing a thorough representation of\npractical use cases. Besides, a robust baseline model ImbaNID is proposed to\nachieve cluster-friendly intent representations. It includes three stages:\nmodel pre-training, generation of reliable pseudo-labels, and robust\nrepresentation learning that strengthens the model performance to handle the\nintricacies of real-world data distributions. Our extensive experiments on\nprevious benchmarks and the newly established benchmark demonstrate the\nsuperior performance of ImbaNID in addressing the i-NID task, highlighting its\npotential as a powerful baseline for uncovering and categorizing user intents\nin imbalanced and long-tailed\ndistributions\\footnote{\\url{https://github.com/Zkdc/i-NID}}.\n","authors":["Shun Zhang","Chaoran Yan","Jian Yang","Jiaheng Liu","Ying Mo","Jiaqi Bai","Tongliang Li","Zhoujun Li"],"pdf_url":"https://arxiv.org/pdf/2406.03127v1.pdf","comment":"ACL 2024"},{"id":"http://arxiv.org/abs/2401.12192v4","updated":"2024-06-05T10:22:00Z","published":"2024-01-22T18:34:42Z","title":"Text Embedding Inversion Security for Multilingual Language Models","summary":"  Textual data is often represented as real-numbered embeddings in NLP,\nparticularly with the popularity of large language models (LLMs) and Embeddings\nas a Service (EaaS). However, storing sensitive information as embeddings can\nbe susceptible to security breaches, as research shows that text can be\nreconstructed from embeddings, even without knowledge of the underlying model.\nWhile defence mechanisms have been explored, these are exclusively focused on\nEnglish, leaving other languages potentially exposed to attacks. This work\nexplores LLM security through multilingual embedding inversion. We define the\nproblem of black-box multilingual and cross-lingual inversion attacks, and\nexplore their potential implications. Our findings suggest that multilingual\nLLMs may be more vulnerable to inversion attacks, in part because English-based\ndefences may be ineffective. To alleviate this, we propose a simple masking\ndefense effective for both monolingual and multilingual models. This study is\nthe first to investigate multilingual inversion attacks, shedding light on the\ndifferences in attacks and defenses across monolingual and multilingual\nsettings.\n","authors":["Yiyi Chen","Heather Lent","Johannes Bjerva"],"pdf_url":"https://arxiv.org/pdf/2401.12192v4.pdf","comment":"18 pages, 17 Tables, 6 Figures"},{"id":"http://arxiv.org/abs/2406.03125v1","updated":"2024-06-05T10:20:10Z","published":"2024-06-05T10:20:10Z","title":"Space Decomposition for Sentence Embedding","summary":"  Determining sentence pair similarity is crucial for various NLP tasks. A\ncommon technique to address this is typically evaluated on a continuous\nsemantic textual similarity scale from 0 to 5. However, based on a linguistic\nobservation in STS annotation guidelines, we found that the score in the range\n[4,5] indicates an upper-range sample, while the rest are lower-range samples.\nThis necessitates a new approach to treating the upper-range and lower-range\nclasses separately. In this paper, we introduce a novel embedding space\ndecomposition method called MixSP utilizing a Mixture of Specialized\nProjectors, designed to distinguish and rank upper-range and lower-range\nsamples accurately. The experimental results demonstrate that MixSP decreased\nthe overlap representation between upper-range and lower-range classes\nsignificantly while outperforming competitors on STS and zero-shot benchmarks.\n","authors":["Wuttikorn Ponwitayarat","Peerat Limkonchotiwat","Ekapol Chuangsuwanich","Sarana Nutanong"],"pdf_url":"https://arxiv.org/pdf/2406.03125v1.pdf","comment":"ACL Finding 2024. The code and pre-trained models are available at\n  https://github.com/KornWtp/MixSP"},{"id":"http://arxiv.org/abs/2212.10197v2","updated":"2024-06-05T10:18:14Z","published":"2022-12-20T12:16:46Z","title":"EIT: Enhanced Interactive Transformer","summary":"  Two principles: the complementary principle and the consensus principle are\nwidely acknowledged in the literature of multi-view learning. However, the\ncurrent design of multi-head self-attention, an instance of multi-view\nlearning, prioritizes the complementarity while ignoring the consensus. To\naddress this problem, we propose an enhanced multi-head self-attention (EMHA).\nFirst, to satisfy the complementary principle, EMHA removes the one-to-one\nmapping constraint among queries and keys in multiple subspaces and allows each\nquery to attend to multiple keys. On top of that, we develop a method to fully\nencourage consensus among heads by introducing two interaction models, namely\ninner-subspace interaction and cross-subspace interaction. Extensive\nexperiments on a wide range of language tasks (e.g., machine translation,\nabstractive summarization and grammar correction, language modeling), show its\nsuperiority, with a very modest increase in model size. Our code would be\navailable at: https://github.com/zhengkid/EIT-Enhanced-Interactive-Transformer.\n","authors":["Tong Zheng","Bei Li","Huiwen Bao","Tong Xiao","Jingbo Zhu"],"pdf_url":"https://arxiv.org/pdf/2212.10197v2.pdf","comment":"Accepted by ACL2024 Main"},{"id":"http://arxiv.org/abs/2402.16786v2","updated":"2024-06-05T10:17:53Z","published":"2024-02-26T18:00:49Z","title":"Political Compass or Spinning Arrow? Towards More Meaningful Evaluations\n  for Values and Opinions in Large Language Models","summary":"  Much recent work seeks to evaluate values and opinions in large language\nmodels (LLMs) using multiple-choice surveys and questionnaires. Most of this\nwork is motivated by concerns around real-world LLM applications. For example,\npolitically-biased LLMs may subtly influence society when they are used by\nmillions of people. Such real-world concerns, however, stand in stark contrast\nto the artificiality of current evaluations: real users do not typically ask\nLLMs survey questions. Motivated by this discrepancy, we challenge the\nprevailing constrained evaluation paradigm for values and opinions in LLMs and\nexplore more realistic unconstrained evaluations. As a case study, we focus on\nthe popular Political Compass Test (PCT). In a systematic review, we find that\nmost prior work using the PCT forces models to comply with the PCT's\nmultiple-choice format. We show that models give substantively different\nanswers when not forced; that answers change depending on how models are\nforced; and that answers lack paraphrase robustness. Then, we demonstrate that\nmodels give different answers yet again in a more realistic open-ended answer\nsetting. We distill these findings into recommendations and open challenges in\nevaluating values and opinions in LLMs.\n","authors":["Paul Röttger","Valentin Hofmann","Valentina Pyatkin","Musashi Hinck","Hannah Rose Kirk","Hinrich Schütze","Dirk Hovy"],"pdf_url":"https://arxiv.org/pdf/2402.16786v2.pdf","comment":"Accepted at ACL 2024 (Main Conference)"},{"id":"http://arxiv.org/abs/2312.14187v4","updated":"2024-06-05T10:06:45Z","published":"2023-12-20T09:02:29Z","title":"WaveCoder: Widespread And Versatile Enhancement For Code Large Language\n  Models By Instruction Tuning","summary":"  Recent work demonstrates that, after being fine-tuned on a high-quality\ninstruction dataset, the resulting model can obtain impressive capabilities to\naddress a wide range of tasks. However, existing methods for instruction data\ngeneration often produce duplicate data and are not controllable enough on data\nquality. In this paper, we extend the generalization of instruction tuning by\nclassifying the instruction data to 4 code-related tasks and propose a\nLLM-based Generator-Discriminator data process framework to generate diverse,\nhigh-quality instruction data from open source code. Hence, we introduce\nCodeOcean, a dataset comprising 20,000 instruction instances across 4 universal\ncode-related tasks,which is aimed at augmenting the effectiveness of\ninstruction tuning and improving the generalization ability of fine-tuned\nmodel. Subsequently, we present WaveCoder, a fine-tuned Code LLM with\nWidespread And Versatile Enhanced instruction tuning. This model is\nspecifically designed for enhancing instruction tuning of Code Language Models\n(LLMs). Our experiments demonstrate that Wavecoder models outperform other\nopen-source models in terms of generalization ability across different\ncode-related tasks at the same level of fine-tuning scale. Moreover, Wavecoder\nexhibits high efficiency in previous code generation tasks. This paper thus\noffers a significant contribution to the field of instruction data generation\nand fine-tuning models, providing new insights and tools for enhancing\nperformance in code-related tasks.\n","authors":["Zhaojian Yu","Xin Zhang","Ning Shang","Yangyu Huang","Can Xu","Yishujie Zhao","Wenxiang Hu","Qiufeng Yin"],"pdf_url":"https://arxiv.org/pdf/2312.14187v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.18721v2","updated":"2024-06-05T09:59:21Z","published":"2024-05-29T03:05:59Z","title":"Correctable Landmark Discovery via Large Models for Vision-Language\n  Navigation","summary":"  Vision-Language Navigation (VLN) requires the agent to follow language\ninstructions to reach a target position. A key factor for successful navigation\nis to align the landmarks implied in the instruction with diverse visual\nobservations. However, previous VLN agents fail to perform accurate modality\nalignment especially in unexplored scenes, since they learn from limited\nnavigation data and lack sufficient open-world alignment knowledge. In this\nwork, we propose a new VLN paradigm, called COrrectable LaNdmark DiScOvery via\nLarge ModEls (CONSOLE). In CONSOLE, we cast VLN as an open-world sequential\nlandmark discovery problem, by introducing a novel correctable landmark\ndiscovery scheme based on two large models ChatGPT and CLIP. Specifically, we\nuse ChatGPT to provide rich open-world landmark cooccurrence commonsense, and\nconduct CLIP-driven landmark discovery based on these commonsense priors. To\nmitigate the noise in the priors due to the lack of visual constraints, we\nintroduce a learnable cooccurrence scoring module, which corrects the\nimportance of each cooccurrence according to actual observations for accurate\nlandmark discovery. We further design an observation enhancement strategy for\nan elegant combination of our framework with different VLN agents, where we\nutilize the corrected landmark features to obtain enhanced observation features\nfor action decision. Extensive experimental results on multiple popular VLN\nbenchmarks (R2R, REVERIE, R4R, RxR) show the significant superiority of CONSOLE\nover strong baselines. Especially, our CONSOLE establishes the new\nstate-of-the-art results on R2R and R4R in unseen scenarios. Code is available\nat https://github.com/expectorlin/CONSOLE.\n","authors":["Bingqian Lin","Yunshuang Nie","Ziming Wei","Yi Zhu","Hang Xu","Shikui Ma","Jianzhuang Liu","Xiaodan Liang"],"pdf_url":"https://arxiv.org/pdf/2405.18721v2.pdf","comment":"Accepted by TPAMI 2024"},{"id":"http://arxiv.org/abs/2405.20835v3","updated":"2024-06-05T09:53:18Z","published":"2024-05-31T14:24:33Z","title":"Outliers and Calibration Sets have Diminishing Effect on Quantization of\n  Modern LLMs","summary":"  Post-Training Quantization (PTQ) enhances the efficiency of Large Language\nModels (LLMs) by enabling faster operation and compatibility with more\naccessible hardware through reduced memory usage, at the cost of small\nperformance drops. We explore the role of calibration sets in PTQ, specifically\ntheir effect on hidden activations in various notable open-source LLMs.\nCalibration sets are crucial for evaluating activation magnitudes and\nidentifying outliers, which can distort the quantization range and negatively\nimpact performance. Our analysis reveals a marked contrast in quantization\neffectiveness across models. The older OPT model, upon which much of the\nquantization literature is based, shows significant performance deterioration\nand high susceptibility to outliers with varying calibration sets. In contrast,\nnewer models like Llama-2 7B, Llama-3 8B, Command-R 35B, and Mistral 7B\ndemonstrate strong robustness, with Mistral 7B showing near-immunity to\noutliers and stable activations. These findings suggest a shift in PTQ\nstrategies might be needed. As advancements in pre-training methods reduce the\nrelevance of outliers, there is an emerging need to reassess the fundamentals\nof current quantization literature. The emphasis should pivot towards\noptimizing inference speed, rather than primarily focusing on outlier\npreservation, to align with the evolving characteristics of state-of-the-art\nLLMs.\n","authors":["Davide Paglieri","Saurabh Dash","Tim Rocktäschel","Jack Parker-Holder"],"pdf_url":"https://arxiv.org/pdf/2405.20835v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.06635v5","updated":"2024-06-05T09:47:27Z","published":"2023-12-11T18:51:59Z","title":"Gated Linear Attention Transformers with Hardware-Efficient Training","summary":"  Transformers with linear attention allow for efficient parallel training but\ncan simultaneously be formulated as an RNN with 2D (matrix-valued) hidden\nstates, thus enjoying linear-time inference complexity. However, linear\nattention generally underperforms ordinary softmax attention. Moreover, current\nimplementations of linear attention lack I/O-awareness and are thus slower than\nhighly optimized implementations of softmax attention. This work describes a\nhardware-efficient algorithm for linear attention that trades off memory\nmovement against parallelizability. The resulting implementation, dubbed\nFLASHLINEARATTENTION, is faster than FLASHATTENTION-2 (Dao, 2023) as a\nstandalone layer even on short sequence lengths (e.g., 1K). We then generalize\nthis algorithm to a more expressive variant of linear attention with\ndata-dependent gates. When used as a replacement for the standard attention\nlayer in Transformers, the resulting gated linear attention (GLA) Transformer\nis found to perform competitively against the LLaMA-architecture Transformer\n(Touvron et al., 2023) as well recent linear-time-inference baselines such as\nRetNet (Sun et al., 2023a) and Mamba (Gu & Dao, 2023) on moderate-scale\nlanguage modeling experiments. GLA Transformer is especially effective at\nlength generalization, enabling a model trained on 2K to generalize to\nsequences longer than 20K without significant perplexity degradations. For\ntraining speed, the GLA Transformer has higher throughput than a\nsimilarly-sized Mamba model.\n","authors":["Songlin Yang","Bailin Wang","Yikang Shen","Rameswar Panda","Yoon Kim"],"pdf_url":"https://arxiv.org/pdf/2312.06635v5.pdf","comment":"ICML cameray ready"},{"id":"http://arxiv.org/abs/2406.03092v1","updated":"2024-06-05T09:31:37Z","published":"2024-06-05T09:31:37Z","title":"FragRel: Exploiting Fragment-level Relations in the External Memory of\n  Large Language Models","summary":"  To process contexts with unlimited length using Large Language Models (LLMs),\nrecent studies explore hierarchically managing the long text. Only several text\nfragments are taken from the external memory and passed into the temporary\nworking memory, i.e., LLM's context window. However, existing approaches\nisolatedly handle the text fragments without considering their structural\nconnections, thereby suffering limited capability on texts with intensive\ninter-relations, e.g., coherent stories and code repositories. This work\nattempts to resolve this by exploiting the fragment-level relations in external\nmemory. First, we formulate the fragment-level relations and present several\ninstantiations for different text types. Next, we introduce a relation-aware\nfragment assessment criteria upon previous independent fragment assessment.\nFinally, we present the fragment-connected Hierarchical Memory based LLM. We\nvalidate the benefits of involving these relations on long story understanding,\nrepository-level code generation, and long-term chatting.\n","authors":["Xihang Yue","Linchao Zhu","Yi Yang"],"pdf_url":"https://arxiv.org/pdf/2406.03092v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.10207v5","updated":"2024-06-05T09:25:41Z","published":"2024-02-15T18:58:31Z","title":"Rewards-in-Context: Multi-objective Alignment of Foundation Models with\n  Dynamic Preference Adjustment","summary":"  We consider the problem of multi-objective alignment of foundation models\nwith human preferences, which is a critical step towards helpful and harmless\nAI systems. However, it is generally costly and unstable to fine-tune large\nfoundation models using reinforcement learning (RL), and the\nmulti-dimensionality, heterogeneity, and conflicting nature of human\npreferences further complicate the alignment process. In this paper, we\nintroduce Rewards-in-Context (RiC), which conditions the response of a\nfoundation model on multiple rewards in its prompt context and applies\nsupervised fine-tuning for alignment. The salient features of RiC are\nsimplicity and adaptivity, as it only requires supervised fine-tuning of a\nsingle foundation model and supports dynamic adjustment for user preferences\nduring inference time. Inspired by the analytical solution of an abstracted\nconvex optimization problem, our dynamic inference-time adjustment method\napproaches the Pareto-optimal solution for multiple objectives. Empirical\nevidence demonstrates the efficacy of our method in aligning both Large\nLanguage Models (LLMs) and diffusion models to accommodate diverse rewards with\nonly around 10% GPU hours compared with multi-objective RL baseline.\n","authors":["Rui Yang","Xiaoman Pan","Feng Luo","Shuang Qiu","Han Zhong","Dong Yu","Jianshu Chen"],"pdf_url":"https://arxiv.org/pdf/2402.10207v5.pdf","comment":"Accepted by ICML 2024"},{"id":"http://arxiv.org/abs/2401.06469v3","updated":"2024-06-05T09:13:17Z","published":"2024-01-12T09:31:17Z","title":"Batch-ICL: Effective, Efficient, and Order-Agnostic In-Context Learning","summary":"  In this paper, by treating in-context learning (ICL) as a meta-optimization\nprocess, we explain why LLMs are sensitive to the order of ICL examples. This\nunderstanding leads us to the development of Batch-ICL, an effective,\nefficient, and order-agnostic inference algorithm for ICL. Differing from the\nstandard N-shot learning approach, Batch-ICL employs $N$ separate 1-shot\nforward computations and aggregates the resulting meta-gradients. These\naggregated meta-gradients are then applied to the forward computation of a\nzero-shot query to generate the final prediction. This batch processing\napproach renders the LLM agnostic to the order of ICL examples. Through\nextensive experiments and analysis, we demonstrate that Batch-ICL consistently\noutperforms most permutations of ICL examples. In some cases, it even exceeds\nthe performance of the best order for standard ICL, all while reducing the\ncomputational resources required. Furthermore, we develop a novel variant of\nBatch-ICL featuring multiple \"epochs\" of meta-optimization. This variant\nimplicitly explores permutations of ICL examples, further enhancing ICL\nperformance.\n","authors":["Kaiyi Zhang","Ang Lv","Yuhan Chen","Hansen Ha","Tao Xu","Rui Yan"],"pdf_url":"https://arxiv.org/pdf/2401.06469v3.pdf","comment":"This paper has been accepted by ACL 2024 (Findings)"},{"id":"http://arxiv.org/abs/2406.03079v1","updated":"2024-06-05T09:09:32Z","published":"2024-06-05T09:09:32Z","title":"Cryptocurrency Frauds for Dummies: How ChatGPT introduces us to fraud?","summary":"  Recent advances in the field of large language models (LLMs), particularly\nthe ChatGPT family, have given rise to a powerful and versatile machine\ninterlocutor, packed with knowledge and challenging our understanding of\nlearning. This interlocutor is a double-edged sword: it can be harnessed for a\nwide variety of beneficial tasks, but it can also be used to cause harm. This\nstudy explores the complicated interaction between ChatGPT and the growing\nproblem of cryptocurrency fraud. Although ChatGPT is known for its adaptability\nand ethical considerations when used for harmful purposes, we highlight the\ndeep connection that may exist between ChatGPT and fraudulent actions in the\nvolatile cryptocurrency ecosystem. Based on our categorization of\ncryptocurrency frauds, we show how to influence outputs, bypass ethical terms,\nand achieve specific fraud goals by manipulating ChatGPT prompts. Furthermore,\nour findings emphasize the importance of realizing that ChatGPT could be a\nvaluable instructor even for novice fraudsters, as well as understanding and\nsafely deploying complex language models, particularly in the context of\ncryptocurrency frauds. Finally, our study underlines the importance of using\nLLMs responsibly and ethically in the digital currency sector, identifying\npotential risks and resolving ethical issues. It should be noted that our work\nis not intended to encourage and promote fraud, but rather to raise awareness\nof the risks of fraud associated with the use of ChatGPT.\n","authors":["Wail Zellagui","Abdessamad Imine","Yamina Tadjeddine"],"pdf_url":"https://arxiv.org/pdf/2406.03079v1.pdf","comment":"To be published in ACM journal \"Digital Government: Research and\n  Practice\""},{"id":"http://arxiv.org/abs/2405.12532v2","updated":"2024-06-05T09:01:24Z","published":"2024-05-21T06:46:37Z","title":"PyramidInfer: Pyramid KV Cache Compression for High-throughput LLM\n  Inference","summary":"  Large Language Models (LLMs) have shown remarkable comprehension abilities\nbut face challenges in GPU memory usage during inference, hindering their\nscalability for real-time applications like chatbots. To accelerate inference,\nwe store computed keys and values (KV cache) in the GPU memory. Existing\nmethods study the KV cache compression to reduce memory by pruning the\npre-computed KV cache. However, they neglect the inter-layer dependency between\nlayers and huge memory consumption in pre-computation. To explore these\ndeficiencies, we find that the number of crucial keys and values that influence\nfuture generations decreases layer by layer and we can extract them by the\nconsistency in attention weights. Based on the findings, we propose\nPyramidInfer, a method that compresses the KV cache by layer-wise retaining\ncrucial context. PyramidInfer saves significant memory by computing fewer keys\nand values without sacrificing performance. Experimental results show\nPyramidInfer improves 2.2x throughput compared to Accelerate with over 54% GPU\nmemory reduction in KV cache.\n","authors":["Dongjie Yang","XiaoDong Han","Yan Gao","Yao Hu","Shilin Zhang","Hai Zhao"],"pdf_url":"https://arxiv.org/pdf/2405.12532v2.pdf","comment":"Accepted by ACL 2024"},{"id":"http://arxiv.org/abs/2406.03075v1","updated":"2024-06-05T08:59:45Z","published":"2024-06-05T08:59:45Z","title":"Towards Detecting LLMs Hallucination via Markov Chain-based Multi-agent\n  Debate Framework","summary":"  The advent of large language models (LLMs) has facilitated the development of\nnatural language text generation. It also poses unprecedented challenges, with\ncontent hallucination emerging as a significant concern. Existing solutions\noften involve expensive and complex interventions during the training process.\nMoreover, some approaches emphasize problem disassembly while neglecting the\ncrucial validation process, leading to performance degradation or limited\napplications. To overcome these limitations, we propose a Markov Chain-based\nmulti-agent debate verification framework to enhance hallucination detection\naccuracy in concise claims. Our method integrates the fact-checking process,\nincluding claim detection, evidence retrieval, and multi-agent verification. In\nthe verification stage, we deploy multiple agents through flexible Markov\nChain-based debates to validate individual claims, ensuring meticulous\nverification outcomes. Experimental results across three generative tasks\ndemonstrate that our approach achieves significant improvements over baselines.\n","authors":["Xiaoxi Sun","Jinpeng Li","Yan Zhong","Dongyan Zhao","Rui Yan"],"pdf_url":"https://arxiv.org/pdf/2406.03075v1.pdf","comment":"18 pages, 3 figures"},{"id":"http://arxiv.org/abs/2406.03068v1","updated":"2024-06-05T08:51:08Z","published":"2024-06-05T08:51:08Z","title":"How Truncating Weights Improves Reasoning in Language Models","summary":"  In addition to the ability to generate fluent text in various languages,\nlarge language models have been successful at tasks that involve basic forms of\nlogical \"reasoning\" over their context. Recent work found that selectively\nremoving certain components from weight matrices in pre-trained models can\nimprove such reasoning capabilities. We investigate this phenomenon further by\ncarefully studying how certain global associations tend to be stored in\nspecific weight components or Transformer blocks, in particular feed-forward\nlayers. Such associations may hurt predictions in reasoning tasks, and removing\nthe corresponding components may then improve performance. We analyze how this\narises during training, both empirically and theoretically, on a two-layer\nTransformer trained on a basic reasoning task with noise, a toy associative\nmemory model, and on the Pythia family of pre-trained models tested on simple\nreasoning tasks.\n","authors":["Lei Chen","Joan Bruna","Alberto Bietti"],"pdf_url":"https://arxiv.org/pdf/2406.03068v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.03062v1","updated":"2024-06-05T08:43:11Z","published":"2024-06-05T08:43:11Z","title":"RadBARTsum: Domain Specific Adaption of Denoising Sequence-to-Sequence\n  Models for Abstractive Radiology Report Summarization","summary":"  Radiology report summarization is a crucial task that can help doctors\nquickly identify clinically significant findings without the need to review\ndetailed sections of reports. This study proposes RadBARTsum, a domain-specific\nand ontology facilitated adaptation of the BART model for abstractive radiology\nreport summarization. The approach involves two main steps: 1) re-training the\nBART model on a large corpus of radiology reports using a novel entity masking\nstrategy to improving biomedical domain knowledge learning, and 2) fine-tuning\nthe model for the summarization task using the Findings and Background sections\nto predict the Impression section. Experiments are conducted using different\nmasking strategies. Results show that the re-training process with domain\nknowledge facilitated masking improves performances consistently across various\nsettings. This work contributes a domain-specific generative language model for\nradiology report summarization and a method for utilising medical knowledge to\nrealise entity masking language model. The proposed approach demonstrates a\npromising direction of enhancing the efficiency of language models by deepening\nits understanding of clinical knowledge in radiology reports.\n","authors":["Jinge Wu","Abul Hasan","Honghan Wu"],"pdf_url":"https://arxiv.org/pdf/2406.03062v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.03049v1","updated":"2024-06-05T08:24:22Z","published":"2024-06-05T08:24:22Z","title":"StreamSpeech: Simultaneous Speech-to-Speech Translation with Multi-task\n  Learning","summary":"  Simultaneous speech-to-speech translation (Simul-S2ST, a.k.a streaming speech\ntranslation) outputs target speech while receiving streaming speech inputs,\nwhich is critical for real-time communication. Beyond accomplishing translation\nbetween speech, Simul-S2ST requires a policy to control the model to generate\ncorresponding target speech at the opportune moment within speech inputs,\nthereby posing a double challenge of translation and policy. In this paper, we\npropose StreamSpeech, a direct Simul-S2ST model that jointly learns translation\nand simultaneous policy in a unified framework of multi-task learning. Adhering\nto a multi-task learning approach, StreamSpeech can perform offline and\nsimultaneous speech recognition, speech translation and speech synthesis via an\n\"All-in-One\" seamless model. Experiments on CVSS benchmark demonstrate that\nStreamSpeech achieves state-of-the-art performance in both offline S2ST and\nSimul-S2ST tasks. Besides, StreamSpeech is able to present high-quality\nintermediate results (i.e., ASR or translation results) during simultaneous\ntranslation process, offering a more comprehensive real-time communication\nexperience.\n","authors":["Shaolei Zhang","Qingkai Fang","Shoutao Guo","Zhengrui Ma","Min Zhang","Yang Feng"],"pdf_url":"https://arxiv.org/pdf/2406.03049v1.pdf","comment":"Accepted to ACL 2024 main conference, Project Page:\n  https://ictnlp.github.io/StreamSpeech-site/"},{"id":"http://arxiv.org/abs/2310.01041v2","updated":"2024-06-05T08:23:11Z","published":"2023-10-02T09:35:27Z","title":"Language Model Decoding as Direct Metrics Optimization","summary":"  Despite the remarkable advances in language modeling, current mainstream\ndecoding methods still struggle to generate texts that align with human texts\nacross different aspects. In particular, sampling-based methods produce\nless-repetitive texts which are often disjunctive in discourse, while\nsearch-based methods maintain topic coherence at the cost of increased\nrepetition. Overall, these methods fall short in achieving holistic alignment\nacross a broad range of aspects. In this work, we frame decoding from a\nlanguage model as an optimization problem with the goal of strictly matching\nthe expected performance with human texts measured by multiple metrics of\ndesired aspects simultaneously. The resulting decoding distribution enjoys an\nanalytical solution that scales the input language model distribution via a\nsequence-level energy function defined by these metrics. And most importantly,\nwe prove that this induced distribution is guaranteed to improve the perplexity\non human texts, which suggests a better approximation to the underlying\ndistribution of human texts. To facilitate tractable sampling from this\nglobally normalized distribution, we adopt the Sampling-Importance-Resampling\ntechnique. Experiments on various domains and model scales demonstrate the\nsuperiority of our method in metrics alignment with human texts and human\nevaluation over strong baselines.\n","authors":["Haozhe Ji","Pei Ke","Hongning Wang","Minlie Huang"],"pdf_url":"https://arxiv.org/pdf/2310.01041v2.pdf","comment":"33 pages, 3 figures"},{"id":"http://arxiv.org/abs/2405.15362v2","updated":"2024-06-05T08:19:02Z","published":"2024-05-24T08:54:36Z","title":"Pipeline Parallelism with Controllable Memory","summary":"  Pipeline parallelism has been widely explored, but most existing schedules\nlack a systematic methodology. In this paper, we propose a framework to\ndecompose pipeline schedules as repeating a building block and we show that the\nlifespan of the building block decides the peak activation memory of the\npipeline schedule. Guided by the observations, we find that almost all existing\npipeline schedules, to the best of our knowledge, are memory inefficient. To\naddress this, we introduce a family of memory efficient building blocks with\ncontrollable activation memory, which can reduce the peak activation memory to\n1/2 of 1F1B without sacrificing efficiency, and even to 1/3 with comparable\nthroughput. We can also achieve almost zero pipeline bubbles while maintaining\nthe same activation memory as 1F1B. Our evaluations demonstrate that in pure\npipeline parallelism settings, our methods outperform 1F1B by from 7% to 55% in\nterms of throughput. When employing a grid search over hybrid parallelism\nhyperparameters in practical scenarios, our proposed methods demonstrate a 16%\nthroughput improvement over the 1F1B baseline for large language models.\n","authors":["Penghui Qi","Xinyi Wan","Nyamdavaa Amar","Min Lin"],"pdf_url":"https://arxiv.org/pdf/2405.15362v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.00856v4","updated":"2024-06-05T08:15:12Z","published":"2024-02-01T18:51:54Z","title":"Towards Efficient Exact Optimization of Language Model Alignment","summary":"  The alignment of language models with human preferences is vital for their\napplication in real-world tasks. The problem is formulated as optimizing the\nmodel's policy to maximize the expected reward that reflects human preferences\nwith minimal deviation from the initial policy. While considered as a\nstraightforward solution, reinforcement learning (RL) suffers from high\nvariance in policy updates, which impedes efficient policy improvement.\nRecently, direct preference optimization (DPO) was proposed to directly\noptimize the policy from preference data. However, we show that DPO derived\nbased on the optimal solution of the problem leads to a compromised\nmean-seeking approximation of the optimal solution in practice. In this paper,\nwe propose efficient exact optimization (EXO) of the alignment objective. EXO\nis guaranteed to optimize in the same direction as RL algorithms asymptotically\nfor arbitrary policy parametrization. This leads to the same mode-seeking\nsolution, while enables efficient optimization by circumventing the\ncomplexities of RL. We also compare our method to DPO with both theoretical and\nempirical analyses, and further demonstrate the advantages of our method over\nexisting approaches on realistic human preference data. Code is available at\nhttps://github.com/haozheji/exact-optimization.\n","authors":["Haozhe Ji","Cheng Lu","Yilin Niu","Pei Ke","Hongning Wang","Jun Zhu","Jie Tang","Minlie Huang"],"pdf_url":"https://arxiv.org/pdf/2402.00856v4.pdf","comment":"24 pages, 9 figures"},{"id":"http://arxiv.org/abs/2406.03030v1","updated":"2024-06-05T07:57:17Z","published":"2024-06-05T07:57:17Z","title":"From Tarzan to Tolkien: Controlling the Language Proficiency Level of\n  LLMs for Content Generation","summary":"  We study the problem of controlling the difficulty level of text generated by\nLarge Language Models (LLMs) for contexts where end-users are not fully\nproficient, such as language learners. Using a novel framework, we evaluate the\neffectiveness of several key approaches for this task, including few-shot\nprompting, supervised finetuning, and reinforcement learning (RL), utilising\nboth GPT-4 and open source alternatives like LLama2-7B and Mistral-7B.\n  Our findings reveal a large performance gap between GPT-4 and the open source\nmodels when using prompt-based strategies. However, we show how to bridge this\ngap with a careful combination of finetuning and RL alignment. Our best model,\nCALM (CEFR-Aligned Language Model), surpasses the performance of GPT-4 and\nother strategies, at only a fraction of the cost. We further validate the\nquality of our results through a small-scale human study.\n","authors":["Ali Malik","Stephen Mayhew","Chris Piech","Klinton Bicknell"],"pdf_url":"https://arxiv.org/pdf/2406.03030v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.02050v2","updated":"2024-06-05T07:56:11Z","published":"2024-06-04T07:31:06Z","title":"Analyzing Social Biases in Japanese Large Language Models","summary":"  With the development of Large Language Models (LLMs), social biases in the\nLLMs have become a crucial issue. While various benchmarks for social biases\nhave been provided across languages, the extent to which Japanese LLMs exhibit\nsocial biases has not been fully investigated. In this study, we construct the\nJapanese Bias Benchmark dataset for Question Answering (JBBQ) based on the\nEnglish bias benchmark BBQ, and analyze social biases in Japanese LLMs. The\nresults show that while current Japanese LLMs improve their accuracies on JBBQ\nby instruction-tuning, their bias scores become larger. In addition, augmenting\ntheir prompts with warning about social biases reduces the effect of biases in\nsome models.\n","authors":["Hitomi Yanaka","Namgi Han","Ryoma Kumon","Jie Lu","Masashi Takeshita","Ryo Sekizawa","Taisei Kato","Hiromi Arai"],"pdf_url":"https://arxiv.org/pdf/2406.02050v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.10987v2","updated":"2024-06-05T07:44:30Z","published":"2024-02-16T05:29:59Z","title":"WilKE: Wise-Layer Knowledge Editor for Lifelong Knowledge Editing","summary":"  Knowledge editing aims to rectify inaccuracies in large language models\n(LLMs) without costly retraining for outdated or erroneous knowledge. However,\ncurrent knowledge editing methods primarily focus on single editing, failing to\nmeet the requirements for lifelong editing. This study reveals a performance\ndegradation encountered by knowledge editing in lifelong editing, characterized\nby toxicity buildup and toxicity flash, with the primary cause identified as\npattern unmatch. We introduce a knowledge editing approach named Wise-Layer\nKnowledge Editor (WilKE), which selects editing layer based on the pattern\nmatching degree of editing knowledge across different layers in language\nmodels. Experimental results demonstrate that, in lifelong editing, WilKE\nexhibits an average improvement of 46.2% and 67.8% on editing GPT2-XL and GPT-J\nrelative to state-of-the-art knowledge editing methods.\n","authors":["Chenhui Hu","Pengfei Cao","Yubo Chen","Kang Liu","Jun Zhao"],"pdf_url":"https://arxiv.org/pdf/2402.10987v2.pdf","comment":"To be published in ACL Findings 2024"},{"id":"http://arxiv.org/abs/2303.13809v4","updated":"2024-06-05T07:40:54Z","published":"2023-03-24T05:05:03Z","title":"Error Analysis Prompting Enables Human-Like Translation Evaluation in\n  Large Language Models","summary":"  Generative large language models (LLMs), e.g., ChatGPT, have demonstrated\nremarkable proficiency across several NLP tasks, such as machine translation,\ntext summarization. Recent research (Kocmi and Federmann, 2023) has shown that\nutilizing LLMs for assessing the quality of machine translation (MT) achieves\nstate-of-the-art performance at the system level but \\textit{performs poorly at\nthe segment level}. To further improve the performance of LLMs on MT quality\nassessment, we investigate several prompting designs, and propose a new\nprompting method called \\textbf{\\texttt{Error Analysis Prompting}} (EAPrompt)\nby combining Chain-of-Thoughts (Wei et al., 2022) and Error Analysis (Lu et\nal., 2023). This technique emulates the commonly accepted human evaluation\nframework - Multidimensional Quality Metrics (MQM, Freitag et al. (2021)) and\n\\textit{produces explainable and reliable MT evaluations at both the system and\nsegment level}. Experimental Results from the WMT22 metrics shared task\nvalidate the effectiveness of EAPrompt on various LLMs, with different\nstructures. Further analysis confirms that EAPrompt effectively distinguishes\nmajor errors from minor ones, while also sharing a similar distribution of the\nnumber of errors with MQM. These findings highlight the potential of EAPrompt\nas a human-like evaluator prompting technique for MT evaluation.\n","authors":["Qingyu Lu","Baopu Qiu","Liang Ding","Kanjian Zhang","Tom Kocmi","Dacheng Tao"],"pdf_url":"https://arxiv.org/pdf/2303.13809v4.pdf","comment":"Findings of ACL 2024"},{"id":"http://arxiv.org/abs/2406.01931v2","updated":"2024-06-05T07:21:19Z","published":"2024-06-04T03:31:09Z","title":"Dishonesty in Helpful and Harmless Alignment","summary":"  People tell lies when seeking rewards. Large language models (LLMs) are\naligned to human values with reinforcement learning where they get rewards if\nthey satisfy human preference. We find that this also induces dishonesty in\nhelpful and harmless alignment where LLMs tell lies in generating harmless\nresponses. Using the latest interpreting tools, we detect dishonesty, show how\nLLMs can be harmful if their honesty is increased, and analyze such conflicts\nat the parameter-level. Given these preliminaries and the hypothesis that\nreward-seeking stimulates dishonesty, we theoretically show that the dishonesty\ncan in-turn decrease the alignment performances and augment reward-seeking\nalignment with representation regularization. Extensive results, including\nGPT-4 annotated win-rates, perplexities, and cases studies demonstrate that we\ncan train more honest, helpful, and harmless LLMs. We will make all our codes\nand results be open-sourced upon this paper's acceptance.\n","authors":["Youcheng Huang","Jingkun Tang","Duanyu Feng","Zheng Zhang","Wenqiang Lei","Jiancheng Lv","Anthony G. Cohn"],"pdf_url":"https://arxiv.org/pdf/2406.01931v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.03009v1","updated":"2024-06-05T07:16:51Z","published":"2024-06-05T07:16:51Z","title":"Unveiling Selection Biases: Exploring Order and Token Sensitivity in\n  Large Language Models","summary":"  In this paper, we investigate the phenomena of \"selection biases\" in Large\nLanguage Models (LLMs), focusing on problems where models are tasked with\nchoosing the optimal option from an ordered sequence. We delve into biases\nrelated to option order and token usage, which significantly impact LLMs'\ndecision-making processes. We also quantify the impact of these biases through\nan extensive empirical analysis across multiple models and tasks. Furthermore,\nwe propose mitigation strategies to enhance model performance. Our key\ncontributions are threefold: 1) Precisely quantifying the influence of option\norder and token on LLMs, 2) Developing strategies to mitigate the impact of\ntoken and order sensitivity to enhance robustness, and 3) Offering a detailed\nanalysis of sensitivity across models and tasks, which informs the creation of\nmore stable and reliable LLM applications for selection problems.\n","authors":["Sheng-Lun Wei","Cheng-Kuang Wu","Hen-Hsen Huang","Hsin-Hsi Chen"],"pdf_url":"https://arxiv.org/pdf/2406.03009v1.pdf","comment":"Accepted as a long findings paper at ACL 2024"},{"id":"http://arxiv.org/abs/2406.03008v1","updated":"2024-06-05T07:14:44Z","published":"2024-06-05T07:14:44Z","title":"DriVLMe: Enhancing LLM-based Autonomous Driving Agents with Embodied and\n  Social Experiences","summary":"  Recent advancements in foundation models (FMs) have unlocked new prospects in\nautonomous driving, yet the experimental settings of these studies are\npreliminary, over-simplified, and fail to capture the complexity of real-world\ndriving scenarios in human environments. It remains under-explored whether FM\nagents can handle long-horizon navigation tasks with free-from dialogue and\ndeal with unexpected situations caused by environmental dynamics or task\nchanges. To explore the capabilities and boundaries of FMs faced with the\nchallenges above, we introduce DriVLMe, a video-language-model-based agent to\nfacilitate natural and effective communication between humans and autonomous\nvehicles that perceive the environment and navigate. We develop DriVLMe from\nboth embodied experiences in a simulated environment and social experiences\nfrom real human dialogue. While DriVLMe demonstrates competitive performance in\nboth open-loop benchmarks and closed-loop human studies, we reveal several\nlimitations and challenges, including unacceptable inference time, imbalanced\ntraining data, limited visual understanding, challenges with multi-turn\ninteractions, simplified language generation from robotic experiences, and\ndifficulties in handling on-the-fly unexpected situations like environmental\ndynamics and task changes.\n","authors":["Yidong Huang","Jacob Sansom","Ziqiao Ma","Felix Gervits","Joyce Chai"],"pdf_url":"https://arxiv.org/pdf/2406.03008v1.pdf","comment":"First Vision and Language for Autonomous Driving and Robotics\n  Workshop (VLADR @ CVPR 2024)"},{"id":"http://arxiv.org/abs/2406.03007v1","updated":"2024-06-05T07:14:28Z","published":"2024-06-05T07:14:28Z","title":"BadAgent: Inserting and Activating Backdoor Attacks in LLM Agents","summary":"  With the prosperity of large language models (LLMs), powerful LLM-based\nintelligent agents have been developed to provide customized services with a\nset of user-defined tools. State-of-the-art methods for constructing LLM agents\nadopt trained LLMs and further fine-tune them on data for the agent task.\nHowever, we show that such methods are vulnerable to our proposed backdoor\nattacks named BadAgent on various agent tasks, where a backdoor can be embedded\nby fine-tuning on the backdoor data. At test time, the attacker can manipulate\nthe deployed LLM agents to execute harmful operations by showing the trigger in\nthe agent input or environment. To our surprise, our proposed attack methods\nare extremely robust even after fine-tuning on trustworthy data. Though\nbackdoor attacks have been studied extensively in natural language processing,\nto the best of our knowledge, we could be the first to study them on LLM agents\nthat are more dangerous due to the permission to use external tools. Our work\ndemonstrates the clear risk of constructing LLM agents based on untrusted LLMs\nor data. Our code is public at https://github.com/DPamK/BadAgent\n","authors":["Yifei Wang","Dizhan Xue","Shengjie Zhang","Shengsheng Qian"],"pdf_url":"https://arxiv.org/pdf/2406.03007v1.pdf","comment":"Accepted by ACL 2024"},{"id":"http://arxiv.org/abs/2401.00139v2","updated":"2024-06-05T07:12:02Z","published":"2023-12-30T04:51:46Z","title":"Is Knowledge All Large Language Models Needed for Causal Reasoning?","summary":"  This paper explores the causal reasoning of large language models (LLMs) to\nenhance their interpretability and reliability in advancing artificial\nintelligence. Despite the proficiency of LLMs in a range of tasks, their\npotential for understanding causality requires further exploration. We propose\na novel causal attribution model that utilizes ``do-operators\" for constructing\ncounterfactual scenarios, allowing us to systematically quantify the influence\nof input numerical data and LLMs' pre-existing knowledge on their causal\nreasoning processes. Our newly developed experimental setup assesses LLMs'\nreliance on contextual information and inherent knowledge across various\ndomains. Our evaluation reveals that LLMs' causal reasoning ability mainly\ndepends on the context and domain-specific knowledge provided. In the absence\nof such knowledge, LLMs can still maintain a degree of causal reasoning using\nthe available numerical data, albeit with limitations in the calculations. This\nmotivates the proposed fine-tuned LLM for pairwise causal discovery,\neffectively leveraging both knowledge and numerical information.\n","authors":["Hengrui Cai","Shengjie Liu","Rui Song"],"pdf_url":"https://arxiv.org/pdf/2401.00139v2.pdf","comment":"A Python implementation of our proposed method is available at\n  https://github.com/ncsulsj/Causal_LLM"},{"id":"http://arxiv.org/abs/2406.03004v1","updated":"2024-06-05T07:11:56Z","published":"2024-06-05T07:11:56Z","title":"Evaluation of data inconsistency for multi-modal sentiment analysis","summary":"  Emotion semantic inconsistency is an ubiquitous challenge in multi-modal\nsentiment analysis (MSA). MSA involves analyzing sentiment expressed across\nvarious modalities like text, audio, and videos. Each modality may convey\ndistinct aspects of sentiment, due to subtle and nuanced expression of human\nbeings, leading to inconsistency, which may hinder the prediction of artificial\nagents. In this work, we introduce a modality conflicting test set and assess\nthe performance of both traditional multi-modal sentiment analysis models and\nmulti-modal large language models (MLLMs). Our findings reveal significant\nperformance degradation across traditional models when confronted with\nsemantically conflicting data and point out the drawbacks of MLLMs when\nhandling multi-modal emotion analysis. Our research presents a new challenge\nand offer valuable insights for the future development of sentiment analysis\nsystems.\n","authors":["Yufei Wang","Mengyue Wu"],"pdf_url":"https://arxiv.org/pdf/2406.03004v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.11911v5","updated":"2024-06-05T06:33:16Z","published":"2024-01-22T12:54:04Z","title":"Blinded by Generated Contexts: How Language Models Merge Generated and\n  Retrieved Contexts When Knowledge Conflicts?","summary":"  While auxiliary information has become a key to enhancing Large Language\nModels (LLMs), relatively little is known about how LLMs merge these contexts,\nspecifically contexts generated by LLMs and those retrieved from external\nsources. To investigate this, we formulate a systematic framework to identify\nwhether LLMs' responses are attributed to either generated or retrieved\ncontexts. To easily trace the origin of the response, we construct datasets\nwith conflicting contexts, i.e., each question is paired with both generated\nand retrieved contexts, yet only one of them contains the correct answer. Our\nexperiments reveal a significant bias in several LLMs (GPT-4/3.5 and Llama2) to\nfavor generated contexts, even when they provide incorrect information. We\nfurther identify two key factors contributing to this bias: i) contexts\ngenerated by LLMs typically show greater similarity to the questions,\nincreasing their likelihood of being selected; ii) the segmentation process\nused in retrieved contexts disrupts their completeness, thereby hindering their\nfull utilization in LLMs. Our analysis enhances the understanding of how LLMs\nmerge diverse contexts, offers valuable insights for advancing current LLM\naugmentation methods, and highlights the risk of generated misinformation for\nretrieval-augmented LLMs.\n","authors":["Hexiang Tan","Fei Sun","Wanli Yang","Yuanzhuo Wang","Qi Cao","Xueqi Cheng"],"pdf_url":"https://arxiv.org/pdf/2401.11911v5.pdf","comment":"Accepted at ACL 2024 Main"},{"id":"http://arxiv.org/abs/2406.02974v1","updated":"2024-06-05T06:15:48Z","published":"2024-06-05T06:15:48Z","title":"Readability-guided Idiom-aware Sentence Simplification (RISS) for\n  Chinese","summary":"  Chinese sentence simplification faces challenges due to the lack of\nlarge-scale labeled parallel corpora and the prevalence of idioms. To address\nthese challenges, we propose Readability-guided Idiom-aware Sentence\nSimplification (RISS), a novel framework that combines data augmentation\ntechniques with lexcial simplification. RISS introduces two key components: (1)\nReadability-guided Paraphrase Selection (RPS), a method for mining high-quality\nsentence pairs, and (2) Idiom-aware Simplification (IAS), a model that enhances\nthe comprehension and simplification of idiomatic expressions. By integrating\nRPS and IAS using multi-stage and multi-task learning strategies, RISS\noutperforms previous state-of-the-art methods on two Chinese sentence\nsimplification datasets. Furthermore, RISS achieves additional improvements\nwhen fine-tuned on a small labeled dataset. Our approach demonstrates the\npotential for more effective and accessible Chinese text simplification.\n","authors":["Jingshen Zhang","Xinglu Chen","Xinying Qiu","Zhimin Wang","Wenhe Feng"],"pdf_url":"https://arxiv.org/pdf/2406.02974v1.pdf","comment":"Accepted to the 23rd China National Conference on Computational\n  Linguistics (CCL 2024)"},{"id":"http://arxiv.org/abs/2402.04247v4","updated":"2024-06-05T06:13:09Z","published":"2024-02-06T18:54:07Z","title":"Prioritizing Safeguarding Over Autonomy: Risks of LLM Agents for Science","summary":"  Intelligent agents powered by large language models (LLMs) have demonstrated\nsubstantial promise in autonomously conducting experiments and facilitating\nscientific discoveries across various disciplines. While their capabilities are\npromising, these agents, called scientific LLM agents, also introduce novel\nvulnerabilities that demand careful consideration for safety. However, there\nexists a notable gap in the literature, as there has been no comprehensive\nexploration of these vulnerabilities. This perspective paper fills this gap by\nconducting a thorough examination of vulnerabilities in LLM-based agents within\nscientific domains, shedding light on potential risks associated with their\nmisuse and emphasizing the need for safety measures. We begin by providing a\ncomprehensive overview of the potential risks inherent to scientific LLM\nagents, taking into account user intent, the specific scientific domain, and\ntheir potential impact on the external environment. Then, we delve into the\norigins of these vulnerabilities and provide a scoping review of the limited\nexisting works. Based on our analysis, we propose a triadic framework involving\nhuman regulation, agent alignment, and an understanding of environmental\nfeedback (agent regulation) to mitigate these identified risks. Furthermore, we\nhighlight the limitations and challenges associated with safeguarding\nscientific agents and advocate for the development of improved models, robust\nbenchmarks, and comprehensive regulations to address these issues effectively.\n","authors":["Xiangru Tang","Qiao Jin","Kunlun Zhu","Tongxin Yuan","Yichi Zhang","Wangchunshu Zhou","Meng Qu","Yilun Zhao","Jian Tang","Zhuosheng Zhang","Arman Cohan","Zhiyong Lu","Mark Gerstein"],"pdf_url":"https://arxiv.org/pdf/2402.04247v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.02969v1","updated":"2024-06-05T05:53:50Z","published":"2024-06-05T05:53:50Z","title":"Filtered not Mixed: Stochastic Filtering-Based Online Gating for Mixture\n  of Large Language Models","summary":"  We propose MoE-F -- a formalised mechanism for combining $N$ pre-trained\nexpert Large Language Models (LLMs) in online time-series prediction tasks by\nadaptively forecasting the best weighting of LLM predictions at every time\nstep. Our mechanism leverages the conditional information in each expert's\nrunning performance to forecast the best combination of LLMs for predicting the\ntime series in its next step. Diverging from static (learned) Mixture of\nExperts (MoE) methods, MoE-F employs time-adaptive stochastic filtering\ntechniques to combine experts. By framing the expert selection problem as a\nfinite state-space, continuous-time Hidden Markov model (HMM), we can leverage\nthe Wohman-Shiryaev filter. Our approach first constructs $N$ parallel filters\ncorresponding to each of the $N$ individual LLMs. Each filter proposes its best\ncombination of LLMs, given the information that they have access to.\nSubsequently, the $N$ filter outputs are aggregated to optimize a lower bound\nfor the loss of the aggregated LLMs, which can be optimized in closed-form,\nthus generating our ensemble predictor. Our contributions here are: (I) the\nMoE-F algorithm -- deployable as a plug-and-play filtering harness, (II)\ntheoretical optimality guarantees of the proposed filtering-based gating\nalgorithm, and (III) empirical evaluation and ablative results using state of\nthe art foundational and MoE LLMs on a real-world Financial Market Movement\ntask where MoE-F attains a remarkable 17% absolute and 48.5% relative F1\nmeasure improvement over the next best performing individual LLM expert.\n","authors":["Raeid Saqur","Anastasis Kratsios","Florian Krach","Yannick Limmer","Jacob-Junqi Tian","John Willes","Blanka Horvath","Frank Rudzicz"],"pdf_url":"https://arxiv.org/pdf/2406.02969v1.pdf","comment":"29 pages, 5 Appendix sections"},{"id":"http://arxiv.org/abs/2403.15226v2","updated":"2024-06-05T05:52:44Z","published":"2024-03-22T14:20:34Z","title":"Not All Attention is Needed: Parameter and Computation Efficient\n  Transfer Learning for Multi-modal Large Language Models","summary":"  In this paper, we propose a novel parameter and computation efficient tuning\nmethod for Multi-modal Large Language Models (MLLMs), termed Efficient\nAttention Skipping (EAS). Concretely, we first reveal that multi-head\nattentions (MHAs), the main computational overhead of MLLMs, are often\nredundant to downstream tasks. Based on this observation, EAS evaluates the\nattention redundancy and skips the less important MHAs to speed up inference.\nBesides, we also propose a novel propagation-of-information adapter (PIA) to\nserve the attention skipping of EAS and keep parameter efficiency, which can be\nfurther re-parameterized into feed-forward networks (FFNs) for zero-extra\nlatency. To validate EAS, we apply it to a recently proposed MLLM called LaVIN\nand a classic VL pre-trained model called METER, and conduct extensive\nexperiments on a set of benchmarks. The experiments show that EAS not only\nretains high performance and parameter efficiency, but also greatly speeds up\ninference speed. For instance, LaVIN-EAS can obtain 89.98\\% accuracy on\nScineceQA while speeding up inference by 2.2 times to LaVIN\n","authors":["Qiong Wu","Weihao Ye","Yiyi Zhou","Xiaoshuai Sun","Rongrong Ji"],"pdf_url":"https://arxiv.org/pdf/2403.15226v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.09627v2","updated":"2024-06-05T05:49:19Z","published":"2023-11-16T07:16:55Z","title":"Mitigating Biases for Instruction-following Language Models via Bias\n  Neurons Elimination","summary":"  Instruction-following language models often show undesirable biases. These\nundesirable biases may be accelerated in the real-world usage of language\nmodels, where a wide range of instructions is used through zero-shot example\nprompting. To solve this problem, we first define the bias neuron, which\nsignificantly affects biased outputs, and prove its existence empirically.\nFurthermore, we propose a novel and practical bias mitigation method, CRISPR,\nto eliminate bias neurons of language models in instruction-following settings.\nCRISPR automatically determines biased outputs and categorizes neurons that\naffect the biased outputs as bias neurons using an explainability method.\nExperimental results demonstrate the effectiveness of our method in mitigating\nbiases under zero-shot instruction-following settings without losing the\nmodel's task performance and existing knowledge. The experimental results\nreveal the generalizability of our method as it shows robustness under various\ninstructions and datasets. Surprisingly, our method can mitigate the bias in\nlanguage models by eliminating only a few neurons (at least three).\n","authors":["Nakyeong Yang","Taegwan Kang","Jungkyu Choi","Honglak Lee","Kyomin Jung"],"pdf_url":"https://arxiv.org/pdf/2311.09627v2.pdf","comment":"accepted to ACL 2024"},{"id":"http://arxiv.org/abs/2405.13041v3","updated":"2024-06-05T05:48:27Z","published":"2024-05-17T15:30:18Z","title":"Assessing Political Bias in Large Language Models","summary":"  The assessment of bias within Large Language Models (LLMs) has emerged as a\ncritical concern in the contemporary discourse surrounding Artificial\nIntelligence (AI) in the context of their potential impact on societal\ndynamics. Recognizing and considering political bias within LLM applications is\nespecially important when closing in on the tipping point toward performative\nprediction. Then, being educated about potential effects and the societal\nbehavior LLMs can drive at scale due to their interplay with human operators.\nIn this way, the upcoming elections of the European Parliament will not remain\nunaffected by LLMs. We evaluate the political bias of the currently most\npopular open-source LLMs (instruct or assistant models) concerning political\nissues within the European Union (EU) from a German voter's perspective. To do\nso, we use the \"Wahl-O-Mat,\" a voting advice application used in Germany. From\nthe voting advice of the \"Wahl-O-Mat\" we quantize the degree of alignment of\nLLMs with German political parties. We show that larger models, such as\nLlama3-70B, tend to align more closely with left-leaning political parties,\nwhile smaller models often remain neutral, particularly when prompted in\nEnglish. The central finding is that LLMs are similarly biased, with low\nvariances in the alignment concerning a specific party. Our findings underline\nthe importance of rigorously assessing and making bias transparent in LLMs to\nsafeguard the integrity and trustworthiness of applications that employ the\ncapabilities of performative prediction and the invisible hand of machine\nlearning prediction and language generation.\n","authors":["Luca Rettenberger","Markus Reischl","Mark Schutera"],"pdf_url":"https://arxiv.org/pdf/2405.13041v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.02962v1","updated":"2024-06-05T05:35:59Z","published":"2024-06-05T05:35:59Z","title":"Docs2KG: Unified Knowledge Graph Construction from Heterogeneous\n  Documents Assisted by Large Language Models","summary":"  Even for a conservative estimate, 80% of enterprise data reside in\nunstructured files, stored in data lakes that accommodate heterogeneous\nformats. Classical search engines can no longer meet information seeking needs,\nespecially when the task is to browse and explore for insight formulation. In\nother words, there are no obvious search keywords to use. Knowledge graphs, due\nto their natural visual appeals that reduce the human cognitive load, become\nthe winning candidate for heterogeneous data integration and knowledge\nrepresentation.\n  In this paper, we introduce Docs2KG, a novel framework designed to extract\nmultimodal information from diverse and heterogeneous unstructured documents,\nincluding emails, web pages, PDF files, and Excel files. Dynamically generates\na unified knowledge graph that represents the extracted key information,\nDocs2KG enables efficient querying and exploration of document data lakes.\nUnlike existing approaches that focus on domain-specific data sources or\npre-designed schemas, Docs2KG offers a flexible and extensible solution that\ncan adapt to various document structures and content types. The proposed\nframework unifies data processing supporting a multitude of downstream tasks\nwith improved domain interpretability. Docs2KG is publicly accessible at\nhttps://docs2kg.ai4wa.com, and a demonstration video is available at\nhttps://docs2kg.ai4wa.com/Video.\n","authors":["Qiang Sun","Yuanyi Luo","Wenxiao Zhang","Sirui Li","Jichunyang Li","Kai Niu","Xiangrui Kong","Wei Liu"],"pdf_url":"https://arxiv.org/pdf/2406.02962v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.02959v1","updated":"2024-06-05T05:27:29Z","published":"2024-06-05T05:27:29Z","title":"Adversarial Moment-Matching Distillation of Large Language Models","summary":"  Knowledge distillation (KD) has been shown to be highly effective in guiding\na student model with a larger teacher model and achieving practical benefits in\nimproving the computational and memory efficiency for large language models\n(LLMs). State-of-the-art KD methods for LLMs mostly rely on minimizing explicit\ndistribution distance between teacher and student probability predictions.\nInstead of optimizing these mandatory behaviour cloning objectives, we explore\nan imitation learning strategy for KD of LLMs. In particular, we minimize the\nimitation gap by matching the action-value moments of the teacher's behavior\nfrom both on- and off-policy perspectives. To achieve this action-value\nmoment-matching goal, we propose an adversarial training algorithm to jointly\nestimate the moment-matching distance and optimize the student policy to\nminimize it. Results from both task-agnostic instruction-following experiments\nand task-specific experiments demonstrate the effectiveness of our method and\nachieve new state-of-the-art performance.\n","authors":["Chen Jia"],"pdf_url":"https://arxiv.org/pdf/2406.02959v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.02958v1","updated":"2024-06-05T05:27:02Z","published":"2024-06-05T05:27:02Z","title":"PrE-Text: Training Language Models on Private Federated Data in the Age\n  of LLMs","summary":"  On-device training is currently the most common approach for training machine\nlearning (ML) models on private, distributed user data. Despite this, on-device\ntraining has several drawbacks: (1) most user devices are too small to train\nlarge models on-device, (2) on-device training is communication- and\ncomputation-intensive, and (3) on-device training can be difficult to debug and\ndeploy. To address these problems, we propose Private Evolution-Text\n(PrE-Text), a method for generating differentially private (DP) synthetic\ntextual data. First, we show that across multiple datasets, training small\nmodels (models that fit on user devices) with PrE-Text synthetic data\noutperforms small models trained on-device under practical privacy regimes\n($\\epsilon=1.29$, $\\epsilon=7.58$). We achieve these results while using\n9$\\times$ fewer rounds, 6$\\times$ less client computation per round, and\n100$\\times$ less communication per round. Second, finetuning large models on\nPrE-Text's DP synthetic data improves large language model (LLM) performance on\nprivate data across the same range of privacy budgets. Altogether, these\nresults suggest that training on DP synthetic data can be a better option than\ntraining a model on-device on private distributed data. Code is available at\nhttps://github.com/houcharlie/PrE-Text.\n","authors":["Charlie Hou","Akshat Shrivastava","Hongyuan Zhan","Rylan Conway","Trang Le","Adithya Sagar","Giulia Fanti","Daniel Lazar"],"pdf_url":"https://arxiv.org/pdf/2406.02958v1.pdf","comment":"ICML 2024 (Oral)"},{"id":"http://arxiv.org/abs/2406.00832v2","updated":"2024-06-05T05:23:40Z","published":"2024-06-02T18:42:57Z","title":"BoNBoN Alignment for Large Language Models and the Sweetness of\n  Best-of-n Sampling","summary":"  This paper concerns the problem of aligning samples from large language\nmodels to human preferences using best-of-$n$ sampling, where we draw $n$\nsamples, rank them, and return the best one. We consider two fundamental\nproblems. First: what is the relationship between best-of-$n$ and approaches to\nalignment that train LLMs to output samples with a high expected reward (e.g.,\nRLHF or DPO)? To answer this, we embed both the best-of-$n$ distribution and\nthe sampling distributions learned by alignment procedures in a common class of\ntiltings of the base LLM distribution. We then show that, within this class,\nbest-of-$n$ is essentially optimal in terms of the trade-off between win-rate\nagainst the base model vs KL distance from the base model. That is, best-of-$n$\nis the best choice of alignment distribution if the goal is to maximize win\nrate. However, best-of-$n$ requires drawing $n$ samples for each inference, a\nsubstantial cost. To avoid this, the second problem we consider is how to\nfine-tune a LLM to mimic the best-of-$n$ sampling distribution. We derive\nBoNBoN Alignment to achieve this by exploiting the special structure of the\nbest-of-$n$ distribution. Experiments show that BoNBoN alignment yields\nsubstantial improvements in producing a model that is preferred to the base\npolicy while minimally affecting off-target aspects.\n","authors":["Lin Gui","Cristina Gârbacea","Victor Veitch"],"pdf_url":"https://arxiv.org/pdf/2406.00832v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.16457v2","updated":"2024-06-05T05:23:21Z","published":"2024-02-26T09:59:04Z","title":"RetrievalQA: Assessing Adaptive Retrieval-Augmented Generation for\n  Short-form Open-Domain Question Answering","summary":"  Adaptive retrieval-augmented generation (ARAG) aims to dynamically determine\nthe necessity of retrieval for queries instead of retrieving indiscriminately\nto enhance the efficiency and relevance of the sourced information. However,\nprevious works largely overlook the evaluation of ARAG approaches, leading to\ntheir effectiveness being understudied. This work presents a benchmark,\nRetrievalQA, comprising 1,271 short-form questions covering new world and\nlong-tail knowledge. The knowledge necessary to answer the questions is absent\nfrom LLMs; therefore, external information must be retrieved to answer\ncorrectly. This makes RetrievalQA a suitable testbed to evaluate existing ARAG\nmethods. We observe that calibration-based methods heavily rely on threshold\ntuning, while vanilla prompting is inadequate for guiding LLMs to make reliable\nretrieval decisions. Based on our findings, we propose Time-Aware Adaptive\nRetrieval (TA-ARE), a simple yet effective method that helps LLMs assess the\nnecessity of retrieval without calibration or additional training. The dataset\nand code will be available at https://github.com/hyintell/RetrievalQA\n","authors":["Zihan Zhang","Meng Fang","Ling Chen"],"pdf_url":"https://arxiv.org/pdf/2402.16457v2.pdf","comment":"Findings of ACL 2024"},{"id":"http://arxiv.org/abs/2406.02950v1","updated":"2024-06-05T05:18:20Z","published":"2024-06-05T05:18:20Z","title":"4D ASR: Joint Beam Search Integrating CTC, Attention, Transducer, and\n  Mask Predict Decoders","summary":"  End-to-end automatic speech recognition (E2E-ASR) can be classified into\nseveral network architectures, such as connectionist temporal classification\n(CTC), recurrent neural network transducer (RNN-T), attention-based\nencoder-decoder, and mask-predict models. Each network architecture has\nadvantages and disadvantages, leading practitioners to switch between these\ndifferent models depending on application requirements. Instead of building\nseparate models, we propose a joint modeling scheme where four decoders (CTC,\nRNN-T, attention, and mask-predict) share the same encoder -- we refer to this\nas 4D modeling. The 4D model is trained using multitask learning, which will\nbring model regularization and maximize the model robustness thanks to their\ncomplementary properties. To efficiently train the 4D model, we introduce a\ntwo-stage training strategy that stabilizes multitask learning. In addition, we\npropose three novel one-pass beam search algorithms by combining three decoders\n(CTC, RNN-T, and attention) to further improve performance. These three beam\nsearch algorithms differ in which decoder is used as the primary decoder. We\ncarefully evaluate the performance and computational tradeoffs associated with\neach algorithm. Experimental results demonstrate that the jointly trained 4D\nmodel outperforms the E2E-ASR models trained with only one individual decoder.\nFurthermore, we demonstrate that the proposed one-pass beam search algorithm\noutperforms the previously proposed CTC/attention decoding.\n","authors":["Yui Sudo","Muhammad Shakeel","Yosuke Fukumoto","Brian Yan","Jiatong Shi","Yifan Peng","Shinji Watanabe"],"pdf_url":"https://arxiv.org/pdf/2406.02950v1.pdf","comment":"submitted to IEEE/ACM Transactions on Audio Speech and Language\n  Processing"},{"id":"http://arxiv.org/abs/2311.14495v4","updated":"2024-06-05T05:15:16Z","published":"2023-11-24T14:08:31Z","title":"StableSSM: Alleviating the Curse of Memory in State-space Models through\n  Stable Reparameterization","summary":"  In this paper, we investigate the long-term memory learning capabilities of\nstate-space models (SSMs) from the perspective of parameterization. We prove\nthat state-space models without any reparameterization exhibit a memory\nlimitation similar to that of traditional RNNs: the target relationships that\ncan be stably approximated by state-space models must have an exponential\ndecaying memory. Our analysis identifies this \"curse of memory\" as a result of\nthe recurrent weights converging to a stability boundary, suggesting that a\nreparameterization technique can be effective. To this end, we introduce a\nclass of reparameterization techniques for SSMs that effectively lift its\nmemory limitations. Besides improving approximation capabilities, we further\nillustrate that a principled choice of reparameterization scheme can also\nenhance optimization stability. We validate our findings using synthetic\ndatasets, language models and image classifications.\n","authors":["Shida Wang","Qianxiao Li"],"pdf_url":"https://arxiv.org/pdf/2311.14495v4.pdf","comment":"28 pages, 7 figures, ICML 2024"},{"id":"http://arxiv.org/abs/2406.02943v1","updated":"2024-06-05T05:05:41Z","published":"2024-06-05T05:05:41Z","title":"The Task-oriented Queries Benchmark (ToQB)","summary":"  Task-oriented queries (e.g., one-shot queries to play videos, order food, or\ncall a taxi) are crucial for assessing the quality of virtual assistants,\nchatbots, and other large language model (LLM)-based services. However, a\nstandard benchmark for task-oriented queries is not yet available, as existing\nbenchmarks in the relevant NLP (Natural Language Processing) fields have\nprimarily focused on task-oriented dialogues. Thus, we present a new\nmethodology for efficiently generating the Task-oriented Queries Benchmark\n(ToQB) using existing task-oriented dialogue datasets and an LLM service. Our\nmethodology involves formulating the underlying NLP task to summarize the\noriginal intent of a speaker in each dialogue, detailing the key steps to\nperform the devised NLP task using an LLM service, and outlining a framework\nfor automating a major part of the benchmark generation process. Through a case\nstudy encompassing three domains (i.e., two single-task domains and one\nmulti-task domain), we demonstrate how to customize the LLM prompts (e.g.,\nomitting system utterances or speaker labels) for those three domains and\ncharacterize the generated task-oriented queries. The generated ToQB dataset is\nmade available to the public. We further discuss new domains that can be added\nto ToQB by community contributors and its practical applications.\n","authors":["Keun Soo Yim"],"pdf_url":"https://arxiv.org/pdf/2406.02943v1.pdf","comment":"Data available on GitHub,\n  https://github.com/google/task-oriented-queries"},{"id":"http://arxiv.org/abs/2404.09221v2","updated":"2024-06-05T05:00:35Z","published":"2024-04-14T11:49:38Z","title":"Exploring and Improving Drafts in Blockwise Parallel Decoding","summary":"  Despite the remarkable strides made by autoregressive language models, their\npotential is often hampered by the slow inference speeds inherent in sequential\ntoken generation. Blockwise parallel decoding (BPD) was proposed by Stern et\nal. as a method to improve inference speed of language models by simultaneously\npredicting multiple future tokens, termed block drafts, which are subsequently\nverified and conditionally accepted by the autoregressive model. This paper\ncontributes to the understanding and improvement of block drafts in two ways.\nFirst, we analyze the token distributions produced by multiple prediction\nheads. Secondly, we leverage this analysis to develop algorithms to improve BPD\ninference speed by refining the block drafts using n-gram and neural language\nmodels. Experiments demonstrate that refined block drafts yield a +5-21%\nincrease in block efficiency (i.e., the number of accepted tokens from the\nblock draft) across diverse datasets.\n","authors":["Taehyeon Kim","Ananda Theertha Suresh","Kishore Papineni","Michael Riley","Sanjiv Kumar","Adrian Benton"],"pdf_url":"https://arxiv.org/pdf/2404.09221v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.10058v2","updated":"2024-06-05T04:57:09Z","published":"2024-02-15T16:28:34Z","title":"Towards Safer Large Language Models through Machine Unlearning","summary":"  The rapid advancement of Large Language Models (LLMs) has demonstrated their\nvast potential across various domains, attributed to their extensive\npretraining knowledge and exceptional generalizability. However, LLMs often\nencounter challenges in generating harmful content when faced with problematic\nprompts. To address this problem, existing work attempted to implement a\ngradient ascent based approach to prevent LLMs from producing harmful output.\nWhile these methods can be effective, they frequently impact the model utility\nin responding to normal prompts. To address this gap, we introduce Selective\nKnowledge negation Unlearning (SKU), a novel unlearning framework for LLMs,\ndesigned to eliminate harmful knowledge while preserving utility on normal\nprompts. Specifically, SKU is consisted of two stages: harmful knowledge\nacquisition stage and knowledge negation stage. The first stage aims to\nidentify and acquire harmful knowledge within the model, whereas the second is\ndedicated to remove this knowledge. SKU selectively isolates and removes\nharmful knowledge in model parameters, ensuring the model's performance remains\nrobust on normal prompts. Our experiments conducted across various LLM\narchitectures demonstrate that SKU identifies a good balance point between\nremoving harmful information and preserving utility.\n","authors":["Zheyuan Liu","Guangyao Dou","Zhaoxuan Tan","Yijun Tian","Meng Jiang"],"pdf_url":"https://arxiv.org/pdf/2402.10058v2.pdf","comment":"Accepted by ACL 2024 Findings"},{"id":"http://arxiv.org/abs/2406.02925v1","updated":"2024-06-05T04:25:56Z","published":"2024-06-05T04:25:56Z","title":"SYN2REAL: Leveraging Task Arithmetic for Mitigating Synthetic-Real\n  Discrepancies in ASR Domain Adaptation","summary":"  Recent advancements in large language models (LLMs) have introduced the 'task\nvector' concept, which has significantly impacted various domains but remains\nunderexplored in speech recognition. This paper presents a novel 'SYN2REAL'\ntask vector for domain adaptation in automatic speech recognition (ASR),\nspecifically targeting text-only domains. Traditional fine-tuning on synthetic\nspeech often results in performance degradation due to acoustic mismatches. To\naddress this issue, we propose creating a 'SYN2REAL' vector by subtracting the\nparameter differences between models fine-tuned on real and synthetic speech.\nThis vector effectively bridges the gap between the two domains. Experiments on\nthe SLURP dataset demonstrate that our approach yields an average improvement\nof 11.15% in word error rate for unseen target domains, highlighting the\npotential of task vectors in enhancing speech domain adaptation.\n","authors":["Hsuan Su","Hua Farn","Shang-Tse Chen","Hung-yi Lee"],"pdf_url":"https://arxiv.org/pdf/2406.02925v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.02924v1","updated":"2024-06-05T04:25:23Z","published":"2024-06-05T04:25:23Z","title":"Pruner-Zero: Evolving Symbolic Pruning Metric from scratch for Large\n  Language Models","summary":"  Despite the remarkable capabilities, Large Language Models (LLMs) face\ndeployment challenges due to their extensive size. Pruning methods drop a\nsubset of weights to accelerate, but many of them require retraining, which is\nprohibitively expensive and computationally demanding. Recently, post-training\npruning approaches introduced novel metrics, enabling the pruning of LLMs\nwithout retraining. However, these metrics require the involvement of human\nexperts and tedious trial and error. To efficiently identify superior pruning\nmetrics, we develop an automatic framework for searching symbolic pruning\nmetrics using genetic programming. In particular, we devise an elaborate search\nspace encompassing the existing pruning metrics to discover the potential\nsymbolic pruning metric. We propose an opposing operation simplification\nstrategy to increase the diversity of the population. In this way, Pruner-Zero\nallows auto-generation of symbolic pruning metrics. Based on the searched\nresults, we explore the correlation between pruning metrics and performance\nafter pruning and summarize some principles. Extensive experiments on LLaMA and\nLLaMA-2 on language modeling and zero-shot tasks demonstrate that our\nPruner-Zero obtains superior performance than SOTA post-training pruning\nmethods. Code at: \\url{https://github.com/pprp/Pruner-Zero}.\n","authors":["Peijie Dong","Lujun Li","Zhenheng Tang","Xiang Liu","Xinglin Pan","Qiang Wang","Xiaowen Chu"],"pdf_url":"https://arxiv.org/pdf/2406.02924v1.pdf","comment":"Accepted by ICML2024, 29 pages, 4 figures"},{"id":"http://arxiv.org/abs/2406.02921v1","updated":"2024-06-05T04:20:17Z","published":"2024-06-05T04:20:17Z","title":"Text Injection for Neural Contextual Biasing","summary":"  Neural contextual biasing effectively improves automatic speech recognition\n(ASR) for crucial phrases within a speaker's context, particularly those that\nare infrequent in the training data. This work proposes contextual text\ninjection (CTI) to enhance contextual ASR. CTI leverages not only the paired\nspeech-text data, but also a much larger corpus of unpaired text to optimize\nthe ASR model and its biasing component. Unpaired text is converted into\nspeech-like representations and used to guide the model's attention towards\nrelevant bias phrases. Moreover, we introduce a contextual text-injected (CTI)\nminimum word error rate (MWER) training, which minimizes the expected WER\ncaused by contextual biasing when unpaired text is injected into the model.\nExperiments show that CTI with 100 billion text sentences can achieve up to\n43.3% relative WER reduction from a strong neural biasing model. CTI-MWER\nprovides a further relative improvement of 23.5%.\n","authors":["Zhong Meng","Zelin Wu","Rohit Prabhavalkar","Cal Peyser","Weiran Wang","Nanxin Chen","Tara N. Sainath","Bhuvana Ramabhadran"],"pdf_url":"https://arxiv.org/pdf/2406.02921v1.pdf","comment":"5 pages, 1 figure"},{"id":"http://arxiv.org/abs/2406.02919v1","updated":"2024-06-05T04:15:07Z","published":"2024-06-05T04:15:07Z","title":"MultifacetEval: Multifaceted Evaluation to Probe LLMs in Mastering\n  Medical Knowledge","summary":"  Large language models (LLMs) have excelled across domains, also delivering\nnotable performance on the medical evaluation benchmarks, such as MedQA.\nHowever, there still exists a significant gap between the reported performance\nand the practical effectiveness in real-world medical scenarios. In this paper,\nwe aim to explore the causes of this gap by employing a multifaceted\nexamination schema to systematically probe the actual mastery of medical\nknowledge by current LLMs. Specifically, we develop a novel evaluation\nframework MultifacetEval to examine the degree and coverage of LLMs in encoding\nand mastering medical knowledge at multiple facets (comparison, rectification,\ndiscrimination, and verification) concurrently. Based on the MultifacetEval\nframework, we construct two multifaceted evaluation datasets: MultiDiseK (by\nproducing questions from a clinical disease knowledge base) and MultiMedQA (by\nrephrasing each question from a medical benchmark MedQA into multifaceted\nquestions). The experimental results on these multifaceted datasets demonstrate\nthat the extent of current LLMs in mastering medical knowledge is far below\ntheir performance on existing medical benchmarks, suggesting that they lack\ndepth, precision, and comprehensiveness in mastering medical knowledge.\nConsequently, current LLMs are not yet ready for application in real-world\nmedical tasks. The codes and datasets are available at\nhttps://github.com/THUMLP/MultifacetEval.\n","authors":["Yuxuan Zhou","Xien Liu","Chen Ning","Ji Wu"],"pdf_url":"https://arxiv.org/pdf/2406.02919v1.pdf","comment":"Accepted by IJCAI 2024"},{"id":"http://arxiv.org/abs/2312.17080v4","updated":"2024-06-05T04:05:42Z","published":"2023-12-28T15:49:43Z","title":"MR-GSM8K: A Meta-Reasoning Benchmark for Large Language Model Evaluation","summary":"  In this work, we introduce a novel evaluation paradigm for Large Language\nModels (LLMs) that compels them to transition from a traditional\nquestion-answering role, akin to a student, to a solution-scoring role, akin to\na teacher. This paradigm, focusing on \"reasoning about reasoning,\" hence termed\nmeta-reasoning, shifts the emphasis from result-oriented assessments, which\noften neglect the reasoning process, to a more comprehensive evaluation that\neffectively distinguishes between the cognitive capabilities of different\nmodels. By applying this paradigm in the GSM8K dataset, we have developed the\nMR-GSM8K benchmark. Our extensive analysis includes several state-of-the-art\nmodels from both open-source and commercial domains, uncovering fundamental\ndeficiencies in their training and evaluation methodologies. Notably, while\nmodels like Deepseek-v2 and Claude3-Sonnet closely competed with GPT-4 in\nGSM8K, their performance disparities expanded dramatically in MR-GSM8K, with\ndifferences widening to over 20 absolute points, underscoring the significant\nchallenge posed by our meta-reasoning approach.\n","authors":["Zhongshen Zeng","Pengguang Chen","Shu Liu","Haiyun Jiang","Jiaya Jia"],"pdf_url":"https://arxiv.org/pdf/2312.17080v4.pdf","comment":"Code: https://github.com/dvlab-research/MR-GSM8K"},{"id":"http://arxiv.org/abs/2406.02911v1","updated":"2024-06-05T04:04:08Z","published":"2024-06-05T04:04:08Z","title":"Improving In-Context Learning with Prediction Feedback for Sentiment\n  Analysis","summary":"  Large language models (LLMs) have achieved promising results in sentiment\nanalysis through the in-context learning (ICL) paradigm. However, their ability\nto distinguish subtle sentiments still remains a challenge. Inspired by the\nhuman ability to adjust understanding via feedback, this paper enhances ICL by\nincorporating prior predictions and feedback, aiming to rectify sentiment\nmisinterpretation of LLMs. Specifically, the proposed framework consists of\nthree steps: (1) acquiring prior predictions of LLMs, (2) devising predictive\nfeedback based on correctness, and (3) leveraging a feedback-driven prompt to\nrefine sentiment understanding. Experimental results across nine sentiment\nanalysis datasets demonstrate the superiority of our framework over\nconventional ICL methods, with an average F1 improvement of 5.95%.\n","authors":["Hongling Xu","Qianlong Wang","Yice Zhang","Min Yang","Xi Zeng","Bing Qin","Ruifeng Xu"],"pdf_url":"https://arxiv.org/pdf/2406.02911v1.pdf","comment":"Accepted by ACL 2024 (Findings)"},{"id":"http://arxiv.org/abs/2406.01574v3","updated":"2024-06-05T04:03:36Z","published":"2024-06-03T17:53:00Z","title":"MMLU-Pro: A More Robust and Challenging Multi-Task Language\n  Understanding Benchmark","summary":"  In the age of large-scale language models, benchmarks like the Massive\nMultitask Language Understanding (MMLU) have been pivotal in pushing the\nboundaries of what AI can achieve in language comprehension and reasoning\nacross diverse domains. However, as models continue to improve, their\nperformance on these benchmarks has begun to plateau, making it increasingly\ndifficult to discern differences in model capabilities. This paper introduces\nMMLU-Pro, an enhanced dataset designed to extend the mostly knowledge-driven\nMMLU benchmark by integrating more challenging, reasoning-focused questions and\nexpanding the choice set from four to ten options. Additionally, MMLU-Pro\neliminates the trivial and noisy questions in MMLU. Our experimental results\nshow that MMLU-Pro not only raises the challenge, causing a significant drop in\naccuracy by 16% to 33% compared to MMLU but also demonstrates greater stability\nunder varying prompts. With 24 different prompt styles tested, the sensitivity\nof model scores to prompt variations decreased from 4-5% in MMLU to just 2% in\nMMLU-Pro. Additionally, we found that models utilizing Chain of Thought (CoT)\nreasoning achieved better performance on MMLU-Pro compared to direct answering,\nwhich is in stark contrast to the findings on the original MMLU, indicating\nthat MMLU-Pro includes more complex reasoning questions. Our assessments\nconfirm that MMLU-Pro is a more discriminative benchmark to better track\nprogress in the field.\n","authors":["Yubo Wang","Xueguang Ma","Ge Zhang","Yuansheng Ni","Abhranil Chandra","Shiguang Guo","Weiming Ren","Aaran Arulraj","Xuan He","Ziyan Jiang","Tianle Li","Max Ku","Kai Wang","Alex Zhuang","Rongqi Fan","Xiang Yue","Wenhu Chen"],"pdf_url":"https://arxiv.org/pdf/2406.01574v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.00715v2","updated":"2024-06-05T04:03:17Z","published":"2024-04-25T15:34:53Z","title":"Adapting Open-Source Large Language Models for Cost-Effective,\n  Expert-Level Clinical Note Generation with On-Policy Reinforcement Learning","summary":"  Proprietary Large Language Models (LLMs) such as GPT-4 and Gemini have\ndemonstrated promising capabilities in clinical text summarization tasks.\nHowever, due to patient data privacy concerns and computational costs, many\nhealthcare providers prefer using small, locally-hosted models over external\ngeneric LLMs. This study presents a comprehensive domain- and task-specific\nadaptation process for the open-source LLaMA-2 13 billion parameter model,\nenabling it to generate high-quality clinical notes from outpatient\npatient-doctor dialogues. Our process incorporates continued pre-training,\nsupervised fine-tuning, and reinforcement learning from both AI and human\nfeedback. We introduced a new approach, DistillDirect, for performing on-policy\nreinforcement learning with Gemini 1.0 Pro as the teacher model. Our resulting\nmodel, LLaMA-Clinic, can generate clinical notes comparable in quality to those\nauthored by physicians. In a blinded physician reader study, the majority\n(90.4%) of individual evaluations rated the notes generated by LLaMA-Clinic as\n\"acceptable\" or higher across all three criteria: real-world readiness,\ncompleteness, and accuracy. In the more challenging \"Assessment and Plan\"\nsection, LLaMA-Clinic scored higher (4.2/5) in real-world readiness than\nphysician-authored notes (4.1/5). Our cost analysis for inference shows that\nour LLaMA-Clinic model achieves a 4.375-fold cost reduction compared to an\nexternal generic LLM service. Additionally, we highlight key considerations for\nfuture clinical note-generation tasks, emphasizing the importance of\npre-defining a best-practice note format, rather than relying on LLMs to\ndetermine this for clinical practice. We have made our newly created synthetic\nclinic dialogue-note dataset and the physician feedback dataset publicly\navailable to foster future research.\n","authors":["Hanyin Wang","Chufan Gao","Bolun Liu","Qiping Xu","Guleid Hussein","Mohamad El Labban","Kingsley Iheasirim","Hariprasad Korsapati","Chuck Outcalt","Jimeng Sun"],"pdf_url":"https://arxiv.org/pdf/2405.00715v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.19121v2","updated":"2024-06-05T04:00:08Z","published":"2024-03-28T03:25:23Z","title":"Code Comparison Tuning for Code Large Language Models","summary":"  We present Code Comparison Tuning (CCT), a simple and effective tuning method\nfor code large language models (Code LLMs) to better handle subtle code errors.\nSpecifically, we integrate the concept of comparison into instruction tuning,\nboth at the token and sequence levels, enabling the model to discern even the\nslightest deviations in code. To compare the original code with an erroneous\nversion containing manually added code errors, we use token-level preference\nloss for detailed token-level comparisons. Additionally, we combine code\nsegments to create a new instruction tuning sample for sequence-level\ncomparisons, enhancing the model's bug-fixing capability. Experimental results\non the HumanEvalFix benchmark show that CCT surpasses instruction tuning in\npass@1 scores by up to 4 points across diverse code LLMs, and extensive\nanalysis demonstrates the effectiveness of our method.\n","authors":["Yufan Jiang","Qiaozhi He","Xiaomin Zhuang","Zhihua Wu"],"pdf_url":"https://arxiv.org/pdf/2403.19121v2.pdf","comment":"Preprint"},{"id":"http://arxiv.org/abs/2306.07629v4","updated":"2024-06-05T03:57:41Z","published":"2023-06-13T08:57:54Z","title":"SqueezeLLM: Dense-and-Sparse Quantization","summary":"  Generative Large Language Models (LLMs) have demonstrated remarkable results\nfor a wide range of tasks. However, deploying these models for inference has\nbeen a significant challenge due to their unprecedented resource requirements.\nThis has forced existing deployment frameworks to use multi-GPU inference\npipelines, which are often complex and costly, or to use smaller and less\nperformant models. In this work, we demonstrate that the main bottleneck for\ngenerative inference with LLMs is memory bandwidth, rather than compute,\nspecifically for single batch inference. While quantization has emerged as a\npromising solution by representing weights with reduced precision, previous\nefforts have often resulted in notable performance degradation. To address\nthis, we introduce SqueezeLLM, a post-training quantization framework that not\nonly enables lossless compression to ultra-low precisions of up to 3-bit, but\nalso achieves higher quantization performance under the same memory constraint.\nOur framework incorporates two novel ideas: (i) sensitivity-based non-uniform\nquantization, which searches for the optimal bit precision assignment based on\nsecond-order information; and (ii) the Dense-and-Sparse decomposition that\nstores outliers and sensitive weight values in an efficient sparse format. When\napplied to the LLaMA models, our 3-bit quantization significantly reduces the\nperplexity gap from the FP16 baseline by up to 2.1x as compared to the\nstate-of-the-art methods with the same memory requirement. Furthermore, when\ndeployed on an A6000 GPU, our quantized models achieve up to 2.3x speedup\ncompared to the baseline. Our code is available at\nhttps://github.com/SqueezeAILab/SqueezeLLM.\n","authors":["Sehoon Kim","Coleman Hooper","Amir Gholami","Zhen Dong","Xiuyu Li","Sheng Shen","Michael W. Mahoney","Kurt Keutzer"],"pdf_url":"https://arxiv.org/pdf/2306.07629v4.pdf","comment":"ICML 2024"},{"id":"http://arxiv.org/abs/2312.04511v3","updated":"2024-06-05T03:53:10Z","published":"2023-12-07T18:32:04Z","title":"An LLM Compiler for Parallel Function Calling","summary":"  The reasoning capabilities of the recent LLMs enable them to execute external\nfunction calls to overcome their inherent limitations, such as knowledge\ncutoffs, poor arithmetic skills, or lack of access to private data. This\ndevelopment has allowed LLMs to select and coordinate multiple functions based\non the context to tackle more complex problems. However, current methods for\nfunction calling often require sequential reasoning and acting for each\nfunction which can result in high latency, cost, and sometimes inaccurate\nbehavior. To address this, we introduce LLMCompiler, which executes functions\nin parallel to efficiently orchestrate multiple function calls. Drawing\ninspiration from the principles of classical compilers, LLMCompiler enables\nparallel function calling with three components: (i) a Function Calling\nPlanner, formulating execution plans for function calling; (ii) a Task Fetching\nUnit, dispatching function calling tasks; and (iii) an Executor, executing\nthese tasks in parallel. LLMCompiler automatically generates an optimized\norchestration for the function calls and can be used with both open-source and\nclosed-source models. We have benchmarked LLMCompiler on a range of tasks with\ndifferent patterns of function calling. We observe consistent latency speedup\nof up to 3.7x, cost savings of up to 6.7x, and accuracy improvement of up to\n~9% compared to ReAct. Our code is available at\nhttps://github.com/SqueezeAILab/LLMCompiler.\n","authors":["Sehoon Kim","Suhong Moon","Ryan Tabrizi","Nicholas Lee","Michael W. Mahoney","Kurt Keutzer","Amir Gholami"],"pdf_url":"https://arxiv.org/pdf/2312.04511v3.pdf","comment":"ICML 2024"},{"id":"http://arxiv.org/abs/2406.02903v1","updated":"2024-06-05T03:46:52Z","published":"2024-06-05T03:46:52Z","title":"Open Grounded Planning: Challenges and Benchmark Construction","summary":"  The emergence of large language models (LLMs) has increasingly drawn\nattention to the use of LLMs for human-like planning. Existing work on\nLLM-based planning either focuses on leveraging the inherent language\ngeneration capabilities of LLMs to produce free-style plans, or employs\nreinforcement learning approaches to learn decision-making for a limited set of\nactions within restricted environments. However, both approaches exhibit\nsignificant discrepancies from the open and executable requirements in\nreal-world planning. In this paper, we propose a new planning task--open\ngrounded planning. The primary objective of open grounded planning is to ask\nthe model to generate an executable plan based on a variable action set,\nthereby ensuring the executability of the produced plan. To this end, we\nestablishes a benchmark for open grounded planning spanning a wide range of\ndomains. Then we test current state-of-the-art LLMs along with five planning\napproaches, revealing that existing LLMs and methods still struggle to address\nthe challenges posed by grounded planning in open domains. The outcomes of this\npaper define and establish a foundational dataset for open grounded planning,\nand shed light on the potential challenges and future directions of LLM-based\nplanning.\n","authors":["Shiguang Guo","Ziliang Deng","Hongyu Lin","Yaojie Lu","Xianpei Han","Le Sun"],"pdf_url":"https://arxiv.org/pdf/2406.02903v1.pdf","comment":"Accept to ACL 2024 main conference"},{"id":"http://arxiv.org/abs/2406.02902v1","updated":"2024-06-05T03:44:35Z","published":"2024-06-05T03:44:35Z","title":"S$^2$GSL: Incorporating Segment to Syntactic Enhanced Graph Structure\n  Learning for Aspect-based Sentiment Analysis","summary":"  Previous graph-based approaches in Aspect based Sentiment Analysis(ABSA) have\ndemonstrated impressive performance by utilizing graph neural networks and\nattention mechanisms to learn structures of static dependency trees and dynamic\nlatent trees. However, incorporating both semantic and syntactic information\nsimultaneously within complex global structures can introduce irrelevant\ncontexts and syntactic dependencies during the process of graph structure\nlearning, potentially resulting in inaccurate predictions. In order to address\nthe issues above, we propose S$^2$GSL, incorporating Segment to Syntactic\nenhanced Graph Structure Learning for ABSA. Specifically,S$^2$GSL is featured\nwith a segment-aware semantic graph learning and a syntax-based latent graph\nlearning enabling the removal of irrelevant contexts and dependencies,\nrespectively. We further propose a self-adaptive aggregation network that\nfacilitates the fusion of two graph learning branches, thereby achieving\ncomplementarity across diverse structures. Experimental results on four\nbenchmarks demonstrate the effectiveness of our framework.\n","authors":["Bingfeng Chen","Qihan Ouyang","Yongqi Luo","Boyan Xu","Ruichu Cai","Zhifeng Hao"],"pdf_url":"https://arxiv.org/pdf/2406.02902v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.02900v1","updated":"2024-06-05T03:41:37Z","published":"2024-06-05T03:41:37Z","title":"Scaling Laws for Reward Model Overoptimization in Direct Alignment\n  Algorithms","summary":"  Reinforcement Learning from Human Feedback (RLHF) has been crucial to the\nrecent success of Large Language Models (LLMs), however, it is often a complex\nand brittle process. In the classical RLHF framework, a reward model is first\ntrained to represent human preferences, which is in turn used by an online\nreinforcement learning (RL) algorithm to optimize the LLM. A prominent issue\nwith such methods is \\emph{reward over-optimization} or \\emph{reward hacking},\nwhere performance as measured by the learned proxy reward model increases, but\ntrue quality plateaus or even deteriorates. Direct Alignment Algorithms (DDAs)\nlike Direct Preference Optimization have emerged as alternatives to the\nclassical RLHF pipeline by circumventing the reward modeling phase. However,\nalthough DAAs do not use a separate proxy reward model, they still commonly\ndeteriorate from over-optimization. While the so-called reward hacking\nphenomenon is not well-defined for DAAs, we still uncover similar trends: at\nhigher KL budgets, DAA algorithms exhibit similar degradation patterns to their\nclassic RLHF counterparts. In particular, we find that DAA methods deteriorate\nnot only across a wide range of KL budgets but also often before even a single\nepoch of the dataset is completed. Through extensive empirical experimentation,\nthis work formulates and formalizes the reward over-optimization or hacking\nproblem for DAAs and explores its consequences across objectives, training\nregimes, and model scales.\n","authors":["Rafael Rafailov","Yaswanth Chittepu","Ryan Park","Harshit Sikchi","Joey Hejna","Bradley Knox","Chelsea Finn","Scott Niekum"],"pdf_url":"https://arxiv.org/pdf/2406.02900v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.07758v6","updated":"2024-06-05T03:37:35Z","published":"2023-08-15T13:19:59Z","title":"Forward-Backward Reasoning in Large Language Models for Mathematical\n  Verification","summary":"  Self-Consistency samples diverse reasoning chains with answers and chooses\nthe final answer by majority voting. It is based on forward reasoning and\ncannot further improve performance by sampling more reasoning chains when\nsaturated. To further boost performance, we introduce backward reasoning to\nverify candidate answers. Specifically, for mathematical tasks, we mask a\nnumber in the question and ask the LLM to answer a backward question created by\na simple template, i.e., to predict the masked number when a candidate answer\nis provided. Instead of using forward or backward reasoning alone, we propose\nFOBAR to combine FOrward and BAckward Reasoning for verification. Extensive\nexperiments on six standard mathematical data sets and three LLMs show that\nFOBAR achieves state-of-the-art performance. In particular, FOBAR outperforms\nSelf-Consistency, which uses forward reasoning alone, demonstrating that\ncombining forward and forward reasoning is better. In addition, FOBAR performs\nbetter than existing verification methods, showing the effectiveness of the\nsimple template used in backward reasoning and the proposed combination.\nExtensions to non-mathematical problems are also discussed and validated\nempirically.\n","authors":["Weisen Jiang","Han Shi","Longhui Yu","Zhengying Liu","Yu Zhang","Zhenguo Li","James T. Kwok"],"pdf_url":"https://arxiv.org/pdf/2308.07758v6.pdf","comment":"Accepted by Findings of ACL 2024"},{"id":"http://arxiv.org/abs/2304.11406v4","updated":"2024-06-05T03:29:31Z","published":"2023-04-22T13:42:04Z","title":"LaMP: When Large Language Models Meet Personalization","summary":"  This paper highlights the importance of personalization in large language\nmodels and introduces the LaMP benchmark -- a novel benchmark for training and\nevaluating language models for producing personalized outputs. LaMP offers a\ncomprehensive evaluation framework with diverse language tasks and multiple\nentries for each user profile. It consists of seven personalized tasks,\nspanning three text classification and four text generation tasks. We\nadditionally propose two retrieval augmentation approaches that retrieve\npersonal items from each user profile for personalizing language model outputs.\nTo this aim, we study various retrieval models, including term matching,\nsemantic matching, and time-aware methods. Extensive experiments on LaMP for\nzero-shot and fine-tuned language models demonstrate the efficacy of the\nproposed retrieval augmentation approach and highlight the impact of\npersonalization in various natural language tasks.\n","authors":["Alireza Salemi","Sheshera Mysore","Michael Bendersky","Hamed Zamani"],"pdf_url":"https://arxiv.org/pdf/2304.11406v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.02030v2","updated":"2024-06-05T03:28:01Z","published":"2024-06-04T07:13:23Z","title":"Multimodal Reasoning with Multimodal Knowledge Graph","summary":"  Multimodal reasoning with large language models (LLMs) often suffers from\nhallucinations and the presence of deficient or outdated knowledge within LLMs.\nSome approaches have sought to mitigate these issues by employing textual\nknowledge graphs, but their singular modality of knowledge limits comprehensive\ncross-modal understanding. In this paper, we propose the Multimodal Reasoning\nwith Multimodal Knowledge Graph (MR-MKG) method, which leverages multimodal\nknowledge graphs (MMKGs) to learn rich and semantic knowledge across\nmodalities, significantly enhancing the multimodal reasoning capabilities of\nLLMs. In particular, a relation graph attention network is utilized for\nencoding MMKGs and a cross-modal alignment module is designed for optimizing\nimage-text alignment. A MMKG-grounded dataset is constructed to equip LLMs with\ninitial expertise in multimodal reasoning through pretraining. Remarkably,\nMR-MKG achieves superior performance while training on only a small fraction of\nparameters, approximately 2.25% of the LLM's parameter size. Experimental\nresults on multimodal question answering and multimodal analogy reasoning tasks\ndemonstrate that our MR-MKG method outperforms previous state-of-the-art\nmodels.\n","authors":["Junlin Lee","Yequan Wang","Jing Li","Min Zhang"],"pdf_url":"https://arxiv.org/pdf/2406.02030v2.pdf","comment":"Accepted by ACL 2024 (Main Conference)"},{"id":"http://arxiv.org/abs/2406.02893v1","updated":"2024-06-05T03:26:59Z","published":"2024-06-05T03:26:59Z","title":"Language Model Can Do Knowledge Tracing: Simple but Effective Method to\n  Integrate Language Model and Knowledge Tracing Task","summary":"  Knowledge Tracing (KT) is a critical task in online learning for modeling\nstudent knowledge over time. Despite the success of deep learning-based KT\nmodels, which rely on sequences of numbers as data, most existing approaches\nfail to leverage the rich semantic information in the text of questions and\nconcepts. This paper proposes Language model-based Knowledge Tracing (LKT), a\nnovel framework that integrates pre-trained language models (PLMs) with KT\nmethods. By leveraging the power of language models to capture semantic\nrepresentations, LKT effectively incorporates textual information and\nsignificantly outperforms previous KT models on large benchmark datasets.\nMoreover, we demonstrate that LKT can effectively address the cold-start\nproblem in KT by leveraging the semantic knowledge captured by PLMs.\nInterpretability of LKT is enhanced compared to traditional KT models due to\nits use of text-rich data. We conducted the local interpretable model-agnostic\nexplanation technique and analysis of attention scores to interpret the model\nperformance further. Our work highlights the potential of integrating PLMs with\nKT and paves the way for future research in KT domain.\n","authors":["Unggi Lee","Jiyeong Bae","Dohee Kim","Sookbun Lee","Jaekwon Park","Taekyung Ahn","Gunho Lee","Damji Stratton","Hyeoncheol Kim"],"pdf_url":"https://arxiv.org/pdf/2406.02893v1.pdf","comment":"11 pages, 5 figures, 3 tables"},{"id":"http://arxiv.org/abs/2212.02021v5","updated":"2024-06-05T03:14:42Z","published":"2022-12-05T04:37:22Z","title":"Analysis of Utterance Embeddings and Clustering Methods Related to\n  Intent Induction for Task-Oriented Dialogue","summary":"  The focus of this work is to investigate unsupervised approaches to overcome\nquintessential challenges in designing task-oriented dialog schema: assigning\nintent labels to each dialog turn (intent clustering) and generating a set of\nintents based on the intent clustering methods (intent induction). We postulate\nthere are two salient factors for automatic induction of intents: (1)\nclustering algorithm for intent labeling and (2) user utterance embedding\nspace. We compare existing off-the-shelf clustering models and embeddings based\non DSTC11 evaluation. Our extensive experiments demonstrate that the combined\nselection of utterance embedding and clustering method in the intent induction\ntask should be carefully considered. We also present that pretrained MiniLM\nwith Agglomerative clustering shows significant improvement in NMI, ARI, F1,\naccuracy and example coverage in intent induction tasks. The source codes are\navailable at https://github.com/Jeiyoon/dstc11-track2.\n","authors":["Jeiyoon Park","Yoonna Jang","Chanhee Lee","Heuiseok Lim"],"pdf_url":"https://arxiv.org/pdf/2212.02021v5.pdf","comment":"The Eleventh Dialog System Technology Challenge (DSTC11)"},{"id":"http://arxiv.org/abs/2401.03590v2","updated":"2024-06-05T03:13:31Z","published":"2024-01-07T22:11:36Z","title":"Building Efficient and Effective OpenQA Systems for Low-Resource\n  Languages","summary":"  Question answering (QA) is the task of answering questions posed in natural\nlanguage with free-form natural language answers extracted from a given\npassage. In the OpenQA variant, only a question text is given, and the system\nmust retrieve relevant passages from an unstructured knowledge source and use\nthem to provide answers, which is the case in the mainstream QA systems on the\nWeb. QA systems currently are mostly limited to the English language due to the\nlack of large-scale labeled QA datasets in non-English languages. In this\npaper, we show that effective, low-cost OpenQA systems can be developed for\nlow-resource contexts. The key ingredients are (1) weak supervision using\nmachine-translated labeled datasets and (2) a relevant unstructured knowledge\nsource in the target language context. Furthermore, we show that only a few\nhundred gold assessment examples are needed to reliably evaluate these systems.\nWe apply our method to Turkish as a challenging case study, since English and\nTurkish are typologically very distinct and Turkish has limited resources for\nQA. We present SQuAD-TR, a machine translation of SQuAD2.0, and we build our\nOpenQA system by adapting ColBERT-QA and retraining it over Turkish resources\nand SQuAD-TR using two versions of Wikipedia dumps spanning two years. We\nobtain a performance improvement of 24-32% in the Exact Match (EM) score and\n22-29% in the F1 score compared to the BM25-based and DPR-based baseline QA\nreader models. Our results show that SQuAD-TR makes OpenQA feasible for\nTurkish, which we hope encourages researchers to build OpenQA systems in other\nlow-resource languages. We make all the code, models, and the dataset publicly\navailable at https://github.com/boun-tabi/SQuAD-TR.\n","authors":["Emrah Budur","Rıza Özçelik","Dilara Soylu","Omar Khattab","Tunga Güngör","Christopher Potts"],"pdf_url":"https://arxiv.org/pdf/2401.03590v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.04691v3","updated":"2024-06-05T03:12:13Z","published":"2023-12-07T20:42:05Z","title":"Simul-LLM: A Framework for Exploring High-Quality Simultaneous\n  Translation with Large Language Models","summary":"  Large language models (LLMs) with billions of parameters and pretrained on\nmassive amounts of data are now capable of near or better than state-of-the-art\nperformance in a variety of downstream natural language processing tasks.\nNeural machine translation (NMT) is one such task that LLMs have been applied\nto with great success. However, little research has focused on applying LLMs to\nthe more difficult subset of NMT called simultaneous translation (SimulMT),\nwhere translation begins before the entire source context is available to the\nmodel. In this paper, we address key challenges facing LLMs fine-tuned for\nSimulMT, validate classical SimulMT concepts and practices in the context of\nLLMs, explore adapting LLMs that are fine-tuned for NMT to the task of SimulMT,\nand introduce Simul-LLM, the first open-source fine-tuning and evaluation\npipeline development framework for LLMs focused on SimulMT.\n","authors":["Victor Agostinelli","Max Wild","Matthew Raffel","Kazi Ahmed Asif Fuad","Lizhong Chen"],"pdf_url":"https://arxiv.org/pdf/2312.04691v3.pdf","comment":"ACL 2024"},{"id":"http://arxiv.org/abs/2406.02888v1","updated":"2024-06-05T03:08:46Z","published":"2024-06-05T03:08:46Z","title":"HYDRA: Model Factorization Framework for Black-Box LLM Personalization","summary":"  Personalization has emerged as a critical research area in modern intelligent\nsystems, focusing on mining users' behavioral history and adapting to their\npreferences for delivering tailored experiences. Despite the remarkable\nfew-shot capabilities exhibited by black-box large language models (LLMs), the\ninherent opacity of their model parameters presents significant challenges in\naligning the generated output with individual expectations. Existing solutions\nhave primarily focused on prompt design to incorporate user-specific profiles\nand behaviors; however, such approaches often struggle to generalize\neffectively due to their inability to capture shared knowledge among all users.\nTo address these challenges, we propose HYDRA, a model factorization framework\nthat captures both user-specific behavior patterns from historical data and\nshared general knowledge among all users to deliver personalized generation. In\norder to capture user-specific behavior patterns, we first train a reranker to\nprioritize the most useful information from top-retrieved relevant historical\nrecords. By combining the prioritized history with the corresponding query, we\ntrain an adapter to align the output with individual user-specific preferences,\neliminating the reliance on access to inherent model parameters of black-box\nLLMs. Both the reranker and the adapter can be decomposed into a base model\nwith multiple user-specific heads, resembling a hydra. The base model maintains\nshared knowledge across users, while the multiple personal heads capture\nuser-specific preferences. Experimental results demonstrate that HYDRA\noutperforms existing state-of-the-art prompt-based methods by an average\nrelative improvement of 9.01% across five diverse personalization tasks in the\nLaMP benchmark. Our implementation is available at\nhttps://github.com/night-chen/HYDRA.\n","authors":["Yuchen Zhuang","Haotian Sun","Yue Yu","Qifan Wang","Chao Zhang","Bo Dai"],"pdf_url":"https://arxiv.org/pdf/2406.02888v1.pdf","comment":"24 pages, 6 figures, work in progress"},{"id":"http://arxiv.org/abs/2406.02886v1","updated":"2024-06-05T03:08:25Z","published":"2024-06-05T03:08:25Z","title":"PLaD: Preference-based Large Language Model Distillation with\n  Pseudo-Preference Pairs","summary":"  Large Language Models (LLMs) have exhibited impressive capabilities in\nvarious tasks, yet their vast parameter sizes restrict their applicability in\nresource-constrained settings. Knowledge distillation (KD) offers a viable\nsolution by transferring expertise from large teacher models to compact student\nmodels. However, traditional KD techniques face specific challenges when\napplied to LLMs, including restricted access to LLM outputs, significant\nteacher-student capacity gaps, and the inherited mis-calibration issue. In this\nwork, we present PLaD, a novel preference-based LLM distillation framework.\nPLaD exploits the teacher-student capacity discrepancy to generate\npseudo-preference pairs where teacher outputs are preferred over student\noutputs. Then, PLaD leverages a ranking loss to re-calibrate student's\nestimation of sequence likelihood, which steers the student's focus towards\nunderstanding the relative quality of outputs instead of simply imitating the\nteacher. PLaD bypasses the need for access to teacher LLM's internal states,\ntackles the student's expressivity limitations, and mitigates the student\nmis-calibration issue. Through extensive experiments on two sequence generation\ntasks and with various LLMs, we demonstrate the effectiveness of our proposed\nPLaD framework.\n","authors":["Rongzhi Zhang","Jiaming Shen","Tianqi Liu","Haorui Wang","Zhen Qin","Feng Han","Jialu Liu","Simon Baumgartner","Michael Bendersky","Chao Zhang"],"pdf_url":"https://arxiv.org/pdf/2406.02886v1.pdf","comment":"Findings of ACL 2024"},{"id":"http://arxiv.org/abs/2403.00932v2","updated":"2024-06-05T03:07:44Z","published":"2024-03-01T19:22:24Z","title":"Differentially Private Knowledge Distillation via Synthetic Text\n  Generation","summary":"  Large Language models (LLMs) are achieving state-of-the-art performance in\nmany different downstream tasks. However, the increasing urgency of data\nprivacy puts pressure on practitioners to train LLMs with Differential Privacy\n(DP) on private data. Concurrently, the exponential growth in parameter size of\nLLMs necessitates model compression before deployment of LLMs on\nresource-constrained devices or latency-sensitive applications. Differential\nprivacy and model compression generally must trade off utility loss to achieve\ntheir objectives. Moreover, simultaneously applying both schemes can compound\nthe utility degradation. To this end, we propose DistilDP: a novel\ndifferentially private knowledge distillation algorithm that exploits synthetic\ndata generated by a differentially private teacher LLM. The knowledge of a\nteacher LLM is transferred onto the student in two ways: one way from the\nsynthetic data itself -- the hard labels, and the other way by the output\ndistribution of the teacher evaluated on the synthetic data -- the soft labels.\nFurthermore, if the teacher and student share a similar architectural\nstructure, we can further distill knowledge by aligning the hidden\nrepresentations between both. Our experimental results demonstrate that\nDistilDP can substantially improve the utility over existing baselines, at\nleast $9.0$ PPL on the Big Patent dataset, with strong privacy parameters,\n$\\epsilon=2$. These promising results progress privacy-preserving compression\nof autoregressive LLMs. Our code can be accessed here:\nhttps://github.com/james-flemings/dp_compress.\n","authors":["James Flemings","Murali Annavaram"],"pdf_url":"https://arxiv.org/pdf/2403.00932v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.18255v5","updated":"2024-06-05T03:02:48Z","published":"2024-04-28T17:36:43Z","title":"PatentGPT: A Large Language Model for Intellectual Property","summary":"  In recent years, large language models(LLMs) have attracted significant\nattention due to their exceptional performance across a multitude of natural\nlanguage process tasks, and have been widely applied in various fields.\nHowever, the application of large language models in the Intellectual Property\n(IP) domain is challenging due to the strong need for specialized knowledge,\nprivacy protection, processing of extremely long text in this field. In this\ntechnical report, we present for the first time a low-cost, standardized\nprocedure for training IP-oriented LLMs, meeting the unique requirements of the\nIP domain. Using this standard process, we have trained the PatentGPT series\nmodels based on open-source pretrained models. By evaluating them on the\nopen-source IP-oriented benchmark MOZIP, our domain-specific LLMs outperforms\nGPT-4, indicating the effectiveness of the proposed training procedure and the\nexpertise of the PatentGPT models in the IP domain. Remarkably, our model\nsurpassed GPT-4 on the 2019 China Patent Agent Qualification Examination,\nscoring 65 and matching human expert levels. Additionally, the PatentGPT model,\nwhich utilizes the SMoE architecture, achieves performance comparable to that\nof GPT-4 in the IP domain and demonstrates a better cost-performance ratio on\nlong-text tasks, potentially serving as an alternative to GPT-4 within the IP\ndomain.\n","authors":["Zilong Bai","Ruiji Zhang","Linqing Chen","Qijun Cai","Yuan Zhong","Cong Wang","Yan Fang","Jie Fang","Jing Sun","Weikuan Wang","Lizhi Zhou","Haoran Hua","Tian Qiu","Chaochao Wang","Cheng Sun","Jianping Lu","Yixin Wang","Yubin Xia","Meng Hu","Haowen Liu","Peng Xu","Licong Xu","Fu Bian","Xiaolong Gu","Lisha Zhang","Weilei Wang","Changyang Tu"],"pdf_url":"https://arxiv.org/pdf/2404.18255v5.pdf","comment":"19 pages, 9 figures"},{"id":"http://arxiv.org/abs/2406.02882v1","updated":"2024-06-05T03:00:15Z","published":"2024-06-05T03:00:15Z","title":"Outdated Issue Aware Decoding for Factual Knowledge Editing","summary":"  Recently, Knowledge Editing has received increasing attention, since it could\nupdate the specific knowledge from outdated ones in pretrained models without\nre-training. However, as pointed out by recent studies, existing related\nmethods tend to merely memorize the superficial word composition of the edited\nknowledge, rather than truly learning and absorbing it. Consequently, on the\nreasoning questions, we discover that existing methods struggle to utilize the\nedited knowledge to reason the new answer, and tend to retain outdated\nresponses, which are generated by the original models utilizing original\nknowledge. Nevertheless, the outdated responses are unexpected for the correct\nanswers to reasoning questions, which we named as the outdated issue. To\nalleviate this issue, in this paper, we propose a simple yet effective decoding\nstrategy, i.e., outDated ISsue aware deCOding (DISCO), to enhance the\nperformance of edited models on reasoning questions. Specifically, we capture\nthe difference in the probability distribution between the original and edited\nmodels. Further, we amplify the difference of the token prediction in the\nedited model to alleviate the outdated issue, and thus enhance the model\nperformance w.r.t the edited knowledge. Experimental results suggest that\napplying DISCO could enhance edited models to reason, e.g., on reasoning\nquestions, DISCO outperforms the prior SOTA method by 12.99 F1 scores, and\nreduces the ratio of the outdated issue to 5.78% on the zsRE dataset.\n","authors":["Zengkui Sun","Yijin Liu","Jiaan Wang","Fandong Meng","Jinan Xu","Yufeng Chen","Jie Zhou"],"pdf_url":"https://arxiv.org/pdf/2406.02882v1.pdf","comment":"ACL2024 Findings"},{"id":"http://arxiv.org/abs/2405.19778v3","updated":"2024-06-05T02:57:20Z","published":"2024-05-30T07:44:16Z","title":"Enhancing Consistency and Role-Specific Knowledge Capturing by\n  Rebuilding Fictional Character's Persona","summary":"  With the recent introduction of Assistants API, it is expected that\ndocument-based language models will be actively used in various domains,\nespecially Role-playing. However, a key challenge lies in utilizing\nprotagonist's persona: Assistants API often fails to achieve with its search\nbecause the information extraction part is different each time and it often\nomits important information such as protagonist's backstory or relationships.\nIt is hard to maintain a consistent persona simply by using the persona\ndocument as input to the Assistants API. To address the challenge of achieving\nstable persona consistency, we propose CharacterGPT, a novel persona\nreconstruction framework to alleviate the shortcomings of the Assistants API.\nOur method involves Character Persona Training (CPT), an effective persona\nrebuilding process that updates the character persona by extracting the\ncharacter's traits from given summary of the novel for each character as if the\nstory in a novel progresses. In our experiments, we ask each character to take\nthe Big Five Inventory personality test in various settings and analyze the\nresults. To assess whether it can think outside the box, we let each character\ngenerate short novels. Extensive experiments and human evaluation demonstrate\nthat CharacterGPT presents new possibilities for role-playing agent research.\nCode and results are available at: https://github.com/Jeiyoon/charactergpt\n","authors":["Jeiyoon Park","Chanjun Park","Heuiseok Lim"],"pdf_url":"https://arxiv.org/pdf/2405.19778v3.pdf","comment":"preprint"},{"id":"http://arxiv.org/abs/2406.02876v1","updated":"2024-06-05T02:52:17Z","published":"2024-06-05T02:52:17Z","title":"LCS: A Language Converter Strategy for Zero-Shot Neural Machine\n  Translation","summary":"  Multilingual neural machine translation models generally distinguish\ntranslation directions by the language tag (LT) in front of the source or\ntarget sentences. However, current LT strategies cannot indicate the desired\ntarget language as expected on zero-shot translation, i.e., the off-target\nissue. Our analysis reveals that the indication of the target language is\nsensitive to the placement of the target LT. For example, when placing the\ntarget LT on the decoder side, the indication would rapidly degrade along with\ndecoding steps, while placing the target LT on the encoder side would lead to\ncopying or paraphrasing the source input. To address the above issues, we\npropose a simple yet effective strategy named Language Converter Strategy\n(LCS). By introducing the target language embedding into the top encoder\nlayers, LCS mitigates confusion in the encoder and ensures stable language\nindication for the decoder. Experimental results on MultiUN, TED, and OPUS-100\ndatasets demonstrate that LCS could significantly mitigate the off-target\nissue, with language accuracy up to 95.28%, 96.21%, and 85.35% meanwhile\noutperforming the vanilla LT strategy by 3.07, 3,3, and 7.93 BLEU scores on\nzero-shot translation, respectively.\n","authors":["Zengkui Sun","Yijin Liu","Fandong Meng","Jinan Xu","Yufeng Chen","Jie Zhou"],"pdf_url":"https://arxiv.org/pdf/2406.02876v1.pdf","comment":"ACL2024 Findings"},{"id":"http://arxiv.org/abs/2406.02864v1","updated":"2024-06-05T02:26:14Z","published":"2024-06-05T02:26:14Z","title":"NUMCoT: Numerals and Units of Measurement in Chain-of-Thought Reasoning\n  using Large Language Models","summary":"  Numeral systems and units of measurement are two conjoined topics in\nactivities of human beings and have mutual effects with the languages\nexpressing them. Currently, the evaluation of Large Language Models (LLMs)\noften involves mathematical reasoning, yet little attention is given to how\nminor changes in numbers or units can drastically alter the complexity of\nproblems and the performance of LLMs. In this paper, we scrutinize existing\nLLMs on processing of numerals and units of measurement by constructing\ndatasets with perturbations. We first anatomize the reasoning of math word\nproblems to different sub-procedures like numeral conversions from language to\nnumbers and measurement conversions based on units. Then we further annotate\nmath word problems from ancient Chinese arithmetic works which are challenging\nin numerals and units of measurement. Experiments on perturbed datasets\ndemonstrate that LLMs still encounter difficulties in handling numeral and\nmeasurement conversions.\n","authors":["Ancheng Xu","Minghuan Tan","Lei Wang","Min Yang","Ruifeng Xu"],"pdf_url":"https://arxiv.org/pdf/2406.02864v1.pdf","comment":"Findings of ACL 2024"},{"id":"http://arxiv.org/abs/2406.02863v1","updated":"2024-06-05T02:25:10Z","published":"2024-06-05T02:25:10Z","title":"LLM as a Scorer: The Impact of Output Order on Dialogue Evaluation","summary":"  This research investigates the effect of prompt design on dialogue evaluation\nusing large language models (LLMs). While LLMs are increasingly used for\nscoring various inputs, creating effective prompts for dialogue evaluation\nremains challenging due to model sensitivity and subjectivity in dialogue\nassessments. Our study experimented with different prompt structures, altering\nthe sequence of output instructions and including explanatory reasons. We found\nthat the order of presenting reasons and scores significantly influences LLMs'\nscoring, with a \"reason-first\" approach yielding more comprehensive\nevaluations. This insight is crucial for enhancing the accuracy and consistency\nof LLM-based evaluations.\n","authors":["Yi-Pei Chen","KuanChao Chu","Hideki Nakayama"],"pdf_url":"https://arxiv.org/pdf/2406.02863v1.pdf","comment":"Presented in AAAI 2024 Spring Symposium. The first two authors\n  contributed equally"},{"id":"http://arxiv.org/abs/2308.12490v2","updated":"2024-06-05T02:16:42Z","published":"2023-08-24T01:24:09Z","title":"MultiPA: A Multi-task Speech Pronunciation Assessment Model for Open\n  Response Scenarios","summary":"  Pronunciation assessment models designed for open response scenarios enable\nusers to practice language skills in a manner similar to real-life\ncommunication. However, previous open-response pronunciation assessment models\nhave predominantly focused on a single pronunciation task, such as\nsentence-level accuracy, rather than offering a comprehensive assessment in\nvarious aspects. We propose MultiPA, a Multitask Pronunciation Assessment model\nthat provides sentence-level accuracy, fluency, prosody, and word-level\naccuracy assessment for open responses. We examined the correlation between\ndifferent pronunciation tasks and showed the benefits of multi-task learning.\nOur model reached the state-of-the-art performance on existing in-domain data\nsets and effectively generalized to an out-of-domain dataset that we newly\ncollected. The experimental results demonstrate the practical utility of our\nmodel in real-world applications.\n","authors":["Yu-Wen Chen","Zhou Yu","Julia Hirschberg"],"pdf_url":"https://arxiv.org/pdf/2308.12490v2.pdf","comment":"INTERSPEECH 2024"}],"Computer Vision and Pattern Recognition":[{"id":"http://arxiv.org/abs/2307.03175v2","updated":"2024-06-05T17:59:01Z","published":"2023-07-06T17:55:28Z","title":"Push Past Green: Learning to Look Behind Plant Foliage by Moving It","summary":"  Autonomous agriculture applications (e.g., inspection, phenotyping, plucking\nfruits) require manipulating the plant foliage to look behind the leaves and\nthe branches. Partial visibility, extreme clutter, thin structures, and unknown\ngeometry and dynamics for plants make such manipulation challenging. We tackle\nthese challenges through data-driven methods. We use self-supervision to train\nSRPNet, a neural network that predicts what space is revealed on execution of a\ncandidate action on a given plant. We use SRPNet with the cross-entropy method\nto predict actions that are effective at revealing space beneath plant foliage.\nFurthermore, as SRPNet does not just predict how much space is revealed but\nalso where it is revealed, we can execute a sequence of actions that\nincrementally reveal more and more space beneath the plant foliage. We\nexperiment with a synthetic (vines) and a real plant (Dracaena) on a physical\ntest-bed across 5 settings including 2 settings that test generalization to\nnovel plant configurations. Our experiments reveal the effectiveness of our\noverall method, PPG, over a competitive hand-crafted exploration method, and\nthe effectiveness of SRPNet over a hand-crafted dynamics model and relevant\nablations.\n","authors":["Xiaoyu Zhang","Saurabh Gupta"],"pdf_url":"https://arxiv.org/pdf/2307.03175v2.pdf","comment":"Accepted by Conference on Robot Learning (CoRL) 2023. for project\n  website with video, see https://sites.google.com/view/pushpastgreen/"},{"id":"http://arxiv.org/abs/2402.08552v2","updated":"2024-06-05T17:36:47Z","published":"2024-02-13T15:55:41Z","title":"Confronting Reward Overoptimization for Diffusion Models: A Perspective\n  of Inductive and Primacy Biases","summary":"  Bridging the gap between diffusion models and human preferences is crucial\nfor their integration into practical generative workflows. While optimizing\ndownstream reward models has emerged as a promising alignment strategy,\nconcerns arise regarding the risk of excessive optimization with learned reward\nmodels, which potentially compromises ground-truth performance. In this work,\nwe confront the reward overoptimization problem in diffusion model alignment\nthrough the lenses of both inductive and primacy biases. We first identify a\nmismatch between current methods and the temporal inductive bias inherent in\nthe multi-step denoising process of diffusion models, as a potential source of\nreward overoptimization. Then, we surprisingly discover that dormant neurons in\nour critic model act as a regularization against reward overoptimization while\nactive neurons reflect primacy bias. Motivated by these observations, we\npropose Temporal Diffusion Policy Optimization with critic active neuron Reset\n(TDPO-R), a policy gradient algorithm that exploits the temporal inductive bias\nof diffusion models and mitigates the primacy bias stemming from active\nneurons. Empirical results demonstrate the superior efficacy of our methods in\nmitigating reward overoptimization. Code is avaliable at\nhttps://github.com/ZiyiZhang27/tdpo.\n","authors":["Ziyi Zhang","Sen Zhang","Yibing Zhan","Yong Luo","Yonggang Wen","Dacheng Tao"],"pdf_url":"https://arxiv.org/pdf/2402.08552v2.pdf","comment":"Accepted to ICML 2024"},{"id":"http://arxiv.org/abs/2402.17768v2","updated":"2024-06-05T17:33:56Z","published":"2024-02-27T18:59:18Z","title":"Diffusion Meets DAgger: Supercharging Eye-in-hand Imitation Learning","summary":"  A common failure mode for policies trained with imitation is compounding\nexecution errors at test time. When the learned policy encounters states that\nare not present in the expert demonstrations, the policy fails, leading to\ndegenerate behavior. The Dataset Aggregation, or DAgger approach to this\nproblem simply collects more data to cover these failure states. However, in\npractice, this is often prohibitively expensive. In this work, we propose\nDiffusion Meets DAgger (DMD), a method to reap the benefits of DAgger without\nthe cost for eye-in-hand imitation learning problems. Instead of collecting new\nsamples to cover out-of-distribution states, DMD uses recent advances in\ndiffusion models to synthesize these samples. This leads to robust performance\nfrom few demonstrations. We compare DMD against behavior cloning baseline\nacross four tasks: pushing, stacking, pouring, and shirt hanging. In pushing,\nDMD achieves 80% success rate with as few as 8 expert demonstrations, where\nnaive behavior cloning reaches only 20%. In stacking, DMD succeeds on average\n92% of the time across 5 cups, versus 40% for BC. When pouring coffee beans,\nDMD transfers to another cup successfully 80% of the time. Finally, DMD attains\n90% success rate for hanging shirt on a clothing rack.\n","authors":["Xiaoyu Zhang","Matthew Chang","Pranav Kumar","Saurabh Gupta"],"pdf_url":"https://arxiv.org/pdf/2402.17768v2.pdf","comment":"Accepted by Robotics: Science and Systems (RSS) 2024. project website\n  with video, see https://sites.google.com/view/diffusion-meets-dagger"},{"id":"http://arxiv.org/abs/2406.03478v1","updated":"2024-06-05T17:32:22Z","published":"2024-06-05T17:32:22Z","title":"Convolutional Neural Networks and Vision Transformers for Fashion MNIST\n  Classification: A Literature Review","summary":"  Our review explores the comparative analysis between Convolutional Neural\nNetworks (CNNs) and Vision Transformers (ViTs) in the domain of image\nclassification, with a particular focus on clothing classification within the\ne-commerce sector. Utilizing the Fashion MNIST dataset, we delve into the\nunique attributes of CNNs and ViTs. While CNNs have long been the cornerstone\nof image classification, ViTs introduce an innovative self-attention mechanism\nenabling nuanced weighting of different input data components. Historically,\ntransformers have primarily been associated with Natural Language Processing\n(NLP) tasks. Through a comprehensive examination of existing literature, our\naim is to unveil the distinctions between ViTs and CNNs in the context of image\nclassification. Our analysis meticulously scrutinizes state-of-the-art\nmethodologies employing both architectures, striving to identify the factors\ninfluencing their performance. These factors encompass dataset characteristics,\nimage dimensions, the number of target classes, hardware infrastructure, and\nthe specific architectures along with their respective top results. Our key\ngoal is to determine the most appropriate architecture between ViT and CNN for\nclassifying images in the Fashion MNIST dataset within the e-commerce industry,\nwhile taking into account specific conditions and needs. We highlight the\nimportance of combining these two architectures with different forms to enhance\noverall performance. By uniting these architectures, we can take advantage of\ntheir unique strengths, which may lead to more precise and reliable models for\ne-commerce applications. CNNs are skilled at recognizing local patterns, while\nViTs are effective at grasping overall context, making their combination a\npromising strategy for boosting image classification performance.\n","authors":["Sonia Bbouzidi","Ghazala Hcini","Imen Jdey","Fadoua Drira"],"pdf_url":"https://arxiv.org/pdf/2406.03478v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.03474v1","updated":"2024-06-05T17:25:46Z","published":"2024-06-05T17:25:46Z","title":"AD-H: Autonomous Driving with Hierarchical Agents","summary":"  Due to the impressive capabilities of multimodal large language models\n(MLLMs), recent works have focused on employing MLLM-based agents for\nautonomous driving in large-scale and dynamic environments. However, prevalent\napproaches often directly translate high-level instructions into low-level\nvehicle control signals, which deviates from the inherent language generation\nparadigm of MLLMs and fails to fully harness their emergent powers. As a\nresult, the generalizability of these methods is highly restricted by\nautonomous driving datasets used during fine-tuning. To tackle this challenge,\nwe propose to connect high-level instructions and low-level control signals\nwith mid-level language-driven commands, which are more fine-grained than\nhigh-level instructions but more universal and explainable than control\nsignals, and thus can effectively bridge the gap in between. We implement this\nidea through a hierarchical multi-agent driving system named AD-H, including a\nMLLM planner for high-level reasoning and a lightweight controller for\nlow-level execution. The hierarchical design liberates the MLLM from low-level\ncontrol signal decoding and therefore fully releases their emergent capability\nin high-level perception, reasoning, and planning. We build a new dataset with\naction hierarchy annotations. Comprehensive closed-loop evaluations demonstrate\nseveral key advantages of our proposed AD-H system. First, AD-H can notably\noutperform state-of-the-art methods in achieving exceptional driving\nperformance, even exhibiting self-correction capabilities during vehicle\noperation, a scenario not encountered in the training dataset. Second, AD-H\ndemonstrates superior generalization under long-horizon instructions and novel\nenvironmental conditions, significantly surpassing current state-of-the-art\nmethods. We will make our data and code publicly accessible at\nhttps://github.com/zhangzaibin/AD-H\n","authors":["Zaibin Zhang","Shiyu Tang","Yuanhang Zhang","Talas Fu","Yifan Wang","Yang Liu","Dong Wang","Jing Shao","Lijun Wang","Huchuan Lu"],"pdf_url":"https://arxiv.org/pdf/2406.03474v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2209.01072v3","updated":"2024-06-05T17:12:32Z","published":"2022-09-02T14:07:25Z","title":"Fiducial Tag Localization on a 3D LiDAR Prior Map","summary":"  The LiDAR fiducial tag, akin to the well-known AprilTag used in camera\napplications, serves as a convenient resource to impart artificial features to\nthe LiDAR sensor, facilitating robotics applications. Unfortunately, the\nexisting LiDAR fiducial tag localization methods do not apply to 3D LiDAR maps\nwhile resolving this problem is beneficial to LiDAR-based relocalization and\nnavigation. In this paper, we develop a novel approach to directly localize\nfiducial tags on a 3D LiDAR prior map, returning the tag poses (labeled by ID\nnumber) and vertex locations (labeled by index) w.r.t. the global coordinate\nsystem of the map. In particular, considering that fiducial tags are thin sheet\nobjects indistinguishable from the attached planes, we design a new pipeline\nthat gradually analyzes the 3D point cloud of the map from the intensity and\ngeometry perspectives, extracting potential tag-containing point clusters.\nThen, we introduce an intermediate-plane-based method to further check if each\npotential cluster has a tag and compute the vertex locations and tag pose if\nfound. We conduct both qualitative and quantitative experiments to demonstrate\nthat our approach is the first method applicable to localize tags on a 3D LiDAR\nmap while achieving better accuracy compared to previous methods. The\nopen-source implementation of this work is available at:\nhttps://github.com/York-SDCNLab/Marker-Detection-General.\n","authors":["Yibo Liu","Jinjun Shan","Hunter Schofield"],"pdf_url":"https://arxiv.org/pdf/2209.01072v3.pdf","comment":"6 pages"},{"id":"http://arxiv.org/abs/2406.03461v1","updated":"2024-06-05T17:09:51Z","published":"2024-06-05T17:09:51Z","title":"Polarization Wavefront Lidar: Learning Large Scene Reconstruction from\n  Polarized Wavefronts","summary":"  Lidar has become a cornerstone sensing modality for 3D vision, especially for\nlarge outdoor scenarios and autonomous driving. Conventional lidar sensors are\ncapable of providing centimeter-accurate distance information by emitting laser\npulses into a scene and measuring the time-of-flight (ToF) of the reflection.\nHowever, the polarization of the received light that depends on the surface\norientation and material properties is usually not considered. As such, the\npolarization modality has the potential to improve scene reconstruction beyond\ndistance measurements. In this work, we introduce a novel long-range\npolarization wavefront lidar sensor (PolLidar) that modulates the polarization\nof the emitted and received light. Departing from conventional lidar sensors,\nPolLidar allows access to the raw time-resolved polarimetric wavefronts. We\nleverage polarimetric wavefronts to estimate normals, distance, and material\nproperties in outdoor scenarios with a novel learned reconstruction method. To\ntrain and evaluate the method, we introduce a simulated and real-world\nlong-range dataset with paired raw lidar data, ground truth distance, and\nnormal maps. We find that the proposed method improves normal and distance\nreconstruction by 53\\% mean angular error and 41\\% mean absolute error compared\nto existing shape-from-polarization (SfP) and ToF methods. Code and data are\nopen-sourced at https://light.princeton.edu/pollidar.\n","authors":["Dominik Scheuble","Chenyang Lei","Seung-Hwan Baek","Mario Bijelic","Felix Heide"],"pdf_url":"https://arxiv.org/pdf/2406.03461v1.pdf","comment":"Accepted at CVPR 2024; Project Website:\n  https://light.princeton.edu/publication/pollidar"},{"id":"http://arxiv.org/abs/2406.03459v1","updated":"2024-06-05T17:07:24Z","published":"2024-06-05T17:07:24Z","title":"LW-DETR: A Transformer Replacement to YOLO for Real-Time Detection","summary":"  In this paper, we present a light-weight detection transformer, LW-DETR,\nwhich outperforms YOLOs for real-time object detection. The architecture is a\nsimple stack of a ViT encoder, a projector, and a shallow DETR decoder. Our\napproach leverages recent advanced techniques, such as training-effective\ntechniques, e.g., improved loss and pretraining, and interleaved window and\nglobal attentions for reducing the ViT encoder complexity. We improve the ViT\nencoder by aggregating multi-level feature maps, and the intermediate and final\nfeature maps in the ViT encoder, forming richer feature maps, and introduce\nwindow-major feature map organization for improving the efficiency of\ninterleaved attention computation. Experimental results demonstrate that the\nproposed approach is superior over existing real-time detectors, e.g., YOLO and\nits variants, on COCO and other benchmark datasets. Code and models are\navailable at (https://github.com/Atten4Vis/LW-DETR).\n","authors":["Qiang Chen","Xiangbo Su","Xinyu Zhang","Jian Wang","Jiahui Chen","Yunpeng Shen","Chuchu Han","Ziliang Chen","Weixiang Xu","Fanrong Li","Shan Zhang","Kun Yao","Errui Ding","Gang Zhang","Jingdong Wang"],"pdf_url":"https://arxiv.org/pdf/2406.03459v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.00758v2","updated":"2024-06-05T17:05:55Z","published":"2024-06-02T14:22:09Z","title":"Once-for-All: Controllable Generative Image Compression with Dynamic\n  Granularity Adaption","summary":"  Although recent generative image compression methods have demonstrated\nimpressive potential in optimizing the rate-distortion-perception trade-off,\nthey still face the critical challenge of flexible rate adaption to diverse\ncompression necessities and scenarios. To overcome this challenge, this paper\nproposes a Controllable Generative Image Compression framework, Control-GIC,\nthe first capable of fine-grained bitrate adaption across a broad spectrum\nwhile ensuring high-fidelity and generality compression. We base Control-GIC on\na VQGAN framework representing an image as a sequence of variable-length codes\n(i.e. VQ-indices), which can be losslessly compressed and exhibits a direct\npositive correlation with the bitrates. Therefore, drawing inspiration from the\nclassical coding principle, we naturally correlate the information density of\nlocal image patches with their granular representations, to achieve dynamic\nadjustment of the code quantity following different granularity decisions. This\nimplies we can flexibly determine a proper allocation of granularity for the\npatches to acquire desirable compression rates. We further develop a\nprobabilistic conditional decoder that can trace back to historic encoded\nmulti-granularity representations according to transmitted codes, and then\nreconstruct hierarchical granular features in the formalization of conditional\nprobability, enabling more informative aggregation to improve reconstruction\nrealism. Our experiments show that Control-GIC allows highly flexible and\ncontrollable bitrate adaption and even once compression on an entire dataset to\nfulfill constrained bitrate conditions. Experimental results demonstrate its\nsuperior performance over recent state-of-the-art methods.\n","authors":["Anqi Li","Yuxi Liu","Huihui Bai","Feng Li","Runmin Cong","Meng Wang","Yao Zhao"],"pdf_url":"https://arxiv.org/pdf/2406.00758v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.00092v2","updated":"2024-06-05T17:03:02Z","published":"2023-11-30T11:01:37Z","title":"Mixture of Gaussian-distributed Prototypes with Generative Modelling for\n  Interpretable and Trustworthy Image Recognition","summary":"  Prototypical-part methods, e.g., ProtoPNet, enhance interpretability in image\nrecognition by linking predictions to training prototypes, thereby offering\nintuitive insights into their decision-making. Existing methods, which rely on\na point-based learning of prototypes, typically face two critical issues: 1)\nthe learned prototypes have limited representation power and are not suitable\nto detect Out-of-Distribution (OoD) inputs, reducing their decision\ntrustworthiness; and 2) the necessary projection of the learned prototypes back\ninto the space of training images causes a drastic degradation in the\npredictive performance. Furthermore, current prototype learning adopts an\naggressive approach that considers only the most active object parts during\ntraining, while overlooking sub-salient object regions which still hold crucial\nclassification information. In this paper, we present a new generative paradigm\nto learn prototype distributions, termed as Mixture of Gaussian-distributed\nPrototypes (MGProto). The distribution of prototypes from MGProto enables both\ninterpretable image classification and trustworthy recognition of OoD inputs.\nThe optimisation of MGProto naturally projects the learned prototype\ndistributions back into the training image space, thereby addressing the\nperformance degradation caused by prototype projection. Additionally, we\ndevelop a novel and effective prototype mining strategy that considers not only\nthe most active but also sub-salient object parts. To promote model\ncompactness, we further propose to prune MGProto by removing prototypes with\nlow importance priors. Experiments on CUB-200-2011, Stanford Cars, Stanford\nDogs, and Oxford-IIIT Pets datasets show that MGProto achieves state-of-the-art\nimage recognition and OoD detection performances, while providing encouraging\ninterpretability results.\n","authors":["Chong Wang","Yuanhong Chen","Fengbei Liu","Yuyuan Liu","Davis James McCarthy","Helen Frazer","Gustavo Carneiro"],"pdf_url":"https://arxiv.org/pdf/2312.00092v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.03447v1","updated":"2024-06-05T16:44:06Z","published":"2024-06-05T16:44:06Z","title":"FILS: Self-Supervised Video Feature Prediction In Semantic Language\n  Space","summary":"  This paper demonstrates a self-supervised approach for learning semantic\nvideo representations. Recent vision studies show that a masking strategy for\nvision and natural language supervision has contributed to developing\ntransferable visual pretraining. Our goal is to achieve a more semantic video\nrepresentation by leveraging the text related to the video content during the\npretraining in a fully self-supervised manner. To this end, we present FILS, a\nnovel self-supervised video Feature prediction In semantic Language Space\n(FILS). The vision model can capture valuable structured information by\ncorrectly predicting masked feature semantics in language space. It is learned\nusing a patch-wise video-text contrastive strategy, in which the text\nrepresentations act as prototypes for transforming vision features into a\nlanguage space, which are then used as targets for semantically meaningful\nfeature prediction using our masked encoder-decoder structure. FILS\ndemonstrates remarkable transferability on downstream action recognition tasks,\nachieving state-of-the-art on challenging egocentric datasets, like\nEpic-Kitchens, Something-SomethingV2, Charades-Ego, and EGTEA, using ViT-Base.\nOur efficient method requires less computation and smaller batches compared to\nprevious works.\n","authors":["Mona Ahmadian","Frank Guerin","Andrew Gilbert"],"pdf_url":"https://arxiv.org/pdf/2406.03447v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.03439v1","updated":"2024-06-05T16:34:12Z","published":"2024-06-05T16:34:12Z","title":"Text-to-Events: Synthetic Event Camera Streams from Conditional Text\n  Input","summary":"  Event cameras are advantageous for tasks that require vision sensors with\nlow-latency and sparse output responses. However, the development of deep\nnetwork algorithms using event cameras has been slow because of the lack of\nlarge labelled event camera datasets for network training. This paper reports a\nmethod for creating new labelled event datasets by using a text-to-X model,\nwhere X is one or multiple output modalities, in the case of this work, events.\nOur proposed text-to-events model produces synthetic event frames directly from\ntext prompts. It uses an autoencoder which is trained to produce sparse event\nframes representing event camera outputs. By combining the pretrained\nautoencoder with a diffusion model architecture, the new text-to-events model\nis able to generate smooth synthetic event streams of moving objects. The\nautoencoder was first trained on an event camera dataset of diverse scenes. In\nthe combined training with the diffusion model, the DVS gesture dataset was\nused. We demonstrate that the model can generate realistic event sequences of\nhuman gestures prompted by different text statements. The classification\naccuracy of the generated sequences, using a classifier trained on the real\ndataset, ranges between 42% to 92%, depending on the gesture group. The results\ndemonstrate the capability of this method in synthesizing event datasets.\n","authors":["Joachim Ott","Zuowen Wang","Shih-Chii Liu"],"pdf_url":"https://arxiv.org/pdf/2406.03439v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.03431v1","updated":"2024-06-05T16:29:13Z","published":"2024-06-05T16:29:13Z","title":"CattleFace-RGBT: RGB-T Cattle Facial Landmark Benchmark","summary":"  To address this challenge, we introduce CattleFace-RGBT, a RGB-T Cattle\nFacial Landmark dataset consisting of 2,300 RGB-T image pairs, a total of 4,600\nimages. Creating a landmark dataset is time-consuming, but AI-assisted\nannotation can help. However, applying AI to thermal images is challenging due\nto suboptimal results from direct thermal training and infeasible RGB-thermal\nalignment due to different camera views. Therefore, we opt to transfer models\ntrained on RGB to thermal images and refine them using our AI-assisted\nannotation tool following a semi-automatic annotation approach. Accurately\nlocalizing facial key points on both RGB and thermal images enables us to not\nonly discern the cattle's respiratory signs but also measure temperatures to\nassess the animal's thermal state. To the best of our knowledge, this is the\nfirst dataset for the cattle facial landmark on RGB-T images. We conduct\nbenchmarking of the CattleFace-RGBT dataset across various backbone\narchitectures, with the objective of establishing baselines for future\nresearch, analysis, and comparison. The dataset and models are at\nhttps://github.com/UARK-AICV/CattleFace-RGBT-benchmark\n","authors":["Ethan Coffman","Reagan Clark","Nhat-Tan Bui","Trong Thang Pham","Beth Kegley","Jeremy G. Powell","Jiangchao Zhao","Ngan Le"],"pdf_url":"https://arxiv.org/pdf/2406.03431v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.03430v1","updated":"2024-06-05T16:29:03Z","published":"2024-06-05T16:29:03Z","title":"Computation-Efficient Era: A Comprehensive Survey of State Space Models\n  in Medical Image Analysis","summary":"  Sequence modeling plays a vital role across various domains, with recurrent\nneural networks being historically the predominant method of performing these\ntasks. However, the emergence of transformers has altered this paradigm due to\ntheir superior performance. Built upon these advances, transformers have\nconjoined CNNs as two leading foundational models for learning visual\nrepresentations. However, transformers are hindered by the $\\mathcal{O}(N^2)$\ncomplexity of their attention mechanisms, while CNNs lack global receptive\nfields and dynamic weight allocation. State Space Models (SSMs), specifically\nthe \\textit{\\textbf{Mamba}} model with selection mechanisms and hardware-aware\narchitecture, have garnered immense interest lately in sequential modeling and\nvisual representation learning, challenging the dominance of transformers by\nproviding infinite context lengths and offering substantial efficiency\nmaintaining linear complexity in the input sequence. Capitalizing on the\nadvances in computer vision, medical imaging has heralded a new epoch with\nMamba models. Intending to help researchers navigate the surge, this survey\nseeks to offer an encyclopedic review of Mamba models in medical imaging.\nSpecifically, we start with a comprehensive theoretical review forming the\nbasis of SSMs, including Mamba architecture and its alternatives for sequence\nmodeling paradigms in this context. Next, we offer a structured classification\nof Mamba models in the medical field and introduce a diverse categorization\nscheme based on their application, imaging modalities, and targeted organs.\nFinally, we summarize key challenges, discuss different future research\ndirections of the SSMs in the medical domain, and propose several directions to\nfulfill the demands of this field. In addition, we have compiled the studies\ndiscussed in this paper along with their open-source implementations on our\nGitHub repository.\n","authors":["Moein Heidari","Sina Ghorbani Kolahi","Sanaz Karimijafarbigloo","Bobby Azad","Afshin Bozorgpour","Soheila Hatami","Reza Azad","Ali Diba","Ulas Bagci","Dorit Merhof","Ilker Hacihaliloglu"],"pdf_url":"https://arxiv.org/pdf/2406.03430v1.pdf","comment":"This is the first version of our survey, and the paper is currently\n  under review"},{"id":"http://arxiv.org/abs/2406.03421v1","updated":"2024-06-05T16:16:03Z","published":"2024-06-05T16:16:03Z","title":"Post-hoc Part-prototype Networks","summary":"  Post-hoc explainability methods such as Grad-CAM are popular because they do\nnot influence the performance of a trained model. However, they mainly reveal\n\"where\" a model looks at for a given input, fail to explain \"what\" the model\nlooks for (e.g., what is important to classify a bird image to a Scott\nOriole?). Existing part-prototype networks leverage part-prototypes (e.g.,\ncharacteristic Scott Oriole's wing and head) to answer both \"where\" and \"what\",\nbut often under-perform their black box counterparts in the accuracy.\nTherefore, a natural question is: can one construct a network that answers both\n\"where\" and \"what\" in a post-hoc manner to guarantee the model's performance?\nTo this end, we propose the first post-hoc part-prototype network via\ndecomposing the classification head of a trained model into a set of\ninterpretable part-prototypes. Concretely, we propose an unsupervised prototype\ndiscovery and refining strategy to obtain prototypes that can precisely\nreconstruct the classification head, yet being interpretable. Besides\nguaranteeing the performance, we show that our network offers more faithful\nexplanations qualitatively and yields even better part-prototypes\nquantitatively than prior part-prototype networks.\n","authors":["Andong Tan","Fengtao Zhou","Hao Chen"],"pdf_url":"https://arxiv.org/pdf/2406.03421v1.pdf","comment":"ICML 2024"},{"id":"http://arxiv.org/abs/2406.03417v1","updated":"2024-06-05T16:12:19Z","published":"2024-06-05T16:12:19Z","title":"CoFie: Learning Compact Neural Surface Representations with Coordinate\n  Fields","summary":"  This paper introduces CoFie, a novel local geometry-aware neural surface\nrepresentation. CoFie is motivated by the theoretical analysis of local SDFs\nwith quadratic approximation. We find that local shapes are highly compressive\nin an aligned coordinate frame defined by the normal and tangent directions of\nlocal shapes. Accordingly, we introduce Coordinate Field, which is a\ncomposition of coordinate frames of all local shapes. The Coordinate Field is\noptimizable and is used to transform the local shapes from the world coordinate\nframe to the aligned shape coordinate frame. It largely reduces the complexity\nof local shapes and benefits the learning of MLP-based implicit\nrepresentations. Moreover, we introduce quadratic layers into the MLP to\nenhance expressiveness concerning local shape geometry. CoFie is a\ngeneralizable surface representation. It is trained on a curated set of 3D\nshapes and works on novel shape instances during testing. When using the same\namount of parameters with prior works, CoFie reduces the shape error by 48% and\n56% on novel instances of both training and unseen shape categories. Moreover,\nCoFie demonstrates comparable performance to prior works when using only 70%\nfewer parameters.\n","authors":["Hanwen Jiang","Haitao Yang","Georgios Pavlakos","Qixing Huang"],"pdf_url":"https://arxiv.org/pdf/2406.03417v1.pdf","comment":"Project page: https://hwjiang1510.github.io/CoFie/"},{"id":"http://arxiv.org/abs/2406.03413v1","updated":"2024-06-05T16:10:29Z","published":"2024-06-05T16:10:29Z","title":"UnWave-Net: Unrolled Wavelet Network for Compton Tomography Image\n  Reconstruction","summary":"  Computed tomography (CT) is a widely used medical imaging technique to scan\ninternal structures of a body, typically involving collimation and mechanical\nrotation. Compton scatter tomography (CST) presents an interesting alternative\nto conventional CT by leveraging Compton physics instead of collimation to\ngather information from multiple directions. While CST introduces new imaging\nopportunities with several advantages such as high sensitivity, compactness,\nand entirely fixed systems, image reconstruction remains an open problem due to\nthe mathematical challenges of CST modeling. In contrast, deep unrolling\nnetworks have demonstrated potential in CT image reconstruction, despite their\ncomputationally intensive nature. In this study, we investigate the efficiency\nof unrolling networks for CST image reconstruction. To address the important\ncomputational cost required for training, we propose UnWave-Net, a novel\nunrolled wavelet-based reconstruction network. This architecture includes a\nnon-local regularization term based on wavelets, which captures long-range\ndependencies within images and emphasizes the multi-scale components of the\nwavelet transform. We evaluate our approach using a CST of circular geometry\nwhich stays completely static during data acquisition, where UnWave-Net\nfacilitates image reconstruction in the absence of a specific reconstruction\nformula. Our method outperforms existing approaches and achieves\nstate-of-the-art performance in terms of SSIM and PSNR, and offers an improved\ncomputational efficiency compared to traditional unrolling networks.\n","authors":["Ishak Ayad","Cécilia Tarpau","Javier Cebeiro","Maï K. Nguyen"],"pdf_url":"https://arxiv.org/pdf/2406.03413v1.pdf","comment":"This paper has been early accepted by MICCAI 2024"},{"id":"http://arxiv.org/abs/2406.03411v1","updated":"2024-06-05T16:09:01Z","published":"2024-06-05T16:09:01Z","title":"Interactive Text-to-Image Retrieval with Large Language Models: A\n  Plug-and-Play Approach","summary":"  In this paper, we primarily address the issue of dialogue-form context query\nwithin the interactive text-to-image retrieval task. Our methodology, PlugIR,\nactively utilizes the general instruction-following capability of LLMs in two\nways. First, by reformulating the dialogue-form context, we eliminate the\nnecessity of fine-tuning a retrieval model on existing visual dialogue data,\nthereby enabling the use of any arbitrary black-box model. Second, we construct\nthe LLM questioner to generate non-redundant questions about the attributes of\nthe target image, based on the information of retrieval candidate images in the\ncurrent context. This approach mitigates the issues of noisiness and redundancy\nin the generated questions. Beyond our methodology, we propose a novel\nevaluation metric, Best log Rank Integral (BRI), for a comprehensive assessment\nof the interactive retrieval system. PlugIR demonstrates superior performance\ncompared to both zero-shot and fine-tuned baselines in various benchmarks.\nAdditionally, the two methodologies comprising PlugIR can be flexibly applied\ntogether or separately in various situations. Our codes are available at\nhttps://github.com/Saehyung-Lee/PlugIR.\n","authors":["Saehyung Lee","Sangwon Yu","Junsung Park","Jihun Yi","Sungroh Yoon"],"pdf_url":"https://arxiv.org/pdf/2406.03411v1.pdf","comment":"To appear in ACL 2024 Main"},{"id":"http://arxiv.org/abs/2312.10531v2","updated":"2024-06-05T15:51:53Z","published":"2023-12-16T20:10:23Z","title":"How to Train Neural Field Representations: A Comprehensive Study and\n  Benchmark","summary":"  Neural fields (NeFs) have recently emerged as a versatile method for modeling\nsignals of various modalities, including images, shapes, and scenes.\nSubsequently, a number of works have explored the use of NeFs as\nrepresentations for downstream tasks, e.g. classifying an image based on the\nparameters of a NeF that has been fit to it. However, the impact of the NeF\nhyperparameters on their quality as downstream representation is scarcely\nunderstood and remains largely unexplored. This is in part caused by the large\namount of time required to fit datasets of neural fields.\n  In this work, we propose a JAX-based library that leverages parallelization\nto enable fast optimization of large-scale NeF datasets, resulting in a\nsignificant speed-up. With this library, we perform a comprehensive study that\ninvestigates the effects of different hyperparameters on fitting NeFs for\ndownstream tasks. In particular, we explore the use of a shared initialization,\nthe effects of overtraining, and the expressiveness of the network\narchitectures used. Our study provides valuable insights on how to train NeFs\nand offers guidance for optimizing their effectiveness in downstream\napplications. Finally, based on the proposed library and our analysis, we\npropose Neural Field Arena, a benchmark consisting of neural field variants of\npopular vision datasets, including MNIST, CIFAR, variants of ImageNet, and\nShapeNetv2. Our library and the Neural Field Arena will be open-sourced to\nintroduce standardized benchmarking and promote further research on neural\nfields.\n","authors":["Samuele Papa","Riccardo Valperga","David Knigge","Miltiadis Kofinas","Phillip Lippe","Jan-Jakob Sonke","Efstratios Gavves"],"pdf_url":"https://arxiv.org/pdf/2312.10531v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.03394v1","updated":"2024-06-05T15:44:54Z","published":"2024-06-05T15:44:54Z","title":"Gaussian Representation for Deformable Image Registration","summary":"  Deformable image registration (DIR) is a fundamental task in radiotherapy,\nwith existing methods often struggling to balance computational efficiency,\nregistration accuracy, and speed effectively. We introduce a novel DIR approach\nemploying parametric 3D Gaussian control points achieving a better tradeoff. It\nprovides an explicit and flexible representation for spatial deformation fields\nbetween 3D volumetric medical images, producing a displacement vector field\n(DVF) across all volumetric positions. The movement of individual voxels is\nderived using linear blend skinning (LBS) through localized interpolation of\ntransformations associated with neighboring Gaussians. This interpolation\nstrategy not only simplifies the determination of voxel motions but also acts\nas an effective regularization technique. Our approach incorporates a unified\noptimization process through backpropagation, enabling iterative learning of\nboth the parameters of the 3D Gaussians and their transformations.\nAdditionally, the density of Gaussians is adjusted adaptively during the\nlearning phase to accommodate varying degrees of motion complexity. We\nvalidated our approach on the 4D-CT lung DIR-Lab and cardiac ACDC datasets,\nachieving an average target registration error (TRE) of 1.06 mm within a\nmuch-improved processing time of 2.43 seconds for the DIR-Lab dataset over\nexisting methods, demonstrating significant advancements in both accuracy and\nefficiency.\n","authors":["Jihe Li","Fabian Zhang","Xia Li","Tianhao Zhang","Ye Zhang","Joachim Buhmann"],"pdf_url":"https://arxiv.org/pdf/2406.03394v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.03388v1","updated":"2024-06-05T15:38:02Z","published":"2024-06-05T15:38:02Z","title":"SelfReDepth: Self-Supervised Real-Time Depth Restoration for\n  Consumer-Grade Sensors","summary":"  Depth maps produced by consumer-grade sensors suffer from inaccurate\nmeasurements and missing data from either system or scene-specific sources.\nData-driven denoising algorithms can mitigate such problems. However, they\nrequire vast amounts of ground truth depth data. Recent research has tackled\nthis limitation using self-supervised learning techniques, but it requires\nmultiple RGB-D sensors. Moreover, most existing approaches focus on denoising\nsingle isolated depth maps or specific subjects of interest, highlighting a\nneed for methods to effectively denoise depth maps in real-time dynamic\nenvironments. This paper extends state-of-the-art approaches for\ndepth-denoising commodity depth devices, proposing SelfReDepth, a\nself-supervised deep learning technique for depth restoration, via denoising\nand hole-filling by inpainting full-depth maps captured with RGB-D sensors. The\nalgorithm targets depth data in video streams, utilizing multiple sequential\ndepth frames coupled with color data to achieve high-quality depth videos with\ntemporal coherence. Finally, SelfReDepth is designed to be compatible with\nvarious RGB-D sensors and usable in real-time scenarios as a pre-processing\nstep before applying other depth-dependent algorithms. Our results demonstrate\nour approach's real-time performance on real-world datasets. They show that it\noutperforms state-of-the-art denoising and restoration performance at over\n30fps on Commercial Depth Cameras, with potential benefits for augmented and\nmixed-reality applications.\n","authors":["Alexandre Duarte","Francisco Fernandes","João M. Pereira","Catarina Moreira","Jacinto C. Nascimento","Joaquim Jorge"],"pdf_url":"https://arxiv.org/pdf/2406.03388v1.pdf","comment":"13pp, 5 figures, 1 table"},{"id":"http://arxiv.org/abs/2406.01194v2","updated":"2024-06-05T15:34:47Z","published":"2024-06-03T10:57:18Z","title":"AFF-ttention! Affordances and Attention models for Short-Term Object\n  Interaction Anticipation","summary":"  Short-Term object-interaction Anticipation consists of detecting the location\nof the next-active objects, the noun and verb categories of the interaction,\nand the time to contact from the observation of egocentric video. This ability\nis fundamental for wearable assistants or human robot interaction to understand\nthe user goals, but there is still room for improvement to perform STA in a\nprecise and reliable way. In this work, we improve the performance of STA\npredictions with two contributions: 1. We propose STAformer, a novel\nattention-based architecture integrating frame guided temporal pooling, dual\nimage-video attention, and multiscale feature fusion to support STA predictions\nfrom an image-input video pair. 2. We introduce two novel modules to ground STA\npredictions on human behavior by modeling affordances.First, we integrate an\nenvironment affordance model which acts as a persistent memory of interactions\nthat can take place in a given physical scene. Second, we predict interaction\nhotspots from the observation of hands and object trajectories, increasing\nconfidence in STA predictions localized around the hotspot. Our results show\nsignificant relative Overall Top-5 mAP improvements of up to +45% on Ego4D and\n+42% on a novel set of curated EPIC-Kitchens STA labels. We will release the\ncode, annotations, and pre extracted affordances on Ego4D and EPIC- Kitchens to\nencourage future research in this area.\n","authors":["Lorenzo Mur-Labadia","Ruben Martinez-Cantin","Josechu Guerrero","Giovanni Maria Farinella","Antonino Furnari"],"pdf_url":"https://arxiv.org/pdf/2406.01194v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2210.04288v2","updated":"2024-06-05T15:33:23Z","published":"2022-10-09T15:42:36Z","title":"CoopHash: Cooperative Learning of Multipurpose Descriptor and\n  Contrastive Pair Generator via Variational MCMC Teaching for Supervised Image\n  Hashing","summary":"  Leveraging supervised information can lead to superior retrieval performance\nin the image hashing domain but the performance degrades significantly without\nenough labeled data. One effective solution to boost performance is to employ\ngenerative models, such as Generative Adversarial Networks (GANs), to generate\nsynthetic data in an image hashing model. However, GAN-based methods are\ndifficult to train, which prevents the hashing approaches from jointly training\nthe generative models and the hash functions. This limitation results in\nsub-optimal retrieval performance. To overcome this limitation, we propose a\nnovel framework, the generative cooperative hashing network, which is based on\nenergy-based cooperative learning. This framework jointly learns a powerful\ngenerative representation of the data and a robust hash function via two\ncomponents: a top-down contrastive pair generator that synthesizes contrastive\nimages and a bottom-up multipurpose descriptor that simultaneously represents\nthe images from multiple perspectives, including probability density, hash\ncode, latent code, and category. The two components are jointly learned via a\nnovel likelihood-based cooperative learning scheme. We conduct experiments on\nseveral real-world datasets and show that the proposed method outperforms the\ncompeting hashing supervised methods, achieving up to 10\\% relative improvement\nover the current state-of-the-art supervised hashing methods, and exhibits a\nsignificantly better performance in out-of-distribution retrieval.\n","authors":["Khoa D. Doan","Jianwen Xie","Yaxuan Zhu","Yang Zhao","Ping Li"],"pdf_url":"https://arxiv.org/pdf/2210.04288v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.12336v2","updated":"2024-06-05T15:32:03Z","published":"2024-02-19T18:09:48Z","title":"Robust CLIP: Unsupervised Adversarial Fine-Tuning of Vision Embeddings\n  for Robust Large Vision-Language Models","summary":"  Multi-modal foundation models like OpenFlamingo, LLaVA, and GPT-4 are\nincreasingly used for various real-world tasks. Prior work has shown that these\nmodels are highly vulnerable to adversarial attacks on the vision modality.\nThese attacks can be leveraged to spread fake information or defraud users, and\nthus pose a significant risk, which makes the robustness of large multi-modal\nfoundation models a pressing problem. The CLIP model, or one of its variants,\nis used as a frozen vision encoder in many large vision-language models\n(LVLMs), e.g. LLaVA and OpenFlamingo. We propose an unsupervised adversarial\nfine-tuning scheme to obtain a robust CLIP vision encoder, which yields\nrobustness on all vision down-stream tasks (LVLMs, zero-shot classification)\nthat rely on CLIP. In particular, we show that stealth-attacks on users of\nLVLMs by a malicious third party providing manipulated images are no longer\npossible once one replaces the original CLIP model with our robust one. No\nretraining or fine-tuning of the down-stream LVLMs is required. The code and\nrobust models are available at https://github.com/chs20/RobustVLM\n","authors":["Christian Schlarmann","Naman Deep Singh","Francesco Croce","Matthias Hein"],"pdf_url":"https://arxiv.org/pdf/2402.12336v2.pdf","comment":"ICML 2024 Oral"},{"id":"http://arxiv.org/abs/2406.03359v1","updated":"2024-06-05T15:14:29Z","published":"2024-06-05T15:14:29Z","title":"SuperFormer: Volumetric Transformer Architectures for MRI\n  Super-Resolution","summary":"  This paper presents a novel framework for processing volumetric medical\ninformation using Visual Transformers (ViTs). First, We extend the\nstate-of-the-art Swin Transformer model to the 3D medical domain. Second, we\npropose a new approach for processing volumetric information and encoding\nposition in ViTs for 3D applications. We instantiate the proposed framework and\npresent SuperFormer, a volumetric transformer-based approach for Magnetic\nResonance Imaging (MRI) Super-Resolution. Our method leverages the 3D\ninformation of the MRI domain and uses a local self-attention mechanism with a\n3D relative positional encoding to recover anatomical details. In addition, our\napproach takes advantage of multi-domain information from volume and feature\ndomains and fuses them to reconstruct the High-Resolution MRI. We perform an\nextensive validation on the Human Connectome Project dataset and demonstrate\nthe superiority of volumetric transformers over 3D CNN-based methods. Our code\nand pretrained models are available at\nhttps://github.com/BCV-Uniandes/SuperFormer.\n","authors":["Cristhian Forigua","Maria Escobar","Pablo Arbelaez"],"pdf_url":"https://arxiv.org/pdf/2406.03359v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.03333v1","updated":"2024-06-05T14:49:14Z","published":"2024-06-05T14:49:14Z","title":"A Flexible Recursive Network for Video Stereo Matching Based on Residual\n  Estimation","summary":"  Due to the high similarity of disparity between consecutive frames in video\nsequences, the area where disparity changes is defined as the residual map,\nwhich can be calculated. Based on this, we propose RecSM, a network based on\nresidual estimation with a flexible recursive structure for video stereo\nmatching. The RecSM network accelerates stereo matching using a Multi-scale\nResidual Estimation Module (MREM), which employs the temporal context as a\nreference and rapidly calculates the disparity for the current frame by\ncomputing only the residual values between the current and previous frames. To\nfurther reduce the error of estimated disparities, we use the Disparity\nOptimization Module (DOM) and Temporal Attention Module (TAM) to enforce\nconstraints between each module, and together with MREM, form a flexible\nStackable Computation Structure (SCS), which allows for the design of different\nnumbers of SCS based on practical scenarios. Experimental results demonstrate\nthat with a stack count of 3, RecSM achieves a 4x speed improvement compared to\nACVNet, running at 0.054 seconds based on one NVIDIA RTX 2080TI GPU, with an\naccuracy decrease of only 0.7%. Code is available at\nhttps://github.com/Y0uchenZ/RecSM.\n","authors":["Youchen Zhao","Guorong Luo","Hua Zhong","Haixiong Li"],"pdf_url":"https://arxiv.org/pdf/2406.03333v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.03325v1","updated":"2024-06-05T14:38:30Z","published":"2024-06-05T14:38:30Z","title":"EngineBench: Flow Reconstruction in the Transparent Combustion Chamber\n  III Optical Engine","summary":"  We present EngineBench, the first machine learning (ML) oriented database to\nuse high quality experimental data for the study of turbulent flows inside\ncombustion machinery. Prior datasets for ML in fluid mechanics are synthetic or\nuse overly simplistic geometries. EngineBench is comprised of real-world\nparticle image velocimetry (PIV) data that captures the turbulent airflow\npatterns in a specially-designed optical engine. However, in PIV data from\ninternal flows, such as from engines, it is often challenging to achieve a full\nfield of view and large occlusions can be present. In order to design optimal\ncombustion systems, insight into the turbulent flows in these obscured areas is\nneeded, which can be provided via inpainting models. Here we propose a novel\ninpainting task using random edge gaps, a technique that emphasises realism by\nintroducing occlusions at random sizes and orientations at the edges of the PIV\nimages. We test five ML methods on random edge gaps using pixel-wise,\nvector-based, and multi-scale performance metrics. We find that UNet-based\nmodels are more accurate than the industry-norm non-parametric approach and the\ncontext encoder at this task on both small and large gap sizes. The dataset and\ninpainting task presented in this paper support the development of more\ngeneral-purpose pre-trained ML models for engine design problems. The method\ncomparisons allow for more informed selection of ML models for problems in\nexperimental flow diagnostics. All data and code are publicly available at\nhttps://eng.ox.ac.uk/tpsrg/research/enginebench/.\n","authors":["Samuel J. Baker","Michael A. Hobley","Isabel Scherl","Xiaohang Fang","Felix C. P. Leach","Martin H. Davy"],"pdf_url":"https://arxiv.org/pdf/2406.03325v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.03323v1","updated":"2024-06-05T14:36:33Z","published":"2024-06-05T14:36:33Z","title":"Comparative Benchmarking of Failure Detection Methods in Medical Image\n  Segmentation: Unveiling the Role of Confidence Aggregation","summary":"  Semantic segmentation is an essential component of medical image analysis\nresearch, with recent deep learning algorithms offering out-of-the-box\napplicability across diverse datasets. Despite these advancements, segmentation\nfailures remain a significant concern for real-world clinical applications,\nnecessitating reliable detection mechanisms. This paper introduces a\ncomprehensive benchmarking framework aimed at evaluating failure detection\nmethodologies within medical image segmentation. Through our analysis, we\nidentify the strengths and limitations of current failure detection metrics,\nadvocating for the risk-coverage analysis as a holistic evaluation approach.\nUtilizing a collective dataset comprising five public 3D medical image\ncollections, we assess the efficacy of various failure detection strategies\nunder realistic test-time distribution shifts. Our findings highlight the\nimportance of pixel confidence aggregation and we observe superior performance\nof the pairwise Dice score (Roy et al., 2019) between ensemble predictions,\npositioning it as a simple and robust baseline for failure detection in medical\nimage segmentation. To promote ongoing research, we make the benchmarking\nframework available to the community.\n","authors":["Maximilian Zenk","David Zimmerer","Fabian Isensee","Jeremias Traub","Tobias Norajitra","Paul F. Jäger","Klaus Maier-Hein"],"pdf_url":"https://arxiv.org/pdf/2406.03323v1.pdf","comment":"This work has been submitted for possible publication. Copyright may\n  be transferred without notice, after which this version may no longer be\n  accessible"},{"id":"http://arxiv.org/abs/2311.10093v4","updated":"2024-06-05T14:34:30Z","published":"2023-11-16T18:59:51Z","title":"The Chosen One: Consistent Characters in Text-to-Image Diffusion Models","summary":"  Recent advances in text-to-image generation models have unlocked vast\npotential for visual creativity. However, the users that use these models\nstruggle with the generation of consistent characters, a crucial aspect for\nnumerous real-world applications such as story visualization, game development,\nasset design, advertising, and more. Current methods typically rely on multiple\npre-existing images of the target character or involve labor-intensive manual\nprocesses. In this work, we propose a fully automated solution for consistent\ncharacter generation, with the sole input being a text prompt. We introduce an\niterative procedure that, at each stage, identifies a coherent set of images\nsharing a similar identity and extracts a more consistent identity from this\nset. Our quantitative analysis demonstrates that our method strikes a better\nbalance between prompt alignment and identity consistency compared to the\nbaseline methods, and these findings are reinforced by a user study. To\nconclude, we showcase several practical applications of our approach.\n","authors":["Omri Avrahami","Amir Hertz","Yael Vinker","Moab Arar","Shlomi Fruchter","Ohad Fried","Daniel Cohen-Or","Dani Lischinski"],"pdf_url":"https://arxiv.org/pdf/2311.10093v4.pdf","comment":"Accepted to SIGGRAPH 2024. Project page is available at\n  https://omriavrahami.com/the-chosen-one/"},{"id":"http://arxiv.org/abs/2406.03303v1","updated":"2024-06-05T14:13:38Z","published":"2024-06-05T14:13:38Z","title":"Learning Visual Prompts for Guiding the Attention of Vision Transformers","summary":"  Visual prompting infuses visual information into the input image to adapt\nmodels toward specific predictions and tasks. Recently, manually crafted\nmarkers such as red circles are shown to guide the model to attend to a target\nregion on the image. However, these markers only work on models trained with\ndata containing those markers. Moreover, finding these prompts requires\nguesswork or prior knowledge of the domain on which the model is trained. This\nwork circumvents manual design constraints by proposing to learn the visual\nprompts for guiding the attention of vision transformers. The learned visual\nprompt, added to any input image would redirect the attention of the\npre-trained vision transformer to its spatial location on the image.\nSpecifically, the prompt is learned in a self-supervised manner without\nrequiring annotations and without fine-tuning the vision transformer. Our\nexperiments demonstrate the effectiveness of the proposed optimization-based\nvisual prompting strategy across various pre-trained vision encoders.\n","authors":["Razieh Rezaei","Masoud Jalili Sabet","Jindong Gu","Daniel Rueckert","Philip Torr","Ashkan Khakzar"],"pdf_url":"https://arxiv.org/pdf/2406.03303v1.pdf","comment":"Short version (4-pages) accepted as a spotlight paper at T4V\n  workshop, CVPR 2024"},{"id":"http://arxiv.org/abs/2406.03298v1","updated":"2024-06-05T14:08:13Z","published":"2024-06-05T14:08:13Z","title":"L-PR: Exploiting LiDAR Fiducial Marker for Unordered Low Overlap\n  Multiview Point Cloud Registration","summary":"  Point cloud registration is a prerequisite for many applications in computer\nvision and robotics. Most existing methods focus on pairwise registration of\ntwo point clouds with high overlap. Although there have been some methods for\nlow overlap cases, they struggle in degraded scenarios. This paper introduces a\nnovel framework named L-PR, designed to register unordered low overlap\nmultiview point clouds leveraging LiDAR fiducial markers. We refer to them as\nLiDAR fiducial markers, but they are the same as the popular AprilTag and ArUco\nmarkers, thin sheets of paper that do not affect the 3D geometry of the\nenvironment. We first propose an improved adaptive threshold marker detection\nmethod to provide robust detection results when the viewpoints among point\nclouds change dramatically. Then, we formulate the unordered multiview point\ncloud registration problem as a maximum a-posteriori (MAP) problem and develop\na framework consisting of two levels of graphs to address it. The first-level\ngraph, constructed as a weighted graph, is designed to efficiently and\noptimally infer initial values of scan poses from the unordered set. The\nsecond-level graph is constructed as a factor graph. By globally optimizing the\nvariables on the graph, including scan poses, marker poses, and marker corner\npositions, we tackle the MAP problem. We conduct qualitative and quantitative\nexperiments to demonstrate that the proposed method exhibits superiority over\ncompetitors in four aspects: registration accuracy, instance reconstruction\nquality, localization accuracy, and robustness to the degraded scene. To\nbenefit the community, we open-source our method and dataset at\nhttps://github.com/yorklyb/LiDAR-SFM.\n","authors":["Yibo Liu","Jinjun Shan","Amaldev Haridevan","Shuo Zhang","Kejian Lin"],"pdf_url":"https://arxiv.org/pdf/2406.03298v1.pdf","comment":"8 pages"},{"id":"http://arxiv.org/abs/2406.01302v2","updated":"2024-06-05T14:05:16Z","published":"2024-06-03T13:10:13Z","title":"Pulmonary Embolism Mortality Prediction Using Multimodal Learning Based\n  on Computed Tomography Angiography and Clinical Data","summary":"  Purpose: Pulmonary embolism (PE) is a significant cause of mortality in the\nUnited States. The objective of this study is to implement deep learning (DL)\nmodels using Computed Tomography Pulmonary Angiography (CTPA), clinical data,\nand PE Severity Index (PESI) scores to predict PE mortality. Materials and\nMethods: 918 patients (median age 64 years, range 13-99 years, 52% female) with\n3,978 CTPAs were identified via retrospective review across three institutions.\nTo predict survival, an AI model was used to extract disease-related imaging\nfeatures from CTPAs. Imaging features and/or clinical variables were then\nincorporated into DL models to predict survival outcomes. Four models were\ndeveloped as follows: (1) using CTPA imaging features only; (2) using clinical\nvariables only; (3) multimodal, integrating both CTPA and clinical variables;\nand (4) multimodal fused with calculated PESI score. Performance and\ncontribution from each modality were evaluated using concordance index\n(c-index) and Net Reclassification Improvement, respectively. Performance was\ncompared to PESI predictions using the Wilcoxon signed-rank test. Kaplan-Meier\nanalysis was performed to stratify patients into high- and low-risk groups.\nAdditional factor-risk analysis was conducted to account for right ventricular\n(RV) dysfunction. Results: For both data sets, the PESI-fused and multimodal\nmodels achieved higher c-indices than PESI alone. Following stratification of\npatients into high- and low-risk groups by multimodal and PESI-fused models,\nmortality outcomes differed significantly (both p<0.001). A strong correlation\nwas found between high-risk grouping and RV dysfunction. Conclusions: Multiomic\nDL models incorporating CTPA features, clinical data, and PESI achieved higher\nc-indices than PESI alone for PE survival prediction.\n","authors":["Zhusi Zhong","Helen Zhang","Fayez H. Fayad","Andrew C. Lancaster","John Sollee","Shreyas Kulkarni","Cheng Ting Lin","Jie Li","Xinbo Gao","Scott Collins","Colin Greineder","Sun H. Ahn","Harrison X. Bai","Zhicheng Jiao","Michael K. Atalay"],"pdf_url":"https://arxiv.org/pdf/2406.01302v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.03293v1","updated":"2024-06-05T14:02:31Z","published":"2024-06-05T14:02:31Z","title":"Text-to-Image Rectified Flow as Plug-and-Play Priors","summary":"  Large-scale diffusion models have achieved remarkable performance in\ngenerative tasks. Beyond their initial training applications, these models have\nproven their ability to function as versatile plug-and-play priors. For\ninstance, 2D diffusion models can serve as loss functions to optimize 3D\nimplicit models. Rectified flow, a novel class of generative models, enforces a\nlinear progression from the source to the target distribution and has\ndemonstrated superior performance across various domains. Compared to\ndiffusion-based methods, rectified flow approaches surpass in terms of\ngeneration quality and efficiency, requiring fewer inference steps. In this\nwork, we present theoretical and experimental evidence demonstrating that\nrectified flow based methods offer similar functionalities to diffusion models\n- they can also serve as effective priors. Besides the generative capabilities\nof diffusion priors, motivated by the unique time-symmetry properties of\nrectified flow models, a variant of our method can additionally perform image\ninversion. Experimentally, our rectified flow-based priors outperform their\ndiffusion counterparts - the SDS and VSD losses - in text-to-3D generation. Our\nmethod also displays competitive performance in image inversion and editing.\n","authors":["Xiaofeng Yang","Cheng Chen","Xulei Yang","Fayao Liu","Guosheng Lin"],"pdf_url":"https://arxiv.org/pdf/2406.03293v1.pdf","comment":"Code: https://github.com/yangxiaofeng/rectified_flow_prior"},{"id":"http://arxiv.org/abs/2403.18715v2","updated":"2024-06-05T13:53:42Z","published":"2024-03-27T16:04:47Z","title":"Mitigating Hallucinations in Large Vision-Language Models with\n  Instruction Contrastive Decoding","summary":"  Large Vision-Language Models (LVLMs) are increasingly adept at generating\ncontextually detailed and coherent responses from visual inputs. However, their\napplication in multimodal decision-making and open-ended generation is hindered\nby a notable rate of hallucinations, where generated text inaccurately\nrepresents the visual contents. To address this issue, this paper introduces\nthe Instruction Contrastive Decoding (ICD) method, a novel approach designed to\nreduce hallucinations during LVLM inference. Our method is inspired by our\nobservation that what we call disturbance instructions significantly exacerbate\nhallucinations in multimodal fusion modules. ICD contrasts distributions from\nstandard and instruction disturbance, thereby increasing alignment uncertainty\nand effectively subtracting hallucinated concepts from the original\ndistribution. Through comprehensive experiments on discriminative benchmarks\n(POPE and MME) and a generative benchmark (LLaVa-Bench), we demonstrate that\nICD significantly mitigates both object-level and attribute-level\nhallucinations. Moreover, our method not only addresses hallucinations but also\nsignificantly enhances the general perception and recognition capabilities of\nLVLMs.\n","authors":["Xintong Wang","Jingheng Pan","Liang Ding","Chris Biemann"],"pdf_url":"https://arxiv.org/pdf/2403.18715v2.pdf","comment":"Accepted to Findings of ACL 2024"},{"id":"http://arxiv.org/abs/2406.03273v1","updated":"2024-06-05T13:52:45Z","published":"2024-06-05T13:52:45Z","title":"VWise: A novel benchmark for evaluating scene classification for\n  vehicular applications","summary":"  Current datasets for vehicular applications are mostly collected in North\nAmerica or Europe. Models trained or evaluated on these datasets might suffer\nfrom geographical bias when deployed in other regions. Specifically, for scene\nclassification, a highway in a Latin American country differs drastically from\nan Autobahn, for example, both in design and maintenance levels. We propose\nVWise, a novel benchmark for road-type classification and scene classification\ntasks, in addition to tasks focused on external contexts related to vehicular\napplications in LatAm. We collected over 520 video clips covering diverse urban\nand rural environments across Latin American countries, annotated with six\nclasses of road types. We also evaluated several state-of-the-art\nclassification models in baseline experiments, obtaining over 84% accuracy.\nWith this dataset, we aim to enhance research on vehicular tasks in Latin\nAmerica.\n","authors":["Pedro Azevedo","Emanuella Araújo","Gabriel Pierre","Willams de Lima Costa","João Marcelo Teixeira","Valter Ferreira","Roberto Jones","Veronica Teichrieb"],"pdf_url":"https://arxiv.org/pdf/2406.03273v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.03271v1","updated":"2024-06-05T13:50:29Z","published":"2024-06-05T13:50:29Z","title":"Image Copy-Move Forgery Detection and Localization Scheme: How to Avoid\n  Missed Detection and False Alarm","summary":"  Image copy-move is an operation that replaces one part of the image with\nanother part of the same image, which can be used for illegal purposes due to\nthe potential semantic changes. Recent studies have shown that keypoint-based\nalgorithms achieved excellent and robust localization performance even when\nsmall or smooth tampered areas were involved. However, when the input image is\nlow-resolution, most existing keypoint-based algorithms are difficult to\ngenerate sufficient keypoints, resulting in more missed detections. In\naddition, existing algorithms are usually unable to distinguish between Similar\nbut Genuine Objects (SGO) images and tampered images, resulting in more false\nalarms. This is mainly due to the lack of further verification of local\nhomography matrix in forgery localization stage. To tackle these problems, this\npaper firstly proposes an excessive keypoint extraction strategy to overcome\nmissed detection. Subsequently, a group matching algorithm is used to speed up\nthe matching of excessive keypoints. Finally, a new iterative forgery\nlocalization algorithm is introduced to quickly form pixel-level localization\nresults while ensuring a lower false alarm. Extensive experimental results show\nthat our scheme has superior performance than state-of-the-art algorithms in\novercoming missed detection and false alarm. Our code is available at\nhttps://github.com/LUZW1998/CMFDL.\n","authors":["Li Jiang","Zhaowei Lu","Yuebing Gao","Yifan Wang"],"pdf_url":"https://arxiv.org/pdf/2406.03271v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.01355v2","updated":"2024-06-05T13:42:25Z","published":"2024-02-02T12:22:41Z","title":"FindingEmo: An Image Dataset for Emotion Recognition in the Wild","summary":"  We introduce FindingEmo, a new image dataset containing annotations for 25k\nimages, specifically tailored to Emotion Recognition. Contrary to existing\ndatasets, it focuses on complex scenes depicting multiple people in various\nnaturalistic, social settings, with images being annotated as a whole, thereby\ngoing beyond the traditional focus on faces or single individuals. Annotated\ndimensions include Valence, Arousal and Emotion label, with annotations\ngathered using Prolific. Together with the annotations, we release the list of\nURLs pointing to the original images, as well as all associated source code.\n","authors":["Laurent Mertens","Elahe' Yargholi","Hans Op de Beeck","Jan Van den Stock","Joost Vennekens"],"pdf_url":"https://arxiv.org/pdf/2402.01355v2.pdf","comment":"33 pages, 21 figures, 12 tables"},{"id":"http://arxiv.org/abs/2406.03263v1","updated":"2024-06-05T13:41:09Z","published":"2024-06-05T13:41:09Z","title":"Deep Generative Models for Proton Zero Degree Calorimeter Simulations in\n  ALICE, CERN","summary":"  Simulating detector responses is a crucial part of understanding the\ninner-workings of particle collisions in the Large Hadron Collider at CERN. The\ncurrent reliance on statistical Monte-Carlo simulations strains CERN's\ncomputational grid, underscoring the urgency for more efficient alternatives.\nAddressing these challenges, recent proposals advocate for generative machine\nlearning methods. In this study, we present an innovative deep learning\nsimulation approach tailored for the proton Zero Degree Calorimeter in the\nALICE experiment. Leveraging a Generative Adversarial Network model with\nSelective Diversity Increase loss, we directly simulate calorimeter responses.\nTo enhance its capabilities in modeling a broad range of calorimeter response\nintensities, we expand the SDI-GAN architecture with additional regularization.\nMoreover, to improve the spatial fidelity of the generated data, we introduce\nan auxiliary regressor network. Our method offers a significant speedup when\ncomparing to the traditional Monte-Carlo based approaches.\n","authors":["Patryk Będkowski","Jan Dubiński","Kamil Deja","Przemysław Rokita"],"pdf_url":"https://arxiv.org/pdf/2406.03263v1.pdf","comment":"8 pages, 3 figures, PP-RAI 2024 conference"},{"id":"http://arxiv.org/abs/2406.03262v1","updated":"2024-06-05T13:40:07Z","published":"2024-06-05T13:40:07Z","title":"ADer: A Comprehensive Benchmark for Multi-class Visual Anomaly Detection","summary":"  Visual anomaly detection aims to identify anomalous regions in images through\nunsupervised learning paradigms, with increasing application demand and value\nin fields such as industrial inspection and medical lesion detection. Despite\nsignificant progress in recent years, there is a lack of comprehensive\nbenchmarks to adequately evaluate the performance of various mainstream methods\nacross different datasets under the practical multi-class setting. The absence\nof standardized experimental setups can lead to potential biases in training\nepochs, resolution, and metric results, resulting in erroneous conclusions.\nThis paper addresses this issue by proposing a comprehensive visual anomaly\ndetection benchmark, \\textbf{\\textit{ADer}}, which is a modular framework that\nis highly extensible for new methods. The benchmark includes multiple datasets\nfrom industrial and medical domains, implementing fifteen state-of-the-art\nmethods and nine comprehensive metrics. Additionally, we have open-sourced the\nGPU-assisted \\href{https://pypi.org/project/ADEval}{ADEval} package to address\nthe slow evaluation problem of metrics like time-consuming mAU-PRO on\nlarge-scale data, significantly reducing evaluation time by more than\n\\textit{1000-fold}. Through extensive experimental results, we objectively\nreveal the strengths and weaknesses of different methods and provide insights\ninto the challenges and future directions of multi-class visual anomaly\ndetection. We hope that \\textbf{\\textit{ADer}} will become a valuable resource\nfor researchers and practitioners in the field, promoting the development of\nmore robust and generalizable anomaly detection systems. Full codes have been\nattached in Appendix and open-sourced at\n\\url{https://github.com/zhangzjn/ader}.\n","authors":["Jiangning Zhang","Haoyang He","Zhenye Gan","Qingdong He","Yuxuan Cai","Zhucun Xue","Yabiao Wang","Chengjie Wang","Lei Xie","Yong Liu"],"pdf_url":"https://arxiv.org/pdf/2406.03262v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.03250v1","updated":"2024-06-05T13:26:30Z","published":"2024-06-05T13:26:30Z","title":"Prompt-based Visual Alignment for Zero-shot Policy Transfer","summary":"  Overfitting in RL has become one of the main obstacles to applications in\nreinforcement learning(RL). Existing methods do not provide explicit semantic\nconstrain for the feature extractor, hindering the agent from learning a\nunified cross-domain representation and resulting in performance degradation on\nunseen domains. Besides, abundant data from multiple domains are needed. To\naddress these issues, in this work, we propose prompt-based visual alignment\n(PVA), a robust framework to mitigate the detrimental domain bias in the image\nfor zero-shot policy transfer. Inspired that Visual-Language Model (VLM) can\nserve as a bridge to connect both text space and image space, we leverage the\nsemantic information contained in a text sequence as an explicit constraint to\ntrain a visual aligner. Thus, the visual aligner can map images from multiple\ndomains to a unified domain and achieve good generalization performance. To\nbetter depict semantic information, prompt tuning is applied to learn a\nsequence of learnable tokens. With explicit constraints of semantic\ninformation, PVA can learn unified cross-domain representation under limited\naccess to cross-domain data and achieves great zero-shot generalization ability\nin unseen domains. We verify PVA on a vision-based autonomous driving task with\nCARLA simulator. Experiments show that the agent generalizes well on unseen\ndomains under limited access to multi-domain data.\n","authors":["Haihan Gao","Rui Zhang","Qi Yi","Hantao Yao","Haochen Li","Jiaming Guo","Shaohui Peng","Yunkai Gao","QiCheng Wang","Xing Hu","Yuanbo Wen","Zihao Zhang","Zidong Du","Ling Li","Qi Guo","Yunji Chen"],"pdf_url":"https://arxiv.org/pdf/2406.03250v1.pdf","comment":"This paper has been accepted by ICML2024"},{"id":"http://arxiv.org/abs/2406.02422v2","updated":"2024-06-05T13:17:23Z","published":"2024-06-04T15:39:49Z","title":"IterMask2: Iterative Unsupervised Anomaly Segmentation via Spatial and\n  Frequency Masking for Brain Lesions in MRI","summary":"  Unsupervised anomaly segmentation approaches to pathology segmentation train\na model on images of healthy subjects, that they define as the 'normal' data\ndistribution. At inference, they aim to segment any pathologies in new images\nas 'anomalies', as they exhibit patterns that deviate from those in 'normal'\ntraining data. Prevailing methods follow the 'corrupt-and-reconstruct'\nparadigm. They intentionally corrupt an input image, reconstruct it to follow\nthe learned 'normal' distribution, and subsequently segment anomalies based on\nreconstruction error. Corrupting an input image, however, inevitably leads to\nsuboptimal reconstruction even of normal regions, causing false positives. To\nalleviate this, we propose a novel iterative spatial mask-refining strategy\nIterMask2. We iteratively mask areas of the image, reconstruct them, and update\nthe mask based on reconstruction error. This iterative process progressively\nadds information about areas that are confidently normal as per the model. The\nincreasing content guides reconstruction of nearby masked areas, improving\nreconstruction of normal tissue under these areas, reducing false positives. We\nalso use high-frequency image content as an auxiliary input to provide\nadditional structural information for masked areas. This further improves\nreconstruction error of normal in comparison to anomalous areas, facilitating\nsegmentation of the latter. We conduct experiments on several brain lesion\ndatasets and demonstrate effectiveness of our method. Code is available at:\nhttps://github.com/ZiyunLiang/IterMask2\n","authors":["Ziyun Liang","Xiaoqing Guo","J. Alison Noble","Konstantinos Kamnitsas"],"pdf_url":"https://arxiv.org/pdf/2406.02422v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.02678v3","updated":"2024-06-05T13:12:17Z","published":"2024-05-04T14:43:31Z","title":"Position: Quo Vadis, Unsupervised Time Series Anomaly Detection?","summary":"  The current state of machine learning scholarship in Timeseries Anomaly\nDetection (TAD) is plagued by the persistent use of flawed evaluation metrics,\ninconsistent benchmarking practices, and a lack of proper justification for the\nchoices made in novel deep learning-based model designs. Our paper presents a\ncritical analysis of the status quo in TAD, revealing the misleading track of\ncurrent research and highlighting problematic methods, and evaluation\npractices. Our position advocates for a shift in focus from solely pursuing\nnovel model designs to improving benchmarking practices, creating non-trivial\ndatasets, and critically evaluating the utility of complex methods against\nsimpler baselines. Our findings demonstrate the need for rigorous evaluation\nprotocols, the creation of simple baselines, and the revelation that\nstate-of-the-art deep anomaly detection models effectively learn linear\nmappings. These findings suggest the need for more exploration and development\nof simple and interpretable TAD methods. The increment of model complexity in\nthe state-of-the-art deep-learning based models unfortunately offers very\nlittle improvement. We offer insights and suggestions for the field to move\nforward.\n  Code: https://github.com/ssarfraz/QuoVadisTAD\n","authors":["M. Saquib Sarfraz","Mei-Yen Chen","Lukas Layer","Kunyu Peng","Marios Koulakis"],"pdf_url":"https://arxiv.org/pdf/2405.02678v3.pdf","comment":"ICML 2024"},{"id":"http://arxiv.org/abs/2406.03233v1","updated":"2024-06-05T13:11:53Z","published":"2024-06-05T13:11:53Z","title":"Generative Diffusion Models for Fast Simulations of Particle Collisions\n  at CERN","summary":"  In High Energy Physics simulations play a crucial role in unraveling the\ncomplexities of particle collision experiments within CERN's Large Hadron\nCollider. Machine learning simulation methods have garnered attention as\npromising alternatives to traditional approaches. While existing methods mainly\nemploy Variational Autoencoders (VAEs) or Generative Adversarial Networks\n(GANs), recent advancements highlight the efficacy of diffusion models as\nstate-of-the-art generative machine learning methods. We present the first\nsimulation for Zero Degree Calorimeter (ZDC) at the ALICE experiment based on\ndiffusion models, achieving the highest fidelity compared to existing\nbaselines. We perform an analysis of trade-offs between generation times and\nthe simulation quality. The results indicate a significant potential of latent\ndiffusion model due to its rapid generation time.\n","authors":["Mikołaj Kita","Jan Dubiński","Przemysław Rokita","Kamil Deja"],"pdf_url":"https://arxiv.org/pdf/2406.03233v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.03229v1","updated":"2024-06-05T13:06:17Z","published":"2024-06-05T13:06:17Z","title":"Global Clipper: Enhancing Safety and Reliability of Transformer-based\n  Object Detection Models","summary":"  As transformer-based object detection models progress, their impact in\ncritical sectors like autonomous vehicles and aviation is expected to grow.\nSoft errors causing bit flips during inference have significantly impacted DNN\nperformance, altering predictions. Traditional range restriction solutions for\nCNNs fall short for transformers. This study introduces the Global Clipper and\nGlobal Hybrid Clipper, effective mitigation strategies specifically designed\nfor transformer-based models. It significantly enhances their resilience to\nsoft errors and reduces faulty inferences to ~ 0\\%. We also detail extensive\ntesting across over 64 scenarios involving two transformer models (DINO-DETR\nand Lite-DETR) and two CNN models (YOLOv3 and SSD) using three datasets,\ntotalling approximately 3.3 million inferences, to assess model robustness\ncomprehensively. Moreover, the paper explores unique aspects of attention\nblocks in transformers and their operational differences from CNNs.\n","authors":["Qutub Syed Sha","Michael Paulitsch","Karthik Pattabiraman","Korbinian Hagn","Fabian Oboril","Cornelius Buerkle","Kay-Ulrich Scholl","Gereon Hinz","Alois Knoll"],"pdf_url":"https://arxiv.org/pdf/2406.03229v1.pdf","comment":"Accepted at IJCAI-AISafety'24 Workshop"},{"id":"http://arxiv.org/abs/2406.03225v1","updated":"2024-06-05T13:03:06Z","published":"2024-06-05T13:03:06Z","title":"Interactive Image Selection and Training for Brain Tumor Segmentation\n  Network","summary":"  Medical image segmentation is a relevant problem, with deep learning being an\nexponent. However, the necessity of a high volume of fully annotated images for\ntraining massive models can be a problem, especially for applications whose\nimages present a great diversity, such as brain tumors, which can occur in\ndifferent sizes and shapes. In contrast, a recent methodology, Feature Learning\nfrom Image Markers (FLIM), has involved an expert in the learning loop,\nproducing small networks that require few images to train the convolutional\nlayers. In this work, We employ an interactive method for image selection and\ntraining based on FLIM, exploring the user's knowledge. The results\ndemonstrated that with our methodology, we could choose a small set of images\nto train the encoder of a U-shaped network, obtaining performance equal to\nmanual selection and even surpassing the same U-shaped network trained with\nbackpropagation and all training images.\n","authors":["Matheus A. Cerqueira","Flávia Sprenger","Bernardo C. A. Teixeira","Alexandre X. Falcão"],"pdf_url":"https://arxiv.org/pdf/2406.03225v1.pdf","comment":"5 pages, 4 figures, and 3 tables"},{"id":"http://arxiv.org/abs/2406.03215v1","updated":"2024-06-05T12:53:28Z","published":"2024-06-05T12:53:28Z","title":"Searching Priors Makes Text-to-Video Synthesis Better","summary":"  Significant advancements in video diffusion models have brought substantial\nprogress to the field of text-to-video (T2V) synthesis. However, existing T2V\nsynthesis model struggle to accurately generate complex motion dynamics,\nleading to a reduction in video realism. One possible solution is to collect\nmassive data and train the model on it, but this would be extremely expensive.\nTo alleviate this problem, in this paper, we reformulate the typical T2V\ngeneration process as a search-based generation pipeline. Instead of scaling up\nthe model training, we employ existing videos as the motion prior database.\nSpecifically, we divide T2V generation process into two steps: (i) For a given\nprompt input, we search existing text-video datasets to find videos with text\nlabels that closely match the prompt motions. We propose a tailored search\nalgorithm that emphasizes object motion features. (ii) Retrieved videos are\nprocessed and distilled into motion priors to fine-tune a pre-trained base T2V\nmodel, followed by generating desired videos using input prompt. By utilizing\nthe priors gleaned from the searched videos, we enhance the realism of the\ngenerated videos' motion. All operations can be finished on a single NVIDIA RTX\n4090 GPU. We validate our method against state-of-the-art T2V models across\ndiverse prompt inputs. The code will be public.\n","authors":["Haoran Cheng","Liang Peng","Linxuan Xia","Yuepeng Hu","Hengjia Li","Qinglin Lu","Xiaofei He","Boxi Wu"],"pdf_url":"https://arxiv.org/pdf/2406.03215v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.07865v3","updated":"2024-06-05T12:46:15Z","published":"2024-05-13T15:53:18Z","title":"AnoVox: A Benchmark for Multimodal Anomaly Detection in Autonomous\n  Driving","summary":"  The scale-up of autonomous vehicles depends heavily on their ability to deal\nwith anomalies, such as rare objects on the road. In order to handle such\nsituations, it is necessary to detect anomalies in the first place. Anomaly\ndetection for autonomous driving has made great progress in the past years but\nsuffers from poorly designed benchmarks with a strong focus on camera data. In\nthis work, we propose AnoVox, the largest benchmark for ANOmaly detection in\nautonomous driving to date. AnoVox incorporates large-scale multimodal sensor\ndata and spatial VOXel ground truth, allowing for the comparison of methods\nindependent of their used sensor. We propose a formal definition of normality\nand provide a compliant training dataset. AnoVox is the first benchmark to\ncontain both content and temporal anomalies.\n","authors":["Daniel Bogdoll","Iramm Hamdard","Lukas Namgyu Rößler","Felix Geisler","Muhammed Bayram","Felix Wang","Jan Imhof","Miguel de Campos","Anushervon Tabarov","Yitian Yang","Hanno Gottschalk","J. Marius Zöllner"],"pdf_url":"https://arxiv.org/pdf/2405.07865v3.pdf","comment":"Daniel Bogdoll, Iramm Hamdard, and Lukas Namgyu R\\\"o{\\ss}ler\n  contributed equally"},{"id":"http://arxiv.org/abs/2406.03207v1","updated":"2024-06-05T12:44:49Z","published":"2024-06-05T12:44:49Z","title":"Identification of Stone Deterioration Patterns with Large Multimodal\n  Models","summary":"  The conservation of stone-based cultural heritage sites is a critical concern\nfor preserving cultural and historical landmarks. With the advent of Large\nMultimodal Models, as GPT-4omni (OpenAI), Claude 3 Opus (Anthropic) and Gemini\n1.5 Pro (Google), it is becoming increasingly important to define the\noperational capabilities of these models. In this work, we systematically\nevaluate the abilities of the main foundational multimodal models to recognise\nand classify anomalies and deterioration patterns of the stone elements that\nare useful in the practice of conservation and restoration of world heritage.\nAfter defining a taxonomy of the main stone deterioration patterns and\nanomalies, we asked the foundational models to identify a curated selection of\n354 highly representative images of stone-built heritage, offering them a\ncareful selection of labels to choose from. The result, which varies depending\non the type of pattern, allowed us to identify the strengths and weaknesses of\nthese models in the field of heritage conservation and restoration.\n","authors":["Daniele Corradetti","Jose Delgado Rodrigues"],"pdf_url":"https://arxiv.org/pdf/2406.03207v1.pdf","comment":"10 pages, 5 figures, submitted to Journal of Cultural Heritage"},{"id":"http://arxiv.org/abs/2311.16835v5","updated":"2024-06-05T12:43:31Z","published":"2023-11-28T14:51:08Z","title":"Unified-modal Salient Object Detection via Adaptive Prompt Learning","summary":"  Existing single-modal and multi-modal salient object detection (SOD) methods\nfocus on designing specific architectures tailored for their respective tasks.\nHowever, developing completely different models for different tasks leads to\nlabor and time consumption, as well as high computational and practical\ndeployment costs. In this paper, we attempt to address both single-modal and\nmulti-modal SOD in a unified framework called UniSOD, which fully exploits the\noverlapping prior knowledge between different tasks. Nevertheless, assigning\nappropriate strategies to modality variable inputs is challenging. To this end,\nUniSOD learns modality-aware prompts with task-specific hints through adaptive\nprompt learning, which are plugged into the proposed pre-trained baseline SOD\nmodel to handle corresponding tasks, while only requiring few learnable\nparameters compared to training the entire model. Each modality-aware prompt is\ngenerated from a switchable prompt generation block, which adaptively performs\nstructural switching based on single-modal and multi-modal inputs without human\nintervention. Through end-to-end joint training, UniSOD achieves overall\nperformance improvement on 14 benchmark datasets for RGB, RGB-D, and RGB-T SOD,\nwhich demonstrates that our method effectively and efficiently unifies\nsingle-modal and multi-modal SOD tasks.The code and results are available at\nhttps://github.com/Angknpng/UniSOD.\n","authors":["Kunpeng Wang","Chenglong Li","Zhengzheng Tu","Zhengyi Liu","Bin Luo"],"pdf_url":"https://arxiv.org/pdf/2311.16835v5.pdf","comment":"13 pages, 11 figures"},{"id":"http://arxiv.org/abs/2405.19957v3","updated":"2024-06-05T12:35:09Z","published":"2024-05-30T11:23:01Z","title":"PLA4D: Pixel-Level Alignments for Text-to-4D Gaussian Splatting","summary":"  As text-conditioned diffusion models (DMs) achieve breakthroughs in image,\nvideo, and 3D generation, the research community's focus has shifted to the\nmore challenging task of text-to-4D synthesis, which introduces a temporal\ndimension to generate dynamic 3D objects. In this context, we identify Score\nDistillation Sampling (SDS), a widely used technique for text-to-3D synthesis,\nas a significant hindrance to text-to-4D performance due to its Janus-faced and\ntexture-unrealistic problems coupled with high computational costs. In this\npaper, we propose \\textbf{P}ixel-\\textbf{L}evel \\textbf{A}lignments for\nText-to-\\textbf{4D} Gaussian Splatting (\\textbf{PLA4D}), a novel method that\nutilizes text-to-video frames as explicit pixel alignment targets to generate\nstatic 3D objects and inject motion into them. Specifically, we introduce Focal\nAlignment to calibrate camera poses for rendering and GS-Mesh Contrastive\nLearning to distill geometry priors from rendered image contrasts at the pixel\nlevel. Additionally, we develop Motion Alignment using a deformation network to\ndrive changes in Gaussians and implement Reference Refinement for smooth 4D\nobject surfaces. These techniques enable 4D Gaussian Splatting to align\ngeometry, texture, and motion with generated videos at the pixel level.\nCompared to previous methods, PLA4D produces synthesized outputs with better\ntexture details in less time and effectively mitigates the Janus-faced problem.\nPLA4D is fully implemented using open-source models, offering an accessible,\nuser-friendly, and promising direction for 4D digital content creation. Our\nproject page: https://miaoqiaowei.github.io/PLA4D/.\n","authors":["Qiaowei Miao","Yawei Luo","Yi Yang"],"pdf_url":"https://arxiv.org/pdf/2405.19957v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.14520v3","updated":"2024-06-05T12:34:58Z","published":"2024-03-21T16:17:57Z","title":"Cobra: Extending Mamba to Multi-Modal Large Language Model for Efficient\n  Inference","summary":"  In recent years, the application of multimodal large language models (MLLM)\nin various fields has achieved remarkable success. However, as the foundation\nmodel for many downstream tasks, current MLLMs are composed of the well-known\nTransformer network, which has a less efficient quadratic computation\ncomplexity. To improve the efficiency of such basic models, we propose Cobra, a\nlinear computational complexity MLLM. Specifically, Cobra integrates the\nefficient Mamba language model into the visual modality. Moreover, we explore\nand study various modal fusion schemes to create an effective multi-modal\nMamba. Extensive experiments demonstrate that (1) Cobra achieves extremely\ncompetitive performance with current computationally efficient state-of-the-art\nmethods, e.g., LLaVA-Phi, TinyLLaVA, and MobileVLM v2, and has faster speed due\nto Cobra's linear sequential modeling. (2) Interestingly, the results of\nclosed-set challenging prediction benchmarks show that Cobra performs well in\novercoming visual illusions and spatial relationship judgments. (3) Notably,\nCobra even achieves comparable performance to LLaVA with about 43% of the\nnumber of parameters. We will make all codes of Cobra open-source and hope that\nthe proposed method can facilitate future research on complexity problems in\nMLLM. Our project page is available at: https://sites.google.com/view/cobravlm.\n","authors":["Han Zhao","Min Zhang","Wei Zhao","Pengxiang Ding","Siteng Huang","Donglin Wang"],"pdf_url":"https://arxiv.org/pdf/2403.14520v3.pdf","comment":"Update ablation results"},{"id":"http://arxiv.org/abs/2403.09976v2","updated":"2024-06-05T12:25:53Z","published":"2024-03-15T02:46:19Z","title":"AD3: Implicit Action is the Key for World Models to Distinguish the\n  Diverse Visual Distractors","summary":"  Model-based methods have significantly contributed to distinguishing\ntask-irrelevant distractors for visual control. However, prior research has\nprimarily focused on heterogeneous distractors like noisy background videos,\nleaving homogeneous distractors that closely resemble controllable agents\nlargely unexplored, which poses significant challenges to existing methods. To\ntackle this problem, we propose Implicit Action Generator (IAG) to learn the\nimplicit actions of visual distractors, and present a new algorithm named\nimplicit Action-informed Diverse visual Distractors Distinguisher (AD3), that\nleverages the action inferred by IAG to train separated world models. Implicit\nactions effectively capture the behavior of background distractors, aiding in\ndistinguishing the task-irrelevant components, and the agent can optimize the\npolicy within the task-relevant state space. Our method achieves superior\nperformance on various visual control tasks featuring both heterogeneous and\nhomogeneous distractors. The indispensable role of implicit actions learned by\nIAG is also empirically validated.\n","authors":["Yucen Wang","Shenghua Wan","Le Gan","Shuai Feng","De-Chuan Zhan"],"pdf_url":"https://arxiv.org/pdf/2403.09976v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.03194v1","updated":"2024-06-05T12:23:17Z","published":"2024-06-05T12:23:17Z","title":"Writing Order Recovery in Complex and Long Static Handwriting","summary":"  The order in which the trajectory is executed is a powerful source of\ninformation for recognizers. However, there is still no general approach for\nrecovering the trajectory of complex and long handwriting from static images.\nComplex specimens can result in multiple pen-downs and in a high number of\ntrajectory crossings yielding agglomerations of pixels (also known as\nclusters). While the scientific literature describes a wide range of approaches\nfor recovering the writing order in handwriting, these approaches nevertheless\nlack a common evaluation metric. In this paper, we introduce a new system to\nestimate the order recovery of thinned static trajectories, which allows to\neffectively resolve the clusters and select the order of the executed\npen-downs. We evaluate how knowing the starting points of the pen-downs affects\nthe quality of the recovered writing. Once the stability and sensitivity of the\nsystem is analyzed, we describe a series of experiments with three publicly\navailable databases, showing competitive results in all cases. We expect the\nproposed system, whose code is made publicly available to the research\ncommunity, to reduce potential confusion when the order of complex trajectories\nare recovered, and this will in turn make the trajectories recovered to be\nviable for further applications, such as velocity estimation.\n","authors":["Moises Diaz","Gioele Crispo","Antonio Parziale","Angelo Marcelli","Miguel A. Ferrer"],"pdf_url":"https://arxiv.org/pdf/2406.03194v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.03188v1","updated":"2024-06-05T12:20:36Z","published":"2024-06-05T12:20:36Z","title":"Situation Monitor: Diversity-Driven Zero-Shot Out-of-Distribution\n  Detection using Budding Ensemble Architecture for Object Detection","summary":"  We introduce Situation Monitor, a novel zero-shot Out-of-Distribution (OOD)\ndetection approach for transformer-based object detection models to enhance\nreliability in safety-critical machine learning applications such as autonomous\ndriving. The Situation Monitor utilizes the Diversity-based Budding Ensemble\nArchitecture (DBEA) and increases the OOD performance by integrating a\ndiversity loss into the training process on top of the budding ensemble\narchitecture, detecting Far-OOD samples and minimizing false positives on\nNear-OOD samples. Moreover, utilizing the resulting DBEA increases the model's\nOOD performance and improves the calibration of confidence scores, particularly\nconcerning the intersection over union of the detected objects. The DBEA model\nachieves these advancements with a 14% reduction in trainable parameters\ncompared to the vanilla model. This signifies a substantial improvement in\nefficiency without compromising the model's ability to detect OOD instances and\ncalibrate the confidence scores accurately.\n","authors":["Qutub Syed","Michael Paulitsch","Korbinian Hagn","Neslihan Kose Cihangir","Kay-Ulrich Scholl","Fabian Oboril","Gereon Hinz","Alois Knoll"],"pdf_url":"https://arxiv.org/pdf/2406.03188v1.pdf","comment":"Paper accepted at CVPR SAIAD Workshop"},{"id":"http://arxiv.org/abs/2406.03184v1","updated":"2024-06-05T12:15:22Z","published":"2024-06-05T12:15:22Z","title":"Ouroboros3D: Image-to-3D Generation via 3D-aware Recursive Diffusion","summary":"  Existing single image-to-3D creation methods typically involve a two-stage\nprocess, first generating multi-view images, and then using these images for 3D\nreconstruction. However, training these two stages separately leads to\nsignificant data bias in the inference phase, thus affecting the quality of\nreconstructed results. We introduce a unified 3D generation framework, named\nOuroboros3D, which integrates diffusion-based multi-view image generation and\n3D reconstruction into a recursive diffusion process. In our framework, these\ntwo modules are jointly trained through a self-conditioning mechanism, allowing\nthem to adapt to each other's characteristics for robust inference. During the\nmulti-view denoising process, the multi-view diffusion model uses the 3D-aware\nmaps rendered by the reconstruction module at the previous timestep as\nadditional conditions. The recursive diffusion framework with 3D-aware feedback\nunites the entire process and improves geometric consistency.Experiments show\nthat our framework outperforms separation of these two stages and existing\nmethods that combine them at the inference phase. Project page:\nhttps://costwen.github.io/Ouroboros3D/\n","authors":["Hao Wen","Zehuan Huang","Yaohui Wang","Xinyuan Chen","Yu Qiao","Lu Sheng"],"pdf_url":"https://arxiv.org/pdf/2406.03184v1.pdf","comment":"See our project page at https://costwen.github.io/Ouroboros3D/"},{"id":"http://arxiv.org/abs/2406.03183v1","updated":"2024-06-05T12:13:25Z","published":"2024-06-05T12:13:25Z","title":"Geometric Localization of Homology Cycles","summary":"  Computing an optimal cycle in a given homology class, also referred to as the\nhomology localization problem, is known to be an NP-hard problem in general.\nFurthermore, there is currently no known optimality criterion that localizes\nclasses geometrically and admits a stability property under the setting of\npersistent homology. We present a geometric optimization of the cycles that is\ncomputable in polynomial time and is stable in an approximate sense. Tailoring\nour search criterion to different settings, we obtain various optimization\nproblems like optimal homologous cycle, minimum homology basis, and minimum\npersistent homology basis. In practice, the (trivial) exact algorithm is\ncomputationally expensive despite having a worst case polynomial runtime.\nTherefore, we design approximation algorithms for the above problems and study\ntheir performance experimentally. These algorithms have reasonable runtimes for\nmoderate sized datasets and the cycles computed by these algorithms are\nconsistently of high quality as demonstrated via experiments on multiple\ndatasets.\n","authors":["Amritendu Dhar","Vijay Natarajan","Abhishek Rathod"],"pdf_url":"https://arxiv.org/pdf/2406.03183v1.pdf","comment":"To Appear in CCCG 2024 : Proc. 36th Canadian Conference on\n  Computational Geometry"},{"id":"http://arxiv.org/abs/2406.03177v1","updated":"2024-06-05T12:08:01Z","published":"2024-06-05T12:08:01Z","title":"FAPNet: An Effective Frequency Adaptive Point-based Eye Tracker","summary":"  Eye tracking is crucial for human-computer interaction in different domains.\nConventional cameras encounter challenges such as power consumption and image\nquality during different eye movements, prompting the need for advanced\nsolutions with ultra-fast, low-power, and accurate eye trackers. Event cameras,\nfundamentally designed to capture information about moving objects, exhibit low\npower consumption and high temporal resolution. This positions them as an\nalternative to traditional cameras in the realm of eye tracking. Nevertheless,\nexisting event-based eye tracking networks neglect the pivotal sparse and\nfine-grained temporal information in events, resulting in unsatisfactory\nperformance. Moreover, the energy-efficient features are further compromised by\nthe use of excessively complex models, hindering efficient deployment on edge\ndevices. In this paper, we utilize Point Cloud as the event representation to\nharness the high temporal resolution and sparse characteristics of events in\neye tracking tasks. We rethink the point-based architecture PEPNet with\npreprocessing the long-term relationships between samples, leading to the\ninnovative design of FAPNet. A frequency adaptive mechanism is designed to\nrealize adaptive tracking according to the speed of the pupil movement and the\nInter Sample LSTM module is introduced to utilize the temporal correlation\nbetween samples. In the Event-based Eye Tracking Challenge, we utilize vanilla\nPEPNet, which is the former work to achieve the $p_{10}$ accuracy of 97.95\\%.\nOn the SEET synthetic dataset, FAPNet can achieve state-of-the-art while\nconsuming merely 10\\% of the PEPNet's computational resources. Notably, the\ncomputational demand of FAPNet is independent of the sensor's spatial\nresolution, enhancing its applicability on resource-limited edge devices.\n","authors":["Xiaopeng Lin","Hongwei Ren","Bojun Cheng"],"pdf_url":"https://arxiv.org/pdf/2406.03177v1.pdf","comment":"Accepted by CVPRW 2024 (AIS)"},{"id":"http://arxiv.org/abs/2406.03176v1","updated":"2024-06-05T12:07:58Z","published":"2024-06-05T12:07:58Z","title":"MMCL: Boosting Deformable DETR-Based Detectors with Multi-Class\n  Min-Margin Contrastive Learning for Superior Prohibited Item Detection","summary":"  Prohibited Item detection in X-ray images is one of the most effective\nsecurity inspection methods.However, differing from natural light images, the\nunique overlapping phenomena in X-ray images lead to the coupling of foreground\nand background features, thereby lowering the accuracy of general object\ndetectors.Therefore, we propose a Multi-Class Min-Margin Contrastive Learning\n(MMCL) method that, by clarifying the category semantic information of content\nqueries under the deformable DETR architecture, aids the model in extracting\nspecific category foreground information from coupled features.Specifically,\nafter grouping content queries by the number of categories, we employ the\nMulti-Class Inter-Class Exclusion (MIE) loss to push apart content queries from\ndifferent groups. Concurrently, the Intra-Class Min-Margin Clustering (IMC)\nloss is utilized to attract content queries within the same group, while\nensuring the preservation of necessary disparity. As training, the inherent\nHungarian matching of the model progressively strengthens the alignment between\neach group of queries and the semantic features of their corresponding category\nof objects. This evolving coherence ensures a deep-seated grasp of category\ncharacteristics, consequently bolstering the anti-overlapping detection\ncapabilities of models.MMCL is versatile and can be easily plugged into any\ndeformable DETR-based model with dozens of lines of code. Extensive experiments\non the PIXray and OPIXray datasets demonstrate that MMCL significantly enhances\nthe performance of various state-of-the-art models without increasing\ncomplexity. The code has been released at\nhttps://github.com/anonymity0403/MMCL.\n","authors":["Mingyuan Li","Tong Jia","Hui Lu","Bowen Ma","Hao Wang","Dongyue Chen"],"pdf_url":"https://arxiv.org/pdf/2406.03176v1.pdf","comment":"14 pages, 6 figures"},{"id":"http://arxiv.org/abs/2406.03175v1","updated":"2024-06-05T12:07:39Z","published":"2024-06-05T12:07:39Z","title":"Dynamic 3D Gaussian Fields for Urban Areas","summary":"  We present an efficient neural 3D scene representation for novel-view\nsynthesis (NVS) in large-scale, dynamic urban areas. Existing works are not\nwell suited for applications like mixed-reality or closed-loop simulation due\nto their limited visual quality and non-interactive rendering speeds. Recently,\nrasterization-based approaches have achieved high-quality NVS at impressive\nspeeds. However, these methods are limited to small-scale, homogeneous data,\ni.e. they cannot handle severe appearance and geometry variations due to\nweather, season, and lighting and do not scale to larger, dynamic areas with\nthousands of images. We propose 4DGF, a neural scene representation that scales\nto large-scale dynamic urban areas, handles heterogeneous input data, and\nsubstantially improves rendering speeds. We use 3D Gaussians as an efficient\ngeometry scaffold while relying on neural fields as a compact and flexible\nappearance model. We integrate scene dynamics via a scene graph at global scale\nwhile modeling articulated motions on a local level via deformations. This\ndecomposed approach enables flexible scene composition suitable for real-world\napplications. In experiments, we surpass the state-of-the-art by over 3 dB in\nPSNR and more than 200 times in rendering speed.\n","authors":["Tobias Fischer","Jonas Kulhanek","Samuel Rota Bulò","Lorenzo Porzi","Marc Pollefeys","Peter Kontschieder"],"pdf_url":"https://arxiv.org/pdf/2406.03175v1.pdf","comment":"Project page is available at https://tobiasfshr.github.io/pub/4dgf/"},{"id":"http://arxiv.org/abs/2406.03173v1","updated":"2024-06-05T12:06:04Z","published":"2024-06-05T12:06:04Z","title":"Multi-Task Multi-Scale Contrastive Knowledge Distillation for Efficient\n  Medical Image Segmentation","summary":"  This thesis aims to investigate the feasibility of knowledge transfer between\nneural networks for medical image segmentation tasks, specifically focusing on\nthe transfer from a larger multi-task \"Teacher\" network to a smaller \"Student\"\nnetwork. In the context of medical imaging, where the data volumes are often\nlimited, leveraging knowledge from a larger pre-trained network could be\nuseful. The primary objective is to enhance the performance of a smaller\nstudent model by incorporating knowledge representations acquired by a teacher\nmodel that adopts a multi-task pre-trained architecture trained on CT images,\nto a more resource-efficient student network, which can essentially be a\nsmaller version of the same, trained on a mere 50% of the data than that of the\nteacher model.\n  To facilitate knowledge transfer between the two models, we devised an\narchitecture incorporating multi-scale feature distillation and supervised\ncontrastive learning. Our study aims to improve the student model's performance\nby integrating knowledge representations from the teacher model. We investigate\nwhether this approach is particularly effective in scenarios with limited\ncomputational resources and limited training data availability. To assess the\nimpact of multi-scale feature distillation, we conducted extensive experiments.\nWe also conducted a detailed ablation study to determine whether it is\nessential to distil knowledge at various scales, including low-level features\nfrom encoder layers, for effective knowledge transfer. In addition, we examine\ndifferent losses in the knowledge distillation process to gain insights into\ntheir effects on overall performance.\n","authors":["Risab Biswas"],"pdf_url":"https://arxiv.org/pdf/2406.03173v1.pdf","comment":"Master's thesis"},{"id":"http://arxiv.org/abs/2302.03640v4","updated":"2024-06-05T12:02:12Z","published":"2023-02-07T17:47:52Z","title":"SSR-2D: Semantic 3D Scene Reconstruction from 2D Images","summary":"  Most deep learning approaches to comprehensive semantic modeling of 3D indoor\nspaces require costly dense annotations in the 3D domain. In this work, we\nexplore a central 3D scene modeling task, namely, semantic scene reconstruction\nwithout using any 3D annotations. The key idea of our approach is to design a\ntrainable model that employs both incomplete 3D reconstructions and their\ncorresponding source RGB-D images, fusing cross-domain features into volumetric\nembeddings to predict complete 3D geometry, color, and semantics with only 2D\nlabeling which can be either manual or machine-generated. Our key technical\ninnovation is to leverage differentiable rendering of color and semantics to\nbridge 2D observations and unknown 3D space, using the observed RGB images and\n2D semantics as supervision, respectively. We additionally develop a learning\npipeline and corresponding method to enable learning from imperfect predicted\n2D labels, which could be additionally acquired by synthesizing in an augmented\nset of virtual training views complementing the original real captures,\nenabling more efficient self-supervision loop for semantics. As a result, our\nend-to-end trainable solution jointly addresses geometry completion,\ncolorization, and semantic mapping from limited RGB-D images, without relying\non any 3D ground-truth information. Our method achieves the state-of-the-art\nperformance of semantic scene completion on two large-scale benchmark datasets\nMatterPort3D and ScanNet, surpasses baselines even with costly 3D annotations\nin predicting both geometry and semantics. To our knowledge, our method is also\nthe first 2D-driven method addressing completion and semantic segmentation of\nreal-world 3D scans simultaneously.\n","authors":["Junwen Huang","Alexey Artemov","Yujin Chen","Shuaifeng Zhi","Kai Xu","Matthias Nießner"],"pdf_url":"https://arxiv.org/pdf/2302.03640v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.16873v2","updated":"2024-06-05T11:59:37Z","published":"2024-05-27T06:43:12Z","title":"ContrastAlign: Toward Robust BEV Feature Alignment via Contrastive\n  Learning for Multi-Modal 3D Object Detection","summary":"  In the field of 3D object detection tasks, fusing heterogeneous features from\nLiDAR and camera sensors into a unified Bird's Eye View (BEV) representation is\na widely adopted paradigm. However, existing methods are often compromised by\nimprecise sensor calibration, resulting in feature misalignment in LiDAR-camera\nBEV fusion. Moreover, such inaccuracies result in errors in depth estimation\nfor the camera branch, ultimately causing misalignment between LiDAR and camera\nBEV features. In this work, we propose a novel ContrastAlign approach that\nutilizes contrastive learning to enhance the alignment of heterogeneous\nmodalities, thereby improving the robustness of the fusion process.\nSpecifically, our approach includes the L-Instance module, which directly\noutputs LiDAR instance features within LiDAR BEV features. Then, we introduce\nthe C-Instance module, which predicts camera instance features through RoI\n(Region of Interest) pooling on the camera BEV features. We propose the\nInstanceFusion module, which utilizes contrastive learning to generate similar\ninstance features across heterogeneous modalities. We then use graph matching\nto calculate the similarity between the neighboring camera instance features\nand the similarity instance features to complete the alignment of instance\nfeatures. Our method achieves state-of-the-art performance, with an mAP of\n70.3%, surpassing BEVFusion by 1.8% on the nuScenes validation set.\nImportantly, our method outperforms BEVFusion by 7.3% under conditions with\nmisalignment noise.\n","authors":["Ziying Song","Feiyang Jia","Hongyu Pan","Yadan Luo","Caiyan Jia","Guoxin Zhang","Lin Liu","Yang Ji","Lei Yang","Li Wang"],"pdf_url":"https://arxiv.org/pdf/2405.16873v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.03150v1","updated":"2024-06-05T11:15:43Z","published":"2024-06-05T11:15:43Z","title":"Sample-specific Masks for Visual Reprogramming-based Prompting","summary":"  Visual reprogramming (VR) is a prompting technique that aims to re-purpose a\npre-trained model (e.g., a classifier on ImageNet) to target tasks (e.g.,\nmedical data prediction) by learning a small-scale pattern added into input\nimages instead of tuning considerable parameters within the model. The location\nof the pattern within input samples is usually determined by a pre-defined mask\nshared across all samples. In this paper, we show that the shared mask\npotentially limits VR's generalization and increases its approximation error\ndue to the lack of sample-level adaptation. Motivated by this finding, we\ndesign a new framework for VR called sample-specific multi-channel masks (SMM).\nSpecifically, SMM employs a lightweight ConvNet and patch-wise interpolation to\ngenerate sample-specific three-channel masks instead of a shared and\npre-defined mask. Since we generate different masks for individual samples, SMM\nis theoretically shown to reduce approximation error for the target tasks\ncompared with existing state-of-the-art VR methods. We also empirically\ndemonstrate its performance gain on both ResNet and ViT. The success of SMM\nfurther highlights the broader applicability of VR in leveraging the latent\nknowledge of pre-trained models for various target tasks. Our code is available\nat https://github.com/tmlr-group/SMM.\n","authors":["Chengyi Cai","Zesheng Ye","Lei Feng","Jianzhong Qi","Feng Liu"],"pdf_url":"https://arxiv.org/pdf/2406.03150v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.03146v1","updated":"2024-06-05T11:01:42Z","published":"2024-06-05T11:01:42Z","title":"Tiny models from tiny data: Textual and null-text inversion for few-shot\n  distillation","summary":"  Few-shot image classification involves classifying images using very few\ntraining examples. Recent vision foundation models show excellent few-shot\ntransfer abilities, but are large and slow at inference. Using knowledge\ndistillation, the capabilities of high-performing but slow models can be\ntransferred to tiny, efficient models. However, common distillation methods\nrequire a large set of unlabeled data, which is not available in the few-shot\nsetting. To overcome this lack of data, there has been a recent interest in\nusing synthetic data.\n  We expand on this work by presenting a novel diffusion model inversion\ntechnique (TINT) combining the diversity of textual inversion with the\nspecificity of null-text inversion. Using this method in a few-shot\ndistillation pipeline leads to state-of-the-art accuracy among small student\nmodels on popular benchmarks, while being significantly faster than prior work.\nThis allows us to push even tiny models to high accuracy using only a tiny\napplication-specific dataset, albeit relying on extra data for pre-training.\n  Popular few-shot benchmarks involve evaluation over a large number of\nepisodes, which is computationally cumbersome for methods involving synthetic\ndata generation. Therefore, we also present a theoretical analysis on how the\nvariance of the accuracy estimator depends on the number of episodes and query\nexamples, and use these results to lower the computational effort required for\nmethod evaluation. In addition, to further motivate the use of generative\nmodels in few-shot distillation, we demonstrate that our method performs better\ncompared to training on real data mined from the dataset used to train the\ndiffusion model.\n  Source code will be made available at https://github.com/pixwse/tiny2.\n","authors":["Erik Landolsi","Fredrik Kahl"],"pdf_url":"https://arxiv.org/pdf/2406.03146v1.pdf","comment":"21 pages (9 main pages + references and appendix)"},{"id":"http://arxiv.org/abs/2406.03143v1","updated":"2024-06-05T10:58:15Z","published":"2024-06-05T10:58:15Z","title":"ZeroPur: Succinct Training-Free Adversarial Purification","summary":"  Adversarial purification is a kind of defense technique that can defend\nvarious unseen adversarial attacks without modifying the victim classifier.\nExisting methods often depend on external generative models or cooperation\nbetween auxiliary functions and victim classifiers. However, retraining\ngenerative models, auxiliary functions, or victim classifiers relies on the\ndomain of the fine-tuned dataset and is computation-consuming. In this work, we\nsuppose that adversarial images are outliers of the natural image manifold and\nthe purification process can be considered as returning them to this manifold.\nFollowing this assumption, we present a simple adversarial purification method\nwithout further training to purify adversarial images, called ZeroPur. ZeroPur\ncontains two steps: given an adversarial example, Guided Shift obtains the\nshifted embedding of the adversarial example by the guidance of its blurred\ncounterparts; after that, Adaptive Projection constructs a directional vector\nby this shifted embedding to provide momentum, projecting adversarial images\nonto the manifold adaptively. ZeroPur is independent of external models and\nrequires no retraining of victim classifiers or auxiliary functions, relying\nsolely on victim classifiers themselves to achieve purification. Extensive\nexperiments on three datasets (CIFAR-10, CIFAR-100, and ImageNet-1K) using\nvarious classifier architectures (ResNet, WideResNet) demonstrate that our\nmethod achieves state-of-the-art robust performance. The code will be publicly\navailable.\n","authors":["Xiuli Bi","Zonglin Yang","Bo Liu","Xiaodong Cun","Chi-Man Pun","Pietro Lio","Bin Xiao"],"pdf_url":"https://arxiv.org/pdf/2406.03143v1.pdf","comment":"16 pages, 5 figures, under review"},{"id":"http://arxiv.org/abs/2305.01644v2","updated":"2024-06-05T10:24:43Z","published":"2023-05-02T17:56:06Z","title":"Key-Locked Rank One Editing for Text-to-Image Personalization","summary":"  Text-to-image models (T2I) offer a new level of flexibility by allowing users\nto guide the creative process through natural language. However, personalizing\nthese models to align with user-provided visual concepts remains a challenging\nproblem. The task of T2I personalization poses multiple hard challenges, such\nas maintaining high visual fidelity while allowing creative control, combining\nmultiple personalized concepts in a single image, and keeping a small model\nsize. We present Perfusion, a T2I personalization method that addresses these\nchallenges using dynamic rank-1 updates to the underlying T2I model. Perfusion\navoids overfitting by introducing a new mechanism that \"locks\" new concepts'\ncross-attention Keys to their superordinate category. Additionally, we develop\na gated rank-1 approach that enables us to control the influence of a learned\nconcept during inference time and to combine multiple concepts. This allows\nruntime-efficient balancing of visual-fidelity and textual-alignment with a\nsingle 100KB trained model, which is five orders of magnitude smaller than the\ncurrent state of the art. Moreover, it can span different operating points\nacross the Pareto front without additional training. Finally, we show that\nPerfusion outperforms strong baselines in both qualitative and quantitative\nterms. Importantly, key-locking leads to novel results compared to traditional\napproaches, allowing to portray personalized object interactions in\nunprecedented ways, even in one-shot settings.\n","authors":["Yoad Tewel","Rinon Gal","Gal Chechik","Yuval Atzmon"],"pdf_url":"https://arxiv.org/pdf/2305.01644v2.pdf","comment":"Accepted to SIGGRAPH 2023. Project page is in\n  https://research.nvidia.com/labs/par/Perfusion/"},{"id":"http://arxiv.org/abs/2406.03129v1","updated":"2024-06-05T10:24:00Z","published":"2024-06-05T10:24:00Z","title":"Enhanced Automotive Object Detection via RGB-D Fusion in a DiffusionDet\n  Framework","summary":"  Vision-based autonomous driving requires reliable and efficient object\ndetection. This work proposes a DiffusionDet-based framework that exploits data\nfusion from the monocular camera and depth sensor to provide the RGB and depth\n(RGB-D) data. Within this framework, ground truth bounding boxes are randomly\nreshaped as part of the training phase, allowing the model to learn the reverse\ndiffusion process of noise addition. The system methodically enhances a\nrandomly generated set of boxes at the inference stage, guiding them toward\naccurate final detections. By integrating the textural and color features from\nRGB images with the spatial depth information from the LiDAR sensors, the\nproposed framework employs a feature fusion that substantially enhances object\ndetection of automotive targets. The $2.3$ AP gain in detecting automotive\ntargets is achieved through comprehensive experiments using the KITTI dataset.\nSpecifically, the improved performance of the proposed approach in detecting\nsmall objects is demonstrated.\n","authors":["Eliraz Orfaig","Inna Stainvas","Igal Bilik"],"pdf_url":"https://arxiv.org/pdf/2406.03129v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.06741v2","updated":"2024-06-05T10:15:24Z","published":"2024-03-11T14:07:53Z","title":"Distribution-Aware Data Expansion with Diffusion Models","summary":"  The scale and quality of a dataset significantly impact the performance of\ndeep models. However, acquiring large-scale annotated datasets is both a costly\nand time-consuming endeavor. To address this challenge, dataset expansion\ntechnologies aim to automatically augment datasets, unlocking the full\npotential of deep models. Current data expansion techniques include image\ntransformation and image synthesis methods. Transformation-based methods\nintroduce only local variations, leading to limited diversity. In contrast,\nsynthesis-based methods generate entirely new content, greatly enhancing\ninformativeness. However, existing synthesis methods carry the risk of\ndistribution deviations, potentially degrading model performance with\nout-of-distribution samples. In this paper, we propose DistDiff, a\ntraining-free data expansion framework based on the distribution-aware\ndiffusion model. DistDiff constructs hierarchical prototypes to approximate the\nreal data distribution, optimizing latent data points within diffusion models\nwith hierarchical energy guidance. We demonstrate its capability to generate\ndistribution-consistent samples, significantly improving data expansion tasks.\nDistDiff consistently enhances accuracy across a diverse range of datasets\ncompared to models trained solely on original data. Furthermore, our approach\nconsistently outperforms existing synthesis-based techniques and demonstrates\ncompatibility with widely adopted transformation-based augmentation methods.\nAdditionally, the expanded dataset exhibits robustness across various\narchitectural frameworks. Our code is available at\nhttps://github.com/haoweiz23/DistDiff\n","authors":["Haowei Zhu","Ling Yang","Jun-Hai Yong","Hongzhi Yin","Jiawei Jiang","Meng Xiao","Wentao Zhang","Bin Wang"],"pdf_url":"https://arxiv.org/pdf/2403.06741v2.pdf","comment":"Project: https://github.com/haoweiz23/DistDiff"},{"id":"http://arxiv.org/abs/2406.03117v1","updated":"2024-06-05T10:10:03Z","published":"2024-06-05T10:10:03Z","title":"VQUNet: Vector Quantization U-Net for Defending Adversarial Atacks by\n  Regularizing Unwanted Noise","summary":"  Deep Neural Networks (DNN) have become a promising paradigm when developing\nArtificial Intelligence (AI) and Machine Learning (ML) applications. However,\nDNN applications are vulnerable to fake data that are crafted with adversarial\nattack algorithms. Under adversarial attacks, the prediction accuracy of DNN\napplications suffers, making them unreliable. In order to defend against\nadversarial attacks, we introduce a novel noise-reduction procedure, Vector\nQuantization U-Net (VQUNet), to reduce adversarial noise and reconstruct data\nwith high fidelity. VQUNet features a discrete latent representation learning\nthrough a multi-scale hierarchical structure for both noise reduction and data\nreconstruction. The empirical experiments show that the proposed VQUNet\nprovides better robustness to the target DNN models, and it outperforms other\nstate-of-the-art noise-reduction-based defense methods under various\nadversarial attacks for both Fashion-MNIST and CIFAR10 datasets. When there is\nno adversarial attack, the defense method has less than 1% accuracy degradation\nfor both datasets.\n","authors":["Zhixun He","Mukesh Singhal"],"pdf_url":"https://arxiv.org/pdf/2406.03117v1.pdf","comment":"8 pages, 6 figures"},{"id":"http://arxiv.org/abs/2405.18721v2","updated":"2024-06-05T09:59:21Z","published":"2024-05-29T03:05:59Z","title":"Correctable Landmark Discovery via Large Models for Vision-Language\n  Navigation","summary":"  Vision-Language Navigation (VLN) requires the agent to follow language\ninstructions to reach a target position. A key factor for successful navigation\nis to align the landmarks implied in the instruction with diverse visual\nobservations. However, previous VLN agents fail to perform accurate modality\nalignment especially in unexplored scenes, since they learn from limited\nnavigation data and lack sufficient open-world alignment knowledge. In this\nwork, we propose a new VLN paradigm, called COrrectable LaNdmark DiScOvery via\nLarge ModEls (CONSOLE). In CONSOLE, we cast VLN as an open-world sequential\nlandmark discovery problem, by introducing a novel correctable landmark\ndiscovery scheme based on two large models ChatGPT and CLIP. Specifically, we\nuse ChatGPT to provide rich open-world landmark cooccurrence commonsense, and\nconduct CLIP-driven landmark discovery based on these commonsense priors. To\nmitigate the noise in the priors due to the lack of visual constraints, we\nintroduce a learnable cooccurrence scoring module, which corrects the\nimportance of each cooccurrence according to actual observations for accurate\nlandmark discovery. We further design an observation enhancement strategy for\nan elegant combination of our framework with different VLN agents, where we\nutilize the corrected landmark features to obtain enhanced observation features\nfor action decision. Extensive experimental results on multiple popular VLN\nbenchmarks (R2R, REVERIE, R4R, RxR) show the significant superiority of CONSOLE\nover strong baselines. Especially, our CONSOLE establishes the new\nstate-of-the-art results on R2R and R4R in unseen scenarios. Code is available\nat https://github.com/expectorlin/CONSOLE.\n","authors":["Bingqian Lin","Yunshuang Nie","Ziming Wei","Yi Zhu","Hang Xu","Shikui Ma","Jianzhuang Liu","Xiaodan Liang"],"pdf_url":"https://arxiv.org/pdf/2405.18721v2.pdf","comment":"Accepted by TPAMI 2024"},{"id":"http://arxiv.org/abs/2406.03105v1","updated":"2024-06-05T09:48:56Z","published":"2024-06-05T09:48:56Z","title":"Enhancing 3D Lane Detection and Topology Reasoning with 2D Lane Priors","summary":"  3D lane detection and topology reasoning are essential tasks in autonomous\ndriving scenarios, requiring not only detecting the accurate 3D coordinates on\nlane lines, but also reasoning the relationship between lanes and traffic\nelements. Current vision-based methods, whether explicitly constructing BEV\nfeatures or not, all establish the lane anchors/queries in 3D space while\nignoring the 2D lane priors. In this study, we propose Topo2D, a novel\nframework based on Transformer, leveraging 2D lane instances to initialize 3D\nqueries and 3D positional embeddings. Furthermore, we explicitly incorporate 2D\nlane features into the recognition of topology relationships among lane\ncenterlines and between lane centerlines and traffic elements. Topo2D achieves\n44.5% OLS on multi-view topology reasoning benchmark OpenLane-V2 and 62.6%\nF-Socre on single-view 3D lane detection benchmark OpenLane, exceeding the\nperformance of existing state-of-the-art methods.\n","authors":["Han Li","Zehao Huang","Zitian Wang","Wenge Rong","Naiyan Wang","Si Liu"],"pdf_url":"https://arxiv.org/pdf/2406.03105v1.pdf","comment":"20 pages, 9 figures, 6 tables"},{"id":"http://arxiv.org/abs/2306.03430v4","updated":"2024-06-05T09:46:22Z","published":"2023-06-06T06:09:11Z","title":"Revisiting the Trade-off between Accuracy and Robustness via Weight\n  Distribution of Filters","summary":"  Adversarial attacks have been proven to be potential threats to Deep Neural\nNetworks (DNNs), and many methods are proposed to defend against adversarial\nattacks. However, while enhancing the robustness, the clean accuracy will\ndecline to a certain extent, implying a trade-off existed between the accuracy\nand robustness. In this paper, to meet the trade-off problem, we theoretically\nexplore the underlying reason for the difference of the filters' weight\ndistribution between standard-trained and robust-trained models and then argue\nthat this is an intrinsic property for static neural networks, thus they are\ndifficult to fundamentally improve the accuracy and adversarial robustness at\nthe same time. Based on this analysis, we propose a sample-wise dynamic network\narchitecture named Adversarial Weight-Varied Network (AW-Net), which focuses on\ndealing with clean and adversarial examples with a \"divide and rule\" weight\nstrategy. The AW-Net adaptively adjusts the network's weights based on\nregulation signals generated by an adversarial router, which is directly\ninfluenced by the input sample. Benefiting from the dynamic network\narchitecture, clean and adversarial examples can be processed with different\nnetwork weights, which provides the potential to enhance both accuracy and\nadversarial robustness. A series of experiments demonstrate that our AW-Net is\narchitecture-friendly to handle both clean and adversarial examples and can\nachieve better trade-off performance than state-of-the-art robust models.\n","authors":["Xingxing Wei","Shiji Zhao","Bo li"],"pdf_url":"https://arxiv.org/pdf/2306.03430v4.pdf","comment":"Accepted by TPAMI2024"},{"id":"http://arxiv.org/abs/2406.03103v1","updated":"2024-06-05T09:45:56Z","published":"2024-06-05T09:45:56Z","title":"EpidermaQuant: Unsupervised detection and quantification of epidermal\n  differentiation markers on H-DAB-stained images of reconstructed human\n  epidermis","summary":"  The integrity of the reconstructed human epidermis generated in vitro could\nbe assessed using histological analyses combined with immunohistochemical\nstaining of keratinocyte differentiation markers. Computer-based analysis of\nscanned tissue saves the expert time and may improve the accuracy of\nquantification by eliminating interrater reliability issues. However, technical\ndifferences during the preparation and capture of stained images and the\npresence of multiple artifacts may influence the outcome of computational\nmethods. Using a dataset with 598 unannotated images showing cross-sections of\nin vitro reconstructed human epidermis stained with DAB-based\nimmunohistochemistry reaction to visualize 4 different keratinocyte\ndifferentiation marker proteins (filaggrin, keratin 10, Ki67, HSPA2) and\ncounterstained with hematoxylin, we developed an unsupervised method for the\ndetection and quantification of immunohistochemical staining. The proposed\npipeline includes the following steps: (i) color normalization to reduce the\nvariability of pixel intensity values in different samples; (ii) color\ndeconvolution to acquire color channels of the stains used; (iii) morphological\noperations to find the background area of the image; (iv) automatic image\nrotation; and (v) finding markers of human epidermal differentiation with\nclustering. Also, we created a method to exclude images without DAB-stained\nareas. The most effective combination of methods includes: (i) Reinhard's\nnormalization; (ii) Ruifrok and Johnston color deconvolution method; (iii)\nproposed image rotation method based on boundary distribution of image\nintensity; (iv) k-means clustering using DAB stain intensity. These results\nshould enhance the performance of quantitative analysis of protein markers in\nreconstructed human epidermis samples and enable comparison of their spatial\ndistribution between different experimental conditions.\n","authors":["Dawid Zamojski","Agnieszka Gogler","Dorota Scieglinska","Michal Marczyk"],"pdf_url":"https://arxiv.org/pdf/2406.03103v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.16182v2","updated":"2024-06-05T09:44:52Z","published":"2024-03-24T15:00:44Z","title":"EgoExoLearn: A Dataset for Bridging Asynchronous Ego- and Exo-centric\n  View of Procedural Activities in Real World","summary":"  Being able to map the activities of others into one's own point of view is\none fundamental human skill even from a very early age. Taking a step toward\nunderstanding this human ability, we introduce EgoExoLearn, a large-scale\ndataset that emulates the human demonstration following process, in which\nindividuals record egocentric videos as they execute tasks guided by\ndemonstration videos. Focusing on the potential applications in daily\nassistance and professional support, EgoExoLearn contains egocentric and\ndemonstration video data spanning 120 hours captured in daily life scenarios\nand specialized laboratories. Along with the videos we record high-quality gaze\ndata and provide detailed multimodal annotations, formulating a playground for\nmodeling the human ability to bridge asynchronous procedural actions from\ndifferent viewpoints. To this end, we present benchmarks such as cross-view\nassociation, cross-view action planning, and cross-view referenced skill\nassessment, along with detailed analysis. We expect EgoExoLearn can serve as an\nimportant resource for bridging the actions across views, thus paving the way\nfor creating AI agents capable of seamlessly learning by observing humans in\nthe real world. Code and data can be found at:\nhttps://github.com/OpenGVLab/EgoExoLearn\n","authors":["Yifei Huang","Guo Chen","Jilan Xu","Mingfang Zhang","Lijin Yang","Baoqi Pei","Hongjie Zhang","Lu Dong","Yali Wang","Limin Wang","Yu Qiao"],"pdf_url":"https://arxiv.org/pdf/2403.16182v2.pdf","comment":"CVPR 2024"},{"id":"http://arxiv.org/abs/2406.03095v1","updated":"2024-06-05T09:36:15Z","published":"2024-06-05T09:36:15Z","title":"EgoSurgery-Tool: A Dataset of Surgical Tool and Hand Detection from\n  Egocentric Open Surgery Videos","summary":"  Surgical tool detection is a fundamental task for understanding egocentric\nopen surgery videos. However, detecting surgical tools presents significant\nchallenges due to their highly imbalanced class distribution, similar shapes\nand similar textures, and heavy occlusion. The lack of a comprehensive\nlarge-scale dataset compounds these challenges. In this paper, we introduce\nEgoSurgery-Tool, an extension of the existing EgoSurgery-Phase dataset, which\ncontains real open surgery videos captured using an egocentric camera attached\nto the surgeon's head, along with phase annotations. EgoSurgery-Tool has been\ndensely annotated with surgical tools and comprises over 49K surgical tool\nbounding boxes across 15 categories, constituting a large-scale surgical tool\ndetection dataset. EgoSurgery-Tool also provides annotations for hand detection\nwith over 46K hand-bounding boxes, capturing hand-object interactions that are\ncrucial for understanding activities in egocentric open surgery.\nEgoSurgery-Tool is superior to existing datasets due to its larger scale,\ngreater variety of surgical tools, more annotations, and denser scenes. We\nconduct a comprehensive analysis of EgoSurgery-Tool using nine popular object\ndetectors to assess their effectiveness in both surgical tool and hand\ndetection. The dataset will be released at\nhttps://github.com/Fujiry0/EgoSurgery.\n","authors":["Ryo Fujii","Hideo Saito","Hiroyuki Kajita"],"pdf_url":"https://arxiv.org/pdf/2406.03095v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.07857v3","updated":"2024-06-05T09:32:36Z","published":"2024-05-13T15:42:46Z","title":"Synergistic Integration of Coordinate Network and Tensorial Feature for\n  Improving Neural Radiance Fields from Sparse Inputs","summary":"  The multi-plane representation has been highlighted for its fast training and\ninference across static and dynamic neural radiance fields. This approach\nconstructs relevant features via projection onto learnable grids and\ninterpolating adjacent vertices. However, it has limitations in capturing\nlow-frequency details and tends to overuse parameters for low-frequency\nfeatures due to its bias toward fine details, despite its multi-resolution\nconcept. This phenomenon leads to instability and inefficiency when training\nposes are sparse. In this work, we propose a method that synergistically\nintegrates multi-plane representation with a coordinate-based MLP network known\nfor strong bias toward low-frequency signals. The coordinate-based network is\nresponsible for capturing low-frequency details, while the multi-plane\nrepresentation focuses on capturing fine-grained details. We demonstrate that\nusing residual connections between them seamlessly preserves their own inherent\nproperties. Additionally, the proposed progressive training scheme accelerates\nthe disentanglement of these two features. We demonstrate empirically that our\nproposed method not only outperforms baseline models for both static and\ndynamic NeRFs with sparse inputs, but also achieves comparable results with\nfewer parameters.\n","authors":["Mingyu Kim","Jun-Seong Kim","Se-Young Yun","Jin-Hwa Kim"],"pdf_url":"https://arxiv.org/pdf/2405.07857v3.pdf","comment":"ICML2024 ; Project page is accessible at\n  https://mingyukim87.github.io/SynergyNeRF ; Code is available at\n  https://github.com/MingyuKim87/SynergyNeRF"},{"id":"http://arxiv.org/abs/2406.03087v1","updated":"2024-06-05T09:24:10Z","published":"2024-06-05T09:24:10Z","title":"Lossless Image Compression Using Multi-level Dictionaries: Binary Images","summary":"  Lossless image compression is required in various applications to reduce\nstorage or transmission costs of images, while requiring the reconstructed\nimages to have zero information loss compared to the original. Existing\nlossless image compression methods either have simple design but poor\ncompression performance, or complex design, better performance, but with no\nperformance guarantees. In our endeavor to develop a lossless image compression\nmethod with low complexity and guaranteed performance, we argue that\ncompressibility of a color image is essentially derived from the patterns in\nits spatial structure, intensity variations, and color variations. Thus, we\ndivide the overall design of a lossless image compression scheme into three\nparts that exploit corresponding redundancies. We further argue that the\nbinarized version of an image captures its fundamental spatial structure and in\nthis work, we propose a scheme for lossless compression of binary images.\n  The proposed scheme first learns dictionaries of $16\\times16$, $8\\times8$,\n$4\\times4$, and $2\\times 2$ square pixel patterns from various datasets of\nbinary images. It then uses these dictionaries to encode binary images. These\ndictionaries have various interesting properties that are further exploited to\nconstruct an efficient scheme. Our preliminary results show that the proposed\nscheme consistently outperforms existing conventional and learning based\nlossless compression approaches, and provides, on average, as much as\n$1.5\\times$ better performance than a common general purpose lossless\ncompression scheme (WebP), more than $3\\times$ better performance than a state\nof the art learning based scheme, and better performance than a specialized\nscheme for binary image compression (JBIG2).\n","authors":["Samar Agnihotri","Renu Rameshan","Ritwik Ghosal"],"pdf_url":"https://arxiv.org/pdf/2406.03087v1.pdf","comment":"11 pages, 7 figures, and 5 tables"},{"id":"http://arxiv.org/abs/2406.03071v1","updated":"2024-06-05T08:56:24Z","published":"2024-06-05T08:56:24Z","title":"Exploiting LMM-based knowledge for image classification tasks","summary":"  In this paper we address image classification tasks leveraging knowledge\nencoded in Large Multimodal Models (LMMs). More specifically, we use the\nMiniGPT-4 model to extract semantic descriptions for the images, in a\nmultimodal prompting fashion. In the current literature, vision language models\nsuch as CLIP, among other approaches, are utilized as feature extractors, using\nonly the image encoder, for solving image classification tasks. In this paper,\nwe propose to additionally use the text encoder to obtain the text embeddings\ncorresponding to the MiniGPT-4-generated semantic descriptions. Thus, we use\nboth the image and text embeddings for solving the image classification task.\nThe experimental evaluation on three datasets validates the improved\nclassification performance achieved by exploiting LMM-based knowledge.\n","authors":["Maria Tzelepi","Vasileios Mezaris"],"pdf_url":"https://arxiv.org/pdf/2406.03071v1.pdf","comment":"Accepted for publication, 25th Int. Conf. on Engineering Applications\n  of Neural Networks (EANN/EAAAI 2024), Corfu, Greece, June 2024. This is the\n  \"submitted manuscript\""},{"id":"http://arxiv.org/abs/2406.03070v1","updated":"2024-06-05T08:55:02Z","published":"2024-06-05T08:55:02Z","title":"A-Bench: Are LMMs Masters at Evaluating AI-generated Images?","summary":"  How to accurately and efficiently assess AI-generated images (AIGIs) remains\na critical challenge for generative models. Given the high costs and extensive\ntime commitments required for user studies, many researchers have turned\ntowards employing large multi-modal models (LMMs) as AIGI evaluators, the\nprecision and validity of which are still questionable. Furthermore,\ntraditional benchmarks often utilize mostly natural-captured content rather\nthan AIGIs to test the abilities of LMMs, leading to a noticeable gap for\nAIGIs. Therefore, we introduce A-Bench in this paper, a benchmark designed to\ndiagnose whether LMMs are masters at evaluating AIGIs. Specifically, A-Bench is\norganized under two key principles: 1) Emphasizing both high-level semantic\nunderstanding and low-level visual quality perception to address the intricate\ndemands of AIGIs. 2) Various generative models are utilized for AIGI creation,\nand various LMMs are employed for evaluation, which ensures a comprehensive\nvalidation scope. Ultimately, 2,864 AIGIs from 16 text-to-image models are\nsampled, each paired with question-answers annotated by human experts, and\ntested across 18 leading LMMs. We hope that A-Bench will significantly enhance\nthe evaluation process and promote the generation quality for AIGIs. The\nbenchmark is available at https://github.com/Q-Future/A-Bench.\n","authors":["Zicheng Zhang","Haoning Wu","Chunyi Li","Yingjie Zhou","Wei Sun","Xiongkuo Min","Zijian Chen","Xiaohong Liu","Weisi Lin","Guangtao Zhai"],"pdf_url":"https://arxiv.org/pdf/2406.03070v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.03065v1","updated":"2024-06-05T08:49:51Z","published":"2024-06-05T08:49:51Z","title":"Decision Boundary-aware Knowledge Consolidation Generates Better\n  Instance-Incremental Learner","summary":"  Instance-incremental learning (IIL) focuses on learning continually with data\nof the same classes. Compared to class-incremental learning (CIL), the IIL is\nseldom explored because IIL suffers less from catastrophic forgetting (CF).\nHowever, besides retaining knowledge, in real-world deployment scenarios where\nthe class space is always predefined, continual and cost-effective model\npromotion with the potential unavailability of previous data is a more\nessential demand. Therefore, we first define a new and more practical IIL\nsetting as promoting the model's performance besides resisting CF with only new\nobservations. Two issues have to be tackled in the new IIL setting: 1) the\nnotorious catastrophic forgetting because of no access to old data, and 2)\nbroadening the existing decision boundary to new observations because of\nconcept drift. To tackle these problems, our key insight is to moderately\nbroaden the decision boundary to fail cases while retain old boundary. Hence,\nwe propose a novel decision boundary-aware distillation method with\nconsolidating knowledge to teacher to ease the student learning new knowledge.\nWe also establish the benchmarks on existing datasets Cifar-100 and ImageNet.\nNotably, extensive experiments demonstrate that the teacher model can be a\nbetter incremental learner than the student model, which overturns previous\nknowledge distillation-based methods treating student as the main role.\n","authors":["Qiang Nie","Weifu Fu","Yuhuan Lin","Jialin Li","Yifeng Zhou","Yong Liu","Lei Zhu","Chengjie Wang"],"pdf_url":"https://arxiv.org/pdf/2406.03065v1.pdf","comment":"14 pages"},{"id":"http://arxiv.org/abs/2406.03051v1","updated":"2024-06-05T08:26:44Z","published":"2024-06-05T08:26:44Z","title":"Adapter-X: A Novel General Parameter-Efficient Fine-Tuning Framework for\n  Vision","summary":"  Parameter-efficient fine-tuning (PEFT) has become increasingly important as\nfoundation models continue to grow in both popularity and size. Adapter has\nbeen particularly well-received due to their potential for parameter reduction\nand adaptability across diverse tasks. However, striking a balance between high\nefficiency and robust generalization across tasks remains a challenge for\nadapter-based methods. We analyze existing methods and find that: 1) parameter\nsharing is the key to reducing redundancy; 2) more tunable parameters, dynamic\nallocation, and block-specific design are keys to improving performance.\nUnfortunately, no previous work considers all these factors. Inspired by this\ninsight, we introduce a novel framework named Adapter-X. First, a Sharing\nMixture of Adapters (SMoA) module is proposed to fulfill token-level dynamic\nallocation, increased tunable parameters, and inter-block sharing at the same\ntime. Second, some block-specific designs like Prompt Generator (PG) are\nintroduced to further enhance the ability of adaptation. Extensive experiments\nacross 2D image and 3D point cloud modalities demonstrate that Adapter-X\nrepresents a significant milestone as it is the first to outperform full\nfine-tuning in both 2D image and 3D point cloud modalities with significantly\nfewer parameters, i.e., only 0.20% and 1.88% of original trainable parameters\nfor 2D and 3D classification tasks. Our code will be publicly available.\n","authors":["Minglei Li","Peng Ye","Yongqi Huang","Lin Zhang","Tao Chen","Tong He","Jiayuan Fan","Wanli Ouyang"],"pdf_url":"https://arxiv.org/pdf/2406.03051v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.03048v1","updated":"2024-06-05T08:23:38Z","published":"2024-06-05T08:23:38Z","title":"Giving each task what it needs -- leveraging structured sparsity for\n  tailored multi-task learning","summary":"  Every task demands distinct feature representations, ranging from low-level\nto high-level attributes, so it is vital to address the specific needs of each\ntask, especially in the Multi-task Learning (MTL) framework. This work,\ntherefore, introduces Layer-Optimized Multi-Task (LOMT) models that utilize\nstructured sparsity to refine feature selection for individual tasks and\nenhance the performance of all tasks in a multi-task scenario. Structured or\ngroup sparsity systematically eliminates parameters from trivial channels and,\neventually, entire layers within a convolution neural network during training.\nConsequently, the remaining layers provide the most optimal features for a\ngiven task. In this two-step approach, we subsequently leverage this\nsparsity-induced optimal layer information to build the LOMT models by\nconnecting task-specific decoders to these strategically identified layers,\ndeviating from conventional approaches that uniformly connect decoders at the\nend of the network. This tailored architecture optimizes the network, focusing\non essential features while reducing redundancy. We validate the efficacy of\nthe proposed approach on two datasets, ie NYU-v2 and CelebAMask-HD datasets,\nfor multiple heterogeneous tasks. A detailed performance analysis of the LOMT\nmodels, in contrast to the conventional MTL models, reveals that the LOMT\nmodels outperform for most task combinations. The excellent qualitative and\nquantitative outcomes highlight the effectiveness of employing structured\nsparsity for optimal layer (or feature) selection.\n","authors":["Richa Upadhyay","Ronald Phlypo","Rajkumar Saini","Marcus Liwicki"],"pdf_url":"https://arxiv.org/pdf/2406.03048v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.02228v2","updated":"2024-06-05T08:22:29Z","published":"2023-12-04T03:05:59Z","title":"PixelLM: Pixel Reasoning with Large Multimodal Model","summary":"  While large multimodal models (LMMs) have achieved remarkable progress,\ngenerating pixel-level masks for image reasoning tasks involving multiple\nopen-world targets remains a challenge. To bridge this gap, we introduce\nPixelLM, an effective and efficient LMM for pixel-level reasoning and\nunderstanding. Central to PixelLM is a novel, lightweight pixel decoder and a\ncomprehensive segmentation codebook. The decoder efficiently produces masks\nfrom the hidden embeddings of the codebook tokens, which encode detailed\ntarget-relevant information. With this design, PixelLM harmonizes with the\nstructure of popular LMMs and avoids the need for additional costly\nsegmentation models. Furthermore, we propose a target refinement loss to\nenhance the model's ability to differentiate between multiple targets, leading\nto substantially improved mask quality. To advance research in this area, we\nconstruct MUSE, a high-quality multi-target reasoning segmentation benchmark.\nPixelLM excels across various pixel-level image reasoning and understanding\ntasks, outperforming well-established methods in multiple benchmarks, including\nMUSE, single- and multi-referring segmentation. Comprehensive ablations confirm\nthe efficacy of each proposed component. All code, models, and datasets will be\npublicly available.\n","authors":["Zhongwei Ren","Zhicheng Huang","Yunchao Wei","Yao Zhao","Dongmei Fu","Jiashi Feng","Xiaojie Jin"],"pdf_url":"https://arxiv.org/pdf/2312.02228v2.pdf","comment":"(Accepted by CVPR 2024) Code and models are released at:\n  https://pixellm.github.io/"},{"id":"http://arxiv.org/abs/2405.19055v2","updated":"2024-06-05T08:13:03Z","published":"2024-05-29T12:56:11Z","title":"FUSU: A Multi-temporal-source Land Use Change Segmentation Dataset for\n  Fine-grained Urban Semantic Understanding","summary":"  Fine urban change segmentation using multi-temporal remote sensing images is\nessential for understanding human-environment interactions in urban areas.\nDespite advances in remote sensing data for urban monitoring, coarse-grained\nclassification systems and the lack of continuous temporal observations hinder\nthe application of deep learning to urban change analysis. To address this, we\nintroduce FUSU, a multi-source, multi-temporal change segmentation dataset for\nFine-grained Urban Semantic Understanding. FUSU features the most detailed land\nuse classification system to date, with 17 classes and 30 billion pixels of\nannotations. It includes bi-temporal high-resolution satellite images with\n20-50 cm ground sample distance and monthly optical and radar satellite time\nseries, covering 847 km2 across five urban areas in China. The fine-grained\npixel-wise annotations and high spatial-temporal resolution data provide a\nrobust foundation for deep learning models to understand urbanization and land\nuse changes. To fully leverage FUSU, we propose a unified time-series\narchitecture for both change detection and segmentation and then benchmark FUSU\non various methods for several tasks. Dataset and code will be available at:\nhttps://github.com/yuanshuai0914/FUSU.\n","authors":["Shuai Yuan","Guancong Lin","Lixian Zhang","Runmin Dong","Jinxiao Zhang","Shuang Chen","Juepeng Zheng","Jie Wang","Haohuan Fu"],"pdf_url":"https://arxiv.org/pdf/2405.19055v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.12466v3","updated":"2024-06-05T08:10:26Z","published":"2024-03-19T05:50:48Z","title":"Few-shot Object Localization","summary":"  Existing object localization methods are tailored to locate specific classes\nof objects, relying heavily on abundant labeled data for model optimization.\nHowever, acquiring large amounts of labeled data is challenging in many\nreal-world scenarios, significantly limiting the broader application of\nlocalization models. To bridge this research gap, this paper defines a novel\ntask named Few-Shot Object Localization (FSOL), which aims to achieve precise\nlocalization with limited samples. This task achieves generalized object\nlocalization by leveraging a small number of labeled support samples to query\nthe positional information of objects within corresponding images. To advance\nthis field, we design an innovative high-performance baseline model. This model\nintegrates a dual-path feature augmentation module to enhance shape association\nand gradient differences between supports and query images, alongside a self\nquery module to explore the association between feature maps and query images.\nExperimental results demonstrate a significant performance improvement of our\napproach in the FSOL task, establishing an efficient benchmark for further\nresearch. All codes and data are available at https://github.com/Ryh1218/FSOL.\n","authors":["Yunhan Ren","Bo Li","Chengyang Zhang","Yong Zhang","Baocai Yin"],"pdf_url":"https://arxiv.org/pdf/2403.12466v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.03035v1","updated":"2024-06-05T08:03:18Z","published":"2024-06-05T08:03:18Z","title":"Follow-Your-Pose v2: Multiple-Condition Guided Character Image Animation\n  for Stable Pose Control","summary":"  Pose-controllable character video generation is in high demand with extensive\napplications for fields such as automatic advertising and content creation on\nsocial media platforms. While existing character image animation methods using\npose sequences and reference images have shown promising performance, they tend\nto struggle with incoherent animation in complex scenarios, such as multiple\ncharacter animation and body occlusion. Additionally, current methods request\nlarge-scale high-quality videos with stable backgrounds and temporal\nconsistency as training datasets, otherwise, their performance will greatly\ndeteriorate. These two issues hinder the practical utilization of character\nimage animation tools. In this paper, we propose a practical and robust\nframework Follow-Your-Pose v2, which can be trained on noisy open-sourced\nvideos readily available on the internet. Multi-condition guiders are designed\nto address the challenges of background stability, body occlusion in\nmulti-character generation, and consistency of character appearance. Moreover,\nto fill the gap of fair evaluation of multi-character pose animation, we\npropose a new benchmark comprising approximately 4,000 frames. Extensive\nexperiments demonstrate that our approach outperforms state-of-the-art methods\nby a margin of over 35\\% across 2 datasets and on 7 metrics. Meanwhile,\nqualitative assessments reveal a significant improvement in the quality of\ngenerated video, particularly in scenarios involving complex backgrounds and\nbody occlusion of multi-character, suggesting the superiority of our approach.\n","authors":["Jingyun Xue","Hongfa Wang","Qi Tian","Yue Ma","Andong Wang","Zhiyuan Zhao","Shaobo Min","Wenzhe Zhao","Kaihao Zhang","Heung-Yeung Shum","Wei Liu","Mengyang Liu","Wenhan Luo"],"pdf_url":"https://arxiv.org/pdf/2406.03035v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.03032v1","updated":"2024-06-05T07:59:48Z","published":"2024-06-05T07:59:48Z","title":"Instructing Prompt-to-Prompt Generation for Zero-Shot Learning","summary":"  Zero-shot learning (ZSL) aims to explore the semantic-visual interactions to\ndiscover comprehensive knowledge transferred from seen categories to classify\nunseen categories. Recently, prompt engineering has emerged in ZSL,\ndemonstrating impressive potential as it enables the zero-shot transfer of\ndiverse visual concepts to downstream tasks. However, these methods are still\nnot well generalized to broad unseen domains. A key reason is that the fixed\nadaption of learnable prompts on seen domains makes it tend to over-emphasize\nthe primary visual features observed during training. In this work, we propose\na \\textbf{P}rompt-to-\\textbf{P}rompt generation methodology (\\textbf{P2P}),\nwhich addresses this issue by further embracing the instruction-following\ntechnique to distill instructive visual prompts for comprehensive transferable\nknowledge discovery. The core of P2P is to mine semantic-related instruction\nfrom prompt-conditioned visual features and text instruction on modal-sharing\nsemantic concepts and then inversely rectify the visual representations with\nthe guidance of the learned instruction prompts. This enforces the compensation\nfor missing visual details to primary contexts and further eliminates the\ncross-modal disparity, endowing unseen domain generalization. Through extensive\nexperimental results, we demonstrate the efficacy of P2P in achieving superior\nperformance over state-of-the-art methods.\n","authors":["Man Liu","Huihui Bai","Feng Li","Chunjie Zhang","Yunchao Wei","Meng Wang","Tat-Seng Chua","Yao Zhao"],"pdf_url":"https://arxiv.org/pdf/2406.03032v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.20247v3","updated":"2024-06-05T07:52:07Z","published":"2024-05-30T16:58:34Z","title":"KerasCV and KerasNLP: Vision and Language Power-Ups","summary":"  We present the Keras domain packages KerasCV and KerasNLP, extensions of the\nKeras API for Computer Vision and Natural Language Processing workflows,\ncapable of running on either JAX, TensorFlow, or PyTorch. These domain packages\nare designed to enable fast experimentation, with a focus on ease-of-use and\nperformance. We adopt a modular, layered design: at the library's lowest level\nof abstraction, we provide building blocks for creating models and data\npreprocessing pipelines, and at the library's highest level of abstraction, we\nprovide pretrained ``task\" models for popular architectures such as Stable\nDiffusion, YOLOv8, GPT2, BERT, Mistral, CLIP, Gemma, T5, etc. Task models have\nbuilt-in preprocessing, pretrained weights, and can be fine-tuned on raw\ninputs. To enable efficient training, we support XLA compilation for all\nmodels, and run all preprocessing via a compiled graph of TensorFlow operations\nusing the tf.data API. The libraries are fully open-source (Apache 2.0 license)\nand available on GitHub.\n","authors":["Matthew Watson","Divyashree Shivakumar Sreepathihalli","Francois Chollet","Martin Gorner","Kiranbir Sodhia","Ramesh Sampath","Tirth Patel","Haifeng Jin","Neel Kovelamudi","Gabriel Rasskin","Samaneh Saadat","Luke Wood","Chen Qian","Jonathan Bischof","Ian Stenbit","Abheesht Sharma","Anshuman Mishra"],"pdf_url":"https://arxiv.org/pdf/2405.20247v3.pdf","comment":"Submitted to Journal of Machine Learning Open Source Software"},{"id":"http://arxiv.org/abs/2403.09117v2","updated":"2024-06-05T07:49:25Z","published":"2024-03-14T05:40:23Z","title":"Randomized Principal Component Analysis for Hyperspectral Image\n  Classification","summary":"  The high-dimensional feature space of the hyperspectral imagery poses major\nchallenges to the processing and analysis of the hyperspectral data sets. In\nsuch a case, dimensionality reduction is necessary to decrease the\ncomputational complexity. The random projections open up new ways of\ndimensionality reduction, especially for large data sets. In this paper, the\nprincipal component analysis (PCA) and randomized principal component analysis\n(R-PCA) for the classification of hyperspectral images using support vector\nmachines (SVM) and light gradient boosting machines (LightGBM) have been\ninvestigated. In this experimental research, the number of features was reduced\nto 20 and 30 for classification of two hyperspectral datasets (Indian Pines and\nPavia University). The experimental results demonstrated that PCA outperformed\nR-PCA for SVM for both datasets, but received close accuracy values for\nLightGBM. The highest classification accuracies were obtained as 0.9925 and\n0.9639 by LightGBM with original features for the Pavia University and Indian\nPines, respectively.\n","authors":["Mustafa Ustuner"],"pdf_url":"https://arxiv.org/pdf/2403.09117v2.pdf","comment":"5 pages, I have submitted this paper to M2GARSS 2024, 2024 IEEE\n  Mediterranean and Middle-East Geoscience and Remote Sensing Symposium"},{"id":"http://arxiv.org/abs/2306.10311v5","updated":"2024-06-05T07:44:13Z","published":"2023-06-17T10:10:15Z","title":"Efficient HDR Reconstruction from Real-World Raw Images","summary":"  The widespread usage of high-definition screens on edge devices stimulates a\nstrong demand for efficient high dynamic range (HDR) algorithms. However, many\nexisting HDR methods either deliver unsatisfactory results or consume too much\ncomputational and memory resources, hindering their application to\nhigh-resolution images (usually with more than 12 megapixels) in practice. In\naddition, existing HDR dataset collection methods often are labor-intensive. In\nthis work, in a new aspect, we discover an excellent opportunity for HDR\nreconstructing directly from raw images and investigating novel neural network\nstructures that benefit the deployment of mobile devices. Our key insights are\nthreefold: (1) we develop a lightweight-efficient HDR model, RepUNet, using the\nstructural re-parameterization technique to achieve fast and robust HDR; (2) we\ndesign a new computational raw HDR data formation pipeline and construct a\nreal-world raw HDR dataset, RealRaw-HDR; (3) we propose a plug-and-play motion\nalignment loss to mitigate motion ghosting under limited bandwidth conditions.\nOur model contains less than 830K parameters and takes less than 3 ms to\nprocess an image of 4K resolution using one RTX 3090 GPU. While being highly\nefficient, our model also outperforms the state-of-the-art HDR methods in terms\nof PSNR, SSIM, and a color difference metric.\n","authors":["Qirui Yang","Yihao Liu","Qihua Chen","Huanjing Yue","Kun Li","Jingyu Yang"],"pdf_url":"https://arxiv.org/pdf/2306.10311v5.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.19882v2","updated":"2024-06-05T07:42:18Z","published":"2024-05-30T09:41:10Z","title":"PixOOD: Pixel-Level Out-of-Distribution Detection","summary":"  We propose a dense image prediction out-of-distribution detection algorithm,\ncalled PixOOD, which does not require training on samples of anomalous data and\nis not designed for a specific application which avoids traditional training\nbiases. In order to model the complex intra-class variability of the\nin-distribution data at the pixel level, we propose an online data condensation\nalgorithm which is more robust than standard K-means and is easily trainable\nthrough SGD. We evaluate PixOOD on a wide range of problems. It achieved\nstate-of-the-art results on four out of seven datasets, while being competitive\non the rest. The source code is available at https://github.com/vojirt/PixOOD.\n","authors":["Tomáš Vojíř","Jan Šochman","Jiří Matas"],"pdf_url":"https://arxiv.org/pdf/2405.19882v2.pdf","comment":"under review"},{"id":"http://arxiv.org/abs/2406.03019v1","updated":"2024-06-05T07:34:39Z","published":"2024-06-05T07:34:39Z","title":"Puzzle Pieces Picker: Deciphering Ancient Chinese Characters with\n  Radical Reconstruction","summary":"  Oracle Bone Inscriptions is one of the oldest existing forms of writing in\nthe world. However, due to the great antiquity of the era, a large number of\nOracle Bone Inscriptions (OBI) remain undeciphered, making it one of the global\nchallenges in the field of paleography today. This paper introduces a novel\napproach, namely Puzzle Pieces Picker (P$^3$), to decipher these enigmatic\ncharacters through radical reconstruction. We deconstruct OBI into foundational\nstrokes and radicals, then employ a Transformer model to reconstruct them into\ntheir modern (conterpart)\\textcolor{blue}{counterparts}, offering a\ngroundbreaking solution to ancient script analysis. To further this endeavor, a\nnew Ancient Chinese Character Puzzles (ACCP) dataset was developed, comprising\nan extensive collection of character images from seven key historical stages,\nannotated with detailed radical sequences. The experiments have showcased\nconsiderable promising insights, underscoring the potential and effectiveness\nof our approach in deciphering the intricacies of ancient Chinese scripts.\nThrough this novel dataset and methodology, we aim to bridge the gap between\ntraditional philology and modern document analysis techniques, offering new\ninsights into the rich history of Chinese linguistic heritage.\n","authors":["Pengjie Wang","Kaile Zhang","Xinyu Wang","Shengwei Han","Yongge Liu","Lianwen Jin","Xiang Bai","Yuliang Liu"],"pdf_url":"https://arxiv.org/pdf/2406.03019v1.pdf","comment":"ICDAR 2024"},{"id":"http://arxiv.org/abs/2404.13680v3","updated":"2024-06-05T07:32:32Z","published":"2024-04-21T14:43:31Z","title":"Zero-shot High-fidelity and Pose-controllable Character Animation","summary":"  Image-to-video (I2V) generation aims to create a video sequence from a single\nimage, which requires high temporal coherence and visual fidelity. However,\nexisting approaches suffer from inconsistency of character appearances and poor\npreservation of fine details. Moreover, they require a large amount of video\ndata for training, which can be computationally demanding. To address these\nlimitations, we propose PoseAnimate, a novel zero-shot I2V framework for\ncharacter animation. PoseAnimate contains three key components: 1) a Pose-Aware\nControl Module (PACM) that incorporates diverse pose signals into text\nembeddings, to preserve character-independent content and maintain precise\nalignment of actions. 2) a Dual Consistency Attention Module (DCAM) that\nenhances temporal consistency and retains character identity and intricate\nbackground details. 3) a Mask-Guided Decoupling Module (MGDM) that refines\ndistinct feature perception abilities, improving animation fidelity by\ndecoupling the character and background. We also propose a Pose Alignment\nTransition Algorithm (PATA) to ensure smooth action transition. Extensive\nexperiment results demonstrate that our approach outperforms the\nstate-of-the-art training-based methods in terms of character consistency and\ndetail fidelity. Moreover, it maintains a high level of temporal coherence\nthroughout the generated animations.\n","authors":["Bingwen Zhu","Fanyi Wang","Tianyi Lu","Peng Liu","Jingwen Su","Jinxiu Liu","Yanhao Zhang","Zuxuan Wu","Guo-Jun Qi","Yu-Gang Jiang"],"pdf_url":"https://arxiv.org/pdf/2404.13680v3.pdf","comment":"10 pages, 5 figures"},{"id":"http://arxiv.org/abs/2406.03017v1","updated":"2024-06-05T07:32:29Z","published":"2024-06-05T07:32:29Z","title":"DifAttack++: Query-Efficient Black-Box Adversarial Attack via\n  Hierarchical Disentangled Feature Space in Cross Domain","summary":"  This work investigates efficient score-based black-box adversarial attacks\nwith a high Attack Success Rate (ASR) and good generalizability. We design a\nnovel attack method based on a \\textit{Hierarchical} \\textbf{Di}sentangled\n\\textbf{F}eature space and \\textit{cross domain}, called \\textbf{DifAttack++},\nwhich differs significantly from the existing ones operating over the entire\nfeature space. Specifically, DifAttack++ firstly disentangles an image's latent\nfeature into an \\textit{adversarial feature} (AF) and a \\textit{visual feature}\n(VF) via an autoencoder equipped with our specially designed\n\\textbf{H}ierarchical \\textbf{D}ecouple-\\textbf{F}usion (HDF) module, where the\nAF dominates the adversarial capability of an image, while the VF largely\ndetermines its visual appearance. We train such autoencoders for the clean and\nadversarial image domains respectively, meanwhile realizing feature\ndisentanglement, by using pairs of clean images and their Adversarial Examples\n(AEs) generated from available surrogate models via white-box attack methods.\nEventually, in the black-box attack stage, DifAttack++ iteratively optimizes\nthe AF according to the query feedback from the victim model until a successful\nAE is generated, while keeping the VF unaltered. Extensive experimental results\ndemonstrate that our method achieves superior ASR and query efficiency than\nSOTA methods, meanwhile exhibiting much better visual quality of AEs. The code\nis available at https://github.com/csjunjun/DifAttack.git.\n","authors":["Jun Liu","Jiantao Zhou","Jiandian Zeng","Jinyu Tian"],"pdf_url":"https://arxiv.org/pdf/2406.03017v1.pdf","comment":"arXiv admin note: substantial text overlap with arXiv:2309.14585"},{"id":"http://arxiv.org/abs/2406.03015v1","updated":"2024-06-05T07:31:05Z","published":"2024-06-05T07:31:05Z","title":"Balancing Performance and Efficiency in Zero-shot Robotic Navigation","summary":"  We present an optimization study of the Vision-Language Frontier Maps (VLFM)\napplied to the Object Goal Navigation task in robotics. Our work evaluates the\nefficiency and performance of various vision-language models, object detectors,\nsegmentation models, and multi-modal comprehension and Visual Question\nAnswering modules. Using the $\\textit{val-mini}$ and $\\textit{val}$ splits of\nHabitat-Matterport 3D dataset, we conduct experiments on a desktop with limited\nVRAM. We propose a solution that achieves a higher success rate (+1.55%)\nimproving over the VLFM BLIP-2 baseline without substantial success-weighted\npath length loss while requiring $\\textbf{2.3 times}$ less video memory. Our\nfindings provide insights into balancing model performance and computational\nefficiency, suggesting effective deployment strategies for resource-limited\nenvironments.\n","authors":["Dmytro Kuzmenko","Nadiya Shvai"],"pdf_url":"https://arxiv.org/pdf/2406.03015v1.pdf","comment":"Submitted to ICTERI 2024 Posters Track"},{"id":"http://arxiv.org/abs/2401.15365v3","updated":"2024-06-05T07:23:00Z","published":"2024-01-27T09:54:16Z","title":"An open dataset for oracle bone script recognition and decipherment","summary":"  Oracle Bone Script (OBS), one of the earliest known forms of ancient Chinese\nwriting, holds invaluable insights into the humanities and geography of the\nShang Dynasty, dating back 3,000 years. The immense historical and cultural\nsignificance of these writings cannot be overstated. However, the passage of\ntime has obscured much of their meaning, presenting a significant challenge in\ndeciphering these ancient texts. With the advent of Artificial Intelligence\n(AI), employing AI to assist in interpreting OBS has become a feasible option.\nYet, progress in this area has been hindered by a lack of high-quality\ndatasets. To address this issue, this paper details the creation of the\nHUST-OBS dataset. This dataset encompasses 77,064 images of 1,588 individual\ndeciphered scripts and 62,989 images of 9,411 undeciphered characters, with a\ntotal of 140,053 images, compiled from diverse sources. Additionally, all\nimages and labels have been reviewed and corrected by experts in oracle bone\nstudies. The hope is that this dataset could inspire and assist future research\nin deciphering those unknown OBS. All the codes and datasets are available at\nhttps://github.com/Pengjie-W/HUST-OBC.\n","authors":["Pengjie Wang","Kaile Zhang","Xinyu Wang","Shengwei Han","Yongge Liu","Jinpeng Wan","Haisu Guan","Zhebin Kuang","Lianwen Jin","Xiang Bai","Yuliang Liu"],"pdf_url":"https://arxiv.org/pdf/2401.15365v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.03008v1","updated":"2024-06-05T07:14:44Z","published":"2024-06-05T07:14:44Z","title":"DriVLMe: Enhancing LLM-based Autonomous Driving Agents with Embodied and\n  Social Experiences","summary":"  Recent advancements in foundation models (FMs) have unlocked new prospects in\nautonomous driving, yet the experimental settings of these studies are\npreliminary, over-simplified, and fail to capture the complexity of real-world\ndriving scenarios in human environments. It remains under-explored whether FM\nagents can handle long-horizon navigation tasks with free-from dialogue and\ndeal with unexpected situations caused by environmental dynamics or task\nchanges. To explore the capabilities and boundaries of FMs faced with the\nchallenges above, we introduce DriVLMe, a video-language-model-based agent to\nfacilitate natural and effective communication between humans and autonomous\nvehicles that perceive the environment and navigate. We develop DriVLMe from\nboth embodied experiences in a simulated environment and social experiences\nfrom real human dialogue. While DriVLMe demonstrates competitive performance in\nboth open-loop benchmarks and closed-loop human studies, we reveal several\nlimitations and challenges, including unacceptable inference time, imbalanced\ntraining data, limited visual understanding, challenges with multi-turn\ninteractions, simplified language generation from robotic experiences, and\ndifficulties in handling on-the-fly unexpected situations like environmental\ndynamics and task changes.\n","authors":["Yidong Huang","Jacob Sansom","Ziqiao Ma","Felix Gervits","Joyce Chai"],"pdf_url":"https://arxiv.org/pdf/2406.03008v1.pdf","comment":"First Vision and Language for Autonomous Driving and Robotics\n  Workshop (VLADR @ CVPR 2024)"},{"id":"http://arxiv.org/abs/2406.03002v1","updated":"2024-06-05T07:09:19Z","published":"2024-06-05T07:09:19Z","title":"Phy-Diff: Physics-guided Hourglass Diffusion Model for Diffusion MRI\n  Synthesis","summary":"  Diffusion MRI (dMRI) is an important neuroimaging technique with high\nacquisition costs. Deep learning approaches have been used to enhance dMRI and\npredict diffusion biomarkers through undersampled dMRI. To generate more\ncomprehensive raw dMRI, generative adversarial network based methods are\nproposed to include b-values and b-vectors as conditions, but they are limited\nby unstable training and less desirable diversity. The emerging diffusion model\n(DM) promises to improve generative performance. However, it remains\nchallenging to include essential information in conditioning DM for more\nrelevant generation, i.e., the physical principles of dMRI and white matter\ntract structures. In this study, we propose a physics-guided diffusion model to\ngenerate high-quality dMRI. Our model introduces the physical principles of\ndMRI in the noise evolution in the diffusion process and introduce a\nquery-based conditional mapping within the difussion model. In addition, to\nenhance the anatomical fine detials of the generation, we introduce the XTRACT\natlas as prior of white matter tracts by adopting an adapter technique. Our\nexperiment results show that our method outperforms other state-of-the-art\nmethods and has the potential to advance dMRI enhancement.\n","authors":["Juanhua Zhang","Ruodan Yan","Alessandro Perelli","Xi Chen","Chao Li"],"pdf_url":"https://arxiv.org/pdf/2406.03002v1.pdf","comment":"Accepted by MICCAI 2024"},{"id":"http://arxiv.org/abs/2406.03001v1","updated":"2024-06-05T07:06:26Z","published":"2024-06-05T07:06:26Z","title":"EdgeSync: Faster Edge-model Updating via Adaptive Continuous Learning\n  for Video Data Drift","summary":"  Real-time video analytics systems typically place models with fewer weights\non edge devices to reduce latency. The distribution of video content features\nmay change over time for various reasons (i.e. light and weather change) ,\nleading to accuracy degradation of existing models, to solve this problem,\nrecent work proposes a framework that uses a remote server to continually train\nand adapt the lightweight model at edge with the help of complex model.\nHowever, existing analytics approaches leave two challenges untouched: firstly,\nretraining task is compute-intensive, resulting in large model update delays;\nsecondly, new model may not fit well enough with the data distribution of the\ncurrent video stream. To address these challenges, in this paper, we present\nEdgeSync, EdgeSync filters the samples by considering both timeliness and\ninference results to make training samples more relevant to the current video\ncontent as well as reduce the update delay, to improve the quality of training,\nEdgeSync also designs a training management module that can efficiently adjusts\nthe model training time and training order on the runtime. By evaluating real\ndatasets with complex scenes, our method improves about 3.4% compared to\nexisting methods and about 10% compared to traditional means.\n","authors":["Peng Zhao","Runchu Dong","Guiqin Wang","Cong Zhao"],"pdf_url":"https://arxiv.org/pdf/2406.03001v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.11568v2","updated":"2024-06-05T07:05:15Z","published":"2024-03-18T08:42:08Z","title":"EffiVED:Efficient Video Editing via Text-instruction Diffusion Models","summary":"  Large-scale text-to-video models have shown remarkable abilities, but their\ndirect application in video editing remains challenging due to limited\navailable datasets. Current video editing methods commonly require per-video\nfine-tuning of diffusion models or specific inversion optimization to ensure\nhigh-fidelity edits. In this paper, we introduce EffiVED, an efficient\ndiffusion-based model that directly supports instruction-guided video editing.\nTo achieve this, we present two efficient workflows to gather video editing\npairs, utilizing augmentation and fundamental vision-language techniques. These\nworkflows transform vast image editing datasets and open-world videos into a\nhigh-quality dataset for training EffiVED. Experimental results reveal that\nEffiVED not only generates high-quality editing videos but also executes\nrapidly. Finally, we demonstrate that our data collection method significantly\nimproves editing performance and can potentially tackle the scarcity of video\nediting data. Code can be found at https://github.com/alibaba/EffiVED.\n","authors":["Zhenghao Zhang","Zuozhuo Dai","Long Qin","Weizhi Wang"],"pdf_url":"https://arxiv.org/pdf/2403.11568v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.02996v1","updated":"2024-06-05T06:52:29Z","published":"2024-06-05T06:52:29Z","title":"Quantifying Task Priority for Multi-Task Optimization","summary":"  The goal of multi-task learning is to learn diverse tasks within a single\nunified network. As each task has its own unique objective function, conflicts\nemerge during training, resulting in negative transfer among them. Earlier\nresearch identified these conflicting gradients in shared parameters between\ntasks and attempted to realign them in the same direction. However, we prove\nthat such optimization strategies lead to sub-optimal Pareto solutions due to\ntheir inability to accurately determine the individual contributions of each\nparameter across various tasks. In this paper, we propose the concept of task\npriority to evaluate parameter contributions across different tasks. To learn\ntask priority, we identify the type of connections related to links between\nparameters influenced by task-specific losses during backpropagation. The\nstrength of connections is gauged by the magnitude of parameters to determine\ntask priority. Based on these, we present a new method named connection\nstrength-based optimization for multi-task learning which consists of two\nphases. The first phase learns the task priority within the network, while the\nsecond phase modifies the gradients while upholding this priority. This\nultimately leads to finding new Pareto optimal solutions for multiple tasks.\nThrough extensive experiments, we show that our approach greatly enhances\nmulti-task performance in comparison to earlier gradient manipulation methods.\n","authors":["Wooseong Jeong","Kuk-Jin Yoon"],"pdf_url":"https://arxiv.org/pdf/2406.02996v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.02991v1","updated":"2024-06-05T06:43:48Z","published":"2024-06-05T06:43:48Z","title":"A Human-Annotated Video Dataset for Training and Evaluation of\n  360-Degree Video Summarization Methods","summary":"  In this paper we introduce a new dataset for 360-degree video summarization:\nthe transformation of 360-degree video content to concise 2D-video summaries\nthat can be consumed via traditional devices, such as TV sets and smartphones.\nThe dataset includes ground-truth human-generated summaries, that can be used\nfor training and objectively evaluating 360-degree video summarization methods.\nUsing this dataset, we train and assess two state-of-the-art summarization\nmethods that were originally proposed for 2D-video summarization, to serve as a\nbaseline for future comparisons with summarization methods that are\nspecifically tailored to 360-degree video. Finally, we present an interactive\ntool that was developed to facilitate the data annotation process and can\nassist other annotation activities that rely on video fragment selection.\n","authors":["Ioannis Kontostathis","Evlampios Apostolidis","Vasileios Mezaris"],"pdf_url":"https://arxiv.org/pdf/2406.02991v1.pdf","comment":"Accepted for publication, 1st Int. Workshop on Video for Immersive\n  Experiences (Video4IMX-2024) at ACM IMX 2024, Stockholm, Sweden, June 2024.\n  This is the \"accepted version\""},{"id":"http://arxiv.org/abs/2406.02990v1","updated":"2024-06-05T06:42:27Z","published":"2024-06-05T06:42:27Z","title":"Predicting Genetic Mutation from Whole Slide Images via\n  Biomedical-Linguistic Knowledge Enhanced Multi-label Classification","summary":"  Predicting genetic mutations from whole slide images is indispensable for\ncancer diagnosis. However, existing work training multiple binary\nclassification models faces two challenges: (a) Training multiple binary\nclassifiers is inefficient and would inevitably lead to a class imbalance\nproblem. (b) The biological relationships among genes are overlooked, which\nlimits the prediction performance. To tackle these challenges, we innovatively\ndesign a Biological-knowledge enhanced PathGenomic multi-label Transformer to\nimprove genetic mutation prediction performances. BPGT first establishes a\nnovel gene encoder that constructs gene priors by two carefully designed\nmodules: (a) A gene graph whose node features are the genes' linguistic\ndescriptions and the cancer phenotype, with edges modeled by genes' pathway\nassociations and mutation consistencies. (b) A knowledge association module\nthat fuses linguistic and biomedical knowledge into gene priors by\ntransformer-based graph representation learning, capturing the intrinsic\nrelationships between different genes' mutations. BPGT then designs a label\ndecoder that finally performs genetic mutation prediction by two tailored\nmodules: (a) A modality fusion module that firstly fuses the gene priors with\ncritical regions in WSIs and obtains gene-wise mutation logits. (b) A\ncomparative multi-label loss that emphasizes the inherent comparisons among\nmutation status to enhance the discrimination capabilities. Sufficient\nexperiments on The Cancer Genome Atlas benchmark demonstrate that BPGT\noutperforms the state-of-the-art.\n","authors":["Gexin Huang","Chenfei Wu","Mingjie Li","Xiaojun Chang","Ling Chen","Ying Sun","Shen Zhao","Xiaodan Liang","Liang Lin"],"pdf_url":"https://arxiv.org/pdf/2406.02990v1.pdf","comment":"16 pages, 8 figures, and 3 tables"},{"id":"http://arxiv.org/abs/2406.02987v1","updated":"2024-06-05T06:36:43Z","published":"2024-06-05T06:36:43Z","title":"Enhancing Multimodal Large Language Models with Multi-instance Visual\n  Prompt Generator for Visual Representation Enrichment","summary":"  Multimodal Large Language Models (MLLMs) have achieved SOTA performance in\nvarious visual language tasks by fusing the visual representations with LLMs\nleveraging some visual adapters. In this paper, we first establish that\nadapters using query-based Transformers such as Q-former is a simplified\nMulti-instance Learning method without considering instance\nheterogeneity/correlation. We then propose a general component termed\nMulti-instance Visual Prompt Generator (MIVPG) to incorporate enriched visual\nrepresentations into LLMs by taking advantage of instance correlation between\nimages or patches for the same sample. Quantatitive evaluation on three public\nvision-language (VL) datasets from different scenarios shows that the proposed\nMIVPG improves Q-former in main VL tasks.\n","authors":["Wenliang Zhong","Wenyi Wu","Qi Li","Rob Barton","Boxin Du","Shioulin Sam","Karim Bouyarmane","Ismail Tutar","Junzhou Huang"],"pdf_url":"https://arxiv.org/pdf/2406.02987v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.05583v2","updated":"2024-06-05T06:29:37Z","published":"2024-04-08T14:58:52Z","title":"Towards More General Video-based Deepfake Detection through Facial\n  Feature Guided Adaptation for Foundation Model","summary":"  With the rise of deep learning, generative models have enabled the creation\nof highly realistic synthetic images, presenting challenges due to their\npotential misuse. While research in Deepfake detection has grown rapidly in\nresponse, many detection methods struggle with unseen Deepfakes generated by\nnew synthesis techniques. To address this generalisation challenge, we propose\na novel Deepfake detection approach by adapting the Foundation Models with rich\ninformation encoded inside, specifically using the image encoder from CLIP\nwhich has demonstrated strong zero-shot capability for downstream tasks.\nInspired by the recent advances of parameter efficient fine-tuning, we propose\na novel side-network-based decoder to extract spatial and temporal cues from\nthe given video clip, with the promotion of the Facial Component Guidance (FCG)\nto encourage the spatial feature to include features of key facial parts for\nmore robust and general Deepfake detection. Through extensive cross-dataset\nevaluations, our approach exhibits superior effectiveness in identifying unseen\nDeepfake samples, achieving notable performance improvement even with limited\ntraining samples and manipulation types. Our model secures an average\nperformance enhancement of 0.9\\% AUROC in cross-dataset assessments comparing\nwith state-of-the-art methods, especially a significant lead of achieving 4.4\\%\nimprovement on the challenging DFDC dataset.\n","authors":["Yue-Hua Han","Tai-Ming Huang","Shu-Tzu Lo","Po-Han Huang","Kai-Lung Hua","Jun-Cheng Chen"],"pdf_url":"https://arxiv.org/pdf/2404.05583v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.02978v1","updated":"2024-06-05T06:21:54Z","published":"2024-06-05T06:21:54Z","title":"Self-Supervised Skeleton Action Representation Learning: A Benchmark and\n  Beyond","summary":"  Self-supervised learning (SSL), which aims to learn meaningful prior\nrepresentations from unlabeled data, has been proven effective for\nlabel-efficient skeleton-based action understanding. Different from the image\ndomain, skeleton data possesses sparser spatial structures and diverse\nrepresentation forms, with the absence of background clues and the additional\ntemporal dimension. This presents the new challenges for the pretext task\ndesign of spatial-temporal motion representation learning. Recently, many\nendeavors have been made for skeleton-based SSL and remarkable progress has\nbeen achieved. However, a systematic and thorough review is still lacking. In\nthis paper, we conduct, for the first time, a comprehensive survey on\nself-supervised skeleton-based action representation learning, where various\nliterature is organized according to their pre-training pretext task\nmethodologies. Following the taxonomy of context-based, generative learning,\nand contrastive learning approaches, we make a thorough review and benchmark of\nexisting works and shed light on the future possible directions. Our\ninvestigation demonstrates that most SSL works rely on the single paradigm,\nlearning representations of a single level, and are evaluated on the action\nrecognition task solely, which leaves the generalization power of skeleton SSL\nmodels under-explored. To this end, a novel and effective SSL method for\nskeleton is further proposed, which integrates multiple pretext tasks to\njointly learn versatile representations of different granularity, substantially\nboosting the generalization capacity for different downstream tasks. Extensive\nexperiments under three large-scale datasets demonstrate that the proposed\nmethod achieves the superior generalization performance on various downstream\ntasks, including recognition, retrieval, detection, and few-shot learning.\n","authors":["Jiahang Zhang","Lilang Lin","Shuai Yang","Jiaying Liu"],"pdf_url":"https://arxiv.org/pdf/2406.02978v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.02977v1","updated":"2024-06-05T06:21:48Z","published":"2024-06-05T06:21:48Z","title":"Sparse Color-Code Net: Real-Time RGB-Based 6D Object Pose Estimation on\n  Edge Devices","summary":"  As robotics and augmented reality applications increasingly rely on precise\nand efficient 6D object pose estimation, real-time performance on edge devices\nis required for more interactive and responsive systems. Our proposed Sparse\nColor-Code Net (SCCN) embodies a clear and concise pipeline design to\neffectively address this requirement. SCCN performs pixel-level predictions on\nthe target object in the RGB image, utilizing the sparsity of essential object\ngeometry features to speed up the Perspective-n-Point (PnP) computation\nprocess. Additionally, it introduces a novel pixel-level geometry-based object\nsymmetry representation that seamlessly integrates with the initial pose\npredictions, effectively addressing symmetric object ambiguities. SCCN notably\nachieves an estimation rate of 19 frames per second (FPS) and 6 FPS on the\nbenchmark LINEMOD dataset and the Occlusion LINEMOD dataset, respectively, for\nan NVIDIA Jetson AGX Xavier, while consistently maintaining high estimation\naccuracy at these rates.\n","authors":["Xingjian Yang","Zhitao Yu","Ashis G. Banerjee"],"pdf_url":"https://arxiv.org/pdf/2406.02977v1.pdf","comment":"Accepted for publication in the Proceedings of the 2024 IEEE 20th\n  International Conference on Automation Science and Engineering"},{"id":"http://arxiv.org/abs/2406.02976v1","updated":"2024-06-05T06:18:03Z","published":"2024-06-05T06:18:03Z","title":"DA-Flow: Dual Attention Normalizing Flow for Skeleton-based Video\n  Anomaly Detection","summary":"  Cooperation between temporal convolutional networks (TCN) and graph\nconvolutional networks (GCN) as a processing module has shown promising results\nin skeleton-based video anomaly detection (SVAD). However, to maintain a\nlightweight model with low computational and storage complexity, shallow GCN\nand TCN blocks are constrained by small receptive fields and a lack of\ncross-dimension interaction capture. To tackle this limitation, we propose a\nlightweight module called the Dual Attention Module (DAM) for capturing\ncross-dimension interaction relationships in spatio-temporal skeletal data. It\nemploys the frame attention mechanism to identify the most significant frames\nand the skeleton attention mechanism to capture broader relationships across\nfixed partitions with minimal parameters and flops. Furthermore, the proposed\nDual Attention Normalizing Flow (DA-Flow) integrates the DAM as a\npost-processing unit after GCN within the normalizing flow framework.\nSimulations show that the proposed model is robust against noise and negative\nsamples. Experimental results show that DA-Flow reaches competitive or better\nperformance than the existing state-of-the-art (SOTA) methods in terms of the\nmicro AUC metric with the fewest number of parameters. Moreover, we found that\neven without training, simply using random projection without dimensionality\nreduction on skeleton data enables substantial anomaly detection capabilities.\n","authors":["Ruituo Wu","Yang Chen","Jian Xiao","Bing Li","Jicong Fan","Frédéric Dufaux","Ce Zhu","Yipeng Liu"],"pdf_url":"https://arxiv.org/pdf/2406.02976v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.02972v1","updated":"2024-06-05T06:06:03Z","published":"2024-06-05T06:06:03Z","title":"Event3DGS: Event-based 3D Gaussian Splatting for Fast Egomotion","summary":"  The recent emergence of 3D Gaussian splatting (3DGS) leverages the advantage\nof explicit point-based representations, which significantly improves the\nrendering speed and quality of novel-view synthesis. However, 3D radiance field\nrendering in environments with high-dynamic motion or challenging illumination\ncondition remains problematic in real-world robotic tasks. The reason is that\nfast egomotion is prevalent real-world robotic tasks, which induces motion\nblur, leading to inaccuracies and artifacts in the reconstructed structure. To\nalleviate this problem, we propose Event3DGS, the first method that learns\nGaussian Splatting solely from raw event streams. By exploiting the high\ntemporal resolution of event cameras and explicit point-based representation,\nEvent3DGS can reconstruct high-fidelity 3D structures solely from the event\nstreams under fast egomotion. Our sparsity-aware sampling and progressive\ntraining approaches allow for better reconstruction quality and consistency. To\nfurther enhance the fidelity of appearance, we explicitly incorporate the\nmotion blur formation process into a differentiable rasterizer, which is used\nwith a limited set of blurred RGB images to refine the appearance. Extensive\nexperiments on multiple datasets validate the superior rendering quality of\nEvent3DGS compared with existing approaches, with over 95% lower training time\nand faster rendering speed in orders of magnitude.\n","authors":["Tianyi Xiong","Jiayi Wu","Botao He","Cornelia Fermuller","Yiannis Aloimonos","Heng Huang","Christopher A. Metzler"],"pdf_url":"https://arxiv.org/pdf/2406.02972v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.02968v1","updated":"2024-06-05T05:52:20Z","published":"2024-06-05T05:52:20Z","title":"Adversarial Generation of Hierarchical Gaussians for 3D Generative Model","summary":"  Most advances in 3D Generative Adversarial Networks (3D GANs) largely depend\non ray casting-based volume rendering, which incurs demanding rendering costs.\nOne promising alternative is rasterization-based 3D Gaussian Splatting (3D-GS),\nproviding a much faster rendering speed and explicit 3D representation. In this\npaper, we exploit Gaussian as a 3D representation for 3D GANs by leveraging its\nefficient and explicit characteristics. However, in an adversarial framework,\nwe observe that a na\\\"ive generator architecture suffers from training\ninstability and lacks the capability to adjust the scale of Gaussians. This\nleads to model divergence and visual artifacts due to the absence of proper\nguidance for initialized positions of Gaussians and densification to manage\ntheir scales adaptively. To address these issues, we introduce a generator\narchitecture with a hierarchical multi-scale Gaussian representation that\neffectively regularizes the position and scale of generated Gaussians.\nSpecifically, we design a hierarchy of Gaussians where finer-level Gaussians\nare parameterized by their coarser-level counterparts; the position of\nfiner-level Gaussians would be located near their coarser-level counterparts,\nand the scale would monotonically decrease as the level becomes finer, modeling\nboth coarse and fine details of the 3D scene. Experimental results demonstrate\nthat ours achieves a significantly faster rendering speed (x100) compared to\nstate-of-the-art 3D consistent GANs with comparable 3D generation capability.\nProject page: https://hse1032.github.io/gsgan.\n","authors":["Sangeek Hyun","Jae-Pil Heo"],"pdf_url":"https://arxiv.org/pdf/2406.02968v1.pdf","comment":"Project page: https://hse1032.github.io/gsgan"},{"id":"http://arxiv.org/abs/2406.02965v1","updated":"2024-06-05T05:42:46Z","published":"2024-06-05T05:42:46Z","title":"Understanding the Impact of Negative Prompts: When and How Do They Take\n  Effect?","summary":"  The concept of negative prompts, emerging from conditional generation models\nlike Stable Diffusion, allows users to specify what to exclude from the\ngenerated images.%, demonstrating significant practical efficacy. Despite the\nwidespread use of negative prompts, their intrinsic mechanisms remain largely\nunexplored. This paper presents the first comprehensive study to uncover how\nand when negative prompts take effect. Our extensive empirical analysis\nidentifies two primary behaviors of negative prompts. Delayed Effect: The\nimpact of negative prompts is observed after positive prompts render\ncorresponding content. Deletion Through Neutralization: Negative prompts delete\nconcepts from the generated image through a mutual cancellation effect in\nlatent space with positive prompts. These insights reveal significant potential\nreal-world applications; for example, we demonstrate that negative prompts can\nfacilitate object inpainting with minimal alterations to the background via a\nsimple adaptive algorithm. We believe our findings will offer valuable insights\nfor the community in capitalizing on the potential of negative prompts.\n","authors":["Yuanhao Ban","Ruochen Wang","Tianyi Zhou","Minhao Cheng","Boqing Gong","Cho-Jui Hsieh"],"pdf_url":"https://arxiv.org/pdf/2406.02965v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.02951v1","updated":"2024-06-05T05:20:12Z","published":"2024-06-05T05:20:12Z","title":"AVFF: Audio-Visual Feature Fusion for Video Deepfake Detection","summary":"  With the rapid growth in deepfake video content, we require improved and\ngeneralizable methods to detect them. Most existing detection methods either\nuse uni-modal cues or rely on supervised training to capture the dissonance\nbetween the audio and visual modalities. While the former disregards the\naudio-visual correspondences entirely, the latter predominantly focuses on\ndiscerning audio-visual cues within the training corpus, thereby potentially\noverlooking correspondences that can help detect unseen deepfakes. We present\nAudio-Visual Feature Fusion (AVFF), a two-stage cross-modal learning method\nthat explicitly captures the correspondence between the audio and visual\nmodalities for improved deepfake detection. The first stage pursues\nrepresentation learning via self-supervision on real videos to capture the\nintrinsic audio-visual correspondences. To extract rich cross-modal\nrepresentations, we use contrastive learning and autoencoding objectives, and\nintroduce a novel audio-visual complementary masking and feature fusion\nstrategy. The learned representations are tuned in the second stage, where\ndeepfake classification is pursued via supervised learning on both real and\nfake videos. Extensive experiments and analysis suggest that our novel\nrepresentation learning paradigm is highly discriminative in nature. We report\n98.6% accuracy and 99.1% AUC on the FakeAVCeleb dataset, outperforming the\ncurrent audio-visual state-of-the-art by 14.9% and 9.9%, respectively.\n","authors":["Trevine Oorloff","Surya Koppisetti","Nicolò Bonettini","Divyaraj Solanki","Ben Colman","Yaser Yacoob","Ali Shahriyari","Gaurav Bharaj"],"pdf_url":"https://arxiv.org/pdf/2406.02951v1.pdf","comment":"Accepted to CVPR 2024"},{"id":"http://arxiv.org/abs/2406.02541v2","updated":"2024-06-05T05:00:39Z","published":"2024-06-04T17:57:37Z","title":"Enhancing Temporal Consistency in Video Editing by Reconstructing Videos\n  with 3D Gaussian Splatting","summary":"  Recent advancements in zero-shot video diffusion models have shown promise\nfor text-driven video editing, but challenges remain in achieving high temporal\nconsistency. To address this, we introduce Video-3DGS, a 3D Gaussian Splatting\n(3DGS)-based video refiner designed to enhance temporal consistency in\nzero-shot video editors. Our approach utilizes a two-stage 3D Gaussian\noptimizing process tailored for editing dynamic monocular videos. In the first\nstage, Video-3DGS employs an improved version of COLMAP, referred to as\nMC-COLMAP, which processes original videos using a Masked and Clipped approach.\nFor each video clip, MC-COLMAP generates the point clouds for dynamic\nforeground objects and complex backgrounds. These point clouds are utilized to\ninitialize two sets of 3D Gaussians (Frg-3DGS and Bkg-3DGS) aiming to represent\nforeground and background views. Both foreground and background views are then\nmerged with a 2D learnable parameter map to reconstruct full views. In the\nsecond stage, we leverage the reconstruction ability developed in the first\nstage to impose the temporal constraints on the video diffusion model. To\ndemonstrate the efficacy of Video-3DGS on both stages, we conduct extensive\nexperiments across two related tasks: Video Reconstruction and Video Editing.\nVideo-3DGS trained with 3k iterations significantly improves video\nreconstruction quality (+3 PSNR, +7 PSNR increase) and training efficiency\n(x1.9, x4.5 times faster) over NeRF-based and 3DGS-based state-of-art methods\non DAVIS dataset, respectively. Moreover, it enhances video editing by ensuring\ntemporal consistency across 58 dynamic monocular videos.\n","authors":["Inkyu Shin","Qihang Yu","Xiaohui Shen","In So Kweon","Kuk-Jin Yoon","Liang-Chieh Chen"],"pdf_url":"https://arxiv.org/pdf/2406.02541v2.pdf","comment":"Project page at this https://video-3dgs-project.github.io/"},{"id":"http://arxiv.org/abs/2406.02936v1","updated":"2024-06-05T04:49:55Z","published":"2024-06-05T04:49:55Z","title":"Radiomics-guided Multimodal Self-attention Network for Predicting\n  Pathological Complete Response in Breast MRI","summary":"  Breast cancer is the most prevalent cancer among women and predicting\npathologic complete response (pCR) after anti-cancer treatment is crucial for\npatient prognosis and treatment customization. Deep learning has shown promise\nin medical imaging diagnosis, particularly when utilizing multiple imaging\nmodalities to enhance accuracy. This study presents a model that predicts pCR\nin breast cancer patients using dynamic contrast-enhanced (DCE) magnetic\nresonance imaging (MRI) and apparent diffusion coefficient (ADC) maps.\nRadiomics features are established hand-crafted features of the tumor region\nand thus could be useful in medical image analysis. Our approach extracts\nfeatures from both DCE MRI and ADC using an encoder with a self-attention\nmechanism, leveraging radiomics to guide feature extraction from tumor-related\nregions. Our experimental results demonstrate the superior performance of our\nmodel in predicting pCR compared to other baseline methods.\n","authors":["Jonghun Kim","Hyunjin Park"],"pdf_url":"https://arxiv.org/pdf/2406.02936v1.pdf","comment":"5 pages, 5 figures, IEEE ISBI 2024 proceedings"},{"id":"http://arxiv.org/abs/2406.02930v1","updated":"2024-06-05T04:38:45Z","published":"2024-06-05T04:38:45Z","title":"P2PFormer: A Primitive-to-polygon Method for Regular Building Contour\n  Extraction from Remote Sensing Images","summary":"  Extracting building contours from remote sensing imagery is a significant\nchallenge due to buildings' complex and diverse shapes, occlusions, and noise.\nExisting methods often struggle with irregular contours, rounded corners, and\nredundancy points, necessitating extensive post-processing to produce regular\npolygonal building contours. To address these challenges, we introduce a novel,\nstreamlined pipeline that generates regular building contours without\npost-processing. Our approach begins with the segmentation of generic geometric\nprimitives (which can include vertices, lines, and corners), followed by the\nprediction of their sequence. This allows for the direct construction of\nregular building contours by sequentially connecting the segmented primitives.\nBuilding on this pipeline, we developed P2PFormer, which utilizes a\ntransformer-based architecture to segment geometric primitives and predict\ntheir order. To enhance the segmentation of primitives, we introduce a unique\nrepresentation called group queries. This representation comprises a set of\nqueries and a singular query position, which improve the focus on multiple\nmidpoints of primitives and their efficient linkage. Furthermore, we propose an\ninnovative implicit update strategy for the query position embedding aimed at\nsharpening the focus of queries on the correct positions and, consequently,\nenhancing the quality of primitive segmentation. Our experiments demonstrate\nthat P2PFormer achieves new state-of-the-art performance on the WHU, CrowdAI,\nand WHU-Mix datasets, surpassing the previous SOTA PolyWorld by a margin of 2.7\nAP and 6.5 AP75 on the largest CrowdAI dataset. We intend to make the code and\ntrained weights publicly available to promote their use and facilitate further\nresearch.\n","authors":["Tao Zhang","Shiqing Wei","Yikang Zhou","Muying Luo","Wenling You","Shunping Ji"],"pdf_url":"https://arxiv.org/pdf/2406.02930v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.02929v1","updated":"2024-06-05T04:37:06Z","published":"2024-06-05T04:37:06Z","title":"Exploring Data Efficiency in Zero-Shot Learning with Diffusion Models","summary":"  Zero-Shot Learning (ZSL) aims to enable classifiers to identify unseen\nclasses by enhancing data efficiency at the class level. This is achieved by\ngenerating image features from pre-defined semantics of unseen classes.\nHowever, most current approaches heavily depend on the number of samples from\nseen classes, i.e. they do not consider instance-level effectiveness. In this\npaper, we demonstrate that limited seen examples generally result in\ndeteriorated performance of generative models. To overcome these challenges, we\npropose ZeroDiff, a Diffusion-based Generative ZSL model. This unified\nframework incorporates diffusion models to improve data efficiency at both the\nclass and instance levels. Specifically, for instance-level effectiveness,\nZeroDiff utilizes a forward diffusion chain to transform limited data into an\nexpanded set of noised data. For class-level effectiveness, we design a\ntwo-branch generation structure that consists of a Diffusion-based Feature\nGenerator (DFG) and a Diffusion-based Representation Generator (DRG). DFG\nfocuses on learning and sampling the distribution of cross-entropy-based\nfeatures, whilst DRG learns the supervised contrastive-based representation to\nboost the zero-shot capabilities of DFG. Additionally, we employ three\ndiscriminators to evaluate generated features from various aspects and\nintroduce a Wasserstein-distance-based mutual learning loss to transfer\nknowledge among discriminators, thereby enhancing guidance for generation.\nDemonstrated through extensive experiments on three popular ZSL benchmarks, our\nZeroDiff not only achieves significant improvements over existing ZSL methods\nbut also maintains robust performance even with scarce training data. Code will\nbe released upon acceptance.\n","authors":["Zihan Ye","Shreyank N. Gowda","Xiaobo Jin","Xiaowei Huang","Haotian Xu","Yaochu Jin","Kaizhu Huang"],"pdf_url":"https://arxiv.org/pdf/2406.02929v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.00625v2","updated":"2024-06-05T04:33:55Z","published":"2024-06-02T06:08:26Z","title":"SAM-LAD: Segment Anything Model Meets Zero-Shot Logic Anomaly Detection","summary":"  Visual anomaly detection is vital in real-world applications, such as\nindustrial defect detection and medical diagnosis. However, most existing\nmethods focus on local structural anomalies and fail to detect higher-level\nfunctional anomalies under logical conditions. Although recent studies have\nexplored logical anomaly detection, they can only address simple anomalies like\nmissing or addition and show poor generalizability due to being heavily\ndata-driven. To fill this gap, we propose SAM-LAD, a zero-shot, plug-and-play\nframework for logical anomaly detection in any scene. First, we obtain a query\nimage's feature map using a pre-trained backbone. Simultaneously, we retrieve\nthe reference images and their corresponding feature maps via the nearest\nneighbor search of the query image. Then, we introduce the Segment Anything\nModel (SAM) to obtain object masks of the query and reference images. Each\nobject mask is multiplied with the entire image's feature map to obtain object\nfeature maps. Next, an Object Matching Model (OMM) is proposed to match objects\nin the query and reference images. To facilitate object matching, we further\npropose a Dynamic Channel Graph Attention (DCGA) module, treating each object\nas a keypoint and converting its feature maps into feature vectors. Finally,\nbased on the object matching relations, an Anomaly Measurement Model (AMM) is\nproposed to detect objects with logical anomalies. Structural anomalies in the\nobjects can also be detected. We validate our proposed SAM-LAD using various\nbenchmarks, including industrial datasets (MVTec Loco AD, MVTec AD), and the\nlogical dataset (DigitAnatomy). Extensive experimental results demonstrate that\nSAM-LAD outperforms existing SoTA methods, particularly in detecting logical\nanomalies.\n","authors":["Yun Peng","Xiao Lin","Nachuan Ma","Jiayuan Du","Chuangwei Liu","Chengju Liu","Qijun Chen"],"pdf_url":"https://arxiv.org/pdf/2406.00625v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.01961v2","updated":"2024-06-05T04:28:26Z","published":"2024-06-04T04:43:58Z","title":"Exploring Real World Map Change Generalization of Prior-Informed HD Map\n  Prediction Models","summary":"  Building and maintaining High-Definition (HD) maps represents a large barrier\nto autonomous vehicle deployment. This, along with advances in modern online\nmap detection models, has sparked renewed interest in the online mapping\nproblem. However, effectively predicting online maps at a high enough quality\nto enable safe, driverless deployments remains a significant challenge. Recent\nwork on these models proposes training robust online mapping systems using low\nquality map priors with synthetic perturbations in an attempt to simulate\nout-of-date HD map priors. In this paper, we investigate how models trained on\nthese synthetically perturbed map priors generalize to performance on\ndeployment-scale, real world map changes. We present a large-scale experimental\nstudy to determine which synthetic perturbations are most useful in\ngeneralizing to real world HD map changes, evaluated using multiple years of\nreal-world autonomous driving data. We show there is still a substantial\nsim2real gap between synthetic prior perturbations and observed real-world\nchanges, which limits the utility of current prior-informed HD map prediction\nmodels.\n","authors":["Samuel M. Bateman","Ning Xu","H. Charles Zhao","Yael Ben Shalom","Vince Gong","Greg Long","Will Maddern"],"pdf_url":"https://arxiv.org/pdf/2406.01961v2.pdf","comment":"Accepted to CVPR 2024, Workshop on Autonomous Driving"},{"id":"http://arxiv.org/abs/2406.02918v1","updated":"2024-06-05T04:13:03Z","published":"2024-06-05T04:13:03Z","title":"U-KAN Makes Strong Backbone for Medical Image Segmentation and\n  Generation","summary":"  U-Net has become a cornerstone in various visual applications such as image\nsegmentation and diffusion probability models. While numerous innovative\ndesigns and improvements have been introduced by incorporating transformers or\nMLPs, the networks are still limited to linearly modeling patterns as well as\nthe deficient interpretability. To address these challenges, our intuition is\ninspired by the impressive results of the Kolmogorov-Arnold Networks (KANs) in\nterms of accuracy and interpretability, which reshape the neural network\nlearning via the stack of non-linear learnable activation functions derived\nfrom the Kolmogorov-Anold representation theorem. Specifically, in this paper,\nwe explore the untapped potential of KANs in improving backbones for vision\ntasks. We investigate, modify and re-design the established U-Net pipeline by\nintegrating the dedicated KAN layers on the tokenized intermediate\nrepresentation, termed U-KAN. Rigorous medical image segmentation benchmarks\nverify the superiority of U-KAN by higher accuracy even with less computation\ncost. We further delved into the potential of U-KAN as an alternative U-Net\nnoise predictor in diffusion models, demonstrating its applicability in\ngenerating task-oriented model architectures. These endeavours unveil valuable\ninsights and sheds light on the prospect that with U-KAN, you can make strong\nbackbone for medical image segmentation and generation. Project page:\nhttps://yes-ukan.github.io/\n","authors":["Chenxin Li","Xinyu Liu","Wuyang Li","Cheng Wang","Hengyu Liu","Yixuan Yuan"],"pdf_url":"https://arxiv.org/pdf/2406.02918v1.pdf","comment":"arXiv admin note: text overlap with arXiv:2405.14399,\n  arXiv:2203.04967 by other authors"},{"id":"http://arxiv.org/abs/2406.02915v1","updated":"2024-06-05T04:08:41Z","published":"2024-06-05T04:08:41Z","title":"Visual-Text Cross Alignment: Refining the Similarity Score in\n  Vision-Language Models","summary":"  It has recently been discovered that using a pre-trained vision-language\nmodel (VLM), e.g., CLIP, to align a whole query image with several finer text\ndescriptions generated by a large language model can significantly enhance\nzero-shot performance. However, in this paper, we empirically find that the\nfiner descriptions tend to align more effectively with local areas of the query\nimage rather than the whole image, and then we theoretically validate this\nfinding. Thus, we present a method called weighted visual-text cross alignment\n(WCA). This method begins with a localized visual prompting technique, designed\nto identify local visual areas within the query image. The local visual areas\nare then cross-aligned with the finer descriptions by creating a similarity\nmatrix using the pre-trained VLM. To determine how well a query image aligns\nwith each category, we develop a score function based on the weighted\nsimilarities in this matrix. Extensive experiments demonstrate that our method\nsignificantly improves zero-shot performance across various datasets, achieving\nresults that are even comparable to few-shot learning methods.\n","authors":["Jinhao Li","Haopeng Li","Sarah Erfani","Lei Feng","James Bailey","Feng Liu"],"pdf_url":"https://arxiv.org/pdf/2406.02915v1.pdf","comment":"22 pages, 16 figures, published to ICML 2024"},{"id":"http://arxiv.org/abs/2406.02914v1","updated":"2024-06-05T04:07:37Z","published":"2024-06-05T04:07:37Z","title":"A Self-Supervised Denoising Strategy for Underwater Acoustic Camera\n  Imageries","summary":"  In low-visibility marine environments characterized by turbidity and\ndarkness, acoustic cameras serve as visual sensors capable of generating\nhigh-resolution 2D sonar images. However, acoustic camera images are interfered\nwith by complex noise and are difficult to be directly ingested by downstream\nvisual algorithms. This paper introduces a novel strategy for denoising\nacoustic camera images using deep learning techniques, which comprises two\nprincipal components: a self-supervised denoising framework and a fine\nfeature-guided block. Additionally, the study explores the relationship between\nthe level of image denoising and the improvement in feature-matching\nperformance. Experimental results show that the proposed denoising strategy can\neffectively filter acoustic camera images without prior knowledge of the noise\nmodel. The denoising process is nearly end-to-end without complex parameter\ntuning and post-processing. It successfully removes noise while preserving fine\nfeature details, thereby enhancing the performance of local feature matching.\n","authors":["Xiaoteng Zhou","Katsunori Mizuno","Yilong Zhang"],"pdf_url":"https://arxiv.org/pdf/2406.02914v1.pdf","comment":"8 pages"},{"id":"http://arxiv.org/abs/2307.15977v2","updated":"2024-06-05T04:00:17Z","published":"2023-07-29T13:00:42Z","title":"Model Synthesis for Zero-Shot Model Attribution","summary":"  Nowadays, generative models are shaping various fields such as art, design,\nand human-computer interaction, yet accompanied by challenges related to\ncopyright infringement and content management. In response, existing research\nseeks to identify the unique fingerprints on the images they generate, which\ncan be leveraged to attribute the generated images to their source models.\nExisting methods, however, are constrained to identifying models within a\nstatic set included in the classifier training, failing to adapt to newly\nemerged unseen models dynamically. To bridge this gap, we aim to develop a\ngeneralized model fingerprint extractor capable of zero-shot attribution,\neffectively attributes unseen models without exposure during training. Central\nto our method is a model synthesis technique, which generates numerous\nsynthetic models mimicking the fingerprint patterns of real-world generative\nmodels. The design of the synthesis technique is motivated by observations on\nhow the basic generative model's architecture building blocks and parameters\ninfluence fingerprint patterns, and it is validated through two designed\nmetrics that examine synthetic models' fidelity and diversity. Our experiments\ndemonstrate that this fingerprint extractor, trained solely on synthetic\nmodels, achieves impressive zero-shot generalization on a wide range of\nreal-world generative models, improving model identification and verification\naccuracy on unseen models by over 40% and 15%, respectively, compared to\nexisting approaches.\n","authors":["Tianyun Yang","Juan Cao","Danding Wang","Chang Xu"],"pdf_url":"https://arxiv.org/pdf/2307.15977v2.pdf","comment":"under review"},{"id":"http://arxiv.org/abs/2402.04356v2","updated":"2024-06-05T03:57:03Z","published":"2024-02-06T19:42:18Z","title":"Bidirectional Autoregressive Diffusion Model for Dance Generation","summary":"  Dance serves as a powerful medium for expressing human emotions, but the\nlifelike generation of dance is still a considerable challenge. Recently,\ndiffusion models have showcased remarkable generative abilities across various\ndomains. They hold promise for human motion generation due to their adaptable\nmany-to-many nature. Nonetheless, current diffusion-based motion generation\nmodels often create entire motion sequences directly and unidirectionally,\nlacking focus on the motion with local and bidirectional enhancement. When\nchoreographing high-quality dance movements, people need to take into account\nnot only the musical context but also the nearby music-aligned dance motions.\nTo authentically capture human behavior, we propose a Bidirectional\nAutoregressive Diffusion Model (BADM) for music-to-dance generation, where a\nbidirectional encoder is built to enforce that the generated dance is\nharmonious in both the forward and backward directions. To make the generated\ndance motion smoother, a local information decoder is built for local motion\nenhancement. The proposed framework is able to generate new motions based on\nthe input conditions and nearby motions, which foresees individual motion\nslices iteratively and consolidates all predictions. To further refine the\nsynchronicity between the generated dance and the beat, the beat information is\nincorporated as an input to generate better music-aligned dance movements.\nExperimental results demonstrate that the proposed model achieves\nstate-of-the-art performance compared to existing unidirectional approaches on\nthe prominent benchmark for music-to-dance generation.\n","authors":["Canyu Zhang","Youbao Tang","Ning Zhang","Ruei-Sung Lin","Mei Han","Jing Xiao","Song Wang"],"pdf_url":"https://arxiv.org/pdf/2402.04356v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.00313v2","updated":"2024-06-05T03:22:49Z","published":"2024-06-01T06:12:48Z","title":"From Seedling to Harvest: The GrowingSoy Dataset for Weed Detection in\n  Soy Crops via Instance Segmentation","summary":"  Deep learning, particularly Convolutional Neural Networks (CNNs), has gained\nsignificant attention for its effectiveness in computer vision, especially in\nagricultural tasks. Recent advancements in instance segmentation have improved\nimage classification accuracy. In this work, we introduce a comprehensive\ndataset for training neural networks to detect weeds and soy plants through\ninstance segmentation. Our dataset covers various stages of soy growth,\noffering a chronological perspective on weed invasion's impact, with 1,000\nmeticulously annotated images. We also provide 6 state of the art models,\ntrained in this dataset, that can understand and detect soy and weed in every\nstage of the plantation process. By using this dataset for weed and soy\nsegmentation, we achieved a segmentation average precision of 79.1% and an\naverage recall of 69.2% across all plant classes, with the YOLOv8X model.\nMoreover, the YOLOv8M model attained 78.7% mean average precision (mAp-50) in\ncaruru weed segmentation, 69.7% in grassy weed segmentation, and 90.1% in soy\nplant segmentation.\n","authors":["Raul Steinmetz","Victor A. Kich","Henrique Krever","Joao D. Rigo Mazzarolo","Ricardo B. Grando","Vinicius Marini","Celio Trois","Ard Nieuwenhuizen"],"pdf_url":"https://arxiv.org/pdf/2406.00313v2.pdf","comment":"11th IEEE International Conference on Cybernetics and Intelligent\n  Systems (CIS)"},{"id":"http://arxiv.org/abs/2312.02197v4","updated":"2024-06-05T03:13:24Z","published":"2023-12-02T13:35:48Z","title":"Test-Time Degradation Adaptation for Open-Set Image Restoration","summary":"  In contrast to close-set scenarios that restore images from a predefined set\nof degradations, open-set image restoration aims to handle the unknown\ndegradations that were unforeseen during the pretraining phase, which is\nless-touched as far as we know. This work study this challenging problem and\nreveal its essence as unidentified distribution shifts between the test and\ntraining data. Recently, test-time adaptation has emerged as a fundamental\nmethod to address this inherent disparities. Inspired by it, we propose a\ntest-time degradation adaptation framework for open-set image restoration,\nwhich consists of three components, \\textit{i.e.}, i) a pre-trained and\ndegradation-agnostic diffusion model for generating clean images, ii) a\ntest-time degradation adapter adapts the unknown degradations based on the\ninput image during the testing phase, and iii) the adapter-guided image\nrestoration guides the model through the adapter to produce the corresponding\nclean image. Through experiments on multiple degradations, we show that our\nmethod achieves comparable even better performance than those task-specific\nmethods. The code is available at\nhttps://github.com/XLearning-SCU/2024-ICML-TAO.\n","authors":["Yuanbiao Gou","Haiyu Zhao","Boyun Li","Xinyan Xiao","Xi Peng"],"pdf_url":"https://arxiv.org/pdf/2312.02197v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.02889v1","updated":"2024-06-05T03:11:33Z","published":"2024-06-05T03:11:33Z","title":"Language-guided Detection and Mitigation of Unknown Dataset Bias","summary":"  Dataset bias is a significant problem in training fair classifiers. When\nattributes unrelated to classification exhibit strong biases towards certain\nclasses, classifiers trained on such dataset may overfit to these bias\nattributes, substantially reducing the accuracy for minority groups. Mitigation\ntechniques can be categorized according to the availability of bias information\n(\\ie, prior knowledge). Although scenarios with unknown biases are better\nsuited for real-world settings, previous work in this field often suffers from\na lack of interpretability regarding biases and lower performance. In this\nstudy, we propose a framework to identify potential biases as keywords without\nprior knowledge based on the partial occurrence in the captions. We further\npropose two debiasing methods: (a) handing over to an existing debiasing\napproach which requires prior knowledge by assigning pseudo-labels, and (b)\nemploying data augmentation via text-to-image generative models, using acquired\nbias keywords as prompts. Despite its simplicity, experimental results show\nthat our framework not only outperforms existing methods without prior\nknowledge, but also is even comparable with a method that assumes prior\nknowledge.\n","authors":["Zaiying Zhao","Soichiro Kumano","Toshihiko Yamasaki"],"pdf_url":"https://arxiv.org/pdf/2406.02889v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.02884v1","updated":"2024-06-05T03:05:52Z","published":"2024-06-05T03:05:52Z","title":"PosterLLaVa: Constructing a Unified Multi-modal Layout Generator with\n  LLM","summary":"  Layout generation is the keystone in achieving automated graphic design,\nrequiring arranging the position and size of various multi-modal design\nelements in a visually pleasing and constraint-following manner. Previous\napproaches are either inefficient for large-scale applications or lack\nflexibility for varying design requirements. Our research introduces a unified\nframework for automated graphic layout generation, leveraging the multi-modal\nlarge language model (MLLM) to accommodate diverse design tasks. In contrast,\nour data-driven method employs structured text (JSON format) and visual\ninstruction tuning to generate layouts under specific visual and textual\nconstraints, including user-defined natural language specifications. We\nconducted extensive experiments and achieved state-of-the-art (SOTA)\nperformance on public multi-modal layout generation benchmarks, demonstrating\nthe effectiveness of our method. Moreover, recognizing existing datasets'\nlimitations in capturing the complexity of real-world graphic designs, we\npropose two new datasets for much more challenging tasks (user-constrained\ngeneration and complicated poster), further validating our model's utility in\nreal-life settings. Marking by its superior accessibility and adaptability,\nthis approach further automates large-scale graphic design tasks. The code and\ndatasets will be publicly available on\nhttps://github.com/posterllava/PosterLLaVA.\n","authors":["Tao Yang","Yingmin Luo","Zhongang Qi","Yang Wu","Ying Shan","Chang Wen Chen"],"pdf_url":"https://arxiv.org/pdf/2406.02884v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.02881v1","updated":"2024-06-05T02:59:08Z","published":"2024-06-05T02:59:08Z","title":"Inv-Adapter: ID Customization Generation via Image Inversion and\n  Lightweight Adapter","summary":"  The remarkable advancement in text-to-image generation models significantly\nboosts the research in ID customization generation. However, existing\npersonalization methods cannot simultaneously satisfy high fidelity and\nhigh-efficiency requirements. Their main bottleneck lies in the prompt image\nencoder, which produces weak alignment signals with the text-to-image model and\nsignificantly increased model size. Towards this end, we propose a lightweight\nInv-Adapter, which first extracts diffusion-domain representations of ID images\nutilizing a pre-trained text-to-image model via DDIM image inversion, without\nadditional image encoder. Benefiting from the high alignment of the extracted\nID prompt features and the intermediate features of the text-to-image model, we\nthen embed them efficiently into the base text-to-image model by carefully\ndesigning a lightweight attention adapter. We conduct extensive experiments to\nassess ID fidelity, generation loyalty, speed, and training parameters, all of\nwhich show that the proposed Inv-Adapter is highly competitive in ID\ncustomization generation and model scale.\n","authors":["Peng Xing","Ning Wang","Jianbo Ouyang","Zechao Li"],"pdf_url":"https://arxiv.org/pdf/2406.02881v1.pdf","comment":"technical report"},{"id":"http://arxiv.org/abs/2406.02880v1","updated":"2024-06-05T02:54:46Z","published":"2024-06-05T02:54:46Z","title":"Controllable Talking Face Generation by Implicit Facial Keypoints\n  Editing","summary":"  Audio-driven talking face generation has garnered significant interest within\nthe domain of digital human research. Existing methods are encumbered by\nintricate model architectures that are intricately dependent on each other,\ncomplicating the process of re-editing image or video inputs. In this work, we\npresent ControlTalk, a talking face generation method to control face\nexpression deformation based on driven audio, which can construct the head pose\nand facial expression including lip motion for both single image or sequential\nvideo inputs in a unified manner. By utilizing a pre-trained video synthesis\nrenderer and proposing the lightweight adaptation, ControlTalk achieves precise\nand naturalistic lip synchronization while enabling quantitative control over\nmouth opening shape. Our experiments show that our method is superior to\nstate-of-the-art performance on widely used benchmarks, including HDTF and\nMEAD. The parameterized adaptation demonstrates remarkable generalization\ncapabilities, effectively handling expression deformation across same-ID and\ncross-ID scenarios, and extending its utility to out-of-domain portraits,\nregardless of languages.\n","authors":["Dong Zhao","Jiaying Shi","Wenjun Li","Shudong Wang","Shenghui Xu","Zhaoming Pan"],"pdf_url":"https://arxiv.org/pdf/2406.02880v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.02879v1","updated":"2024-06-05T02:53:12Z","published":"2024-06-05T02:53:12Z","title":"Second-order differential operators, stochastic differential equations\n  and Brownian motions on embedded manifolds","summary":"  We specify the conditions when a manifold M embedded in an inner product\nspace E is an invariant manifold of a stochastic differential equation (SDE) on\nE, linking it with the notion of second-order differential operators on M. When\nM is given a Riemannian metric, we derive a simple formula for the\nLaplace-Beltrami operator in terms of the gradient and Hessian on E and\nconstruct the Riemannian Brownian motions on M as solutions of conservative\nStratonovich and Ito SDEs on E. We derive explicitly the SDE for Brownian\nmotions on several important manifolds in applications, including\nleft-invariant matrix Lie groups using embedded coordinates. Numerically, we\npropose three simulation schemes to solve SDEs on manifolds. In addition to the\nstochastic projection method, to simulate Riemannian Brownian motions, we\nconstruct a second-order tangent retraction of the Levi-Civita connection using\na given E-tubular retraction. We also propose the retractive Euler-Maruyama\nmethod to solve a SDE, taking into account the second-order term of a tangent\nretraction. We provide software to implement the methods in the paper,\nincluding Brownian motions of the manifolds discussed. We verify numerically\nthat on several compact Riemannian manifolds, the long-term limit of Brownian\nsimulation converges to the uniform distributions, suggesting a method to\nsample Riemannian uniform distributions\n","authors":["Du Nguyen","Stefan Sommer"],"pdf_url":"https://arxiv.org/pdf/2406.02879v1.pdf","comment":null},{"id":"http://arxiv.org/abs/1810.12813v4","updated":"2024-06-05T02:31:36Z","published":"2018-10-30T15:33:47Z","title":"Contextual Hourglass Network for Semantic Segmentation of High\n  Resolution Aerial Imagery","summary":"  Semantic segmentation for aerial imagery is a challenging and important\nproblem in remotely sensed imagery analysis. In recent years, with the success\nof deep learning, various convolutional neural network (CNN) based models have\nbeen developed. However, due to the varying sizes of the objects and imbalanced\nclass labels, it can be challenging to obtain accurate pixel-wise semantic\nsegmentation results. To address those challenges, we develop a novel semantic\nsegmentation method and call it Contextual Hourglass Network. In our method, in\norder to improve the robustness of the prediction, we design a new contextual\nhourglass module which incorporates attention mechanism on processed\nlow-resolution featuremaps to exploit the contextual semantics. We further\nexploit the stacked encoder-decoder structure by connecting multiple contextual\nhourglass modules from end to end. This architecture can effectively extract\nrich multi-scale features and add more feedback loops for better learning\ncontextual semantics through intermediate supervision. To demonstrate the\nefficacy of our semantic segmentation method, we test it on Potsdam and\nVaihingen datasets. Through the comparisons to other baseline methods, our\nmethod yields the best results on overall performance.\n","authors":["Panfeng Li","Youzuo Lin","Emily Schultz-Fellenz"],"pdf_url":"https://arxiv.org/pdf/1810.12813v4.pdf","comment":"Accepted by 2024 5th International Conference on Electronic\n  Communication and Artificial Intelligence"},{"id":"http://arxiv.org/abs/2406.02862v1","updated":"2024-06-05T02:22:54Z","published":"2024-06-05T02:22:54Z","title":"Rethinking Guidance Information to Utilize Unlabeled Samples:A Label\n  Encoding Perspective","summary":"  Empirical Risk Minimization (ERM) is fragile in scenarios with insufficient\nlabeled samples. A vanilla extension of ERM to unlabeled samples is Entropy\nMinimization (EntMin), which employs the soft-labels of unlabeled samples to\nguide their learning. However, EntMin emphasizes prediction discriminability\nwhile neglecting prediction diversity. To alleviate this issue, in this paper,\nwe rethink the guidance information to utilize unlabeled samples. By analyzing\nthe learning objective of ERM, we find that the guidance information for\nlabeled samples in a specific category is the corresponding label encoding.\nInspired by this finding, we propose a Label-Encoding Risk Minimization (LERM).\nIt first estimates the label encodings through prediction means of unlabeled\nsamples and then aligns them with their corresponding ground-truth label\nencodings. As a result, the LERM ensures both prediction discriminability and\ndiversity, and it can be integrated into existing methods as a plugin.\nTheoretically, we analyze the relationships between LERM and ERM as well as\nEntMin. Empirically, we verify the superiority of the LERM under several label\ninsufficient scenarios. The codes are available at\nhttps://github.com/zhangyl660/LERM.\n","authors":["Yulong Zhang","Yuan Yao","Shuhao Chen","Pengrong Jin","Yu Zhang","Jian Jin","Jiangang Lu"],"pdf_url":"https://arxiv.org/pdf/2406.02862v1.pdf","comment":"Accepted to ICML 2024"},{"id":"http://arxiv.org/abs/2402.13607v3","updated":"2024-06-05T02:14:06Z","published":"2024-02-21T08:21:12Z","title":"CODIS: Benchmarking Context-Dependent Visual Comprehension for\n  Multimodal Large Language Models","summary":"  Multimodal large language models (MLLMs) have demonstrated promising results\nin a variety of tasks that combine vision and language. As these models become\nmore integral to research and applications, conducting comprehensive\nevaluations of their capabilities has grown increasingly important. However,\nmost existing benchmarks fail to consider that, in certain situations, images\nneed to be interpreted within a broader context. In this work, we introduce a\nnew benchmark, named as CODIS, designed to assess the ability of models to use\ncontext provided in free-form text to enhance visual comprehension. Our\nfindings indicate that MLLMs consistently fall short of human performance on\nthis benchmark. Further analysis confirms that these models struggle to\neffectively extract and utilize contextual information to improve their\nunderstanding of images. This underscores the pressing need to enhance the\nability of MLLMs to comprehend visuals in a context-dependent manner. View our\nproject website at https://thunlp-mt.github.io/CODIS.\n","authors":["Fuwen Luo","Chi Chen","Zihao Wan","Zhaolu Kang","Qidong Yan","Yingjie Li","Xiaolong Wang","Siyu Wang","Ziyue Wang","Xiaoyue Mi","Peng Li","Ning Ma","Maosong Sun","Yang Liu"],"pdf_url":"https://arxiv.org/pdf/2402.13607v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.02842v1","updated":"2024-06-05T01:32:31Z","published":"2024-06-05T01:32:31Z","title":"Zero-Shot Image Segmentation via Recursive Normalized Cut on Diffusion\n  Features","summary":"  Foundation models have emerged as powerful tools across various domains\nincluding language, vision, and multimodal tasks. While prior works have\naddressed unsupervised image segmentation, they significantly lag behind\nsupervised models. In this paper, we use a diffusion UNet encoder as a\nfoundation vision encoder and introduce DiffCut, an unsupervised zero-shot\nsegmentation method that solely harnesses the output features from the final\nself-attention block. Through extensive experimentation, we demonstrate that\nthe utilization of these diffusion features in a graph based segmentation\nalgorithm, significantly outperforms previous state-of-the-art methods on\nzero-shot segmentation. Specifically, we leverage a recursive Normalized Cut\nalgorithm that softly regulates the granularity of detected objects and\nproduces well-defined segmentation maps that precisely capture intricate image\ndetails. Our work highlights the remarkably accurate semantic knowledge\nembedded within diffusion UNet encoders that could then serve as foundation\nvision encoders for downstream tasks. Project page at\nhttps://diffcut-segmentation.github.io\n","authors":["Paul Couairon","Mustafa Shukor","Jean-Emmanuel Haugeard","Matthieu Cord","Nicolas Thome"],"pdf_url":"https://arxiv.org/pdf/2406.02842v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.02841v1","updated":"2024-06-05T01:31:50Z","published":"2024-06-05T01:31:50Z","title":"Conditional Idempotent Generative Networks","summary":"  We propose Conditional Idempotent Generative Networks (CIGN), a novel\napproach that expands upon Idempotent Generative Networks (IGN) to enable\nconditional generation. While IGNs offer efficient single-pass generation, they\nlack the ability to control the content of the generated data. CIGNs address\nthis limitation by incorporating conditioning mechanisms, allowing users to\nsteer the generation process towards specific types of data.\n  We establish the theoretical foundations for CIGNs, outlining their scope,\nloss function design, and evaluation metrics. We then present two potential\narchitectures for implementing CIGNs: channel conditioning and filter\nconditioning. Finally, we discuss experimental results on the MNIST dataset,\ndemonstrating the effectiveness of both approaches. Our findings pave the way\nfor further exploration of CIGNs on larger datasets and with more powerful\ncomputing resources to determine the optimal implementation strategy.\n","authors":["Niccolò Ronchetti"],"pdf_url":"https://arxiv.org/pdf/2406.02841v1.pdf","comment":"22 pages, 8 figures"},{"id":"http://arxiv.org/abs/2406.02836v1","updated":"2024-06-05T01:19:44Z","published":"2024-06-05T01:19:44Z","title":"DREW : Towards Robust Data Provenance by Leveraging Error-Controlled\n  Watermarking","summary":"  Identifying the origin of data is crucial for data provenance, with\napplications including data ownership protection, media forensics, and\ndetecting AI-generated content. A standard approach involves embedding-based\nretrieval techniques that match query data with entries in a reference dataset.\nHowever, this method is not robust against benign and malicious edits. To\naddress this, we propose Data Retrieval with Error-corrected codes and\nWatermarking (DREW). DREW randomly clusters the reference dataset, injects\nunique error-controlled watermark keys into each cluster, and uses these keys\nat query time to identify the appropriate cluster for a given sample. After\nlocating the relevant cluster, embedding vector similarity retrieval is\nperformed within the cluster to find the most accurate matches. The integration\nof error control codes (ECC) ensures reliable cluster assignments, enabling the\nmethod to perform retrieval on the entire dataset in case the ECC algorithm\ncannot detect the correct cluster with high confidence. This makes DREW\nmaintain baseline performance, while also providing opportunities for\nperformance improvements due to the increased likelihood of correctly matching\nqueries to their origin when performing retrieval on a smaller subset of the\ndataset. Depending on the watermark technique used, DREW can provide\nsubstantial improvements in retrieval accuracy (up to 40\\% for some datasets\nand modification types) across multiple datasets and state-of-the-art embedding\nmodels (e.g., DinoV2, CLIP), making our method a promising solution for secure\nand reliable source identification. The code is available at\nhttps://github.com/mehrdadsaberi/DREW\n","authors":["Mehrdad Saberi","Vinu Sankar Sadasivan","Arman Zarei","Hessam Mahdavifar","Soheil Feizi"],"pdf_url":"https://arxiv.org/pdf/2406.02836v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.02833v1","updated":"2024-06-05T01:05:26Z","published":"2024-06-05T01:05:26Z","title":"DenoDet: Attention as Deformable Multi-Subspace Feature Denoising for\n  Target Detection in SAR Images","summary":"  Synthetic Aperture Radar (SAR) target detection has long been impeded by\ninherent speckle noise and the prevalence of diminutive, ambiguous targets.\nWhile deep neural networks have advanced SAR target detection, their intrinsic\nlow-frequency bias and static post-training weights falter with coherent noise\nand preserving subtle details across heterogeneous terrains. Motivated by\ntraditional SAR image denoising, we propose DenoDet, a network aided by\nexplicit frequency domain transform to calibrate convolutional biases and pay\nmore attention to high-frequencies, forming a natural multi-scale subspace\nrepresentation to detect targets from the perspective of multi-subspace\ndenoising. We design TransDeno, a dynamic frequency domain attention module\nthat performs as a transform domain soft thresholding operation, dynamically\ndenoising across subspaces by preserving salient target signals and attenuating\nnoise. To adaptively adjust the granularity of subspace processing, we also\npropose a deformable group fully-connected layer (DeGroFC) that dynamically\nvaries the group conditioned on the input features. Without bells and whistles,\nour plug-and-play TransDeno sets state-of-the-art scores on multiple SAR target\ndetection datasets. The code is available at https://github.com/GrokCV/GrokSAR.\n","authors":["Yimian Dai","Minrui Zou","Yuxuan Li","Xiang Li","Kang Ni","Jian Yang"],"pdf_url":"https://arxiv.org/pdf/2406.02833v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.02831v1","updated":"2024-06-05T00:44:42Z","published":"2024-06-05T00:44:42Z","title":"Distilling Aggregated Knowledge for Weakly-Supervised Video Anomaly\n  Detection","summary":"  Video anomaly detection aims to develop automated models capable of\nidentifying abnormal events in surveillance videos. The benchmark setup for\nthis task is extremely challenging due to: i) the limited size of the training\nsets, ii) weak supervision provided in terms of video-level labels, and iii)\nintrinsic class imbalance induced by the scarcity of abnormal events. In this\nwork, we show that distilling knowledge from aggregated representations of\nmultiple backbones into a relatively simple model achieves state-of-the-art\nperformance. In particular, we develop a bi-level distillation approach along\nwith a novel disentangled cross-attention-based feature aggregation network.\nOur proposed approach, DAKD (Distilling Aggregated Knowledge with Disentangled\nAttention), demonstrates superior performance compared to existing methods\nacross multiple benchmark datasets. Notably, we achieve significant\nimprovements of 1.36%, 0.78%, and 7.02% on the UCF-Crime, ShanghaiTech, and\nXD-Violence datasets, respectively.\n","authors":["Jash Dalvi","Ali Dabouei","Gunjan Dhanuka","Min Xu"],"pdf_url":"https://arxiv.org/pdf/2406.02831v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.00645v2","updated":"2024-06-05T00:05:23Z","published":"2024-06-02T07:20:08Z","title":"FuRL: Visual-Language Models as Fuzzy Rewards for Reinforcement Learning","summary":"  In this work, we investigate how to leverage pre-trained visual-language\nmodels (VLM) for online Reinforcement Learning (RL). In particular, we focus on\nsparse reward tasks with pre-defined textual task descriptions. We first\nidentify the problem of reward misalignment when applying VLM as a reward in RL\ntasks. To address this issue, we introduce a lightweight fine-tuning method,\nnamed Fuzzy VLM reward-aided RL (FuRL), based on reward alignment and relay RL.\nSpecifically, we enhance the performance of SAC/DrQ baseline agents on sparse\nreward tasks by fine-tuning VLM representations and using relay RL to avoid\nlocal minima. Extensive experiments on the Meta-world benchmark tasks\ndemonstrate the efficacy of the proposed method. Code is available at:\nhttps://github.com/fuyw/FuRL.\n","authors":["Yuwei Fu","Haichao Zhang","Di Wu","Wei Xu","Benoit Boulet"],"pdf_url":"https://arxiv.org/pdf/2406.00645v2.pdf","comment":"ICML 2024"}],"Information Retrieval":[{"id":"http://arxiv.org/abs/2402.14836v2","updated":"2024-06-05T17:02:47Z","published":"2024-02-18T16:51:02Z","title":"Stealthy Attack on Large Language Model based Recommendation","summary":"  Recently, the powerful large language models (LLMs) have been instrumental in\npropelling the progress of recommender systems (RS). However, while these\nsystems have flourished, their susceptibility to security threats has been\nlargely overlooked. In this work, we reveal that the introduction of LLMs into\nrecommendation models presents new security vulnerabilities due to their\nemphasis on the textual content of items. We demonstrate that attackers can\nsignificantly boost an item's exposure by merely altering its textual content\nduring the testing phase, without requiring direct interference with the\nmodel's training process. Additionally, the attack is notably stealthy, as it\ndoes not affect the overall recommendation performance and the modifications to\nthe text are subtle, making it difficult for users and platforms to detect. Our\ncomprehensive experiments across four mainstream LLM-based recommendation\nmodels demonstrate the superior efficacy and stealthiness of our approach. Our\nwork unveils a significant security gap in LLM-based recommendation systems and\npaves the way for future research on protecting these systems.\n","authors":["Jinghao Zhang","Yuting Liu","Qiang Liu","Shu Wu","Guibing Guo","Liang Wang"],"pdf_url":"https://arxiv.org/pdf/2402.14836v2.pdf","comment":"ACL 2024 Main"},{"id":"http://arxiv.org/abs/2210.04288v2","updated":"2024-06-05T15:33:23Z","published":"2022-10-09T15:42:36Z","title":"CoopHash: Cooperative Learning of Multipurpose Descriptor and\n  Contrastive Pair Generator via Variational MCMC Teaching for Supervised Image\n  Hashing","summary":"  Leveraging supervised information can lead to superior retrieval performance\nin the image hashing domain but the performance degrades significantly without\nenough labeled data. One effective solution to boost performance is to employ\ngenerative models, such as Generative Adversarial Networks (GANs), to generate\nsynthetic data in an image hashing model. However, GAN-based methods are\ndifficult to train, which prevents the hashing approaches from jointly training\nthe generative models and the hash functions. This limitation results in\nsub-optimal retrieval performance. To overcome this limitation, we propose a\nnovel framework, the generative cooperative hashing network, which is based on\nenergy-based cooperative learning. This framework jointly learns a powerful\ngenerative representation of the data and a robust hash function via two\ncomponents: a top-down contrastive pair generator that synthesizes contrastive\nimages and a bottom-up multipurpose descriptor that simultaneously represents\nthe images from multiple perspectives, including probability density, hash\ncode, latent code, and category. The two components are jointly learned via a\nnovel likelihood-based cooperative learning scheme. We conduct experiments on\nseveral real-world datasets and show that the proposed method outperforms the\ncompeting hashing supervised methods, achieving up to 10\\% relative improvement\nover the current state-of-the-art supervised hashing methods, and exhibits a\nsignificantly better performance in out-of-distribution retrieval.\n","authors":["Khoa D. Doan","Jianwen Xie","Yaxuan Zhu","Yang Zhao","Ping Li"],"pdf_url":"https://arxiv.org/pdf/2210.04288v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.11755v3","updated":"2024-06-05T14:41:02Z","published":"2023-05-19T15:42:00Z","title":"Visualization for Recommendation Explainability: A Survey and New\n  Perspectives","summary":"  Providing system-generated explanations for recommendations represents an\nimportant step towards transparent and trustworthy recommender systems.\nExplainable recommender systems provide a human-understandable rationale for\ntheir outputs. Over the last two decades, explainable recommendation has\nattracted much attention in the recommender systems research community. This\npaper aims to provide a comprehensive review of research efforts on visual\nexplanation in recommender systems. More concretely, we systematically review\nthe literature on explanations in recommender systems based on four dimensions,\nnamely explanation goal, explanation scope, explanation style, and explanation\nformat. Recognizing the importance of visualization, we approach the\nrecommender system literature from the angle of explanatory visualizations,\nthat is using visualizations as a display style of explanation. As a result, we\nderive a set of guidelines that might be constructive for designing explanatory\nvisualizations in recommender systems and identify perspectives for future work\nin this field. The aim of this review is to help recommendation researchers and\npractitioners better understand the potential of visually explainable\nrecommendation research and to support them in the systematic design of visual\nexplanations in current and future recommender systems.\n","authors":["Mohamed Amine Chatti","Mouadh Guesmi","Arham Muslim"],"pdf_url":"https://arxiv.org/pdf/2305.11755v3.pdf","comment":"This article has been accepted for publication in the ACM\n  Transactions on Interactive Intelligent Systems"},{"id":"http://arxiv.org/abs/2404.03528v3","updated":"2024-06-05T13:39:56Z","published":"2024-04-04T15:31:21Z","title":"BanglaAutoKG: Automatic Bangla Knowledge Graph Construction with\n  Semantic Neural Graph Filtering","summary":"  Knowledge Graphs (KGs) have proven essential in information processing and\nreasoning applications because they link related entities and give context-rich\ninformation, supporting efficient information retrieval and knowledge\ndiscovery; presenting information flow in a very effective manner. Despite\nbeing widely used globally, Bangla is relatively underrepresented in KGs due to\na lack of comprehensive datasets, encoders, NER (named entity recognition)\nmodels, POS (part-of-speech) taggers, and lemmatizers, hindering efficient\ninformation processing and reasoning applications in the language. Addressing\nthe KG scarcity in Bengali, we propose BanglaAutoKG, a pioneering framework\nthat is able to automatically construct Bengali KGs from any Bangla text. We\nutilize multilingual LLMs to understand various languages and correlate\nentities and relations universally. By employing a translation dictionary to\nidentify English equivalents and extracting word features from pre-trained BERT\nmodels, we construct the foundational KG. To reduce noise and align word\nembeddings with our goal, we employ graph-based polynomial filters. Lastly, we\nimplement a GNN-based semantic filter, which elevates contextual understanding\nand trims unnecessary edges, culminating in the formation of the definitive KG.\nEmpirical findings and case studies demonstrate the universal effectiveness of\nour model, capable of autonomously constructing semantically enriched KGs from\nany text.\n","authors":["Azmine Toushik Wasi","Taki Hasan Rafi","Raima Islam","Dong-Kyu Chae"],"pdf_url":"https://arxiv.org/pdf/2404.03528v3.pdf","comment":"7 pages, 3 figures. Accepted to LREC-COLING 2024. Read in ACL\n  Anthology: https://aclanthology.org/2024.lrec-main.189/"},{"id":"http://arxiv.org/abs/2402.10024v2","updated":"2024-06-05T13:38:42Z","published":"2024-02-15T15:43:05Z","title":"Self-Augmented In-Context Learning for Unsupervised Word Translation","summary":"  Recent work has shown that, while large language models (LLMs) demonstrate\nstrong word translation or bilingual lexicon induction (BLI) capabilities in\nfew-shot setups, they still cannot match the performance of 'traditional'\nmapping-based approaches in the unsupervised scenario where no seed translation\npairs are available, especially for lower-resource languages. To address this\nchallenge with LLMs, we propose self-augmented in-context learning (SAIL) for\nunsupervised BLI: starting from a zero-shot prompt, SAIL iteratively induces a\nset of high-confidence word translation pairs for in-context learning (ICL)\nfrom an LLM, which it then reapplies to the same LLM in the ICL fashion. Our\nmethod shows substantial gains over zero-shot prompting of LLMs on two\nestablished BLI benchmarks spanning a wide range of language pairs, also\noutperforming mapping-based baselines across the board. In addition to\nachieving state-of-the-art unsupervised BLI performance, we also conduct\ncomprehensive analyses on SAIL and discuss its limitations.\n","authors":["Yaoyiran Li","Anna Korhonen","Ivan Vulić"],"pdf_url":"https://arxiv.org/pdf/2402.10024v2.pdf","comment":"ACL 2024 Main Conference; 11 Pages, 3 Figures, 9 Tables"},{"id":"http://arxiv.org/abs/2406.03248v1","updated":"2024-06-05T13:23:23Z","published":"2024-06-05T13:23:23Z","title":"Large Language Models as Evaluators for Recommendation Explanations","summary":"  The explainability of recommender systems has attracted significant attention\nin academia and industry. Many efforts have been made for explainable\nrecommendations, yet evaluating the quality of the explanations remains a\nchallenging and unresolved issue. In recent years, leveraging LLMs as\nevaluators presents a promising avenue in Natural Language Processing tasks\n(e.g., sentiment classification, information extraction), as they perform\nstrong capabilities in instruction following and common-sense reasoning.\nHowever, evaluating recommendation explanatory texts is different from these\nNLG tasks, as its criteria are related to human perceptions and are usually\nsubjective. In this paper, we investigate whether LLMs can serve as evaluators\nof recommendation explanations. To answer the question, we utilize real user\nfeedback on explanations given from previous work and additionally collect\nthird-party annotations and LLM evaluations. We design and apply a 3-level meta\nevaluation strategy to measure the correlation between evaluator labels and the\nground truth provided by users. Our experiments reveal that LLMs, such as GPT4,\ncan provide comparable evaluations with appropriate prompts and settings. We\nalso provide further insights into combining human labels with the LLM\nevaluation process and utilizing ensembles of multiple heterogeneous LLM\nevaluators to enhance the accuracy and stability of evaluations. Our study\nverifies that utilizing LLMs as evaluators can be an accurate, reproducible and\ncost-effective solution for evaluating recommendation explanation texts. Our\ncode is available at https://github.com/Xiaoyu-SZ/LLMasEvaluator.\n","authors":["Xiaoyu Zhang","Yishan Li","Jiayin Wang","Bowen Sun","Weizhi Ma","Peijie Sun","Min Zhang"],"pdf_url":"https://arxiv.org/pdf/2406.03248v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.03221v1","updated":"2024-06-05T13:00:04Z","published":"2024-06-05T13:00:04Z","title":"Linking Named Entities in Diderot's \\textit{Encyclopédie} to Wikidata","summary":"  Diderot's \\textit{Encyclop\\'edie} is a reference work from XVIIIth century in\nEurope that aimed at collecting the knowledge of its era. \\textit{Wikipedia}\nhas the same ambition with a much greater scope. However, the lack of digital\nconnection between the two encyclopedias may hinder their comparison and the\nstudy of how knowledge has evolved. A key element of \\textit{Wikipedia} is\nWikidata that backs the articles with a graph of structured data. In this\npaper, we describe the annotation of more than 10,300 of the\n\\textit{Encyclop\\'edie} entries with Wikidata identifiers enabling us to\nconnect these entries to the graph. We considered geographic and human\nentities. The \\textit{Encyclop\\'edie} does not contain biographic entries as\nthey mostly appear as subentries of locations. We extracted all the geographic\nentries and we completely annotated all the entries containing a description of\nhuman entities. This represents more than 2,600 links referring to locations or\nhuman entities. In addition, we annotated more than 9,500 entries having a\ngeographic content only. We describe the annotation process as well as\napplication examples. This resource is available at\nhttps://github.com/pnugues/encyclopedie_1751\n","authors":["Pierre Nugues"],"pdf_url":"https://arxiv.org/pdf/2406.03221v1.pdf","comment":"6 pages, 3 figures"},{"id":"http://arxiv.org/abs/2406.03210v1","updated":"2024-06-05T12:45:25Z","published":"2024-06-05T12:45:25Z","title":"Text-like Encoding of Collaborative Information in Large Language Models\n  for Recommendation","summary":"  When adapting Large Language Models for Recommendation (LLMRec), it is\ncrucial to integrate collaborative information. Existing methods achieve this\nby learning collaborative embeddings in LLMs' latent space from scratch or by\nmapping from external models. However, they fail to represent the information\nin a text-like format, which may not align optimally with LLMs. To bridge this\ngap, we introduce BinLLM, a novel LLMRec method that seamlessly integrates\ncollaborative information through text-like encoding. BinLLM converts\ncollaborative embeddings from external models into binary sequences -- a\nspecific text format that LLMs can understand and operate on directly,\nfacilitating the direct usage of collaborative information in text-like format\nby LLMs. Additionally, BinLLM provides options to compress the binary sequence\nusing dot-decimal notation to avoid excessively long lengths. Extensive\nexperiments validate that BinLLM introduces collaborative information in a\nmanner better aligned with LLMs, resulting in enhanced performance. We release\nour code at https://github.com/zyang1580/BinLLM.\n","authors":["Yang Zhang","Keqin Bao","Ming Yan","Wenjie Wang","Fuli Feng","Xiangnan He"],"pdf_url":"https://arxiv.org/pdf/2406.03210v1.pdf","comment":"Accepted by ACL 2024"},{"id":"http://arxiv.org/abs/2406.03109v1","updated":"2024-06-05T09:57:58Z","published":"2024-06-05T09:57:58Z","title":"CAPRI-FAIR: Integration of Multi-sided Fairness in Contextual POI\n  Recommendation Framework","summary":"  Point-of-interest (POI) recommendation, a form of context-aware\nrecommendation, takes into account spatio-temporal constraints and contexts\nlike distance, peak business hours, and previous user check-ins. Given the\nability of these kinds of systems to influence not just the consumer's travel\nexperience, but also the POI's business, it is important to consider fairness\nfrom multiple perspectives. Unfortunately, these systems tend to provide less\naccurate recommendations to inactive users, and less exposure to unpopular\nPOIs. The goal of this paper is to develop a post-filter methodology that\nincorporates provider and consumer fairness factors into pre-existing\nrecommendation models, to satisfy fairness metrics like item exposure, and\nperformance metrics like precision and distance, making the system more\nsustainable to both consumers and providers. Experiments have shown that using\na linear scoring model for provider fairness in re-scoring recommended items\nyields the best tradeoff between performance and long-tail exposure, in some\ncases without a significant decrease in precision. When attempting to address\nconsumer fairness by recommending more popular POIs to inactive users, the\nresult was an increase in precision for only some recommendation models and\ndatasets. Finally, when considering the tradeoff between both parameters, the\ncombinations that reached the Pareto front of consumer and provider fairness,\nunfortunately, achieved the lowest precision values. We find that the nature of\nthis tradeoff depends heavily on the model and the dataset.\n","authors":["Francis Zac dela Cruz","Flora D. Salim","Yonchanok Khaokaew","Jeffrey Chan"],"pdf_url":"https://arxiv.org/pdf/2406.03109v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.03085v1","updated":"2024-06-05T09:19:54Z","published":"2024-06-05T09:19:54Z","title":"Exploring User Retrieval Integration towards Large Language Models for\n  Cross-Domain Sequential Recommendation","summary":"  Cross-Domain Sequential Recommendation (CDSR) aims to mine and transfer\nusers' sequential preferences across different domains to alleviate the\nlong-standing cold-start issue. Traditional CDSR models capture collaborative\ninformation through user and item modeling while overlooking valuable semantic\ninformation. Recently, Large Language Model (LLM) has demonstrated powerful\nsemantic reasoning capabilities, motivating us to introduce them to better\ncapture semantic information. However, introducing LLMs to CDSR is non-trivial\ndue to two crucial issues: seamless information integration and domain-specific\ngeneration. To this end, we propose a novel framework named URLLM, which aims\nto improve the CDSR performance by exploring the User Retrieval approach and\ndomain grounding on LLM simultaneously. Specifically, we first present a novel\ndual-graph sequential model to capture the diverse information, along with an\nalignment and contrastive learning method to facilitate domain knowledge\ntransfer. Subsequently, a user retrieve-generation model is adopted to\nseamlessly integrate the structural information into LLM, fully harnessing its\nemergent inferencing ability. Furthermore, we propose a domain-specific\nstrategy and a refinement module to prevent out-of-domain generation. Extensive\nexperiments on Amazon demonstrated the information integration and\ndomain-specific generation ability of URLLM in comparison to state-of-the-art\nbaselines. Our code is available at https://github.com/TingJShen/URLLM\n","authors":["Tingjia Shen","Hao Wang","Jiaqing Zhang","Sirui Zhao","Liangyue Li","Zulong Chen","Defu Lian","Enhong Chen"],"pdf_url":"https://arxiv.org/pdf/2406.03085v1.pdf","comment":"10 pages, 5 figures"},{"id":"http://arxiv.org/abs/2406.03064v1","updated":"2024-06-05T08:47:30Z","published":"2024-06-05T08:47:30Z","title":"Path-Specific Causal Reasoning for Fairness-aware Cognitive Diagnosis","summary":"  Cognitive Diagnosis~(CD), which leverages students and exercise data to\npredict students' proficiency levels on different knowledge concepts, is one of\nfundamental components in Intelligent Education. Due to the scarcity of\nstudent-exercise interaction data, most existing methods focus on making the\nbest use of available data, such as exercise content and student\ninformation~(e.g., educational context). Despite the great progress, the abuse\nof student sensitive information has not been paid enough attention. Due to the\nimportant position of CD in Intelligent Education, employing sensitive\ninformation when making diagnosis predictions will cause serious social issues.\nMoreover, data-driven neural networks are easily misled by the shortcut between\ninput data and output prediction, exacerbating this problem. Therefore, it is\ncrucial to eliminate the negative impact of sensitive information in CD models.\nIn response, we argue that sensitive attributes of students can also provide\nuseful information, and only the shortcuts directly related to the sensitive\ninformation should be eliminated from the diagnosis process. Thus, we employ\ncausal reasoning and design a novel Path-Specific Causal Reasoning Framework\n(PSCRF) to achieve this goal. Specifically, we first leverage an encoder to\nextract features and generate embeddings for general information and sensitive\ninformation of students. Then, we design a novel attribute-oriented predictor\nto decouple the sensitive attributes, in which fairness-related sensitive\nfeatures will be eliminated and other useful information will be retained.\nFinally, we designed a multi-factor constraint to ensure the performance of\nfairness and diagnosis performance simultaneously. Extensive experiments over\nreal-world datasets (e.g., PISA dataset) demonstrate the effectiveness of our\nproposed PSCRF.\n","authors":["Dacao Zhang","Kun Zhang","Le Wu","Mi Tian","Richang Hong","Meng Wang"],"pdf_url":"https://arxiv.org/pdf/2406.03064v1.pdf","comment":"Accpeted by KDD'2024"},{"id":"http://arxiv.org/abs/2311.04760v3","updated":"2024-06-05T08:04:32Z","published":"2023-11-08T15:33:06Z","title":"Towards Open-world Cross-Domain Sequential Recommendation: A\n  Model-Agnostic Contrastive Denoising Approach","summary":"  Cross-domain sequential recommendation (CDSR) aims to address the data\nsparsity problems that exist in traditional sequential recommendation (SR)\nsystems.\n  The existing approaches aim to design a specific cross-domain unit that can\ntransfer and propagate information across multiple domains by relying on\noverlapping users with abundant behaviors. However, in real-world recommender\nsystems, CDSR scenarios usually consist of a majority of long-tailed users with\nsparse behaviors and cold-start users who only exist in one domain. This leads\nto a drop in the performance of existing CDSR methods in the real-world\nindustry platform. Therefore, improving the consistency and effectiveness of\nmodels in open-world CDSR scenarios is crucial for constructing CDSR models\n(\\textit{1st} CH). Recently, some SR approaches have utilized auxiliary\nbehaviors to complement the information for long-tailed users. However, these\nmulti-behavior SR methods cannot deliver promising performance in CDSR, as they\noverlook the semantic gap between target and auxiliary behaviors, as well as\nuser interest deviation across domains (\\textit{2nd} CH).\n","authors":["Wujiang Xu","Xuying Ning","Wenfang Lin","Mingming Ha","Qiongxu Ma","Qianqiao Liang","Xuewen Tao","Linxun Chen","Bing Han","Minnan Luo"],"pdf_url":"https://arxiv.org/pdf/2311.04760v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.14903v2","updated":"2024-06-05T07:45:31Z","published":"2024-04-23T10:36:23Z","title":"Multi-Sample Dynamic Time Warping for Few-Shot Keyword Spotting","summary":"  In multi-sample keyword spotting, each keyword class is represented by\nmultiple spoken instances, called samples. A na\\\"ive approach to detect\nkeywords in a target sequence consists of querying all samples of all classes\nusing sub-sequence dynamic time warping. However, the resulting processing time\nincreases linearly with respect to the number of samples belonging to each\nclass. Alternatively, only a single Fr\\'echet mean can be queried for each\nclass, resulting in reduced processing time but usually also in worse detection\nperformance as the variability of the query samples is not captured\nsufficiently well. In this work, multi-sample dynamic time warping is proposed\nto compute class-specific cost-tensors that include the variability of all\nquery samples. To significantly reduce the computational complexity during\ninference, these cost tensors are converted to cost matrices before applying\ndynamic time warping. In experimental evaluations for few-shot keyword\nspotting, it is shown that this method yields a very similar performance as\nusing all individual query samples as templates while having a runtime that is\nonly slightly slower than when using Fr\\'echet means.\n","authors":["Kevin Wilkinghoff","Alessia Cornaggia-Urrigshardt"],"pdf_url":"https://arxiv.org/pdf/2404.14903v2.pdf","comment":"Accepted for presentation at EUSIPCO 2024"},{"id":"http://arxiv.org/abs/2406.01022v2","updated":"2024-06-05T07:30:59Z","published":"2024-06-03T06:08:02Z","title":"Poisoning Attacks and Defenses in Recommender Systems: A Survey","summary":"  Modern recommender systems (RS) have profoundly enhanced user experience\nacross digital platforms, yet they face significant threats from poisoning\nattacks. These attacks, aimed at manipulating recommendation outputs for\nunethical gains, exploit vulnerabilities in RS through injecting malicious data\nor intervening model training. This survey presents a unique perspective by\nexamining these threats through the lens of an attacker, offering fresh\ninsights into their mechanics and impacts. Concretely, we detail a systematic\npipeline that encompasses four stages of a poisoning attack: setting attack\ngoals, assessing attacker capabilities, analyzing victim architecture, and\nimplementing poisoning strategies. The pipeline not only aligns with various\nattack tactics but also serves as a comprehensive taxonomy to pinpoint focuses\nof distinct poisoning attacks. Correspondingly, we further classify defensive\nstrategies into two main categories: poisoning data filtering and robust\ntraining from the defender's perspective. Finally, we highlight existing\nlimitations and suggest innovative directions for further exploration in this\nfield.\n","authors":["Zongwei Wang","Junliang Yu","Min Gao","Wei Yuan","Guanhua Ye","Shazia Sadiq","Hongzhi Yin"],"pdf_url":"https://arxiv.org/pdf/2406.01022v2.pdf","comment":"22 pages, 8 figures"},{"id":"http://arxiv.org/abs/2406.02962v1","updated":"2024-06-05T05:35:59Z","published":"2024-06-05T05:35:59Z","title":"Docs2KG: Unified Knowledge Graph Construction from Heterogeneous\n  Documents Assisted by Large Language Models","summary":"  Even for a conservative estimate, 80% of enterprise data reside in\nunstructured files, stored in data lakes that accommodate heterogeneous\nformats. Classical search engines can no longer meet information seeking needs,\nespecially when the task is to browse and explore for insight formulation. In\nother words, there are no obvious search keywords to use. Knowledge graphs, due\nto their natural visual appeals that reduce the human cognitive load, become\nthe winning candidate for heterogeneous data integration and knowledge\nrepresentation.\n  In this paper, we introduce Docs2KG, a novel framework designed to extract\nmultimodal information from diverse and heterogeneous unstructured documents,\nincluding emails, web pages, PDF files, and Excel files. Dynamically generates\na unified knowledge graph that represents the extracted key information,\nDocs2KG enables efficient querying and exploration of document data lakes.\nUnlike existing approaches that focus on domain-specific data sources or\npre-designed schemas, Docs2KG offers a flexible and extensible solution that\ncan adapt to various document structures and content types. The proposed\nframework unifies data processing supporting a multitude of downstream tasks\nwith improved domain interpretability. Docs2KG is publicly accessible at\nhttps://docs2kg.ai4wa.com, and a demonstration video is available at\nhttps://docs2kg.ai4wa.com/Video.\n","authors":["Qiang Sun","Yuanyi Luo","Wenxiao Zhang","Sirui Li","Jichunyang Li","Kai Niu","Xiangrui Kong","Wei Liu"],"pdf_url":"https://arxiv.org/pdf/2406.02962v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.02163v2","updated":"2024-06-05T05:17:08Z","published":"2024-06-04T09:52:41Z","title":"Pairwise Ranking Loss for Multi-Task Learning in Recommender Systems","summary":"  Multi-Task Learning (MTL) plays a crucial role in real-world advertising\napplications such as recommender systems, aiming to achieve robust\nrepresentations while minimizing resource consumption. MTL endeavors to\nsimultaneously optimize multiple tasks to construct a unified model serving\ndiverse objectives. In online advertising systems, tasks like Click-Through\nRate (CTR) and Conversion Rate (CVR) are often treated as MTL problems\nconcurrently. However, it has been overlooked that a conversion ($y_{cvr}=1$)\nnecessitates a preceding click ($y_{ctr}=1$). In other words, while certain CTR\ntasks are associated with corresponding conversions, others lack such\nassociations. Moreover, the likelihood of noise is significantly higher in CTR\ntasks where conversions do not occur compared to those where they do, and\nexisting methods lack the ability to differentiate between these two scenarios.\nIn this study, exposure labels corresponding to conversions are regarded as\ndefinitive indicators, and a novel task-specific loss is introduced by\ncalculating a \\textbf{p}air\\textbf{wise} \\textbf{r}anking (PWiseR) loss between\nmodel predictions, manifesting as pairwise ranking loss, to encourage the model\nto rely more on them. To demonstrate the effect of the proposed loss function,\nexperiments were conducted on different MTL and Single-Task Learning (STL)\nmodels using four distinct public MTL datasets, namely Alibaba FR, NL, US, and\nCCP, along with a proprietary industrial dataset. The results indicate that our\nproposed loss function outperforms the BCE loss function in most cases in terms\nof the AUC metric.\n","authors":["Furkan Durmus","Hasan Saribas","Said Aldemir","Junyan Yang","Hakan Cevikalp"],"pdf_url":"https://arxiv.org/pdf/2406.02163v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.02943v1","updated":"2024-06-05T05:05:41Z","published":"2024-06-05T05:05:41Z","title":"The Task-oriented Queries Benchmark (ToQB)","summary":"  Task-oriented queries (e.g., one-shot queries to play videos, order food, or\ncall a taxi) are crucial for assessing the quality of virtual assistants,\nchatbots, and other large language model (LLM)-based services. However, a\nstandard benchmark for task-oriented queries is not yet available, as existing\nbenchmarks in the relevant NLP (Natural Language Processing) fields have\nprimarily focused on task-oriented dialogues. Thus, we present a new\nmethodology for efficiently generating the Task-oriented Queries Benchmark\n(ToQB) using existing task-oriented dialogue datasets and an LLM service. Our\nmethodology involves formulating the underlying NLP task to summarize the\noriginal intent of a speaker in each dialogue, detailing the key steps to\nperform the devised NLP task using an LLM service, and outlining a framework\nfor automating a major part of the benchmark generation process. Through a case\nstudy encompassing three domains (i.e., two single-task domains and one\nmulti-task domain), we demonstrate how to customize the LLM prompts (e.g.,\nomitting system utterances or speaker labels) for those three domains and\ncharacterize the generated task-oriented queries. The generated ToQB dataset is\nmade available to the public. We further discuss new domains that can be added\nto ToQB by community contributors and its practical applications.\n","authors":["Keun Soo Yim"],"pdf_url":"https://arxiv.org/pdf/2406.02943v1.pdf","comment":"Data available on GitHub,\n  https://github.com/google/task-oriented-queries"},{"id":"http://arxiv.org/abs/2406.02891v1","updated":"2024-06-05T03:17:48Z","published":"2024-06-05T03:17:48Z","title":"A Bi-metric Framework for Fast Similarity Search","summary":"  We propose a new \"bi-metric\" framework for designing nearest neighbor data\nstructures. Our framework assumes two dissimilarity functions: a ground-truth\nmetric that is accurate but expensive to compute, and a proxy metric that is\ncheaper but less accurate. In both theory and practice, we show how to\nconstruct data structures using only the proxy metric such that the query\nprocedure achieves the accuracy of the expensive metric, while only using a\nlimited number of calls to both metrics. Our theoretical results instantiate\nthis framework for two popular nearest neighbor search algorithms: DiskANN and\nCover Tree. In both cases we show that, as long as the proxy metric used to\nconstruct the data structure approximates the ground-truth metric up to a\nbounded factor, our data structure achieves arbitrarily good approximation\nguarantees with respect to the ground-truth metric. On the empirical side, we\napply the framework to the text retrieval problem with two dissimilarity\nfunctions evaluated by ML models with vastly different computational costs. We\nobserve that for almost all data sets in the MTEB benchmark, our approach\nachieves a considerably better accuracy-efficiency tradeoff than the\nalternatives, such as re-ranking.\n","authors":["Haike Xu","Sandeep Silwal","Piotr Indyk"],"pdf_url":"https://arxiv.org/pdf/2406.02891v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.02844v1","updated":"2024-06-05T01:35:50Z","published":"2024-06-05T01:35:50Z","title":"Item-Language Model for Conversational Recommendation","summary":"  Large-language Models (LLMs) have been extremely successful at tasks like\ncomplex dialogue understanding, reasoning and coding due to their emergent\nabilities. These emergent abilities have been extended with multi-modality to\ninclude image, audio, and video capabilities. Recommender systems, on the other\nhand, have been critical for information seeking and item discovery needs.\nRecently, there have been attempts to apply LLMs for recommendations. One\ndifficulty of current attempts is that the underlying LLM is usually not\ntrained on the recommender system data, which largely contains user interaction\nsignals and is often not publicly available. Another difficulty is user\ninteraction signals often have a different pattern from natural language text,\nand it is currently unclear if the LLM training setup can learn more\nnon-trivial knowledge from interaction signals compared with traditional\nrecommender system methods. Finally, it is difficult to train multiple LLMs for\ndifferent use-cases, and to retain the original language and reasoning\nabilities when learning from recommender system data. To address these three\nlimitations, we propose an Item-Language Model (ILM), which is composed of an\nitem encoder to produce text-aligned item representations that encode user\ninteraction signals, and a frozen LLM that can understand those item\nrepresentations with preserved pretrained knowledge. We conduct extensive\nexperiments which demonstrate both the importance of the language-alignment and\nof user interaction knowledge in the item encoder.\n","authors":["Li Yang","Anushya Subbiah","Hardik Patel","Judith Yue Li","Yanwei Song","Reza Mirghaderi","Vikram Aggarwal"],"pdf_url":"https://arxiv.org/pdf/2406.02844v1.pdf","comment":"15 pages, 3 figures"},{"id":"http://arxiv.org/abs/2301.08146v3","updated":"2024-06-05T22:50:00Z","published":"2023-01-15T03:20:18Z","title":"What's happening in your neighborhood? A Weakly Supervised Approach to\n  Detect Local News","summary":"  Local news articles are a subset of news that impact users in a geographical\narea, such as a city, county, or state. Detecting local news (Step 1) and\nsubsequently deciding its geographical location as well as radius of impact\n(Step 2) are two important steps towards accurate local news recommendation.\nNaive rule-based methods, such as detecting city names from the news title,\ntend to give erroneous results due to lack of understanding of the news\ncontent. Empowered by the latest development in natural language processing, we\ndevelop an integrated pipeline that enables automatic local news detection and\ncontent-based local news recommendations. In this paper, we focus on Step 1 of\nthe pipeline, which highlights: (1) a weakly supervised framework incorporated\nwith domain knowledge and auto data processing, and (2) scalability to\nmulti-lingual settings. Compared with Stanford CoreNLP NER model, our pipeline\nhas higher precision and recall evaluated on a real-world and human-labeled\ndataset. This pipeline has potential to more precise local news to users, helps\nlocal businesses get more exposure, and gives people more information about\ntheir neighborhood safety.\n","authors":["Deven Santosh Shah","Shiying He","Gosuddin Kamaruddin Siddiqi","Radhika Bansal"],"pdf_url":"https://arxiv.org/pdf/2301.08146v3.pdf","comment":"8 pages, 2 figures, 5 tables"},{"id":"http://arxiv.org/abs/2403.10081v2","updated":"2024-06-05T18:44:59Z","published":"2024-03-15T07:45:37Z","title":"DRAGIN: Dynamic Retrieval Augmented Generation based on the Information\n  Needs of Large Language Models","summary":"  Dynamic retrieval augmented generation (RAG) paradigm actively decides when\nand what to retrieve during the text generation process of Large Language\nModels (LLMs). There are two key elements of this paradigm: identifying the\noptimal moment to activate the retrieval module (deciding when to retrieve) and\ncrafting the appropriate query once retrieval is triggered (determining what to\nretrieve). However, current dynamic RAG methods fall short in both aspects.\nFirstly, the strategies for deciding when to retrieve often rely on static\nrules. Moreover, the strategies for deciding what to retrieve typically limit\nthemselves to the LLM's most recent sentence or the last few tokens, while the\nLLM's real-time information needs may span across the entire context. To\novercome these limitations, we introduce a new framework, DRAGIN, i.e., Dynamic\nRetrieval Augmented Generation based on the real-time Information Needs of\nLLMs. Our framework is specifically designed to make decisions on when and what\nto retrieve based on the LLM's real-time information needs during the text\ngeneration process. We evaluate DRAGIN along with existing methods\ncomprehensively over 4 knowledge-intensive generation datasets. Experimental\nresults show that DRAGIN achieves superior performance on all tasks,\ndemonstrating the effectiveness of our method. We have open-sourced all the\ncode, data, and models in GitHub: https://github.com/oneal2000/DRAGIN/tree/main\n","authors":["Weihang Su","Yichen Tang","Qingyao Ai","Zhijing Wu","Yiqun Liu"],"pdf_url":"https://arxiv.org/pdf/2403.10081v2.pdf","comment":null}],"Machine Learning":[{"id":"http://arxiv.org/abs/2406.03496v1","updated":"2024-06-05T17:59:40Z","published":"2024-06-05T17:59:40Z","title":"Wings: Learning Multimodal LLMs without Text-only Forgetting","summary":"  Multimodal large language models (MLLMs), initiated with a trained LLM, first\nalign images with text and then fine-tune on multimodal mixed inputs. However,\nthe MLLM catastrophically forgets the text-only instructions, which do not\ninclude images and can be addressed within the initial LLM. In this paper, we\npresent Wings, a novel MLLM that excels in both text-only dialogues and\nmultimodal comprehension. Analyzing MLLM attention in multimodal instructions\nreveals that text-only forgetting is related to the attention shifts from\npre-image to post-image text. From that, we construct extra modules that act as\nthe boosted learner to compensate for the attention shift. The complementary\nvisual and textual learners, like \"wings\" on either side, are connected in\nparallel within each layer's attention block. Initially, image and text inputs\nare aligned with visual learners operating alongside the main attention,\nbalancing focus on visual elements. Textual learners are later collaboratively\nintegrated with attention-based routing to blend the outputs of the visual and\ntextual learners. We design the Low-Rank Residual Attention (LoRRA) to\nguarantee high efficiency for learners. Our experimental results demonstrate\nthat Wings outperforms equally-scaled MLLMs in both text-only and visual\nquestion-answering tasks. On a newly constructed Interleaved Image-Text (IIT)\nbenchmark, Wings exhibits superior performance from text-only-rich to\nmultimodal-rich question-answering tasks.\n","authors":["Yi-Kai Zhang","Shiyin Lu","Yang Li","Yanqing Ma","Qing-Guo Chen","Zhao Xu","Weihua Luo","Kaifu Zhang","De-Chuan Zhan","Han-Jia Ye"],"pdf_url":"https://arxiv.org/pdf/2406.03496v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.03495v1","updated":"2024-06-05T17:59:35Z","published":"2024-06-05T17:59:35Z","title":"Grokking Modular Polynomials","summary":"  Neural networks readily learn a subset of the modular arithmetic tasks, while\nfailing to generalize on the rest. This limitation remains unmoved by the\nchoice of architecture and training strategies. On the other hand, an\nanalytical solution for the weights of Multi-layer Perceptron (MLP) networks\nthat generalize on the modular addition task is known in the literature. In\nthis work, we (i) extend the class of analytical solutions to include modular\nmultiplication as well as modular addition with many terms. Additionally, we\nshow that real networks trained on these datasets learn similar solutions upon\ngeneralization (grokking). (ii) We combine these \"expert\" solutions to\nconstruct networks that generalize on arbitrary modular polynomials. (iii) We\nhypothesize a classification of modular polynomials into learnable and\nnon-learnable via neural networks training; and provide experimental evidence\nsupporting our claims.\n","authors":["Darshil Doshi","Tianyu He","Aritra Das","Andrey Gromov"],"pdf_url":"https://arxiv.org/pdf/2406.03495v1.pdf","comment":"7+4 pages, 3 figures, 2 tables"},{"id":"http://arxiv.org/abs/2406.03494v1","updated":"2024-06-05T17:59:22Z","published":"2024-06-05T17:59:22Z","title":"Solving Poisson Equations using Neural Walk-on-Spheres","summary":"  We propose Neural Walk-on-Spheres (NWoS), a novel neural PDE solver for the\nefficient solution of high-dimensional Poisson equations. Leveraging stochastic\nrepresentations and Walk-on-Spheres methods, we develop novel losses for neural\nnetworks based on the recursive solution of Poisson equations on spheres inside\nthe domain. The resulting method is highly parallelizable and does not require\nspatial gradients for the loss. We provide a comprehensive comparison against\ncompeting methods based on PINNs, the Deep Ritz method, and (backward)\nstochastic differential equations. In several challenging, high-dimensional\nnumerical examples, we demonstrate the superiority of NWoS in accuracy, speed,\nand computational costs. Compared to commonly used PINNs, our approach can\nreduce memory usage and errors by orders of magnitude. Furthermore, we apply\nNWoS to problems in PDE-constrained optimization and molecular dynamics to show\nits efficiency in practical applications.\n","authors":["Hong Chul Nam","Julius Berner","Anima Anandkumar"],"pdf_url":"https://arxiv.org/pdf/2406.03494v1.pdf","comment":"Accepted at ICML 2024"},{"id":"http://arxiv.org/abs/2307.03175v2","updated":"2024-06-05T17:59:01Z","published":"2023-07-06T17:55:28Z","title":"Push Past Green: Learning to Look Behind Plant Foliage by Moving It","summary":"  Autonomous agriculture applications (e.g., inspection, phenotyping, plucking\nfruits) require manipulating the plant foliage to look behind the leaves and\nthe branches. Partial visibility, extreme clutter, thin structures, and unknown\ngeometry and dynamics for plants make such manipulation challenging. We tackle\nthese challenges through data-driven methods. We use self-supervision to train\nSRPNet, a neural network that predicts what space is revealed on execution of a\ncandidate action on a given plant. We use SRPNet with the cross-entropy method\nto predict actions that are effective at revealing space beneath plant foliage.\nFurthermore, as SRPNet does not just predict how much space is revealed but\nalso where it is revealed, we can execute a sequence of actions that\nincrementally reveal more and more space beneath the plant foliage. We\nexperiment with a synthetic (vines) and a real plant (Dracaena) on a physical\ntest-bed across 5 settings including 2 settings that test generalization to\nnovel plant configurations. Our experiments reveal the effectiveness of our\noverall method, PPG, over a competitive hand-crafted exploration method, and\nthe effectiveness of SRPNet over a hand-crafted dynamics model and relevant\nablations.\n","authors":["Xiaoyu Zhang","Saurabh Gupta"],"pdf_url":"https://arxiv.org/pdf/2307.03175v2.pdf","comment":"Accepted by Conference on Robot Learning (CoRL) 2023. for project\n  website with video, see https://sites.google.com/view/pushpastgreen/"},{"id":"http://arxiv.org/abs/2402.02287v3","updated":"2024-06-05T17:55:43Z","published":"2024-02-03T22:55:31Z","title":"Future Directions in the Theory of Graph Machine Learning","summary":"  Machine learning on graphs, especially using graph neural networks (GNNs),\nhas seen a surge in interest due to the wide availability of graph data across\na broad spectrum of disciplines, from life to social and engineering sciences.\nDespite their practical success, our theoretical understanding of the\nproperties of GNNs remains highly incomplete. Recent theoretical advancements\nprimarily focus on elucidating the coarse-grained expressive power of GNNs,\npredominantly employing combinatorial techniques. However, these studies do not\nperfectly align with practice, particularly in understanding the generalization\nbehavior of GNNs when trained with stochastic first-order optimization\ntechniques. In this position paper, we argue that the graph machine learning\ncommunity needs to shift its attention to developing a balanced theory of graph\nmachine learning, focusing on a more thorough understanding of the interplay of\nexpressive power, generalization, and optimization.\n","authors":["Christopher Morris","Fabrizio Frasca","Nadav Dym","Haggai Maron","İsmail İlkan Ceylan","Ron Levie","Derek Lim","Michael Bronstein","Martin Grohe","Stefanie Jegelka"],"pdf_url":"https://arxiv.org/pdf/2402.02287v3.pdf","comment":"ICML 2024"},{"id":"http://arxiv.org/abs/2402.06578v2","updated":"2024-06-05T17:52:14Z","published":"2024-02-09T17:51:43Z","title":"On the Universality of Coupling-based Normalizing Flows","summary":"  We present a novel theoretical framework for understanding the expressive\npower of normalizing flows. Despite their prevalence in scientific\napplications, a comprehensive understanding of flows remains elusive due to\ntheir restricted architectures. Existing theorems fall short as they require\nthe use of arbitrarily ill-conditioned neural networks, limiting practical\napplicability. We propose a distributional universality theorem for\nwell-conditioned coupling-based normalizing flows such as RealNVP. In addition,\nwe show that volume-preserving normalizing flows are not universal, what\ndistribution they learn instead, and how to fix their expressivity. Our results\nsupport the general wisdom that affine and related couplings are expressive and\nin general outperform volume-preserving flows, bridging a gap between empirical\nresults and theoretical understanding.\n","authors":["Felix Draxler","Stefan Wahl","Christoph Schnörr","Ullrich Köthe"],"pdf_url":"https://arxiv.org/pdf/2402.06578v2.pdf","comment":"Proceedings of the 41 st International Conference on Machine\n  Learning, Vienna, Austria. PMLR 235, 2024"},{"id":"http://arxiv.org/abs/2406.03485v1","updated":"2024-06-05T17:46:26Z","published":"2024-06-05T17:46:26Z","title":"Highway Value Iteration Networks","summary":"  Value iteration networks (VINs) enable end-to-end learning for planning tasks\nby employing a differentiable \"planning module\" that approximates the value\niteration algorithm. However, long-term planning remains a challenge because\ntraining very deep VINs is difficult. To address this problem, we embed highway\nvalue iteration -- a recent algorithm designed to facilitate long-term credit\nassignment -- into the structure of VINs. This improvement augments the\n\"planning module\" of the VIN with three additional components: 1) an \"aggregate\ngate,\" which constructs skip connections to improve information flow across\nmany layers; 2) an \"exploration module,\" crafted to increase the diversity of\ninformation and gradient flow in spatial dimensions; 3) a \"filter gate\"\ndesigned to ensure safe exploration. The resulting novel highway VIN can be\ntrained effectively with hundreds of layers using standard backpropagation. In\nlong-term planning tasks requiring hundreds of planning steps, deep highway\nVINs outperform both traditional VINs and several advanced, very deep NNs.\n","authors":["Yuhui Wang","Weida Li","Francesco Faccio","Qingyuan Wu","Jürgen Schmidhuber"],"pdf_url":"https://arxiv.org/pdf/2406.03485v1.pdf","comment":"ICML 2024"},{"id":"http://arxiv.org/abs/2403.03942v2","updated":"2024-06-05T17:44:03Z","published":"2024-03-06T18:50:14Z","title":"The Heuristic Core: Understanding Subnetwork Generalization in\n  Pretrained Language Models","summary":"  Prior work has found that pretrained language models (LMs) fine-tuned with\ndifferent random seeds can achieve similar in-domain performance but generalize\ndifferently on tests of syntactic generalization. In this work, we show that,\neven within a single model, we can find multiple subnetworks that perform\nsimilarly in-domain, but generalize vastly differently. To better understand\nthese phenomena, we investigate if they can be understood in terms of\n\"competing subnetworks\": the model initially represents a variety of distinct\nalgorithms, corresponding to different subnetworks, and generalization occurs\nwhen it ultimately converges to one. This explanation has been used to account\nfor generalization in simple algorithmic tasks (\"grokking\"). Instead of finding\ncompeting subnetworks, we find that all subnetworks -- whether they generalize\nor not -- share a set of attention heads, which we refer to as the heuristic\ncore. Further analysis suggests that these attention heads emerge early in\ntraining and compute shallow, non-generalizing features. The model learns to\ngeneralize by incorporating additional attention heads, which depend on the\noutputs of the \"heuristic\" heads to compute higher-level features. Overall, our\nresults offer a more detailed picture of the mechanisms for syntactic\ngeneralization in pretrained LMs.\n","authors":["Adithya Bhaskar","Dan Friedman","Danqi Chen"],"pdf_url":"https://arxiv.org/pdf/2403.03942v2.pdf","comment":"Accepted to ACL 2024"},{"id":"http://arxiv.org/abs/2406.03482v1","updated":"2024-06-05T17:42:05Z","published":"2024-06-05T17:42:05Z","title":"QJL: 1-Bit Quantized JL Transform for KV Cache Quantization with Zero\n  Overhead","summary":"  Serving LLMs requires substantial memory due to the storage requirements of\nKey-Value (KV) embeddings in the KV cache, which grows with sequence length. An\neffective approach to compress KV cache is quantization. However, traditional\nquantization methods face significant memory overhead due to the need to store\nquantization constants (at least a zero point and a scale) in full precision\nper data block. Depending on the block size, this overhead can add 1 or 2 bits\nper quantized number. We introduce QJL, a new quantization approach that\nconsists of a Johnson-Lindenstrauss (JL) transform followed by sign-bit\nquantization. In contrast to existing methods, QJL eliminates memory overheads\nby removing the need for storing quantization constants. We propose an\nasymmetric estimator for the inner product of two vectors and demonstrate that\napplying QJL to one vector and a standard JL transform without quantization to\nthe other provides an unbiased estimator with minimal distortion. We have\ndeveloped an efficient implementation of the QJL sketch and its corresponding\ninner product estimator, incorporating a lightweight CUDA kernel for optimized\ncomputation. When applied across various LLMs and NLP tasks to quantize the KV\ncache to only 3 bits, QJL demonstrates a more than fivefold reduction in KV\ncache memory usage without compromising accuracy, all while achieving faster\nruntime. Codes are available at \\url{https://github.com/amirzandieh/QJL}.\n","authors":["Amir Zandieh","Majid Daliri","Insu Han"],"pdf_url":"https://arxiv.org/pdf/2406.03482v1.pdf","comment":"13 pages"},{"id":"http://arxiv.org/abs/2402.08552v2","updated":"2024-06-05T17:36:47Z","published":"2024-02-13T15:55:41Z","title":"Confronting Reward Overoptimization for Diffusion Models: A Perspective\n  of Inductive and Primacy Biases","summary":"  Bridging the gap between diffusion models and human preferences is crucial\nfor their integration into practical generative workflows. While optimizing\ndownstream reward models has emerged as a promising alignment strategy,\nconcerns arise regarding the risk of excessive optimization with learned reward\nmodels, which potentially compromises ground-truth performance. In this work,\nwe confront the reward overoptimization problem in diffusion model alignment\nthrough the lenses of both inductive and primacy biases. We first identify a\nmismatch between current methods and the temporal inductive bias inherent in\nthe multi-step denoising process of diffusion models, as a potential source of\nreward overoptimization. Then, we surprisingly discover that dormant neurons in\nour critic model act as a regularization against reward overoptimization while\nactive neurons reflect primacy bias. Motivated by these observations, we\npropose Temporal Diffusion Policy Optimization with critic active neuron Reset\n(TDPO-R), a policy gradient algorithm that exploits the temporal inductive bias\nof diffusion models and mitigates the primacy bias stemming from active\nneurons. Empirical results demonstrate the superior efficacy of our methods in\nmitigating reward overoptimization. Code is avaliable at\nhttps://github.com/ZiyiZhang27/tdpo.\n","authors":["Ziyi Zhang","Sen Zhang","Yibing Zhan","Yong Luo","Yonggang Wen","Dacheng Tao"],"pdf_url":"https://arxiv.org/pdf/2402.08552v2.pdf","comment":"Accepted to ICML 2024"},{"id":"http://arxiv.org/abs/2402.17768v2","updated":"2024-06-05T17:33:56Z","published":"2024-02-27T18:59:18Z","title":"Diffusion Meets DAgger: Supercharging Eye-in-hand Imitation Learning","summary":"  A common failure mode for policies trained with imitation is compounding\nexecution errors at test time. When the learned policy encounters states that\nare not present in the expert demonstrations, the policy fails, leading to\ndegenerate behavior. The Dataset Aggregation, or DAgger approach to this\nproblem simply collects more data to cover these failure states. However, in\npractice, this is often prohibitively expensive. In this work, we propose\nDiffusion Meets DAgger (DMD), a method to reap the benefits of DAgger without\nthe cost for eye-in-hand imitation learning problems. Instead of collecting new\nsamples to cover out-of-distribution states, DMD uses recent advances in\ndiffusion models to synthesize these samples. This leads to robust performance\nfrom few demonstrations. We compare DMD against behavior cloning baseline\nacross four tasks: pushing, stacking, pouring, and shirt hanging. In pushing,\nDMD achieves 80% success rate with as few as 8 expert demonstrations, where\nnaive behavior cloning reaches only 20%. In stacking, DMD succeeds on average\n92% of the time across 5 cups, versus 40% for BC. When pouring coffee beans,\nDMD transfers to another cup successfully 80% of the time. Finally, DMD attains\n90% success rate for hanging shirt on a clothing rack.\n","authors":["Xiaoyu Zhang","Matthew Chang","Pranav Kumar","Saurabh Gupta"],"pdf_url":"https://arxiv.org/pdf/2402.17768v2.pdf","comment":"Accepted by Robotics: Science and Systems (RSS) 2024. project website\n  with video, see https://sites.google.com/view/diffusion-meets-dagger"},{"id":"http://arxiv.org/abs/2406.03478v1","updated":"2024-06-05T17:32:22Z","published":"2024-06-05T17:32:22Z","title":"Convolutional Neural Networks and Vision Transformers for Fashion MNIST\n  Classification: A Literature Review","summary":"  Our review explores the comparative analysis between Convolutional Neural\nNetworks (CNNs) and Vision Transformers (ViTs) in the domain of image\nclassification, with a particular focus on clothing classification within the\ne-commerce sector. Utilizing the Fashion MNIST dataset, we delve into the\nunique attributes of CNNs and ViTs. While CNNs have long been the cornerstone\nof image classification, ViTs introduce an innovative self-attention mechanism\nenabling nuanced weighting of different input data components. Historically,\ntransformers have primarily been associated with Natural Language Processing\n(NLP) tasks. Through a comprehensive examination of existing literature, our\naim is to unveil the distinctions between ViTs and CNNs in the context of image\nclassification. Our analysis meticulously scrutinizes state-of-the-art\nmethodologies employing both architectures, striving to identify the factors\ninfluencing their performance. These factors encompass dataset characteristics,\nimage dimensions, the number of target classes, hardware infrastructure, and\nthe specific architectures along with their respective top results. Our key\ngoal is to determine the most appropriate architecture between ViT and CNN for\nclassifying images in the Fashion MNIST dataset within the e-commerce industry,\nwhile taking into account specific conditions and needs. We highlight the\nimportance of combining these two architectures with different forms to enhance\noverall performance. By uniting these architectures, we can take advantage of\ntheir unique strengths, which may lead to more precise and reliable models for\ne-commerce applications. CNNs are skilled at recognizing local patterns, while\nViTs are effective at grasping overall context, making their combination a\npromising strategy for boosting image classification performance.\n","authors":["Sonia Bbouzidi","Ghazala Hcini","Imen Jdey","Fadoua Drira"],"pdf_url":"https://arxiv.org/pdf/2406.03478v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.03476v1","updated":"2024-06-05T17:29:15Z","published":"2024-06-05T17:29:15Z","title":"Does your data spark joy? Performance gains from domain upsampling at\n  the end of training","summary":"  Pretraining datasets for large language models (LLMs) have grown to trillions\nof tokens composed of large amounts of CommonCrawl (CC) web scrape along with\nsmaller, domain-specific datasets. It is expensive to understand the impact of\nthese domain-specific datasets on model capabilities as training at large FLOP\nscales is required to reveal significant changes to difficult and emergent\nbenchmarks. Given the increasing cost of experimenting with pretraining data,\nhow does one determine the optimal balance between the diversity in general web\nscrapes and the information density of domain specific data? In this work, we\nshow how to leverage the smaller domain specific datasets by upsampling them\nrelative to CC at the end of training to drive performance improvements on\ndifficult benchmarks. This simple technique allows us to improve up to 6.90 pp\non MMLU, 8.26 pp on GSM8K, and 6.17 pp on HumanEval relative to the base data\nmix for a 7B model trained for 1 trillion (T) tokens, thus rivaling Llama-2\n(7B)$\\unicode{x2014}$a model trained for twice as long. We experiment with\nablating the duration of domain upsampling from 5% to 30% of training and find\nthat 10% to 20% percent is optimal for navigating the tradeoff between general\nlanguage modeling capabilities and targeted benchmarks. We also use domain\nupsampling to characterize at scale the utility of individual datasets for\nimproving various benchmarks by removing them during this final phase of\ntraining. This tool opens up the ability to experiment with the impact of\ndifferent pretraining datasets at scale, but at an order of magnitude lower\ncost compared to full pretraining runs.\n","authors":["Cody Blakeney","Mansheej Paul","Brett W. Larsen","Sean Owen","Jonathan Frankle"],"pdf_url":"https://arxiv.org/pdf/2406.03476v1.pdf","comment":"The first three authors contributed equally"},{"id":"http://arxiv.org/abs/2406.03472v1","updated":"2024-06-05T17:25:29Z","published":"2024-06-05T17:25:29Z","title":"Solving Differential Equations using Physics-Informed Deep Equilibrium\n  Models","summary":"  This paper introduces Physics-Informed Deep Equilibrium Models (PIDEQs) for\nsolving initial value problems (IVPs) of ordinary differential equations\n(ODEs). Leveraging recent advancements in deep equilibrium models (DEQs) and\nphysics-informed neural networks (PINNs), PIDEQs combine the implicit output\nrepresentation of DEQs with physics-informed training techniques. We validate\nPIDEQs using the Van der Pol oscillator as a benchmark problem, demonstrating\ntheir efficiency and effectiveness in solving IVPs. Our analysis includes key\nhyperparameter considerations for optimizing PIDEQ performance. By bridging\ndeep learning and physics-based modeling, this work advances computational\ntechniques for solving IVPs, with implications for scientific computing and\nengineering applications.\n","authors":["Bruno Machado Pacheco","Eduardo Camponogara"],"pdf_url":"https://arxiv.org/pdf/2406.03472v1.pdf","comment":"Accepted at CASE 2024"},{"id":"http://arxiv.org/abs/2311.15983v2","updated":"2024-06-05T17:15:47Z","published":"2023-11-27T16:28:20Z","title":"SPIN: Sparsifying and Integrating Internal Neurons in Large Language\n  Models for Text Classification","summary":"  Among the many tasks that Large Language Models (LLMs) have revolutionized is\ntext classification. Current text classification paradigms, however, rely\nsolely on the output of the final layer in the LLM, with the rich information\ncontained in internal neurons largely untapped. In this study, we present SPIN:\na model-agnostic framework that sparsifies and integrates internal neurons of\nintermediate layers of LLMs for text classification. Specifically, SPIN\nsparsifies internal neurons by linear probing-based salient neuron selection\nlayer by layer, avoiding noise from unrelated neurons and ensuring efficiency.\nThe cross-layer salient neurons are then integrated to serve as multi-layered\nfeatures for the classification head. Extensive experimental results show our\nproposed SPIN significantly improves text classification accuracy, efficiency,\nand interpretability.\n","authors":["Difan Jiao","Yilun Liu","Zhenwei Tang","Daniel Matter","Jürgen Pfeffer","Ashton Anderson"],"pdf_url":"https://arxiv.org/pdf/2311.15983v2.pdf","comment":"17 pages, 7 figures, 12 tables Code available at\n  https://github.com/difanj0713/SPIN"},{"id":"http://arxiv.org/abs/2406.03464v1","updated":"2024-06-05T17:12:38Z","published":"2024-06-05T17:12:38Z","title":"Node-wise Filtering in Graph Neural Networks: A Mixture of Experts\n  Approach","summary":"  Graph Neural Networks (GNNs) have proven to be highly effective for node\nclassification tasks across diverse graph structural patterns. Traditionally,\nGNNs employ a uniform global filter, typically a low-pass filter for homophilic\ngraphs and a high-pass filter for heterophilic graphs. However, real-world\ngraphs often exhibit a complex mix of homophilic and heterophilic patterns,\nrendering a single global filter approach suboptimal. In this work, we\ntheoretically demonstrate that a global filter optimized for one pattern can\nadversely affect performance on nodes with differing patterns. To address this,\nwe introduce a novel GNN framework Node-MoE that utilizes a mixture of experts\nto adaptively select the appropriate filters for different nodes. Extensive\nexperiments demonstrate the effectiveness of Node-MoE on both homophilic and\nheterophilic graphs.\n","authors":["Haoyu Han","Juanhui Li","Wei Huang","Xianfeng Tang","Hanqing Lu","Chen Luo","Hui Liu","Jiliang Tang"],"pdf_url":"https://arxiv.org/pdf/2406.03464v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.03460v1","updated":"2024-06-05T17:07:39Z","published":"2024-06-05T17:07:39Z","title":"The PESQetarian: On the Relevance of Goodhart's Law for Speech\n  Enhancement","summary":"  To obtain improved speech enhancement models, researchers often focus on\nincreasing performance according to specific instrumental metrics. However,\nwhen the same metric is used in a loss function to optimize models, it may be\ndetrimental to aspects that the given metric does not see. The goal of this\npaper is to illustrate the risk of overfitting a speech enhancement model to\nthe metric used for evaluation. For this, we introduce enhancement models that\nexploit the widely used PESQ measure. Our \"PESQetarian\" model achieves 3.82\nPESQ on VB-DMD while scoring very poorly in a listening experiment. While the\nobtained PESQ value of 3.82 would imply \"state-of-the-art\" PESQ-performance on\nthe VB-DMD benchmark, our examples show that when optimizing w.r.t. a metric,\nan isolated evaluation on the same metric may be misleading. Instead, other\nmetrics should be included in the evaluation and the resulting performance\npredictions should be confirmed by listening.\n","authors":["Danilo de Oliveira","Simon Welker","Julius Richter","Timo Gerkmann"],"pdf_url":"https://arxiv.org/pdf/2406.03460v1.pdf","comment":"Accepted at Interspeech 2024"},{"id":"http://arxiv.org/abs/2405.20974v2","updated":"2024-06-05T17:04:01Z","published":"2024-05-31T16:21:16Z","title":"SaySelf: Teaching LLMs to Express Confidence with Self-Reflective\n  Rationales","summary":"  Large language models (LLMs) often generate inaccurate or fabricated\ninformation and generally fail to indicate their confidence, which limits their\nbroader applications. Previous work elicits confidence from LLMs by direct or\nself-consistency prompting, or constructing specific datasets for supervised\nfinetuning. The prompting-based approaches have inferior performance, and the\ntraining-based approaches are limited to binary or inaccurate group-level\nconfidence estimates. In this work, we present the advanced SaySelf, a training\nframework that teaches LLMs to express more accurate fine-grained confidence\nestimates. In addition, beyond the confidence scores, SaySelf initiates the\nprocess of directing LLMs to produce self-reflective rationales that clearly\nidentify gaps in their parametric knowledge and explain their uncertainty. This\nis achieved by using an LLM to automatically summarize the uncertainties in\nspecific knowledge via natural language. The summarization is based on the\nanalysis of the inconsistency in multiple sampled reasoning chains, and the\nresulting data is utilized for supervised fine-tuning. Moreover, we utilize\nreinforcement learning with a meticulously crafted reward function to calibrate\nthe confidence estimates, motivating LLMs to deliver accurate, high-confidence\npredictions and to penalize overconfidence in erroneous outputs. Experimental\nresults in both in-distribution and out-of-distribution datasets demonstrate\nthe effectiveness of SaySelf in reducing the confidence calibration error and\nmaintaining the task performance. We show that the generated self-reflective\nrationales are reasonable and can further contribute to the calibration. The\ncode is made public at https://github.com/xu1868/SaySelf.\n","authors":["Tianyang Xu","Shujin Wu","Shizhe Diao","Xiaoze Liu","Xingyao Wang","Yangyi Chen","Jing Gao"],"pdf_url":"https://arxiv.org/pdf/2405.20974v2.pdf","comment":"The code is available at https://github.com/xu1868/SaySelf"},{"id":"http://arxiv.org/abs/2406.03458v1","updated":"2024-06-05T17:03:47Z","published":"2024-06-05T17:03:47Z","title":"Distributional Adversarial Loss","summary":"  A major challenge in defending against adversarial attacks is the enormous\nspace of possible attacks that even a simple adversary might perform. To\naddress this, prior work has proposed a variety of defenses that effectively\nreduce the size of this space. These include randomized smoothing methods that\nadd noise to the input to take away some of the adversary's impact. Another\napproach is input discretization which limits the adversary's possible number\nof actions.\n  Motivated by these two approaches, we introduce a new notion of adversarial\nloss which we call distributional adversarial loss, to unify these two forms of\neffectively weakening an adversary. In this notion, we assume for each original\nexample, the allowed adversarial perturbation set is a family of distributions\n(e.g., induced by a smoothing procedure), and the adversarial loss over each\nexample is the maximum loss over all the associated distributions. The goal is\nto minimize the overall adversarial loss.\n  We show generalization guarantees for our notion of adversarial loss in terms\nof the VC-dimension of the hypothesis class and the size of the set of allowed\nadversarial distributions associated with each input. We also investigate the\nrole of randomness in achieving robustness against adversarial attacks in the\nmethods described above. We show a general derandomization technique that\npreserves the extent of a randomized classifier's robustness against\nadversarial attacks. We corroborate the procedure experimentally via\nderandomizing the Random Projection Filters framework of\n\\cite{dong2023adversarial}. Our procedure also improves the robustness of the\nmodel against various adversarial attacks.\n","authors":["Saba Ahmadi","Siddharth Bhandari","Avrim Blum","Chen Dan","Prabhav Jain"],"pdf_url":"https://arxiv.org/pdf/2406.03458v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.12535v2","updated":"2024-06-05T16:57:00Z","published":"2024-02-19T20:48:09Z","title":"Locality-Sensitive Hashing-Based Efficient Point Transformer with\n  Applications in High-Energy Physics","summary":"  This study introduces a novel transformer model optimized for large-scale\npoint cloud processing in scientific domains such as high-energy physics (HEP)\nand astrophysics. Addressing the limitations of graph neural networks and\nstandard transformers, our model integrates local inductive bias and achieves\nnear-linear complexity with hardware-friendly regular operations. One\ncontribution of this work is the quantitative analysis of the error-complexity\ntradeoff of various sparsification techniques for building efficient\ntransformers. Our findings highlight the superiority of using\nlocality-sensitive hashing (LSH), especially OR & AND-construction LSH, in\nkernel approximation for large-scale point cloud data with local inductive\nbias. Based on this finding, we propose LSH-based Efficient Point Transformer\n(HEPT), which combines E$^2$LSH with OR & AND constructions and is built upon\nregular computations. HEPT demonstrates remarkable performance on two critical\nyet time-consuming HEP tasks, significantly outperforming existing GNNs and\ntransformers in accuracy and computational speed, marking a significant\nadvancement in geometric deep learning and large-scale scientific data\nprocessing. Our code is available at https://github.com/Graph-COM/HEPT.\n","authors":["Siqi Miao","Zhiyuan Lu","Mia Liu","Javier Duarte","Pan Li"],"pdf_url":"https://arxiv.org/pdf/2402.12535v2.pdf","comment":"Accepted to ICML 2024 (Oral)"},{"id":"http://arxiv.org/abs/2405.07344v2","updated":"2024-06-05T16:46:11Z","published":"2024-05-12T17:40:48Z","title":"TKAN: Temporal Kolmogorov-Arnold Networks","summary":"  Recurrent Neural Networks (RNNs) have revolutionized many areas of machine\nlearning, particularly in natural language and data sequence processing. Long\nShort-Term Memory (LSTM) has demonstrated its ability to capture long-term\ndependencies in sequential data. Inspired by the Kolmogorov-Arnold Networks\n(KANs) a promising alternatives to Multi-Layer Perceptrons (MLPs), we proposed\na new neural networks architecture inspired by KAN and the LSTM, the Temporal\nKolomogorov-Arnold Networks (TKANs). TKANs combined the strenght of both\nnetworks, it is composed of Recurring Kolmogorov-Arnold Networks (RKANs) Layers\nembedding memory management. This innovation enables us to perform multi-step\ntime series forecasting with enhanced accuracy and efficiency. By addressing\nthe limitations of traditional models in handling complex sequential patterns,\nthe TKAN architecture offers significant potential for advancements in fields\nrequiring more than one step ahead forecasting.\n","authors":["Remi Genet","Hugo Inzirillo"],"pdf_url":"https://arxiv.org/pdf/2405.07344v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.03447v1","updated":"2024-06-05T16:44:06Z","published":"2024-06-05T16:44:06Z","title":"FILS: Self-Supervised Video Feature Prediction In Semantic Language\n  Space","summary":"  This paper demonstrates a self-supervised approach for learning semantic\nvideo representations. Recent vision studies show that a masking strategy for\nvision and natural language supervision has contributed to developing\ntransferable visual pretraining. Our goal is to achieve a more semantic video\nrepresentation by leveraging the text related to the video content during the\npretraining in a fully self-supervised manner. To this end, we present FILS, a\nnovel self-supervised video Feature prediction In semantic Language Space\n(FILS). The vision model can capture valuable structured information by\ncorrectly predicting masked feature semantics in language space. It is learned\nusing a patch-wise video-text contrastive strategy, in which the text\nrepresentations act as prototypes for transforming vision features into a\nlanguage space, which are then used as targets for semantically meaningful\nfeature prediction using our masked encoder-decoder structure. FILS\ndemonstrates remarkable transferability on downstream action recognition tasks,\nachieving state-of-the-art on challenging egocentric datasets, like\nEpic-Kitchens, Something-SomethingV2, Charades-Ego, and EGTEA, using ViT-Base.\nOur efficient method requires less computation and smaller batches compared to\nprevious works.\n","authors":["Mona Ahmadian","Frank Guerin","Andrew Gilbert"],"pdf_url":"https://arxiv.org/pdf/2406.03447v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.03445v1","updated":"2024-06-05T16:40:53Z","published":"2024-06-05T16:40:53Z","title":"Pre-trained Large Language Models Use Fourier Features to Compute\n  Addition","summary":"  Pre-trained large language models (LLMs) exhibit impressive mathematical\nreasoning capabilities, yet how they compute basic arithmetic, such as\naddition, remains unclear. This paper shows that pre-trained LLMs add numbers\nusing Fourier features -- dimensions in the hidden state that represent numbers\nvia a set of features sparse in the frequency domain. Within the model, MLP and\nattention layers use Fourier features in complementary ways: MLP layers\nprimarily approximate the magnitude of the answer using low-frequency features,\nwhile attention layers primarily perform modular addition (e.g., computing\nwhether the answer is even or odd) using high-frequency features. Pre-training\nis crucial for this mechanism: models trained from scratch to add numbers only\nexploit low-frequency features, leading to lower accuracy. Introducing\npre-trained token embeddings to a randomly initialized model rescues its\nperformance. Overall, our analysis demonstrates that appropriate pre-trained\nrepresentations (e.g., Fourier features) can unlock the ability of Transformers\nto learn precise mechanisms for algorithmic tasks.\n","authors":["Tianyi Zhou","Deqing Fu","Vatsal Sharan","Robin Jia"],"pdf_url":"https://arxiv.org/pdf/2406.03445v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.20790v2","updated":"2024-06-05T16:37:50Z","published":"2024-05-31T13:45:52Z","title":"Intersectional Unfairness Discovery","summary":"  AI systems have been shown to produce unfair results for certain subgroups of\npopulation, highlighting the need to understand bias on certain sensitive\nattributes. Current research often falls short, primarily focusing on the\nsubgroups characterized by a single sensitive attribute, while neglecting the\nnature of intersectional fairness of multiple sensitive attributes. This paper\nfocuses on its one fundamental aspect by discovering diverse high-bias\nsubgroups under intersectional sensitive attributes. Specifically, we propose a\nBias-Guided Generative Network (BGGN). By treating each bias value as a reward,\nBGGN efficiently generates high-bias intersectional sensitive attributes.\nExperiments on real-world text and image datasets demonstrate a diverse and\nefficient discovery of BGGN. To further evaluate the generated unseen but\npossible unfair intersectional sensitive attributes, we formulate them as\nprompts and use modern generative AI to produce new texts and images. The\nresults of frequently generating biased data provides new insights of\ndiscovering potential unfairness in popular modern generative AI systems.\nWarning: This paper contains generative examples that are offensive in nature.\n","authors":["Gezheng Xu","Qi Chen","Charles Ling","Boyu Wang","Changjian Shui"],"pdf_url":"https://arxiv.org/pdf/2405.20790v2.pdf","comment":"ICML-2024 camera-ready"},{"id":"http://arxiv.org/abs/2402.11168v3","updated":"2024-06-05T16:36:21Z","published":"2024-02-17T02:26:14Z","title":"Trust Regions for Explanations via Black-Box Probabilistic Certification","summary":"  Given the black box nature of machine learning models, a plethora of\nexplainability methods have been developed to decipher the factors behind\nindividual decisions. In this paper, we introduce a novel problem of black box\n(probabilistic) explanation certification. We ask the question: Given a black\nbox model with only query access, an explanation for an example and a quality\nmetric (viz. fidelity, stability), can we find the largest hypercube (i.e.,\n$\\ell_{\\infty}$ ball) centered at the example such that when the explanation is\napplied to all examples within the hypercube, (with high probability) a quality\ncriterion is met (viz. fidelity greater than some value)? Being able to\nefficiently find such a \\emph{trust region} has multiple benefits: i) insight\ninto model behavior in a \\emph{region}, with a \\emph{guarantee}; ii)\nascertained \\emph{stability} of the explanation; iii) \\emph{explanation reuse},\nwhich can save time, energy and money by not having to find explanations for\nevery example; and iv) a possible \\emph{meta-metric} to compare explanation\nmethods. Our contributions include formalizing this problem, proposing\nsolutions, providing theoretical guarantees for these solutions that are\ncomputable, and experimentally showing their efficacy on synthetic and real\ndata.\n","authors":["Amit Dhurandhar","Swagatam Haldar","Dennis Wei","Karthikeyan Natesan Ramamurthy"],"pdf_url":"https://arxiv.org/pdf/2402.11168v3.pdf","comment":"Accepted to ICML 2024"},{"id":"http://arxiv.org/abs/2405.21018v2","updated":"2024-06-05T16:35:49Z","published":"2024-05-31T17:07:15Z","title":"Improved Techniques for Optimization-Based Jailbreaking on Large\n  Language Models","summary":"  Large language models (LLMs) are being rapidly developed, and a key component\nof their widespread deployment is their safety-related alignment. Many\nred-teaming efforts aim to jailbreak LLMs, where among these efforts, the\nGreedy Coordinate Gradient (GCG) attack's success has led to a growing interest\nin the study of optimization-based jailbreaking techniques. Although GCG is a\nsignificant milestone, its attacking efficiency remains unsatisfactory. In this\npaper, we present several improved (empirical) techniques for\noptimization-based jailbreaks like GCG. We first observe that the single target\ntemplate of \"Sure\" largely limits the attacking performance of GCG; given this,\nwe propose to apply diverse target templates containing harmful self-suggestion\nand/or guidance to mislead LLMs. Besides, from the optimization aspects, we\npropose an automatic multi-coordinate updating strategy in GCG (i.e.,\nadaptively deciding how many tokens to replace in each step) to accelerate\nconvergence, as well as tricks like easy-to-hard initialisation. Then, we\ncombine these improved technologies to develop an efficient jailbreak method,\ndubbed I-GCG. In our experiments, we evaluate on a series of benchmarks (such\nas NeurIPS 2023 Red Teaming Track). The results demonstrate that our improved\ntechniques can help GCG outperform state-of-the-art jailbreaking attacks and\nachieve nearly 100% attack success rate. The code is released at\nhttps://github.com/jiaxiaojunQAQ/I-GCG.\n","authors":["Xiaojun Jia","Tianyu Pang","Chao Du","Yihao Huang","Jindong Gu","Yang Liu","Xiaochun Cao","Min Lin"],"pdf_url":"https://arxiv.org/pdf/2405.21018v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.03441v1","updated":"2024-06-05T16:35:30Z","published":"2024-06-05T16:35:30Z","title":"Cycles of Thought: Measuring LLM Confidence through Stable Explanations","summary":"  In many high-risk machine learning applications it is essential for a model\nto indicate when it is uncertain about a prediction. While large language\nmodels (LLMs) can reach and even surpass human-level accuracy on a variety of\nbenchmarks, their overconfidence in incorrect responses is still a\nwell-documented failure mode. Traditional methods for ML uncertainty\nquantification can be difficult to directly adapt to LLMs due to the\ncomputational cost of implementation and closed-source nature of many models. A\nvariety of black-box methods have recently been proposed, but these often rely\non heuristics such as self-verbalized confidence. We instead propose a\nframework for measuring an LLM's uncertainty with respect to the distribution\nof generated explanations for an answer. While utilizing explanations is not a\nnew idea in and of itself, by interpreting each possible model+explanation pair\nas a test-time classifier we can calculate a posterior answer distribution over\nthe most likely of these classifiers. We demonstrate how a specific instance of\nthis framework using explanation entailment as our classifier likelihood\nimproves confidence score metrics (in particular AURC and AUROC) over baselines\nacross five different datasets. We believe these results indicate that our\nframework is both a well-principled and effective way of quantifying\nuncertainty in LLMs.\n","authors":["Evan Becker","Stefano Soatto"],"pdf_url":"https://arxiv.org/pdf/2406.03441v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.03437v1","updated":"2024-06-05T16:33:30Z","published":"2024-06-05T16:33:30Z","title":"Transfer Learning for Latent Variable Network Models","summary":"  We study transfer learning for estimation in latent variable network models.\nIn our setting, the conditional edge probability matrices given the latent\nvariables are represented by $P$ for the source and $Q$ for the target. We wish\nto estimate $Q$ given two kinds of data: (1) edge data from a subgraph induced\nby an $o(1)$ fraction of the nodes of $Q$, and (2) edge data from all of $P$.\nIf the source $P$ has no relation to the target $Q$, the estimation error must\nbe $\\Omega(1)$. However, we show that if the latent variables are shared, then\nvanishing error is possible. We give an efficient algorithm that utilizes the\nordering of a suitably defined graph distance. Our algorithm achieves $o(1)$\nerror and does not assume a parametric form on the source or target networks.\nNext, for the specific case of Stochastic Block Models we prove a minimax lower\nbound and show that a simple algorithm achieves this rate. Finally, we\nempirically demonstrate our algorithm's use on real-world and simulated graph\ntransfer problems.\n","authors":["Akhil Jalan","Arya Mazumdar","Soumendu Sundar Mukherjee","Purnamrita Sarkar"],"pdf_url":"https://arxiv.org/pdf/2406.03437v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.14759v2","updated":"2024-06-05T16:32:31Z","published":"2024-05-23T16:29:30Z","title":"Fault Tolerant ML: Efficient Meta-Aggregation and Synchronous Training","summary":"  In this paper, we investigate the challenging framework of Byzantine-robust\ntraining in distributed machine learning (ML) systems, focusing on enhancing\nboth efficiency and practicality. As distributed ML systems become integral for\ncomplex ML tasks, ensuring resilience against Byzantine failures-where workers\nmay contribute incorrect updates due to malice or error-gains paramount\nimportance. Our first contribution is the introduction of the Centered Trimmed\nMeta Aggregator (CTMA), an efficient meta-aggregator that upgrades baseline\naggregators to optimal performance levels, while requiring low computational\ndemands. Additionally, we propose harnessing a recently developed gradient\nestimation technique based on a double-momentum strategy within the Byzantine\ncontext. Our paper highlights its theoretical and practical advantages for\nByzantine-robust training, especially in simplifying the tuning process and\nreducing the reliance on numerous hyperparameters. The effectiveness of this\ntechnique is supported by theoretical insights within the stochastic convex\noptimization (SCO) framework and corroborated by empirical evidence.\n","authors":["Tehila Dahan","Kfir Y. Levy"],"pdf_url":"https://arxiv.org/pdf/2405.14759v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.02486v2","updated":"2024-06-05T16:32:16Z","published":"2024-06-04T16:55:42Z","title":"A Temporal Kolmogorov-Arnold Transformer for Time Series Forecasting","summary":"  Capturing complex temporal patterns and relationships within multivariate\ndata streams is a difficult task. We propose the Temporal Kolmogorov-Arnold\nTransformer (TKAT), a novel attention-based architecture designed to address\nthis task using Temporal Kolmogorov-Arnold Networks (TKANs). Inspired by the\nTemporal Fusion Transformer (TFT), TKAT emerges as a powerful encoder-decoder\nmodel tailored to handle tasks in which the observed part of the features is\nmore important than the a priori known part. This new architecture combined the\ntheoretical foundation of the Kolmogorov-Arnold representation with the power\nof transformers. TKAT aims to simplify the complex dependencies inherent in\ntime series, making them more \"interpretable\". The use of transformer\narchitecture in this framework allows us to capture long-range dependencies\nthrough self-attention mechanisms.\n","authors":["Remi Genet","Hugo Inzirillo"],"pdf_url":"https://arxiv.org/pdf/2406.02486v2.pdf","comment":"arXiv admin note: text overlap with arXiv:2405.07344"},{"id":"http://arxiv.org/abs/2406.03434v1","updated":"2024-06-05T16:32:14Z","published":"2024-06-05T16:32:14Z","title":"Unified PAC-Bayesian Study of Pessimism for Offline Policy Learning with\n  Regularized Importance Sampling","summary":"  Off-policy learning (OPL) often involves minimizing a risk estimator based on\nimportance weighting to correct bias from the logging policy used to collect\ndata. However, this method can produce an estimator with a high variance. A\ncommon solution is to regularize the importance weights and learn the policy by\nminimizing an estimator with penalties derived from generalization bounds\nspecific to the estimator. This approach, known as pessimism, has gained recent\nattention but lacks a unified framework for analysis. To address this gap, we\nintroduce a comprehensive PAC-Bayesian framework to examine pessimism with\nregularized importance weighting. We derive a tractable PAC-Bayesian\ngeneralization bound that universally applies to common importance weight\nregularizations, enabling their comparison within a single framework. Our\nempirical results challenge common understanding, demonstrating the\neffectiveness of standard IW regularization techniques.\n","authors":["Imad Aouali","Victor-Emmanuel Brunel","David Rohde","Anna Korba"],"pdf_url":"https://arxiv.org/pdf/2406.03434v1.pdf","comment":"Accepted at UAI 2024"},{"id":"http://arxiv.org/abs/2406.03428v1","updated":"2024-06-05T16:25:57Z","published":"2024-06-05T16:25:57Z","title":"HelloFresh: LLM Evaluations on Streams of Real-World Human Editorial\n  Actions across X Community Notes and Wikipedia edits","summary":"  Benchmarks have been essential for driving progress in machine learning. A\nbetter understanding of LLM capabilities on real world tasks is vital for safe\ndevelopment. Designing adequate LLM benchmarks is challenging: Data from\nreal-world tasks is hard to collect, public availability of static evaluation\ndata results in test data contamination and benchmark overfitting, and\nperiodically generating new evaluation data is tedious and may result in\ntemporally inconsistent results. We introduce HelloFresh, based on continuous\nstreams of real-world data generated by intrinsically motivated human labelers.\nIt covers recent events from X (formerly Twitter) community notes and edits of\nWikipedia pages, mitigating the risk of test data contamination and benchmark\noverfitting. Any X user can propose an X note to add additional context to a\nmisleading post (formerly tweet); if the community classifies it as helpful, it\nis shown with the post. Similarly, Wikipedia relies on community-based\nconsensus, allowing users to edit articles or revert edits made by other users.\nVerifying whether an X note is helpful or whether a Wikipedia edit should be\naccepted are hard tasks that require grounding by querying the web. We backtest\nstate-of-the-art LLMs supplemented with simple web search access and find that\nHelloFresh yields a temporally consistent ranking. To enable continuous\nevaluation on HelloFresh, we host a public leaderboard and periodically updated\nevaluation data at https://tinyurl.com/hello-fresh-LLM.\n","authors":["Tim Franzmeyer","Aleksandar Shtedritski","Samuel Albanie","Philip Torr","João F. Henriques","Jakob N. Foerster"],"pdf_url":"https://arxiv.org/pdf/2406.03428v1.pdf","comment":"ACL 2024 Findings"},{"id":"http://arxiv.org/abs/2310.00344v3","updated":"2024-06-05T16:21:40Z","published":"2023-09-30T11:38:13Z","title":"HarmonyDream: Task Harmonization Inside World Models","summary":"  Model-based reinforcement learning (MBRL) holds the promise of\nsample-efficient learning by utilizing a world model, which models how the\nenvironment works and typically encompasses components for two tasks:\nobservation modeling and reward modeling. In this paper, through a dedicated\nempirical investigation, we gain a deeper understanding of the role each task\nplays in world models and uncover the overlooked potential of sample-efficient\nMBRL by mitigating the domination of either observation or reward modeling. Our\nkey insight is that while prevalent approaches of explicit MBRL attempt to\nrestore abundant details of the environment via observation models, it is\ndifficult due to the environment's complexity and limited model capacity. On\nthe other hand, reward models, while dominating implicit MBRL and adept at\nlearning compact task-centric dynamics, are inadequate for sample-efficient\nlearning without richer learning signals. Motivated by these insights and\ndiscoveries, we propose a simple yet effective approach, HarmonyDream, which\nautomatically adjusts loss coefficients to maintain task harmonization, i.e. a\ndynamic equilibrium between the two tasks in world model learning. Our\nexperiments show that the base MBRL method equipped with HarmonyDream gains\n10%-69% absolute performance boosts on visual robotic tasks and sets a new\nstate-of-the-art result on the Atari 100K benchmark. Code is available at\nhttps://github.com/thuml/HarmonyDream.\n","authors":["Haoyu Ma","Jialong Wu","Ningya Feng","Chenjun Xiao","Dong Li","Jianye Hao","Jianmin Wang","Mingsheng Long"],"pdf_url":"https://arxiv.org/pdf/2310.00344v3.pdf","comment":"ICML 2024. Code is available at https://github.com/thuml/HarmonyDream"},{"id":"http://arxiv.org/abs/2401.01145v4","updated":"2024-06-05T16:17:40Z","published":"2024-01-02T10:55:01Z","title":"HAAQI-Net: A Non-intrusive Neural Music Audio Quality Assessment Model\n  for Hearing Aids","summary":"  This paper introduces HAAQI-Net, a non-intrusive deep learning model for\nmusic audio quality assessment tailored for hearing aid users. Unlike\ntraditional methods like the Hearing Aid Audio Quality Index (HAAQI), which\nrely on intrusive comparisons to a reference signal, HAAQI-Net offers a more\naccessible and efficient alternative. Using a bidirectional Long Short-Term\nMemory (BLSTM) architecture with attention mechanisms and features from the\npre-trained BEATs model, HAAQI-Net predicts HAAQI scores directly from music\naudio clips and hearing loss patterns. Results show HAAQI-Net's effectiveness,\nwith predicted scores achieving a Linear Correlation Coefficient (LCC) of\n0.9368, a Spearman's Rank Correlation Coefficient (SRCC) of 0.9486, and a Mean\nSquared Error (MSE) of 0.0064, reducing inference time from 62.52 seconds to\n2.54 seconds. Although effective, feature extraction via the large BEATs model\nincurs computational overhead. To address this, a knowledge distillation\nstrategy creates a student distillBEATs model, distilling information from the\nteacher BEATs model during HAAQI-Net training, reducing required parameters.\nThe distilled HAAQI-Net maintains strong performance with an LCC of 0.9071, an\nSRCC of 0.9307, and an MSE of 0.0091, while reducing parameters by 75.85% and\ninference time by 96.46%. This reduction enhances HAAQI-Net's efficiency and\nscalability, making it viable for real-world music audio quality assessment in\nhearing aid settings. This work also opens avenues for further research into\noptimizing deep learning models for specific applications, contributing to\naudio signal processing and quality assessment by providing insights into\ndeveloping efficient and accurate models for practical applications in hearing\naid technology.\n","authors":["Dyah A. M. G. Wisnu","Stefano Rini","Ryandhimas E. Zezario","Hsin-Min Wang","Yu Tsao"],"pdf_url":"https://arxiv.org/pdf/2401.01145v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.01306v3","updated":"2024-06-05T16:07:13Z","published":"2024-02-28T22:21:47Z","title":"NeuroPrune: A Neuro-inspired Topological Sparse Training Algorithm for\n  Large Language Models","summary":"  Transformer-based Language Models have become ubiquitous in Natural Language\nProcessing (NLP) due to their impressive performance on various tasks. However,\nexpensive training as well as inference remains a significant impediment to\ntheir widespread applicability. While enforcing sparsity at various levels of\nthe model architecture has found promise in addressing scaling and efficiency\nissues, there remains a disconnect between how sparsity affects network\ntopology. Inspired by brain neuronal networks, we explore sparsity approaches\nthrough the lens of network topology. Specifically, we exploit mechanisms seen\nin biological networks, such as preferential attachment and redundant synapse\npruning, and show that principled, model-agnostic sparsity approaches are\nperformant and efficient across diverse NLP tasks, spanning both classification\n(such as natural language inference) and generation (summarization, machine\ntranslation), despite our sole objective not being optimizing performance.\nNeuroPrune is competitive with (or sometimes superior to) baselines on\nperformance and can be up to $10$x faster in terms of training time for a given\nlevel of sparsity, simultaneously exhibiting measurable improvements in\ninference time in many cases.\n","authors":["Amit Dhurandhar","Tejaswini Pedapati","Ronny Luss","Soham Dan","Aurelie Lozano","Payel Das","Georgios Kollias"],"pdf_url":"https://arxiv.org/pdf/2404.01306v3.pdf","comment":"Accepted at ACL 2024"},{"id":"http://arxiv.org/abs/2401.04612v2","updated":"2024-06-05T16:06:27Z","published":"2024-01-09T15:28:29Z","title":"Distribution-Free Conformal Joint Prediction Regions for Neural Marked\n  Temporal Point Processes","summary":"  Sequences of labeled events observed at irregular intervals in continuous\ntime are ubiquitous across various fields. Temporal Point Processes (TPPs)\nprovide a mathematical framework for modeling these sequences, enabling\ninferences such as predicting the arrival time of future events and their\nassociated label, called mark. However, due to model misspecification or lack\nof training data, these probabilistic models may provide a poor approximation\nof the true, unknown underlying process, with prediction regions extracted from\nthem being unreliable estimates of the underlying uncertainty. This paper\ndevelops more reliable methods for uncertainty quantification in neural TPP\nmodels via the framework of conformal prediction. A primary objective is to\ngenerate a distribution-free joint prediction region for an event's arrival\ntime and mark, with a finite-sample marginal coverage guarantee. A key\nchallenge is to handle both a strictly positive, continuous response and a\ncategorical response, without distributional assumptions. We first consider a\nsimple but conservative approach that combines individual prediction regions\nfor the event's arrival time and mark. Then, we introduce a more effective\nmethod based on bivariate highest density regions derived from the joint\npredictive density of arrival times and marks. By leveraging the dependencies\nbetween these two variables, this method excludes unlikely combinations of the\ntwo, resulting in sharper prediction regions while still attaining the\npre-specified coverage level. We also explore the generation of individual\nunivariate prediction regions for events' arrival times and marks through\nconformal regression and classification techniques. Moreover, we evaluate the\nstronger notion of conditional coverage. Finally, through extensive\nexperimentation on both simulated and real-world datasets, we assess the\nvalidity and efficiency of these methods.\n","authors":["Victor Dheur","Tanguy Bosser","Rafael Izbicki","Souhaib Ben Taieb"],"pdf_url":"https://arxiv.org/pdf/2401.04612v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.10635v2","updated":"2024-06-05T16:04:47Z","published":"2024-03-26T15:36:47Z","title":"Compressed Federated Reinforcement Learning with a Generative Model","summary":"  Reinforcement learning has recently gained unprecedented popularity, yet it\nstill grapples with sample inefficiency. Addressing this challenge, federated\nreinforcement learning (FedRL) has emerged, wherein agents collaboratively\nlearn a single policy by aggregating local estimations. However, this\naggregation step incurs significant communication costs. In this paper, we\npropose CompFedRL, a communication-efficient FedRL approach incorporating both\n\\textit{periodic aggregation} and (direct/error-feedback) compression\nmechanisms. Specifically, we consider compressed federated $Q$-learning with a\ngenerative model setup, where a central server learns an optimal $Q$-function\nby periodically aggregating compressed $Q$-estimates from local agents. For the\nfirst time, we characterize the impact of these two mechanisms (which have\nremained elusive) by providing a finite-time analysis of our algorithm,\ndemonstrating strong convergence behaviors when utilizing either direct or\nerror-feedback compression. Our bounds indicate improved solution accuracy\nconcerning the number of agents and other federated hyperparameters while\nsimultaneously reducing communication costs. To corroborate our theory, we also\nconduct in-depth numerical experiments to verify our findings, considering\nTop-$K$ and Sparsified-$K$ sparsification operators.\n","authors":["Ali Beikmohammadi","Sarit Khirirat","Sindri Magnússon"],"pdf_url":"https://arxiv.org/pdf/2404.10635v2.pdf","comment":"European Conference on Machine Learning and Principles and Practice\n  of Knowledge Discovery in Databases (ECML-PKDD 2024)"},{"id":"http://arxiv.org/abs/2402.01922v3","updated":"2024-06-05T16:03:55Z","published":"2024-02-02T21:48:50Z","title":"A General Framework for Learning from Weak Supervision","summary":"  Weakly supervised learning generally faces challenges in applicability to\nvarious scenarios with diverse weak supervision and in scalability due to the\ncomplexity of existing algorithms, thereby hindering the practical deployment.\nThis paper introduces a general framework for learning from weak supervision\n(GLWS) with a novel algorithm. Central to GLWS is an Expectation-Maximization\n(EM) formulation, adeptly accommodating various weak supervision sources,\nincluding instance partial labels, aggregate statistics, pairwise observations,\nand unlabeled data. We further present an advanced algorithm that significantly\nsimplifies the EM computational demands using a Non-deterministic Finite\nAutomaton (NFA) along with a forward-backward algorithm, which effectively\nreduces time complexity from quadratic or factorial often required in existing\nsolutions to linear scale. The problem of learning from arbitrary weak\nsupervision is therefore converted to the NFA modeling of them. GLWS not only\nenhances the scalability of machine learning models but also demonstrates\nsuperior performance and versatility across 11 weak supervision scenarios. We\nhope our work paves the way for further advancements and practical deployment\nin this field.\n","authors":["Hao Chen","Jindong Wang","Lei Feng","Xiang Li","Yidong Wang","Xing Xie","Masashi Sugiyama","Rita Singh","Bhiksha Raj"],"pdf_url":"https://arxiv.org/pdf/2402.01922v3.pdf","comment":"24 pages, 20 tables, 9 figures"},{"id":"http://arxiv.org/abs/2310.04598v2","updated":"2024-06-05T15:56:54Z","published":"2023-10-06T21:31:17Z","title":"A Neuro-Symbolic Framework for Answering Graph Pattern Queries in\n  Knowledge Graphs","summary":"  The challenge of answering graph queries over incomplete knowledge graphs is\ngaining significant attention in the machine learning community. Neuro-symbolic\nmodels have emerged as a promising approach, combining good performance with\nhigh interpretability. These models utilize trained architectures to execute\natomic queries and integrate modules that mimic symbolic query operators.\nHowever, most neuro-symbolic query processors are constrained to tree-like\ngraph pattern queries. These queries admit a bottom-up execution with constant\nvalues or anchors at the leaves and the target variable at the root. While\nexpressive, tree-like queries fail to capture critical properties in knowledge\ngraphs, such as the existence of multiple edges between entities or the\npresence of triangles. We introduce a framework for answering arbitrary graph\npattern queries over incomplete knowledge graphs, encompassing both cyclic\nqueries and tree-like queries with existentially quantified leaves. These\nclasses of queries are vital for practical applications but are beyond the\nscope of most current neuro-symbolic models. Our approach employs an\napproximation scheme that facilitates acyclic traversals for cyclic patterns,\nthereby embedding additional symbolic bias into the query execution process.\nOur experimental evaluation demonstrates that our framework performs\ncompetitively on three datasets, effectively handling cyclic queries through\nour approximation strategy. Additionally, it maintains the performance of\nexisting neuro-symbolic models on anchored tree-like queries and extends their\ncapabilities to queries with existentially quantified variables.\n","authors":["Tamara Cucumides","Daniel Daza","Pablo Barceló","Michael Cochez","Floris Geerts","Juan L Reutter","Miguel Romero"],"pdf_url":"https://arxiv.org/pdf/2310.04598v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.14860v3","updated":"2024-06-05T15:56:49Z","published":"2024-02-21T00:49:43Z","title":"Ranking Large Language Models without Ground Truth","summary":"  Evaluation and ranking of large language models (LLMs) has become an\nimportant problem with the proliferation of these models and their impact.\nEvaluation methods either require human responses which are expensive to\nacquire or use pairs of LLMs to evaluate each other which can be unreliable. In\nthis paper, we provide a novel perspective where, given a dataset of prompts\n(viz. questions, instructions, etc.) and a set of LLMs, we rank them without\naccess to any ground truth or reference responses. Inspired by real life where\nboth an expert and a knowledgeable person can identify a novice our main idea\nis to consider triplets of models, where each one of them evaluates the other\ntwo, correctly identifying the worst model in the triplet with high\nprobability. We also analyze our idea and provide sufficient conditions for it\nto succeed. Applying this idea repeatedly, we propose two methods to rank LLMs.\nIn experiments on different generative tasks (summarization, multiple-choice,\nand dialog), our methods reliably recover close to true rankings without\nreference data. This points to a viable low-resource mechanism for practical\nuse.\n","authors":["Amit Dhurandhar","Rahul Nair","Moninder Singh","Elizabeth Daly","Karthikeyan Natesan Ramamurthy"],"pdf_url":"https://arxiv.org/pdf/2402.14860v3.pdf","comment":"Accepted to ACL 2024"},{"id":"http://arxiv.org/abs/2406.03398v1","updated":"2024-06-05T15:55:08Z","published":"2024-06-05T15:55:08Z","title":"Methods for Class-Imbalanced Learning with Support Vector Machines: A\n  Review and an Empirical Evaluation","summary":"  This paper presents a review on methods for class-imbalanced learning with\nthe Support Vector Machine (SVM) and its variants. We first explain the\nstructure of SVM and its variants and discuss their inefficiency in learning\nwith class-imbalanced data sets. We introduce a hierarchical categorization of\nSVM-based models with respect to class-imbalanced learning. Specifically, we\ncategorize SVM-based models into re-sampling, algorithmic, and fusion methods,\nand discuss the principles of the representative models in each category. In\naddition, we conduct a series of empirical evaluations to compare the\nperformances of various representative SVM-based models in each category using\nbenchmark imbalanced data sets, ranging from low to high imbalanced ratios. Our\nfindings reveal that while algorithmic methods are less time-consuming owing to\nno data pre-processing requirements, fusion methods, which combine both\nre-sampling and algorithmic approaches, generally perform the best, but with a\nhigher computational load. A discussion on research gaps and future research\ndirections is provided.\n","authors":["Salim rezvani","Farhad Pourpanah","Chee Peng Lim","Q. M. Jonathan Wu"],"pdf_url":"https://arxiv.org/pdf/2406.03398v1.pdf","comment":"Accepted in Soft Computing"},{"id":"http://arxiv.org/abs/2406.03396v1","updated":"2024-06-05T15:53:25Z","published":"2024-06-05T15:53:25Z","title":"Noisy Data Visualization using Functional Data Analysis","summary":"  Data visualization via dimensionality reduction is an important tool in\nexploratory data analysis. However, when the data are noisy, many existing\nmethods fail to capture the underlying structure of the data. The method called\nEmpirical Intrinsic Geometry (EIG) was previously proposed for performing\ndimensionality reduction on high dimensional dynamical processes while\ntheoretically eliminating all noise. However, implementing EIG in practice\nrequires the construction of high-dimensional histograms, which suffer from the\ncurse of dimensionality. Here we propose a new data visualization method called\nFunctional Information Geometry (FIG) for dynamical processes that adapts the\nEIG framework while using approaches from functional data analysis to mitigate\nthe curse of dimensionality. We experimentally demonstrate that the resulting\nmethod outperforms a variant of EIG designed for visualization in terms of\ncapturing the true structure, hyperparameter robustness, and computational\nspeed. We then use our method to visualize EEG brain measurements of sleep\nactivity.\n","authors":["Haozhe Chen","Andres Felipe Duque Correa","Guy Wolf","Kevin R. Moon"],"pdf_url":"https://arxiv.org/pdf/2406.03396v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.01873v2","updated":"2024-06-05T15:53:01Z","published":"2024-06-04T01:02:22Z","title":"CR-UTP: Certified Robustness against Universal Text Perturbations on\n  Large Language Models","summary":"  It is imperative to ensure the stability of every prediction made by a\nlanguage model; that is, a language's prediction should remain consistent\ndespite minor input variations, like word substitutions. In this paper, we\ninvestigate the problem of certifying a language model's robustness against\nUniversal Text Perturbations (UTPs), which have been widely used in universal\nadversarial attacks and backdoor attacks. Existing certified robustness based\non random smoothing has shown considerable promise in certifying the\ninput-specific text perturbations (ISTPs), operating under the assumption that\nany random alteration of a sample's clean or adversarial words would negate the\nimpact of sample-wise perturbations. However, with UTPs, masking only the\nadversarial words can eliminate the attack. A naive method is to simply\nincrease the masking ratio and the likelihood of masking attack tokens, but it\nleads to a significant reduction in both certified accuracy and the certified\nradius due to input corruption by extensive masking. To solve this challenge,\nwe introduce a novel approach, the superior prompt search method, designed to\nidentify a superior prompt that maintains higher certified accuracy under\nextensive masking. Additionally, we theoretically motivate why ensembles are a\nparticularly suitable choice as base prompts for random smoothing. The method\nis denoted by superior prompt ensembling technique. We also empirically confirm\nthis technique, obtaining state-of-the-art results in multiple settings. These\nmethodologies, for the first time, enable high certified accuracy against both\nUTPs and ISTPs. The source code of CR-UTP is available at \\url\n{https://github.com/UCFML-Research/CR-UTP}.\n","authors":["Qian Lou","Xin Liang","Jiaqi Xue","Yancheng Zhang","Rui Xie","Mengxin Zheng"],"pdf_url":"https://arxiv.org/pdf/2406.01873v2.pdf","comment":"Accepted by ACL Findings 2024"},{"id":"http://arxiv.org/abs/2405.06627v3","updated":"2024-06-05T15:49:11Z","published":"2024-05-10T17:40:24Z","title":"Conformal Validity Guarantees Exist for Any Data Distribution (and How\n  to Find Them)","summary":"  As artificial intelligence (AI) / machine learning (ML) gain widespread\nadoption, practitioners are increasingly seeking means to quantify and control\nthe risk these systems incur. This challenge is especially salient when such\nsystems have autonomy to collect their own data, such as in black-box\noptimization and active learning, where their actions induce sequential\nfeedback-loop shifts in the data distribution. Conformal prediction is a\npromising approach to uncertainty and risk quantification, but prior variants'\nvalidity guarantees have assumed some form of ``quasi-exchangeability'' on the\ndata distribution, thereby excluding many types of sequential shifts. In this\npaper we prove that conformal prediction can theoretically be extended to\n\\textit{any} joint data distribution, not just exchangeable or\nquasi-exchangeable ones. Although the most general case is exceedingly\nimpractical to compute, for concrete practical applications we outline a\nprocedure for deriving specific conformal algorithms for any data distribution,\nand we use this procedure to derive tractable algorithms for a series of\nAI/ML-agent-induced covariate shifts. We evaluate the proposed algorithms\nempirically on synthetic black-box optimization and active learning tasks.\n","authors":["Drew Prinster","Samuel Stanton","Anqi Liu","Suchi Saria"],"pdf_url":"https://arxiv.org/pdf/2405.06627v3.pdf","comment":"ICML 2024. Code available at\n  https://github.com/drewprinster/conformal-mfcs"},{"id":"http://arxiv.org/abs/2306.06098v5","updated":"2024-06-05T15:45:58Z","published":"2023-06-09T17:58:47Z","title":"Error Feedback Can Accurately Compress Preconditioners","summary":"  Leveraging second-order information about the loss at the scale of deep\nnetworks is one of the main lines of approach for improving the performance of\ncurrent optimizers for deep learning. Yet, existing approaches for accurate\nfull-matrix preconditioning, such as Full-Matrix Adagrad (GGT) or Matrix-Free\nApproximate Curvature (M-FAC) suffer from massive storage costs when applied\neven to small-scale models, as they must store a sliding window of gradients,\nwhose memory requirements are multiplicative in the model dimension. In this\npaper, we address this issue via a novel and efficient error-feedback technique\nthat can be applied to compress preconditioners by up to two orders of\nmagnitude in practice, without loss of convergence. Specifically, our approach\ncompresses the gradient information via sparsification or low-rank compression\n\\emph{before} it is fed into the preconditioner, feeding the compression error\nback into future iterations. Experiments on deep neural networks show that this\napproach can compress full-matrix preconditioners to up to 99\\% sparsity\nwithout accuracy loss, effectively removing the memory overhead of full-matrix\npreconditioners such as GGT and M-FAC. Our code is available at\n\\url{https://github.com/IST-DASLab/EFCP}.\n","authors":["Ionut-Vlad Modoranu","Aleksei Kalinov","Eldar Kurtic","Elias Frantar","Dan Alistarh"],"pdf_url":"https://arxiv.org/pdf/2306.06098v5.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.03390v1","updated":"2024-06-05T15:41:02Z","published":"2024-06-05T15:41:02Z","title":"Author, Content or Sharers? Estimating Spread Dynamics with Bayesian\n  Mixture Hawkes","summary":"  The spread of content on social media is shaped by intertwining factors on\nthree levels: the source, the content itself, and the pathways of content\nspread. At the lowest level, the popularity of the sharing user determines its\neventual reach. However, higher-level factors such as the nature of the online\nitem and the credibility of its source also play crucial roles in determining\nhow widely and rapidly the online item spreads. In this work, we propose the\nBayesian Mixture Hawkes (BMH) model to jointly learn the influence of source,\ncontent and spread. We formulate the BMH model as a hierarchical mixture model\nof separable Hawkes processes, accommodating different classes of Hawkes\ndynamics and the influence of feature sets on these classes. We test the BMH\nmodel on two learning tasks, cold-start popularity prediction and temporal\nprofile generalization performance, applying to two real-world retweet cascade\ndatasets referencing articles from controversial and traditional media\npublishers. The BMH model outperforms the state-of-the-art models and\npredictive baselines on both datasets and utilizes cascade- and item-level\ninformation better than the alternatives. Lastly, we perform a counter-factual\nanalysis where we apply the trained publisher-level BMH models to a set of\narticle headlines and show that effectiveness of headline writing style\n(neutral, clickbait, inflammatory) varies across publishers. The BMH model\nunveils differences in style effectiveness between controversial and reputable\npublishers, where we find clickbait to be notably more effective for reputable\npublishers as opposed to controversial ones, which links to the latter's\noveruse of clickbait.\n","authors":["Pio Calderon","Marian-Andrei Rizoiu"],"pdf_url":"https://arxiv.org/pdf/2406.03390v1.pdf","comment":"accepted in the European Conference on Machine Learning and\n  Principles and Practice of Knowledge Discovery in Databases (ECML-PKDD) 2024"},{"id":"http://arxiv.org/abs/2406.03386v1","updated":"2024-06-05T15:36:57Z","published":"2024-06-05T15:36:57Z","title":"Learning Long Range Dependencies on Graphs via Random Walks","summary":"  Message-passing graph neural networks (GNNs), while excelling at capturing\nlocal relationships, often struggle with long-range dependencies on graphs.\nConversely, graph transformers (GTs) enable information exchange between all\nnodes but oversimplify the graph structure by treating them as a set of\nfixed-length vectors. This work proposes a novel architecture, NeuralWalker,\nthat overcomes the limitations of both methods by combining random walks with\nmessage passing. NeuralWalker achieves this by treating random walks as\nsequences, allowing for the application of recent advances in sequence models\nin order to capture long-range dependencies within these walks. Based on this\nconcept, we propose a framework that offers (1) more expressive graph\nrepresentations through random walk sequences, (2) the ability to utilize any\nsequence model for capturing long-range dependencies, and (3) the flexibility\nby integrating various GNN and GT architectures. Our experimental evaluations\ndemonstrate that NeuralWalker achieves significant performance improvements on\n19 graph and node benchmark datasets, notably outperforming existing methods by\nup to 13% on the PascalVoc-SP and COCO-SP datasets. Code is available at\nhttps://github.com/BorgwardtLab/NeuralWalker.\n","authors":["Dexiong Chen","Till Hendrik Schulz","Karsten Borgwardt"],"pdf_url":"https://arxiv.org/pdf/2406.03386v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.11244v3","updated":"2024-06-05T15:33:25Z","published":"2023-10-17T13:12:32Z","title":"Entity Matching using Large Language Models","summary":"  Entity Matching is the task of deciding whether two entity descriptions refer\nto the same real-world entity and is a central step in most data integration\npipelines. Many state-of-the-art entity matching methods rely on pre-trained\nlanguage models (PLMs) such as BERT or RoBERTa. Two major drawbacks of these\nmodels for entity matching are that (i) the models require significant amounts\nof task-specific training data and (ii) the fine-tuned models are not robust\nconcerning out-of-distribution entities. This paper investigates using\ngenerative large language models (LLMs) as a less task-specific training\ndata-dependent and more robust alternative to PLM-based matchers. Our study\ncovers hosted and open-source LLMs, which can be run locally. We evaluate these\nmodels in a zero-shot scenario and a scenario where task-specific training data\nis available. We compare different prompt designs and the prompt sensitivity of\nthe models and show that there is no single best prompt but needs to be tuned\nfor each model/dataset combination. We further investigate (i) the selection of\nin-context demonstrations, (ii) the generation of matching rules, as well as\n(iii) fine-tuning a hosted LLM using the same pool of training data. Our\nexperiments show that the best LLMs require no or only a few training examples\nto perform similarly to PLMs that were fine-tuned using thousands of examples.\nLLM-based matchers further exhibit higher robustness to unseen entities. We\nshow that GPT4 can generate structured explanations for matching decisions. The\nmodel can automatically identify potential causes of matching errors by\nanalyzing explanations of wrong decisions. We demonstrate that the model can\ngenerate meaningful textual descriptions of the identified error classes, which\ncan help data engineers improve entity matching pipelines.\n","authors":["Ralph Peeters","Christian Bizer"],"pdf_url":"https://arxiv.org/pdf/2310.11244v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2210.04288v2","updated":"2024-06-05T15:33:23Z","published":"2022-10-09T15:42:36Z","title":"CoopHash: Cooperative Learning of Multipurpose Descriptor and\n  Contrastive Pair Generator via Variational MCMC Teaching for Supervised Image\n  Hashing","summary":"  Leveraging supervised information can lead to superior retrieval performance\nin the image hashing domain but the performance degrades significantly without\nenough labeled data. One effective solution to boost performance is to employ\ngenerative models, such as Generative Adversarial Networks (GANs), to generate\nsynthetic data in an image hashing model. However, GAN-based methods are\ndifficult to train, which prevents the hashing approaches from jointly training\nthe generative models and the hash functions. This limitation results in\nsub-optimal retrieval performance. To overcome this limitation, we propose a\nnovel framework, the generative cooperative hashing network, which is based on\nenergy-based cooperative learning. This framework jointly learns a powerful\ngenerative representation of the data and a robust hash function via two\ncomponents: a top-down contrastive pair generator that synthesizes contrastive\nimages and a bottom-up multipurpose descriptor that simultaneously represents\nthe images from multiple perspectives, including probability density, hash\ncode, latent code, and category. The two components are jointly learned via a\nnovel likelihood-based cooperative learning scheme. We conduct experiments on\nseveral real-world datasets and show that the proposed method outperforms the\ncompeting hashing supervised methods, achieving up to 10\\% relative improvement\nover the current state-of-the-art supervised hashing methods, and exhibits a\nsignificantly better performance in out-of-distribution retrieval.\n","authors":["Khoa D. Doan","Jianwen Xie","Yaxuan Zhu","Yang Zhao","Ping Li"],"pdf_url":"https://arxiv.org/pdf/2210.04288v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.12336v2","updated":"2024-06-05T15:32:03Z","published":"2024-02-19T18:09:48Z","title":"Robust CLIP: Unsupervised Adversarial Fine-Tuning of Vision Embeddings\n  for Robust Large Vision-Language Models","summary":"  Multi-modal foundation models like OpenFlamingo, LLaVA, and GPT-4 are\nincreasingly used for various real-world tasks. Prior work has shown that these\nmodels are highly vulnerable to adversarial attacks on the vision modality.\nThese attacks can be leveraged to spread fake information or defraud users, and\nthus pose a significant risk, which makes the robustness of large multi-modal\nfoundation models a pressing problem. The CLIP model, or one of its variants,\nis used as a frozen vision encoder in many large vision-language models\n(LVLMs), e.g. LLaVA and OpenFlamingo. We propose an unsupervised adversarial\nfine-tuning scheme to obtain a robust CLIP vision encoder, which yields\nrobustness on all vision down-stream tasks (LVLMs, zero-shot classification)\nthat rely on CLIP. In particular, we show that stealth-attacks on users of\nLVLMs by a malicious third party providing manipulated images are no longer\npossible once one replaces the original CLIP model with our robust one. No\nretraining or fine-tuning of the down-stream LVLMs is required. The code and\nrobust models are available at https://github.com/chs20/RobustVLM\n","authors":["Christian Schlarmann","Naman Deep Singh","Francesco Croce","Matthias Hein"],"pdf_url":"https://arxiv.org/pdf/2402.12336v2.pdf","comment":"ICML 2024 Oral"},{"id":"http://arxiv.org/abs/2401.03892v3","updated":"2024-06-05T15:30:09Z","published":"2024-01-08T13:43:56Z","title":"Sampling in Unit Time with Kernel Fisher-Rao Flow","summary":"  We introduce a new mean-field ODE and corresponding interacting particle\nsystems (IPS) for sampling from an unnormalized target density. The IPS are\ngradient-free, available in closed form, and only require the ability to sample\nfrom a reference density and compute the (unnormalized) target-to-reference\ndensity ratio. The mean-field ODE is obtained by solving a Poisson equation for\na velocity field that transports samples along the geometric mixture of the two\ndensities, which is the path of a particular Fisher-Rao gradient flow. We\nemploy a RKHS ansatz for the velocity field, which makes the Poisson equation\ntractable and enables discretization of the resulting mean-field ODE over\nfinite samples. The mean-field ODE can be additionally be derived from a\ndiscrete-time perspective as the limit of successive linearizations of the\nMonge-Amp\\`ere equations within a framework known as sample-driven optimal\ntransport. We introduce a stochastic variant of our approach and demonstrate\nempirically that our IPS can produce high-quality samples from varied target\ndistributions, outperforming comparable gradient-free particle systems and\ncompetitive with gradient-based alternatives.\n","authors":["Aimee Maurais","Youssef Marzouk"],"pdf_url":"https://arxiv.org/pdf/2401.03892v3.pdf","comment":"To appear at ICML 2024. Updated with additional numerical examples"},{"id":"http://arxiv.org/abs/2405.13058v2","updated":"2024-06-05T15:28:43Z","published":"2024-05-20T11:10:49Z","title":"The AI Community Building the Future? A Quantitative Analysis of\n  Development Activity on Hugging Face Hub","summary":"  Open model developers have emerged as key actors in the political economy of\nartificial intelligence (AI), but we still have a limited understanding of\ncollaborative practices in the open AI ecosystem. This paper responds to this\ngap with a three-part quantitative analysis of development activity on the\nHugging Face (HF) Hub, a popular platform for building, sharing, and\ndemonstrating models. First, various types of activity across 348,181 model,\n65,761 dataset, and 156,642 space repositories exhibit right-skewed\ndistributions. Activity is extremely imbalanced between repositories; for\nexample, over 70% of models have 0 downloads, while 1% account for 99% of\ndownloads. Furthermore, licenses matter: there are statistically significant\ndifferences in collaboration patterns in model repositories with permissive,\nrestrictive, and no licenses. Second, we analyse a snapshot of the social\nnetwork structure of collaboration in model repositories, finding that the\ncommunity has a core-periphery structure, with a core of prolific developers\nand a majority of isolate developers (89%). Upon removing the isolate\ndevelopers from the network, collaboration is characterised by high reciprocity\nregardless of developers' network positions. Third, we examine model adoption\nthrough the lens of model usage in spaces, finding that a minority of models,\ndeveloped by a handful of companies, are widely used on the HF Hub. Overall,\nactivity on the HF Hub is characterised by Pareto distributions, congruent with\nOSS development patterns on platforms like GitHub. We conclude with\nrecommendations for researchers, companies, and policymakers to advance our\nunderstanding of open AI development.\n","authors":["Cailean Osborne","Jennifer Ding","Hannah Rose Kirk"],"pdf_url":"https://arxiv.org/pdf/2405.13058v2.pdf","comment":"27 pages, 5 figures, 9 tables"},{"id":"http://arxiv.org/abs/2406.03372v1","updated":"2024-06-05T15:28:04Z","published":"2024-06-05T15:28:04Z","title":"Training of Physical Neural Networks","summary":"  Physical neural networks (PNNs) are a class of neural-like networks that\nleverage the properties of physical systems to perform computation. While PNNs\nare so far a niche research area with small-scale laboratory demonstrations,\nthey are arguably one of the most underappreciated important opportunities in\nmodern AI. Could we train AI models 1000x larger than current ones? Could we do\nthis and also have them perform inference locally and privately on edge\ndevices, such as smartphones or sensors? Research over the past few years has\nshown that the answer to all these questions is likely \"yes, with enough\nresearch\": PNNs could one day radically change what is possible and practical\nfor AI systems. To do this will however require rethinking both how AI models\nwork, and how they are trained - primarily by considering the problems through\nthe constraints of the underlying hardware physics. To train PNNs at large\nscale, many methods including backpropagation-based and backpropagation-free\napproaches are now being explored. These methods have various trade-offs, and\nso far no method has been shown to scale to the same scale and performance as\nthe backpropagation algorithm widely used in deep learning today. However, this\nis rapidly changing, and a diverse ecosystem of training techniques provides\nclues for how PNNs may one day be utilized to create both more efficient\nrealizations of current-scale AI models, and to enable unprecedented-scale\nmodels.\n","authors":["Ali Momeni","Babak Rahmani","Benjamin Scellier","Logan G. Wright","Peter L. McMahon","Clara C. Wanjura","Yuhang Li","Anas Skalli","Natalia G. Berloff","Tatsuhiro Onodera","Ilker Oguz","Francesco Morichetti","Philipp del Hougne","Manuel Le Gallo","Abu Sebastian","Azalia Mirhoseini","Cheng Zhang","Danijela Marković","Daniel Brunner","Christophe Moser","Sylvain Gigan","Florian Marquardt","Aydogan Ozcan","Julie Grollier","Andrea J. Liu","Demetri Psaltis","Andrea Alù","Romain Fleury"],"pdf_url":"https://arxiv.org/pdf/2406.03372v1.pdf","comment":"29 pages, 4 figures"},{"id":"http://arxiv.org/abs/2405.17849v2","updated":"2024-06-05T15:26:58Z","published":"2024-05-28T05:56:11Z","title":"I-LLM: Efficient Integer-Only Inference for Fully-Quantized Low-Bit\n  Large Language Models","summary":"  Post-training quantization (PTQ) serves as a potent technique to accelerate\nthe inference of large language models (LLMs). Nonetheless, existing works\nstill necessitate a considerable number of floating-point (FP) operations\nduring inference, including additional quantization and de-quantization, as\nwell as non-linear operators such as RMSNorm and Softmax. This limitation\nhinders the deployment of LLMs on the edge and cloud devices. In this paper, we\nidentify the primary obstacle to integer-only quantization for LLMs lies in the\nlarge fluctuation of activations across channels and tokens in both linear and\nnon-linear operations. To address this issue, we propose I-LLM, a novel\ninteger-only fully-quantized PTQ framework tailored for LLMs. Specifically, (1)\nwe develop Fully-Smooth Block-Reconstruction (FSBR) to aggressively smooth\ninter-channel variations of all activations and weights. (2) to alleviate\ndegradation caused by inter-token variations, we introduce a novel approach\ncalled Dynamic Integer-only MatMul (DI-MatMul). This method enables dynamic\nquantization in full-integer matrix multiplication by dynamically quantizing\nthe input and outputs with integer-only operations. (3) we design\nDI-ClippedSoftmax, DI-Exp, and DI-Normalization, which utilize bit shift to\nexecute non-linear operators efficiently while maintaining accuracy. The\nexperiment shows that our I-LLM achieves comparable accuracy to the FP baseline\nand outperforms non-integer quantization methods. For example, I-LLM can\noperate at W4A4 with negligible loss of accuracy. To our knowledge, we are the\nfirst to bridge the gap between integer-only quantization and LLMs. We've\npublished our code on anonymous.4open.science, aiming to contribute to the\nadvancement of this field.\n","authors":["Xing Hu","Yuan Cheng","Dawei Yang","Zhihang Yuan","Jiangyong Yu","Chen Xu","Sifan Zhou"],"pdf_url":"https://arxiv.org/pdf/2405.17849v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.08631v2","updated":"2024-06-05T15:25:06Z","published":"2023-09-13T01:27:48Z","title":"Large Language Models Can Infer Psychological Dispositions of Social\n  Media Users","summary":"  Large Language Models (LLMs) demonstrate increasingly human-like abilities\nacross a wide variety of tasks. In this paper, we investigate whether LLMs like\nChatGPT can accurately infer the psychological dispositions of social media\nusers and whether their ability to do so varies across socio-demographic\ngroups. Specifically, we test whether GPT-3.5 and GPT-4 can derive the Big Five\npersonality traits from users' Facebook status updates in a zero-shot learning\nscenario. Our results show an average correlation of r = .29 (range = [.22,\n.33]) between LLM-inferred and self-reported trait scores - a level of accuracy\nthat is similar to that of supervised machine learning models specifically\ntrained to infer personality. Our findings also highlight heterogeneity in the\naccuracy of personality inferences across different age groups and gender\ncategories: predictions were found to be more accurate for women and younger\nindividuals on several traits, suggesting a potential bias stemming from the\nunderlying training data or differences in online self-expression. The ability\nof LLMs to infer psychological dispositions from user-generated text has the\npotential to democratize access to cheap and scalable psychometric assessments\nfor both researchers and practitioners. On the one hand, this democratization\nmight facilitate large-scale research of high ecological validity and spark\ninnovation in personalized services. On the other hand, it also raises ethical\nconcerns regarding user privacy and self-determination, highlighting the need\nfor stringent ethical frameworks and regulation.\n","authors":["Heinrich Peters","Sandra Matz"],"pdf_url":"https://arxiv.org/pdf/2309.08631v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.03369v1","updated":"2024-06-05T15:24:20Z","published":"2024-06-05T15:24:20Z","title":"Posterior and variational inference for deep neural networks with\n  heavy-tailed weights","summary":"  We consider deep neural networks in a Bayesian framework with a prior\ndistribution sampling the network weights at random. Following a recent idea of\nAgapiou and Castillo (2023), who show that heavy-tailed prior distributions\nachieve automatic adaptation to smoothness, we introduce a simple Bayesian deep\nlearning prior based on heavy-tailed weights and ReLU activation. We show that\nthe corresponding posterior distribution achieves near-optimal minimax\ncontraction rates, simultaneously adaptive to both intrinsic dimension and\nsmoothness of the underlying function, in a variety of contexts including\nnonparametric regression, geometric data and Besov spaces. While most works so\nfar need a form of model selection built-in within the prior distribution, a\nkey aspect of our approach is that it does not require to sample\nhyperparameters to learn the architecture of the network. We also provide\nvariational Bayes counterparts of the results, that show that mean-field\nvariational approximations still benefit from near-optimal theoretical support.\n","authors":["Ismaël Castillo","Paul Egels"],"pdf_url":"https://arxiv.org/pdf/2406.03369v1.pdf","comment":"41 pages"},{"id":"http://arxiv.org/abs/2406.03361v1","updated":"2024-06-05T15:14:58Z","published":"2024-06-05T15:14:58Z","title":"What Matters in Hierarchical Search for Combinatorial Reasoning\n  Problems?","summary":"  Efficiently tackling combinatorial reasoning problems, particularly the\nnotorious NP-hard tasks, remains a significant challenge for AI research.\nRecent efforts have sought to enhance planning by incorporating hierarchical\nhigh-level search strategies, known as subgoal methods. While promising, their\nperformance against traditional low-level planners is inconsistent, raising\nquestions about their application contexts. In this study, we conduct an\nin-depth exploration of subgoal-planning methods for combinatorial reasoning.\nWe identify the attributes pivotal for leveraging the advantages of high-level\nsearch: hard-to-learn value functions, complex action spaces, presence of dead\nends in the environment, or using data collected from diverse experts. We\npropose a consistent evaluation methodology to achieve meaningful comparisons\nbetween methods and reevaluate the state-of-the-art algorithms.\n","authors":["Michał Zawalski","Gracjan Góral","Michał Tyrolski","Emilia Wiśnios","Franciszek Budrowski","Łukasz Kuciński","Piotr Miłoś"],"pdf_url":"https://arxiv.org/pdf/2406.03361v1.pdf","comment":"Accepted for Generative Models for Decision Making Workshop at ICLR\n  2024"},{"id":"http://arxiv.org/abs/2403.02648v2","updated":"2024-06-05T15:13:02Z","published":"2024-03-05T04:35:59Z","title":"Remove that Square Root: A New Efficient Scale-Invariant Version of\n  AdaGrad","summary":"  Adaptive methods are extremely popular in machine learning as they make\nlearning rate tuning less expensive. This paper introduces a novel optimization\nalgorithm named KATE, which presents a scale-invariant adaptation of the\nwell-known AdaGrad algorithm. We prove the scale-invariance of KATE for the\ncase of Generalized Linear Models. Moreover, for general smooth non-convex\nproblems, we establish a convergence rate of $O \\left(\\frac{\\log T}{\\sqrt{T}}\n\\right)$ for KATE, matching the best-known ones for AdaGrad and Adam. We also\ncompare KATE to other state-of-the-art adaptive algorithms Adam and AdaGrad in\nnumerical experiments with different problems, including complex machine\nlearning tasks like image classification and text classification on real data.\nThe results indicate that KATE consistently outperforms AdaGrad and\nmatches/surpasses the performance of Adam in all considered scenarios.\n","authors":["Sayantan Choudhury","Nazarii Tupitsa","Nicolas Loizou","Samuel Horvath","Martin Takac","Eduard Gorbunov"],"pdf_url":"https://arxiv.org/pdf/2403.02648v2.pdf","comment":"27 pages, 12 figures"},{"id":"http://arxiv.org/abs/2406.03356v1","updated":"2024-06-05T15:12:29Z","published":"2024-06-05T15:12:29Z","title":"Cooperative learning of Pl@ntNet's Artificial Intelligence algorithm:\n  how does it work and how can we improve it?","summary":"  Deep learning models for plant species identification rely on large annotated\ndatasets. The PlantNet system enables global data collection by allowing users\nto upload and annotate plant observations, leading to noisy labels due to\ndiverse user skills. Achieving consensus is crucial for training, but the vast\nscale of collected data makes traditional label aggregation strategies\nchallenging. Existing methods either retain all observations, resulting in\nnoisy training data or selectively keep those with sufficient votes, discarding\nvaluable information. Additionally, as many species are rarely observed, user\nexpertise can not be evaluated as an inter-user agreement: otherwise, botanical\nexperts would have a lower weight in the AI training step than the average\nuser. Our proposed label aggregation strategy aims to cooperatively train plant\nidentification AI models. This strategy estimates user expertise as a trust\nscore per user based on their ability to identify plant species from\ncrowdsourced data. The trust score is recursively estimated from correctly\nidentified species given the current estimated labels. This interpretable score\nexploits botanical experts' knowledge and the heterogeneity of users.\nSubsequently, our strategy removes unreliable observations but retains those\nwith limited trusted annotations, unlike other approaches. We evaluate\nPlantNet's strategy on a released large subset of the PlantNet database focused\non European flora, comprising over 6M observations and 800K users. We\ndemonstrate that estimating users' skills based on the diversity of their\nexpertise enhances labeling performance. Our findings emphasize the synergy of\nhuman annotation and data filtering in improving AI performance for a refined\ndataset. We explore incorporating AI-based votes alongside human input. This\ncan further enhance human-AI interactions to detect unreliable observations.\n","authors":["Tanguy Lefort","Antoine Affouard","Benjamin Charlier","Jean-Christophe Lombardo","Mathias Chouet","Hervé Goëau","Joseph Salmon","Pierre Bonnet","Alexis Joly"],"pdf_url":"https://arxiv.org/pdf/2406.03356v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.20233v2","updated":"2024-06-05T15:12:00Z","published":"2024-05-30T16:35:30Z","title":"Grokfast: Accelerated Grokking by Amplifying Slow Gradients","summary":"  One puzzling artifact in machine learning dubbed grokking is where delayed\ngeneralization is achieved tenfolds of iterations after near perfect\noverfitting to the training data. Focusing on the long delay itself on behalf\nof machine learning practitioners, our goal is to accelerate generalization of\na model under grokking phenomenon. By regarding a series of gradients of a\nparameter over training iterations as a random signal over time, we can\nspectrally decompose the parameter trajectories under gradient descent into two\ncomponents: the fast-varying, overfitting-yielding component and the\nslow-varying, generalization-inducing component. This analysis allows us to\naccelerate the grokking phenomenon more than $\\times 50$ with only a few lines\nof code that amplifies the slow-varying components of gradients. The\nexperiments show that our algorithm applies to diverse tasks involving images,\nlanguages, and graphs, enabling practical availability of this peculiar\nartifact of sudden generalization. Our code is available at\nhttps://github.com/ironjr/grokfast.\n","authors":["Jaerin Lee","Bong Gyun Kang","Kihoon Kim","Kyoung Mu Lee"],"pdf_url":"https://arxiv.org/pdf/2405.20233v2.pdf","comment":"17 pages, 13 figures. Typo fixed. Project page:\n  https://jaerinlee.com/research/grokfast"},{"id":"http://arxiv.org/abs/2402.10500v2","updated":"2024-06-05T15:10:08Z","published":"2024-02-16T08:19:34Z","title":"Active Preference Optimization for Sample Efficient RLHF","summary":"  Reinforcement Learning from Human Feedback (RLHF) is pivotal in aligning\nLarge Language Models (LLMs) with human preferences. Although aligned\ngenerative models have shown remarkable abilities in various tasks, their\nreliance on high-quality human preference data creates a costly bottleneck in\nthe practical application of RLHF. One primary reason is that current methods\nrely on uniformly picking prompt-generation pairs from a dataset of\nprompt-generations, to collect human feedback, resulting in sub-optimal\nalignment under a constrained budget, which highlights the criticality of\nadaptive strategies in efficient alignment. Recent works [Mehta et al., 2023,\nMuldrew et al., 2024] have tried to address this problem by designing various\nheuristics based on generation uncertainty. However, either the assumptions in\n[Mehta et al., 2023] are restrictive, or [Muldrew et al., 2024] do not provide\nany rigorous theoretical guarantee. To address these, we reformulate RLHF\nwithin contextual preference bandit framework, treating prompts as contexts,\nand develop an active-learning algorithm, $\\textit{Active Preference\nOptimization}$ ($\\texttt{APO}$), which enhances model alignment by querying\npreference data from the most important samples, achieving superior performance\nfor small sample budget. We analyze the theoretical performance guarantees of\n$\\texttt{APO}$ under the BTL preference model showing that the suboptimality\ngap of the policy learned via $\\texttt{APO}$ scales as $O(1/\\sqrt{T})$ for a\nbudget of $T$. We also show that collecting preference data by choosing prompts\nrandomly leads to a policy that suffers a constant sub-optimality. We perform\ndetailed experimental evaluations on practical preference datasets to validate\n$\\texttt{APO}$'s efficacy over the existing methods, establishing it as a\nsample-efficient and practical solution of alignment in a cost-effective and\nscalable manner.\n","authors":["Nirjhar Das","Souradip Chakraborty","Aldo Pacchiano","Sayak Ray Chowdhury"],"pdf_url":"https://arxiv.org/pdf/2402.10500v2.pdf","comment":"New experimental results added. Some reorganization"},{"id":"http://arxiv.org/abs/2402.12061v2","updated":"2024-06-05T15:08:28Z","published":"2024-02-19T11:28:20Z","title":"All Language Models Large and Small","summary":"  Many leading language models (LMs) use high-intensity computational resources\nboth during training and execution. This poses the challenge of lowering\nresource costs for deployment and faster execution of decision-making tasks\namong others. We introduce a novel plug-and-play LM framework named Language\nOptimising Network Distribution (LONDI) framework. LONDI learns to selectively\nemploy large LMs only where complex decision-making and reasoning are required\nwhile using low-resource LMs (i.e. LMs require less GPU usage, but may not be\nable to solve the problem alone) everywhere else. LONDI consists of a system of\ntwo (off-)policy networks, an LM, a large LM (LLM), and a reinforcement\nlearning module that uses switching controls to quickly learn which system\nstates to call the LLM. We then introduce a variant of LONDI that maintains\nbudget constraints on LLM calls and hence its resource usage. Theoretically, we\nprove LONDI learns the subset of system states to activate the LLM required to\nsolve the task. We then prove that LONDI converges to optimal solutions while\nalso preserving budgetary constraints on LLM calls almost surely enabling it to\nsolve various tasks while significantly lowering computational costs. We test\nLONDI's performance in a range of tasks in ScienceWorld and BabyAI-Text and\ndemonstrate that LONDI can solve tasks only solvable by resource-intensive LLMs\nwhile reducing GPU usage by up to 30%.\n","authors":["Zhixun Chen","Yali Du","David Mguni"],"pdf_url":"https://arxiv.org/pdf/2402.12061v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.03348v1","updated":"2024-06-05T15:05:24Z","published":"2024-06-05T15:05:24Z","title":"Position: A Call to Action for a Human-Centered AutoML Paradigm","summary":"  Automated machine learning (AutoML) was formed around the fundamental\nobjectives of automatically and efficiently configuring machine learning (ML)\nworkflows, aiding the research of new ML algorithms, and contributing to the\ndemocratization of ML by making it accessible to a broader audience. Over the\npast decade, commendable achievements in AutoML have primarily focused on\noptimizing predictive performance. This focused progress, while substantial,\nraises questions about how well AutoML has met its broader, original goals. In\nthis position paper, we argue that a key to unlocking AutoML's full potential\nlies in addressing the currently underexplored aspect of user interaction with\nAutoML systems, including their diverse roles, expectations, and expertise. We\nenvision a more human-centered approach in future AutoML research, promoting\nthe collaborative design of ML systems that tightly integrates the\ncomplementary strengths of human expertise and AutoML methodologies.\n","authors":["Marius Lindauer","Florian Karl","Anne Klier","Julia Moosbauer","Alexander Tornede","Andreas Mueller","Frank Hutter","Matthias Feurer","Bernd Bischl"],"pdf_url":"https://arxiv.org/pdf/2406.03348v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.03346v1","updated":"2024-06-05T15:04:28Z","published":"2024-06-05T15:04:28Z","title":"Normalizing Flows for Conformal Regression","summary":"  Conformal Prediction (CP) algorithms estimate the uncertainty of a prediction\nmodel by calibrating its outputs on labeled data. The same calibration scheme\nusually applies to any model and data without modifications. The obtained\nprediction intervals are valid by construction but could be inefficient, i.e.\nunnecessarily big, if the prediction errors are not uniformly distributed over\nthe input space.\n  We present a general scheme to localize the intervals by training the\ncalibration process. The standard prediction error is replaced by an optimized\ndistance metric that depends explicitly on the object attributes. Learning the\noptimal metric is equivalent to training a Normalizing Flow that acts on the\njoint distribution of the errors and the inputs. Unlike the Error Re-weighting\nCP algorithm of Papadopoulos et al. (2008), the framework allows estimating the\ngap between nominal and empirical conditional validity. The approach is\ncompatible with existing locally-adaptive CP strategies based on re-weighting\nthe calibration samples and applies to any point-prediction model without\nretraining.\n","authors":["Nicolo Colombo"],"pdf_url":"https://arxiv.org/pdf/2406.03346v1.pdf","comment":"To be presented at the 40th Conference on Uncertainty in Artificial\n  Intelligence (UAI 2024). 13 pages, 2 figures"},{"id":"http://arxiv.org/abs/2406.03345v1","updated":"2024-06-05T15:04:27Z","published":"2024-06-05T15:04:27Z","title":"Feature Contamination: Neural Networks Learn Uncorrelated Features and\n  Fail to Generalize","summary":"  Learning representations that generalize under distribution shifts is\ncritical for building robust machine learning models. However, despite\nsignificant efforts in recent years, algorithmic advances in this direction\nhave been limited. In this work, we seek to understand the fundamental\ndifficulty of out-of-distribution generalization with deep neural networks. We\nfirst empirically show that perhaps surprisingly, even allowing a neural\nnetwork to explicitly fit the representations obtained from a teacher network\nthat can generalize out-of-distribution is insufficient for the generalization\nof the student network. Then, by a theoretical study of two-layer ReLU networks\noptimized by stochastic gradient descent (SGD) under a structured feature\nmodel, we identify a fundamental yet unexplored feature learning proclivity of\nneural networks, feature contamination: neural networks can learn uncorrelated\nfeatures together with predictive features, resulting in generalization failure\nunder distribution shifts. Notably, this mechanism essentially differs from the\nprevailing narrative in the literature that attributes the generalization\nfailure to spurious correlations. Overall, our results offer new insights into\nthe non-linear feature learning dynamics of neural networks and highlight the\nnecessity of considering inductive biases in out-of-distribution\ngeneralization.\n","authors":["Tianren Zhang","Chujie Zhao","Guanyu Chen","Yizhou Jiang","Feng Chen"],"pdf_url":"https://arxiv.org/pdf/2406.03345v1.pdf","comment":"ICML 2024"},{"id":"http://arxiv.org/abs/2312.03656v2","updated":"2024-06-05T15:03:37Z","published":"2023-12-06T18:25:53Z","title":"Interpretability Illusions in the Generalization of Simplified Models","summary":"  A common method to study deep learning systems is to use simplified model\nrepresentations--for example, using singular value decomposition to visualize\nthe model's hidden states in a lower dimensional space. This approach assumes\nthat the results of these simplifications are faithful to the original model.\nHere, we illustrate an important caveat to this assumption: even if the\nsimplified representations can accurately approximate the full model on the\ntraining set, they may fail to accurately capture the model's behavior out of\ndistribution. We illustrate this by training Transformer models on controlled\ndatasets with systematic generalization splits, including the Dyck\nbalanced-parenthesis languages and a code completion task. We simplify these\nmodels using tools like dimensionality reduction and clustering, and then\nexplicitly test how these simplified proxies match the behavior of the original\nmodel. We find consistent generalization gaps: cases in which the simplified\nproxies are more faithful to the original model on the in-distribution\nevaluations and less faithful on various tests of systematic generalization.\nThis includes cases where the original model generalizes systematically but the\nsimplified proxies fail, and cases where the simplified proxies generalize\nbetter. Together, our results raise questions about the extent to which\nmechanistic interpretations derived using tools like SVD can reliably predict\nwhat a model will do in novel situations.\n","authors":["Dan Friedman","Andrew Lampinen","Lucas Dixon","Danqi Chen","Asma Ghandeharioun"],"pdf_url":"https://arxiv.org/pdf/2312.03656v2.pdf","comment":"ICML 2024"},{"id":"http://arxiv.org/abs/2406.03341v1","updated":"2024-06-05T14:58:32Z","published":"2024-06-05T14:58:32Z","title":"Tackling GenAI Copyright Issues: Originality Estimation and\n  Genericization","summary":"  The rapid progress of generative AI technology has sparked significant\ncopyright concerns, leading to numerous lawsuits filed against AI developers.\nWhile some studies explore methods to mitigate copyright risks by steering the\noutputs of generative models away from those resembling copyrighted data,\nlittle attention has been paid to the question of how much of a resemblance is\nundesirable; more original or unique data are afforded stronger protection, and\nthe threshold level of resemblance for constituting infringement\ncorrespondingly lower. Here, leveraging this principle, we propose a\ngenericization method that modifies the outputs of a generative model to make\nthem more generic and less likely to infringe copyright. To achieve this, we\nintroduce a metric for quantifying the level of originality of data in a manner\nthat is consistent with the legal framework. This metric can be practically\nestimated by drawing samples from a generative model, which is then used for\nthe genericization process. Experiments demonstrate that our genericization\nmethod successfully modifies the output of a text-to-image generative model so\nthat it produces more generic, copyright-compliant images.\n","authors":["Hiroaki Chiba-Okabe","Weijie J. Su"],"pdf_url":"https://arxiv.org/pdf/2406.03341v1.pdf","comment":"15 pages, 6 figures"},{"id":"http://arxiv.org/abs/2406.03337v1","updated":"2024-06-05T14:52:43Z","published":"2024-06-05T14:52:43Z","title":"Identifying latent state transition in non-linear dynamical systems","summary":"  This work aims to improve generalization and interpretability of dynamical\nsystems by recovering the underlying lower-dimensional latent states and their\ntime evolutions. Previous work on disentangled representation learning within\nthe realm of dynamical systems focused on the latent states, possibly with\nlinear transition approximations. As such, they cannot identify nonlinear\ntransition dynamics, and hence fail to reliably predict complex future\nbehavior. Inspired by the advances in nonlinear ICA, we propose a state-space\nmodeling framework in which we can identify not just the latent states but also\nthe unknown transition function that maps the past states to the present. We\nintroduce a practical algorithm based on variational auto-encoders and\nempirically demonstrate in realistic synthetic settings that we can (i) recover\nlatent state dynamics with high accuracy, (ii) correspondingly achieve high\nfuture prediction accuracy, and (iii) adapt fast to new environments.\n","authors":["Çağlar Hızlı","Çağatay Yıldız","Matthias Bethge","ST John","Pekka Marttinen"],"pdf_url":"https://arxiv.org/pdf/2406.03337v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.03334v1","updated":"2024-06-05T14:49:15Z","published":"2024-06-05T14:49:15Z","title":"Reparameterization invariance in approximate Bayesian inference","summary":"  Current approximate posteriors in Bayesian neural networks (BNNs) exhibit a\ncrucial limitation: they fail to maintain invariance under reparameterization,\ni.e. BNNs assign different posterior densities to different parametrizations of\nidentical functions. This creates a fundamental flaw in the application of\nBayesian principles as it breaks the correspondence between uncertainty over\nthe parameters with uncertainty over the parametrized function. In this paper,\nwe investigate this issue in the context of the increasingly popular linearized\nLaplace approximation. Specifically, it has been observed that linearized\npredictives alleviate the common underfitting problems of the Laplace\napproximation. We develop a new geometric view of reparametrizations from which\nwe explain the success of linearization. Moreover, we demonstrate that these\nreparameterization invariance properties can be extended to the original neural\nnetwork predictive using a Riemannian diffusion process giving a\nstraightforward algorithm for approximate posterior sampling, which empirically\nimproves posterior fit.\n","authors":["Hrittik Roy","Marco Miani","Carl Henrik Ek","Philipp Hennig","Marvin Pförtner","Lukas Tatzel","Søren Hauberg"],"pdf_url":"https://arxiv.org/pdf/2406.03334v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.03479v3","updated":"2024-06-05T14:44:10Z","published":"2024-02-05T19:47:45Z","title":"DRED: Zero-Shot Transfer in Reinforcement Learning via Data-Regularised\n  Environment Design","summary":"  Autonomous agents trained using deep reinforcement learning (RL) often lack\nthe ability to successfully generalise to new environments, even when these\nenvironments share characteristics with the ones they have encountered during\ntraining. In this work, we investigate how the sampling of individual\nenvironment instances, or levels, affects the zero-shot generalisation (ZSG)\nability of RL agents. We discover that, for deep actor-critic architectures\nsharing their base layers, prioritising levels according to their value loss\nminimises the mutual information between the agent's internal representation\nand the set of training levels in the generated training data. This provides a\nnovel theoretical justification for the regularisation achieved by certain\nadaptive sampling strategies. We then turn our attention to unsupervised\nenvironment design (UED) methods, which assume control over level generation.\nWe find that existing UED methods can significantly shift the training\ndistribution, which translates to low ZSG performance. To prevent both\noverfitting and distributional shift, we introduce data-regularised environment\ndesign (DRED). DRED generates levels using a generative model trained to\napproximate the ground truth distribution of an initial set of level\nparameters. Through its grounding, DRED achieves significant improvements in\nZSG over adaptive level sampling strategies and UED methods. Our code and\nexperimental data are available at https://github.com/uoe-agents/dred.\n","authors":["Samuel Garcin","James Doran","Shangmin Guo","Christopher G. Lucas","Stefano V. Albrecht"],"pdf_url":"https://arxiv.org/pdf/2402.03479v3.pdf","comment":"To appear in ICML 2024. A preliminary version of this work\n  (arXiv:2310.03494) was presented at the ALOE workshop, NeurIPS 2023. arXiv\n  admin note: text overlap with arXiv:2310.03494"},{"id":"http://arxiv.org/abs/2406.03324v1","updated":"2024-06-05T14:37:42Z","published":"2024-06-05T14:37:42Z","title":"UDQL: Bridging The Gap between MSE Loss and The Optimal Value Function\n  in Offline Reinforcement Learning","summary":"  The Mean Square Error (MSE) is commonly utilized to estimate the solution of\nthe optimal value function in the vast majority of offline reinforcement\nlearning (RL) models and has achieved outstanding performance. However, we find\nthat its principle can lead to overestimation phenomenon for the value\nfunction. In this paper, we first theoretically analyze overestimation\nphenomenon led by MSE and provide the theoretical upper bound of the\noverestimated error. Furthermore, to address it, we propose a novel Bellman\nunderestimated operator to counteract overestimation phenomenon and then prove\nits contraction characteristics. At last, we propose the offline RL algorithm\nbased on underestimated operator and diffusion policy model. Extensive\nexperimental results on D4RL tasks show that our method can outperform\nstate-of-the-art offline RL algorithms, which demonstrates that our theoretical\nanalysis and underestimation way are effective for offline RL tasks.\n","authors":["Yu Zhang","Rui Yu","Zhipeng Yao","Wenyuan Zhang","Jun Wang","Liming Zhang"],"pdf_url":"https://arxiv.org/pdf/2406.03324v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.14848v3","updated":"2024-06-05T14:34:42Z","published":"2023-12-22T17:19:53Z","title":"Isolated pulsar population synthesis with simulation-based inference","summary":"  We combine pulsar population synthesis with simulation-based inference (SBI)\nto constrain the magnetorotational properties of isolated Galactic radio\npulsars. We first develop a framework to model neutron star birth properties\nand their dynamical and magnetorotational evolution. We specifically sample\ninitial magnetic field strengths, $B$, and spin periods, $P$, from lognormal\ndistributions and capture the late-time magnetic field decay with a power law.\nEach lognormal is described by a mean, $\\mu_{\\log B}, \\mu_{\\log P}$, and\nstandard deviation, $\\sigma_{\\log B}, \\sigma_{\\log P}$, while the power law is\ncharacterized by the index, $a_{\\rm late}$. We subsequently model the stars'\nradio emission and observational biases to mimic detections with three radio\nsurveys, and we produce a large database of synthetic $P$--$\\dot{P}$ diagrams\nby varying our five magnetorotational input parameters. We then follow an SBI\napproach that focuses on neural posterior estimation and train deep neural\nnetworks to infer the parameters' posterior distributions. After successfully\nvalidating these individual neural density estimators on simulated data, we use\nan ensemble of networks to infer the posterior distributions for the observed\npulsar population. We obtain $\\mu_{\\log B} = 13.10^{+0.08}_{-0.10}$,\n$\\sigma_{\\log B} = 0.45^{+0.05}_{-0.05}$ and $\\mu_{\\log P} =\n-1.00^{+0.26}_{-0.21}$, $\\sigma_{\\log P} = 0.38^{+0.33}_{-0.18}$ for the\nlognormal distributions and $a_{\\rm late} = -1.80^{+0.65}_{-0.61}$ for the\npower law at the $95\\%$ credible interval. We contrast our results with\nprevious studies and highlight uncertainties of the inferred $a_{\\rm late}$\nvalue. Our approach represents a crucial step toward robust statistical\ninference for complex population synthesis frameworks and forms the basis for\nfuture multiwavelength analyses of Galactic pulsars.\n","authors":["Vanessa Graber","Michele Ronchi","Celsa Pardo-Araujo","Nanda Rea"],"pdf_url":"https://arxiv.org/pdf/2312.14848v3.pdf","comment":"31 pages, 16 figures, 5 tables, 2 appendices; published version"},{"id":"http://arxiv.org/abs/2311.10093v4","updated":"2024-06-05T14:34:30Z","published":"2023-11-16T18:59:51Z","title":"The Chosen One: Consistent Characters in Text-to-Image Diffusion Models","summary":"  Recent advances in text-to-image generation models have unlocked vast\npotential for visual creativity. However, the users that use these models\nstruggle with the generation of consistent characters, a crucial aspect for\nnumerous real-world applications such as story visualization, game development,\nasset design, advertising, and more. Current methods typically rely on multiple\npre-existing images of the target character or involve labor-intensive manual\nprocesses. In this work, we propose a fully automated solution for consistent\ncharacter generation, with the sole input being a text prompt. We introduce an\niterative procedure that, at each stage, identifies a coherent set of images\nsharing a similar identity and extracts a more consistent identity from this\nset. Our quantitative analysis demonstrates that our method strikes a better\nbalance between prompt alignment and identity consistency compared to the\nbaseline methods, and these findings are reinforced by a user study. To\nconclude, we showcase several practical applications of our approach.\n","authors":["Omri Avrahami","Amir Hertz","Yael Vinker","Moab Arar","Shlomi Fruchter","Ohad Fried","Daniel Cohen-Or","Dani Lischinski"],"pdf_url":"https://arxiv.org/pdf/2311.10093v4.pdf","comment":"Accepted to SIGGRAPH 2024. Project page is available at\n  https://omriavrahami.com/the-chosen-one/"},{"id":"http://arxiv.org/abs/2401.16356v4","updated":"2024-06-05T14:30:30Z","published":"2024-01-29T17:59:26Z","title":"cDVGAN: One Flexible Model for Multi-class Gravitational Wave Signal and\n  Glitch Generation","summary":"  Simulating realistic time-domain observations of gravitational waves (GWs)\nand GW detector glitches can help in advancing GW data analysis. Simulated data\ncan be used in downstream tasks by augmenting datasets for signal searches,\nbalancing data sets for machine learning, and validating detection schemes. In\nthis work, we present Conditional Derivative GAN (cDVGAN), a novel conditional\nmodel in the Generative Adversarial Network framework for simulating multiple\nclasses of time-domain observations that represent gravitational waves (GWs)\nand detector glitches. cDVGAN can also generate generalized hybrid samples that\nspan the variation between classes through interpolation in the conditioned\nclass vector. cDVGAN introduces an additional player into the typical 2-player\nadversarial game of GANs, where an auxiliary discriminator analyzes the\nfirst-order derivative time-series. Our results show that this provides\nsynthetic data that better captures the features of the original data. cDVGAN\nconditions on three classes, two denoised from LIGO blip and tomte glitch\nevents from its 3rd observing run (O3), and the third representing binary black\nhole (BBH) mergers. Our proposed cDVGAN outperforms 4 different baseline GAN\nmodels in replicating the features of the three classes. Specifically, our\nexperiments show that training convolutional neural networks (CNNs) with our\ncDVGAN-generated data improves the detection of samples embedded in detector\nnoise beyond the synthetic data from other state-of-the-art GAN models. Our\nbest synthetic dataset yields as much as a 4.2% increase in\narea-under-the-curve (AUC) performance compared to synthetic datasets from\nbaseline GANs. Moreover, training the CNN with hybrid samples from our cDVGAN\noutperforms CNNs trained only on the standard classes, when identifying real\nsamples embedded in LIGO detector background (4% AUC improvement for cDVGAN).\n","authors":["Tom Dooney","Lyana Curier","Daniel Tan","Melissa Lopez","Chris Van Den Broeck","Stefano Bromuri"],"pdf_url":"https://arxiv.org/pdf/2401.16356v4.pdf","comment":"20 pages, 17 figures, 5 tables"},{"id":"http://arxiv.org/abs/2303.07272v4","updated":"2024-06-05T14:29:27Z","published":"2023-03-10T10:32:18Z","title":"Accounting for multiplicity in machine learning benchmark performance","summary":"  Machine learning methods are commonly evaluated and compared by their\nperformance on data sets from public repositories. This allows for multiple\nmethods, oftentimes several thousands, to be evaluated under identical\nconditions and across time. The highest ranked performance on a problem is\nreferred to as state-of-the-art (SOTA) performance, and is used, among other\nthings, as a reference point for publication of new methods. Using the\nhighest-ranked performance as an estimate for SOTA is a biased estimator,\ngiving overly optimistic results. The mechanisms at play are those of\nmultiplicity, a topic that is well-studied in the context of multiple\ncomparisons and multiple testing, but has, as far as the authors are aware of,\nbeen nearly absent from the discussion regarding SOTA estimates. The optimistic\nstate-of-the-art estimate is used as a standard for evaluating new methods, and\nmethods with substantial inferior results are easily overlooked. In this\narticle, we provide a probability distribution for the case of multiple\nclassifiers so that known analyses methods can be engaged and a better SOTA\nestimate can be provided. We demonstrate the impact of multiplicity through a\nsimulated example with independent classifiers. We show how classifier\ndependency impacts the variance, but also that the impact is limited when the\naccuracy is high. Finally, we discuss three real-world examples; Kaggle\ncompetitions that demonstrate various aspects.\n","authors":["Kajsa Møllersen","Einar Holsbø"],"pdf_url":"https://arxiv.org/pdf/2303.07272v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.03314v1","updated":"2024-06-05T14:26:45Z","published":"2024-06-05T14:26:45Z","title":"Reproducibility study of FairAC","summary":"  This work aims to reproduce the findings of the paper \"Fair Attribute\nCompletion on Graph with Missing Attributes\" written by Guo, Chu, and Li\narXiv:2302.12977 by investigating the claims made in the paper. This paper\nsuggests that the results of the original paper are reproducible and thus, the\nclaims hold. However, the claim that FairAC is a generic framework for many\ndownstream tasks is very broad and could therefore only be partially tested.\nMoreover, we show that FairAC is generalizable to various datasets and\nsensitive attributes and show evidence that the improvement in group fairness\nof the FairAC framework does not come at the expense of individual fairness.\nLastly, the codebase of FairAC has been refactored and is now easily applicable\nfor various datasets and models.\n","authors":["Gijs de Jong","Macha J. Meijer","Derck W. E. Prinzhorn","Harold Ruiter"],"pdf_url":"https://arxiv.org/pdf/2406.03314v1.pdf","comment":"14 pages, 2 figures, accepted at TMLR"},{"id":"http://arxiv.org/abs/2402.10043v4","updated":"2024-06-05T14:25:23Z","published":"2024-02-15T16:05:35Z","title":"Negative impact of heavy-tailed uncertainty and error distributions on\n  the reliability of calibration statistics for machine learning regression\n  tasks","summary":"  Average calibration of the (variance-based) prediction uncertainties of\nmachine learning regression tasks can be tested in two ways: one is to estimate\nthe calibration error (CE) as the difference between the mean absolute error\n(MSE) and the mean variance (MV); the alternative is to compare the mean\nsquared z-scores (ZMS) to 1. The problem is that both approaches might lead to\ndifferent conclusions, as illustrated in this study for an ensemble of datasets\nfrom the recent machine learning uncertainty quantification (ML-UQ) literature.\nIt is shown that the estimation of MV, MSE and their confidence intervals\nbecomes unreliable for heavy-tailed uncertainty and error distributions, which\nseems to be a frequent feature of ML-UQ datasets. By contrast, the ZMS\nstatistic is less sensitive and offers the most reliable approach in this\ncontext. Unfortunately, the same problem is expected to affect also conditional\ncalibrations statistics, such as the popular ENCE, and very likely post-hoc\ncalibration methods based on similar statistics. Several solutions to\ncircumvent the outlined problems are proposed.\n","authors":["Pascal Pernot"],"pdf_url":"https://arxiv.org/pdf/2402.10043v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.01538v3","updated":"2024-06-05T14:18:59Z","published":"2023-12-03T23:36:16Z","title":"Recurrent Distance Filtering for Graph Representation Learning","summary":"  Graph neural networks based on iterative one-hop message passing have been\nshown to struggle in harnessing the information from distant nodes effectively.\nConversely, graph transformers allow each node to attend to all other nodes\ndirectly, but lack graph inductive bias and have to rely on ad-hoc positional\nencoding. In this paper, we propose a new architecture to reconcile these\nchallenges. Our approach stems from the recent breakthroughs in long-range\nmodeling provided by deep state-space models: for a given target node, our\nmodel aggregates other nodes by their shortest distances to the target and uses\na linear RNN to encode the sequence of hop representations. The linear RNN is\nparameterized in a particular diagonal form for stable long-range signal\npropagation and is theoretically expressive enough to encode the neighborhood\nhierarchy. With no need for positional encoding, we empirically show that the\nperformance of our model is comparable to or better than that of\nstate-of-the-art graph transformers on various benchmarks, with a significantly\nreduced computational cost. Our code is open-source at\nhttps://github.com/skeletondyh/GRED.\n","authors":["Yuhui Ding","Antonio Orvieto","Bobby He","Thomas Hofmann"],"pdf_url":"https://arxiv.org/pdf/2312.01538v3.pdf","comment":"ICML 2024"},{"id":"http://arxiv.org/abs/2405.15942v2","updated":"2024-06-05T14:16:19Z","published":"2024-05-24T21:09:53Z","title":"Can Implicit Bias Imply Adversarial Robustness?","summary":"  The implicit bias of gradient-based training algorithms has been considered\nmostly beneficial as it leads to trained networks that often generalize well.\nHowever, Frei et al. (2023) show that such implicit bias can harm adversarial\nrobustness. Specifically, they show that if the data consists of clusters with\nsmall inter-cluster correlation, a shallow (two-layer) ReLU network trained by\ngradient flow generalizes well, but it is not robust to adversarial attacks of\nsmall radius. Moreover, this phenomenon occurs despite the existence of a much\nmore robust classifier that can be explicitly constructed from a shallow\nnetwork. In this paper, we extend recent analyses of neuron alignment to show\nthat a shallow network with a polynomial ReLU activation (pReLU) trained by\ngradient flow not only generalizes well but is also robust to adversarial\nattacks. Our results highlight the importance of the interplay between data\nstructure and architecture design in the implicit bias and robustness of\ntrained networks.\n","authors":["Hancheng Min","René Vidal"],"pdf_url":"https://arxiv.org/pdf/2405.15942v2.pdf","comment":"icml 2024 camera-ready"},{"id":"http://arxiv.org/abs/2402.10644v2","updated":"2024-06-05T14:13:22Z","published":"2024-02-16T12:44:15Z","title":"Linear Transformers with Learnable Kernel Functions are Better\n  In-Context Models","summary":"  Advancing the frontier of subquadratic architectures for Language Models\n(LMs) is crucial in the rapidly evolving field of natural language processing.\nCurrent innovations, including State Space Models, were initially celebrated\nfor surpassing Transformer performance on language modeling tasks. However,\nthese models have revealed deficiencies in essential In-Context Learning\ncapabilities - a domain where the Transformer traditionally shines. The Based\nmodel emerged as a hybrid solution, blending a Linear Transformer with a kernel\ninspired by the Taylor expansion of exponential functions, augmented by\nconvolutional networks. Mirroring the Transformer's in-context adeptness, it\nbecame a strong contender in the field. In our work, we present a singular,\nelegant alteration to the Based kernel that amplifies its In-Context Learning\nabilities evaluated with the Multi-Query Associative Recall task and overall\nlanguage modeling process, as demonstrated on the Pile dataset.\n","authors":["Yaroslav Aksenov","Nikita Balagansky","Sofia Maria Lo Cicero Vaina","Boris Shaposhnikov","Alexey Gorbatovski","Daniil Gavrilov"],"pdf_url":"https://arxiv.org/pdf/2402.10644v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.06751v2","updated":"2024-06-05T14:10:11Z","published":"2024-01-12T18:36:29Z","title":"The Unreasonable Effectiveness of Easy Training Data for Hard Tasks","summary":"  How can we train models to perform well on hard test data when hard training\ndata is by definition difficult to label correctly? This question has been\ntermed the scalable oversight problem and has drawn increasing attention as\nlanguage models have continually improved. In this paper, we present the\nsurprising conclusion that current pretrained language models often generalize\nrelatively well from easy to hard data, even performing as well as oracle\nmodels finetuned on hard data. We demonstrate this kind of easy-to-hard\ngeneralization using simple finetuning methods like in-context learning, linear\nclassifier heads, and QLoRA for seven different measures of datapoint hardness,\nincluding six empirically diverse human hardness measures (like grade level)\nand one model-based measure (loss-based). Furthermore, we show that even if one\ncares most about model performance on hard data, it can be better to collect\neasy data rather than hard data for finetuning, since hard data is generally\nnoisier and costlier to collect. Our experiments use open models up to 70b in\nsize and four publicly available question-answering datasets with questions\nranging in difficulty from 3rd grade science questions to college level STEM\nquestions and general-knowledge trivia. We conclude that easy-to-hard\ngeneralization in LMs is surprisingly strong for the tasks studied. Our code is\navailable at: https://github.com/allenai/easy-to-hard-generalization\n","authors":["Peter Hase","Mohit Bansal","Peter Clark","Sarah Wiegreffe"],"pdf_url":"https://arxiv.org/pdf/2401.06751v2.pdf","comment":"ACL 2024. 23 pages, 20 figures"},{"id":"http://arxiv.org/abs/2402.00592v3","updated":"2024-06-05T14:05:18Z","published":"2024-02-01T13:41:44Z","title":"Partial-Label Learning with a Reject Option","summary":"  In real-world applications, one often encounters ambiguously labeled data,\nwhere different annotators assign conflicting class labels. Partial-label\nlearning allows training classifiers in this weakly supervised setting, where\nstate-of-the-art methods already show good predictive performance. However,\neven the best algorithms give incorrect predictions, which can have severe\nconsequences when they impact actions or decisions. We propose a novel\nrisk-consistent partial-label learning algorithm with a reject option, that is,\nthe algorithm can reject unsure predictions. Extensive experiments on\nartificial and real-world datasets show that our method provides the best\ntrade-off between the number and accuracy of non-rejected predictions when\ncompared to our competitors, which use confidence thresholds for rejecting\nunsure predictions instead. When evaluated without the reject option, our\nnearest neighbor-based approach also achieves competitive prediction\nperformance.\n","authors":["Tobias Fuchs","Florian Kalinke","Klemens Böhm"],"pdf_url":"https://arxiv.org/pdf/2402.00592v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.07583v3","updated":"2024-06-05T14:03:57Z","published":"2023-05-12T16:25:57Z","title":"MoMo: Momentum Models for Adaptive Learning Rates","summary":"  Training a modern machine learning architecture on a new task requires\nextensive learning-rate tuning, which comes at a high computational cost. Here\nwe develop new Polyak-type adaptive learning rates that can be used on top of\nany momentum method, and require less tuning to perform well. We first develop\nMoMo, a Momentum Model based adaptive learning rate for SGD-M (stochastic\ngradient descent with momentum). MoMo uses momentum estimates of the losses and\ngradients sampled at each iteration to build a model of the loss function. Our\nmodel makes use of any known lower bound of the loss function by using\ntruncation, e.g. most losses are lower-bounded by zero. The model is then\napproximately minimized at each iteration to compute the next step. We show how\nMoMo can be used in combination with any momentum-based method, and showcase\nthis by developing MoMo-Adam, which is Adam with our new model-based adaptive\nlearning rate. We show that MoMo attains a $\\mathcal{O}(1/\\sqrt{K})$\nconvergence rate for convex problems with interpolation, needing knowledge of\nno problem-specific quantities other than the optimal value. Additionally, for\nlosses with unknown lower bounds, we develop on-the-fly estimates of a lower\nbound, that are incorporated in our model. We show that MoMo and MoMo-Adam\nimprove over SGD-M and Adam in terms of robustness to hyperparameter tuning for\ntraining image classifiers on MNIST, CIFAR, and Imagenet, for recommender\nsystems on Criteo, for a transformer model on the translation task IWSLT14, and\nfor a diffusion model.\n","authors":["Fabian Schaipp","Ruben Ohana","Michael Eickenberg","Aaron Defazio","Robert M. Gower"],"pdf_url":"https://arxiv.org/pdf/2305.07583v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.03288v1","updated":"2024-06-05T13:59:05Z","published":"2024-06-05T13:59:05Z","title":"Embarrassingly Parallel GFlowNets","summary":"  GFlowNets are a promising alternative to MCMC sampling for discrete\ncompositional random variables. Training GFlowNets requires repeated\nevaluations of the unnormalized target distribution or reward function.\nHowever, for large-scale posterior sampling, this may be prohibitive since it\nincurs traversing the data several times. Moreover, if the data are distributed\nacross clients, employing standard GFlowNets leads to intensive client-server\ncommunication. To alleviate both these issues, we propose embarrassingly\nparallel GFlowNet (EP-GFlowNet). EP-GFlowNet is a provably correct\ndivide-and-conquer method to sample from product distributions of the form\n$R(\\cdot) \\propto R_1(\\cdot) ... R_N(\\cdot)$ -- e.g., in parallel or federated\nBayes, where each $R_n$ is a local posterior defined on a data partition.\nFirst, in parallel, we train a local GFlowNet targeting each $R_n$ and send the\nresulting models to the server. Then, the server learns a global GFlowNet by\nenforcing our newly proposed \\emph{aggregating balance} condition, requiring a\nsingle communication step. Importantly, EP-GFlowNets can also be applied to\nmulti-objective optimization and model reuse. Our experiments illustrate the\nEP-GFlowNets's effectiveness on many tasks, including parallel Bayesian\nphylogenetics, multi-objective multiset, sequence generation, and federated\nBayesian structure learning.\n","authors":["Tiago da Silva","Luiz Max Carvalho","Amauri Souza","Samuel Kaski","Diego Mesquita"],"pdf_url":"https://arxiv.org/pdf/2406.03288v1.pdf","comment":"Accepted to ICML 2024"},{"id":"http://arxiv.org/abs/2406.03287v1","updated":"2024-06-05T13:59:03Z","published":"2024-06-05T13:59:03Z","title":"SpikeLM: Towards General Spike-Driven Language Modeling via Elastic\n  Bi-Spiking Mechanisms","summary":"  Towards energy-efficient artificial intelligence similar to the human brain,\nthe bio-inspired spiking neural networks (SNNs) have advantages of biological\nplausibility, event-driven sparsity, and binary activation. Recently,\nlarge-scale language models exhibit promising generalization capability, making\nit a valuable issue to explore more general spike-driven models. However, the\nbinary spikes in existing SNNs fail to encode adequate semantic information,\nplacing technological challenges for generalization. This work proposes the\nfirst fully spiking mechanism for general language tasks, including both\ndiscriminative and generative ones. Different from previous spikes with {0,1}\nlevels, we propose a more general spike formulation with bi-directional,\nelastic amplitude, and elastic frequency encoding, while still maintaining the\naddition nature of SNNs. In a single time step, the spike is enhanced by\ndirection and amplitude information; in spike frequency, a strategy to control\nspike firing rate is well designed. We plug this elastic bi-spiking mechanism\nin language modeling, named SpikeLM. It is the first time to handle general\nlanguage tasks with fully spike-driven models, which achieve much higher\naccuracy than previously possible. SpikeLM also greatly bridges the performance\ngap between SNNs and ANNs in language modeling. Our code is available at\nhttps://github.com/Xingrun-Xing/SpikeLM.\n","authors":["Xingrun Xing","Zheng Zhang","Ziyi Ni","Shitao Xiao","Yiming Ju","Siqi Fan","Yequan Wang","Jiajun Zhang","Guoqi Li"],"pdf_url":"https://arxiv.org/pdf/2406.03287v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.03280v1","updated":"2024-06-05T13:54:28Z","published":"2024-06-05T13:54:28Z","title":"FusionBench: A Comprehensive Benchmark of Deep Model Fusion","summary":"  Deep model fusion is an emerging technique that unifies the predictions or\nparameters of several deep neural networks into a single model in a\ncost-effective and data-efficient manner. This enables the unified model to\ntake advantage of the original models' strengths, potentially exceeding their\nperformance. Although a variety of deep model fusion techniques have been\nintroduced, their evaluations tend to be inconsistent and often inadequate to\nvalidate their effectiveness and robustness against distribution shifts. To\naddress this issue, we introduce FusionBench, which is the first comprehensive\nbenchmark dedicated to deep model fusion. FusionBench covers a wide range of\ntasks, including open-vocabulary image classification, text classification, and\ntext-to-text generation. Each category includes up to eight tasks with\ncorresponding task-specific models, featuring both full fine-tuning and LoRA\nfine-tuning, as well as models of different sizes, to ensure fair and balanced\ncomparisons of various multi-task model fusion techniques across different\ntasks, model scales, and fine-tuning strategies. We implement and evaluate a\nbroad spectrum of deep model fusion techniques. These techniques range from\nmodel ensemble methods, which combine the predictions to improve the overall\nperformance, to model merging, which integrates different models into a single\none, and model mixing methods, which upscale or recombine the components of the\noriginal models. FusionBench now contains 26 distinct tasks, 74 fine-tuned\nmodels, and 16 fusion techniques, and we are committed to consistently\nexpanding the benchmark with more tasks, models, and fusion techniques. In\naddition, we offer a well-documented set of resources and guidelines to aid\nresearchers in understanding and replicating the benchmark results. Homepage\nhttps://tanganke.github.io/fusion_bench/\n","authors":["Anke Tang","Li Shen","Yong Luo","Han Hu","Bo Do","Dacheng Tao"],"pdf_url":"https://arxiv.org/pdf/2406.03280v1.pdf","comment":"Project homepage: https://tanganke.github.io/fusion_bench/"},{"id":"http://arxiv.org/abs/2406.03278v1","updated":"2024-06-05T13:53:47Z","published":"2024-06-05T13:53:47Z","title":"Using GNN property predictors as molecule generators","summary":"  Graph neural networks (GNNs) have emerged as powerful tools to accurately\npredict materials and molecular properties in computational discovery\npipelines. In this article, we exploit the invertible nature of these neural\nnetworks to directly generate molecular structures with desired electronic\nproperties. Starting from a random graph or an existing molecule, we perform a\ngradient ascent while holding the GNN weights fixed in order to optimize its\ninput, the molecular graph, towards the target property. Valence rules are\nenforced strictly through a judicious graph construction. The method relies\nentirely on the property predictor; no additional training is required on\nmolecular structures. We demonstrate the application of this method by\ngenerating molecules with specific DFT-verified energy gaps and octanol-water\npartition coefficients (logP). Our approach hits target properties with rates\ncomparable to or better than state-of-the-art generative models while\nconsistently generating more diverse molecules.\n","authors":["Félix Therrien","Edward H. Sargent","Oleksandr Voznyy"],"pdf_url":"https://arxiv.org/pdf/2406.03278v1.pdf","comment":"7 pages, 2 figures, 2 tables"},{"id":"http://arxiv.org/abs/2406.03276v1","updated":"2024-06-05T13:53:20Z","published":"2024-06-05T13:53:20Z","title":"Revisiting Scalable Hessian Diagonal Approximations for Applications in\n  Reinforcement Learning","summary":"  Second-order information is valuable for many applications but challenging to\ncompute. Several works focus on computing or approximating Hessian diagonals,\nbut even this simplification introduces significant additional costs compared\nto computing a gradient. In the absence of efficient exact computation schemes\nfor Hessian diagonals, we revisit an early approximation scheme proposed by\nBecker and LeCun (1989, BL89), which has a cost similar to gradients and\nappears to have been overlooked by the community. We introduce HesScale, an\nimprovement over BL89, which adds negligible extra computation. On small\nnetworks, we find that this improvement is of higher quality than all\nalternatives, even those with theoretical guarantees, such as unbiasedness,\nwhile being much cheaper to compute. We use this insight in reinforcement\nlearning problems where small networks are used and demonstrate HesScale in\nsecond-order optimization and scaling the step-size parameter. In our\nexperiments, HesScale optimizes faster than existing methods and improves\nstability through step-size scaling. These findings are promising for scaling\nsecond-order methods in larger models in the future.\n","authors":["Mohamed Elsayed","Homayoon Farrahi","Felix Dangel","A. Rupam Mahmood"],"pdf_url":"https://arxiv.org/pdf/2406.03276v1.pdf","comment":"Published in the Proceedings of the 41st International Conference on\n  Machine Learning (ICML 2024). Code is available at\n  https://github.com/mohmdelsayed/HesScale. arXiv admin note: substantial text\n  overlap with arXiv:2210.11639"},{"id":"http://arxiv.org/abs/2406.03272v1","updated":"2024-06-05T13:50:59Z","published":"2024-06-05T13:50:59Z","title":"Multi-Microphone Speech Emotion Recognition using the Hierarchical\n  Token-semantic Audio Transformer Architecture","summary":"  Most emotion recognition systems fail in real-life situations (in the wild\nscenarios) where the audio is contaminated by reverberation. Our study explores\nnew methods to alleviate the performance degradation of Speech Emotion\nRecognition (SER) algorithms and develop a more robust system for adverse\nconditions. We propose processing multi-microphone signals to address these\nchallenges and improve emotion classification accuracy. We adopt a\nstate-of-the-art transformer model, the Hierarchical Token-semantic Audio\nTransformer (HTS-AT), to handle multi-channel audio inputs. We evaluate two\nstrategies: averaging mel-spectrograms across channels and summing\npatch-embedded representations. Our multimicrophone model achieves superior\nperformance compared to single-channel baselines when tested on real-world\nreverberant environments.\n","authors":["Ohad Cohen","Gershon Hazan","Sharon Gannot"],"pdf_url":"https://arxiv.org/pdf/2406.03272v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.05787v2","updated":"2024-06-05T13:44:00Z","published":"2024-02-08T16:24:44Z","title":"How do Transformers perform In-Context Autoregressive Learning?","summary":"  Transformers have achieved state-of-the-art performance in language modeling\ntasks. However, the reasons behind their tremendous success are still unclear.\nIn this paper, towards a better understanding, we train a Transformer model on\na simple next token prediction task, where sequences are generated as a\nfirst-order autoregressive process $s_{t+1} = W s_t$. We show how a trained\nTransformer predicts the next token by first learning $W$ in-context, then\napplying a prediction mapping. We call the resulting procedure in-context\nautoregressive learning. More precisely, focusing on commuting orthogonal\nmatrices $W$, we first show that a trained one-layer linear Transformer\nimplements one step of gradient descent for the minimization of an inner\nobjective function, when considering augmented tokens. When the tokens are not\naugmented, we characterize the global minima of a one-layer diagonal linear\nmulti-head Transformer. Importantly, we exhibit orthogonality between heads and\nshow that positional encoding captures trigonometric relations in the data. On\nthe experimental side, we consider the general case of non-commuting orthogonal\nmatrices and generalize our theoretical findings.\n","authors":["Michael E. Sander","Raja Giryes","Taiji Suzuki","Mathieu Blondel","Gabriel Peyré"],"pdf_url":"https://arxiv.org/pdf/2402.05787v2.pdf","comment":"20 pages ICML 2024"},{"id":"http://arxiv.org/abs/2406.03264v1","updated":"2024-06-05T13:41:26Z","published":"2024-06-05T13:41:26Z","title":"No-Regret Algorithms for Safe Bayesian Optimization with Monotonicity\n  Constraints","summary":"  We consider the problem of sequentially maximizing an unknown function $f$\nover a set of actions of the form $(s,\\mathbf{x})$, where the selected actions\nmust satisfy a safety constraint with respect to an unknown safety function\n$g$. We model $f$ and $g$ as lying in a reproducing kernel Hilbert space\n(RKHS), which facilitates the use of Gaussian process methods. While existing\nworks for this setting have provided algorithms that are guaranteed to identify\na near-optimal safe action, the problem of attaining low cumulative regret has\nremained largely unexplored, with a key challenge being that expanding the safe\nregion can incur high regret. To address this challenge, we show that if $g$ is\nmonotone with respect to just the single variable $s$ (with no such constraint\non $f$), sublinear regret becomes achievable with our proposed algorithm. In\naddition, we show that a modified version of our algorithm is able to attain\nsublinear regret (for suitably defined notions of regret) for the task of\nfinding a near-optimal $s$ corresponding to every $\\mathbf{x}$, as opposed to\nonly finding the global safe optimum. Our findings are supported with empirical\nevaluations on various objective and safety functions.\n","authors":["Arpan Losalka","Jonathan Scarlett"],"pdf_url":"https://arxiv.org/pdf/2406.03264v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.03263v1","updated":"2024-06-05T13:41:09Z","published":"2024-06-05T13:41:09Z","title":"Deep Generative Models for Proton Zero Degree Calorimeter Simulations in\n  ALICE, CERN","summary":"  Simulating detector responses is a crucial part of understanding the\ninner-workings of particle collisions in the Large Hadron Collider at CERN. The\ncurrent reliance on statistical Monte-Carlo simulations strains CERN's\ncomputational grid, underscoring the urgency for more efficient alternatives.\nAddressing these challenges, recent proposals advocate for generative machine\nlearning methods. In this study, we present an innovative deep learning\nsimulation approach tailored for the proton Zero Degree Calorimeter in the\nALICE experiment. Leveraging a Generative Adversarial Network model with\nSelective Diversity Increase loss, we directly simulate calorimeter responses.\nTo enhance its capabilities in modeling a broad range of calorimeter response\nintensities, we expand the SDI-GAN architecture with additional regularization.\nMoreover, to improve the spatial fidelity of the generated data, we introduce\nan auxiliary regressor network. Our method offers a significant speedup when\ncomparing to the traditional Monte-Carlo based approaches.\n","authors":["Patryk Będkowski","Jan Dubiński","Kamil Deja","Przemysław Rokita"],"pdf_url":"https://arxiv.org/pdf/2406.03263v1.pdf","comment":"8 pages, 3 figures, PP-RAI 2024 conference"},{"id":"http://arxiv.org/abs/2404.03528v3","updated":"2024-06-05T13:39:56Z","published":"2024-04-04T15:31:21Z","title":"BanglaAutoKG: Automatic Bangla Knowledge Graph Construction with\n  Semantic Neural Graph Filtering","summary":"  Knowledge Graphs (KGs) have proven essential in information processing and\nreasoning applications because they link related entities and give context-rich\ninformation, supporting efficient information retrieval and knowledge\ndiscovery; presenting information flow in a very effective manner. Despite\nbeing widely used globally, Bangla is relatively underrepresented in KGs due to\na lack of comprehensive datasets, encoders, NER (named entity recognition)\nmodels, POS (part-of-speech) taggers, and lemmatizers, hindering efficient\ninformation processing and reasoning applications in the language. Addressing\nthe KG scarcity in Bengali, we propose BanglaAutoKG, a pioneering framework\nthat is able to automatically construct Bengali KGs from any Bangla text. We\nutilize multilingual LLMs to understand various languages and correlate\nentities and relations universally. By employing a translation dictionary to\nidentify English equivalents and extracting word features from pre-trained BERT\nmodels, we construct the foundational KG. To reduce noise and align word\nembeddings with our goal, we employ graph-based polynomial filters. Lastly, we\nimplement a GNN-based semantic filter, which elevates contextual understanding\nand trims unnecessary edges, culminating in the formation of the definitive KG.\nEmpirical findings and case studies demonstrate the universal effectiveness of\nour model, capable of autonomously constructing semantically enriched KGs from\nany text.\n","authors":["Azmine Toushik Wasi","Taki Hasan Rafi","Raima Islam","Dong-Kyu Chae"],"pdf_url":"https://arxiv.org/pdf/2404.03528v3.pdf","comment":"7 pages, 3 figures. Accepted to LREC-COLING 2024. Read in ACL\n  Anthology: https://aclanthology.org/2024.lrec-main.189/"},{"id":"http://arxiv.org/abs/2312.17025v3","updated":"2024-06-05T13:39:20Z","published":"2023-12-28T13:50:42Z","title":"Experiential Co-Learning of Software-Developing Agents","summary":"  Recent advancements in large language models (LLMs) have brought significant\nchanges to various domains, especially through LLM-driven autonomous agents. A\nrepresentative scenario is in software development, where LLM agents\ndemonstrate efficient collaboration, task division, and assurance of software\nquality, markedly reducing the need for manual involvement. However, these\nagents frequently perform a variety of tasks independently, without benefiting\nfrom past experiences, which leads to repeated mistakes and inefficient\nattempts in multi-step task execution. To this end, we introduce Experiential\nCo-Learning, a novel LLM-agent learning framework in which instructor and\nassistant agents gather shortcut-oriented experiences from their historical\ntrajectories and use these past experiences for future task execution. The\nextensive experiments demonstrate that the framework enables agents to tackle\nunseen software-developing tasks more effectively. We anticipate that our\ninsights will guide LLM agents towards enhanced autonomy and contribute to\ntheir evolutionary growth in cooperative learning. The code and data are\navailable at https://github.com/OpenBMB/ChatDev.\n","authors":["Chen Qian","Yufan Dang","Jiahao Li","Wei Liu","Zihao Xie","Yifei Wang","Weize Chen","Cheng Yang","Xin Cong","Xiaoyin Che","Zhiyuan Liu","Maosong Sun"],"pdf_url":"https://arxiv.org/pdf/2312.17025v3.pdf","comment":"Accepted to ACL 2024, https://github.com/OpenBMB/ChatDev"},{"id":"http://arxiv.org/abs/2402.10024v2","updated":"2024-06-05T13:38:42Z","published":"2024-02-15T15:43:05Z","title":"Self-Augmented In-Context Learning for Unsupervised Word Translation","summary":"  Recent work has shown that, while large language models (LLMs) demonstrate\nstrong word translation or bilingual lexicon induction (BLI) capabilities in\nfew-shot setups, they still cannot match the performance of 'traditional'\nmapping-based approaches in the unsupervised scenario where no seed translation\npairs are available, especially for lower-resource languages. To address this\nchallenge with LLMs, we propose self-augmented in-context learning (SAIL) for\nunsupervised BLI: starting from a zero-shot prompt, SAIL iteratively induces a\nset of high-confidence word translation pairs for in-context learning (ICL)\nfrom an LLM, which it then reapplies to the same LLM in the ICL fashion. Our\nmethod shows substantial gains over zero-shot prompting of LLMs on two\nestablished BLI benchmarks spanning a wide range of language pairs, also\noutperforming mapping-based baselines across the board. In addition to\nachieving state-of-the-art unsupervised BLI performance, we also conduct\ncomprehensive analyses on SAIL and discuss its limitations.\n","authors":["Yaoyiran Li","Anna Korhonen","Ivan Vulić"],"pdf_url":"https://arxiv.org/pdf/2402.10024v2.pdf","comment":"ACL 2024 Main Conference; 11 Pages, 3 Figures, 9 Tables"},{"id":"http://arxiv.org/abs/2406.03260v1","updated":"2024-06-05T13:37:42Z","published":"2024-06-05T13:37:42Z","title":"Feature learning in finite-width Bayesian deep linear networks with\n  multiple outputs and convolutional layers","summary":"  Deep linear networks have been extensively studied, as they provide\nsimplified models of deep learning. However, little is known in the case of\nfinite-width architectures with multiple outputs and convolutional layers. In\nthis manuscript, we provide rigorous results for the statistics of functions\nimplemented by the aforementioned class of networks, thus moving closer to a\ncomplete characterization of feature learning in the Bayesian setting. Our\nresults include: (i) an exact and elementary non-asymptotic integral\nrepresentation for the joint prior distribution over the outputs, given in\nterms of a mixture of Gaussians; (ii) an analytical formula for the posterior\ndistribution in the case of squared error loss function (Gaussian likelihood);\n(iii) a quantitative description of the feature learning infinite-width regime,\nusing large deviation theory. From a physical perspective, deep architectures\nwith multiple outputs or convolutional layers represent different\nmanifestations of kernel shape renormalization, and our work provides a\ndictionary that translates this physics intuition and terminology into rigorous\nBayesian statistics.\n","authors":["Federico Bassetti","Marco Gherardi","Alessandro Ingrosso","Mauro Pastore","Pietro Rotondo"],"pdf_url":"https://arxiv.org/pdf/2406.03260v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.03258v1","updated":"2024-06-05T13:36:38Z","published":"2024-06-05T13:36:38Z","title":"Relaxed Quantile Regression: Prediction Intervals for Asymmetric Noise","summary":"  Constructing valid prediction intervals rather than point estimates is a\nwell-established approach for uncertainty quantification in the regression\nsetting. Models equipped with this capacity output an interval of values in\nwhich the ground truth target will fall with some prespecified probability.\nThis is an essential requirement in many real-world applications where simple\npoint predictions' inability to convey the magnitude and frequency of errors\nrenders them insufficient for high-stakes decisions. Quantile regression is a\nleading approach for obtaining such intervals via the empirical estimation of\nquantiles in the (non-parametric) distribution of outputs. This method is\nsimple, computationally inexpensive, interpretable, assumption-free, and\neffective. However, it does require that the specific quantiles being learned\nare chosen a priori. This results in (a) intervals that are arbitrarily\nsymmetric around the median which is sub-optimal for realistic skewed\ndistributions, or (b) learning an excessive number of intervals. In this work,\nwe propose Relaxed Quantile Regression (RQR), a direct alternative to quantile\nregression based interval construction that removes this arbitrary constraint\nwhilst maintaining its strengths. We demonstrate that this added flexibility\nresults in intervals with an improvement in desirable qualities (e.g. mean\nwidth) whilst retaining the essential coverage guarantees of quantile\nregression.\n","authors":["Thomas Pouplin","Alan Jeffares","Nabeel Seedat","Mihaela van der Schaar"],"pdf_url":"https://arxiv.org/pdf/2406.03258v1.pdf","comment":"Accepted at International Conference on Machine Learning (ICML) 2024"},{"id":"http://arxiv.org/abs/2406.03255v1","updated":"2024-06-05T13:35:48Z","published":"2024-06-05T13:35:48Z","title":"On the Maximal Local Disparity of Fairness-Aware Classifiers","summary":"  Fairness has become a crucial aspect in the development of trustworthy\nmachine learning algorithms. Current fairness metrics to measure the violation\nof demographic parity have the following drawbacks: (i) the average difference\nof model predictions on two groups cannot reflect their distribution disparity,\nand (ii) the overall calculation along all possible predictions conceals the\nextreme local disparity at or around certain predictions. In this work, we\npropose a novel fairness metric called Maximal Cumulative ratio Disparity along\nvarying Predictions' neighborhood (MCDP), for measuring the maximal local\ndisparity of the fairness-aware classifiers. To accurately and efficiently\ncalculate the MCDP, we develop a provably exact and an approximate calculation\nalgorithm that greatly reduces the computational complexity with low estimation\nerror. We further propose a bi-level optimization algorithm using a\ndifferentiable approximation of the MCDP for improving the algorithmic\nfairness. Extensive experiments on both tabular and image datasets validate\nthat our fair training algorithm can achieve superior fairness-accuracy\ntrade-offs.\n","authors":["Jinqiu Jin","Haoxuan Li","Fuli Feng"],"pdf_url":"https://arxiv.org/pdf/2406.03255v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.03253v1","updated":"2024-06-05T13:31:30Z","published":"2024-06-05T13:31:30Z","title":"Exploring Higher Order Structures in Graph Explanantions","summary":"  Recent advancements in graph learning contributed to explaining predictions\ngenerated by Graph Neural Networks. However, existing methodologies often fall\nshort when applied to real-world datasets. We introduce HOGE, a framework to\ncapture higher-order structures using cell complexes, which excel at modeling\nhigher-order relationships. In the real world, higher-order structures are\nubiquitous like in molecules or social networks, thus our work significantly\nenhances the practical applicability of graph explanations. HOGE produces\nclearer and more accurate explanations compared to prior methods. Our method\ncan be integrated with all existing graph explainers, ensuring seamless\nintegration into current frameworks. We evaluate on GraphXAI benchmark\ndatasets, HOGE achieves improved or comparable performance with minimal\ncomputational overhead. Ablation studies show that the performance gain\nobserved can be attributed to the higher-order structures that come from\nintroducing cell complexes.\n","authors":["Akshit Sinha","Sreeram Vennam","Charu Sharma","Ponnurangam Kumaraguru"],"pdf_url":"https://arxiv.org/pdf/2406.03253v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.16124v2","updated":"2024-06-05T13:26:48Z","published":"2023-12-26T17:18:09Z","title":"Olfactory Label Prediction on Aroma-Chemical Pairs","summary":"  The application of deep learning techniques on aroma-chemicals has resulted\nin models more accurate than human experts at predicting olfactory qualities.\nHowever, public research in this domain has been limited to predicting the\nqualities of single molecules, whereas in industry applications, perfumers and\nfood scientists are often concerned with blends of many molecules. In this\npaper, we apply both existing and novel approaches to a dataset we gathered\nconsisting of labeled pairs of molecules. We present graph neural network\nmodels capable of accurately predicting the odor qualities arising from blends\nof aroma-chemicals, with an analysis of how variations in architecture can lead\nto significant differences in predictive power.\n","authors":["Laura Sisson","Aryan Amit Barsainyan","Mrityunjay Sharma","Ritesh Kumar"],"pdf_url":"https://arxiv.org/pdf/2312.16124v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.03249v1","updated":"2024-06-05T13:26:25Z","published":"2024-06-05T13:26:25Z","title":"Near-field Beamforming for Extremely Large-scale MIMO Based on\n  Unsupervised Deep Learning","summary":"  Extremely Large-scale Array (ELAA) is considered a frontier technology for\nfuture communication systems, pivotal in improving wireless systems' rate and\nspectral efficiency. However, as ELAA employs a multitude of antennas operating\nat higher frequencies, users are typically situated in the near-field region\nwhere the spherical wavefront propagates. This inevitably leads to a\nsignificant increase in the overhead of beam training, requiring complex\ntwo-dimensional beam searching in both the angle domain and the distance\ndomain. To address this problem, we propose a near-field beamforming method\nbased on unsupervised deep learning. Our convolutional neural network\nefficiently extracts complex channel state information features by\nstrategically selecting padding and kernel size. We optimize the beamformers to\nmaximize achievable rates in a multi-user network without relying on predefined\ncustom codebooks. Upon deployment, the model requires solely the input of\npre-estimated channel state information to derive the optimal beamforming\nvector. Simulation results show that our proposed scheme can obtain stable\nbeamforming gain compared with the baseline scheme. Furthermore, owing to the\ninherent traits of deep learning methodologies, this approach substantially\ndiminishes the beam training costs in near-field regions.\n","authors":["Jiali Nie","Yuanhao Cui","Zhaohui Yang","Weijie Yuan","Xiaojun Jing"],"pdf_url":"https://arxiv.org/pdf/2406.03249v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.13086v2","updated":"2024-06-05T13:26:16Z","published":"2024-03-19T18:32:48Z","title":"Listenable Maps for Audio Classifiers","summary":"  Despite the impressive performance of deep learning models across diverse\ntasks, their complexity poses challenges for interpretation. This challenge is\nparticularly evident for audio signals, where conveying interpretations becomes\ninherently difficult. To address this issue, we introduce Listenable Maps for\nAudio Classifiers (L-MAC), a posthoc interpretation method that generates\nfaithful and listenable interpretations. L-MAC utilizes a decoder on top of a\npretrained classifier to generate binary masks that highlight relevant portions\nof the input audio. We train the decoder with a loss function that maximizes\nthe confidence of the classifier decision on the masked-in portion of the audio\nwhile minimizing the probability of model output for the masked-out portion.\nQuantitative evaluations on both in-domain and out-of-domain data demonstrate\nthat L-MAC consistently produces more faithful interpretations than several\ngradient and masking-based methodologies. Furthermore, a user study confirms\nthat, on average, users prefer the interpretations generated by the proposed\ntechnique.\n","authors":["Francesco Paissan","Mirco Ravanelli","Cem Subakan"],"pdf_url":"https://arxiv.org/pdf/2403.13086v2.pdf","comment":"Accepted to ICML 2024"},{"id":"http://arxiv.org/abs/2406.03242v1","updated":"2024-06-05T13:18:55Z","published":"2024-06-05T13:18:55Z","title":"Variational Pseudo Marginal Methods for Jet Reconstruction in Particle\n  Physics","summary":"  Reconstructing jets, which provide vital insights into the properties and\nhistories of subatomic particles produced in high-energy collisions, is a main\nproblem in data analyses in collider physics. This intricate task deals with\nestimating the latent structure of a jet (binary tree) and involves parameters\nsuch as particle energy, momentum, and types. While Bayesian methods offer a\nnatural approach for handling uncertainty and leveraging prior knowledge, they\nface significant challenges due to the super-exponential growth of potential\njet topologies as the number of observed particles increases. To address this,\nwe introduce a Combinatorial Sequential Monte Carlo approach for inferring jet\nlatent structures. As a second contribution, we leverage the resulting\nestimator to develop a variational inference algorithm for parameter learning.\nBuilding on this, we introduce a variational family using a pseudo-marginal\nframework for a fully Bayesian treatment of all variables, unifying the\ngenerative model with the inference process. We illustrate our method's\neffectiveness through experiments using data generated with a collider physics\ngenerative model, highlighting superior speed and accuracy across a range of\ntasks.\n","authors":["Hanming Yang","Antonio Khalil Moretti","Sebastian Macaluso","Philippe Chlenski","Christian A. Naesseth","Itsik Pe'er"],"pdf_url":"https://arxiv.org/pdf/2406.03242v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.02422v2","updated":"2024-06-05T13:17:23Z","published":"2024-06-04T15:39:49Z","title":"IterMask2: Iterative Unsupervised Anomaly Segmentation via Spatial and\n  Frequency Masking for Brain Lesions in MRI","summary":"  Unsupervised anomaly segmentation approaches to pathology segmentation train\na model on images of healthy subjects, that they define as the 'normal' data\ndistribution. At inference, they aim to segment any pathologies in new images\nas 'anomalies', as they exhibit patterns that deviate from those in 'normal'\ntraining data. Prevailing methods follow the 'corrupt-and-reconstruct'\nparadigm. They intentionally corrupt an input image, reconstruct it to follow\nthe learned 'normal' distribution, and subsequently segment anomalies based on\nreconstruction error. Corrupting an input image, however, inevitably leads to\nsuboptimal reconstruction even of normal regions, causing false positives. To\nalleviate this, we propose a novel iterative spatial mask-refining strategy\nIterMask2. We iteratively mask areas of the image, reconstruct them, and update\nthe mask based on reconstruction error. This iterative process progressively\nadds information about areas that are confidently normal as per the model. The\nincreasing content guides reconstruction of nearby masked areas, improving\nreconstruction of normal tissue under these areas, reducing false positives. We\nalso use high-frequency image content as an auxiliary input to provide\nadditional structural information for masked areas. This further improves\nreconstruction error of normal in comparison to anomalous areas, facilitating\nsegmentation of the latter. We conduct experiments on several brain lesion\ndatasets and demonstrate effectiveness of our method. Code is available at:\nhttps://github.com/ZiyunLiang/IterMask2\n","authors":["Ziyun Liang","Xiaoqing Guo","J. Alison Noble","Konstantinos Kamnitsas"],"pdf_url":"https://arxiv.org/pdf/2406.02422v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.03234v1","updated":"2024-06-05T13:13:58Z","published":"2024-06-05T13:13:58Z","title":"Fine-Grained Causal Dynamics Learning with Quantization for Improving\n  Robustness in Reinforcement Learning","summary":"  Causal dynamics learning has recently emerged as a promising approach to\nenhancing robustness in reinforcement learning (RL). Typically, the goal is to\nbuild a dynamics model that makes predictions based on the causal relationships\namong the entities. Despite the fact that causal connections often manifest\nonly under certain contexts, existing approaches overlook such fine-grained\nrelationships and lack a detailed understanding of the dynamics. In this work,\nwe propose a novel dynamics model that infers fine-grained causal structures\nand employs them for prediction, leading to improved robustness in RL. The key\nidea is to jointly learn the dynamics model with a discrete latent variable\nthat quantizes the state-action space into subgroups. This leads to recognizing\nmeaningful context that displays sparse dependencies, where causal structures\nare learned for each subgroup throughout the training. Experimental results\ndemonstrate the robustness of our method to unseen states and locally spurious\ncorrelations in downstream tasks where fine-grained causal reasoning is\ncrucial. We further illustrate the effectiveness of our subgroup-based approach\nwith quantization in discovering fine-grained causal relationships compared to\nprior methods.\n","authors":["Inwoo Hwang","Yunhyeok Kwak","Suhyung Choi","Byoung-Tak Zhang","Sanghack Lee"],"pdf_url":"https://arxiv.org/pdf/2406.03234v1.pdf","comment":"ICML 2024"},{"id":"http://arxiv.org/abs/2405.02678v3","updated":"2024-06-05T13:12:17Z","published":"2024-05-04T14:43:31Z","title":"Position: Quo Vadis, Unsupervised Time Series Anomaly Detection?","summary":"  The current state of machine learning scholarship in Timeseries Anomaly\nDetection (TAD) is plagued by the persistent use of flawed evaluation metrics,\ninconsistent benchmarking practices, and a lack of proper justification for the\nchoices made in novel deep learning-based model designs. Our paper presents a\ncritical analysis of the status quo in TAD, revealing the misleading track of\ncurrent research and highlighting problematic methods, and evaluation\npractices. Our position advocates for a shift in focus from solely pursuing\nnovel model designs to improving benchmarking practices, creating non-trivial\ndatasets, and critically evaluating the utility of complex methods against\nsimpler baselines. Our findings demonstrate the need for rigorous evaluation\nprotocols, the creation of simple baselines, and the revelation that\nstate-of-the-art deep anomaly detection models effectively learn linear\nmappings. These findings suggest the need for more exploration and development\nof simple and interpretable TAD methods. The increment of model complexity in\nthe state-of-the-art deep-learning based models unfortunately offers very\nlittle improvement. We offer insights and suggestions for the field to move\nforward.\n  Code: https://github.com/ssarfraz/QuoVadisTAD\n","authors":["M. Saquib Sarfraz","Mei-Yen Chen","Lukas Layer","Kunyu Peng","Marios Koulakis"],"pdf_url":"https://arxiv.org/pdf/2405.02678v3.pdf","comment":"ICML 2024"},{"id":"http://arxiv.org/abs/2403.18947v2","updated":"2024-06-05T13:07:17Z","published":"2024-02-21T15:17:20Z","title":"Self-Supervised Interpretable End-to-End Learning via Latent Functional\n  Modularity","summary":"  We introduce MoNet, a novel functionally modular network for self-supervised\nand interpretable end-to-end learning. By leveraging its functional modularity\nwith a latent-guided contrastive loss function, MoNet efficiently learns\ntask-specific decision-making processes in latent space without requiring\ntask-level supervision. Moreover, our method incorporates an online, post-hoc\nexplainability approach that enhances the interpretability of end-to-end\ninferences without compromising sensorimotor control performance. In real-world\nindoor environments, MoNet demonstrates effective visual autonomous navigation,\noutperforming baseline models by 7% to 28% in task specificity analysis. We\nfurther explore the interpretability of our network through post-hoc analysis\nof perceptual saliency maps and latent decision vectors. This provides valuable\ninsights into the incorporation of explainable artificial intelligence into\nrobotic learning, encompassing both perceptual and behavioral perspectives.\nSupplementary materials are available at\nhttps://sites.google.com/view/monet-lgc.\n","authors":["Hyunki Seong","David Hyunchul Shim"],"pdf_url":"https://arxiv.org/pdf/2403.18947v2.pdf","comment":"12 pages, 9 figures. Accepted at ICML 2024. Camera-ready version"},{"id":"http://arxiv.org/abs/2406.03231v1","updated":"2024-06-05T13:06:52Z","published":"2024-06-05T13:06:52Z","title":"CommonPower: Supercharging Machine Learning for Smart Grids","summary":"  The growing complexity of power system management has led to an increased\ninterest in the use of reinforcement learning (RL). However, no tool for\ncomprehensive and realistic benchmarking of RL in smart grids exists. One\nprerequisite for such a comparison is a safeguarding mechanism since vanilla RL\ncontrollers can not guarantee the satisfaction of system constraints. Other\ncentral requirements include flexible modeling of benchmarking scenarios,\ncredible baselines, and the possibility to investigate the impact of forecast\nuncertainties. Our Python tool CommonPower is the first modular framework\naddressing these needs. CommonPower offers a unified interface for single-agent\nand multi-agent RL training algorithms and includes a built-in model predictive\ncontrol approach based on a symbolic representation of the system equations.\nThis makes it possible to combine model predictive controllers with RL\ncontrollers in the same system. Leveraging the symbolic system model,\nCommonPower facilitates the study of safeguarding strategies via the flexible\nformulation of safety layers. Furthermore equipped with a generic forecasting\ninterface, CommonPower constitutes a versatile tool significantly augmenting\nthe exploration of safe RL controllers in smart grids on several dimensions.\n","authors":["Michael Eichelbeck","Hannah Markgraf","Matthias Althoff"],"pdf_url":"https://arxiv.org/pdf/2406.03231v1.pdf","comment":"For the corresponding code repository, see\n  https://github.com/TUMcps/commonpower"},{"id":"http://arxiv.org/abs/2406.03230v1","updated":"2024-06-05T13:06:33Z","published":"2024-06-05T13:06:33Z","title":"Defending Large Language Models Against Attacks With Residual Stream\n  Activation Analysis","summary":"  The widespread adoption of Large Language Models (LLMs), exemplified by\nOpenAI's ChatGPT, brings to the forefront the imperative to defend against\nadversarial threats on these models. These attacks, which manipulate an LLM's\noutput by introducing malicious inputs, undermine the model's integrity and the\ntrust users place in its outputs. In response to this challenge, our paper\npresents an innovative defensive strategy, given white box access to an LLM,\nthat harnesses residual activation analysis between transformer layers of the\nLLM. We apply an established methodology for analyzing distinctive activation\npatterns in the residual streams for a novel result of attack prompt\nclassification. We curate multiple datasets to demonstrate how this method of\nclassification has high accuracy across multiple types of attack scenarios,\nincluding our newly-created attack dataset. Furthermore, we enhance the model's\nresilience by integrating safety fine-tuning techniques for LLMs in order to\nmeasure its effect on our capability to detect attacks. The results underscore\nthe effectiveness of our approach in enhancing the detection and mitigation of\nadversarial inputs, advancing the security framework within which LLMs operate.\n","authors":["Amelia Kawasaki","Andrew Davis","Houssam Abbas"],"pdf_url":"https://arxiv.org/pdf/2406.03230v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.03229v1","updated":"2024-06-05T13:06:17Z","published":"2024-06-05T13:06:17Z","title":"Global Clipper: Enhancing Safety and Reliability of Transformer-based\n  Object Detection Models","summary":"  As transformer-based object detection models progress, their impact in\ncritical sectors like autonomous vehicles and aviation is expected to grow.\nSoft errors causing bit flips during inference have significantly impacted DNN\nperformance, altering predictions. Traditional range restriction solutions for\nCNNs fall short for transformers. This study introduces the Global Clipper and\nGlobal Hybrid Clipper, effective mitigation strategies specifically designed\nfor transformer-based models. It significantly enhances their resilience to\nsoft errors and reduces faulty inferences to ~ 0\\%. We also detail extensive\ntesting across over 64 scenarios involving two transformer models (DINO-DETR\nand Lite-DETR) and two CNN models (YOLOv3 and SSD) using three datasets,\ntotalling approximately 3.3 million inferences, to assess model robustness\ncomprehensively. Moreover, the paper explores unique aspects of attention\nblocks in transformers and their operational differences from CNNs.\n","authors":["Qutub Syed Sha","Michael Paulitsch","Karthik Pattabiraman","Korbinian Hagn","Fabian Oboril","Cornelius Buerkle","Kay-Ulrich Scholl","Gereon Hinz","Alois Knoll"],"pdf_url":"https://arxiv.org/pdf/2406.03229v1.pdf","comment":"Accepted at IJCAI-AISafety'24 Workshop"},{"id":"http://arxiv.org/abs/2402.01401v3","updated":"2024-06-05T13:04:20Z","published":"2024-02-02T13:33:30Z","title":"An Information Theoretic Approach to Machine Unlearning","summary":"  To comply with AI and data regulations, the need to forget private or\ncopyrighted information from trained machine learning models is increasingly\nimportant. The key challenge in unlearning is forgetting the necessary data in\na timely manner, while preserving model performance. In this work, we address\nthe zero-shot unlearning scenario, whereby an unlearning algorithm must be able\nto remove data given only a trained model and the data to be forgotten. We\nexplore unlearning from an information theoretic perspective, connecting the\ninfluence of a sample to the information gain a model receives by observing it.\nFrom this, we derive a simple but principled zero-shot unlearning method based\non the geometry of the model. Our approach takes the form of minimising the\ngradient of a learned function with respect to a small neighbourhood around a\ntarget forget point. This induces a smoothing effect, causing forgetting by\nmoving the boundary of the classifier. We explore the intuition behind why this\napproach can jointly unlearn forget samples while preserving general model\nperformance through a series of low-dimensional experiments. We perform\nextensive empirical evaluation of our method over a range of contemporary\nbenchmarks, verifying that our method is competitive with state-of-the-art\nperformance under the strict constraints of zero-shot unlearning.\n","authors":["Jack Foster","Kyle Fogarty","Stefan Schoepf","Cengiz Öztireli","Alexandra Brintrup"],"pdf_url":"https://arxiv.org/pdf/2402.01401v3.pdf","comment":"Updated, new low-dimensional experiments and updated perspective on\n  unlearning from an information theoretic view"},{"id":"http://arxiv.org/abs/2404.13604v2","updated":"2024-06-05T12:54:39Z","published":"2024-04-21T10:26:13Z","title":"CKGConv: General Graph Convolution with Continuous Kernels","summary":"  The existing definitions of graph convolution, either from spatial or\nspectral perspectives, are inflexible and not unified. Defining a general\nconvolution operator in the graph domain is challenging due to the lack of\ncanonical coordinates, the presence of irregular structures, and the properties\nof graph symmetries. In this work, we propose a novel and general graph\nconvolution framework by parameterizing the kernels as continuous functions of\npseudo-coordinates derived via graph positional encoding. We name this\nContinuous Kernel Graph Convolution (CKGConv). Theoretically, we demonstrate\nthat CKGConv is flexible and expressive. CKGConv encompasses many existing\ngraph convolutions, and exhibits a stronger expressiveness, as powerful as\ngraph transformers in terms of distinguishing non-isomorphic graphs.\nEmpirically, we show that CKGConv-based Networks outperform existing graph\nconvolutional networks and perform comparably to the best graph transformers\nacross a variety of graph datasets. The code and models are publicly available\nat https://github.com/networkslab/CKGConv.\n","authors":["Liheng Ma","Soumyasundar Pal","Yitian Zhang","Jiaming Zhou","Yingxue Zhang","Mark Coates"],"pdf_url":"https://arxiv.org/pdf/2404.13604v2.pdf","comment":"On International Conference on Machine Learning (ICML) 2024"},{"id":"http://arxiv.org/abs/2406.03216v1","updated":"2024-06-05T12:53:37Z","published":"2024-06-05T12:53:37Z","title":"Choice of PEFT Technique in Continual Learning: Prompt Tuning is Not All\n  You Need","summary":"  Recent Continual Learning (CL) methods have combined pretrained Transformers\nwith prompt tuning, a parameter-efficient fine-tuning (PEFT) technique. We\nargue that the choice of prompt tuning in prior works was an undefended and\nunablated decision, which has been uncritically adopted by subsequent research,\nbut warrants further research to understand its implications. In this paper, we\nconduct this research and find that the choice of prompt tuning as a PEFT\nmethod hurts the overall performance of the CL system. To illustrate this, we\nreplace prompt tuning with LoRA in two state-of-the-art continual learning\nmethods: Learning to Prompt and S-Prompts. These variants consistently achieve\nhigher accuracy across a wide range of domain-incremental and class-incremental\nbenchmarks, while being competitive in inference speed. Our work highlights a\ncrucial argument: unexamined choices can hinder progress in the field, and\nrigorous ablations, such as the PEFT method, are required to drive meaningful\nadoption of CL techniques in real-world applications.\n","authors":["Martin Wistuba","Prabhu Teja Sivaprasad","Lukas Balles","Giovanni Zappella"],"pdf_url":"https://arxiv.org/pdf/2406.03216v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.03212v1","updated":"2024-06-05T12:51:20Z","published":"2024-06-05T12:51:20Z","title":"Inferring the time-varying coupling of dynamical systems with temporal\n  convolutional autoencoders","summary":"  Most approaches for assessing causality in complex dynamical systems fail\nwhen the interactions between variables are inherently non-linear and\nnon-stationary. Here we introduce Temporal Autoencoders for Causal Inference\n(TACI), a methodology that combines a new surrogate data metric for assessing\ncausal interactions with a novel two-headed machine learning architecture to\nidentify and measure the direction and strength of time-varying causal\ninteractions. Through tests on both synthetic and real-world datasets, we\ndemonstrate TACI's ability to accurately quantify dynamic causal interactions\nacross a variety of systems. Our findings display the method's effectiveness\ncompared to existing approaches and also highlight our approach's potential to\nbuild a deeper understanding of the mechanisms that underlie time-varying\ninteractions in physical and biological systems.\n","authors":["Josuan Calderon","Gordon J. Berman"],"pdf_url":"https://arxiv.org/pdf/2406.03212v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.18705v2","updated":"2024-06-05T12:51:08Z","published":"2024-03-27T15:54:55Z","title":"Conditional Wasserstein Distances with Applications in Bayesian OT Flow\n  Matching","summary":"  In inverse problems, many conditional generative models approximate the\nposterior measure by minimizing a distance between the joint measure and its\nlearned approximation. While this approach also controls the distance between\nthe posterior measures in the case of the Kullback--Leibler divergence, this is\nin general not hold true for the Wasserstein distance. In this paper, we\nintroduce a conditional Wasserstein distance via a set of restricted couplings\nthat equals the expected Wasserstein distance of the posteriors. Interestingly,\nthe dual formulation of the conditional Wasserstein-1 flow resembles losses in\nthe conditional Wasserstein GAN literature in a quite natural way. We derive\ntheoretical properties of the conditional Wasserstein distance, characterize\nthe corresponding geodesics and velocity fields as well as the flow ODEs.\nSubsequently, we propose to approximate the velocity fields by relaxing the\nconditional Wasserstein distance. Based on this, we propose an extension of OT\nFlow Matching for solving Bayesian inverse problems and demonstrate its\nnumerical advantages on an inverse problem and class-conditional image\ngeneration.\n","authors":["Jannis Chemseddine","Paul Hagemann","Gabriele Steidl","Christian Wald"],"pdf_url":"https://arxiv.org/pdf/2403.18705v2.pdf","comment":"This paper supersedes arXiv:2310.13433"},{"id":"http://arxiv.org/abs/2406.03209v1","updated":"2024-06-05T12:45:23Z","published":"2024-06-05T12:45:23Z","title":"Challenges and Considerations in the Evaluation of Bayesian Causal\n  Discovery","summary":"  Representing uncertainty in causal discovery is a crucial component for\nexperimental design, and more broadly, for safe and reliable causal decision\nmaking. Bayesian Causal Discovery (BCD) offers a principled approach to\nencapsulating this uncertainty. Unlike non-Bayesian causal discovery, which\nrelies on a single estimated causal graph and model parameters for assessment,\nevaluating BCD presents challenges due to the nature of its inferred quantity -\nthe posterior distribution. As a result, the research community has proposed\nvarious metrics to assess the quality of the approximate posterior. However,\nthere is, to date, no consensus on the most suitable metric(s) for evaluation.\nIn this work, we reexamine this question by dissecting various metrics and\nunderstanding their limitations. Through extensive empirical evaluation, we\nfind that many existing metrics fail to exhibit a strong correlation with the\nquality of approximation to the true posterior, especially in scenarios with\nlow sample sizes where BCD is most desirable. We highlight the suitability (or\nlack thereof) of these metrics under two distinct factors: the identifiability\nof the underlying causal model and the quantity of available data. Both factors\naffect the entropy of the true posterior, indicating that the current metrics\nare less fitting in settings of higher entropy. Our findings underline the\nimportance of a more nuanced evaluation of new methods by taking into account\nthe nature of the true posterior, as well as guide and motivate the development\nof new evaluation procedures for this challenge.\n","authors":["Amir Mohammad Karimi Mamaghan","Panagiotis Tigas","Karl Henrik Johansson","Yarin Gal","Yashas Annadani","Stefan Bauer"],"pdf_url":"https://arxiv.org/pdf/2406.03209v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.00112v2","updated":"2024-06-05T12:36:39Z","published":"2023-09-29T19:55:56Z","title":"Reinforcement Learning for Node Selection in Branch-and-Bound","summary":"  A big challenge in branch and bound lies in identifying the optimal node\nwithin the search tree from which to proceed. Current state-of-the-art\nselectors utilize either hand-crafted ensembles that automatically switch\nbetween naive sub-node selectors, or learned node selectors that rely on\nindividual node data. We propose a novel simulation technique that uses\nreinforcement learning (RL) while considering the entire tree state, rather\nthan just isolated nodes. To achieve this, we train a graph neural network that\nproduces a probability distribution based on the path from the model's root to\nits \"to-be-selected\" leaves. Modelling node-selection as a probability\ndistribution allows us to train the model using state-of-the-art RL techniques\nthat capture both intrinsic node-quality and node-evaluation costs. Our method\ninduces a high quality node selection policy on a set of varied and complex\nproblem sets, despite only being trained on specially designed, synthetic\ntravelling salesmen problem (TSP) instances. Using such a fixed pretrained\npolicy shows significant improvements on several benchmarks in optimality gap\nreductions and per-node efficiency under strict time constraints.\n","authors":["Alexander Mattick","Christopher Mutschler"],"pdf_url":"https://arxiv.org/pdf/2310.00112v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.01702v2","updated":"2024-06-05T12:27:17Z","published":"2024-05-02T19:55:30Z","title":"Optimization without Retraction on the Random Generalized Stiefel\n  Manifold","summary":"  Optimization over the set of matrices $X$ that satisfy $X^\\top B X = I_p$,\nreferred to as the generalized Stiefel manifold, appears in many applications\ninvolving sampled covariance matrices such as the canonical correlation\nanalysis (CCA), independent component analysis (ICA), and the generalized\neigenvalue problem (GEVP). Solving these problems is typically done by\niterative methods that require a fully formed $B$. We propose a cheap\nstochastic iterative method that solves the optimization problem while having\naccess only to a random estimates of $B$. Our method does not enforce the\nconstraint in every iteration; instead, it produces iterations that converge to\ncritical points on the generalized Stiefel manifold defined in expectation. The\nmethod has lower per-iteration cost, requires only matrix multiplications, and\nhas the same convergence rates as its Riemannian optimization counterparts that\nrequire the full matrix $B$. Experiments demonstrate its effectiveness in\nvarious machine learning applications involving generalized orthogonality\nconstraints, including CCA, ICA, and the GEVP.\n","authors":["Simon Vary","Pierre Ablin","Bin Gao","P. -A. Absil"],"pdf_url":"https://arxiv.org/pdf/2405.01702v2.pdf","comment":"This v2 is the camera-ready version for ICML 2024"},{"id":"http://arxiv.org/abs/2403.09976v2","updated":"2024-06-05T12:25:53Z","published":"2024-03-15T02:46:19Z","title":"AD3: Implicit Action is the Key for World Models to Distinguish the\n  Diverse Visual Distractors","summary":"  Model-based methods have significantly contributed to distinguishing\ntask-irrelevant distractors for visual control. However, prior research has\nprimarily focused on heterogeneous distractors like noisy background videos,\nleaving homogeneous distractors that closely resemble controllable agents\nlargely unexplored, which poses significant challenges to existing methods. To\ntackle this problem, we propose Implicit Action Generator (IAG) to learn the\nimplicit actions of visual distractors, and present a new algorithm named\nimplicit Action-informed Diverse visual Distractors Distinguisher (AD3), that\nleverages the action inferred by IAG to train separated world models. Implicit\nactions effectively capture the behavior of background distractors, aiding in\ndistinguishing the task-irrelevant components, and the agent can optimize the\npolicy within the task-relevant state space. Our method achieves superior\nperformance on various visual control tasks featuring both heterogeneous and\nhomogeneous distractors. The indispensable role of implicit actions learned by\nIAG is also empirically validated.\n","authors":["Yucen Wang","Shenghua Wan","Le Gan","Shuai Feng","De-Chuan Zhan"],"pdf_url":"https://arxiv.org/pdf/2403.09976v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.02176v2","updated":"2024-06-05T12:23:46Z","published":"2024-06-04T10:12:09Z","title":"AROMA: Preserving Spatial Structure for Latent PDE Modeling with Local\n  Neural Fields","summary":"  We present AROMA (Attentive Reduced Order Model with Attention), a framework\ndesigned to enhance the modeling of partial differential equations (PDEs) using\nlocal neural fields. Our flexible encoder-decoder architecture can obtain\nsmooth latent representations of spatial physical fields from a variety of data\ntypes, including irregular-grid inputs and point clouds. This versatility\neliminates the need for patching and allows efficient processing of diverse\ngeometries. The sequential nature of our latent representation can be\ninterpreted spatially and permits the use of a conditional transformer for\nmodeling the temporal dynamics of PDEs. By employing a diffusion-based\nformulation, we achieve greater stability and enable longer rollouts compared\nto conventional MSE training. AROMA's superior performance in simulating 1D and\n2D equations underscores the efficacy of our approach in capturing complex\ndynamical behaviors.\n","authors":["Louis Serrano","Thomas X Wang","Etienne Le Naour","Jean-Noël Vittaut","Patrick Gallinari"],"pdf_url":"https://arxiv.org/pdf/2406.02176v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.03193v1","updated":"2024-06-05T12:23:02Z","published":"2024-06-05T12:23:02Z","title":"Graph Neural Network Explanations are Fragile","summary":"  Explainable Graph Neural Network (GNN) has emerged recently to foster the\ntrust of using GNNs. Existing GNN explainers are developed from various\nperspectives to enhance the explanation performance. We take the first step to\nstudy GNN explainers under adversarial attack--We found that an adversary\nslightly perturbing graph structure can ensure GNN model makes correct\npredictions, but the GNN explainer yields a drastically different explanation\non the perturbed graph. Specifically, we first formulate the attack problem\nunder a practical threat model (i.e., the adversary has limited knowledge about\nthe GNN explainer and a restricted perturbation budget). We then design two\nmethods (i.e., one is loss-based and the other is deduction-based) to realize\nthe attack. We evaluate our attacks on various GNN explainers and the results\nshow these explainers are fragile.\n","authors":["Jiate Li","Meng Pang","Yun Dong","Jinyuan Jia","Binghui Wang"],"pdf_url":"https://arxiv.org/pdf/2406.03193v1.pdf","comment":"17 pages, 64 figures"},{"id":"http://arxiv.org/abs/2403.07004v2","updated":"2024-06-05T12:20:29Z","published":"2024-03-07T13:14:21Z","title":"Convergence of Some Convex Message Passing Algorithms to a Fixed Point","summary":"  A popular approach to the MAP inference problem in graphical models is to\nminimize an upper bound obtained from a dual linear programming or Lagrangian\nrelaxation by (block-)coordinate descent. This is also known as\nconvex/convergent message passing; examples are max-sum diffusion and\nsequential tree-reweighted message passing (TRW-S). Convergence properties of\nthese methods are currently not fully understood. They have been proved to\nconverge to the set characterized by local consistency of active constraints,\nwith unknown convergence rate; however, it was not clear if the iterates\nconverge at all (to any point). We prove a stronger result (conjectured before\nbut never proved): the iterates converge to a fixed point of the method.\nMoreover, we show that the algorithm terminates within\n$\\mathcal{O}(1/\\varepsilon)$ iterations. We first prove this for a version of\ncoordinate descent applied to a general piecewise-affine convex objective. Then\nwe show that several convex message passing methods are special cases of this\nmethod. Finally, we show that a slightly different version of coordinate\ndescent can cycle.\n","authors":["Vaclav Voracek","Tomas Werner"],"pdf_url":"https://arxiv.org/pdf/2403.07004v2.pdf","comment":"ICML 2024; comments are welcome"},{"id":"http://arxiv.org/abs/2406.02309v2","updated":"2024-06-05T12:10:16Z","published":"2024-06-04T13:41:00Z","title":"Effects of Exponential Gaussian Distribution on (Double Sampling)\n  Randomized Smoothing","summary":"  Randomized Smoothing (RS) is currently a scalable certified defense method\nproviding robustness certification against adversarial examples. Although\nsignificant progress has been achieved in providing defenses against $\\ell_p$\nadversaries, the interaction between the smoothing distribution and the\nrobustness certification still remains vague. In this work, we comprehensively\nstudy the effect of two families of distributions, named Exponential Standard\nGaussian (ESG) and Exponential General Gaussian (EGG) distributions, on\nRandomized Smoothing and Double Sampling Randomized Smoothing (DSRS). We derive\nan analytic formula for ESG's certified radius, which converges to the origin\nformula of RS as the dimension $d$ increases. Additionally, we prove that EGG\ncan provide tighter constant factors than DSRS in providing $\\Omega(\\sqrt{d})$\nlower bounds of $\\ell_2$ certified radius, and thus further addresses the curse\nof dimensionality in RS. Our experiments on real-world datasets confirm our\ntheoretical analysis of the ESG distributions, that they provide almost the\nsame certification under different exponents $\\eta$ for both RS and DSRS. In\naddition, EGG brings a significant improvement to the DSRS certification, but\nthe mechanism can be different when the classifier properties are different.\nCompared to the primitive DSRS, the increase in certified accuracy provided by\nEGG is prominent, up to 6.4% on ImageNet.\n","authors":["Youwei Shu","Xi Xiao","Derui Wang","Yuxin Cao","Siji Chen","Jason Xue","Linyi Li","Bo Li"],"pdf_url":"https://arxiv.org/pdf/2406.02309v2.pdf","comment":"ICML 2024 Poster"},{"id":"http://arxiv.org/abs/2103.09603v6","updated":"2024-06-05T12:09:23Z","published":"2021-03-17T12:42:41Z","title":"DoubleML -- An Object-Oriented Implementation of Double Machine Learning\n  in R","summary":"  The R package DoubleML implements the double/debiased machine learning\nframework of Chernozhukov et al. (2018). It provides functionalities to\nestimate parameters in causal models based on machine learning methods. The\ndouble machine learning framework consist of three key ingredients: Neyman\northogonality, high-quality machine learning estimation and sample splitting.\nEstimation of nuisance components can be performed by various state-of-the-art\nmachine learning methods that are available in the mlr3 ecosystem. DoubleML\nmakes it possible to perform inference in a variety of causal models, including\npartially linear and interactive regression models and their extensions to\ninstrumental variable estimation. The object-oriented implementation of\nDoubleML enables a high flexibility for the model specification and makes it\neasily extendable. This paper serves as an introduction to the double machine\nlearning framework and the R package DoubleML. In reproducible code examples\nwith simulated and real data sets, we demonstrate how DoubleML users can\nperform valid inference based on machine learning methods.\n","authors":["Philipp Bach","Victor Chernozhukov","Malte S. Kurz","Martin Spindler","Sven Klaassen"],"pdf_url":"https://arxiv.org/pdf/2103.09603v6.pdf","comment":"56 pages, 8 Figures, 1 Table; Updated version for DoubleML 1.0.0;\n  Updated version due to changes in R package paradox (for parameter tuning\n  with mlr3)"},{"id":"http://arxiv.org/abs/2007.14717v4","updated":"2024-06-05T12:03:47Z","published":"2020-07-29T09:56:05Z","title":"Almost exact recovery in noisy semi-supervised learning","summary":"  Graph-based semi-supervised learning methods combine the graph structure and\nlabeled data to classify unlabeled data. In this work, we study the effect of a\nnoisy oracle on classification. In particular, we derive the Maximum A\nPosteriori (MAP) estimator for clustering a Degree Corrected Stochastic Block\nModel (DC-SBM) when a noisy oracle reveals a fraction of the labels. We then\npropose an algorithm derived from a continuous relaxation of the MAP, and we\nestablish its consistency. Numerical experiments show that our approach\nachieves promising performance on synthetic and real data sets, even in the\ncase of very noisy labeled data.\n","authors":["Konstantin Avrachenkov","Maximilien Dreveton"],"pdf_url":"https://arxiv.org/pdf/2007.14717v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.03172v1","updated":"2024-06-05T12:03:45Z","published":"2024-06-05T12:03:45Z","title":"Initialization-enhanced Physics-Informed Neural Network with Domain\n  Decomposition (IDPINN)","summary":"  We propose a new physics-informed neural network framework, IDPINN, based on\nthe enhancement of initialization and domain decomposition to improve\nprediction accuracy. We train a PINN using a small dataset to obtain an initial\nnetwork structure, including the weighted matrix and bias, which initializes\nthe PINN for each subdomain. Moreover, we leverage the smoothness condition on\nthe interface to enhance the prediction performance. We numerically evaluated\nit on several forward problems and demonstrated the benefits of IDPINN in terms\nof accuracy.\n","authors":["Chenhao Si","Ming Yan"],"pdf_url":"https://arxiv.org/pdf/2406.03172v1.pdf","comment":"20 pages, 14 figures"},{"id":"http://arxiv.org/abs/2406.03171v1","updated":"2024-06-05T12:03:27Z","published":"2024-06-05T12:03:27Z","title":"High-Dimensional Kernel Methods under Covariate Shift: Data-Dependent\n  Implicit Regularization","summary":"  This paper studies kernel ridge regression in high dimensions under covariate\nshifts and analyzes the role of importance re-weighting. We first derive the\nasymptotic expansion of high dimensional kernels under covariate shifts. By a\nbias-variance decomposition, we theoretically demonstrate that the re-weighting\nstrategy allows for decreasing the variance. For bias, we analyze the\nregularization of the arbitrary or well-chosen scale, showing that the bias can\nbehave very differently under different regularization scales. In our analysis,\nthe bias and variance can be characterized by the spectral decay of a\ndata-dependent regularized kernel: the original kernel matrix associated with\nan additional re-weighting matrix, and thus the re-weighting strategy can be\nregarded as a data-dependent regularization for better understanding. Besides,\nour analysis provides asymptotic expansion of kernel functions/vectors under\ncovariate shift, which has its own interest.\n","authors":["Yihang Chen","Fanghui Liu","Taiji Suzuki","Volkan Cevher"],"pdf_url":"https://arxiv.org/pdf/2406.03171v1.pdf","comment":"ICML 2024"},{"id":"http://arxiv.org/abs/2306.03615v2","updated":"2024-06-05T12:00:48Z","published":"2023-06-06T12:07:50Z","title":"PEARL: Zero-shot Cross-task Preference Alignment and Robust Reward\n  Learning for Robotic Manipulation","summary":"  In preference-based Reinforcement Learning (RL), obtaining a large number of\npreference labels are both time-consuming and costly. Furthermore, the queried\nhuman preferences cannot be utilized for the new tasks. In this paper, we\npropose Zero-shot Cross-task Preference Alignment and Robust Reward Learning\n(PEARL), which learns policies from cross-task preference transfer without any\nhuman labels of the target task. Our contributions include two novel components\nthat facilitate the transfer and learning process. The first is Cross-task\nPreference Alignment (CPA), which transfers the preferences between tasks via\noptimal transport. The key idea of CPA is to use Gromov-Wasserstein distance to\nalign the trajectories between tasks, and the solved optimal transport matrix\nserves as the correspondence between trajectories. The target task preferences\nare computed as the weighted sum of source task preference labels with the\ncorrespondence as weights. Moreover, to ensure robust learning from these\ntransferred labels, we introduce Robust Reward Learning (RRL), which considers\nboth reward mean and uncertainty by modeling rewards as Gaussian distributions.\nEmpirical results on robotic manipulation tasks from Meta-World and Robomimic\ndemonstrate that our method is capable of transferring preference labels across\ntasks accurately and then learns well-behaved policies. Notably, our approach\nsignificantly exceeds existing methods when there are few human preferences.\nThe code and videos of our method are available at:\nhttps://sites.google.com/view/pearl-preference.\n","authors":["Runze Liu","Yali Du","Fengshuo Bai","Jiafei Lyu","Xiu Li"],"pdf_url":"https://arxiv.org/pdf/2306.03615v2.pdf","comment":"Accepted to ICML 2024"},{"id":"http://arxiv.org/abs/2305.16610v3","updated":"2024-06-05T11:57:02Z","published":"2023-05-26T04:02:54Z","title":"Adaptively Perturbed Mirror Descent for Learning in Games","summary":"  This paper proposes a payoff perturbation technique for the Mirror Descent\n(MD) algorithm in games where the gradient of the payoff functions is monotone\nin the strategy profile space, potentially containing additive noise. The\noptimistic family of learning algorithms, exemplified by optimistic MD,\nsuccessfully achieves {\\it last-iterate} convergence in scenarios devoid of\nnoise, leading the dynamics to a Nash equilibrium. A recent re-emerging trend\nunderscores the promise of the perturbation approach, where payoff functions\nare perturbed based on the distance from an anchoring, or {\\it slingshot},\nstrategy. In response, we propose {\\it Adaptively Perturbed MD} (APMD), which\nadjusts the magnitude of the perturbation by repeatedly updating the slingshot\nstrategy at a predefined interval. This innovation empowers us to find a Nash\nequilibrium of the underlying game with guaranteed rates. Empirical\ndemonstrations affirm that our algorithm exhibits significantly accelerated\nconvergence.\n","authors":["Kenshi Abe","Kaito Ariu","Mitsuki Sakamoto","Atsushi Iwasaki"],"pdf_url":"https://arxiv.org/pdf/2305.16610v3.pdf","comment":"Accepted in ICML 2024"},{"id":"http://arxiv.org/abs/2406.03164v1","updated":"2024-06-05T11:56:54Z","published":"2024-06-05T11:56:54Z","title":"Topological Neural Networks go Persistent, Equivariant, and Continuous","summary":"  Topological Neural Networks (TNNs) incorporate higher-order relational\ninformation beyond pairwise interactions, enabling richer representations than\nGraph Neural Networks (GNNs). Concurrently, topological descriptors based on\npersistent homology (PH) are being increasingly employed to augment the GNNs.\nWe investigate the benefits of integrating these two paradigms. Specifically,\nwe introduce TopNets as a broad framework that subsumes and unifies various\nmethods in the intersection of GNNs/TNNs and PH such as (generalizations of)\nRePHINE and TOGL. TopNets can also be readily adapted to handle (symmetries in)\ngeometric complexes, extending the scope of TNNs and PH to spatial settings.\nTheoretically, we show that PH descriptors can provably enhance the\nexpressivity of simplicial message-passing networks. Empirically, (continuous\nand E(n)-equivariant extensions of) TopNets achieve strong performance across\ndiverse tasks, including antibody design, molecular dynamics simulation, and\ndrug property prediction.\n","authors":["Yogesh Verma","Amauri H Souza","Vikas Garg"],"pdf_url":"https://arxiv.org/pdf/2406.03164v1.pdf","comment":"Accepted to ICML 2024"},{"id":"http://arxiv.org/abs/2307.16375v3","updated":"2024-06-05T11:44:44Z","published":"2023-07-31T02:39:54Z","title":"UniAP: Unifying Inter- and Intra-Layer Automatic Parallelism by Mixed\n  Integer Quadratic Programming","summary":"  Distributed learning is commonly used for training deep learning models,\nespecially large models. In distributed learning, manual parallelism (MP)\nmethods demand considerable human effort and have limited flexibility. Hence,\nautomatic parallelism (AP) methods have recently been proposed for automating\nthe parallel strategy optimization process. Existing AP methods suffer from\nsub-optimal solutions because they do not jointly optimize the two categories\nof parallel strategies (i.e., inter-layer parallelism and intra-layer\nparallelism). In this paper, we propose a novel AP method called UniAP, which\nunifies inter- and intra-layer automatic parallelism by mixed integer quadratic\nprogramming. To the best of our knowledge, UniAP is the first parallel method\nthat can jointly optimize the two categories of parallel strategies to find an\noptimal solution. Experimental results show that UniAP outperforms\nstate-of-the-art methods by up to 3.80$\\times$ in throughput and reduces\nstrategy optimization time by up to 107$\\times$ across five Transformer-based\nmodels.\n","authors":["Hao Lin","Ke Wu","Jie Li","Jun Li","Wu-Jun Li"],"pdf_url":"https://arxiv.org/pdf/2307.16375v3.pdf","comment":"27 pages, 8 figures"},{"id":"http://arxiv.org/abs/2406.03161v1","updated":"2024-06-05T11:42:46Z","published":"2024-06-05T11:42:46Z","title":"Ethical considerations of use of hold-out sets in clinical prediction\n  model management","summary":"  Clinical prediction models are statistical or machine learning models used to\nquantify the risk of a certain health outcome using patient data. These can\nthen inform potential interventions on patients, causing an effect called\nperformative prediction: predictions inform interventions which influence the\noutcome they were trying to predict, leading to a potential underestimation of\nrisk in some patients if a model is updated on this data. One suggested\nresolution to this is the use of hold-out sets, in which a set of patients do\nnot receive model derived risk scores, such that a model can be safely\nretrained. We present an overview of clinical and research ethics regarding\npotential implementation of hold-out sets for clinical prediction models in\nhealth settings. We focus on the ethical principles of beneficence,\nnon-maleficence, autonomy and justice. We also discuss informed consent,\nclinical equipoise, and truth-telling. We present illustrative cases of\npotential hold-out set implementations and discuss statistical issues arising\nfrom different hold-out set sampling methods. We also discuss differences\nbetween hold-out sets and randomised control trials, in terms of ethics and\nstatistical issues. Finally, we give practical recommendations for researchers\ninterested in the use hold-out sets for clinical prediction models.\n","authors":["Louis Chislett","Louis JM Aslett","Alisha R Davies","Catalina A Vallejos","James Liley"],"pdf_url":"https://arxiv.org/pdf/2406.03161v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.05894v3","updated":"2024-06-05T11:38:12Z","published":"2024-02-08T18:33:21Z","title":"Large Language Model Meets Graph Neural Network in Knowledge\n  Distillation","summary":"  In service-oriented architectures, accurately predicting the Quality of\nService (QoS) is crucial for maintaining reliability and enhancing user\nsatisfaction. However, significant challenges remain due to existing methods\nalways overlooking high-order latent collaborative relationships between users\nand services and failing to dynamically adjust feature learning for every\nspecific user-service invocation, which are critical for learning accurate\nfeatures. Additionally, reliance on RNNs for capturing QoS evolution hampers\nmodels' ability to detect long-term trends due to difficulties in managing\nlong-range dependencies. To address these challenges, we propose the\n\\underline{T}arget-Prompt \\underline{O}nline \\underline{G}raph\n\\underline{C}ollaborative \\underline{L}earning (TOGCL) framework for\ntemporal-aware QoS prediction. TOGCL leverages a dynamic user-service\ninvocation graph to model historical interactions, providing a comprehensive\nrepresentation of user-service relationships. Building on this graph, it\ndevelops a target-prompt graph attention network to extract online deep latent\nfeatures of users and services at each time slice, simultaneously considering\nimplicit collaborative relationships between target users/services and their\nneighbors, as well as relevant historical QoS values. Additionally, a\nmulti-layer Transformer encoder is employed to uncover temporal feature\nevolution patterns of users and services, leading to temporal-aware QoS\nprediction. Extensive experiments conducted on the WS-DREAM dataset demonstrate\nthat our proposed TOGCL framework significantly outperforms state-of-the-art\nmethods across multiple metrics, achieving improvements of up to 38.80\\%. These\nresults underscore the effectiveness of the TOGCL framework for precise\ntemporal QoS prediction.\n","authors":["Shengxiang Hu","Guobing Zou","Song Yang","Yanglan Gan","Bofeng Zhang","Yixin Chen"],"pdf_url":"https://arxiv.org/pdf/2402.05894v3.pdf","comment":"14 pages, 6 figures, 4 tables"},{"id":"http://arxiv.org/abs/2304.05366v2","updated":"2024-06-05T11:36:13Z","published":"2023-04-11T17:22:22Z","title":"The No Free Lunch Theorem, Kolmogorov Complexity, and the Role of\n  Inductive Biases in Machine Learning","summary":"  No free lunch theorems for supervised learning state that no learner can\nsolve all problems or that all learners achieve exactly the same accuracy on\naverage over a uniform distribution on learning problems. Accordingly, these\ntheorems are often referenced in support of the notion that individual problems\nrequire specially tailored inductive biases. While virtually all uniformly\nsampled datasets have high complexity, real-world problems disproportionately\ngenerate low-complexity data, and we argue that neural network models share\nthis same preference, formalized using Kolmogorov complexity. Notably, we show\nthat architectures designed for a particular domain, such as computer vision,\ncan compress datasets on a variety of seemingly unrelated domains. Our\nexperiments show that pre-trained and even randomly initialized language models\nprefer to generate low-complexity sequences. Whereas no free lunch theorems\nseemingly indicate that individual problems require specialized learners, we\nexplain how tasks that often require human intervention such as picking an\nappropriately sized model when labeled data is scarce or plentiful can be\nautomated into a single learning algorithm. These observations justify the\ntrend in deep learning of unifying seemingly disparate problems with an\nincreasingly small set of machine learning models.\n","authors":["Micah Goldblum","Marc Finzi","Keefer Rowan","Andrew Gordon Wilson"],"pdf_url":"https://arxiv.org/pdf/2304.05366v2.pdf","comment":"Published at the International Conference on Machine Learning (ICML)\n  2024"},{"id":"http://arxiv.org/abs/2406.03157v1","updated":"2024-06-05T11:35:38Z","published":"2024-06-05T11:35:38Z","title":"A Combination Model Based on Sequential General Variational Mode\n  Decomposition Method for Time Series Prediction","summary":"  Accurate prediction of financial time series is a key concern for market\neconomy makers and investors. The article selects online store sales and\nAustralian beer sales as representatives of non-stationary, trending, and\nseasonal financial time series, and constructs a new SGVMD-ARIMA combination\nmodel in a non-linear combination way to predict financial time series. The\nARIMA model, LSTM model, and other classic decomposition prediction models are\nused as control models to compare the accuracy of different models. The\nempirical results indicate that the constructed combination prediction model\nhas universal advantages over the single prediction model and linear\ncombination prediction model of the control group. Within the prediction\ninterval, our proposed combination model has improved advantages over\ntraditional decomposition prediction control group models.\n","authors":["Wei Chen","Yuanyuan Yang","Jianyu Liu"],"pdf_url":"https://arxiv.org/pdf/2406.03157v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.03154v1","updated":"2024-06-05T11:30:16Z","published":"2024-06-05T11:30:16Z","title":"Detecting Model Misspecification in Amortized Bayesian Inference with\n  Neural Networks: An Extended Investigation","summary":"  Recent advances in probabilistic deep learning enable efficient amortized\nBayesian inference in settings where the likelihood function is only implicitly\ndefined by a simulation program (simulation-based inference; SBI). But how\nfaithful is such inference if the simulation represents reality somewhat\ninaccurately, that is, if the true system behavior at test time deviates from\nthe one seen during training? We conceptualize the types of such model\nmisspecification arising in SBI and systematically investigate how the\nperformance of neural posterior approximators gradually deteriorates as a\nconsequence, making inference results less and less trustworthy. To notify\nusers about this problem, we propose a new misspecification measure that can be\ntrained in an unsupervised fashion (i.e., without training data from the true\ndistribution) and reliably detects model misspecification at test time. Our\nexperiments clearly demonstrate the utility of our new measure both on toy\nexamples with an analytical ground-truth and on representative scientific tasks\nin cell biology, cognitive decision making, disease outbreak dynamics, and\ncomputer vision. We show how the proposed misspecification test warns users\nabout suspicious outputs, raises an alarm when predictions are not trustworthy,\nand guides model designers in their search for better simulators.\n","authors":["Marvin Schmitt","Paul-Christian Bürkner","Ullrich Köthe","Stefan T. Radev"],"pdf_url":"https://arxiv.org/pdf/2406.03154v1.pdf","comment":"Extended version of the conference paper\n  https://doi.org/10.1007/978-3-031-54605-1_35. arXiv admin note: text overlap\n  with arXiv:2112.08866"},{"id":"http://arxiv.org/abs/2402.14883v3","updated":"2024-06-05T11:30:02Z","published":"2024-02-22T04:55:14Z","title":"Double-I Watermark: Protecting Model Copyright for LLM Fine-tuning","summary":"  To support various applications, a prevalent and efficient approach for\nbusiness owners is leveraging their valuable datasets to fine-tune a\npre-trained LLM through the API provided by LLM owners or cloud servers.\nHowever, this process carries a substantial risk of model misuse, potentially\nresulting in severe economic consequences for business owners. Thus,\nsafeguarding the copyright of these customized models during LLM fine-tuning\nhas become an urgent practical requirement, but there are limited existing\nsolutions to provide such protection. To tackle this pressing issue, we propose\na novel watermarking approach named ``Double-I watermark''. Specifically, based\non the instruct-tuning data, two types of backdoor data paradigms are\nintroduced with trigger in the instruction and the input, respectively. By\nleveraging LLM's learning capability to incorporate customized backdoor samples\ninto the dataset, the proposed approach effectively injects specific\nwatermarking information into the customized model during fine-tuning, which\nmakes it easy to inject and verify watermarks in commercial scenarios. We\nevaluate the proposed \"Double-I watermark\" under various fine-tuning methods,\ndemonstrating its harmlessness, robustness, uniqueness, imperceptibility, and\nvalidity through both quantitative and qualitative analyses.\n","authors":["Shen Li","Liuyi Yao","Jinyang Gao","Lan Zhang","Yaliang Li"],"pdf_url":"https://arxiv.org/pdf/2402.14883v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.02362v2","updated":"2024-06-05T11:22:46Z","published":"2024-06-04T14:39:51Z","title":"Temporal Graph Rewiring with Expander Graphs","summary":"  Evolving relations in real-world networks are often modelled by temporal\ngraphs. Graph rewiring techniques have been utilised on Graph Neural Networks\n(GNNs) to improve expressiveness and increase model performance. In this work,\nwe propose Temporal Graph Rewiring (TGR), the first approach for graph rewiring\non temporal graphs. TGR enables communication between temporally distant nodes\nin a continuous time dynamic graph by utilising expander graph propagation to\nconstruct a message passing highway for message passing between distant nodes.\nExpander graphs are suitable candidates for rewiring as they help overcome the\noversquashing problem often observed in GNNs. On the public tgbl-wiki\nbenchmark, we show that TGR improves the performance of a widely used TGN model\nby a significant margin. Our code repository is accessible at\nhttps://github.com/kpetrovicc/TGR.git .\n","authors":["Katarina Petrović","Shenyang Huang","Farimah Poursafaei","Petar Veličković"],"pdf_url":"https://arxiv.org/pdf/2406.02362v2.pdf","comment":"10 pages, 2 figures"},{"id":"http://arxiv.org/abs/2308.12696v4","updated":"2024-06-05T11:19:44Z","published":"2023-08-24T10:29:25Z","title":"Disentanglement Learning via Topology","summary":"  We propose TopDis (Topological Disentanglement), a method for learning\ndisentangled representations via adding a multi-scale topological loss term.\nDisentanglement is a crucial property of data representations substantial for\nthe explainability and robustness of deep learning models and a step towards\nhigh-level cognition. The state-of-the-art methods are based on VAE and\nencourage the joint distribution of latent variables to be factorized. We take\na different perspective on disentanglement by analyzing topological properties\nof data manifolds. In particular, we optimize the topological similarity for\ndata manifolds traversals. To the best of our knowledge, our paper is the first\none to propose a differentiable topological loss for disentanglement learning.\nOur experiments have shown that the proposed TopDis loss improves\ndisentanglement scores such as MIG, FactorVAE score, SAP score, and DCI\ndisentanglement score with respect to state-of-the-art results while preserving\nthe reconstruction quality. Our method works in an unsupervised manner,\npermitting us to apply it to problems without labeled factors of variation. The\nTopDis loss works even when factors of variation are correlated. Additionally,\nwe show how to use the proposed topological loss to find disentangled\ndirections in a trained GAN.\n","authors":["Nikita Balabin","Daria Voronkova","Ilya Trofimov","Evgeny Burnaev","Serguei Barannikov"],"pdf_url":"https://arxiv.org/pdf/2308.12696v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.03152v1","updated":"2024-06-05T11:16:55Z","published":"2024-06-05T11:16:55Z","title":"Dynamic Spectral Clustering with Provable Approximation Guarantee","summary":"  This paper studies clustering algorithms for dynamically evolving graphs\n$\\{G_t\\}$, in which new edges (and potential new vertices) are added into a\ngraph, and the underlying cluster structure of the graph can gradually change.\nThe paper proves that, under some mild condition on the cluster-structure, the\nclusters of the final graph $G_T$ of $n_T$ vertices at time $T$ can be well\napproximated by a dynamic variant of the spectral clustering algorithm. The\nalgorithm runs in amortised update time $O(1)$ and query time $o(n_T)$.\nExperimental studies on both synthetic and real-world datasets further confirm\nthe practicality of our designed algorithm.\n","authors":["Steinar Laenen","He Sun"],"pdf_url":"https://arxiv.org/pdf/2406.03152v1.pdf","comment":"This work is accepted at the 41st International Conference on Machine\n  Learning (ICML'24)"},{"id":"http://arxiv.org/abs/2406.03151v1","updated":"2024-06-05T11:15:45Z","published":"2024-06-05T11:15:45Z","title":"Which Side Are You On? A Multi-task Dataset for End-to-End Argument\n  Summarisation and Evaluation","summary":"  With the recent advances of large language models (LLMs), it is no longer\ninfeasible to build an automated debate system that helps people to synthesise\npersuasive arguments. Previous work attempted this task by integrating multiple\ncomponents. In our work, we introduce an argument mining dataset that captures\nthe end-to-end process of preparing an argumentative essay for a debate, which\ncovers the tasks of claim and evidence identification (Task 1 ED), evidence\nconvincingness ranking (Task 2 ECR), argumentative essay summarisation and\nhuman preference ranking (Task 3 ASR) and metric learning for automated\nevaluation of resulting essays, based on human feedback along argument quality\ndimensions (Task 4 SQE). Our dataset contains 14k examples of claims that are\nfully annotated with the various properties supporting the aforementioned\ntasks. We evaluate multiple generative baselines for each of these tasks,\nincluding representative LLMs. We find, that while they show promising results\non individual tasks in our benchmark, their end-to-end performance on all four\ntasks in succession deteriorates significantly, both in automated measures as\nwell as in human-centred evaluation. This challenge presented by our proposed\ndataset motivates future research on end-to-end argument mining and\nsummarisation. The repository of this project is available at\nhttps://github.com/HarrywillDr/ArgSum-Datatset\n","authors":["Hao Li","Yuping Wu","Viktor Schlegel","Riza Batista-Navarro","Tharindu Madusanka","Iqra Zahid","Jiayan Zeng","Xiaochi Wang","Xinran He","Yizhi Li","Goran Nenadic"],"pdf_url":"https://arxiv.org/pdf/2406.03151v1.pdf","comment":"Published on ACL 2024 Findings"},{"id":"http://arxiv.org/abs/2406.03150v1","updated":"2024-06-05T11:15:43Z","published":"2024-06-05T11:15:43Z","title":"Sample-specific Masks for Visual Reprogramming-based Prompting","summary":"  Visual reprogramming (VR) is a prompting technique that aims to re-purpose a\npre-trained model (e.g., a classifier on ImageNet) to target tasks (e.g.,\nmedical data prediction) by learning a small-scale pattern added into input\nimages instead of tuning considerable parameters within the model. The location\nof the pattern within input samples is usually determined by a pre-defined mask\nshared across all samples. In this paper, we show that the shared mask\npotentially limits VR's generalization and increases its approximation error\ndue to the lack of sample-level adaptation. Motivated by this finding, we\ndesign a new framework for VR called sample-specific multi-channel masks (SMM).\nSpecifically, SMM employs a lightweight ConvNet and patch-wise interpolation to\ngenerate sample-specific three-channel masks instead of a shared and\npre-defined mask. Since we generate different masks for individual samples, SMM\nis theoretically shown to reduce approximation error for the target tasks\ncompared with existing state-of-the-art VR methods. We also empirically\ndemonstrate its performance gain on both ResNet and ViT. The success of SMM\nfurther highlights the broader applicability of VR in leveraging the latent\nknowledge of pre-trained models for various target tasks. Our code is available\nat https://github.com/tmlr-group/SMM.\n","authors":["Chengyi Cai","Zesheng Ye","Lei Feng","Jianzhong Qi","Feng Liu"],"pdf_url":"https://arxiv.org/pdf/2406.03150v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.17811v2","updated":"2024-06-05T11:15:04Z","published":"2024-02-27T14:45:04Z","title":"TruthX: Alleviating Hallucinations by Editing Large Language Models in\n  Truthful Space","summary":"  Large Language Models (LLMs) sometimes suffer from producing hallucinations,\nespecially LLMs may generate untruthful responses despite knowing the correct\nknowledge. Activating the truthfulness within LLM is the key to fully unlocking\nLLM's knowledge potential. In this paper, we propose TruthX, an inference-time\nintervention method to activate the truthfulness of LLM by identifying and\nediting the features within LLM's internal representations that govern the\ntruthfulness. TruthX employs an auto-encoder to map LLM's representations into\nsemantic and truthful latent spaces respectively, and applies contrastive\nlearning to identify a truthful editing direction within the truthful space.\nDuring inference, by editing LLM's internal representations in truthful space,\nTruthX effectively enhances the truthfulness of LLM. Experiments show that\nTruthX improves the truthfulness of 13 advanced LLMs by an average of 20% on\nTruthfulQA benchmark. Further analyses suggest that TruthX can control LLM to\nproduce truthful or hallucinatory responses via editing only one vector in\nLLM's internal representations.\n","authors":["Shaolei Zhang","Tian Yu","Yang Feng"],"pdf_url":"https://arxiv.org/pdf/2402.17811v2.pdf","comment":"Accepted to ACL 2024 main conference, Project Page:\n  https://ictnlp.github.io/TruthX-site/"},{"id":"http://arxiv.org/abs/2406.03148v1","updated":"2024-06-05T11:06:33Z","published":"2024-06-05T11:06:33Z","title":"Aligning Transformers with Weisfeiler-Leman","summary":"  Graph neural network architectures aligned with the $k$-dimensional\nWeisfeiler--Leman ($k$-WL) hierarchy offer theoretically well-understood\nexpressive power. However, these architectures often fail to deliver\nstate-of-the-art predictive performance on real-world graphs, limiting their\npractical utility. While recent works aligning graph transformer architectures\nwith the $k$-WL hierarchy have shown promising empirical results, employing\ntransformers for higher orders of $k$ remains challenging due to a prohibitive\nruntime and memory complexity of self-attention as well as impractical\narchitectural assumptions, such as an infeasible number of attention heads.\nHere, we advance the alignment of transformers with the $k$-WL hierarchy,\nshowing stronger expressivity results for each $k$, making them more feasible\nin practice. In addition, we develop a theoretical framework that allows the\nstudy of established positional encodings such as Laplacian PEs and SPE. We\nevaluate our transformers on the large-scale PCQM4Mv2 dataset, showing\ncompetitive predictive performance with the state-of-the-art and demonstrating\nstrong downstream performance when fine-tuning them on small-scale molecular\ndatasets. Our code is available at\nhttps://github.com/luis-mueller/wl-transformers.\n","authors":["Luis Müller","Christopher Morris"],"pdf_url":"https://arxiv.org/pdf/2406.03148v1.pdf","comment":"Accepted at ICML 2024"},{"id":"http://arxiv.org/abs/2406.03146v1","updated":"2024-06-05T11:01:42Z","published":"2024-06-05T11:01:42Z","title":"Tiny models from tiny data: Textual and null-text inversion for few-shot\n  distillation","summary":"  Few-shot image classification involves classifying images using very few\ntraining examples. Recent vision foundation models show excellent few-shot\ntransfer abilities, but are large and slow at inference. Using knowledge\ndistillation, the capabilities of high-performing but slow models can be\ntransferred to tiny, efficient models. However, common distillation methods\nrequire a large set of unlabeled data, which is not available in the few-shot\nsetting. To overcome this lack of data, there has been a recent interest in\nusing synthetic data.\n  We expand on this work by presenting a novel diffusion model inversion\ntechnique (TINT) combining the diversity of textual inversion with the\nspecificity of null-text inversion. Using this method in a few-shot\ndistillation pipeline leads to state-of-the-art accuracy among small student\nmodels on popular benchmarks, while being significantly faster than prior work.\nThis allows us to push even tiny models to high accuracy using only a tiny\napplication-specific dataset, albeit relying on extra data for pre-training.\n  Popular few-shot benchmarks involve evaluation over a large number of\nepisodes, which is computationally cumbersome for methods involving synthetic\ndata generation. Therefore, we also present a theoretical analysis on how the\nvariance of the accuracy estimator depends on the number of episodes and query\nexamples, and use these results to lower the computational effort required for\nmethod evaluation. In addition, to further motivate the use of generative\nmodels in few-shot distillation, we demonstrate that our method performs better\ncompared to training on real data mined from the dataset used to train the\ndiffusion model.\n  Source code will be made available at https://github.com/pixwse/tiny2.\n","authors":["Erik Landolsi","Fredrik Kahl"],"pdf_url":"https://arxiv.org/pdf/2406.03146v1.pdf","comment":"21 pages (9 main pages + references and appendix)"},{"id":"http://arxiv.org/abs/2406.03145v1","updated":"2024-06-05T11:00:27Z","published":"2024-06-05T11:00:27Z","title":"E(n) Equivariant Message Passing Cellular Networks","summary":"  This paper introduces E(n) Equivariant Message Passing Cellular Networks\n(EMPCNs), an extension of E(n) Equivariant Graph Neural Networks to\nCW-complexes. Our approach addresses two aspects of geometric message passing\nnetworks: 1) enhancing their expressiveness by incorporating arbitrary cells,\nand 2) achieving this in a computationally efficient way with a decoupled\nEMPCNs technique. We demonstrate that EMPCNs achieve close to state-of-the-art\nperformance on multiple tasks without the need for steerability, including\nmany-body predictions and motion capture. Moreover, ablation studies confirm\nthat decoupled EMPCNs exhibit stronger generalization capabilities than their\nnon-topologically informed counterparts. These findings show that EMPCNs can be\nused as a scalable and expressive framework for higher-order message passing in\ngeometric and topological graphs\n","authors":["Veljko Kovac","Erik J. Bekkers","Pietro Liò","Floor Eijkelboom"],"pdf_url":"https://arxiv.org/pdf/2406.03145v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.03144v1","updated":"2024-06-05T11:00:03Z","published":"2024-06-05T11:00:03Z","title":"A Combination Model for Time Series Prediction using LSTM via Extracting\n  Dynamic Features Based on Spatial Smoothing and Sequential General\n  Variational Mode Decomposition","summary":"  In order to solve the problems such as difficult to extract effective\nfeatures and low accuracy of sales volume prediction caused by complex\nrelationships such as market sales volume in time series prediction, we\nproposed a time series prediction method of market sales volume based on\nSequential General VMD and spatial smoothing Long short-term memory neural\nnetwork (SS-LSTM) combination model. Firstly, the spatial smoothing algorithm\nis used to decompose and calculate the sample data of related industry sectors\naffected by the linkage effect of market sectors, extracting modal features\ncontaining information via Sequential General VMD on overall market and\nspecific price trends; Then, according to the background of different Market\ndata sets, LSTM network is used to model and predict the price of fundamental\ndata and modal characteristics. The experimental results of data prediction\nwith seasonal and periodic trends show that this method can achieve higher\nprice prediction accuracy and more accurate accuracy in specific market\ncontexts compared to traditional prediction methods Describe the changes in\nmarket sales volume.\n","authors":["Jianyu Liu","Wei Chen","Yong Zhang","Zhenfeng Chen","Bin Wan","Jinwei Hu"],"pdf_url":"https://arxiv.org/pdf/2406.03144v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.03142v1","updated":"2024-06-05T10:55:11Z","published":"2024-06-05T10:55:11Z","title":"On the Power of Randomization in Fair Classification and Representation","summary":"  Fair classification and fair representation learning are two important\nproblems in supervised and unsupervised fair machine learning, respectively.\nFair classification asks for a classifier that maximizes accuracy on a given\ndata distribution subject to fairness constraints. Fair representation maps a\ngiven data distribution over the original feature space to a distribution over\na new representation space such that all classifiers over the representation\nsatisfy fairness. In this paper, we examine the power of randomization in both\nthese problems to minimize the loss of accuracy that results when we impose\nfairness constraints. Previous work on fair classification has characterized\nthe optimal fair classifiers on a given data distribution that maximize\naccuracy subject to fairness constraints, e.g., Demographic Parity (DP), Equal\nOpportunity (EO), and Predictive Equality (PE). We refine these\ncharacterizations to demonstrate when the optimal randomized fair classifiers\ncan surpass their deterministic counterparts in accuracy. We also show how the\noptimal randomized fair classifier that we characterize can be obtained as a\nsolution to a convex optimization problem. Recent work has provided techniques\nto construct fair representations for a given data distribution such that any\nclassifier over this representation satisfies DP. However, the classifiers on\nthese fair representations either come with no or weak accuracy guarantees when\ncompared to the optimal fair classifier on the original data distribution.\nExtending our ideas for randomized fair classification, we improve on these\nworks, and construct DP-fair, EO-fair, and PE-fair representations that have\nprovably optimal accuracy and suffer no accuracy loss compared to the optimal\nDP-fair, EO-fair, and PE-fair classifiers respectively on the original data\ndistribution.\n","authors":["Sushant Agarwal","Amit Deshpande"],"pdf_url":"https://arxiv.org/pdf/2406.03142v1.pdf","comment":"Appeared in ACM FAccT 2022"},{"id":"http://arxiv.org/abs/2406.03141v1","updated":"2024-06-05T10:54:18Z","published":"2024-06-05T10:54:18Z","title":"Floating Anchor Diffusion Model for Multi-motif Scaffolding","summary":"  Motif scaffolding seeks to design scaffold structures for constructing\nproteins with functions derived from the desired motif, which is crucial for\nthe design of vaccines and enzymes. Previous works approach the problem by\ninpainting or conditional generation. Both of them can only scaffold motifs\nwith fixed positions, and the conditional generation cannot guarantee the\npresence of motifs. However, prior knowledge of the relative motif positions in\na protein is not readily available, and constructing a protein with multiple\nfunctions in one protein is more general and significant because of the\nsynergies between functions. We propose a Floating Anchor Diffusion (FADiff)\nmodel. FADiff allows motifs to float rigidly and independently in the process\nof diffusion, which guarantees the presence of motifs and automates the motif\nposition design. Our experiments demonstrate the efficacy of FADiff with high\nsuccess rates and designable novel scaffolds. To the best of our knowledge,\nFADiff is the first work to tackle the challenge of scaffolding multiple motifs\nwithout relying on the expertise of relative motif positions in the protein.\nCode is available at https://github.com/aim-uofa/FADiff.\n","authors":["Ke Liu","Weian Mao","Shuaike Shen","Xiaoran Jiao","Zheng Sun","Hao Chen","Chunhua Shen"],"pdf_url":"https://arxiv.org/pdf/2406.03141v1.pdf","comment":"ICML 2024"},{"id":"http://arxiv.org/abs/2308.03743v3","updated":"2024-06-05T10:52:54Z","published":"2023-08-07T17:51:09Z","title":"The Copycat Perceptron: Smashing Barriers Through Collective Learning","summary":"  We characterize the equilibrium properties of a model of $y$ coupled binary\nperceptrons in the teacher-student scenario, subject to a suitable cost\nfunction, with an explicit ferromagnetic coupling proportional to the Hamming\ndistance between the students' weights. In contrast to recent works, we analyze\na more general setting in which thermal noise is present that affects each\nstudent's generalization performance. In the nonzero temperature regime, we\nfind that the coupling of replicas leads to a bend of the phase diagram towards\nsmaller values of $\\alpha$: This suggests that the free entropy landscape gets\nsmoother around the solution with perfect generalization (i.e., the teacher) at\na fixed fraction of examples, allowing standard thermal updating algorithms\nsuch as Simulated Annealing to easily reach the teacher solution and avoid\ngetting trapped in metastable states as it happens in the unreplicated case,\neven in the computationally \\textit{easy} regime of the inference phase\ndiagram. These results provide additional analytic and numerical evidence for\nthe recently conjectured Bayes-optimal property of Replicated Simulated\nAnnealing (RSA) for a sufficient number of replicas. From a learning\nperspective, these results also suggest that multiple students working together\n(in this case reviewing the same data) are able to learn the same rule both\nsignificantly faster and with fewer examples, a property that could be\nexploited in the context of cooperative and federated learning.\n","authors":["Giovanni Catania","Aurélien Decelle","Beatriz Seoane"],"pdf_url":"https://arxiv.org/pdf/2308.03743v3.pdf","comment":"2 figures in the main, 5 figures in the appendix"}],"Multimedia":[{"id":"http://arxiv.org/abs/2406.00758v2","updated":"2024-06-05T17:05:55Z","published":"2024-06-02T14:22:09Z","title":"Once-for-All: Controllable Generative Image Compression with Dynamic\n  Granularity Adaption","summary":"  Although recent generative image compression methods have demonstrated\nimpressive potential in optimizing the rate-distortion-perception trade-off,\nthey still face the critical challenge of flexible rate adaption to diverse\ncompression necessities and scenarios. To overcome this challenge, this paper\nproposes a Controllable Generative Image Compression framework, Control-GIC,\nthe first capable of fine-grained bitrate adaption across a broad spectrum\nwhile ensuring high-fidelity and generality compression. We base Control-GIC on\na VQGAN framework representing an image as a sequence of variable-length codes\n(i.e. VQ-indices), which can be losslessly compressed and exhibits a direct\npositive correlation with the bitrates. Therefore, drawing inspiration from the\nclassical coding principle, we naturally correlate the information density of\nlocal image patches with their granular representations, to achieve dynamic\nadjustment of the code quantity following different granularity decisions. This\nimplies we can flexibly determine a proper allocation of granularity for the\npatches to acquire desirable compression rates. We further develop a\nprobabilistic conditional decoder that can trace back to historic encoded\nmulti-granularity representations according to transmitted codes, and then\nreconstruct hierarchical granular features in the formalization of conditional\nprobability, enabling more informative aggregation to improve reconstruction\nrealism. Our experiments show that Control-GIC allows highly flexible and\ncontrollable bitrate adaption and even once compression on an entire dataset to\nfulfill constrained bitrate conditions. Experimental results demonstrate its\nsuperior performance over recent state-of-the-art methods.\n","authors":["Anqi Li","Yuxi Liu","Huihui Bai","Feng Li","Runmin Cong","Meng Wang","Yao Zhao"],"pdf_url":"https://arxiv.org/pdf/2406.00758v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.03317v1","updated":"2024-06-05T14:29:44Z","published":"2024-06-05T14:29:44Z","title":"Save It for the \"Hot\" Day: An LLM-Empowered Visual Analytics System for\n  Heat Risk Management","summary":"  The escalating frequency and intensity of heat-related climate events,\nparticularly heatwaves, emphasize the pressing need for advanced heat risk\nmanagement strategies. Current approaches, primarily relying on numerical\nmodels, face challenges in spatial-temporal resolution and in capturing the\ndynamic interplay of environmental, social, and behavioral factors affecting\nheat risks. This has led to difficulties in translating risk assessments into\neffective mitigation actions. Recognizing these problems, we introduce a novel\napproach leveraging the burgeoning capabilities of Large Language Models (LLMs)\nto extract rich and contextual insights from news reports. We hence propose an\nLLM-empowered visual analytics system, Havior, that integrates the precise,\ndata-driven insights of numerical models with nuanced news report information.\nThis hybrid approach enables a more comprehensive assessment of heat risks and\nbetter identification, assessment, and mitigation of heat-related threats. The\nsystem incorporates novel visualization designs, such as \"thermoglyph\" and news\nglyph, enhancing intuitive understanding and analysis of heat risks. The\nintegration of LLM-based techniques also enables advanced information retrieval\nand semantic knowledge extraction that can be guided by experts' analytics\nneeds. Our case studies on two cities that faced significant heatwave events\nand interviews with five experts have demonstrated the usefulness of our system\nin providing in-depth and actionable insights for heat risk management.\n","authors":["Haobo Li","Wong Kam-Kwai","Yan Luo","Juntong Chen","Chengzhong Liu","Yaxuan Zhang","Alexis Kai Hon Lau","Huamin Qu","Dongyu Liu"],"pdf_url":"https://arxiv.org/pdf/2406.03317v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.03282v1","updated":"2024-06-05T13:55:56Z","published":"2024-06-05T13:55:56Z","title":"Globally and Locally Optimized Pannini Projection for High FoV Rendering\n  of 360-degree Images","summary":"  To render a spherical (360 degree or omnidirectional) image on planar\ndisplays, a 2D image -- called as viewport -- must be obtained by projecting a\nsphere region on a plane, according to the users viewing direction and a\npredefined field of view (FoV). However, any sphere to plan projection\nintroduces geometric distortions, such as object stretching and/or bending of\nstraight lines, which intensity increases with the considered FoV. In this\npaper, a fully automatic content-aware projection is proposed, aiming to reduce\nthe geometric distortions when high FoVs are used. This new projection is based\non the Pannini projection, whose parameters are firstly globally optimized\naccording to the image content, followed by a local conformality improvement of\nrelevant viewport objects. A crowdsourcing subjective test showed that the\nproposed projection is the most preferred solution among the considered\nstate-of-the-art sphere to plan projections, producing viewports with a more\npleasant visual quality.\n","authors":["Falah Jabar","Joao Ascenso","Maria Paula Queluz"],"pdf_url":"https://arxiv.org/pdf/2406.03282v1.pdf","comment":"15 pages, 12 figures, to be published in Signal Processing: Image\n  Communication"},{"id":"http://arxiv.org/abs/2403.18715v2","updated":"2024-06-05T13:53:42Z","published":"2024-03-27T16:04:47Z","title":"Mitigating Hallucinations in Large Vision-Language Models with\n  Instruction Contrastive Decoding","summary":"  Large Vision-Language Models (LVLMs) are increasingly adept at generating\ncontextually detailed and coherent responses from visual inputs. However, their\napplication in multimodal decision-making and open-ended generation is hindered\nby a notable rate of hallucinations, where generated text inaccurately\nrepresents the visual contents. To address this issue, this paper introduces\nthe Instruction Contrastive Decoding (ICD) method, a novel approach designed to\nreduce hallucinations during LVLM inference. Our method is inspired by our\nobservation that what we call disturbance instructions significantly exacerbate\nhallucinations in multimodal fusion modules. ICD contrasts distributions from\nstandard and instruction disturbance, thereby increasing alignment uncertainty\nand effectively subtracting hallucinated concepts from the original\ndistribution. Through comprehensive experiments on discriminative benchmarks\n(POPE and MME) and a generative benchmark (LLaVa-Bench), we demonstrate that\nICD significantly mitigates both object-level and attribute-level\nhallucinations. Moreover, our method not only addresses hallucinations but also\nsignificantly enhances the general perception and recognition capabilities of\nLVLMs.\n","authors":["Xintong Wang","Jingheng Pan","Liang Ding","Chris Biemann"],"pdf_url":"https://arxiv.org/pdf/2403.18715v2.pdf","comment":"Accepted to Findings of ACL 2024"},{"id":"http://arxiv.org/abs/2406.03071v1","updated":"2024-06-05T08:56:24Z","published":"2024-06-05T08:56:24Z","title":"Exploiting LMM-based knowledge for image classification tasks","summary":"  In this paper we address image classification tasks leveraging knowledge\nencoded in Large Multimodal Models (LMMs). More specifically, we use the\nMiniGPT-4 model to extract semantic descriptions for the images, in a\nmultimodal prompting fashion. In the current literature, vision language models\nsuch as CLIP, among other approaches, are utilized as feature extractors, using\nonly the image encoder, for solving image classification tasks. In this paper,\nwe propose to additionally use the text encoder to obtain the text embeddings\ncorresponding to the MiniGPT-4-generated semantic descriptions. Thus, we use\nboth the image and text embeddings for solving the image classification task.\nThe experimental evaluation on three datasets validates the improved\nclassification performance achieved by exploiting LMM-based knowledge.\n","authors":["Maria Tzelepi","Vasileios Mezaris"],"pdf_url":"https://arxiv.org/pdf/2406.03071v1.pdf","comment":"Accepted for publication, 25th Int. Conf. on Engineering Applications\n  of Neural Networks (EANN/EAAAI 2024), Corfu, Greece, June 2024. This is the\n  \"submitted manuscript\""},{"id":"http://arxiv.org/abs/2406.02991v1","updated":"2024-06-05T06:43:48Z","published":"2024-06-05T06:43:48Z","title":"A Human-Annotated Video Dataset for Training and Evaluation of\n  360-Degree Video Summarization Methods","summary":"  In this paper we introduce a new dataset for 360-degree video summarization:\nthe transformation of 360-degree video content to concise 2D-video summaries\nthat can be consumed via traditional devices, such as TV sets and smartphones.\nThe dataset includes ground-truth human-generated summaries, that can be used\nfor training and objectively evaluating 360-degree video summarization methods.\nUsing this dataset, we train and assess two state-of-the-art summarization\nmethods that were originally proposed for 2D-video summarization, to serve as a\nbaseline for future comparisons with summarization methods that are\nspecifically tailored to 360-degree video. Finally, we present an interactive\ntool that was developed to facilitate the data annotation process and can\nassist other annotation activities that rely on video fragment selection.\n","authors":["Ioannis Kontostathis","Evlampios Apostolidis","Vasileios Mezaris"],"pdf_url":"https://arxiv.org/pdf/2406.02991v1.pdf","comment":"Accepted for publication, 1st Int. Workshop on Video for Immersive\n  Experiences (Video4IMX-2024) at ACM IMX 2024, Stockholm, Sweden, June 2024.\n  This is the \"accepted version\""},{"id":"http://arxiv.org/abs/2403.15226v2","updated":"2024-06-05T05:52:44Z","published":"2024-03-22T14:20:34Z","title":"Not All Attention is Needed: Parameter and Computation Efficient\n  Transfer Learning for Multi-modal Large Language Models","summary":"  In this paper, we propose a novel parameter and computation efficient tuning\nmethod for Multi-modal Large Language Models (MLLMs), termed Efficient\nAttention Skipping (EAS). Concretely, we first reveal that multi-head\nattentions (MHAs), the main computational overhead of MLLMs, are often\nredundant to downstream tasks. Based on this observation, EAS evaluates the\nattention redundancy and skips the less important MHAs to speed up inference.\nBesides, we also propose a novel propagation-of-information adapter (PIA) to\nserve the attention skipping of EAS and keep parameter efficiency, which can be\nfurther re-parameterized into feed-forward networks (FFNs) for zero-extra\nlatency. To validate EAS, we apply it to a recently proposed MLLM called LaVIN\nand a classic VL pre-trained model called METER, and conduct extensive\nexperiments on a set of benchmarks. The experiments show that EAS not only\nretains high performance and parameter efficiency, but also greatly speeds up\ninference speed. For instance, LaVIN-EAS can obtain 89.98\\% accuracy on\nScineceQA while speeding up inference by 2.2 times to LaVIN\n","authors":["Qiong Wu","Weihao Ye","Yiyi Zhou","Xiaoshuai Sun","Rongrong Ji"],"pdf_url":"https://arxiv.org/pdf/2403.15226v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.02951v1","updated":"2024-06-05T05:20:12Z","published":"2024-06-05T05:20:12Z","title":"AVFF: Audio-Visual Feature Fusion for Video Deepfake Detection","summary":"  With the rapid growth in deepfake video content, we require improved and\ngeneralizable methods to detect them. Most existing detection methods either\nuse uni-modal cues or rely on supervised training to capture the dissonance\nbetween the audio and visual modalities. While the former disregards the\naudio-visual correspondences entirely, the latter predominantly focuses on\ndiscerning audio-visual cues within the training corpus, thereby potentially\noverlooking correspondences that can help detect unseen deepfakes. We present\nAudio-Visual Feature Fusion (AVFF), a two-stage cross-modal learning method\nthat explicitly captures the correspondence between the audio and visual\nmodalities for improved deepfake detection. The first stage pursues\nrepresentation learning via self-supervision on real videos to capture the\nintrinsic audio-visual correspondences. To extract rich cross-modal\nrepresentations, we use contrastive learning and autoencoding objectives, and\nintroduce a novel audio-visual complementary masking and feature fusion\nstrategy. The learned representations are tuned in the second stage, where\ndeepfake classification is pursued via supervised learning on both real and\nfake videos. Extensive experiments and analysis suggest that our novel\nrepresentation learning paradigm is highly discriminative in nature. We report\n98.6% accuracy and 99.1% AUC on the FakeAVCeleb dataset, outperforming the\ncurrent audio-visual state-of-the-art by 14.9% and 9.9%, respectively.\n","authors":["Trevine Oorloff","Surya Koppisetti","Nicolò Bonettini","Divyaraj Solanki","Ben Colman","Yaser Yacoob","Ali Shahriyari","Gaurav Bharaj"],"pdf_url":"https://arxiv.org/pdf/2406.02951v1.pdf","comment":"Accepted to CVPR 2024"}]},"2024-06-06T00:00:00Z":{"Computation and Language":[{"id":"http://arxiv.org/abs/2406.04344v1","updated":"2024-06-06T17:59:56Z","published":"2024-06-06T17:59:56Z","title":"Verbalized Machine Learning: Revisiting Machine Learning with Language\n  Models","summary":"  Motivated by the large progress made by large language models (LLMs), we\nintroduce the framework of verbalized machine learning (VML). In contrast to\nconventional machine learning models that are typically optimized over a\ncontinuous parameter space, VML constrains the parameter space to be\nhuman-interpretable natural language. Such a constraint leads to a new\nperspective of function approximation, where an LLM with a text prompt can be\nviewed as a function parameterized by the text prompt. Guided by this\nperspective, we revisit classical machine learning problems, such as regression\nand classification, and find that these problems can be solved by an\nLLM-parameterized learner and optimizer. The major advantages of VML include\n(1) easy encoding of inductive bias: prior knowledge about the problem and\nhypothesis class can be encoded in natural language and fed into the\nLLM-parameterized learner; (2) automatic model class selection: the optimizer\ncan automatically select a concrete model class based on data and verbalized\nprior knowledge, and it can update the model class during training; and (3)\ninterpretable learner updates: the LLM-parameterized optimizer can provide\nexplanations for why each learner update is performed. We conduct several\nstudies to empirically evaluate the effectiveness of VML, and hope that VML can\nserve as a stepping stone to stronger interpretability and trustworthiness in\nML.\n","authors":["Tim Z. Xiao","Robert Bamler","Bernhard Schölkopf","Weiyang Liu"],"pdf_url":"https://arxiv.org/pdf/2406.04344v1.pdf","comment":"Technical Report v1 (92 pages, 15 figures)"},{"id":"http://arxiv.org/abs/2406.04331v1","updated":"2024-06-06T17:59:10Z","published":"2024-06-06T17:59:10Z","title":"PaCE: Parsimonious Concept Engineering for Large Language Models","summary":"  Large Language Models (LLMs) are being used for a wide variety of tasks.\nWhile they are capable of generating human-like responses, they can also\nproduce undesirable output including potentially harmful information, racist or\nsexist language, and hallucinations. Alignment methods are designed to reduce\nsuch undesirable output, via techniques such as fine-tuning, prompt\nengineering, and representation engineering. However, existing methods face\nseveral challenges: some require costly fine-tuning for every alignment task;\nsome do not adequately remove undesirable concepts, failing alignment; some\nremove benign concepts, lowering the linguistic capabilities of LLMs. To\naddress these issues, we propose Parsimonious Concept Engineering (PaCE), a\nnovel activation engineering framework for alignment. First, to sufficiently\nmodel the concepts, we construct a large-scale concept dictionary in the\nactivation space, in which each atom corresponds to a semantic concept. Then,\ngiven any alignment task, we instruct a concept partitioner to efficiently\nannotate the concepts as benign or undesirable. Finally, at inference time, we\ndecompose the LLM activations along the concept dictionary via sparse coding,\nto accurately represent the activation as a linear combination of the benign\nand undesirable components. By removing the latter ones from the activation, we\nreorient the behavior of LLMs towards alignment goals. We conduct experiments\non tasks such as response detoxification, faithfulness enhancement, and\nsentiment revising, and show that PaCE achieves state-of-the-art alignment\nperformance while maintaining linguistic capabilities.\n","authors":["Jinqi Luo","Tianjiao Ding","Kwan Ho Ryan Chan","Darshan Thaker","Aditya Chattopadhyay","Chris Callison-Burch","René Vidal"],"pdf_url":"https://arxiv.org/pdf/2406.04331v1.pdf","comment":"26 pages, 17 figures, 5 tables, dataset and code at\n  https://github.com/peterljq/Parsimonious-Concept-Engineering"},{"id":"http://arxiv.org/abs/2406.04313v1","updated":"2024-06-06T17:57:04Z","published":"2024-06-06T17:57:04Z","title":"Improving Alignment and Robustness with Short Circuiting","summary":"  AI systems can take harmful actions and are highly vulnerable to adversarial\nattacks. We present an approach, inspired by recent advances in representation\nengineering, that \"short-circuits\" models as they respond with harmful outputs.\nExisting techniques aimed at improving alignment, such as refusal training, are\noften bypassed. Techniques such as adversarial training try to plug these holes\nby countering specific attacks. As an alternative to refusal training and\nadversarial training, short-circuiting directly controls the representations\nthat are responsible for harmful outputs in the first place. Our technique can\nbe applied to both text-only and multimodal language models to prevent the\ngeneration of harmful outputs without sacrificing utility -- even in the\npresence of powerful unseen attacks. Notably, while adversarial robustness in\nstandalone image recognition remains an open challenge, short-circuiting allows\nthe larger multimodal system to reliably withstand image \"hijacks\" that aim to\nproduce harmful content. Finally, we extend our approach to AI agents,\ndemonstrating considerable reductions in the rate of harmful actions when they\nare under attack. Our approach represents a significant step forward in the\ndevelopment of reliable safeguards to harmful behavior and adversarial attacks.\n","authors":["Andy Zou","Long Phan","Justin Wang","Derek Duenas","Maxwell Lin","Maksym Andriushchenko","Rowan Wang","Zico Kolter","Matt Fredrikson","Dan Hendrycks"],"pdf_url":"https://arxiv.org/pdf/2406.04313v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.12991v2","updated":"2024-06-06T17:46:48Z","published":"2024-02-20T13:20:39Z","title":"TRAP: Targeted Random Adversarial Prompt Honeypot for Black-Box\n  Identification","summary":"  Large Language Model (LLM) services and models often come with legal rules on\nwho can use them and how they must use them. Assessing the compliance of the\nreleased LLMs is crucial, as these rules protect the interests of the LLM\ncontributor and prevent misuse. In this context, we describe the novel\nfingerprinting problem of Black-box Identity Verification (BBIV). The goal is\nto determine whether a third-party application uses a certain LLM through its\nchat function. We propose a method called Targeted Random Adversarial Prompt\n(TRAP) that identifies the specific LLM in use. We repurpose adversarial\nsuffixes, originally proposed for jailbreaking, to get a pre-defined answer\nfrom the target LLM, while other models give random answers. TRAP detects the\ntarget LLMs with over 95% true positive rate at under 0.2% false positive rate\neven after a single interaction. TRAP remains effective even if the LLM has\nminor changes that do not significantly alter the original function.\n","authors":["Martin Gubri","Dennis Ulmer","Hwaran Lee","Sangdoo Yun","Seong Joon Oh"],"pdf_url":"https://arxiv.org/pdf/2402.12991v2.pdf","comment":"Accepted at ACL 2024 (findings)"},{"id":"http://arxiv.org/abs/2401.06688v2","updated":"2024-06-06T17:45:39Z","published":"2024-01-12T16:52:41Z","title":"Don't Rank, Combine! Combining Machine Translation Hypotheses Using\n  Quality Estimation","summary":"  Neural machine translation systems estimate probabilities of target sentences\ngiven source sentences, yet these estimates may not align with human\npreferences. This work introduces QE-fusion, a method that synthesizes\ntranslations using a quality estimation metric (QE), which correlates better\nwith human judgments. QE-fusion leverages a pool of candidates sampled from a\nmodel, combining spans from different candidates using a QE metric such as\nCometKiwi. We compare QE-fusion against beam search and recent reranking\ntechniques, such as Minimum Bayes Risk decoding or QE-reranking. Our method\nconsistently improves translation quality in terms of COMET and BLEURT scores\nwhen applied to large language models (LLMs) used for translation (PolyLM,\nXGLM, Llama2, Mistral, ALMA, and Tower) and to multilingual translation models\n(NLLB), over five language pairs. Notably, QE-fusion exhibits larger\nimprovements for LLMs due to their ability to generate diverse outputs. We\ndemonstrate that our approach generates novel translations in over half of the\ncases and consistently outperforms other methods across varying numbers of\ncandidates (5-200). Furthermore, we empirically establish that QE-fusion scales\nlinearly with the number of candidates in the pool.\n","authors":["Giorgos Vernikos","Andrei Popescu-Belis"],"pdf_url":"https://arxiv.org/pdf/2401.06688v2.pdf","comment":"Accepted at ACL 2024"},{"id":"http://arxiv.org/abs/2406.04298v1","updated":"2024-06-06T17:42:37Z","published":"2024-06-06T17:42:37Z","title":"Measuring and Addressing Indexical Bias in Information Retrieval","summary":"  Information Retrieval (IR) systems are designed to deliver relevant content,\nbut traditional systems may not optimize rankings for fairness, neutrality, or\nthe balance of ideas. Consequently, IR can often introduce indexical biases, or\nbiases in the positional order of documents. Although indexical bias can\ndemonstrably affect people's opinion, voting patterns, and other behaviors,\nthese issues remain understudied as the field lacks reliable metrics and\nprocedures for automatically measuring indexical bias. Towards this end, we\nintroduce the PAIR framework, which supports automatic bias audits for ranked\ndocuments or entire IR systems. After introducing DUO, the first\ngeneral-purpose automatic bias metric, we run an extensive evaluation of 8 IR\nsystems on a new corpus of 32k synthetic and 4.7k natural documents, with 4k\nqueries spanning 1.4k controversial issue topics. A human behavioral study\nvalidates our approach, showing that our bias metric can help predict when and\nhow indexical bias will shift a reader's opinion.\n","authors":["Caleb Ziems","William Held","Jane Dwivedi-Yu","Diyi Yang"],"pdf_url":"https://arxiv.org/pdf/2406.04298v1.pdf","comment":"ACL 2024"},{"id":"http://arxiv.org/abs/2403.07974v2","updated":"2024-06-06T17:41:21Z","published":"2024-03-12T17:58:04Z","title":"LiveCodeBench: Holistic and Contamination Free Evaluation of Large\n  Language Models for Code","summary":"  Large Language Models (LLMs) applied to code-related applications have\nemerged as a prominent field, attracting significant interest from both\nacademia and industry. However, as new and improved LLMs are developed,\nexisting evaluation benchmarks (e.g., HumanEval, MBPP) are no longer sufficient\nfor assessing their capabilities. In this work, we propose LiveCodeBench, a\ncomprehensive and contamination-free evaluation of LLMs for code, which\ncontinuously collects new problems over time from contests across three\ncompetition platforms, namely LeetCode, AtCoder, and CodeForces. Notably, our\nbenchmark also focuses on a broader range of code related capabilities, such as\nself-repair, code execution, and test output prediction, beyond just code\ngeneration. Currently, LiveCodeBench hosts four hundred high-quality coding\nproblems that were published between May 2023 and May 2024. We have evaluated\n18 base LLMs and 34 instruction-tuned LLMs on LiveCodeBench. We present\nempirical findings on contamination, holistic performance comparisons,\npotential overfitting in existing benchmarks as well as individual model\ncomparisons. We will release all prompts and model completions for further\ncommunity analysis, along with a general toolkit for adding new scenarios and\nmodel\n","authors":["Naman Jain","King Han","Alex Gu","Wen-Ding Li","Fanjia Yan","Tianjun Zhang","Sida Wang","Armando Solar-Lezama","Koushik Sen","Ion Stoica"],"pdf_url":"https://arxiv.org/pdf/2403.07974v2.pdf","comment":"Website - https://livecodebench.github.io/"},{"id":"http://arxiv.org/abs/2406.04292v1","updated":"2024-06-06T17:37:47Z","published":"2024-06-06T17:37:47Z","title":"VISTA: Visualized Text Embedding For Universal Multi-Modal Retrieval","summary":"  Multi-modal retrieval becomes increasingly popular in practice. However, the\nexisting retrievers are mostly text-oriented, which lack the capability to\nprocess visual information. Despite the presence of vision-language models like\nCLIP, the current methods are severely limited in representing the text-only\nand image-only data. In this work, we present a new embedding model VISTA for\nuniversal multi-modal retrieval. Our work brings forth threefold technical\ncontributions. Firstly, we introduce a flexible architecture which extends a\npowerful text encoder with the image understanding capability by introducing\nvisual token embeddings. Secondly, we develop two data generation strategies,\nwhich bring high-quality composed image-text to facilitate the training of the\nembedding model. Thirdly, we introduce a multi-stage training algorithm, which\nfirst aligns the visual token embedding with the text encoder using massive\nweakly labeled data, and then develops multi-modal representation capability\nusing the generated composed image-text data. In our experiments, VISTA\nachieves superior performances across a variety of multi-modal retrieval tasks\nin both zero-shot and supervised settings. Our model, data, and source code are\navailable at https://github.com/FlagOpen/FlagEmbedding.\n","authors":["Junjie Zhou","Zheng Liu","Shitao Xiao","Bo Zhao","Yongping Xiong"],"pdf_url":"https://arxiv.org/pdf/2406.04292v1.pdf","comment":"Accepted to ACL 2024 main conference"},{"id":"http://arxiv.org/abs/2406.04289v1","updated":"2024-06-06T17:34:24Z","published":"2024-06-06T17:34:24Z","title":"What Languages are Easy to Language-Model? A Perspective from Learning\n  Probabilistic Regular Languages","summary":"  What can large language models learn? By definition, language models (LM) are\ndistributions over strings. Therefore, an intuitive way of addressing the above\nquestion is to formalize it as a matter of learnability of classes of\ndistributions over strings. While prior work in this direction focused on\nassessing the theoretical limits, in contrast, we seek to understand the\nempirical learnability. Unlike prior empirical work, we evaluate neural LMs on\ntheir home turf-learning probabilistic languages-rather than as classifiers of\nformal languages. In particular, we investigate the learnability of regular LMs\n(RLMs) by RNN and Transformer LMs. We empirically test the learnability of RLMs\nas a function of various complexity parameters of the RLM and the hidden state\nsize of the neural LM. We find that the RLM rank, which corresponds to the size\nof linear space spanned by the logits of its conditional distributions, and the\nexpected length of sampled strings are strong and significant predictors of\nlearnability for both RNNs and Transformers. Several other predictors also\nreach significance, but with differing patterns between RNNs and Transformers.\n","authors":["Nadav Borenstein","Anej Svete","Robin Chan","Josef Valvoda","Franz Nowak","Isabelle Augenstein","Eleanor Chodroff","Ryan Cotterell"],"pdf_url":"https://arxiv.org/pdf/2406.04289v1.pdf","comment":"Accepted to ACL 2024"},{"id":"http://arxiv.org/abs/2401.16467v2","updated":"2024-06-06T17:31:07Z","published":"2024-01-29T18:45:30Z","title":"ReGAL: Refactoring Programs to Discover Generalizable Abstractions","summary":"  While large language models (LLMs) are increasingly being used for program\nsynthesis, they lack the global view needed to develop useful abstractions;\nthey generally predict programs one at a time, often repeating the same\nfunctionality. Generating redundant code from scratch is both inefficient and\nerror-prone. To address this, we propose Refactoring for Generalizable\nAbstraction Learning (ReGAL), a gradient-free method for learning a library of\nreusable functions via code refactorization, i.e., restructuring code without\nchanging its execution output. ReGAL learns from a small set of existing\nprograms, iteratively verifying and refining its abstractions via execution. We\nfind that the shared function libraries discovered by ReGAL make programs\neasier to predict across diverse domains. On five datasets -- LOGO graphics\ngeneration, Date reasoning, TextCraft (a Minecraft-based text-game) MATH, and\nTabMWP -- both open-source and proprietary LLMs improve in accuracy when\npredicting programs with ReGAL functions. For CodeLlama-13B, ReGAL results in\nabsolute accuracy increases of 11.5% on LOGO, 26.1% on date understanding, and\n8.1% on TextCraft, outperforming GPT-3.5 in two of three domains. Our analysis\nreveals ReGAL's abstractions encapsulate frequently-used subroutines as well as\nenvironment dynamics.\n","authors":["Elias Stengel-Eskin","Archiki Prasad","Mohit Bansal"],"pdf_url":"https://arxiv.org/pdf/2401.16467v2.pdf","comment":"ICML 2024 Camera-Ready; First two authors contributed equally; Code:\n  https://github.com/esteng/regal_program_learning"},{"id":"http://arxiv.org/abs/2406.04286v1","updated":"2024-06-06T17:29:57Z","published":"2024-06-06T17:29:57Z","title":"ABEX: Data Augmentation for Low-Resource NLU via Expanding Abstract\n  Descriptions","summary":"  We present ABEX, a novel and effective generative data augmentation\nmethodology for low-resource Natural Language Understanding (NLU) tasks. ABEX\nis based on ABstract-and-EXpand, a novel paradigm for generating diverse forms\nof an input document -- we first convert a document into its concise, abstract\ndescription and then generate new documents based on expanding the resultant\nabstraction. To learn the task of expanding abstract descriptions, we first\ntrain BART on a large-scale synthetic dataset with abstract-document pairs.\nNext, to generate abstract descriptions for a document, we propose a simple,\ncontrollable, and training-free method based on editing AMR graphs. ABEX brings\nthe best of both worlds: by expanding from abstract representations, it\npreserves the original semantic properties of the documents, like style and\nmeaning, thereby maintaining alignment with the original label and data\ndistribution. At the same time, the fundamental process of elaborating on\nabstract descriptions facilitates diverse generations. We demonstrate the\neffectiveness of ABEX on 4 NLU tasks spanning 12 datasets and 4 low-resource\nsettings. ABEX outperforms all our baselines qualitatively with improvements of\n0.04% - 38.8%. Qualitatively, ABEX outperforms all prior methods from\nliterature in terms of context and length diversity.\n","authors":["Sreyan Ghosh","Utkarsh Tyagi","Sonal Kumar","C. K. Evuru","S Ramaneswaran","S Sakshi","Dinesh Manocha"],"pdf_url":"https://arxiv.org/pdf/2406.04286v1.pdf","comment":"ACL 2024 Main Conference. Code and data:\n  https://github.com/Sreyan88/ABEX"},{"id":"http://arxiv.org/abs/2406.04278v1","updated":"2024-06-06T17:26:00Z","published":"2024-06-06T17:26:00Z","title":"Characterizing Similarities and Divergences in Conversational Tones in\n  Humans and LLMs by Sampling with People","summary":"  Conversational tones -- the manners and attitudes in which speakers\ncommunicate -- are essential to effective communication. Amidst the increasing\npopularization of Large Language Models (LLMs) over recent years, it becomes\nnecessary to characterize the divergences in their conversational tones\nrelative to humans. However, existing investigations of conversational\nmodalities rely on pre-existing taxonomies or text corpora, which suffer from\nexperimenter bias and may not be representative of real-world distributions for\nthe studies' psycholinguistic domains. Inspired by methods from cognitive\nscience, we propose an iterative method for simultaneously eliciting\nconversational tones and sentences, where participants alternate between two\ntasks: (1) one participant identifies the tone of a given sentence and (2) a\ndifferent participant generates a sentence based on that tone. We run 100\niterations of this process with human participants and GPT-4, then obtain a\ndataset of sentences and frequent conversational tones. In an additional\nexperiment, humans and GPT-4 annotated all sentences with all tones. With data\nfrom 1,339 human participants, 33,370 human judgments, and 29,900 GPT-4\nqueries, we show how our approach can be used to create an interpretable\ngeometric representation of relations between conversational tones in humans\nand GPT-4. This work demonstrates how combining ideas from machine learning and\ncognitive science can address challenges in human-computer interactions.\n","authors":["Dun-Ming Huang","Pol Van Rijn","Ilia Sucholutsky","Raja Marjieh","Nori Jacoby"],"pdf_url":"https://arxiv.org/pdf/2406.04278v1.pdf","comment":"Accepted to Main Conference at ACL 2024"},{"id":"http://arxiv.org/abs/2406.04274v1","updated":"2024-06-06T17:23:49Z","published":"2024-06-06T17:23:49Z","title":"Self-Play with Adversarial Critic: Provable and Scalable Offline\n  Alignment for Language Models","summary":"  This work studies the challenge of aligning large language models (LLMs) with\noffline preference data. We focus on alignment by Reinforcement Learning from\nHuman Feedback (RLHF) in particular. While popular preference optimization\nmethods exhibit good empirical performance in practice, they are not\ntheoretically guaranteed to converge to the optimal policy and can provably\nfail when the data coverage is sparse by classical offline reinforcement\nlearning (RL) results. On the other hand, a recent line of work has focused on\ntheoretically motivated preference optimization methods with provable\nguarantees, but these are not computationally efficient for large-scale\napplications like LLM alignment. To bridge this gap, we propose SPAC, a new\noffline preference optimization method with self-play, inspired by the\non-average pessimism technique from the offline RL literature, to be the first\nprovable and scalable approach to LLM alignment. We both provide theoretical\nanalysis for its convergence under single-policy concentrability for the\ngeneral function approximation setting and demonstrate its competitive\nempirical performance for LLM alignment on a 7B Mistral model with Open LLM\nLeaderboard evaluations.\n","authors":["Xiang Ji","Sanjeev Kulkarni","Mengdi Wang","Tengyang Xie"],"pdf_url":"https://arxiv.org/pdf/2406.04274v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.04271v1","updated":"2024-06-06T17:22:08Z","published":"2024-06-06T17:22:08Z","title":"Buffer of Thoughts: Thought-Augmented Reasoning with Large Language\n  Models","summary":"  We introduce Buffer of Thoughts (BoT), a novel and versatile\nthought-augmented reasoning approach for enhancing accuracy, efficiency and\nrobustness of large language models (LLMs). Specifically, we propose\nmeta-buffer to store a series of informative high-level thoughts, namely\nthought-template, distilled from the problem-solving processes across various\ntasks. Then for each problem, we retrieve a relevant thought-template and\nadaptively instantiate it with specific reasoning structures to conduct\nefficient reasoning. To guarantee the scalability and stability, we further\npropose buffer-manager to dynamically update the meta-buffer, thus enhancing\nthe capacity of meta-buffer as more tasks are solved. We conduct extensive\nexperiments on 10 challenging reasoning-intensive tasks, and achieve\nsignificant performance improvements over previous SOTA methods: 11% on Game of\n24, 20% on Geometric Shapes and 51% on Checkmate-in-One. Further analysis\ndemonstrate the superior generalization ability and model robustness of our\nBoT, while requiring only 12% of the cost of multi-query prompting methods\n(e.g., tree/graph of thoughts) on average. Notably, we find that our\nLlama3-8B+BoT has the potential to surpass Llama3-70B model. Our project is\navailable at: https://github.com/YangLing0818/buffer-of-thought-llm\n","authors":["Ling Yang","Zhaochen Yu","Tianjun Zhang","Shiyi Cao","Minkai Xu","Wentao Zhang","Joseph E. Gonzalez","Bin Cui"],"pdf_url":"https://arxiv.org/pdf/2406.04271v1.pdf","comment":"Project: https://github.com/YangLing0818/buffer-of-thought-llm"},{"id":"http://arxiv.org/abs/2309.09836v2","updated":"2024-06-06T17:21:37Z","published":"2023-09-18T14:53:08Z","title":"RECAP: Retrieval-Augmented Audio Captioning","summary":"  We present RECAP (REtrieval-Augmented Audio CAPtioning), a novel and\neffective audio captioning system that generates captions conditioned on an\ninput audio and other captions similar to the audio retrieved from a datastore.\nAdditionally, our proposed method can transfer to any domain without the need\nfor any additional fine-tuning. To generate a caption for an audio sample, we\nleverage an audio-text model CLAP to retrieve captions similar to it from a\nreplaceable datastore, which are then used to construct a prompt. Next, we feed\nthis prompt to a GPT-2 decoder and introduce cross-attention layers between the\nCLAP encoder and GPT-2 to condition the audio for caption generation.\nExperiments on two benchmark datasets, Clotho and AudioCaps, show that RECAP\nachieves competitive performance in in-domain settings and significant\nimprovements in out-of-domain settings. Additionally, due to its capability to\nexploit a large text-captions-only datastore in a training-free fashion, RECAP\nshows unique capabilities of captioning novel audio events never seen during\ntraining and compositional audios with multiple events. To promote research in\nthis space, we also release 150,000+ new weakly labeled captions for AudioSet,\nAudioCaps, and Clotho.\n","authors":["Sreyan Ghosh","Sonal Kumar","Chandra Kiran Reddy Evuru","Ramani Duraiswami","Dinesh Manocha"],"pdf_url":"https://arxiv.org/pdf/2309.09836v2.pdf","comment":"ICASSP 2024. Code and data: https://github.com/Sreyan88/RECAP"},{"id":"http://arxiv.org/abs/2402.11349v2","updated":"2024-06-06T17:20:21Z","published":"2024-02-17T17:52:24Z","title":"Language Models Don't Learn the Physical Manifestation of Language","summary":"  We argue that language-only models don't learn the physical manifestation of\nlanguage. We present an empirical investigation of visual-auditory properties\nof language through a series of tasks, termed H-Test. These tasks highlight a\nfundamental gap between human linguistic understanding and the sensory-deprived\nlinguistic understanding of LLMs. In support of our hypothesis, 1. deliberate\nreasoning (Chain-of-Thought), 2. few-shot examples, or 3. stronger LLM from the\nsame model family (LLaMA 2 13B -> LLaMA 2 70B) has no significant effect on\nH-Test performance.\n  We bring in the philosophical case of Mary, who learns about the world in a\nsensory-deprived environment as a useful conceptual framework to understand how\nlanguage-only models learn about the world (Jackson, 1986). Our experiments\nshow that some of the strongest proprietary LLMs stay near random chance\nbaseline accuracy of 50%, highlighting the limitations of linguistic knowledge\nacquired in the absence of sensory experience. Our code and data are available\nat <github.com/brucewlee/h-test>.\n","authors":["Bruce W. Lee","JaeHyuk Lim"],"pdf_url":"https://arxiv.org/pdf/2402.11349v2.pdf","comment":"ACL 2024 Main"},{"id":"http://arxiv.org/abs/2406.04267v1","updated":"2024-06-06T17:14:44Z","published":"2024-06-06T17:14:44Z","title":"Transformers need glasses! Information over-squashing in language tasks","summary":"  We study how information propagates in decoder-only Transformers, which are\nthe architectural backbone of most existing frontier large language models\n(LLMs). We rely on a theoretical signal propagation analysis -- specifically,\nwe analyse the representations of the last token in the final layer of the\nTransformer, as this is the representation used for next-token prediction. Our\nanalysis reveals a representational collapse phenomenon: we prove that certain\ndistinct sequences of inputs to the Transformer can yield arbitrarily close\nrepresentations in the final token. This effect is exacerbated by the\nlow-precision floating-point formats frequently used in modern LLMs. As a\nresult, the model is provably unable to respond to these sequences in different\nways -- leading to errors in, e.g., tasks involving counting or copying.\nFurther, we show that decoder-only Transformer language models can lose\nsensitivity to specific tokens in the input, which relates to the well-known\nphenomenon of over-squashing in graph neural networks. We provide empirical\nevidence supporting our claims on contemporary LLMs. Our theory also points to\nsimple solutions towards ameliorating these issues.\n","authors":["Federico Barbero","Andrea Banino","Steven Kapturowski","Dharshan Kumaran","João G. M. Araújo","Alex Vitvitskyi","Razvan Pascanu","Petar Veličković"],"pdf_url":"https://arxiv.org/pdf/2406.04267v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.04264v1","updated":"2024-06-06T17:09:32Z","published":"2024-06-06T17:09:32Z","title":"MLVU: A Comprehensive Benchmark for Multi-Task Long Video Understanding","summary":"  The evaluation of Long Video Understanding (LVU) performance poses an\nimportant but challenging research problem. Despite previous efforts, the\nexisting video understanding benchmarks are severely constrained by several\nissues, especially the insufficient lengths of videos, a lack of diversity in\nvideo types and evaluation tasks, and the inappropriateness for evaluating LVU\nperformances. To address the above problems, we propose a new benchmark, called\nMLVU (Multi-task Long Video Understanding Benchmark), for the comprehensive and\nin-depth evaluation of LVU. MLVU presents the following critical values: 1) The\nsubstantial and flexible extension of video lengths, which enables the\nbenchmark to evaluate LVU performance across a wide range of durations. 2) The\ninclusion of various video genres, e.g., movies, surveillance footage,\negocentric videos, cartoons, game videos, etc., which reflects the models' LVU\nperformances in different scenarios. 3) The development of diversified\nevaluation tasks, which enables a comprehensive examination of MLLMs' key\nabilities in long-video understanding. The empirical study with 20 latest MLLMs\nreveals significant room for improvement in today's technique, as all existing\nmethods struggle with most of the evaluation tasks and exhibit severe\nperformance degradation when handling longer videos. Additionally, it suggests\nthat factors such as context length, image-understanding quality, and the\nchoice of LLM backbone can play critical roles in future advancements. We\nanticipate that MLVU will advance the research of long video understanding by\nproviding a comprehensive and in-depth analysis of MLLMs.\n","authors":["Junjie Zhou","Yan Shu","Bo Zhao","Boya Wu","Shitao Xiao","Xi Yang","Yongping Xiong","Bo Zhang","Tiejun Huang","Zheng Liu"],"pdf_url":"https://arxiv.org/pdf/2406.04264v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.12621v2","updated":"2024-06-06T17:04:41Z","published":"2024-02-20T01:04:21Z","title":"Reflect-RL: Two-Player Online RL Fine-Tuning for LMs","summary":"  As language models (LMs) demonstrate their capabilities in various fields,\ntheir application to tasks requiring multi-round interactions has become\nincreasingly popular. These tasks usually have complex dynamics, so supervised\nfine-tuning (SFT) on a limited offline dataset does not yield good performance.\nHowever, only a few works attempted to directly train the LMs within\ninteractive decision-making environments. We aim to create an effective\napproach to fine-tune LMs with online reinforcement learning (RL) in these\nenvironments. We propose Reflect-RL, a two-player system to fine-tune an LM\nusing SFT and online RL, where a frozen reflection model (player) assists the\npolicy model (player). To generate data for the warm-up SFT stage, we use\nnegative example generation to enhance the error-correction ability of the\nreflection model. Furthermore, we designed single-prompt action enumeration and\napplied curriculum learning to allow the policy model to learn more\nefficiently. Empirically, we verify that Reflect-RL outperforms SFT and online\nRL without reflection. Testing results indicate GPT-2 XL 1.56B fine-tuned with\nReflect-RL outperforms larger open-source LMs, such as Mistral 7B. The\nbenchmarks, dataset, and code involved in this work are publicly available:\nhttps://github.com/zhourunlong/Reflect-RL.\n","authors":["Runlong Zhou","Simon S. Du","Beibin Li"],"pdf_url":"https://arxiv.org/pdf/2402.12621v2.pdf","comment":"ACL 2024"},{"id":"http://arxiv.org/abs/2406.04244v1","updated":"2024-06-06T16:41:39Z","published":"2024-06-06T16:41:39Z","title":"Benchmark Data Contamination of Large Language Models: A Survey","summary":"  The rapid development of Large Language Models (LLMs) like GPT-4, Claude-3,\nand Gemini has transformed the field of natural language processing. However,\nit has also resulted in a significant issue known as Benchmark Data\nContamination (BDC). This occurs when language models inadvertently incorporate\nevaluation benchmark information from their training data, leading to\ninaccurate or unreliable performance during the evaluation phase of the\nprocess. This paper reviews the complex challenge of BDC in LLM evaluation and\nexplores alternative assessment methods to mitigate the risks associated with\ntraditional benchmarks. The paper also examines challenges and future\ndirections in mitigating BDC risks, highlighting the complexity of the issue\nand the need for innovative solutions to ensure the reliability of LLM\nevaluation in real-world applications.\n","authors":["Cheng Xu","Shuhao Guan","Derek Greene","M-Tahar Kechadi"],"pdf_url":"https://arxiv.org/pdf/2406.04244v1.pdf","comment":"31 pages, 7 figures, 3 tables"},{"id":"http://arxiv.org/abs/2402.14116v2","updated":"2024-06-06T16:41:21Z","published":"2024-02-21T20:30:45Z","title":"FanOutQA: A Multi-Hop, Multi-Document Question Answering Benchmark for\n  Large Language Models","summary":"  One type of question that is commonly found in day-to-day scenarios is\n``fan-out'' questions, complex multi-hop, multi-document reasoning questions\nthat require finding information about a large number of entities. However,\nthere exist few resources to evaluate this type of question-answering\ncapability among large language models. To evaluate complex reasoning in LLMs\nmore fully, we present FanOutQA, a high-quality dataset of fan-out\nquestion-answer pairs and human-annotated decompositions with English Wikipedia\nas the knowledge base. We formulate three benchmark settings across our dataset\nand benchmark 7 LLMs, including GPT-4, LLaMA 2, Claude-2.1, and Mixtral-8x7B,\nfinding that contemporary models still have room to improve reasoning over\ninter-document dependencies in a long context. We provide our dataset and\nopen-source tools to run models to encourage evaluation at https://fanoutqa.com\n","authors":["Andrew Zhu","Alyssa Hwang","Liam Dugan","Chris Callison-Burch"],"pdf_url":"https://arxiv.org/pdf/2402.14116v2.pdf","comment":"18 pages, 2 figures. ACL 2024"},{"id":"http://arxiv.org/abs/2406.04240v1","updated":"2024-06-06T16:39:00Z","published":"2024-06-06T16:39:00Z","title":"Hypernetworks for Personalizing ASR to Atypical Speech","summary":"  Parameter-efficient fine-tuning (PEFT) for personalizing automatic speech\nrecognition (ASR) has recently shown promise for adapting general population\nmodels to atypical speech. However, these approaches assume a priori knowledge\nof the atypical speech disorder being adapted for -- the diagnosis of which\nrequires expert knowledge that is not always available. Even given this\nknowledge, data scarcity and high inter/intra-speaker variability further limit\nthe effectiveness of traditional fine-tuning. To circumvent these challenges,\nwe first identify the minimal set of model parameters required for ASR\nadaptation. Our analysis of each individual parameter's effect on adaptation\nperformance allows us to reduce Word Error Rate (WER) by half while adapting\n0.03\\% of all weights. Alleviating the need for cohort-specific models, we next\npropose the novel use of a meta-learned hypernetwork to generate highly\nindividualized, utterance-level adaptations on-the-fly for a diverse set of\natypical speech characteristics. Evaluating adaptation at the global, cohort\nand individual-level, we show that hypernetworks generalize better to\nout-of-distribution speakers, while maintaining an overall relative WER\nreduction of 75.2% using 0.1% of the full parameter budget.\n","authors":["Max Mueller-Eberstein","Dianna Yee","Karren Yang","Gautam Varma Mantena","Colin Lea"],"pdf_url":"https://arxiv.org/pdf/2406.04240v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.01392v2","updated":"2024-06-06T16:38:34Z","published":"2024-06-03T14:56:09Z","title":"Sparsity-Accelerated Training for Large Language Models","summary":"  Large language models (LLMs) have demonstrated proficiency across various\nnatural language processing (NLP) tasks but often require additional training,\nsuch as continual pre-training and supervised fine-tuning. However, the costs\nassociated with this, primarily due to their large parameter count, remain\nhigh. This paper proposes leveraging \\emph{sparsity} in pre-trained LLMs to\nexpedite this training process. By observing sparsity in activated neurons\nduring forward iterations, we identify the potential for computational\nspeed-ups by excluding inactive neurons. We address associated challenges by\nextending existing neuron importance evaluation metrics and introducing a\nladder omission rate scheduler. Our experiments on Llama-2 demonstrate that\nSparsity-Accelerated Training (SAT) achieves comparable or superior performance\nto standard training while significantly accelerating the process.\nSpecifically, SAT achieves a $45\\%$ throughput improvement in continual\npre-training and saves $38\\%$ training time in supervised fine-tuning in\npractice. It offers a simple, hardware-agnostic, and easily deployable\nframework for additional LLM training. Our code is available at\nhttps://github.com/OpenDFM/SAT.\n","authors":["Da Ma","Lu Chen","Pengyu Wang","Hongshen Xu","Hanqi Li","Liangtai Sun","Su Zhu","Shuai Fan","Kai Yu"],"pdf_url":"https://arxiv.org/pdf/2406.01392v2.pdf","comment":"Accepted to ACL 2024 Findings"},{"id":"http://arxiv.org/abs/2406.04233v1","updated":"2024-06-06T16:31:47Z","published":"2024-06-06T16:31:47Z","title":"FairytaleQA Translated: Enabling Educational Question and Answer\n  Generation in Less-Resourced Languages","summary":"  Question Answering (QA) datasets are crucial in assessing reading\ncomprehension skills for both machines and humans. While numerous datasets have\nbeen developed in English for this purpose, a noticeable void exists in\nless-resourced languages. To alleviate this gap, our paper introduces\nmachine-translated versions of FairytaleQA, a renowned QA dataset designed to\nassess and enhance narrative comprehension skills in young children. By\nemploying fine-tuned, modest-scale models, we establish benchmarks for both\nQuestion Generation (QG) and QA tasks within the translated datasets. In\naddition, we present a case study proposing a model for generating\nquestion-answer pairs, with an evaluation incorporating quality metrics such as\nquestion well-formedness, answerability, relevance, and children suitability.\nOur evaluation prioritizes quantifying and describing error cases, along with\nproviding directions for future work. This paper contributes to the advancement\nof QA and QG research in less-resourced languages, promoting accessibility and\ninclusivity in the development of these models for reading comprehension. The\ncode and data is publicly available at\ngithub.com/bernardoleite/fairytaleqa-translated.\n","authors":["Bernardo Leite","Tomás Freitas Osório","Henrique Lopes Cardoso"],"pdf_url":"https://arxiv.org/pdf/2406.04233v1.pdf","comment":"Preprint - Accepted for publication at ECTEL 2024"},{"id":"http://arxiv.org/abs/2406.04229v1","updated":"2024-06-06T16:29:25Z","published":"2024-06-06T16:29:25Z","title":"The CLRS-Text Algorithmic Reasoning Language Benchmark","summary":"  Eliciting reasoning capabilities from language models (LMs) is a critical\ndirection on the path towards building intelligent systems. Most recent studies\ndedicated to reasoning focus on out-of-distribution performance on\nprocedurally-generated synthetic benchmarks, bespoke-built to evaluate specific\nskills only. This trend makes results hard to transfer across publications,\nslowing down progress. Three years ago, a similar issue was identified and\nrectified in the field of neural algorithmic reasoning, with the advent of the\nCLRS benchmark. CLRS is a dataset generator comprising graph execution traces\nof classical algorithms from the Introduction to Algorithms textbook. Inspired\nby this, we propose CLRS-Text -- a textual version of these algorithmic traces.\nOut of the box, CLRS-Text is capable of procedurally generating trace data for\nthirty diverse, challenging algorithmic tasks across any desirable input\ndistribution, while offering a standard pipeline in which any additional\nalgorithmic tasks may be created in the benchmark. We fine-tune and evaluate\nvarious LMs as generalist executors on this benchmark, validating prior work\nand revealing a novel, interesting challenge for the LM reasoning community.\nOur code is available at\nhttps://github.com/google-deepmind/clrs/tree/master/clrs/_src/clrs_text.\n","authors":["Larisa Markeeva","Sean McLeish","Borja Ibarz","Wilfried Bounsi","Olga Kozlova","Alex Vitvitskyi","Charles Blundell","Tom Goldstein","Avi Schwarzschild","Petar Veličković"],"pdf_url":"https://arxiv.org/pdf/2406.04229v1.pdf","comment":"Preprint, under review. Comments welcome"},{"id":"http://arxiv.org/abs/2406.04220v1","updated":"2024-06-06T16:18:30Z","published":"2024-06-06T16:18:30Z","title":"BEADs: Bias Evaluation Across Domains","summary":"  Recent improvements in large language models (LLMs) have significantly\nenhanced natural language processing (NLP) applications. However, these models\ncan also inherit and perpetuate biases from their training data. Addressing\nthis issue is crucial, yet many existing datasets do not offer evaluation\nacross diverse NLP tasks. To tackle this, we introduce the Bias Evaluations\nAcross Domains (BEADs) dataset, designed to support a wide range of NLP tasks,\nincluding text classification, bias entity recognition, bias quantification,\nand benign language generation. BEADs uses AI-driven annotation combined with\nexperts' verification to provide reliable labels. This method overcomes the\nlimitations of existing datasets that typically depend on crowd-sourcing,\nexpert-only annotations with limited bias evaluations, or unverified AI\nlabeling. Our empirical analysis shows that BEADs is effective in detecting and\nreducing biases across different language models, with smaller models\nfine-tuned on BEADs often outperforming LLMs in bias classification tasks.\nHowever, these models may still exhibit biases towards certain demographics.\nFine-tuning LLMs with our benign language data also reduces biases while\npreserving the models' knowledge. Our findings highlight the importance of\ncomprehensive bias evaluation and the potential of targeted fine-tuning for\nreducing the bias of LLMs. We are making BEADs publicly available at\nhttps://huggingface.co/datasets/shainar/BEAD\n  Warning: This paper contains examples that may be considered offensive.\n","authors":["Shaina Raza","Mizanur Rahman","Michael R. Zhang"],"pdf_url":"https://arxiv.org/pdf/2406.04220v1.pdf","comment":"under review"},{"id":"http://arxiv.org/abs/2406.04218v1","updated":"2024-06-06T16:18:02Z","published":"2024-06-06T16:18:02Z","title":"Rethinking LLM and Linguistic Steganalysis: An Efficient Detection of\n  Strongly Concealed Stego","summary":"  To detect stego (steganographic text) in complex scenarios, linguistic\nsteganalysis (LS) with various motivations has been proposed and achieved\nexcellent performance. However, with the development of generative\nsteganography, some stegos have strong concealment, especially after the\nemergence of LLMs-based steganography, the existing LS has low detection or\neven cannot detect them. We designed a novel LS with two modes called LSGC. In\nthe generation mode, we created an LS-task \"description\" and used the\ngeneration ability of LLM to explain whether texts to be detected are stegos.\nOn this basis, we rethought the principle of LS and LLMs, and proposed the\nclassification mode. In this mode, LSGC deleted the LS-task \"description\" and\nchanged the \"causalLM\" LLMs to the \"sequenceClassification\" architecture. The\nLS features can be extracted by only one pass of the model, and a linear layer\nwith initialization weights is added to obtain the classification probability.\nExperiments on strongly concealed stegos show that LSGC significantly improves\ndetection and reaches SOTA performance. Additionally, LSGC in classification\nmode greatly reduces training time while maintaining high performance.\n","authors":["Yifan Tang","Yihao Wang","Ru Zhang","Jianyi Liu"],"pdf_url":"https://arxiv.org/pdf/2406.04218v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.10013v2","updated":"2024-06-06T16:16:12Z","published":"2024-02-15T15:25:30Z","title":"Bridging the Empirical-Theoretical Gap in Neural Network Formal Language\n  Learning Using Minimum Description Length","summary":"  Neural networks offer good approximation to many tasks but consistently fail\nto reach perfect generalization, even when theoretical work shows that such\nperfect solutions can be expressed by certain architectures. Using the task of\nformal language learning, we focus on one simple formal language and show that\nthe theoretically correct solution is in fact not an optimum of commonly used\nobjectives -- even with regularization techniques that according to common\nwisdom should lead to simple weights and good generalization (L1, L2) or other\nmeta-heuristics (early-stopping, dropout). On the other hand, replacing\nstandard targets with the Minimum Description Length objective (MDL) results in\nthe correct solution being an optimum.\n","authors":["Nur Lan","Emmanuel Chemla","Roni Katzir"],"pdf_url":"https://arxiv.org/pdf/2402.10013v2.pdf","comment":"9 pages, 5 figures, 3 appendix pages"},{"id":"http://arxiv.org/abs/2406.04216v1","updated":"2024-06-06T16:15:34Z","published":"2024-06-06T16:15:34Z","title":"What Do Language Models Learn in Context? The Structured Task Hypothesis","summary":"  Large language models (LLMs) exhibit an intriguing ability to learn a novel\ntask from in-context examples presented in a demonstration, termed in-context\nlearning (ICL). Understandably, a swath of research has been dedicated to\nuncovering the theories underpinning ICL. One popular hypothesis explains ICL\nby task selection. LLMs identify the task based on the demonstration and\ngeneralize it to the prompt. Another popular hypothesis is that ICL is a form\nof meta-learning, i.e., the models learn a learning algorithm at pre-training\ntime and apply it to the demonstration. Finally, a third hypothesis argues that\nLLMs use the demonstration to select a composition of tasks learned during\npre-training to perform ICL. In this paper, we empirically explore these three\nhypotheses that explain LLMs' ability to learn in context with a suite of\nexperiments derived from common text classification tasks. We invalidate the\nfirst two hypotheses with counterexamples and provide evidence in support of\nthe last hypothesis. Our results suggest an LLM could learn a novel task in\ncontext via composing tasks learned during pre-training.\n","authors":["Jiaoda Li","Yifan Hou","Mrinmaya Sachan","Ryan Cotterell"],"pdf_url":"https://arxiv.org/pdf/2406.04216v1.pdf","comment":"This work is published in ACL 2024"},{"id":"http://arxiv.org/abs/2406.04215v1","updated":"2024-06-06T16:14:54Z","published":"2024-06-06T16:14:54Z","title":"mCSQA: Multilingual Commonsense Reasoning Dataset with Unified Creation\n  Strategy by Language Models and Humans","summary":"  It is very challenging to curate a dataset for language-specific knowledge\nand common sense in order to evaluate natural language understanding\ncapabilities of language models. Due to the limitation in the availability of\nannotators, most current multilingual datasets are created through translation,\nwhich cannot evaluate such language-specific aspects. Therefore, we propose\nMultilingual CommonsenseQA (mCSQA) based on the construction process of CSQA\nbut leveraging language models for a more efficient construction, e.g., by\nasking LM to generate questions/answers, refine answers and verify QAs followed\nby reduced human efforts for verification. Constructed dataset is a benchmark\nfor cross-lingual language-transfer capabilities of multilingual LMs, and\nexperimental results showed high language-transfer capabilities for questions\nthat LMs could easily solve, but lower transfer capabilities for questions\nrequiring deep knowledge or commonsense. This highlights the necessity of\nlanguage-specific datasets for evaluation and training. Finally, our method\ndemonstrated that multilingual LMs could create QA including language-specific\nknowledge, significantly reducing the dataset creation cost compared to manual\ncreation. The datasets are available at\nhttps://huggingface.co/datasets/yusuke1997/mCSQA.\n","authors":["Yusuke Sakai","Hidetaka Kamigaito","Taro Watanabe"],"pdf_url":"https://arxiv.org/pdf/2406.04215v1.pdf","comment":"Accepted at Findings of ACL 2024"},{"id":"http://arxiv.org/abs/2406.04214v1","updated":"2024-06-06T16:14:16Z","published":"2024-06-06T16:14:16Z","title":"ValueBench: Towards Comprehensively Evaluating Value Orientations and\n  Understanding of Large Language Models","summary":"  Large Language Models (LLMs) are transforming diverse fields and gaining\nincreasing influence as human proxies. This development underscores the urgent\nneed for evaluating value orientations and understanding of LLMs to ensure\ntheir responsible integration into public-facing applications. This work\nintroduces ValueBench, the first comprehensive psychometric benchmark for\nevaluating value orientations and value understanding in LLMs. ValueBench\ncollects data from 44 established psychometric inventories, encompassing 453\nmultifaceted value dimensions. We propose an evaluation pipeline grounded in\nrealistic human-AI interactions to probe value orientations, along with novel\ntasks for evaluating value understanding in an open-ended value space. With\nextensive experiments conducted on six representative LLMs, we unveil their\nshared and distinctive value orientations and exhibit their ability to\napproximate expert conclusions in value-related extraction and generation\ntasks. ValueBench is openly accessible at\nhttps://github.com/Value4AI/ValueBench.\n","authors":["Yuanyi Ren","Haoran Ye","Hanjun Fang","Xin Zhang","Guojie Song"],"pdf_url":"https://arxiv.org/pdf/2406.04214v1.pdf","comment":"Accepted at ACL 2024"},{"id":"http://arxiv.org/abs/2402.12451v2","updated":"2024-06-06T16:13:43Z","published":"2024-02-19T19:01:01Z","title":"The Revolution of Multimodal Large Language Models: A Survey","summary":"  Connecting text and visual modalities plays an essential role in generative\nintelligence. For this reason, inspired by the success of large language\nmodels, significant research efforts are being devoted to the development of\nMultimodal Large Language Models (MLLMs). These models can seamlessly integrate\nvisual and textual modalities, while providing a dialogue-based interface and\ninstruction-following capabilities. In this paper, we provide a comprehensive\nreview of recent visual-based MLLMs, analyzing their architectural choices,\nmultimodal alignment strategies, and training techniques. We also conduct a\ndetailed analysis of these models across a wide range of tasks, including\nvisual grounding, image generation and editing, visual understanding, and\ndomain-specific applications. Additionally, we compile and describe training\ndatasets and evaluation benchmarks, conducting comparisons among existing\nmodels in terms of performance and computational requirements. Overall, this\nsurvey offers a comprehensive overview of the current state of the art, laying\nthe groundwork for future MLLMs.\n","authors":["Davide Caffagni","Federico Cocchi","Luca Barsellotti","Nicholas Moratelli","Sara Sarto","Lorenzo Baraldi","Lorenzo Baraldi","Marcella Cornia","Rita Cucchiara"],"pdf_url":"https://arxiv.org/pdf/2402.12451v2.pdf","comment":"ACL 2024 (Findings)"},{"id":"http://arxiv.org/abs/2404.00929v2","updated":"2024-06-06T16:04:15Z","published":"2024-04-01T05:13:56Z","title":"A Survey on Multilingual Large Language Models: Corpora, Alignment, and\n  Bias","summary":"  Based on the foundation of Large Language Models (LLMs), Multilingual Large\nLanguage Models (MLLMs) have been developed to address the challenges of\nmultilingual natural language processing tasks, hoping to achieve knowledge\ntransfer from high-resource to low-resource languages. However, significant\nlimitations and challenges still exist, such as language imbalance,\nmultilingual alignment, and inherent bias. In this paper, we aim to provide a\ncomprehensive analysis of MLLMs, delving deeply into discussions surrounding\nthese critical issues. First of all, we start by presenting an overview of\nMLLMs, covering their evolution, key techniques, and multilingual capacities.\nSecondly, we explore widely utilized multilingual corpora for MLLMs' training\nand multilingual datasets oriented for downstream tasks that are crucial for\nenhancing the cross-lingual capability of MLLMs. Thirdly, we survey the\nexisting studies on multilingual representations and investigate whether the\ncurrent MLLMs can learn a universal language representation. Fourthly, we\ndiscuss bias on MLLMs including its category and evaluation metrics, and\nsummarize the existing debiasing techniques. Finally, we discuss existing\nchallenges and point out promising research directions. By demonstrating these\naspects, this paper aims to facilitate a deeper understanding of MLLMs and\ntheir potentiality in various domains.\n","authors":["Yuemei Xu","Ling Hu","Jiayi Zhao","Zihan Qiu","Yuqi Ye","Hanwen Gu"],"pdf_url":"https://arxiv.org/pdf/2404.00929v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.12464v4","updated":"2024-06-06T16:02:39Z","published":"2024-04-18T18:48:50Z","title":"NormAd: A Benchmark for Measuring the Cultural Adaptability of Large\n  Language Models","summary":"  The integration of Large Language Models (LLMs) into various global cultures\nfundamentally presents a cultural challenge: LLMs must navigate interactions,\nrespect social norms, and avoid transgressing cultural boundaries. However, it\nis still unclear if LLMs can adapt their outputs to diverse cultural norms. Our\nstudy focuses on this aspect. We introduce NormAd, a novel dataset, which\nincludes 2.6k stories that represent social and cultural norms from 75\ncountries, to assess the ability of LLMs to adapt to different granular levels\nof socio-cultural contexts such as the country of origin, its associated\ncultural values, and prevalent social norms. Our study reveals that LLMs\nstruggle with cultural reasoning across all contextual granularities, showing\nstronger adaptability to English-centric cultures over those from the Global\nSouth. Even with explicit social norms, the top-performing model,\nMistral-7b-Instruct, achieves only 81.8\\% accuracy, lagging behind the 95.6\\%\nachieved by humans. Evaluation on NormAd further reveals that LLMs struggle to\nadapt to stories involving gift-giving across cultures. Due to inherent\nagreement or sycophancy biases, LLMs find it considerably easier to assess the\nsocial acceptability of stories that adhere to cultural norms than those that\ndeviate from them. Our benchmark measures the cultural adaptability (or lack\nthereof) of LLMs, emphasizing the potential to make these technologies more\nequitable and useful for global audiences. We release the NormAd dataset and\nits associated code on GitHub.\n","authors":["Abhinav Rao","Akhila Yerukola","Vishwa Shah","Katharina Reinecke","Maarten Sap"],"pdf_url":"https://arxiv.org/pdf/2404.12464v4.pdf","comment":"Preprint. In Review"},{"id":"http://arxiv.org/abs/2311.09109v2","updated":"2024-06-06T16:01:08Z","published":"2023-11-15T16:56:49Z","title":"Does Pre-trained Language Model Actually Infer Unseen Links in Knowledge\n  Graph Completion?","summary":"  Knowledge graphs (KGs) consist of links that describe relationships between\nentities. Due to the difficulty of manually enumerating all relationships\nbetween entities, automatically completing them is essential for KGs. Knowledge\nGraph Completion (KGC) is a task that infers unseen relationships between\nentities in a KG. Traditional embedding-based KGC methods, such as RESCAL,\nTransE, DistMult, ComplEx, RotatE, HAKE, HousE, etc., infer missing links using\nonly the knowledge from training data. In contrast, the recent Pre-trained\nLanguage Model (PLM)-based KGC utilizes knowledge obtained during pre-training.\nTherefore, PLM-based KGC can estimate missing links between entities by reusing\nmemorized knowledge from pre-training without inference. This approach is\nproblematic because building KGC models aims to infer unseen links between\nentities. However, conventional evaluations in KGC do not consider inference\nand memorization abilities separately. Thus, a PLM-based KGC method, which\nachieves high performance in current KGC evaluations, may be ineffective in\npractical applications. To address this issue, we analyze whether PLM-based KGC\nmethods make inferences or merely access memorized knowledge. For this purpose,\nwe propose a method for constructing synthetic datasets specified in this\nanalysis and conclude that PLMs acquire the inference abilities required for\nKGC through pre-training, even though the performance improvements mostly come\nfrom textual information of entities and relations.\n","authors":["Yusuke Sakai","Hidetaka Kamigaito","Katsuhiko Hayashi","Taro Watanabe"],"pdf_url":"https://arxiv.org/pdf/2311.09109v2.pdf","comment":"Accepted at NAACL 2024 main oral, 15 pages, 10 figures"},{"id":"http://arxiv.org/abs/2406.04202v1","updated":"2024-06-06T16:00:20Z","published":"2024-06-06T16:00:20Z","title":"Legal Documents Drafting with Fine-Tuned Pre-Trained Large Language\n  Model","summary":"  With the development of large-scale Language Models (LLM), fine-tuning\npre-trained LLM has become a mainstream paradigm for solving downstream tasks\nof natural language processing. However, training a language model in the legal\nfield requires a large number of legal documents so that the language model can\nlearn legal terminology and the particularity of the format of legal documents.\nThe typical NLP approaches usually rely on many manually annotated data sets\nfor training. However, in the legal field application, it is difficult to\nobtain a large number of manually annotated data sets, which restricts the\ntypical method applied to the task of drafting legal documents. The\nexperimental results of this paper show that not only can we leverage a large\nnumber of annotation-free legal documents without Chinese word segmentation to\nfine-tune a large-scale language model, but more importantly, it can fine-tune\na pre-trained LLM on the local computer to achieve the generating legal\ndocument drafts task, and at the same time achieve the protection of\ninformation privacy and to improve information security issues.\n","authors":["Chun-Hsien Lin","Pu-Jen Cheng"],"pdf_url":"https://arxiv.org/pdf/2406.04202v1.pdf","comment":"12th International Conference on Software Engineering & Trends (SE\n  2024), April 27 ~ 28, 2024, Copenhagen, Denmark Volume Editors : David C.\n  Wyld, Dhinaharan Nagamalai (Eds) ISBN : 978-1-923107-24-3"},{"id":"http://arxiv.org/abs/2406.04197v1","updated":"2024-06-06T15:55:53Z","published":"2024-06-06T15:55:53Z","title":"DICE: Detecting In-distribution Contamination in LLM's Fine-tuning Phase\n  for Math Reasoning","summary":"  The advancement of large language models (LLMs) relies on evaluation using\npublic benchmarks, but data contamination can lead to overestimated\nperformance. Previous researches focus on detecting contamination by\ndetermining whether the model has seen the exact same data during training. In\nthis work, we argue that even training on data similar to benchmark data\ninflates performance on in-distribution tasks without improving overall\ncapacity, which we called In-distribution contamination. To effectively detect\nin-distribution contamination, we propose DICE, a novel method that leverages\nthe internal states of LLMs to locate-then-detect the contamination. DICE first\nidentifies the most sensitive layer to contamination, then trains a classifier\nbased on the internal states of that layer. Experiments reveal DICE's high\naccuracy in detecting in-distribution contamination across various LLMs and\nmath reasoning datasets. We also show the generalization capability of the\ntrained DICE detector, which is able to detect contamination across multiple\nbenchmarks with similar distributions. Additionally, we find that the DICE\ndetection scores are positively correlated with the performance of ten LLMs\nfine-tuned by either us or other organizations on four math reasoning datasets\n(with $R^2$ values between 0.6 and 0.75). This indicates that the\nin-distribution contamination problem potentially lead to an overestimation of\nthe true capabilities of many existing models. The code and data are available\nat https://github.com/THU-KEG/DICE.\n","authors":["Shangqing Tu","Kejian Zhu","Yushi Bai","Zijun Yao","Lei Hou","Juanzi Li"],"pdf_url":"https://arxiv.org/pdf/2406.04197v1.pdf","comment":"13 pages, 7 figures"},{"id":"http://arxiv.org/abs/2406.04175v1","updated":"2024-06-06T15:32:29Z","published":"2024-06-06T15:32:29Z","title":"Confabulation: The Surprising Value of Large Language Model\n  Hallucinations","summary":"  This paper presents a systematic defense of large language model (LLM)\nhallucinations or 'confabulations' as a potential resource instead of a\ncategorically negative pitfall. The standard view is that confabulations are\ninherently problematic and AI research should eliminate this flaw. In this\npaper, we argue and empirically demonstrate that measurable semantic\ncharacteristics of LLM confabulations mirror a human propensity to utilize\nincreased narrativity as a cognitive resource for sense-making and\ncommunication. In other words, it has potential value. Specifically, we analyze\npopular hallucination benchmarks and reveal that hallucinated outputs display\nincreased levels of narrativity and semantic coherence relative to veridical\noutputs. This finding reveals a tension in our usually dismissive\nunderstandings of confabulation. It suggests, counter-intuitively, that the\ntendency for LLMs to confabulate may be intimately associated with a positive\ncapacity for coherent narrative-text generation.\n","authors":["Peiqi Sui","Eamon Duede","Sophie Wu","Richard Jean So"],"pdf_url":"https://arxiv.org/pdf/2406.04175v1.pdf","comment":"Forthcoming at ACL2024 main conference. 1 figure"},{"id":"http://arxiv.org/abs/2312.03668v2","updated":"2024-06-06T15:24:16Z","published":"2023-12-06T18:34:42Z","title":"Integrating Pre-Trained Speech and Language Models for End-to-End Speech\n  Recognition","summary":"  Advances in machine learning have made it possible to perform various text\nand speech processing tasks, such as automatic speech recognition (ASR), in an\nend-to-end (E2E) manner. E2E approaches utilizing pre-trained models are\ngaining attention for conserving training data and resources. However, most of\ntheir applications in ASR involve only one of either a pre-trained speech or a\nlanguage model. This paper proposes integrating a pre-trained speech\nrepresentation model and a large language model (LLM) for E2E ASR. The proposed\nmodel enables the optimization of the entire ASR process, including acoustic\nfeature extraction and acoustic and language modeling, by combining pre-trained\nmodels with a bridge network and also enables the application of remarkable\ndevelopments in LLM utilization, such as parameter-efficient domain adaptation\nand inference optimization. Experimental results demonstrate that the proposed\nmodel achieves a performance comparable to that of modern E2E ASR models by\nutilizing powerful pre-training models with the proposed integrated approach.\n","authors":["Yukiya Hono","Koh Mitsuda","Tianyu Zhao","Kentaro Mitsui","Toshiaki Wakatsuki","Kei Sawada"],"pdf_url":"https://arxiv.org/pdf/2312.03668v2.pdf","comment":"17 pages, 4 figures, 9 tables, accepted for Findings of ACL 2024. The\n  model is available at https://huggingface.co/rinna/nue-asr"},{"id":"http://arxiv.org/abs/2212.10192v2","updated":"2024-06-06T15:20:27Z","published":"2022-12-20T12:03:19Z","title":"Adam: Dense Retrieval Distillation with Adaptive Dark Examples","summary":"  To improve the performance of the dual-encoder retriever, one effective\napproach is knowledge distillation from the cross-encoder ranker. Existing\nworks construct the candidate passages following the supervised learning\nsetting where a query is paired with a positive passage and a batch of\nnegatives. However, through empirical observation, we find that even the hard\nnegatives from advanced methods are still too trivial for the teacher to\ndistinguish, preventing the teacher from transferring abundant dark knowledge\nto the student through its soft label. To alleviate this issue, we propose\nADAM, a knowledge distillation framework that can better transfer the dark\nknowledge held in the teacher with Adaptive Dark exAMples. Different from\nprevious works that only rely on one positive and hard negatives as candidate\npassages, we create dark examples that all have moderate relevance to the query\nthrough mixing-up and masking in discrete space. Furthermore, as the quality of\nknowledge held in different training instances varies as measured by the\nteacher's confidence score, we propose a self-paced distillation strategy that\nadaptively concentrates on a subset of high-quality instances to conduct our\ndark-example-based knowledge distillation to help the student learn better. We\nconduct experiments on two widely-used benchmarks and verify the effectiveness\nof our method.\n","authors":["Chongyang Tao","Chang Liu","Tao Shen","Can Xu","Xiubo Geng","Binxing Jiao","Daxin Jiang"],"pdf_url":"https://arxiv.org/pdf/2212.10192v2.pdf","comment":"13 pages, 3 figures"},{"id":"http://arxiv.org/abs/2402.14298v3","updated":"2024-06-06T15:17:58Z","published":"2024-02-22T05:24:19Z","title":"Multi-modal Stance Detection: New Datasets and Model","summary":"  Stance detection is a challenging task that aims to identify public opinion\nfrom social media platforms with respect to specific targets. Previous work on\nstance detection largely focused on pure texts. In this paper, we study\nmulti-modal stance detection for tweets consisting of texts and images, which\nare prevalent in today's fast-growing social media platforms where people often\npost multi-modal messages. To this end, we create five new multi-modal stance\ndetection datasets of different domains based on Twitter, in which each example\nconsists of a text and an image. In addition, we propose a simple yet effective\nTargeted Multi-modal Prompt Tuning framework (TMPT), where target information\nis leveraged to learn multi-modal stance features from textual and visual\nmodalities. Experimental results on our five benchmark datasets show that the\nproposed TMPT achieves state-of-the-art performance in multi-modal stance\ndetection.\n","authors":["Bin Liang","Ang Li","Jingqian Zhao","Lin Gui","Min Yang","Yue Yu","Kam-Fai Wong","Ruifeng Xu"],"pdf_url":"https://arxiv.org/pdf/2402.14298v3.pdf","comment":"ACL'24 Findings"},{"id":"http://arxiv.org/abs/2406.04156v1","updated":"2024-06-06T15:17:51Z","published":"2024-06-06T15:17:51Z","title":"Pointer-Guided Pre-Training: Infusing Large Language Models with\n  Paragraph-Level Contextual Awareness","summary":"  We introduce \"pointer-guided segment ordering\" (SO), a novel pre-training\ntechnique aimed at enhancing the contextual understanding of paragraph-level\ntext representations in large language models. Our methodology leverages a\nself-attention-driven pointer network to restore the original sequence of\nshuffled text segments, addressing the challenge of capturing the structural\ncoherence and contextual dependencies within documents. This pre-training\napproach is complemented by a fine-tuning methodology that incorporates dynamic\nsampling, augmenting the diversity of training instances and improving sample\nefficiency for various downstream applications. We evaluate our method on a\ndiverse set of datasets, demonstrating its efficacy in tasks requiring\nsequential text classification across scientific literature and financial\nreporting domains. Our experiments show that pointer-guided pre-training\nsignificantly enhances the model's ability to understand complex document\nstructures, leading to state-of-the-art performance in downstream\nclassification tasks.\n","authors":["Lars Hillebrand","Prabhupad Pradhan","Christian Bauckhage","Rafet Sifa"],"pdf_url":"https://arxiv.org/pdf/2406.04156v1.pdf","comment":"17 pages, 3 figures, 5 tables, accepted at ECML-PKDD 2024"},{"id":"http://arxiv.org/abs/2406.04151v1","updated":"2024-06-06T15:15:41Z","published":"2024-06-06T15:15:41Z","title":"AgentGym: Evolving Large Language Model-based Agents across Diverse\n  Environments","summary":"  Building generalist agents that can handle diverse tasks and evolve\nthemselves across different environments is a long-term goal in the AI\ncommunity. Large language models (LLMs) are considered a promising foundation\nto build such agents due to their generalized capabilities. Current approaches\neither have LLM-based agents imitate expert-provided trajectories step-by-step,\nrequiring human supervision, which is hard to scale and limits environmental\nexploration; or they let agents explore and learn in isolated environments,\nresulting in specialist agents with limited generalization. In this paper, we\ntake the first step towards building generally-capable LLM-based agents with\nself-evolution ability. We identify a trinity of ingredients: 1) diverse\nenvironments for agent exploration and learning, 2) a trajectory set to equip\nagents with basic capabilities and prior knowledge, and 3) an effective and\nscalable evolution method. We propose AgentGym, a new framework featuring a\nvariety of environments and tasks for broad, real-time, uni-format, and\nconcurrent agent exploration. AgentGym also includes a database with expanded\ninstructions, a benchmark suite, and high-quality trajectories across\nenvironments. Next, we propose a novel method, AgentEvol, to investigate the\npotential of agent self-evolution beyond previously seen data across tasks and\nenvironments. Experimental results show that the evolved agents can achieve\nresults comparable to SOTA models. We release the AgentGym suite, including the\nplatform, dataset, benchmark, checkpoints, and algorithm implementations. The\nAgentGym suite is available on https://github.com/WooooDyy/AgentGym.\n","authors":["Zhiheng Xi","Yiwen Ding","Wenxiang Chen","Boyang Hong","Honglin Guo","Junzhe Wang","Dingwen Yang","Chenyang Liao","Xin Guo","Wei He","Songyang Gao","Lu Chen","Rui Zheng","Yicheng Zou","Tao Gui","Qi Zhang","Xipeng Qiu","Xuanjing Huang","Zuxuan Wu","Yu-Gang Jiang"],"pdf_url":"https://arxiv.org/pdf/2406.04151v1.pdf","comment":"Project site: https://agentgym.github.io"},{"id":"http://arxiv.org/abs/2402.15082v2","updated":"2024-06-06T15:11:29Z","published":"2024-02-23T03:59:18Z","title":"PEMT: Multi-Task Correlation Guided Mixture-of-Experts Enables\n  Parameter-Efficient Transfer Learning","summary":"  Parameter-efficient fine-tuning (PEFT) has emerged as an effective method for\nadapting pre-trained language models to various tasks efficiently. Recently,\nthere has been a growing interest in transferring knowledge from one or\nmultiple tasks to the downstream target task to achieve performance\nimprovements. However, current approaches typically either train adapters on\nindividual tasks or distill shared knowledge from source tasks, failing to\nfully exploit task-specific knowledge and the correlation between source and\ntarget tasks. To overcome these limitations, we propose PEMT, a novel\nparameter-efficient fine-tuning framework based on multi-task transfer\nlearning. PEMT extends the mixture-of-experts (MoE) framework to capture the\ntransferable knowledge as a weighted combination of adapters trained on source\ntasks. These weights are determined by a gated unit, measuring the correlation\nbetween the target and each source task using task description prompt vectors.\nTo fully exploit the task-specific knowledge, we also propose the Task Sparsity\nLoss to improve the sparsity of the gated unit. We conduct experiments on a\nbroad range of tasks over 17 datasets. The experimental results demonstrate our\nPEMT yields stable improvements over full fine-tuning, and state-of-the-art\nPEFT and knowledge transferring methods on various tasks. The results highlight\nthe effectiveness of our method which is capable of sufficiently exploiting the\nknowledge and correlation features across multiple tasks.\n","authors":["Zhisheng Lin","Han Fu","Chenghao Liu","Zhuo Li","Jianling Sun"],"pdf_url":"https://arxiv.org/pdf/2402.15082v2.pdf","comment":"Accepted to Findings of the ACL 2024"},{"id":"http://arxiv.org/abs/2406.04146v1","updated":"2024-06-06T15:11:11Z","published":"2024-06-06T15:11:11Z","title":"Towards Understanding Task-agnostic Debiasing Through the Lenses of\n  Intrinsic Bias and Forgetfulness","summary":"  While task-agnostic debiasing provides notable generalizability and reduced\nreliance on downstream data, its impact on language modeling ability and the\nrisk of relearning social biases from downstream task-specific data remain as\nthe two most significant challenges when debiasing Pretrained Language Models\n(PLMs). The impact on language modeling ability can be alleviated given a\nhigh-quality and long-contextualized debiasing corpus, but there remains a\ndeficiency in understanding the specifics of relearning biases. We empirically\nascertain that the effectiveness of task-agnostic debiasing hinges on the\nquantitative bias level of both the task-specific data used for downstream\napplications and the debiased model. We empirically show that the lower bound\nof the bias level of the downstream fine-tuned model can be approximated by the\nbias level of the debiased model, in most practical cases. To gain more\nin-depth understanding about how the parameters of PLMs change during\nfine-tuning due to the forgetting issue of PLMs, we propose a novel framework\nwhich can Propagate Socially-fair Debiasing to Downstream Fine-tuning,\nProSocialTuning. Our proposed framework can push the fine-tuned model to\napproach the bias lower bound during downstream fine-tuning, indicating that\nthe ineffectiveness of debiasing can be alleviated by overcoming the forgetting\nissue through regularizing successfully debiased attention heads based on the\nPLMs' bias levels from stages of pretraining and debiasing.\n","authors":["Guangliang Liu","Milad Afshari","Xitong Zhang","Zhiyu Xue","Avrajit Ghosh","Bidhan Bashyal","Rongrong Wang","Kristen Johnson"],"pdf_url":"https://arxiv.org/pdf/2406.04146v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.04145v1","updated":"2024-06-06T15:10:27Z","published":"2024-06-06T15:10:27Z","title":"Every Answer Matters: Evaluating Commonsense with Probabilistic Measures","summary":"  Large language models have demonstrated impressive performance on commonsense\ntasks; however, these tasks are often posed as multiple-choice questions,\nallowing models to exploit systematic biases. Commonsense is also inherently\nprobabilistic with multiple correct answers. The purpose of \"boiling water\"\ncould be making tea and cooking, but it also could be killing germs. Existing\ntasks do not capture the probabilistic nature of common sense. To this end, we\npresent commonsense frame completion (CFC), a new generative task that\nevaluates common sense via multiple open-ended generations. We also propose a\nmethod of probabilistic evaluation that strongly correlates with human\njudgments. Humans drastically outperform strong language model baselines on our\ndataset, indicating this approach is both a challenging and useful evaluation\nof machine common sense.\n","authors":["Qi Cheng","Michael Boratko","Pranay Kumar Yelugam","Tim O'Gorman","Nalini Singh","Andrew McCallum","Xiang Lorraine Li"],"pdf_url":"https://arxiv.org/pdf/2406.04145v1.pdf","comment":"ACL 2024 Camera Ready"},{"id":"http://arxiv.org/abs/2309.06054v3","updated":"2024-06-06T15:09:52Z","published":"2023-09-12T08:45:25Z","title":"Breaking through the learning plateaus of in-context learning in\n  Transformer","summary":"  In-context learning, i.e., learning from context examples, is an impressive\nability of Transformer. Training Transformers to possess this in-context\nlearning skill is computationally intensive due to the occurrence of learning\nplateaus, which are periods within the training process where there is minimal\nor no enhancement in the model's in-context learning capability. To study the\nmechanism behind the learning plateaus, we conceptually seperate a component\nwithin the model's internal representation that is exclusively affected by the\nmodel's weights. We call this the \"weights component\", and the remainder is\nidentified as the \"context component\". By conducting meticulous and controlled\nexperiments on synthetic tasks, we note that the persistence of learning\nplateaus correlates with compromised functionality of the weights component.\nRecognizing the impaired performance of the weights component as a fundamental\nbehavior drives learning plateaus, we have developed three strategies to\nexpedite the learning of Transformers. The effectiveness of these strategies is\nfurther confirmed in natural language processing tasks. In conclusion, our\nresearch demonstrates the feasibility of cultivating a powerful in-context\nlearning ability within AI systems in an eco-friendly manner.\n","authors":["Jingwen Fu","Tao Yang","Yuwang Wang","Yan Lu","Nanning Zheng"],"pdf_url":"https://arxiv.org/pdf/2309.06054v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.04143v1","updated":"2024-06-06T15:08:16Z","published":"2024-06-06T15:08:16Z","title":"Do Language Models Understand Morality? Towards a Robust Detection of\n  Moral Content","summary":"  The task of detecting moral values in text has significant implications in\nvarious fields, including natural language processing, social sciences, and\nethical decision-making. Previously proposed supervised models often suffer\nfrom overfitting, leading to hyper-specialized moral classifiers that struggle\nto perform well on data from different domains. To address this issue, we\nintroduce novel systems that leverage abstract concepts and common-sense\nknowledge acquired from Large Language Models and Natural Language Inference\nmodels during previous stages of training on multiple data sources. By doing\nso, we aim to develop versatile and robust methods for detecting moral values\nin real-world scenarios. Our approach uses the GPT 3.5 model as a zero-shot\nready-made unsupervised multi-label classifier for moral values detection,\neliminating the need for explicit training on labeled data. We compare it with\na smaller NLI-based zero-shot model. The results show that the NLI approach\nachieves competitive results compared to the Davinci model. Furthermore, we\nconduct an in-depth investigation of the performance of supervised systems in\nthe context of cross-domain multi-label moral value detection. This involves\ntraining supervised models on different domains to explore their effectiveness\nin handling data from different sources and comparing their performance with\nthe unsupervised methods. Our contributions encompass a thorough analysis of\nboth supervised and unsupervised methodologies for cross-domain value\ndetection. We introduce the Davinci model as a state-of-the-art zero-shot\nunsupervised moral values classifier, pushing the boundaries of moral value\ndetection without the need for explicit training on labeled data. Additionally,\nwe perform a comparative evaluation of our approach with the supervised models,\nshedding light on their respective strengths and weaknesses.\n","authors":["Luana Bulla","Aldo Gangemi","Misael Mongiovì"],"pdf_url":"https://arxiv.org/pdf/2406.04143v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.04136v1","updated":"2024-06-06T14:57:48Z","published":"2024-06-06T14:57:48Z","title":"Legal Judgment Reimagined: PredEx and the Rise of Intelligent AI\n  Interpretation in Indian Courts","summary":"  In the era of Large Language Models (LLMs), predicting judicial outcomes\nposes significant challenges due to the complexity of legal proceedings and the\nscarcity of expert-annotated datasets. Addressing this, we introduce\n\\textbf{Pred}iction with \\textbf{Ex}planation (\\texttt{PredEx}), the largest\nexpert-annotated dataset for legal judgment prediction and explanation in the\nIndian context, featuring over 15,000 annotations. This groundbreaking corpus\nsignificantly enhances the training and evaluation of AI models in legal\nanalysis, with innovations including the application of instruction tuning to\nLLMs. This method has markedly improved the predictive accuracy and explanatory\ndepth of these models for legal judgments. We employed various\ntransformer-based models, tailored for both general and Indian legal contexts.\nThrough rigorous lexical, semantic, and expert assessments, our models\neffectively leverage \\texttt{PredEx} to provide precise predictions and\nmeaningful explanations, establishing it as a valuable benchmark for both the\nlegal profession and the NLP community.\n","authors":["Shubham Kumar Nigam","Anurag Sharma","Danush Khanna","Noel Shallum","Kripabandhu Ghosh","Arnab Bhattacharya"],"pdf_url":"https://arxiv.org/pdf/2406.04136v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.10890v2","updated":"2024-06-06T14:55:40Z","published":"2024-02-16T18:45:58Z","title":"When is Tree Search Useful for LLM Planning? It Depends on the\n  Discriminator","summary":"  In this paper, we examine how large language models (LLMs) solve multi-step\nproblems under a language agent framework with three components: a generator, a\ndiscriminator, and a planning method. We investigate the practical utility of\ntwo advanced planning methods, iterative correction and tree search. We present\na comprehensive analysis of how discrimination accuracy affects the overall\nperformance of agents when using these two methods or a simpler method,\nre-ranking. Experiments on two tasks, text-to-SQL parsing and mathematical\nreasoning, show that: (1) advanced planning methods demand discriminators with\nat least 90% accuracy to achieve significant improvements over re-ranking; (2)\ncurrent LLMs' discrimination abilities have not met the needs of advanced\nplanning methods to achieve such improvements; (3) with LLM-based\ndiscriminators, advanced planning methods may not adequately balance accuracy\nand efficiency. For example, compared to the other two methods, tree search is\nat least 10--20 times slower but leads to negligible performance gains, which\nhinders its real-world applications. Code and data are available at\nhttps://github.com/OSU-NLP-Group/llm-planning-eval.\n","authors":["Ziru Chen","Michael White","Raymond Mooney","Ali Payani","Yu Su","Huan Sun"],"pdf_url":"https://arxiv.org/pdf/2402.10890v2.pdf","comment":"ACL 2024 main"},{"id":"http://arxiv.org/abs/2406.04127v1","updated":"2024-06-06T14:49:06Z","published":"2024-06-06T14:49:06Z","title":"Are We Done with MMLU?","summary":"  Maybe not. We identify and analyse errors in the popular Massive Multitask\nLanguage Understanding (MMLU) benchmark. Even though MMLU is widely adopted,\nour analysis demonstrates numerous ground truth errors that obscure the true\ncapabilities of LLMs. For example, we find that 57% of the analysed questions\nin the Virology subset contain errors. To address this issue, we introduce a\ncomprehensive framework for identifying dataset errors using a novel error\ntaxonomy. Then, we create MMLU-Redux, which is a subset of 3,000 manually\nre-annotated questions across 30 MMLU subjects. Using MMLU-Redux, we\ndemonstrate significant discrepancies with the model performance metrics that\nwere originally reported. Our results strongly advocate for revising MMLU's\nerror-ridden questions to enhance its future utility and reliability as a\nbenchmark. Therefore, we open up MMLU-Redux for additional annotation\nhttps://huggingface.co/datasets/edinburgh-dawg/mmlu-redux.\n","authors":["Aryo Pradipta Gema","Joshua Ong Jun Leang","Giwon Hong","Alessio Devoto","Alberto Carlo Maria Mancino","Rohit Saxena","Xuanli He","Yu Zhao","Xiaotang Du","Mohammad Reza Ghasemi Madani","Claire Barale","Robert McHardy","Joshua Harris","Jean Kaddour","Emile van Krieken","Pasquale Minervini"],"pdf_url":"https://arxiv.org/pdf/2406.04127v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.07876v3","updated":"2024-06-06T14:46:44Z","published":"2023-08-15T16:41:53Z","title":"Leveraging Codebook Knowledge with NLI and ChatGPT for Zero-Shot\n  Political Relation Classification","summary":"  Is it possible accurately classify political relations within evolving event\nontologies without extensive annotations? This study investigates zero-shot\nlearning methods that use expert knowledge from existing annotation codebook,\nand evaluates the performance of advanced ChatGPT (GPT-3.5/4) and a natural\nlanguage inference (NLI)-based model called ZSP. ChatGPT uses codebook's\nlabeled summaries as prompts, whereas ZSP breaks down the classification task\ninto context, event mode, and class disambiguation to refine task-specific\nhypotheses. This decomposition enhances interpretability, efficiency, and\nadaptability to schema changes. The experiments reveal ChatGPT's strengths and\nlimitations, and crucially show ZSP's outperformance of dictionary-based\nmethods and its competitive edge over some supervised models. These findings\naffirm the value of ZSP for validating event records and advancing ontology\ndevelopment. Our study underscores the efficacy of leveraging transfer learning\nand existing domain expertise to enhance research efficiency and scalability.\n","authors":["Yibo Hu","Erick Skorupa Parolin","Latifur Khan","Patrick T. Brandt","Javier Osorio","Vito J. D'Orazio"],"pdf_url":"https://arxiv.org/pdf/2308.07876v3.pdf","comment":"ACL 2024"},{"id":"http://arxiv.org/abs/2403.02271v2","updated":"2024-06-06T14:43:30Z","published":"2024-03-04T17:58:09Z","title":"RIFF: Learning to Rephrase Inputs for Few-shot Fine-tuning of Language\n  Models","summary":"  Pre-trained Language Models (PLMs) can be accurately fine-tuned for\ndownstream text processing tasks. Recently, researchers have introduced several\nparameter-efficient fine-tuning methods that optimize input prompts or adjust a\nsmall number of model parameters (e.g LoRA). In this study, we explore the\nimpact of altering the input text of the original task in conjunction with\nparameter-efficient fine-tuning methods. To most effectively rewrite the input\ntext, we train a few-shot paraphrase model with a Maximum-Marginal Likelihood\nobjective. Using six few-shot text classification datasets, we show that\nenriching data with paraphrases at train and test time enhances the performance\nbeyond what can be achieved with parameter-efficient fine-tuning alone. The\ncode used for our experiments can be found at\nhttps://github.com/SaeedNajafi/RIFF.\n","authors":["Saeed Najafi","Alona Fyshe"],"pdf_url":"https://arxiv.org/pdf/2403.02271v2.pdf","comment":"Final Version (Findings of ACL2024)"},{"id":"http://arxiv.org/abs/2402.07483v2","updated":"2024-06-06T14:42:47Z","published":"2024-02-12T08:45:08Z","title":"T-RAG: Lessons from the LLM Trenches","summary":"  Large Language Models (LLM) have shown remarkable language capabilities\nfueling attempts to integrate them into applications across a wide range of\ndomains. An important application area is question answering over private\nenterprise documents where the main considerations are data security, which\nnecessitates applications that can be deployed on-prem, limited computational\nresources and the need for a robust application that correctly responds to\nqueries. Retrieval-Augmented Generation (RAG) has emerged as the most prominent\nframework for building LLM-based applications. While building a RAG is\nrelatively straightforward, making it robust and a reliable application\nrequires extensive customization and relatively deep knowledge of the\napplication domain. We share our experiences building and deploying an LLM\napplication for question answering over private organizational documents. Our\napplication combines the use of RAG with a finetuned open-source LLM.\nAdditionally, our system, which we call Tree-RAG (T-RAG), uses a tree structure\nto represent entity hierarchies within the organization. This is used to\ngenerate a textual description to augment the context when responding to user\nqueries pertaining to entities within the organization's hierarchy. Our\nevaluations, including a Needle in a Haystack test, show that this combination\nperforms better than a simple RAG or finetuning implementation. Finally, we\nshare some lessons learned based on our experiences building an LLM application\nfor real-world use.\n","authors":["Masoomali Fatehkia","Ji Kim Lucas","Sanjay Chawla"],"pdf_url":"https://arxiv.org/pdf/2402.07483v2.pdf","comment":"Added Needle in a Haystack analysis for T-RAG"},{"id":"http://arxiv.org/abs/2406.04116v1","updated":"2024-06-06T14:36:07Z","published":"2024-06-06T14:36:07Z","title":"Promoting Fairness and Diversity in Speech Datasets for Mental Health\n  and Neurological Disorders Research","summary":"  Current research in machine learning and artificial intelligence is largely\ncentered on modeling and performance evaluation, less so on data collection.\nHowever, recent research demonstrated that limitations and biases in data may\nnegatively impact trustworthiness and reliability. These aspects are\nparticularly impactful on sensitive domains such as mental health and\nneurological disorders, where speech data are used to develop AI applications\naimed at improving the health of patients and supporting healthcare providers.\nIn this paper, we chart the landscape of available speech datasets for this\ndomain, to highlight possible pitfalls and opportunities for improvement and\npromote fairness and diversity. We present a comprehensive list of desiderata\nfor building speech datasets for mental health and neurological disorders and\ndistill it into a checklist focused on ethical concerns to foster more\nresponsible research.\n","authors":["Eleonora Mancini","Ana Tanevska","Andrea Galassi","Alessio Galatolo","Federico Ruggeri","Paolo Torroni"],"pdf_url":"https://arxiv.org/pdf/2406.04116v1.pdf","comment":"34 pages"},{"id":"http://arxiv.org/abs/2406.04113v1","updated":"2024-06-06T14:30:59Z","published":"2024-06-06T14:30:59Z","title":"Uncovering Limitations of Large Language Models in Information Seeking\n  from Tables","summary":"  Tables are recognized for their high information density and widespread\nusage, serving as essential sources of information. Seeking information from\ntables (TIS) is a crucial capability for Large Language Models (LLMs), serving\nas the foundation of knowledge-based Q&A systems. However, this field presently\nsuffers from an absence of thorough and reliable evaluation. This paper\nintroduces a more reliable benchmark for Table Information Seeking (TabIS). To\navoid the unreliable evaluation caused by text similarity-based metrics, TabIS\nadopts a single-choice question format (with two options per question) instead\nof a text generation format. We establish an effective pipeline for generating\noptions, ensuring their difficulty and quality. Experiments conducted on 12\nLLMs reveal that while the performance of GPT-4-turbo is marginally\nsatisfactory, both other proprietary and open-source models perform\ninadequately. Further analysis shows that LLMs exhibit a poor understanding of\ntable structures, and struggle to balance between TIS performance and\nrobustness against pseudo-relevant tables (common in retrieval-augmented\nsystems). These findings uncover the limitations and potential challenges of\nLLMs in seeking information from tables. We release our data and code to\nfacilitate further research in this field.\n","authors":["Chaoxu Pang","Yixuan Cao","Chunhao Yang","Ping Luo"],"pdf_url":"https://arxiv.org/pdf/2406.04113v1.pdf","comment":"Findings of ACL 2024"},{"id":"http://arxiv.org/abs/2406.04109v1","updated":"2024-06-06T14:26:35Z","published":"2024-06-06T14:26:35Z","title":"Intention and Face in Dialog","summary":"  The notion of face described by Brown and Levinson (1987) has been studied in\ngreat detail, but a critical aspect of the framework, that which focuses on how\nintentions mediate the planning of turns which impose upon face, has received\nfar less attention. We present an analysis of three computational systems\ntrained for classifying both intention and politeness, focusing on how the\nformer influences the latter. In politeness theory, agents attend to the desire\nto have their wants appreciated (positive face), and a complementary desire to\nact unimpeded and maintain freedom (negative face). Similar to speech acts,\nutterances can perform so-called face acts which can either raise or threaten\nthe positive or negative face of the speaker or hearer. We begin by using an\nexisting corpus to train a model which classifies face acts, achieving a new\nSoTA in the process. We then observe that every face act has an underlying\nintention that motivates it and perform additional experiments integrating\ndialog act annotations to provide these intentions by proxy. Our analysis finds\nthat dialog acts improve performance on face act detection for minority classes\nand points to a close relationship between aspects of face and intent.\n","authors":["Adil Soubki","Owen Rambow"],"pdf_url":"https://arxiv.org/pdf/2406.04109v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.04106v1","updated":"2024-06-06T14:23:10Z","published":"2024-06-06T14:23:10Z","title":"Explainability and Hate Speech: Structured Explanations Make Social\n  Media Moderators Faster","summary":"  Content moderators play a key role in keeping the conversation on social\nmedia healthy. While the high volume of content they need to judge represents a\nbottleneck to the moderation pipeline, no studies have explored how models\ncould support them to make faster decisions. There is, by now, a vast body of\nresearch into detecting hate speech, sometimes explicitly motivated by a desire\nto help improve content moderation, but published research using real content\nmoderators is scarce. In this work we investigate the effect of explanations on\nthe speed of real-world moderators. Our experiments show that while generic\nexplanations do not affect their speed and are often ignored, structured\nexplanations lower moderators' decision making time by 7.4%.\n","authors":["Agostina Calabrese","Leonardo Neves","Neil Shah","Maarten W. Bos","Björn Ross","Mirella Lapata","Francesco Barbieri"],"pdf_url":"https://arxiv.org/pdf/2406.04106v1.pdf","comment":"11 pages, 14 figures, to be published at ACL 2024"},{"id":"http://arxiv.org/abs/2402.14328v2","updated":"2024-06-06T14:06:41Z","published":"2024-02-22T06:47:56Z","title":"Understanding and Patching Compositional Reasoning in LLMs","summary":"  LLMs have marked a revolutonary shift, yet they falter when faced with\ncompositional reasoning tasks. Our research embarks on a quest to uncover the\nroot causes of compositional reasoning failures of LLMs, uncovering that most\nof them stem from the improperly generated or leveraged implicit reasoning\nresults. Inspired by our empirical findings, we resort to Logit Lens and an\nintervention experiment to dissect the inner hidden states of LLMs. This deep\ndive reveals that implicit reasoning results indeed surface within middle\nlayers and play a causative role in shaping the final explicit reasoning\nresults. Our exploration further locates multi-head self-attention (MHSA)\nmodules within these layers, which emerge as the linchpins in accurate\ngeneration and leveraing of implicit reasoning results. Grounded on the above\nfindings, we develop CREME, a lightweight method to patch errors in\ncompositional reasoning via editing the located MHSA modules. Our empirical\nevidence stands testament to CREME's effectiveness, paving the way for\nautonomously and continuously enhancing compositional reasoning capabilities in\nlanguage models.\n","authors":["Zhaoyi Li","Gangwei Jiang","Hong Xie","Linqi Song","Defu Lian","Ying Wei"],"pdf_url":"https://arxiv.org/pdf/2402.14328v2.pdf","comment":"Accepted by ACL'2024 Findings"},{"id":"http://arxiv.org/abs/2406.02265v2","updated":"2024-06-06T13:59:03Z","published":"2024-06-04T12:41:54Z","title":"Understanding Retrieval Robustness for Retrieval-Augmented Image\n  Captioning","summary":"  Recent advances in retrieval-augmented models for image captioning highlight\nthe benefit of retrieving related captions for efficient, lightweight models\nwith strong domain-transfer capabilities. While these models demonstrate the\nsuccess of retrieval augmentation, retrieval models are still far from perfect\nin practice: the retrieved information can sometimes mislead the model,\nresulting in incorrect generation and worse performance. In this paper, we\nanalyze the robustness of a retrieval-augmented captioning model SmallCap. Our\nanalysis shows that the model is sensitive to tokens that appear in the\nmajority of the retrieved captions, and the input attribution shows that those\ntokens are likely copied into the generated output. Given these findings, we\npropose to train the model by sampling retrieved captions from more diverse\nsets. This decreases the chance that the model learns to copy majority tokens,\nand improves both in-domain and cross-domain performance.\n","authors":["Wenyan Li","Jiaang Li","Rita Ramos","Raphael Tang","Desmond Elliott"],"pdf_url":"https://arxiv.org/pdf/2406.02265v2.pdf","comment":"9 pages, long paper at ACL 2024"},{"id":"http://arxiv.org/abs/2403.18680v2","updated":"2024-06-06T13:58:20Z","published":"2024-03-27T15:22:16Z","title":"Non-Linear Inference Time Intervention: Improving LLM Truthfulness","summary":"  In this work, we explore LLM's internal representation space to identify\nattention heads that contain the most truthful and accurate information. We\nfurther developed the Inference Time Intervention (ITI) framework, which lets\nbias LLM without the need for fine-tuning. The improvement manifests in\nintroducing a non-linear multi-token probing and multi-token intervention:\nNon-Linear ITI (NL-ITI), which significantly enhances performance on evaluation\nbenchmarks. NL-ITI is tested on diverse multiple-choice datasets, including\nTruthfulQA, on which we report over 16% relative MC1 (accuracy of model\npointing to the correct answer) improvement with respect to the baseline ITI\nresults. Moreover, we achieved a 10% relative improvement over the recently\nreleased Truth Forest (TrFf) method that also focused on ITI improvement.\n","authors":["Jakub Hoscilowicz","Adam Wiacek","Jan Chojnacki","Adam Cieslak","Leszek Michon","Vitalii Urbanevych","Artur Janicki"],"pdf_url":"https://arxiv.org/pdf/2403.18680v2.pdf","comment":"Accepted on Interspeech 2024 Conference. Code is available at\n  https://github.com/Samsung/NL-ITI"},{"id":"http://arxiv.org/abs/2402.11517v3","updated":"2024-06-06T13:56:59Z","published":"2024-02-18T09:10:04Z","title":"Knowledge-to-SQL: Enhancing SQL Generation with Data Expert LLM","summary":"  Generating accurate SQL queries for user questions (text-to-SQL) has been a\nlong-standing challenge since it requires a deep understanding of both the\nuser's question and the corresponding database schema in order to retrieve the\ndesired content accurately. Existing methods rely on the comprehensive\ncapability of large language models (LLMs) to generate the SQL. However, some\nnecessary knowledge is not explicitly included in the database schema and user\nquestion or has been learned by LLMs. Thus, the generated SQL of the\nknowledge-insufficient questions may be inaccurate, negatively influencing the\ntext-to-SQL models' performance and robustness. To address this challenge, we\npropose the Knowledge-to-SQL framework, which employs tailored Data Expert LLM\n(DELLM) to provide helpful knowledge for all text-to-SQL models. Specifically,\nwe introduce the detailed implementation of DELLM regarding table reading and\nthe basic fine-tuning process. We further propose a Preference Learning via\nDatabase Feedback (PLDBF) strategy, refining the DELLM to generate more helpful\nknowledge for LLMs. Extensive experiments verify that DELLM can enhance the\nstate-of-the-art approaches for text-to-SQL tasks. The corresponding code of\nDELLM is released for further research.\n","authors":["Zijin Hong","Zheng Yuan","Hao Chen","Qinggang Zhang","Feiran Huang","Xiao Huang"],"pdf_url":"https://arxiv.org/pdf/2402.11517v3.pdf","comment":"Accepted to ACL2024 Findings"},{"id":"http://arxiv.org/abs/2312.10104v3","updated":"2024-06-06T13:54:23Z","published":"2023-12-15T03:11:03Z","title":"Lever LM: Configuring In-Context Sequence to Lever Large Vision Language\n  Models","summary":"  As Archimedes famously said, ``Give me a lever long enough and a fulcrum on\nwhich to place it, and I shall move the world'', in this study, we propose to\nuse a tiny Language Model (LM), \\eg, a Transformer with 67M parameters, to\nlever much larger Vision-Language Models (LVLMs) with 9B parameters.\nSpecifically, we use this tiny \\textbf{Lever-LM} to configure effective\nin-context demonstration (ICD) sequences to improve the In-Context Learinng\n(ICL) performance of LVLMs. Previous studies show that diverse ICD\nconfigurations like the selection and ordering of the demonstrations heavily\naffect the ICL performance, highlighting the significance of configuring\neffective ICD sequences. Motivated by this and by re-considering the the\nprocess of configuring ICD sequence, we find this is a mirror process of human\nsentence composition and further assume that effective ICD configurations may\ncontain internal statistical patterns that can be captured by Lever-LM. Then a\ndataset with effective ICD sequences is constructed to train Lever-LM. After\ntraining, given novel queries, new ICD sequences are configured by the trained\nLever-LM to solve vision-language tasks through ICL. Experiments show that\nthese ICD sequences can improve the ICL performance of two LVLMs compared with\nsome strong baselines in Visual Question Answering and Image Captioning,\nvalidating that Lever-LM can really capture the statistical patterns for\nlevering LVLMs.\n","authors":["Xu Yang","Yingzhe Peng","Haoxuan Ma","Shuo Xu","Chi Zhang","Yucheng Han","Hanwang Zhang"],"pdf_url":"https://arxiv.org/pdf/2312.10104v3.pdf","comment":"17 pages, 6 figures"},{"id":"http://arxiv.org/abs/2402.18334v2","updated":"2024-06-06T13:50:26Z","published":"2024-02-28T13:54:57Z","title":"Learning to Generate Instruction Tuning Datasets for Zero-Shot Task\n  Adaptation","summary":"  We introduce Bonito, an open-source model for conditional task generation\nthat converts unannotated text into task-specific training datasets for\ninstruction tuning. We aim to enable zero-shot task adaptation of large\nlanguage models on users' specialized, private data. We train Bonito by\nfine-tuning a pretrained large language model on a new large-scale dataset with\n1.65M examples created by remixing existing instruction tuning datasets into\nmeta-templates. The meta-templates for a dataset produce training examples\nwhere the input is the unannotated text and the task attribute and the output\nconsists of the instruction and the response. We use Bonito to generate\nsynthetic tasks for seven datasets from specialized domains with unannotated\ntext across three task types -- yes-no question answering, extractive question\nanswering, and natural language inference -- and adapt language models. We show\nthat Bonito significantly improves the average performance of pretrained and\ninstruction tuned models over the de facto self supervised baseline. For\nexample, adapting Mistral-Instruct-v2 and instruction tuned variants of Mistral\nand Llama2 with Bonito improves the strong zero-shot performance by 22.1 F1\npoints whereas the next word prediction objective undoes some of the benefits\nof instruction tuning and reduces the average performance by 0.8 F1 points. We\nconduct additional experiments with Bonito to understand the effects of the\ndomain, the size of the training set, and the choice of alternative synthetic\ntask generators. Overall, we show that learning with synthetic instruction\ntuning datasets is an effective way to adapt language models to new domains.\nThe model, dataset, and code are available at\nhttps://github.com/BatsResearch/bonito.\n","authors":["Nihal V. Nayak","Yiyang Nan","Avi Trost","Stephen H. Bach"],"pdf_url":"https://arxiv.org/pdf/2402.18334v2.pdf","comment":"ACL Findings 2024"},{"id":"http://arxiv.org/abs/2404.15004v2","updated":"2024-06-06T13:46:36Z","published":"2024-04-23T13:09:11Z","title":"TAXI: Evaluating Categorical Knowledge Editing for Language Models","summary":"  Humans rarely learn one fact in isolation. Instead, learning a new fact\ninduces knowledge of other facts about the world. For example, in learning a\nkorat is a type of cat, you also infer it is a mammal and has claws, ensuring\nyour model of the world is consistent. Knowledge editing aims to inject new\nfacts into language models to improve their factuality, but current benchmarks\nfail to evaluate consistency, which is critical to ensure efficient, accurate,\nand generalizable edits. We manually create TAXI, a new benchmark dataset\nspecifically created to evaluate consistency in categorical knowledge edits.\nTAXI contains 11,120 multiple-choice queries for 976 edits spanning 41\ncategories (e.g., Dogs), 164 subjects (e.g., Labrador), and 183 properties\n(e.g., is a mammal). We then use TAXI to evaluate popular editors' categorical\nconsistency, measuring how often editing a subject's category appropriately\nedits its properties. We find that 1) the editors achieve marginal, yet\nnon-random consistency, 2) their consistency far underperforms human baselines,\nand 3) consistency is more achievable when editing atypical subjects Our code\nand data are available at https://github.com/derekpowell/taxi.\n","authors":["Derek Powell","Walter Gerych","Thomas Hartvigsen"],"pdf_url":"https://arxiv.org/pdf/2404.15004v2.pdf","comment":"Accepted to ACL 2024 (Findings)"},{"id":"http://arxiv.org/abs/2401.18046v2","updated":"2024-06-06T13:40:47Z","published":"2024-01-31T18:07:12Z","title":"Multipath parsing in the brain","summary":"  Humans understand sentences word-by-word, in the order that they hear them.\nThis incrementality entails resolving temporary ambiguities about syntactic\nrelationships. We investigate how humans process these syntactic ambiguities by\ncorrelating predictions from incremental generative dependency parsers with\ntimecourse data from people undergoing functional neuroimaging while listening\nto an audiobook. In particular, we compare competing hypotheses regarding the\nnumber of developing syntactic analyses in play during word-by-word\ncomprehension: one vs more than one. This comparison involves evaluating\nsyntactic surprisal from a state-of-the-art dependency parser with LLM-adapted\nencodings against an existing fMRI dataset. In both English and Chinese data,\nwe find evidence for multipath parsing. Brain regions associated with this\nmultipath effect include bilateral superior temporal gyrus.\n","authors":["Berta Franzluebbers","Donald Dunagan","Miloš Stanojević","Jan Buys","John T. Hale"],"pdf_url":"https://arxiv.org/pdf/2401.18046v2.pdf","comment":"Accepted at ACL2024, main conference. 15 pages"},{"id":"http://arxiv.org/abs/2405.10517v2","updated":"2024-06-06T13:40:00Z","published":"2024-05-17T03:52:01Z","title":"Towards Better Question Generation in QA-based Event Extraction","summary":"  Event Extraction (EE) is an essential information extraction task that aims\nto extract event-related information from unstructured texts. The paradigm of\nthis task has shifted from conventional classification-based methods to more\ncontemporary question-answering-based (QA-based) approaches. However, in\nQA-based EE, the quality of the questions dramatically affects the extraction\naccuracy, and how to generate high-quality questions for QA-based EE remains a\nchallenge. In this work, to tackle this challenge, we suggest four criteria to\nevaluate the quality of a question and propose a reinforcement learning method,\nRLQG, for QA-based EE that can generate generalizable, high-quality, and\ncontext-dependent questions and provides clear guidance to QA models. The\nextensive experiments conducted on ACE and RAMS datasets have strongly\nvalidated our approach's effectiveness, which also demonstrates its robustness\nin scenarios with limited training data. The corresponding code of RLQG is\nreleased for further research.\n","authors":["Zijin Hong","Jian Liu"],"pdf_url":"https://arxiv.org/pdf/2405.10517v2.pdf","comment":"Accepted to ACL2024 Findings"},{"id":"http://arxiv.org/abs/2406.00083v2","updated":"2024-06-06T13:38:42Z","published":"2024-06-03T02:25:33Z","title":"BadRAG: Identifying Vulnerabilities in Retrieval Augmented Generation of\n  Large Language Models","summary":"  Large Language Models (LLMs) are constrained by outdated information and a\ntendency to generate incorrect data, commonly referred to as \"hallucinations.\"\nRetrieval-Augmented Generation (RAG) addresses these limitations by combining\nthe strengths of retrieval-based methods and generative models. This approach\ninvolves retrieving relevant information from a large, up-to-date dataset and\nusing it to enhance the generation process, leading to more accurate and\ncontextually appropriate responses. Despite its benefits, RAG introduces a new\nattack surface for LLMs, particularly because RAG databases are often sourced\nfrom public data, such as the web. In this paper, we propose \\TrojRAG{} to\nidentify the vulnerabilities and attacks on retrieval parts (RAG database) and\ntheir indirect attacks on generative parts (LLMs). Specifically, we identify\nthat poisoning several customized content passages could achieve a retrieval\nbackdoor, where the retrieval works well for clean queries but always returns\ncustomized poisoned adversarial queries. Triggers and poisoned passages can be\nhighly customized to implement various attacks. For example, a trigger could be\na semantic group like \"The Republican Party, Donald Trump, etc.\" Adversarial\npassages can be tailored to different contents, not only linked to the triggers\nbut also used to indirectly attack generative LLMs without modifying them.\nThese attacks can include denial-of-service attacks on RAG and semantic\nsteering attacks on LLM generations conditioned by the triggers. Our\nexperiments demonstrate that by just poisoning 10 adversarial passages can\ninduce 98.2\\% success rate to retrieve the adversarial passages. Then, these\npassages can increase the reject ratio of RAG-based GPT-4 from 0.01\\% to 74.6\\%\nor increase the rate of negative responses from 0.22\\% to 72\\% for targeted\nqueries.\n","authors":["Jiaqi Xue","Mengxin Zheng","Yebowen Hu","Fei Liu","Xun Chen","Qian Lou"],"pdf_url":"https://arxiv.org/pdf/2406.00083v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.16775v2","updated":"2024-06-06T13:38:26Z","published":"2024-02-26T17:45:36Z","title":"A Comprehensive Evaluation of Quantization Strategies for Large Language\n  Models","summary":"  Increasing the number of parameters in large language models (LLMs) usually\nimproves performance in downstream tasks but raises compute and memory costs,\nmaking deployment difficult in resource-limited settings. Quantization\ntechniques, which reduce the bits needed for model weights or activations with\nminimal performance loss, have become popular due to the rise of LLMs. However,\nmost quantization studies use pre-trained LLMs, and the impact of quantization\non instruction-tuned LLMs and the relationship between perplexity and benchmark\nperformance of quantized LLMs are not well understood. Evaluation of quantized\nLLMs is often limited to language modeling and a few classification tasks,\nleaving their performance on other benchmarks unclear. To address these gaps,\nwe propose a structured evaluation framework consisting of three critical\ndimensions: (1) knowledge \\& capacity, (2) alignment, and (3) efficiency, and\nconduct extensive experiments across ten diverse benchmarks. Our experimental\nresults indicate that LLMs with 4-bit quantization can retain performance\ncomparable to their non-quantized counterparts, and perplexity can serve as a\nproxy metric for quantized LLMs on most benchmarks. Furthermore, quantized LLMs\nwith larger parameter scales can outperform smaller LLMs. Despite the memory\nsavings achieved through quantization, it can also slow down the inference\nspeed of LLMs. Consequently, substantial engineering efforts and hardware\nsupport are imperative to achieve a balanced optimization of decoding speed and\nmemory consumption in the context of quantized LLMs.\n","authors":["Renren Jin","Jiangcun Du","Wuwei Huang","Wei Liu","Jian Luan","Bin Wang","Deyi Xiong"],"pdf_url":"https://arxiv.org/pdf/2402.16775v2.pdf","comment":"ACL 2024 Findings"},{"id":"http://arxiv.org/abs/2402.04788v2","updated":"2024-06-06T13:38:13Z","published":"2024-02-07T12:28:32Z","title":"MLLM-as-a-Judge: Assessing Multimodal LLM-as-a-Judge with\n  Vision-Language Benchmark","summary":"  Multimodal Large Language Models (MLLMs) have gained significant attention\nrecently, showing remarkable potential in artificial general intelligence.\nHowever, assessing the utility of MLLMs presents considerable challenges,\nprimarily due to the absence of multimodal benchmarks that align with human\npreferences. Drawing inspiration from the concept of LLM-as-a-Judge within\nLLMs, this paper introduces a novel benchmark, termed MLLM-as-a-Judge, to\nassess the ability of MLLMs in assisting judges across diverse modalities,\nencompassing three distinct tasks: Scoring Evaluation, Pair Comparison, and\nBatch Ranking. Our study reveals that, while MLLMs demonstrate remarkable\nhuman-like discernment in Pair Comparison, there is a significant divergence\nfrom human preferences in Scoring Evaluation and Batch Ranking. Furthermore, a\ncloser examination reveals persistent challenges in the judgment capacities of\nLLMs, including diverse biases, hallucinatory responses, and inconsistencies in\njudgment, even in advanced models such as GPT-4V. These findings emphasize the\npressing need for enhancements and further research efforts to be undertaken\nbefore regarding MLLMs as fully reliable evaluators. In light of this, we\nadvocate for additional efforts dedicated to supporting the continuous\ndevelopment within the domain of MLLM functioning as judges. The code and\ndataset are publicly available at our project homepage:\n\\url{https://mllm-judge.github.io/}.\n","authors":["Dongping Chen","Ruoxi Chen","Shilin Zhang","Yinuo Liu","Yaochen Wang","Huichi Zhou","Qihui Zhang","Pan Zhou","Yao Wan","Lichao Sun"],"pdf_url":"https://arxiv.org/pdf/2402.04788v2.pdf","comment":"ICML 2024 (Oral)"},{"id":"http://arxiv.org/abs/2406.04064v1","updated":"2024-06-06T13:32:09Z","published":"2024-06-06T13:32:09Z","title":"Ask LLMs Directly, \"What shapes your bias?\": Measuring Social Bias in\n  Large Language Models","summary":"  Social bias is shaped by the accumulation of social perceptions towards\ntargets across various demographic identities. To fully understand such social\nbias in large language models (LLMs), it is essential to consider the composite\nof social perceptions from diverse perspectives among identities. Previous\nstudies have either evaluated biases in LLMs by indirectly assessing the\npresence of sentiments towards demographic identities in the generated text or\nmeasuring the degree of alignment with given stereotypes. These methods have\nlimitations in directly quantifying social biases at the level of distinct\nperspectives among identities. In this paper, we aim to investigate how social\nperceptions from various viewpoints contribute to the development of social\nbias in LLMs. To this end, we propose a novel strategy to intuitively quantify\nthese social perceptions and suggest metrics that can evaluate the social\nbiases within LLMs by aggregating diverse social perceptions. The experimental\nresults show the quantitative demonstration of the social attitude in LLMs by\nexamining social perception. The analysis we conducted shows that our proposed\nmetrics capture the multi-dimensional aspects of social bias, enabling a\nfine-grained and comprehensive investigation of bias in LLMs.\n","authors":["Jisu Shin","Hoyun Song","Huije Lee","Soyeong Jeong","Jong C. Park"],"pdf_url":"https://arxiv.org/pdf/2406.04064v1.pdf","comment":"Findings of ACL 2024"},{"id":"http://arxiv.org/abs/2205.10192v2","updated":"2024-06-06T13:27:20Z","published":"2022-05-20T14:10:28Z","title":"On the Trade-off between Redundancy and Local Coherence in Summarization","summary":"  Extractive summaries are usually presented as lists of sentences with no\nexpected cohesion between them and with plenty of redundant information if not\naccounted for. In this paper, we investigate the trade-offs incurred when\naiming to control for inter-sentential cohesion and redundancy in extracted\nsummaries, and their impact on their informativeness. As case study, we focus\non the summarization of long, highly redundant documents and consider two\noptimization scenarios, reward-guided and with no supervision. In the\nreward-guided scenario, we compare systems that control for redundancy and\ncohesion during sentence scoring. In the unsupervised scenario, we introduce\ntwo systems that aim to control all three properties -- informativeness,\nredundancy, and cohesion -- in a principled way. Both systems implement a\npsycholinguistic theory that simulates how humans keep track of relevant\ncontent units and how cohesion and non-redundancy constraints are applied in\nshort-term memory during reading. Extensive automatic and human evaluations\nreveal that systems optimizing for -- among other properties -- cohesion are\ncapable of better organizing content in summaries compared to systems that\noptimize only for redundancy, while maintaining comparable informativeness. We\nfind that the proposed unsupervised systems manage to extract highly cohesive\nsummaries across varying levels of document redundancy, although sacrificing\ninformativeness in the process. Finally, we lay evidence as to how simulated\ncognitive processes impact the trade-off between the analyzed summary\nproperties.\n","authors":["Ronald Cardenas","Matthias Galle","Shay B. Cohen"],"pdf_url":"https://arxiv.org/pdf/2205.10192v2.pdf","comment":"Accepted to JAIR"},{"id":"http://arxiv.org/abs/2306.09782v2","updated":"2024-06-06T13:22:26Z","published":"2023-06-16T11:37:15Z","title":"Full Parameter Fine-tuning for Large Language Models with Limited\n  Resources","summary":"  Large Language Models (LLMs) have revolutionized Natural Language Processing\n(NLP) but demand massive GPU resources for training. Lowering the threshold for\nLLMs training would encourage greater participation from researchers,\nbenefiting both academia and society. While existing approaches have focused on\nparameter-efficient fine-tuning, which tunes or adds a small number of\nparameters, few have addressed the challenge of tuning the full parameters of\nLLMs with limited resources. In this work, we propose a new optimizer,\nLOw-Memory Optimization (LOMO), which fuses the gradient computation and the\nparameter update in one step to reduce memory usage. By integrating LOMO with\nexisting memory saving techniques, we reduce memory usage to 10.8% compared to\nthe standard approach (DeepSpeed solution). Consequently, our approach enables\nthe full parameter fine-tuning of a 65B model on a single machine with 8 RTX\n3090, each with 24GB memory.Code and data are available at\nhttps://github.com/OpenLMLab/LOMO.\n","authors":["Kai Lv","Yuqing Yang","Tengxiao Liu","Qinghui Gao","Qipeng Guo","Xipeng Qiu"],"pdf_url":"https://arxiv.org/pdf/2306.09782v2.pdf","comment":"ACL 2024"},{"id":"http://arxiv.org/abs/2310.10195v3","updated":"2024-06-06T13:22:25Z","published":"2023-10-16T09:04:28Z","title":"AdaLomo: Low-memory Optimization with Adaptive Learning Rate","summary":"  Large language models have achieved remarkable success, but their extensive\nparameter size necessitates substantial memory for training, thereby setting a\nhigh threshold. While the recently proposed low-memory optimization (LOMO)\nreduces memory footprint, its optimization technique, akin to stochastic\ngradient descent, is sensitive to hyper-parameters and exhibits suboptimal\nconvergence, failing to match the performance of the prevailing optimizer for\nlarge language models, AdamW. Through empirical analysis of the Adam optimizer,\nwe found that, compared to momentum, the adaptive learning rate is more\ncritical for bridging the gap. Building on this insight, we introduce the\nlow-memory optimization with adaptive learning rate (AdaLomo), which offers an\nadaptive learning rate for each parameter. To maintain memory efficiency, we\nemploy non-negative matrix factorization for the second-order moment estimation\nin the optimizer state. Additionally, we suggest the use of a grouped update\nnormalization to stabilize convergence. Our experiments with instruction-tuning\nand further pre-training demonstrate that AdaLomo achieves results on par with\nAdamW, while significantly reducing memory requirements, thereby lowering the\nhardware barrier to training large language models. The code is accessible at\nhttps://github.com/OpenLMLab/LOMO.\n","authors":["Kai Lv","Hang Yan","Qipeng Guo","Haijun Lv","Xipeng Qiu"],"pdf_url":"https://arxiv.org/pdf/2310.10195v3.pdf","comment":"ACL 2024 camera ready version"},{"id":"http://arxiv.org/abs/2402.14008v2","updated":"2024-06-06T13:19:44Z","published":"2024-02-21T18:49:26Z","title":"OlympiadBench: A Challenging Benchmark for Promoting AGI with\n  Olympiad-Level Bilingual Multimodal Scientific Problems","summary":"  Recent advancements have seen Large Language Models (LLMs) and Large\nMultimodal Models (LMMs) surpassing general human capabilities in various\ntasks, approaching the proficiency level of human experts across multiple\ndomains. With traditional benchmarks becoming less challenging for these\nmodels, new rigorous challenges are essential to gauge their advanced\nabilities. In this work, we present OlympiadBench, an Olympiad-level bilingual\nmultimodal scientific benchmark, featuring 8,476 problems from Olympiad-level\nmathematics and physics competitions, including the Chinese college entrance\nexam. Each problem is detailed with expert-level annotations for step-by-step\nreasoning. Evaluating top-tier models on OlympiadBench, we implement a\ncomprehensive assessment methodology to accurately evaluate model responses.\nNotably, the best-performing model, GPT-4V, attains an average score of 17.97%\non OlympiadBench, with a mere 10.74% in physics, highlighting the benchmark\nrigor and the intricacy of physical reasoning. Our analysis orienting GPT-4V\npoints out prevalent issues with hallucinations, knowledge omissions, and\nlogical fallacies. We hope that our challenging benchmark can serve as a\nvaluable resource for helping future AGI research endeavors. The data and\nevaluation code are available at \\url{https://github.com/OpenBMB/OlympiadBench}\n","authors":["Chaoqun He","Renjie Luo","Yuzhuo Bai","Shengding Hu","Zhen Leng Thai","Junhao Shen","Jinyi Hu","Xu Han","Yujie Huang","Yuxiang Zhang","Jie Liu","Lei Qi","Zhiyuan Liu","Maosong Sun"],"pdf_url":"https://arxiv.org/pdf/2402.14008v2.pdf","comment":"Accepted by ACL 2024 (main), update"},{"id":"http://arxiv.org/abs/2311.09832v3","updated":"2024-06-06T13:17:43Z","published":"2023-11-16T11:58:31Z","title":"WatME: Towards Lossless Watermarking Through Lexical Redundancy","summary":"  Text watermarking has emerged as a pivotal technique for identifying\nmachine-generated text. However, existing methods often rely on arbitrary\nvocabulary partitioning during decoding to embed watermarks, which compromises\nthe availability of suitable tokens and significantly degrades the quality of\nresponses. This study assesses the impact of watermarking on different\ncapabilities of large language models (LLMs) from a cognitive science lens. Our\nfinding highlights a significant disparity; knowledge recall and logical\nreasoning are more adversely affected than language generation. These results\nsuggest a more profound effect of watermarking on LLMs than previously\nunderstood. To address these challenges, we introduce Watermarking with Mutual\nExclusion (WatME), a novel approach leveraging linguistic prior knowledge of\ninherent lexical redundancy in LLM vocabularies to seamlessly integrate\nwatermarks. Specifically, WatME dynamically optimizes token usage during the\ndecoding process by applying a mutually exclusive rule to the identified\nlexical redundancies. This strategy effectively prevents the unavailability of\nappropriate tokens and preserves the expressive power of LLMs. We provide both\ntheoretical analysis and empirical evidence showing that WatME effectively\npreserves the diverse capabilities of LLMs while ensuring watermark\ndetectability.\n","authors":["Liang Chen","Yatao Bian","Yang Deng","Deng Cai","Shuaiyi Li","Peilin Zhao","Kam-fai Wong"],"pdf_url":"https://arxiv.org/pdf/2311.09832v3.pdf","comment":"Accepted to ACL 2024 main conference"},{"id":"http://arxiv.org/abs/2402.12691v2","updated":"2024-06-06T13:16:16Z","published":"2024-02-20T03:37:24Z","title":"Tree-Planted Transformers: Unidirectional Transformer Language Models\n  with Implicit Syntactic Supervision","summary":"  Syntactic Language Models (SLMs) can be trained efficiently to reach\nrelatively high performance; however, they have trouble with inference\nefficiency due to the explicit generation of syntactic structures. In this\npaper, we propose a new method dubbed tree-planting: instead of explicitly\ngenerating syntactic structures, we \"plant\" trees into attention weights of\nunidirectional Transformer LMs to implicitly reflect syntactic structures of\nnatural language. Specifically, unidirectional Transformer LMs trained with\ntree-planting will be called Tree-Planted Transformers (TPT), which inherit the\ntraining efficiency from SLMs without changing the inference efficiency of\ntheir underlying Transformer LMs. Targeted syntactic evaluations on the\nSyntaxGym benchmark demonstrated that TPTs, despite the lack of explicit\ngeneration of syntactic structures, significantly outperformed not only vanilla\nTransformer LMs but also various SLMs that generate hundreds of syntactic\nstructures in parallel. This result suggests that TPTs can learn human-like\nsyntactic knowledge as data-efficiently as SLMs while maintaining the modeling\nspace of Transformer LMs unchanged.\n","authors":["Ryo Yoshida","Taiga Someya","Yohei Oseki"],"pdf_url":"https://arxiv.org/pdf/2402.12691v2.pdf","comment":"Accepted by ACL 2024 (Findings)"},{"id":"http://arxiv.org/abs/2402.11597v2","updated":"2024-06-06T12:55:17Z","published":"2024-02-18T14:25:19Z","title":"Multi-Task Inference: Can Large Language Models Follow Multiple\n  Instructions at Once?","summary":"  Large language models (LLMs) are typically prompted to follow a single\ninstruction per inference call. In this work, we analyze whether LLMs also hold\nthe capability to handle multiple instructions simultaneously, denoted as\nMulti-Task Inference. For this purpose, we introduce the MTI Bench(Multi-Task\nInference Benchmark), a comprehensive evaluation benchmark encompassing 5,000\ninstances across 25 tasks. Each task in the MTI Bench involves 2 to 3\nsub-tasks. As expected, we first demonstrate that Multi-Task Inference reduces\nthe total inference time by 1.46 times in average since it does not require\nmultiple inference calls. Interestingly, contrary to the expectation that LLMs\nwould perform better when tasks are divided, we find that state-of-the-art\nLLMs, such as Llama-2-Chat-70B and GPT-4, show up to 7.3% and 12.4% improved\nperformance with Multi-Task Inference compared to Single-Task Inference on the\nMTI Bench. We release the MTI Bench dataset and our code at this link\nhttps://github.com/guijinSON/MTI-Bench.\n","authors":["Guijin Son","Sangwon Baek","Sangdae Nam","Ilgyun Jeong","Seungone Kim"],"pdf_url":"https://arxiv.org/pdf/2402.11597v2.pdf","comment":"acl 2024 (main)"},{"id":"http://arxiv.org/abs/2402.12343v4","updated":"2024-06-06T12:54:48Z","published":"2024-02-19T18:16:51Z","title":"Emulated Disalignment: Safety Alignment for Large Language Models May\n  Backfire!","summary":"  Large language models (LLMs) undergo safety alignment to ensure safe\nconversations with humans. However, this paper introduces a training-free\nattack method capable of reversing safety alignment, converting the outcomes of\nstronger alignment into greater potential for harm by accessing only LLM output\ntoken distributions. Specifically, our method achieves this reversal by\ncontrasting the output token distribution of a safety-aligned language model\n(e.g., Llama-2-chat) against its pre-trained version (e.g., Llama-2), so that\nthe token predictions are shifted towards the opposite direction of safety\nalignment. We name this method emulated disalignment (ED) because sampling from\nthis contrastive distribution provably emulates the result of fine-tuning to\nminimize a safety reward. Our experiments with ED across three evaluation\ndatasets and four model families (Llama-1, Llama-2, Mistral, and Alpaca) show\nthat ED doubles the harmfulness of pre-trained models and outperforms strong\nbaselines, achieving the highest harmful rates in 43 out of 48 evaluation\nsubsets by a large margin. Eventually, given ED's reliance on language model\noutput token distributions, which particularly compromises open-source models,\nour findings highlight the need to reassess the open accessibility of language\nmodels, even if they have been safety-aligned. Code is available at\nhttps://github.com/ZHZisZZ/emulated-disalignment.\n","authors":["Zhanhui Zhou","Jie Liu","Zhichen Dong","Jiaheng Liu","Chao Yang","Wanli Ouyang","Yu Qiao"],"pdf_url":"https://arxiv.org/pdf/2402.12343v4.pdf","comment":"ACL 2024"},{"id":"http://arxiv.org/abs/2406.04025v1","updated":"2024-06-06T12:51:14Z","published":"2024-06-06T12:51:14Z","title":"The syntax-semantics interface in a child's path: A study of 3- to\n  11-year-olds' elicited production of Mandarin recursive relative clauses","summary":"  There have been apparently conflicting claims over the syntax-semantics\nrelationship in child acquisition. However, few of them have assessed the\nchild's path toward the acquisition of recursive relative clauses (RRCs). The\nauthors of the current paper did experiments to investigate 3- to 11-year-olds'\nmost-structured elicited production of eight Mandarin RRCs in a 4 (syntactic\ntypes)*2 (semantic conditions) design. The four syntactic types were RRCs with\na subject-gapped RC embedded in an object-gapped RC (SORRCs), RRCs with an\nobject-gapped RC embedded in another object-gapped RC (OORRCs), RRCs with an\nobject-gapped RC embedded in a subject-gapped RC (OSRRCs), and RRCs with a\nsubject-gapped RC embedded in another subject-gapped RC (SSRRCs). Each\nsyntactic type was put in two conditions differing in internal semantics:\nirreversible internal semantics (IIS) and reversible internal semantics (RIS).\nFor example, \"the balloon that [the girl that _ eats the banana] holds _\" is\nSORRCs in the IIS condition; \"the monkey that [the dog that _ bites the pig]\nhits_\" is SORRCs in the RIS condition. For each target, the participants were\nprovided with a speech-visual stimulus constructing a condition of irreversible\nexternal semantics (IES). The results showed that SSRRCs, OSRRCs and SORRCs in\nthe IIS-IES condition were produced two years earlier than their counterparts\nin the RIS-IES condition. Thus, a 2-stage development path is proposed: the\nlanguage acquisition device starts with the interface between (irreversible)\nsyntax and IIS, and ends with the interface between syntax and IES, both\nabiding by the syntax-semantic interface principle.\n","authors":["Caimei Yang","Qihang Yang","Xingzhi Su","Chenxi Fu","Xiaoyi Wang","Ying Yan","Zaijiang Man"],"pdf_url":"https://arxiv.org/pdf/2406.04025v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.11894v3","updated":"2024-06-06T12:49:44Z","published":"2024-02-19T07:15:59Z","title":"Automating Dataset Updates Towards Reliable and Timely Evaluation of\n  Large Language Models","summary":"  Large language models (LLMs) have achieved impressive performance across\nvarious natural language benchmarks, prompting a continual need to curate more\ndifficult datasets for larger LLMs, which is costly and time-consuming. In this\npaper, we propose to automate dataset updating and provide systematic analysis\nregarding its effectiveness in dealing with benchmark leakage issue, difficulty\ncontrol, and stability. Thus, once the current benchmark has been mastered or\nleaked, we can update it for timely and reliable evaluation. There are two\nupdating strategies: 1) mimicking strategy to generate similar samples based on\noriginal data, preserving stylistic and contextual essence, and 2) extending\nstrategy that further expands existing samples at varying cognitive levels by\nadapting Bloom's taxonomy of educational objectives. Extensive experiments on\nupdated MMLU and BIG-Bench demonstrate the stability of the proposed strategies\nand find that the mimicking strategy can effectively alleviate issues of\noverestimation from benchmark leakage. In cases where the efficient mimicking\nstrategy fails, our extending strategy still shows promising results.\nAdditionally, by controlling the difficulty, we can better discern the models'\nperformance and enable fine-grained analysis neither too difficult nor too easy\nan exam can fairly judge students' learning status. To the best of our\nknowledge, we are the first to automate updating benchmarks for reliable and\ntimely evaluation. Our demo leaderboard can be found at\nhttps://yingjiahao14.github.io/Automating-DatasetUpdates/.\n","authors":["Jiahao Ying","Yixin Cao","Yushi Bai","Qianru Sun","Bo Wang","Wei Tang","Zhaojun Ding","Yizhe Yang","Xuanjing Huang","Shuicheng Yan"],"pdf_url":"https://arxiv.org/pdf/2402.11894v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.02886v2","updated":"2024-06-06T12:47:31Z","published":"2024-06-05T03:08:25Z","title":"PLaD: Preference-based Large Language Model Distillation with\n  Pseudo-Preference Pairs","summary":"  Large Language Models (LLMs) have exhibited impressive capabilities in\nvarious tasks, yet their vast parameter sizes restrict their applicability in\nresource-constrained settings. Knowledge distillation (KD) offers a viable\nsolution by transferring expertise from large teacher models to compact student\nmodels. However, traditional KD techniques face specific challenges when\napplied to LLMs, including restricted access to LLM outputs, significant\nteacher-student capacity gaps, and the inherited mis-calibration issue. In this\nwork, we present PLaD, a novel preference-based LLM distillation framework.\nPLaD exploits the teacher-student capacity discrepancy to generate\npseudo-preference pairs where teacher outputs are preferred over student\noutputs. Then, PLaD leverages a ranking loss to re-calibrate student's\nestimation of sequence likelihood, which steers the student's focus towards\nunderstanding the relative quality of outputs instead of simply imitating the\nteacher. PLaD bypasses the need for access to teacher LLM's internal states,\ntackles the student's expressivity limitations, and mitigates the student\nmis-calibration issue. Through extensive experiments on two sequence generation\ntasks and with various LLMs, we demonstrate the effectiveness of our proposed\nPLaD framework.\n","authors":["Rongzhi Zhang","Jiaming Shen","Tianqi Liu","Haorui Wang","Zhen Qin","Feng Han","Jialu Liu","Simon Baumgartner","Michael Bendersky","Chao Zhang"],"pdf_url":"https://arxiv.org/pdf/2406.02886v2.pdf","comment":"Findings of ACL 2024"},{"id":"http://arxiv.org/abs/2406.04024v1","updated":"2024-06-06T12:46:21Z","published":"2024-06-06T12:46:21Z","title":"American Sign Language Handshapes Reflect Pressures for Communicative\n  Efficiency","summary":"  Communicative efficiency is a prominent theory in linguistics and cognitive\nscience. While numerous studies have shown how the pressure to save energy is\nreflected in the form of spoken languages, few have explored this phenomenon in\nsigned languages. In this paper, we show how handshapes in American Sign\nLanguage (ASL) reflect these efficiency pressures and we present new evidence\nof communicative efficiency in the visual-gestural modality.\n  We focus on handshapes that are used in both native ASL signs and signs\nborrowed from English to compare efficiency pressures from both ASL and\nEnglish. First, we design new methodologies to quantify the articulatory effort\nrequired to produce handshapes as well as the perceptual effort needed to\nrecognize them. Then, we compare correlations between communicative effort and\nusage statistics in ASL and English. Our findings reveal that frequent ASL\nhandshapes are easier to produce and that pressures for communicative\nefficiency mostly come from ASL usage, not from English lexical borrowing.\n","authors":["Kayo Yin","Terry Regier","Dan Klein"],"pdf_url":"https://arxiv.org/pdf/2406.04024v1.pdf","comment":"Accepted to ACL 2024"},{"id":"http://arxiv.org/abs/2404.14461v2","updated":"2024-06-06T12:45:52Z","published":"2024-04-22T05:08:53Z","title":"Competition Report: Finding Universal Jailbreak Backdoors in Aligned\n  LLMs","summary":"  Large language models are aligned to be safe, preventing users from\ngenerating harmful content like misinformation or instructions for illegal\nactivities. However, previous work has shown that the alignment process is\nvulnerable to poisoning attacks. Adversaries can manipulate the safety training\ndata to inject backdoors that act like a universal sudo command: adding the\nbackdoor string to any prompt enables harmful responses from models that,\notherwise, behave safely. Our competition, co-located at IEEE SaTML 2024,\nchallenged participants to find universal backdoors in several large language\nmodels. This report summarizes the key findings and promising ideas for future\nresearch.\n","authors":["Javier Rando","Francesco Croce","Kryštof Mitka","Stepan Shabalin","Maksym Andriushchenko","Nicolas Flammarion","Florian Tramèr"],"pdf_url":"https://arxiv.org/pdf/2404.14461v2.pdf","comment":"Competition Report"},{"id":"http://arxiv.org/abs/2401.14556v3","updated":"2024-06-06T12:33:19Z","published":"2024-01-25T22:50:48Z","title":"Looking Right is Sometimes Right: Investigating the Capabilities of\n  Decoder-only LLMs for Sequence Labeling","summary":"  Pre-trained language models based on masked language modeling (MLM) excel in\nnatural language understanding (NLU) tasks. While fine-tuned MLM-based encoders\nconsistently outperform causal language modeling decoders of comparable size,\nrecent decoder-only large language models (LLMs) perform on par with smaller\nMLM-based encoders. Although their performance improves with scale, LLMs fall\nshort of achieving state-of-the-art results in information extraction (IE)\ntasks, many of which are formulated as sequence labeling (SL). We hypothesize\nthat LLMs' poor SL performance stems from causal masking, which prevents the\nmodel from attending to tokens on the right of the current token. Yet, how\nexactly and to what extent LLMs' performance on SL can be improved remains\nunclear. We explore techniques for improving the SL performance of open LLMs on\nIE tasks by applying layer-wise removal of the causal mask (CM) during LLM\nfine-tuning. This approach yields performance gains competitive with\nstate-of-the-art SL models, matching or outperforming the results of CM removal\nfrom all blocks. Our findings hold for diverse SL tasks, demonstrating that\nopen LLMs with layer-dependent CM removal outperform strong MLM-based encoders\nand even instruction-tuned LLMs.\n","authors":["David Dukić","Jan Šnajder"],"pdf_url":"https://arxiv.org/pdf/2401.14556v3.pdf","comment":"Accepted at ACL 2024 Findings"},{"id":"http://arxiv.org/abs/2311.09033v3","updated":"2024-06-06T12:31:51Z","published":"2023-11-15T15:25:28Z","title":"MELA: Multilingual Evaluation of Linguistic Acceptability","summary":"  In this work, we present the largest benchmark to date on linguistic\nacceptability: Multilingual Evaluation of Linguistic Acceptability -- MELA,\nwith 46K samples covering 10 languages from a diverse set of language families.\nWe establish LLM baselines on this benchmark, and investigate cross-lingual\ntransfer in acceptability judgements with XLM-R. In pursuit of multilingual\ninterpretability, we conduct probing experiments with fine-tuned XLM-R to\nexplore the process of syntax capability acquisition. Our results show that\nGPT-4o exhibits a strong multilingual ability, outperforming fine-tuned XLM-R,\nwhile open-source multilingual models lag behind by a noticeable gap.\nCross-lingual transfer experiments show that transfer in acceptability judgment\nis non-trivial: 500 Icelandic fine-tuning examples lead to 23 MCC performance\nin a completely unrelated language -- Chinese. Results of our probing\nexperiments indicate that training on MELA improves the performance of XLM-R on\nsyntax-related tasks. Our data is available at\nhttps://github.com/sjtu-compling/MELA.\n","authors":["Ziyin Zhang","Yikang Liu","Weifang Huang","Junyu Mao","Rui Wang","Hai Hu"],"pdf_url":"https://arxiv.org/pdf/2311.09033v3.pdf","comment":"ACL 2024 camera-ready"},{"id":"http://arxiv.org/abs/2401.10186v3","updated":"2024-06-06T12:29:44Z","published":"2024-01-18T18:15:46Z","title":"Beyond Traditional Benchmarks: Analyzing Behaviors of Open LLMs on\n  Data-to-Text Generation","summary":"  We analyze the behaviors of open large language models (LLMs) on the task of\ndata-to-text (D2T) generation, i.e., generating coherent and relevant text from\nstructured data. To avoid the issue of LLM training data contamination with\nstandard benchmarks, we design Quintd - a tool for collecting novel structured\ndata records from public APIs. We find that open LLMs (Llama 2, Mistral, and\nZephyr) can generate fluent and coherent texts in zero-shot settings from data\nin common formats collected with Quintd. However, we show that the semantic\naccuracy of the outputs is a major issue: both according to human annotators\nand our reference-free metric based on GPT-4, more than 80% of the outputs of\nopen LLMs contain at least one semantic error. We publicly release the code,\ndata, and model outputs.\n","authors":["Zdeněk Kasner","Ondřej Dušek"],"pdf_url":"https://arxiv.org/pdf/2401.10186v3.pdf","comment":"Accepted to ACL 2024 Main Conference"},{"id":"http://arxiv.org/abs/2402.11548v2","updated":"2024-06-06T12:23:58Z","published":"2024-02-18T11:41:07Z","title":"KMMLU: Measuring Massive Multitask Language Understanding in Korean","summary":"  We propose KMMLU, a new Korean benchmark with 35,030 expert-level\nmultiple-choice questions across 45 subjects ranging from humanities to STEM.\nWhile prior Korean benchmarks are translated from existing English benchmarks,\nKMMLU is collected from original Korean exams, capturing linguistic and\ncultural aspects of the Korean language. We test 27 public and proprietary LLMs\nand observe the best public model to score 50.5%, leaving significant room for\nimprovement. This model was primarily trained for English and Chinese, not\nKorean. Current LLMs tailored to Korean, such as Polyglot-Ko, perform far\nworse. Surprisingly, even the most capable proprietary LLMs, e.g., GPT-4 and\nHyperCLOVA X do not exceed 60%. This suggests that further work is needed to\nimprove LLMs for Korean, and we believe KMMLU offers the appropriate tool to\ntrack this progress. We make our dataset publicly available on the Hugging Face\nHub and integrate the benchmark into EleutherAI's Language Model Evaluation\nHarness.\n","authors":["Guijin Son","Hanwool Lee","Sungdong Kim","Seungone Kim","Niklas Muennighoff","Taekyoon Choi","Cheonbok Park","Kang Min Yoo","Stella Biderman"],"pdf_url":"https://arxiv.org/pdf/2402.11548v2.pdf","comment":"Under Review"},{"id":"http://arxiv.org/abs/2310.13571v4","updated":"2024-06-06T12:18:56Z","published":"2023-10-20T15:09:46Z","title":"Why Can Large Language Models Generate Correct Chain-of-Thoughts?","summary":"  This paper delves into the capabilities of large language models (LLMs),\nspecifically focusing on advancing the theoretical comprehension of\nchain-of-thought prompting. We investigate how LLMs can be effectively induced\nto generate a coherent chain of thoughts. To achieve this, we introduce a\ntwo-level hierarchical graphical model tailored for natural language\ngeneration. Within this framework, we establish a compelling geometrical\nconvergence rate that gauges the likelihood of an LLM-generated chain of\nthoughts compared to those originating from the true language. Our findings\nprovide a theoretical justification for the ability of LLMs to produce the\ncorrect sequence of thoughts (potentially) explaining performance gains in\ntasks demanding reasoning skills.\n","authors":["Rasul Tutunov","Antoine Grosnit","Juliusz Ziomek","Jun Wang","Haitham Bou-Ammar"],"pdf_url":"https://arxiv.org/pdf/2310.13571v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.06733v3","updated":"2024-06-06T12:16:55Z","published":"2024-02-09T19:09:19Z","title":"NICE: To Optimize In-Context Examples or Not?","summary":"  Recent work shows that in-context learning and optimization of in-context\nexamples (ICE) can significantly improve the accuracy of large language models\n(LLMs) on a wide range of tasks, leading to an apparent consensus that ICE\noptimization is crucial for better performance. However, most of these studies\nassume a fixed or no instruction provided in the prompt. We challenge this\nconsensus by investigating the necessity of optimizing ICE when task-specific\ninstructions are provided and find that there are many tasks for which it\nyields diminishing returns. In particular, using a diverse set of tasks and a\nsystematically created instruction set with gradually added details, we find\nthat as the prompt instruction becomes more detailed, the returns on ICE\noptimization diminish. To characterize this behavior, we introduce a\ntask-specific metric called Normalized Invariability to Choice of Examples\n(NICE) that quantifies the learnability of tasks from a given instruction, and\nprovides a heuristic to help decide whether to optimize instructions or ICE for\na new task. Given a task, the proposed metric can reliably predict the utility\nof optimizing ICE compared to using random ICE. Our code is available at\nhttps://github.com/microsoft/nice-icl.\n","authors":["Pragya Srivastava","Satvik Golechha","Amit Deshpande","Amit Sharma"],"pdf_url":"https://arxiv.org/pdf/2402.06733v3.pdf","comment":"Accepted as a full paper (9 pages) at ACL 2024 (Main)"},{"id":"http://arxiv.org/abs/2406.03993v1","updated":"2024-06-06T12:08:43Z","published":"2024-06-06T12:08:43Z","title":"Assessing LLMs for Zero-shot Abstractive Summarization Through the Lens\n  of Relevance Paraphrasing","summary":"  Large Language Models (LLMs) have achieved state-of-the-art performance at\nzero-shot generation of abstractive summaries for given articles. However,\nlittle is known about the robustness of such a process of zero-shot\nsummarization. To bridge this gap, we propose relevance paraphrasing, a simple\nstrategy that can be used to measure the robustness of LLMs as summarizers. The\nrelevance paraphrasing approach identifies the most relevant sentences that\ncontribute to generating an ideal summary, and then paraphrases these inputs to\nobtain a minimally perturbed dataset. Then, by evaluating model performance for\nsummarization on both the original and perturbed datasets, we can assess the\nLLM's one aspect of robustness. We conduct extensive experiments with relevance\nparaphrasing on 4 diverse datasets, as well as 4 LLMs of different sizes\n(GPT-3.5-Turbo, Llama-2-13B, Mistral-7B, and Dolly-v2-7B). Our results indicate\nthat LLMs are not consistent summarizers for the minimally perturbed articles,\nnecessitating further improvements.\n","authors":["Hadi Askari","Anshuman Chhabra","Muhao Chen","Prasant Mohapatra"],"pdf_url":"https://arxiv.org/pdf/2406.03993v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.08295v3","updated":"2024-06-06T12:02:51Z","published":"2024-01-16T11:45:03Z","title":"SAPT: A Shared Attention Framework for Parameter-Efficient Continual\n  Learning of Large Language Models","summary":"  The continual learning (CL) ability is vital for deploying large language\nmodels (LLMs) in the dynamic world. Existing methods devise the learning module\nto acquire task-specific knowledge with parameter-efficient tuning (PET) block\nand the selection module to pick out the corresponding one for the testing\ninput, aiming at handling the challenges of catastrophic forgetting and\nknowledge transfer in CL. However, these methods tend to address only one of\nthe challenges, ignoring the potential of aligning the two modules to\neffectively address catastrophic forgetting and knowledge transfer\nsimultaneously. To this end, we propose a novel Shared Attention Framework\n(SAPT), to align the PET learning and selection via the Shared Attentive\nLearning \\& Selection module. Extensive Experiments on two CL benchmarks\ndemonstrate the superiority of SAPT. Moreover, SAPT consistently demonstrates\nits superiority when we scale it to different model sizes (from 770M to 13B),\ndifferent model architectures (T5 and LLaMA-2) and unseen tasks.\n","authors":["Weixiang Zhao","Shilong Wang","Yulin Hu","Yanyan Zhao","Bing Qin","Xuanyu Zhang","Qing Yang","Dongliang Xu","Wanxiang Che"],"pdf_url":"https://arxiv.org/pdf/2401.08295v3.pdf","comment":"To appear at ACL 2024"},{"id":"http://arxiv.org/abs/2402.10571v2","updated":"2024-06-06T12:02:37Z","published":"2024-02-16T10:55:38Z","title":"Direct Preference Optimization with an Offset","summary":"  Direct preference optimization (DPO) is a successful fine-tuning strategy for\naligning large language models with human preferences without the need to train\na reward model or employ reinforcement learning. DPO, as originally formulated,\nrelies on binary preference data and fine-tunes a language model to increase\nthe likelihood of a preferred response over a dispreferred response. However,\nnot all preference pairs are equal. Sometimes, the preferred response is only\nslightly better than the dispreferred one. In other cases, the preference is\nmuch stronger. For instance, if a response contains harmful or toxic content,\nthe annotator will have a strong preference for that response. In this paper,\nwe propose a generalization of DPO, termed DPO with an offset (ODPO), that does\nnot treat every preference pair equally during fine-tuning. Intuitively, ODPO\nrequires the difference between the likelihood of the preferred and\ndispreferred response to be greater than an offset value. The offset is\ndetermined based on the extent to which one response is preferred over another.\nOur experiments on various tasks suggest that ODPO significantly outperforms\nDPO in aligning language models, especially when the number of preference pairs\nis limited.\n","authors":["Afra Amini","Tim Vieira","Ryan Cotterell"],"pdf_url":"https://arxiv.org/pdf/2402.10571v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.15637v2","updated":"2024-06-06T12:01:09Z","published":"2024-02-23T22:39:12Z","title":"Addressing Order Sensitivity of In-Context Demonstration Examples in\n  Causal Language Models","summary":"  In-context learning has become a popular paradigm in natural language\nprocessing. However, its performance can be significantly influenced by the\norder of in-context demonstration examples. In this paper, we found that causal\nlanguage models (CausalLMs) are more sensitive to this order compared to prefix\nlanguage models (PrefixLMs). We attribute this phenomenon to the\nauto-regressive attention masks within CausalLMs, which restrict each token\nfrom accessing information from subsequent tokens. This results in different\nreceptive fields for samples at different positions, thereby leading to\nrepresentation disparities across positions. To tackle this challenge, we\nintroduce an unsupervised fine-tuning method, termed the Information-Augmented\nand Consistency-Enhanced approach. This approach utilizes contrastive learning\nto align representations of in-context examples across different positions and\nintroduces a consistency loss to ensure similar representations for inputs with\ndifferent permutations. This enhances the model's predictive consistency across\npermutations. Experimental results on five benchmarks suggest that our proposed\nmethod can reduce the sensitivity of CausalLMs to the order of in-context\nexamples and exhibit robust generalizability, particularly when demonstrations\nare sourced from a candidate pool different from that used in the training\nphase, or when the number of in-context examples differs from what is used\nduring training.\n","authors":["Yanzheng Xiang","Hanqi Yan","Lin Gui","Yulan He"],"pdf_url":"https://arxiv.org/pdf/2402.15637v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.03986v1","updated":"2024-06-06T12:00:41Z","published":"2024-06-06T12:00:41Z","title":"On The Persona-based Summarization of Domain-Specific Documents","summary":"  In an ever-expanding world of domain-specific knowledge, the increasing\ncomplexity of consuming, and storing information necessitates the generation of\nsummaries from large information repositories. However, every persona of a\ndomain has different requirements of information and hence their summarization.\nFor example, in the healthcare domain, a persona-based (such as Doctor, Nurse,\nPatient etc.) approach is imperative to deliver targeted medical information\nefficiently. Persona-based summarization of domain-specific information by\nhumans is a high cognitive load task and is generally not preferred. The\nsummaries generated by two different humans have high variability and do not\nscale in cost and subject matter expertise as domains and personas grow.\nFurther, AI-generated summaries using generic Large Language Models (LLMs) may\nnot necessarily offer satisfactory accuracy for different domains unless they\nhave been specifically trained on domain-specific data and can also be very\nexpensive to use in day-to-day operations. Our contribution in this paper is\ntwo-fold: 1) We present an approach to efficiently fine-tune a domain-specific\nsmall foundation LLM using a healthcare corpus and also show that we can\neffectively evaluate the summarization quality using AI-based critiquing. 2) We\nfurther show that AI-based critiquing has good concordance with Human-based\ncritiquing of the summaries. Hence, such AI-based pipelines to generate\ndomain-specific persona-based summaries can be easily scaled to other domains\nsuch as legal, enterprise documents, education etc. in a very efficient and\ncost-effective manner.\n","authors":["Ankan Mullick","Sombit Bose","Rounak Saha","Ayan Kumar Bhowmick","Pawan Goyal","Niloy Ganguly","Prasenjit Dey","Ravi Kokku"],"pdf_url":"https://arxiv.org/pdf/2406.03986v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.06840v2","updated":"2024-06-06T11:55:36Z","published":"2024-03-11T16:01:05Z","title":"RA-ISF: Learning to Answer and Understand from Retrieval Augmentation\n  via Iterative Self-Feedback","summary":"  Large language models (LLMs) demonstrate exceptional performance in numerous\ntasks but still heavily rely on knowledge stored in their parameters. Moreover,\nupdating this knowledge incurs high training costs. Retrieval-augmented\ngeneration (RAG) methods address this issue by integrating external knowledge.\nThe model can answer questions it couldn't previously by retrieving knowledge\nrelevant to the query. This approach improves performance in certain scenarios\nfor specific tasks. However, if irrelevant texts are retrieved, it may impair\nmodel performance. In this paper, we propose Retrieval Augmented Iterative\nSelf-Feedback (RA-ISF), a framework that iteratively decomposes tasks and\nprocesses them in three submodules to enhance the model's problem-solving\ncapabilities. Experiments show that our method outperforms existing benchmarks,\nperforming well on models like GPT3.5, Llama2, significantly enhancing factual\nreasoning capabilities and reducing hallucinations.\n","authors":["Yanming Liu","Xinyue Peng","Xuhong Zhang","Weihao Liu","Jianwei Yin","Jiannan Cao","Tianyu Du"],"pdf_url":"https://arxiv.org/pdf/2403.06840v2.pdf","comment":"20 pages, multiple figures. Providing second version RA-ISF"},{"id":"http://arxiv.org/abs/2403.06932v2","updated":"2024-06-06T11:54:56Z","published":"2024-03-11T17:18:53Z","title":"ERA-CoT: Improving Chain-of-Thought through Entity Relationship Analysis","summary":"  Large language models (LLMs) have achieved commendable accomplishments in\nvarious natural language processing tasks. However, LLMs still encounter\nsignificant challenges when dealing with complex scenarios involving multiple\nentities. These challenges arise from the presence of implicit relationships\nthat demand multi-step reasoning. In this paper, we propose a novel approach\nERA-CoT, which aids LLMs in understanding context by capturing relationships\nbetween entities and supports the reasoning of diverse tasks through\nChain-of-Thoughts (CoT). Experimental results show that ERA-CoT demonstrates\nthe superior performance of our proposed method compared to current CoT\nprompting methods, achieving a significant improvement of an average of 5.1\\%\non GPT3.5 compared to previous SOTA baselines. Our analysis indicates that\nERA-CoT increases the LLM's understanding of entity relationships,\nsignificantly improves the accuracy of question answering, and enhances the\nreasoning ability of LLMs.\n","authors":["Yanming Liu","Xinyue Peng","Tianyu Du","Jianwei Yin","Weihao Liu","Xuhong Zhang"],"pdf_url":"https://arxiv.org/pdf/2403.06932v2.pdf","comment":"15 pages, second version of ERA-CoT"},{"id":"http://arxiv.org/abs/2405.17345v2","updated":"2024-06-06T11:53:23Z","published":"2024-05-27T16:49:22Z","title":"Exploring and steering the moral compass of Large Language Models","summary":"  Large Language Models (LLMs) have become central to advancing automation and\ndecision-making across various sectors, raising significant ethical questions.\nThis study proposes a comprehensive comparative analysis of the most advanced\nLLMs to assess their moral profiles. We subjected several state-of-the-art\nmodels to a selection of ethical dilemmas and found that all the proprietary\nones are mostly utilitarian and all of the open-weights ones align mostly with\nvalues-based ethics. Furthermore, when using the Moral Foundations\nQuestionnaire, all models we probed - except for Llama 2-7B - displayed a\nstrong liberal bias. Lastly, in order to causally intervene in one of the\nstudied models, we propose a novel similarity-specific activation steering\ntechnique. Using this method, we were able to reliably steer the model's moral\ncompass to different ethical schools. All of these results showcase that there\nis an ethical dimension in already deployed LLMs, an aspect that is generally\noverlooked.\n","authors":["Alejandro Tlaie"],"pdf_url":"https://arxiv.org/pdf/2405.17345v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.08047v3","updated":"2024-06-06T11:49:26Z","published":"2023-09-14T22:20:27Z","title":"Bias in News Summarization: Measures, Pitfalls and Corpora","summary":"  Summarization is an important application of large language models (LLMs).\nMost previous evaluation of summarization models has focused on their content\nselection, faithfulness, grammaticality and coherence. However, it is well\nknown that LLMs can reproduce and reinforce harmful social biases. This raises\nthe question: Do biases affect model outputs in a constrained setting like\nsummarization? To help answer this question, we first motivate and introduce a\nnumber of definitions for biased behaviours in summarization models, along with\npractical operationalizations. Since we find that biases inherent to input\ndocuments can confound bias analysis in summaries, we propose a method to\ngenerate input documents with carefully controlled demographic attributes. This\nallows us to study summarizer behavior in a controlled setting, while still\nworking with realistic input documents. We measure gender bias in English\nsummaries generated by both purpose-built summarization models and general\npurpose chat models as a case study. We find content selection in single\ndocument summarization to be largely unaffected by gender bias, while\nhallucinations exhibit evidence of bias. To demonstrate the generality of our\napproach, we additionally investigate racial bias, including intersectional\nsettings.\n","authors":["Julius Steen","Katja Markert"],"pdf_url":"https://arxiv.org/pdf/2309.08047v3.pdf","comment":"Findings of ACL 24 Camera Ready"},{"id":"http://arxiv.org/abs/2405.20267v2","updated":"2024-06-06T11:36:09Z","published":"2024-05-30T17:19:19Z","title":"Auto Arena of LLMs: Automating LLM Evaluations with Agent Peer-battles\n  and Committee Discussions","summary":"  As LLMs evolve on a daily basis, there is an urgent need for a trustworthy\nevaluation method that can provide robust evaluation results in a timely\nfashion. Currently, as static benchmarks are prone to contamination concerns,\nusers tend to trust human voting platforms, such as Chatbot Arena. However,\nhuman annotations require extensive manual efforts. To provide an automatic,\nrobust, and trustworthy evaluation framework, we innovatively propose the\nAuto-Arena of LLMs, which automates the entire evaluation process with LLM\nagents. Firstly, an examiner LLM devises queries. Then, a pair of candidate\nLLMs engage in a multi-round peer-battle around the query, during which the\nLLM's true performance gaps become visible. Finally, a committee of LLM judges\ncollectively discuss and determine the winner, which alleviates bias and\npromotes fairness. In our extensive experiment on the 17 newest LLMs,\nAuto-Arena shows the highest correlation with human preferences, providing a\npromising alternative to human evaluation platforms.\n","authors":["Ruochen Zhao","Wenxuan Zhang","Yew Ken Chia","Deli Zhao","Lidong Bing"],"pdf_url":"https://arxiv.org/pdf/2405.20267v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.03963v1","updated":"2024-06-06T11:14:27Z","published":"2024-06-06T11:14:27Z","title":"A + B: A General Generator-Reader Framework for Optimizing LLMs to\n  Unleash Synergy Potential","summary":"  Retrieval-Augmented Generation (RAG) is an effective solution to supplement\nnecessary knowledge to large language models (LLMs). Targeting its bottleneck\nof retriever performance, \"generate-then-read\" pipeline is proposed to replace\nthe retrieval stage with generation from the LLM itself. Although promising,\nthis research direction is underexplored and still cannot work in the scenario\nwhen source knowledge is given. In this paper, we formalize a general \"A + B\"\nframework with varying combinations of foundation models and types for\nsystematic investigation. We explore the efficacy of the base and chat versions\nof LLMs and found their different functionalities suitable for generator A and\nreader B, respectively. Their combinations consistently outperform single\nmodels, especially in complex scenarios. Furthermore, we extend the application\nof the \"A + B\" framework to scenarios involving source documents through\ncontinuous learning, enabling the direct integration of external knowledge into\nLLMs. This approach not only facilitates effective acquisition of new knowledge\nbut also addresses the challenges of safety and helpfulness post-adaptation.\nThe paper underscores the versatility of the \"A + B\" framework, demonstrating\nits potential to enhance the practical application of LLMs across various\ndomains.\n","authors":["Wei Tang","Yixin Cao","Jiahao Ying","Bo Wang","Yuyue Zhao","Yong Liao","Pengyuan Zhou"],"pdf_url":"https://arxiv.org/pdf/2406.03963v1.pdf","comment":"Accepted to ACL'24 (Findings)"},{"id":"http://arxiv.org/abs/2402.13874v3","updated":"2024-06-06T11:12:57Z","published":"2024-02-21T15:35:04Z","title":"$Se^2$: Sequential Example Selection for In-Context Learning","summary":"  The remarkable capability of large language models (LLMs) for in-context\nlearning (ICL) needs to be activated by demonstration examples. Prior work has\nextensively explored the selection of examples for ICL, predominantly following\nthe \"select then organize\" paradigm, such approaches often neglect the internal\nrelationships between examples and exist an inconsistency between the training\nand inference. In this paper, we formulate the problem as a $Se$quential\n$Se$lection problem and introduce $Se^2$, a sequential-aware method that\nleverages the LLM's feedback on varying context, aiding in capturing\ninter-relationships and sequential information among examples, significantly\nenriching the contextuality and relevance of ICL prompts. Meanwhile, we utilize\nbeam search to seek and construct example sequences, enhancing both quality and\ndiversity. Extensive experiments across 23 NLP tasks from 8 distinct categories\nillustrate that $Se^2$ markedly surpasses competitive baselines and achieves\n42\\% relative improvement over random selection. Further in-depth analysis\nshows the effectiveness of proposed strategies, highlighting $Se^2$'s\nexceptional stability and adaptability across various scenarios. Code available\nat https://github.com/microsoft/LMOps.\n","authors":["Haoyu Liu","Jianfeng Liu","Shaohan Huang","Yuefeng Zhan","Hao Sun","Weiwei Deng","Furu Wei","Qi Zhang"],"pdf_url":"https://arxiv.org/pdf/2402.13874v3.pdf","comment":"Accepted by ACL 2024 Findings"},{"id":"http://arxiv.org/abs/2402.07891v3","updated":"2024-06-06T11:07:17Z","published":"2024-02-12T18:54:02Z","title":"Label-Efficient Model Selection for Text Generation","summary":"  Model selection for a given target task can be costly, as it may entail\nextensive annotation of the quality of outputs of different models. We\nintroduce DiffUse, an efficient method to make an informed decision between\ncandidate text generation models based on preference annotations. DiffUse\nreduces the required amount of annotations, thus saving valuable time and\nresources in performing evaluation. DiffUse intelligently selects instances by\nclustering embeddings that represent the semantic differences between model\noutputs. Thus, it is able to identify a subset of examples that are more\ninformative for preference decisions. Our method is model-agnostic, and can be\napplied to any text generation model for selecting between models, prompts and\nconfigurations. Moreover, we propose a practical iterative approach for\ndynamically determining how many instances to annotate. In a series of\nexperiments over hundreds of model pairs, we demonstrate that DiffUse can\ndramatically reduce the required number of annotations -- by up to 75% -- while\nmaintaining high evaluation reliability.\n","authors":["Shir Ashury-Tahan","Ariel Gera","Benjamin Sznajder","Leshem Choshen","Liat Ein-Dor","Eyal Shnarch"],"pdf_url":"https://arxiv.org/pdf/2402.07891v3.pdf","comment":"Accepted to ACL (main conference)"},{"id":"http://arxiv.org/abs/2406.03953v1","updated":"2024-06-06T10:54:44Z","published":"2024-06-06T10:54:44Z","title":"Tox-BART: Leveraging Toxicity Attributes for Explanation Generation of\n  Implicit Hate Speech","summary":"  Employing language models to generate explanations for an incoming implicit\nhate post is an active area of research. The explanation is intended to make\nexplicit the underlying stereotype and aid content moderators. The training\noften combines top-k relevant knowledge graph (KG) tuples to provide world\nknowledge and improve performance on standard metrics. Interestingly, our study\npresents conflicting evidence for the role of the quality of KG tuples in\ngenerating implicit explanations. Consequently, simpler models incorporating\nexternal toxicity signals outperform KG-infused models. Compared to the\nKG-based setup, we observe a comparable performance for SBIC (LatentHatred)\ndatasets with a performance variation of +0.44 (+0.49), +1.83 (-1.56), and\n-4.59 (+0.77) in BLEU, ROUGE-L, and BERTScore. Further human evaluation and\nerror analysis reveal that our proposed setup produces more precise\nexplanations than zero-shot GPT-3.5, highlighting the intricate nature of the\ntask.\n","authors":["Neemesh Yadav","Sarah Masud","Vikram Goyal","Vikram Goyal","Md Shad Akhtar","Tanmoy Chakraborty"],"pdf_url":"https://arxiv.org/pdf/2406.03953v1.pdf","comment":"17 Pages, 5 Figures, 13 Tables, ACL Findings 2024"},{"id":"http://arxiv.org/abs/2402.07214v3","updated":"2024-06-06T10:53:50Z","published":"2024-02-11T14:21:37Z","title":"Through the Lens of Split Vote: Exploring Disagreement, Difficulty and\n  Calibration in Legal Case Outcome Classification","summary":"  In legal decisions, split votes (SV) occur when judges cannot reach a\nunanimous decision, posing a difficulty for lawyers who must navigate diverse\nlegal arguments and opinions. In high-stakes domains, understanding the\nalignment of perceived difficulty between humans and AI systems is crucial to\nbuild trust. However, existing NLP calibration methods focus on a classifier's\nawareness of predictive performance, measured against the human majority class,\noverlooking inherent human label variation (HLV). This paper explores split\nvotes as naturally observable human disagreement and value pluralism. We\ncollect judges' vote distributions from the European Court of Human Rights\n(ECHR), and present SV-ECHR, a case outcome classification (COC) dataset with\nSV information. We build a taxonomy of disagreement with SV-specific\nsubcategories. We further assess the alignment of perceived difficulty between\nmodels and humans, as well as confidence- and human-calibration of COC models.\nWe observe limited alignment with the judge vote distribution. To our\nknowledge, this is the first systematic exploration of calibration to human\njudgements in legal NLP. Our study underscores the necessity for further\nresearch on measuring and enhancing model calibration considering HLV in legal\ndecision tasks.\n","authors":["Shanshan Xu","T. Y. S. S Santosh","Oana Ichim","Barbara Plank","Matthias Grabmair"],"pdf_url":"https://arxiv.org/pdf/2402.07214v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.03949v1","updated":"2024-06-06T10:50:26Z","published":"2024-06-06T10:50:26Z","title":"UltraMedical: Building Specialized Generalists in Biomedicine","summary":"  Large Language Models (LLMs) have demonstrated remarkable capabilities across\nvarious domains and are moving towards more specialized areas. Recent advanced\nproprietary models such as GPT-4 and Gemini have achieved significant\nadvancements in biomedicine, which have also raised privacy and security\nchallenges. The construction of specialized generalists hinges largely on\nhigh-quality datasets, enhanced by techniques like supervised fine-tuning and\nreinforcement learning from human or AI feedback, and direct preference\noptimization. However, these leading technologies (e.g., preference learning)\nare still significantly limited in the open source community due to the\nscarcity of specialized data. In this paper, we present the UltraMedical\ncollections, which consist of high-quality manual and synthetic datasets in the\nbiomedicine domain, featuring preference annotations across multiple advanced\nLLMs. By utilizing these datasets, we fine-tune a suite of specialized medical\nmodels based on Llama-3 series, demonstrating breathtaking capabilities across\nvarious medical benchmarks. Moreover, we develop powerful reward models skilled\nin biomedical and general reward benchmark, enhancing further online preference\nlearning within the biomedical LLM community.\n","authors":["Kaiyan Zhang","Sihang Zeng","Ermo Hua","Ning Ding","Zhang-Ren Chen","Zhiyuan Ma","Haoxin Li","Ganqu Cui","Biqing Qi","Xuekai Zhu","Xingtai Lv","Hu Jinfang","Zhiyuan Liu","Bowen Zhou"],"pdf_url":"https://arxiv.org/pdf/2406.03949v1.pdf","comment":"Datasets and models are available at\n  https://github.com/TsinghuaC3I/UltraMedical"},{"id":"http://arxiv.org/abs/2405.20703v2","updated":"2024-06-06T10:23:48Z","published":"2024-05-31T08:57:09Z","title":"It is Simple Sometimes: A Study On Improving Aspect-Based Sentiment\n  Analysis Performance","summary":"  Aspect-Based Sentiment Analysis (ABSA) involves extracting opinions from\ntextual data about specific entities and their corresponding aspects through\nvarious complementary subtasks. Several prior research has focused on\ndeveloping ad hoc designs of varying complexities for these subtasks. In this\npaper, we present a generative framework extensible to any ABSA subtask. We\nbuild upon the instruction tuned model proposed by Scaria et al. (2023), who\npresent an instruction-based model with task descriptions followed by\nin-context examples on ABSA subtasks. We propose PFInstruct, an extension to\nthis instruction learning paradigm by appending an NLP-related task prefix to\nthe task description. This simple approach leads to improved performance across\nall tested SemEval subtasks, surpassing previous state-of-the-art (SOTA) on the\nATE subtask (Rest14) by +3.28 F1-score, and on the AOOE subtask by an average\nof +5.43 F1-score across SemEval datasets. Furthermore, we explore the impact\nof the prefix-enhanced prompt quality on the ABSA subtasks and find that even a\nnoisy prefix enhances model performance compared to the baseline. Our method\nalso achieves competitive results on a biomedical domain dataset (ERSA).\n","authors":["Laura Cabello","Uchenna Akujuobi"],"pdf_url":"https://arxiv.org/pdf/2405.20703v2.pdf","comment":"Accepted to ACL 2024 Findings"},{"id":"http://arxiv.org/abs/2406.03930v1","updated":"2024-06-06T10:16:43Z","published":"2024-06-06T10:16:43Z","title":"Culturally Aware and Adapted NLP: A Taxonomy and a Survey of the State\n  of the Art","summary":"  The surge of interest in culturally aware and adapted Natural Language\nProcessing (NLP) has inspired much recent research. However, the lack of common\nunderstanding of the concept of \"culture\" has made it difficult to evaluate\nprogress in this emerging area. Drawing on prior research in NLP and related\nfields, we propose an extensive taxonomy of elements of culture that can\nprovide a systematic framework for analyzing and understanding research\nprogress. Using the taxonomy, we survey existing resources and models for\nculturally aware and adapted NLP, providing an overview of the state of the art\nand the research gaps that still need to be filled.\n","authors":["Chen Cecilia Liu","Iryna Gurevych","Anna Korhonen"],"pdf_url":"https://arxiv.org/pdf/2406.03930v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.03916v1","updated":"2024-06-06T09:56:49Z","published":"2024-06-06T09:56:49Z","title":"ArMeme: Propagandistic Content in Arabic Memes","summary":"  With the rise of digital communication, memes have become a significant\nmedium for cultural and political expression that is often used to mislead\naudiences. Identification of such misleading and persuasive multimodal content\nhas become more important among various stakeholders, including social media\nplatforms, policymakers, and the broader society as they often cause harm to\nindividuals, organizations, and/or society. While there has been effort to\ndevelop AI-based automatic systems for resource-rich languages (e.g., English),\nit is relatively little to none for medium to low resource languages. In this\nstudy, we focused on developing an Arabic memes dataset with manual annotations\nof propagandistic content. We annotated ~6K Arabic memes collected from various\nsocial media platforms, which is a first resource for Arabic multimodal\nresearch. We provide a comprehensive analysis aiming to develop computational\ntools for their detection. We will make them publicly available for the\ncommunity.\n","authors":["Firoj Alam","Abul Hasnat","Fatema Ahmed","Md Arid Hasan","Maram Hasanain"],"pdf_url":"https://arxiv.org/pdf/2406.03916v1.pdf","comment":"disinformation, misinformation, factuality, harmfulness, fake news,\n  propaganda, multimodality, text, images"},{"id":"http://arxiv.org/abs/2402.07844v3","updated":"2024-06-06T09:42:17Z","published":"2024-02-12T17:53:22Z","title":"Mercury: A Code Efficiency Benchmark for LLM Code Synthesis","summary":"  Amidst the recent strides in evaluating Large Language Models for Code (Code\nLLMs), existing benchmarks have mainly focused on the functional correctness of\ngenerated code, neglecting the importance of their computational efficiency. To\nfill the gap, we present Mercury, the first code efficiency benchmark for Code\nLLMs. It comprises 1,889 Python tasks, each accompanied by adequate solutions\nthat serve as real-world efficiency baselines, enabling a comprehensive\nanalysis of the runtime distribution. Based on the distribution, we introduce a\nnew metric Beyond, which computes a runtime-percentile-weighted Pass score to\nreflect functional correctness and code efficiency simultaneously. On Mercury,\nleading Code LLMs can achieve 65% on Pass, while less than 50% on Beyond. Given\nthat an ideal Beyond score would be aligned with the Pass score, it indicates\nthat while Code LLMs exhibit impressive capabilities in generating functionally\ncorrect code, there remains a notable gap in their efficiency. Finally, our\nempirical experiments reveal that Direct Preference Optimization (DPO) serves\nas a robust baseline for enhancing code efficiency compared with Supervised\nFine Tuning (SFT), which paves a promising avenue for future exploration of\nefficient code generation. Our code and data are available on GitHub:\nhttps://github.com/Elfsong/Mercury.\n","authors":["Mingzhe Du","Anh Tuan Luu","Bin Ji","Qian Liu","See-Kiong Ng"],"pdf_url":"https://arxiv.org/pdf/2402.07844v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.03897v1","updated":"2024-06-06T09:36:14Z","published":"2024-06-06T09:36:14Z","title":"HeSum: a Novel Dataset for Abstractive Text Summarization in Hebrew","summary":"  While large language models (LLMs) excel in various natural language tasks in\nEnglish, their performance in lower-resourced languages like Hebrew, especially\nfor generative tasks such as abstractive summarization, remains unclear. The\nhigh morphological richness in Hebrew adds further challenges due to the\nambiguity in sentence comprehension and the complexities in meaning\nconstruction. In this paper, we address this resource and evaluation gap by\nintroducing HeSum, a novel benchmark specifically designed for abstractive text\nsummarization in Modern Hebrew. HeSum consists of 10,000 article-summary pairs\nsourced from Hebrew news websites written by professionals. Linguistic analysis\nconfirms HeSum's high abstractness and unique morphological challenges. We show\nthat HeSum presents distinct difficulties for contemporary state-of-the-art\nLLMs, establishing it as a valuable testbed for generative language technology\nin Hebrew, and MRLs generative challenges in general.\n","authors":["Tzuf Paz-Argaman","Itai Mondshine","Asaf Achi Mordechai","Reut Tsarfaty"],"pdf_url":"https://arxiv.org/pdf/2406.03897v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.09048v3","updated":"2024-06-06T09:35:53Z","published":"2023-11-15T15:38:28Z","title":"GRASP: A novel benchmark for evaluating language GRounding And Situated\n  Physics understanding in multimodal language models","summary":"  This paper presents GRASP, a novel benchmark to evaluate the language\ngrounding and physical understanding capabilities of video-based multimodal\nlarge language models (LLMs). This evaluation is accomplished via a two-tier\napproach leveraging Unity simulations. The first level tests for language\ngrounding by assessing a model's ability to relate simple textual descriptions\nwith visual information. The second level evaluates the model's understanding\nof \"Intuitive Physics\" principles, such as object permanence and continuity. In\naddition to releasing the benchmark, we use it to evaluate several\nstate-of-the-art multimodal LLMs. Our evaluation reveals significant\nshortcomings in the language grounding and intuitive physics capabilities of\nthese models. Although they exhibit at least some grounding capabilities,\nparticularly for colors and shapes, these capabilities depend heavily on the\nprompting strategy. At the same time, all models perform below or at the chance\nlevel of 50% in the Intuitive Physics tests, while human subjects are on\naverage 80% correct. These identified limitations underline the importance of\nusing benchmarks like GRASP to monitor the progress of future models in\ndeveloping these competencies.\n","authors":["Serwan Jassim","Mario Holubar","Annika Richter","Cornelius Wolff","Xenia Ohmer","Elia Bruni"],"pdf_url":"https://arxiv.org/pdf/2311.09048v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.03129v2","updated":"2024-06-06T09:30:24Z","published":"2024-03-05T17:15:28Z","title":"CoGenesis: A Framework Collaborating Large and Small Language Models for\n  Secure Context-Aware Instruction Following","summary":"  With the advancement of language models (LMs), their exposure to private data\nis increasingly inevitable, and their deployment (especially for smaller ones)\non personal devices, such as PCs and smartphones, has become a prevailing\ntrend. In contexts laden with user information, enabling models to both\nsafeguard user privacy and execute commands efficiently emerges as an essential\nresearch imperative. In this paper, we propose CoGenesis, a collaborative\ngeneration framework integrating large (hosted on cloud infrastructure) and\nsmall models (deployed on local devices) to address privacy concerns logically.\nInitially, we design a pipeline to create personalized writing instruction\ndatasets enriched with extensive context details as the testbed of this\nresearch issue. Subsequently, we introduce two variants of CoGenesis based on\nsketch and logits respectively. Our experimental findings, based on our\nsynthesized dataset and two additional open-source datasets, indicate that: 1)\nLarge-scale models perform well when provided with user context but struggle in\nthe absence of such context. 2) While specialized smaller models fine-tuned on\nthe synthetic dataset show promise, they still lag behind their larger\ncounterparts. 3) Our CoGenesis framework, utilizing mixed-scale models,\nshowcases competitive performance, providing a feasible solution to privacy\nissues.\n","authors":["Kaiyan Zhang","Jianyu Wang","Ermo Hua","Biqing Qi","Ning Ding","Bowen Zhou"],"pdf_url":"https://arxiv.org/pdf/2403.03129v2.pdf","comment":"Accepted to ACL 2024 (Main Conference)"},{"id":"http://arxiv.org/abs/2306.03061v3","updated":"2024-06-06T09:30:19Z","published":"2023-06-05T17:32:35Z","title":"Structured Voronoi Sampling","summary":"  Gradient-based sampling algorithms have demonstrated their effectiveness in\ntext generation, especially in the context of controlled text generation.\nHowever, there exists a lack of theoretically grounded and principled\napproaches for this task. In this paper, we take an important step toward\nbuilding a principled approach for sampling from language models with\ngradient-based methods. We use discrete distributions given by language models\nto define densities and develop an algorithm based on Hamiltonian Monte Carlo\nto sample from them. We name our gradient-based technique Structured Voronoi\nSampling (SVS). In an experimental setup where the reference distribution is\nknown, we show that the empirical distribution of SVS samples is closer to the\nreference distribution compared to alternative sampling schemes. Furthermore,\nin a controlled generation task, SVS is able to generate fluent and diverse\nsamples while following the control targets significantly better than other\nmethods.\n","authors":["Afra Amini","Li Du","Ryan Cotterell"],"pdf_url":"https://arxiv.org/pdf/2306.03061v3.pdf","comment":"Accepted at NeurIPS 2023"},{"id":"http://arxiv.org/abs/2406.03151v2","updated":"2024-06-06T09:30:11Z","published":"2024-06-05T11:15:45Z","title":"Which Side Are You On? A Multi-task Dataset for End-to-End Argument\n  Summarisation and Evaluation","summary":"  With the recent advances of large language models (LLMs), it is no longer\ninfeasible to build an automated debate system that helps people to synthesise\npersuasive arguments. Previous work attempted this task by integrating multiple\ncomponents. In our work, we introduce an argument mining dataset that captures\nthe end-to-end process of preparing an argumentative essay for a debate, which\ncovers the tasks of claim and evidence identification (Task 1 ED), evidence\nconvincingness ranking (Task 2 ECR), argumentative essay summarisation and\nhuman preference ranking (Task 3 ASR) and metric learning for automated\nevaluation of resulting essays, based on human feedback along argument quality\ndimensions (Task 4 SQE). Our dataset contains 14k examples of claims that are\nfully annotated with the various properties supporting the aforementioned\ntasks. We evaluate multiple generative baselines for each of these tasks,\nincluding representative LLMs. We find, that while they show promising results\non individual tasks in our benchmark, their end-to-end performance on all four\ntasks in succession deteriorates significantly, both in automated measures as\nwell as in human-centred evaluation. This challenge presented by our proposed\ndataset motivates future research on end-to-end argument mining and\nsummarisation. The repository of this project is available at\nhttps://github.com/HarrywillDr/ArgSum-Datatset\n","authors":["Hao Li","Yuping Wu","Viktor Schlegel","Riza Batista-Navarro","Tharindu Madusanka","Iqra Zahid","Jiayan Zeng","Xiaochi Wang","Xinran He","Yizhi Li","Goran Nenadic"],"pdf_url":"https://arxiv.org/pdf/2406.03151v2.pdf","comment":"Published on ACL 2024 Findings"},{"id":"http://arxiv.org/abs/2309.09524v2","updated":"2024-06-06T09:28:49Z","published":"2023-09-18T07:02:04Z","title":"Improved Factorized Neural Transducer Model For text-only Domain\n  Adaptation","summary":"  Adapting End-to-End ASR models to out-of-domain datasets with text data is\nchallenging. Factorized neural Transducer (FNT) aims to address this issue by\nintroducing a separate vocabulary decoder to predict the vocabulary.\nNonetheless, this approach has limitations in fusing acoustic and language\ninformation seamlessly. Moreover, a degradation in word error rate (WER) on the\ngeneral test sets was also observed, leading to doubts about its overall\nperformance. In response to this challenge, we present the improved factorized\nneural Transducer (IFNT) model structure designed to comprehensively integrate\nacoustic and language information while enabling effective text adaptation. We\nassess the performance of our proposed method on English and Mandarin datasets.\nThe results indicate that IFNT not only surpasses the neural Transducer and FNT\nin baseline performance in both scenarios but also exhibits superior adaptation\nability compared to FNT. On source domains, IFNT demonstrated statistically\nsignificant accuracy improvements, achieving a relative enhancement of 1.2% to\n2.8% in baseline accuracy compared to the neural Transducer. On out-of-domain\ndatasets, IFNT shows relative WER(CER) improvements of up to 30.2% over the\nstandard neural Transducer with shallow fusion, and relative WER(CER)\nreductions ranging from 1.1% to 2.8% on test sets compared to the FNT model.\n","authors":["Junzhe Liu","Jianwei Yu","Xie Chen"],"pdf_url":"https://arxiv.org/pdf/2309.09524v2.pdf","comment":"Interspeech 2024 cameraready"},{"id":"http://arxiv.org/abs/2406.03893v1","updated":"2024-06-06T09:28:08Z","published":"2024-06-06T09:28:08Z","title":"How Good is Zero-Shot MT Evaluation for Low Resource Indian Languages?","summary":"  While machine translation evaluation has been studied primarily for\nhigh-resource languages, there has been a recent interest in evaluation for\nlow-resource languages due to the increasing availability of data and models.\nIn this paper, we focus on a zero-shot evaluation setting focusing on\nlow-resource Indian languages, namely Assamese, Kannada, Maithili, and Punjabi.\nWe collect sufficient Multi-Dimensional Quality Metrics (MQM) and Direct\nAssessment (DA) annotations to create test sets and meta-evaluate a plethora of\nautomatic evaluation metrics. We observe that even for learned metrics, which\nare known to exhibit zero-shot performance, the Kendall Tau and Pearson\ncorrelations with human annotations are only as high as 0.32 and 0.45.\nSynthetic data approaches show mixed results and overall do not help close the\ngap by much for these languages. This indicates that there is still a long way\nto go for low-resource evaluation.\n","authors":["Anushka Singh","Ananya B. Sai","Raj Dabre","Ratish Puduppully","Anoop Kunchukuttan","Mitesh M Khapra"],"pdf_url":"https://arxiv.org/pdf/2406.03893v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.08800v2","updated":"2024-06-06T09:27:29Z","published":"2023-12-14T10:35:13Z","title":"Evaluating Large Language Models for Health-related Queries with\n  Presuppositions","summary":"  As corporations rush to integrate large language models (LLMs) to their\nsearch offerings, it is critical that they provide factually accurate\ninformation that is robust to any presuppositions that a user may express. In\nthis work, we introduce UPHILL, a dataset consisting of health-related queries\nwith varying degrees of presuppositions. Using UPHILL, we evaluate the factual\naccuracy and consistency of InstructGPT, ChatGPT, and BingChat models. We find\nthat while model responses rarely disagree with true health claims (posed as\nquestions), they often fail to challenge false claims: responses from\nInstructGPT agree with 32% of the false claims, ChatGPT 26% and BingChat 23%.\nAs we increase the extent of presupposition in input queries, the responses\nfrom InstructGPT and ChatGPT agree with the claim considerably more often,\nregardless of its veracity. Responses from BingChat, which rely on retrieved\nwebpages, are not as susceptible. Given the moderate factual accuracy, and the\ninability of models to consistently correct false assumptions, our work calls\nfor a careful assessment of current LLMs for use in high-stakes scenarios.\n","authors":["Navreet Kaur","Monojit Choudhury","Danish Pruthi"],"pdf_url":"https://arxiv.org/pdf/2312.08800v2.pdf","comment":"Findings of ACL 2024"},{"id":"http://arxiv.org/abs/2403.19260v2","updated":"2024-06-06T09:21:34Z","published":"2024-03-28T09:34:31Z","title":"NaijaHate: Evaluating Hate Speech Detection on Nigerian Twitter Using\n  Representative Data","summary":"  To address the global issue of online hate, hate speech detection (HSD)\nsystems are typically developed on datasets from the United States, thereby\nfailing to generalize to English dialects from the Majority World. Furthermore,\nHSD models are often evaluated on non-representative samples, raising concerns\nabout overestimating model performance in real-world settings. In this work, we\nintroduce NaijaHate, the first dataset annotated for HSD which contains a\nrepresentative sample of Nigerian tweets. We demonstrate that HSD evaluated on\nbiased datasets traditionally used in the literature consistently overestimates\nreal-world performance by at least two-fold. We then propose NaijaXLM-T, a\npretrained model tailored to the Nigerian Twitter context, and establish the\nkey role played by domain-adaptive pretraining and finetuning in maximizing HSD\nperformance. Finally, owing to the modest performance of HSD systems in\nreal-world conditions, we find that content moderators would need to review\nabout ten thousand Nigerian tweets flagged as hateful daily to moderate 60% of\nall hateful content, highlighting the challenges of moderating hate speech at\nscale as social media usage continues to grow globally. Taken together, these\nresults pave the way towards robust HSD systems and a better protection of\nsocial media users from hateful content in low-resource settings.\n","authors":["Manuel Tonneau","Pedro Vitor Quinta de Castro","Karim Lasri","Ibrahim Farouq","Lakshminarayanan Subramanian","Victor Orozco-Olvera","Samuel P. Fraiberger"],"pdf_url":"https://arxiv.org/pdf/2403.19260v2.pdf","comment":"ACL 2024 main conference. Data and models available at\n  https://github.com/worldbank/NaijaHate"},{"id":"http://arxiv.org/abs/2406.03882v1","updated":"2024-06-06T09:21:13Z","published":"2024-06-06T09:21:13Z","title":"Spontaneous Speech-Based Suicide Risk Detection Using Whisper and Large\n  Language Models","summary":"  The early detection of suicide risk is important since it enables the\nintervention to prevent potential suicide attempts. This paper studies the\nautomatic detection of suicide risk based on spontaneous speech from\nadolescents, and collects a Mandarin dataset with 15 hours of suicide speech\nfrom more than a thousand adolescents aged from ten to eighteen for our\nexperiments. To leverage the diverse acoustic and linguistic features embedded\nin spontaneous speech, both the Whisper speech model and textual large language\nmodels (LLMs) are used for suicide risk detection. Both all-parameter\nfinetuning and parameter-efficient finetuning approaches are used to adapt the\npre-trained models for suicide risk detection, and multiple audio-text fusion\napproaches are evaluated to combine the representations of Whisper and the LLM.\nThe proposed system achieves a detection accuracy of 0.807 and an F1-score of\n0.846 on the test set with 119 subjects, indicating promising potential for\nreal suicide risk detection applications.\n","authors":["Ziyun Cui","Chang Lei","Wen Wu","Yinan Duan","Diyang Qu","Ji Wu","Runsen Chen","Chao Zhang"],"pdf_url":"https://arxiv.org/pdf/2406.03882v1.pdf","comment":"Accepted by Interspeech 2024"},{"id":"http://arxiv.org/abs/2406.03881v1","updated":"2024-06-06T09:18:42Z","published":"2024-06-06T09:18:42Z","title":"Evaluating the IWSLT2023 Speech Translation Tasks: Human Annotations,\n  Automatic Metrics, and Segmentation","summary":"  Human evaluation is a critical component in machine translation system\ndevelopment and has received much attention in text translation research.\nHowever, little prior work exists on the topic of human evaluation for speech\ntranslation, which adds additional challenges such as noisy data and\nsegmentation mismatches. We take first steps to fill this gap by conducting a\ncomprehensive human evaluation of the results of several shared tasks from the\nlast International Workshop on Spoken Language Translation (IWSLT 2023). We\npropose an effective evaluation strategy based on automatic resegmentation and\ndirect assessment with segment context. Our analysis revealed that: 1) the\nproposed evaluation strategy is robust and scores well-correlated with other\ntypes of human judgements; 2) automatic metrics are usually, but not always,\nwell-correlated with direct assessment scores; and 3) COMET as a slightly\nstronger automatic metric than chrF, despite the segmentation noise introduced\nby the resegmentation step systems. We release the collected human-annotated\ndata in order to encourage further investigation.\n","authors":["Matthias Sperber","Ondřej Bojar","Barry Haddow","Dávid Javorský","Xutai Ma","Matteo Negri","Jan Niehues","Peter Polák","Elizabeth Salesky","Katsuhito Sudoh","Marco Turchi"],"pdf_url":"https://arxiv.org/pdf/2406.03881v1.pdf","comment":"LREC-COLING2024 publication (with corrections for Table 3)"},{"id":"http://arxiv.org/abs/2406.03878v1","updated":"2024-06-06T09:13:13Z","published":"2024-06-06T09:13:13Z","title":"Decoder-only Streaming Transformer for Simultaneous Translation","summary":"  Simultaneous Machine Translation (SiMT) generates translation while reading\nsource tokens, essentially producing the target prefix based on the source\nprefix. To achieve good performance, it leverages the relationship between\nsource and target prefixes to exact a policy to guide the generation of\ntranslations. Although existing SiMT methods primarily focus on the\nEncoder-Decoder architecture, we explore the potential of Decoder-only\narchitecture, owing to its superior performance in various tasks and its\ninherent compatibility with SiMT. However, directly applying the Decoder-only\narchitecture to SiMT poses challenges in terms of training and inference. To\nalleviate the above problems, we propose the first Decoder-only SiMT model,\nnamed Decoder-only Streaming Transformer (DST). Specifically, DST separately\nencodes the positions of the source and target prefixes, ensuring that the\nposition of the target prefix remains unaffected by the expansion of the source\nprefix. Furthermore, we propose a Streaming Self-Attention (SSA) mechanism\ntailored for the Decoder-only architecture. It is capable of obtaining\ntranslation policy by assessing the sufficiency of input source information and\nintegrating with the soft-attention mechanism to generate translations.\nExperiments demonstrate that our approach achieves state-of-the-art performance\non three translation tasks.\n","authors":["Shoutao Guo","Shaolei Zhang","Yang Feng"],"pdf_url":"https://arxiv.org/pdf/2406.03878v1.pdf","comment":"Accepted to ACL 2024. 14 pages, 10 Tables, 5 Figures"},{"id":"http://arxiv.org/abs/2406.03872v1","updated":"2024-06-06T09:02:31Z","published":"2024-06-06T09:02:31Z","title":"BLSP-Emo: Towards Empathetic Large Speech-Language Models","summary":"  The recent release of GPT-4o showcased the potential of end-to-end multimodal\nmodels, not just in terms of low latency but also in their ability to\nunderstand and generate expressive speech with rich emotions. While the details\nare unknown to the open research community, it likely involves significant\namounts of curated data and compute, neither of which is readily accessible. In\nthis paper, we present BLSP-Emo (Bootstrapped Language-Speech Pretraining with\nEmotion support), a novel approach to developing an end-to-end speech-language\nmodel capable of understanding both semantics and emotions in speech and\ngenerate empathetic responses. BLSP-Emo utilizes existing speech recognition\n(ASR) and speech emotion recognition (SER) datasets through a two-stage\nprocess. The first stage focuses on semantic alignment, following recent work\non pretraining speech-language models using ASR data. The second stage performs\nemotion alignment with the pretrained speech-language model on an emotion-aware\ncontinuation task constructed from SER data. Our experiments demonstrate that\nthe BLSP-Emo model excels in comprehending speech and delivering empathetic\nresponses, both in instruction-following tasks and conversations.\n","authors":["Chen Wang","Minpeng Liao","Zhongqiang Huang","Junhong Wu","Chengqing Zong","Jiajun Zhang"],"pdf_url":"https://arxiv.org/pdf/2406.03872v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.03869v1","updated":"2024-06-06T08:58:14Z","published":"2024-06-06T08:58:14Z","title":"Recovering document annotations for sentence-level bitext","summary":"  Data availability limits the scope of any given task. In machine translation,\nhistorical models were incapable of handling longer contexts, so the lack of\ndocument-level datasets was less noticeable. Now, despite the emergence of\nlong-sequence methods, we remain within a sentence-level paradigm and without\ndata to adequately approach context-aware machine translation. Most large-scale\ndatasets have been processed through a pipeline that discards document-level\nmetadata. In this work, we reconstruct document-level information for three\n(ParaCrawl, News Commentary, and Europarl) large datasets in German, French,\nSpanish, Italian, Polish, and Portuguese (paired with English). We then\nintroduce a document-level filtering technique as an alternative to traditional\nbitext filtering. We present this filtering with analysis to show that this\nmethod prefers context-consistent translations rather than those that may have\nbeen sentence-level machine translated. Last we train models on these longer\ncontexts and demonstrate improvement in document-level translation without\ndegradation of sentence-level translation. We release our dataset, ParaDocs,\nand resulting models as a resource to the community.\n","authors":["Rachel Wicks","Matt Post","Philipp Koehn"],"pdf_url":"https://arxiv.org/pdf/2406.03869v1.pdf","comment":"ACL 2024 Findings"},{"id":"http://arxiv.org/abs/2406.03857v1","updated":"2024-06-06T08:42:36Z","published":"2024-06-06T08:42:36Z","title":"MuJo: Multimodal Joint Feature Space Learning for Human Activity\n  Recognition","summary":"  Human Activity Recognition is a longstanding problem in AI with applications\nin a broad range of areas: from healthcare, sports and fitness, security, and\nhuman computer interaction to robotics. The performance of HAR in real-world\nsettings is strongly dependent on the type and quality of the input signal that\ncan be acquired. Given an unobstructed, high-quality camera view of a scene,\ncomputer vision systems, in particular in conjunction with foundational models\n(e.g., CLIP), can today fairly reliably distinguish complex activities. On the\nother hand, recognition using modalities such as wearable sensors (which are\noften more broadly available, e.g, in mobile phones and smartwatches) is a more\ndifficult problem, as the signals often contain less information and labeled\ntraining data is more difficult to acquire. In this work, we show how we can\nimprove HAR performance across different modalities using multimodal\ncontrastive pretraining. Our approach MuJo (Multimodal Joint Feature Space\nLearning), learns a multimodal joint feature space with video, language, pose,\nand IMU sensor data. The proposed approach combines contrastive and multitask\nlearning methods and analyzes different multitasking strategies for learning a\ncompact shared representation. A large dataset with parallel video, language,\npose, and sensor data points is also introduced to support the research, along\nwith an analysis of the robustness of the multimodal joint space for\nmodal-incomplete and low-resource data. On the MM-Fit dataset, our model\nachieves an impressive Macro F1-Score of up to 0.992 with only 2% of the train\ndata and 0.999 when using all available training data for classification tasks.\nMoreover, in the scenario where the MM-Fit dataset is unseen, we demonstrate a\ngeneralization performance of up to 0.638.\n","authors":["Stefan Gerd Fritsch","Cennet Oguz","Vitor Fortes Rey","Lala Ray","Maximilian Kiefer-Emmanouilidis","Paul Lukowicz"],"pdf_url":"https://arxiv.org/pdf/2406.03857v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.03855v1","updated":"2024-06-06T08:41:46Z","published":"2024-06-06T08:41:46Z","title":"Performance of large language models in numerical vs. semantic medical\n  knowledge: Benchmarking on evidence-based Q&As","summary":"  Clinical problem-solving requires processing of semantic medical knowledge\nsuch as illness scripts and numerical medical knowledge of diagnostic tests for\nevidence-based decision-making. As large language models (LLMs) show promising\nresults in many aspects of language-based clinical practice, their ability to\ngenerate non-language evidence-based answers to clinical questions is\ninherently limited by tokenization. Therefore, we evaluated LLMs' performance\non two question types: numeric (correlating findings) and semantic\n(differentiating entities) while examining differences within and between LLMs\nin medical aspects and comparing their performance to humans. To generate\nstraightforward multi-choice questions and answers (QAs) based on\nevidence-based medicine (EBM), we used a comprehensive medical knowledge graph\n(encompassed data from more than 50,00 peer-reviewed articles) and created the\n\"EBMQA\". EBMQA contains 105,000 QAs labeled with medical and non-medical topics\nand classified into numerical or semantic questions. We benchmarked this\ndataset using more than 24,500 QAs on two state-of-the-art LLMs: Chat-GPT4 and\nClaude3-Opus. We evaluated the LLMs accuracy on semantic and numerical question\ntypes and according to sub-labeled topics. For validation, six medical experts\nwere tested on 100 numerical EBMQA questions. We found that both LLMs excelled\nmore in semantic than numerical QAs, with Claude3 surpassing GPT4 in numerical\nQAs. However, both LLMs showed inter and intra gaps in different medical\naspects and remained inferior to humans. Thus, their medical advice should be\naddressed carefully.\n","authors":["Eden Avnat","Michal Levy","Daniel Herstain","Elia Yanko","Daniel Ben Joya","Michal Tzuchman Katz","Dafna Eshel","Sahar Laros","Yael Dagan","Shahar Barami","Joseph Mermelstein","Shahar Ovadia","Noam Shomron","Varda Shalev","Raja-Elie E. Abdulnour"],"pdf_url":"https://arxiv.org/pdf/2406.03855v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.03853v1","updated":"2024-06-06T08:40:28Z","published":"2024-06-06T08:40:28Z","title":"Speculative Decoding via Early-exiting for Faster LLM Inference with\n  Thompson Sampling Control Mechanism","summary":"  The recent advancements in large language models (LLMs) have been\nextraordinary, yet the escalating inference costs associated with them present\nchallenges in real-world applications. To address these challenges, we propose\na novel approach called Early-exiting Speculative Decoding (EESD) with lossless\nacceleration. Specifically, EESD utilizes a segment of the LLM to generate\ndraft tokens, incorporating Early-exiting structures after the first N layers.\nTo enhance the quality of draft tokens, a self-distillation method is\nintegrated. This early-exiting design not only reduces deployment and training\ncosts but also significantly accelerates the token generation speed. Moreover,\nwe introduce a novel sampling mechanism that leverages Thompson Sampling to\nregulate the generation processes, automatically determining the quantity of\ndraft tokens in each round. The original LLM is then employed to validate these\ndraft tokens through a single forward pass, and thus guarantees that the final\noutput text maintains a distribution consistent with vanilla auto-regressive\ndecoding. The experimental results on both 13B and 70B models demonstrate that\nour approach decodes tokens at a markedly accelerated rate compared to prior\nmethods, showing the effectiveness of our approach.\n","authors":["Jiahao Liu","Qifan Wang","Jingang Wang","Xunliang Cai"],"pdf_url":"https://arxiv.org/pdf/2406.03853v1.pdf","comment":"Accepted by ACL 2024 (Findings)"},{"id":"http://arxiv.org/abs/2406.03170v2","updated":"2024-06-06T08:29:23Z","published":"2024-06-05T12:03:19Z","title":"StatBot.Swiss: Bilingual Open Data Exploration in Natural Language","summary":"  The potential for improvements brought by Large Language Models (LLMs) in\nText-to-SQL systems is mostly assessed on monolingual English datasets.\nHowever, LLMs' performance for other languages remains vastly unexplored. In\nthis work, we release the StatBot.Swiss dataset, the first bilingual benchmark\nfor evaluating Text-to-SQL systems based on real-world applications. The\nStatBot.Swiss dataset contains 455 natural language/SQL-pairs over 35 big\ndatabases with varying level of complexity for both English and German.\n  We evaluate the performance of state-of-the-art LLMs such as GPT-3.5-Turbo\nand mixtral-8x7b-instruct for the Text-to-SQL translation task using an\nin-context learning approach. Our experimental analysis illustrates that\ncurrent LLMs struggle to generalize well in generating SQL queries on our novel\nbilingual dataset.\n","authors":["Farhad Nooralahzadeh","Yi Zhang","Ellery Smith","Sabine Maennel","Cyril Matthey-Doret","Raphaël de Fondville","Kurt Stockinger"],"pdf_url":"https://arxiv.org/pdf/2406.03170v2.pdf","comment":"This work is accepted at ACL Findings 2024"},{"id":"http://arxiv.org/abs/2406.03847v1","updated":"2024-06-06T08:25:43Z","published":"2024-06-06T08:25:43Z","title":"Lean Workbook: A large-scale Lean problem set formalized from natural\n  language math problems","summary":"  Large language models have demonstrated impressive capabilities across\nvarious natural language processing tasks, especially in solving mathematical\nproblems. However, large language models are not good at math theorem proving\nusing formal languages like Lean. A significant challenge in this area is the\nscarcity of training data available in these formal languages. To address this\nissue, we propose a novel pipeline that iteratively generates and filters\nsynthetic data to translate natural language mathematical problems into Lean 4\nstatements, and vice versa. Our results indicate that the synthetic data\npipeline can provide useful training data and improve the performance of LLMs\nin translating and understanding complex mathematical problems and proofs. Our\nfinal dataset contains about 57K formal-informal question pairs along with\nsearched proof from the math contest forum and 21 new IMO questions. We\nopen-source our code at https://github.com/InternLM/InternLM-Math and our data\nat https://huggingface.co/datasets/InternLM/Lean-Workbook.\n","authors":["Huaiyuan Ying","Zijian Wu","Yihan Geng","Jiayu Wang","Dahua Lin","Kai Chen"],"pdf_url":"https://arxiv.org/pdf/2406.03847v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.03167v3","updated":"2024-06-06T08:22:16Z","published":"2024-03-05T18:01:59Z","title":"PARADISE: Evaluating Implicit Planning Skills of Language Models with\n  Procedural Warnings and Tips Dataset","summary":"  Recently, there has been growing interest within the community regarding\nwhether large language models are capable of planning or executing plans.\nHowever, most prior studies use LLMs to generate high-level plans for\nsimplified scenarios lacking linguistic complexity and domain diversity,\nlimiting analysis of their planning abilities. These setups constrain\nevaluation methods (e.g., predefined action space), architectural choices\n(e.g., only generative models), and overlook the linguistic nuances essential\nfor realistic analysis. To tackle this, we present PARADISE, an abductive\nreasoning task using Q\\&A format on practical procedural text sourced from\nwikiHow. It involves warning and tip inference tasks directly associated with\ngoals, excluding intermediary steps, with the aim of testing the ability of the\nmodels to infer implicit knowledge of the plan solely from the given goal. Our\nexperiments, utilizing fine-tuned language models and zero-shot prompting,\nreveal the effectiveness of task-specific small models over large language\nmodels in most scenarios. Despite advancements, all models fall short of human\nperformance. Notably, our analysis uncovers intriguing insights, such as\nvariations in model behavior with dropped keywords, struggles of BERT-family\nand GPT-4 with physical and abstract goals, and the proposed tasks offering\nvaluable prior knowledge for other unseen procedural tasks. The PARADISE\ndataset and associated resources are publicly available for further research\nexploration with https://github.com/GGLAB-KU/paradise.\n","authors":["Arda Uzunoglu","Abdalfatah Rashid Safa","Gözde Gül Şahin"],"pdf_url":"https://arxiv.org/pdf/2403.03167v3.pdf","comment":"9 pages, ACL 2024 Findings"},{"id":"http://arxiv.org/abs/2404.15522v2","updated":"2024-06-06T08:15:54Z","published":"2024-04-23T21:08:49Z","title":"LogicBench: Towards Systematic Evaluation of Logical Reasoning Ability\n  of Large Language Models","summary":"  Recently developed large language models (LLMs) have been shown to perform\nremarkably well on a wide range of language understanding tasks. But, can they\nreally \"reason\" over the natural language? This question has been receiving\nsignificant research attention and many reasoning skills such as commonsense,\nnumerical, and qualitative have been studied. However, the crucial skill\npertaining to 'logical reasoning' has remained underexplored. Existing work\ninvestigating this reasoning ability of LLMs has focused only on a couple of\ninference rules (such as modus ponens and modus tollens) of propositional and\nfirst-order logic. Addressing the above limitation, we comprehensively evaluate\nthe logical reasoning ability of LLMs on 25 different reasoning patterns\nspanning over propositional, first-order, and non-monotonic logics. To enable\nsystematic evaluation, we introduce LogicBench, a natural language\nquestion-answering dataset focusing on the use of a single inference rule. We\nconduct detailed analysis with a range of LLMs such as GPT-4, ChatGPT, Gemini,\nLlama-2, and Mistral using chain-of-thought prompting. Experimental results\nshow that existing LLMs do not fare well on LogicBench; especially, they\nstruggle with instances involving complex reasoning and negations. Furthermore,\nthey sometimes overlook contextual information necessary for reasoning to\narrive at the correct conclusion. We believe that our work and findings\nfacilitate future research for evaluating and enhancing the logical reasoning\nability of LLMs. Data and code are available at\nhttps://github.com/Mihir3009/LogicBench.\n","authors":["Mihir Parmar","Nisarg Patel","Neeraj Varshney","Mutsumi Nakamura","Man Luo","Santosh Mashetty","Arindam Mitra","Chitta Baral"],"pdf_url":"https://arxiv.org/pdf/2404.15522v2.pdf","comment":"Accepted at ACL(Main) 2024 | First version available @\n  https://openreview.net/forum?id=7NR2ZVzZxx"},{"id":"http://arxiv.org/abs/2402.16438v2","updated":"2024-06-06T08:14:46Z","published":"2024-02-26T09:36:05Z","title":"Language-Specific Neurons: The Key to Multilingual Capabilities in Large\n  Language Models","summary":"  Large language models (LLMs) demonstrate remarkable multilingual capabilities\nwithout being pre-trained on specially curated multilingual parallel corpora.\nIt remains a challenging problem to explain the underlying mechanisms by which\nLLMs process multilingual texts. In this paper, we delve into the composition\nof Transformer architectures in LLMs to pinpoint language-specific regions.\nSpecially, we propose a novel detection method, language activation probability\nentropy (LAPE), to identify language-specific neurons within LLMs. Based on\nLAPE, we conduct comprehensive experiments on several representative LLMs, such\nas LLaMA-2, BLOOM, and Mistral. Our findings indicate that LLMs' proficiency in\nprocessing a particular language is predominantly due to a small subset of\nneurons, primarily situated in the models' top and bottom layers. Furthermore,\nwe showcase the feasibility to \"steer\" the output language of LLMs by\nselectively activating or deactivating language-specific neurons. Our research\nprovides important evidence to the understanding and exploration of the\nmultilingual capabilities of LLMs.\n","authors":["Tianyi Tang","Wenyang Luo","Haoyang Huang","Dongdong Zhang","Xiaolei Wang","Xin Zhao","Furu Wei","Ji-Rong Wen"],"pdf_url":"https://arxiv.org/pdf/2402.16438v2.pdf","comment":"Accepted by ACL 2024"},{"id":"http://arxiv.org/abs/2406.03827v1","updated":"2024-06-06T08:03:05Z","published":"2024-06-06T08:03:05Z","title":"Chaos with Keywords: Exposing Large Language Models Sycophancy to\n  Misleading Keywords and Evaluating Defense Strategies","summary":"  This study explores the sycophantic tendencies of Large Language Models\n(LLMs), where these models tend to provide answers that match what users want\nto hear, even if they are not entirely correct. The motivation behind this\nexploration stems from the common behavior observed in individuals searching\nthe internet for facts with partial or misleading knowledge. Similar to using\nweb search engines, users may recall fragments of misleading keywords and\nsubmit them to an LLM, hoping for a comprehensive response. Our empirical\nanalysis of several LLMs shows the potential danger of these models amplifying\nmisinformation when presented with misleading keywords. Additionally, we\nthoroughly assess four existing hallucination mitigation strategies to reduce\nLLMs sycophantic behavior. Our experiments demonstrate the effectiveness of\nthese strategies for generating factually correct statements. Furthermore, our\nanalyses delve into knowledge-probing experiments on factual keywords and\ndifferent categories of sycophancy mitigation.\n","authors":["Aswin RRV","Nemika Tyagi","Md Nayem Uddin","Neeraj Varshney","Chitta Baral"],"pdf_url":"https://arxiv.org/pdf/2406.03827v1.pdf","comment":"To be published in Findings of ACL 2024"},{"id":"http://arxiv.org/abs/2401.06568v2","updated":"2024-06-06T07:51:16Z","published":"2024-01-12T13:23:21Z","title":"Lost in the Source Language: How Large Language Models Evaluate the\n  Quality of Machine Translation","summary":"  This study investigates how Large Language Models (LLMs) leverage source and\nreference data in machine translation evaluation task, aiming to better\nunderstand the mechanisms behind their remarkable performance in this task. We\ndesign the controlled experiments across various input modes and model types,\nand employ both coarse-grained and fine-grained prompts to discern the utility\nof source versus reference information. We find that reference information\nsignificantly enhances the evaluation accuracy, while surprisingly, source\ninformation sometimes is counterproductive, indicating LLMs' inability to fully\nleverage the cross-lingual capability when evaluating translations. Further\nanalysis of the fine-grained evaluation and fine-tuning experiments show\nsimilar results. These findings also suggest a potential research direction for\nLLMs that fully exploits the cross-lingual capability of LLMs to achieve better\nperformance in machine translation evaluation tasks.\n","authors":["Xu Huang","Zhirui Zhang","Xiang Geng","Yichao Du","Jiajun Chen","Shujian Huang"],"pdf_url":"https://arxiv.org/pdf/2401.06568v2.pdf","comment":"Accepted by ACL2024 Findings"},{"id":"http://arxiv.org/abs/2402.17447v2","updated":"2024-06-06T07:41:21Z","published":"2024-02-27T12:03:56Z","title":"Deep Learning Based Named Entity Recognition Models for Recipes","summary":"  Food touches our lives through various endeavors, including flavor,\nnourishment, health, and sustainability. Recipes are cultural capsules\ntransmitted across generations via unstructured text. Automated protocols for\nrecognizing named entities, the building blocks of recipe text, are of immense\nvalue for various applications ranging from information extraction to novel\nrecipe generation. Named entity recognition is a technique for extracting\ninformation from unstructured or semi-structured data with known labels.\nStarting with manually-annotated data of 6,611 ingredient phrases, we created\nan augmented dataset of 26,445 phrases cumulatively. Simultaneously, we\nsystematically cleaned and analyzed ingredient phrases from RecipeDB, the\ngold-standard recipe data repository, and annotated them using the Stanford\nNER. Based on the analysis, we sampled a subset of 88,526 phrases using a\nclustering-based approach while preserving the diversity to create the\nmachine-annotated dataset. A thorough investigation of NER approaches on these\nthree datasets involving statistical, fine-tuning of deep learning-based\nlanguage models and few-shot prompting on large language models (LLMs) provides\ndeep insights. We conclude that few-shot prompting on LLMs has abysmal\nperformance, whereas the fine-tuned spaCy-transformer emerges as the best model\nwith macro-F1 scores of 95.9%, 96.04%, and 95.71% for the manually-annotated,\naugmented, and machine-annotated datasets, respectively.\n","authors":["Mansi Goel","Ayush Agarwal","Shubham Agrawal","Janak Kapuriya","Akhil Vamshi Konam","Rishabh Gupta","Shrey Rastogi"," Niharika","Ganesh Bagler"],"pdf_url":"https://arxiv.org/pdf/2402.17447v2.pdf","comment":"13 pages, 6 main figures and 2 in appendices, and 3 main tables;\n  Accepted for publication in LREC-COLING 2024"},{"id":"http://arxiv.org/abs/2405.09482v2","updated":"2024-06-06T07:40:58Z","published":"2024-05-15T16:22:16Z","title":"Beyond Flesch-Kincaid: Prompt-based Metrics Improve Difficulty\n  Classification of Educational Texts","summary":"  Using large language models (LLMs) for educational applications like\ndialogue-based teaching is a hot topic. Effective teaching, however, requires\nteachers to adapt the difficulty of content and explanations to the education\nlevel of their students. Even the best LLMs today struggle to do this well. If\nwe want to improve LLMs on this adaptation task, we need to be able to measure\nadaptation success reliably. However, current Static metrics for text\ndifficulty, like the Flesch-Kincaid Reading Ease score, are known to be crude\nand brittle. We, therefore, introduce and evaluate a new set of Prompt-based\nmetrics for text difficulty. Based on a user study, we create Prompt-based\nmetrics as inputs for LLMs. They leverage LLM's general language understanding\ncapabilities to capture more abstract and complex features than Static metrics.\nRegression experiments show that adding our Prompt-based metrics significantly\nimproves text difficulty classification over Static metrics alone. Our results\ndemonstrate the promise of using LLMs to evaluate text adaptation to different\neducation levels.\n","authors":["Donya Rooein","Paul Rottger","Anastassia Shaitarova","Dirk Hovy"],"pdf_url":"https://arxiv.org/pdf/2405.09482v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.03816v1","updated":"2024-06-06T07:40:00Z","published":"2024-06-06T07:40:00Z","title":"ReST-MCTS*: LLM Self-Training via Process Reward Guided Tree Search","summary":"  Recent methodologies in LLM self-training mostly rely on LLM generating\nresponses and filtering those with correct output answers as training data.\nThis approach often yields a low-quality fine-tuning training set (e.g.,\nincorrect plans or intermediate reasoning). In this paper, we develop a\nreinforced self-training approach, called ReST-MCTS*, based on integrating\nprocess reward guidance with tree search MCTS* for collecting higher-quality\nreasoning traces as well as per-step value to train policy and reward models.\nReST-MCTS* circumvents the per-step manual annotation typically used to train\nprocess rewards by tree-search-based reinforcement learning: Given oracle final\ncorrect answers, ReST-MCTS* is able to infer the correct process rewards by\nestimating the probability this step can help lead to the correct answer. These\ninferred rewards serve dual purposes: they act as value targets for further\nrefining the process reward model and also facilitate the selection of\nhigh-quality traces for policy model self-training. We first show that the\ntree-search policy in ReST-MCTS* achieves higher accuracy compared with prior\nLLM reasoning baselines such as Best-of-N and Tree-of-Thought, within the same\nsearch budget. We then show that by using traces searched by this tree-search\npolicy as training data, we can continuously enhance the three language models\nfor multiple iterations, and outperform other self-training algorithms such as\nReST$^\\text{EM}$ and Self-Rewarding LM.\n","authors":["Dan Zhang","Sining Zhoubian","Yisong Yue","Yuxiao Dong","Jie Tang"],"pdf_url":"https://arxiv.org/pdf/2406.03816v1.pdf","comment":"29 pages"},{"id":"http://arxiv.org/abs/2406.03814v1","updated":"2024-06-06T07:39:17Z","published":"2024-06-06T07:39:17Z","title":"Improving Zero-Shot Chinese-English Code-Switching ASR with kNN-CTC and\n  Gated Monolingual Datastores","summary":"  The kNN-CTC model has proven to be effective for monolingual automatic speech\nrecognition (ASR). However, its direct application to multilingual scenarios\nlike code-switching, presents challenges. Although there is potential for\nperformance improvement, a kNN-CTC model utilizing a single bilingual datastore\ncan inadvertently introduce undesirable noise from the alternative language. To\naddress this, we propose a novel kNN-CTC-based code-switching ASR (CS-ASR)\nframework that employs dual monolingual datastores and a gated datastore\nselection mechanism to reduce noise interference. Our method selects the\nappropriate datastore for decoding each frame, ensuring the injection of\nlanguage-specific information into the ASR process. We apply this framework to\ncutting-edge CTC-based models, developing an advanced CS-ASR system. Extensive\nexperiments demonstrate the remarkable effectiveness of our gated datastore\nmechanism in enhancing the performance of zero-shot Chinese-English CS-ASR.\n","authors":["Jiaming Zhou","Shiwan Zhao","Hui Wang","Tian-Hao Zhang","Haoqin Sun","Xuechen Wang","Yong Qin"],"pdf_url":"https://arxiv.org/pdf/2406.03814v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.00301v2","updated":"2024-06-06T07:32:15Z","published":"2024-05-01T03:50:09Z","title":"Enhanced Language Model Truthfulness with Learnable Intervention and\n  Uncertainty Expression","summary":"  Large language models (LLMs) can generate long-form and coherent text, yet\nthey often hallucinate facts, which undermines their reliability. To mitigate\nthis issue, inference-time methods steer LLM representations toward the\n\"truthful directions\" previously learned for truth elicitation. However,\napplying these truthful directions with the same intensity fails to generalize\nacross different query contexts. We propose LITO, a Learnable Intervention\nmethod for Truthfulness Optimization that automatically identifies the optimal\nintervention intensity tailored to each specific context. LITO explores a\nsequence of model generations based on increasing levels of intervention\nintensities. It selects the most accurate response or refuses to answer when\nthe predictions are highly uncertain. Experiments on multiple LLMs and\nquestion-answering datasets demonstrate that LITO improves truthfulness while\npreserving task accuracy. The adaptive nature of LITO counters the limitations\nof one-size-fits-all intervention methods, maximizing truthfulness by\nreflecting the model's internal knowledge only when it is confident. Our code\nis available at https://github.com/launchnlp/LITO.\n","authors":["Farima Fatahi Bayat","Xin Liu","H. V. Jagadish","Lu Wang"],"pdf_url":"https://arxiv.org/pdf/2405.00301v2.pdf","comment":"13 pages, 5 figures"},{"id":"http://arxiv.org/abs/2403.01165v2","updated":"2024-06-06T07:31:39Z","published":"2024-03-02T10:38:10Z","title":"STAR: Constraint LoRA with Dynamic Active Learning for Data-Efficient\n  Fine-Tuning of Large Language Models","summary":"  Though Large Language Models (LLMs) have demonstrated the powerful\ncapabilities of few-shot learning through prompting methods, supervised\ntraining is still necessary for complex reasoning tasks. Because of their\nextensive parameters and memory consumption, both Parameter-Efficient\nFine-Tuning (PEFT) methods and Memory-Efficient Fine-Tuning methods have been\nproposed for LLMs. Nevertheless, the issue of large annotated data consumption,\nthe aim of Data-Efficient Fine-Tuning, remains unexplored. One obvious way is\nto combine the PEFT method with active learning. However, the experimental\nresults show that such a combination is not trivial and yields inferior\nresults. Through probe experiments, such observation might be explained by two\nmain reasons: uncertainty gap and poor model calibration. Therefore, in this\npaper, we propose a novel approach to effectively integrate uncertainty-based\nactive learning and LoRA. Specifically, for the uncertainty gap, we introduce a\ndynamic uncertainty measurement that combines the uncertainty of the base model\nand the uncertainty of the full model during the iteration of active learning.\nFor poor model calibration, we incorporate the regularization method during\nLoRA training to keep the model from being over-confident, and the Monte-Carlo\ndropout mechanism is employed to enhance the uncertainty estimation.\nExperimental results show that the proposed approach outperforms existing\nbaseline models on three complex reasoning tasks.\n","authors":["Linhai Zhang","Jialong Wu","Deyu Zhou","Guoqiang Xu"],"pdf_url":"https://arxiv.org/pdf/2403.01165v2.pdf","comment":"Accepted by ACL2024(Findings)"},{"id":"http://arxiv.org/abs/2402.18158v2","updated":"2024-06-06T07:30:44Z","published":"2024-02-28T08:43:05Z","title":"Evaluating Quantized Large Language Models","summary":"  Post-training quantization (PTQ) has emerged as a promising technique to\nreduce the cost of large language models (LLMs). Specifically, PTQ can\neffectively mitigate memory consumption and reduce computational overhead in\nLLMs. To meet the requirements of both high efficiency and performance across\ndiverse scenarios, a comprehensive evaluation of quantized LLMs is essential to\nguide the selection of quantization methods. This paper presents a thorough\nevaluation of these factors by evaluating the effect of PTQ on Weight,\nActivation, and KV Cache on 11 model families, including OPT, LLaMA2, Falcon,\nBloomz, Mistral, ChatGLM, Vicuna, LongChat, StableLM, Gemma, and Mamba, with\nparameters ranging from 125M to 180B. The evaluation encompasses five types of\ntasks: basic NLP, emergent ability, trustworthiness, dialogue, and long-context\ntasks. Moreover, we also evaluate the state-of-the-art (SOTA) quantization\nmethods to demonstrate their applicability. Based on the extensive experiments,\nwe systematically summarize the effect of quantization, provide recommendations\nto apply quantization techniques, and point out future directions. The code can\nbe found in https://github.com/thu-nics/qllm-eval.\n","authors":["Shiyao Li","Xuefei Ning","Luning Wang","Tengxuan Liu","Xiangsheng Shi","Shengen Yan","Guohao Dai","Huazhong Yang","Yu Wang"],"pdf_url":"https://arxiv.org/pdf/2402.18158v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.03807v1","updated":"2024-06-06T07:30:14Z","published":"2024-06-06T07:30:14Z","title":"Tool-Planner: Dynamic Solution Tree Planning for Large Language Model\n  with Tool Clustering","summary":"  Large language models (LLMs) have demonstrated exceptional reasoning\ncapabilities, enabling them to solve various complex problems. Recently, this\nability has been applied to the paradigm of tool learning. Tool learning\ninvolves providing examples of tool usage and their corresponding functions,\nallowing LLMs to formulate plans and demonstrate the process of invoking and\nexecuting each tool. LLMs can address tasks that they cannot complete\nindependently, thereby enhancing their potential across different tasks.\nHowever, this approach faces two key challenges. First, redundant error\ncorrection leads to unstable planning and long execution time. Additionally,\ndesigning a correct plan among multiple tools is also a challenge in tool\nlearning. To address these issues, we propose Tool-Planner, a task-processing\nframework based on toolkits. Tool-Planner groups tools based on the API\nfunctions with the same function into a toolkit and allows LLMs to implement\nplanning across the various toolkits. When a tool error occurs, the language\nmodel can reselect and adjust tools based on the toolkit. Experiments show that\nour approach demonstrates a high pass and win rate across different datasets\nand optimizes the planning scheme for tool learning in models such as GPT-4 and\nClaude 3, showcasing the potential of our method.\n","authors":["Yanming Liu","Xinyue Peng","Yuwei Zhang","Jiannan Cao","Xuhong Zhang","Sheng Cheng","Xun Wang","Jianwei Yin","Tianyu Du"],"pdf_url":"https://arxiv.org/pdf/2406.03807v1.pdf","comment":"46pages first version"},{"id":"http://arxiv.org/abs/2403.01166v2","updated":"2024-06-06T07:27:33Z","published":"2024-03-02T10:38:31Z","title":"DINER: Debiasing Aspect-based Sentiment Analysis with Multi-variable\n  Causal Inference","summary":"  Though notable progress has been made, neural-based aspect-based sentiment\nanalysis (ABSA) models are prone to learn spurious correlations from annotation\nbiases, resulting in poor robustness on adversarial data transformations. Among\nthe debiasing solutions, causal inference-based methods have attracted much\nresearch attention, which can be mainly categorized into causal intervention\nmethods and counterfactual reasoning methods. However, most of the present\ndebiasing methods focus on single-variable causal inference, which is not\nsuitable for ABSA with two input variables (the target aspect and the review).\nIn this paper, we propose a novel framework based on multi-variable causal\ninference for debiasing ABSA. In this framework, different types of biases are\ntackled based on different causal intervention methods. For the review branch,\nthe bias is modeled as indirect confounding from context, where backdoor\nadjustment intervention is employed for debiasing. For the aspect branch, the\nbias is described as a direct correlation with labels, where counterfactual\nreasoning is adopted for debiasing. Extensive experiments demonstrate the\neffectiveness of the proposed method compared to various baselines on the two\nwidely used real-world aspect robustness test set datasets.\n","authors":["Jialong Wu","Linhai Zhang","Deyu Zhou","Guoqiang Xu"],"pdf_url":"https://arxiv.org/pdf/2403.01166v2.pdf","comment":"Accepted by ACL2024(Findings)"},{"id":"http://arxiv.org/abs/2406.03792v1","updated":"2024-06-06T07:03:29Z","published":"2024-06-06T07:03:29Z","title":"Light-PEFT: Lightening Parameter-Efficient Fine-Tuning via Early Pruning","summary":"  Parameter-efficient fine-tuning (PEFT) has emerged as the predominant\ntechnique for fine-tuning in the era of large language models. However,\nexisting PEFT methods still have inadequate training efficiency. Firstly, the\nutilization of large-scale foundation models during the training process is\nexcessively redundant for certain fine-tuning tasks. Secondly, as the model\nsize increases, the growth in trainable parameters of empirically added PEFT\nmodules becomes non-negligible and redundant, leading to inefficiency. To\nachieve task-specific efficient fine-tuning, we propose the Light-PEFT\nframework, which includes two methods: Masked Early Pruning of the Foundation\nModel and Multi-Granularity Early Pruning of PEFT. The Light-PEFT framework\nallows for the simultaneous estimation of redundant parameters in both the\nfoundation model and PEFT modules during the early stage of training. These\nparameters can then be pruned for more efficient fine-tuning. We validate our\napproach on GLUE, SuperGLUE, QA tasks, and various models. With Light-PEFT,\nparameters of the foundation model can be pruned by up to over 40%, while still\ncontrolling trainable parameters to be only 25% of the original PEFT method.\nCompared to utilizing the PEFT method directly, Light-PEFT achieves training\nand inference speedup, reduces memory usage, and maintains comparable\nperformance and the plug-and-play feature of PEFT.\n","authors":["Naibin Gu","Peng Fu","Xiyu Liu","Bowen Shen","Zheng Lin","Weiping Wang"],"pdf_url":"https://arxiv.org/pdf/2406.03792v1.pdf","comment":"Findings of ACL 2024"},{"id":"http://arxiv.org/abs/2406.03790v1","updated":"2024-06-06T07:01:50Z","published":"2024-06-06T07:01:50Z","title":"End-to-End Trainable Soft Retriever for Low-resource Relation Extraction","summary":"  This study addresses a crucial challenge in instance-based relation\nextraction using text generation models: end-to-end training in target relation\nextraction task is not applicable to retrievers due to the non-differentiable\nnature of instance selection. We propose a novel End-to-end TRAinable Soft\nK-nearest neighbor retriever (ETRASK) by the neural prompting method that\nutilizes a soft, differentiable selection of the $k$ nearest instances. This\napproach enables the end-to-end training of retrievers in target tasks. On the\nTACRED benchmark dataset with a low-resource setting where the training data\nwas reduced to 10\\%, our method achieved a state-of-the-art F1 score of 71.5\\%.\nMoreover, ETRASK consistently improved the baseline model by adding instances\nfor all settings. These results highlight the efficacy of our approach in\nenhancing relation extraction performance, especially in resource-constrained\nenvironments. Our findings offer a promising direction for future research with\nextraction and the broader application of text generation in natural language\nprocessing.\n","authors":["Kohei Makino","Makoto Miwa","Yutaka Sasaki"],"pdf_url":"https://arxiv.org/pdf/2406.03790v1.pdf","comment":"preprint"},{"id":"http://arxiv.org/abs/2308.12568v2","updated":"2024-06-06T06:59:43Z","published":"2023-08-24T05:15:43Z","title":"A Small and Fast BERT for Chinese Medical Punctuation Restoration","summary":"  In clinical dictation, utterances after automatic speech recognition (ASR)\nwithout explicit punctuation marks may lead to the misunderstanding of dictated\nreports. To give a precise and understandable clinical report with ASR,\nautomatic punctuation restoration is required. Considering a practical\nscenario, we propose a fast and light pre-trained model for Chinese medical\npunctuation restoration based on 'pretraining and fine-tuning' paradigm. In\nthis work, we distill pre-trained models by incorporating supervised\ncontrastive learning and a novel auxiliary pre-training task (Punctuation Mark\nPrediction) to make it well-suited for punctuation restoration. Our experiments\non various distilled models reveal that our model can achieve 95% performance\nwhile 10% model size relative to state-of-the-art Chinese RoBERTa.\n","authors":["Tongtao Ling","Yutao Lai","Lei Chen","Shilei Huang","Yi Liu"],"pdf_url":"https://arxiv.org/pdf/2308.12568v2.pdf","comment":"5 pages, 2 figures, Accepted by INTERSPEECH 2024"},{"id":"http://arxiv.org/abs/2406.03452v2","updated":"2024-06-06T06:53:36Z","published":"2024-06-05T16:52:21Z","title":"Using Synchronic Definitions and Semantic Relations to Classify Semantic\n  Change Types","summary":"  There is abundant evidence of the fact that the way words change their\nmeaning can be classified in different types of change, highlighting the\nrelationship between the old and new meanings (among which generalization,\nspecialization and co-hyponymy transfer). In this paper, we present a way of\ndetecting these types of change by constructing a model that leverages\ninformation both from synchronic lexical relations and definitions of word\nmeanings. Specifically, we use synset definitions and hierarchy information\nfrom WordNet and test it on a digitized version of Blank's (1997) dataset of\nsemantic change types. Finally, we show how the sense relationships can improve\nmodels for both approximation of human judgments of semantic relatedness as\nwell as binary Lexical Semantic Change Detection.\n","authors":["Pierluigi Cassotti","Stefano De Pascale","Nina Tahmasebi"],"pdf_url":"https://arxiv.org/pdf/2406.03452v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.00199v2","updated":"2024-06-06T06:43:38Z","published":"2024-05-31T21:21:19Z","title":"Exfiltration of personal information from ChatGPT via prompt injection","summary":"  We report that ChatGPT 4 and 4o are susceptible to a prompt injection attack\nthat allows an attacker to exfiltrate users' personal data. It is applicable\nwithout the use of any 3rd party tools and all users are currently affected.\nThis vulnerability is exacerbated by the recent introduction of ChatGPT's\nmemory feature, which allows an attacker to command ChatGPT to monitor the user\nfor the desired personal data.\n","authors":["Gregory Schwartzman"],"pdf_url":"https://arxiv.org/pdf/2406.00199v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.03776v1","updated":"2024-06-06T06:40:19Z","published":"2024-06-06T06:40:19Z","title":"XL-HeadTags: Leveraging Multimodal Retrieval Augmentation for the\n  Multilingual Generation of News Headlines and Tags","summary":"  Millions of news articles published online daily can overwhelm readers.\nHeadlines and entity (topic) tags are essential for guiding readers to decide\nif the content is worth their time. While headline generation has been\nextensively studied, tag generation remains largely unexplored, yet it offers\nreaders better access to topics of interest. The need for conciseness in\ncapturing readers' attention necessitates improved content selection strategies\nfor identifying salient and relevant segments within lengthy articles, thereby\nguiding language models effectively. To address this, we propose to leverage\nauxiliary information such as images and captions embedded in the articles to\nretrieve relevant sentences and utilize instruction tuning with variations to\ngenerate both headlines and tags for news articles in a multilingual context.\nTo make use of the auxiliary information, we have compiled a dataset named\nXL-HeadTags, which includes 20 languages across 6 diverse language families.\nThrough extensive evaluation, we demonstrate the effectiveness of our\nplug-and-play multimodal-multilingual retrievers for both tasks. Additionally,\nwe have developed a suite of tools for processing and evaluating multilingual\ntexts, significantly contributing to the research community by enabling more\naccurate and efficient analysis across languages.\n","authors":["Faisal Tareque Shohan","Mir Tafseer Nayeem","Samsul Islam","Abu Ubaida Akash","Shafiq Joty"],"pdf_url":"https://arxiv.org/pdf/2406.03776v1.pdf","comment":"ACL 2024 camera ready"},{"id":"http://arxiv.org/abs/2406.01026v2","updated":"2024-06-06T06:32:45Z","published":"2024-06-03T06:20:12Z","title":"Strengthened Symbol Binding Makes Large Language Models Reliable\n  Multiple-Choice Selectors","summary":"  Multiple-Choice Questions (MCQs) constitute a critical area of research in\nthe study of Large Language Models (LLMs). Previous works have investigated the\nselection bias problem in MCQs within few-shot scenarios, in which the LLM's\nperformance may be influenced by the presentation of answer choices, leaving\nthe selection bias during Supervised Fine-Tuning (SFT) unexplored. In this\npaper, we reveal that selection bias persists in the SFT phase , primarily due\nto the LLM's inadequate Multiple Choice Symbol Binding (MCSB) ability. This\nlimitation implies that the model struggles to associate the answer options\nwith their corresponding symbols (e.g., A/B/C/D) effectively. To enhance the\nmodel's MCSB capability, we first incorporate option contents into the loss\nfunction and subsequently adjust the weights of the option symbols and\ncontents, guiding the model to understand the option content of the current\nsymbol. Based on this, we introduce an efficient SFT algorithm for MCQs, termed\nPoint-wise Intelligent Feedback (PIF). PIF constructs negative instances by\nrandomly combining the incorrect option contents with all candidate symbols,\nand proposes a point-wise loss to provide feedback on these negative samples\ninto LLMs. Our experimental results demonstrate that PIF significantly reduces\nthe model's selection bias by improving its MCSB capability. Remarkably, PIF\nexhibits a substantial enhancement in the accuracy for MCQs.\n","authors":["Mengge Xue","Zhenyu Hu","Liqun Liu","Kuo Liao","Shuang Li","Honglin Han","Meng Zhao","Chengguo Yin"],"pdf_url":"https://arxiv.org/pdf/2406.01026v2.pdf","comment":"Accept at ACL2024 Main"}],"Computer Vision and Pattern Recognition":[{"id":"http://arxiv.org/abs/2406.04345v1","updated":"2024-06-06T17:59:58Z","published":"2024-06-06T17:59:58Z","title":"Stereo-Depth Fusion through Virtual Pattern Projection","summary":"  This paper presents a novel general-purpose stereo and depth data fusion\nparadigm that mimics the active stereo principle by replacing the unreliable\nphysical pattern projector with a depth sensor. It works by projecting virtual\npatterns consistent with the scene geometry onto the left and right images\nacquired by a conventional stereo camera, using the sparse hints obtained from\na depth sensor, to facilitate the visual correspondence. Purposely, any depth\nsensing device can be seamlessly plugged into our framework, enabling the\ndeployment of a virtual active stereo setup in any possible environment and\novercoming the severe limitations of physical pattern projection, such as the\nlimited working range and environmental conditions. Exhaustive experiments on\nindoor and outdoor datasets featuring both long and close range, including\nthose providing raw, unfiltered depth hints from off-the-shelf depth sensors,\nhighlight the effectiveness of our approach in notably boosting the robustness\nand accuracy of algorithms and deep stereo without any code modification and\neven without re-training. Additionally, we assess the performance of our\nstrategy on active stereo evaluation datasets with conventional pattern\nprojection. Indeed, in all these scenarios, our virtual pattern projection\nparadigm achieves state-of-the-art performance. The source code is available\nat: https://github.com/bartn8/vppstereo.\n","authors":["Luca Bartolomei","Matteo Poggi","Fabio Tosi","Andrea Conti","Stefano Mattoccia"],"pdf_url":"https://arxiv.org/pdf/2406.04345v1.pdf","comment":"extended version of ICCV 2023: \"Active Stereo Without Pattern\n  Projector\""},{"id":"http://arxiv.org/abs/2406.04342v1","updated":"2024-06-06T17:59:56Z","published":"2024-06-06T17:59:56Z","title":"Learning 1D Causal Visual Representation with De-focus Attention\n  Networks","summary":"  Modality differences have led to the development of heterogeneous\narchitectures for vision and language models. While images typically require 2D\nnon-causal modeling, texts utilize 1D causal modeling. This distinction poses\nsignificant challenges in constructing unified multi-modal models. This paper\nexplores the feasibility of representing images using 1D causal modeling. We\nidentify an \"over-focus\" issue in existing 1D causal vision models, where\nattention overly concentrates on a small proportion of visual tokens. The issue\nof \"over-focus\" hinders the model's ability to extract diverse visual features\nand to receive effective gradients for optimization. To address this, we\npropose De-focus Attention Networks, which employ learnable bandpass filters to\ncreate varied attention patterns. During training, large and scheduled drop\npath rates, and an auxiliary loss on globally pooled features for global\nunderstanding tasks are introduced. These two strategies encourage the model to\nattend to a broader range of tokens and enhance network optimization. Extensive\nexperiments validate the efficacy of our approach, demonstrating that 1D causal\nvisual representation can perform comparably to 2D non-causal representation in\ntasks such as global perception, dense prediction, and multi-modal\nunderstanding. Code is released at\nhttps://github.com/OpenGVLab/De-focus-Attention-Networks.\n","authors":["Chenxin Tao","Xizhou Zhu","Shiqian Su","Lewei Lu","Changyao Tian","Xuan Luo","Gao Huang","Hongsheng Li","Yu Qiao","Jie Zhou","Jifeng Dai"],"pdf_url":"https://arxiv.org/pdf/2406.04342v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.04343v1","updated":"2024-06-06T17:59:56Z","published":"2024-06-06T17:59:56Z","title":"Flash3D: Feed-Forward Generalisable 3D Scene Reconstruction from a\n  Single Image","summary":"  In this paper, we propose Flash3D, a method for scene reconstruction and\nnovel view synthesis from a single image which is both very generalisable and\nefficient. For generalisability, we start from a \"foundation\" model for\nmonocular depth estimation and extend it to a full 3D shape and appearance\nreconstructor. For efficiency, we base this extension on feed-forward Gaussian\nSplatting. Specifically, we predict a first layer of 3D Gaussians at the\npredicted depth, and then add additional layers of Gaussians that are offset in\nspace, allowing the model to complete the reconstruction behind occlusions and\ntruncations. Flash3D is very efficient, trainable on a single GPU in a day, and\nthus accessible to most researchers. It achieves state-of-the-art results when\ntrained and tested on RealEstate10k. When transferred to unseen datasets like\nNYU it outperforms competitors by a large margin. More impressively, when\ntransferred to KITTI, Flash3D achieves better PSNR than methods trained\nspecifically on that dataset. In some instances, it even outperforms recent\nmethods that use multiple views as input. Code, models, demo, and more results\nare available at https://www.robots.ox.ac.uk/~vgg/research/flash3d/.\n","authors":["Stanislaw Szymanowicz","Eldar Insafutdinov","Chuanxia Zheng","Dylan Campbell","João F. Henriques","Christian Rupprecht","Andrea Vedaldi"],"pdf_url":"https://arxiv.org/pdf/2406.04343v1.pdf","comment":"Project page: https://www.robots.ox.ac.uk/~vgg/research/flash3d/"},{"id":"http://arxiv.org/abs/2406.04344v1","updated":"2024-06-06T17:59:56Z","published":"2024-06-06T17:59:56Z","title":"Verbalized Machine Learning: Revisiting Machine Learning with Language\n  Models","summary":"  Motivated by the large progress made by large language models (LLMs), we\nintroduce the framework of verbalized machine learning (VML). In contrast to\nconventional machine learning models that are typically optimized over a\ncontinuous parameter space, VML constrains the parameter space to be\nhuman-interpretable natural language. Such a constraint leads to a new\nperspective of function approximation, where an LLM with a text prompt can be\nviewed as a function parameterized by the text prompt. Guided by this\nperspective, we revisit classical machine learning problems, such as regression\nand classification, and find that these problems can be solved by an\nLLM-parameterized learner and optimizer. The major advantages of VML include\n(1) easy encoding of inductive bias: prior knowledge about the problem and\nhypothesis class can be encoded in natural language and fed into the\nLLM-parameterized learner; (2) automatic model class selection: the optimizer\ncan automatically select a concrete model class based on data and verbalized\nprior knowledge, and it can update the model class during training; and (3)\ninterpretable learner updates: the LLM-parameterized optimizer can provide\nexplanations for why each learner update is performed. We conduct several\nstudies to empirically evaluate the effectiveness of VML, and hope that VML can\nserve as a stepping stone to stronger interpretability and trustworthiness in\nML.\n","authors":["Tim Z. Xiao","Robert Bamler","Bernhard Schölkopf","Weiyang Liu"],"pdf_url":"https://arxiv.org/pdf/2406.04344v1.pdf","comment":"Technical Report v1 (92 pages, 15 figures)"},{"id":"http://arxiv.org/abs/2406.04341v1","updated":"2024-06-06T17:59:52Z","published":"2024-06-06T17:59:52Z","title":"Interpreting the Second-Order Effects of Neurons in CLIP","summary":"  We interpret the function of individual neurons in CLIP by automatically\ndescribing them using text. Analyzing the direct effects (i.e. the flow from a\nneuron through the residual stream to the output) or the indirect effects\n(overall contribution) fails to capture the neurons' function in CLIP.\nTherefore, we present the \"second-order lens\", analyzing the effect flowing\nfrom a neuron through the later attention heads, directly to the output. We\nfind that these effects are highly selective: for each neuron, the effect is\nsignificant for <2% of the images. Moreover, each effect can be approximated by\na single direction in the text-image space of CLIP. We describe neurons by\ndecomposing these directions into sparse sets of text representations. The sets\nreveal polysemantic behavior - each neuron corresponds to multiple, often\nunrelated, concepts (e.g. ships and cars). Exploiting this neuron polysemy, we\nmass-produce \"semantic\" adversarial examples by generating images with concepts\nspuriously correlated to the incorrect class. Additionally, we use the\nsecond-order effects for zero-shot segmentation and attribute discovery in\nimages. Our results indicate that a scalable understanding of neurons can be\nused for model deception and for introducing new model capabilities.\n","authors":["Yossi Gandelsman","Alexei A. Efros","Jacob Steinhardt"],"pdf_url":"https://arxiv.org/pdf/2406.04341v1.pdf","comment":"project page:\n  https://yossigandelsman.github.io/clip_neurons/index.html"},{"id":"http://arxiv.org/abs/2406.04340v1","updated":"2024-06-06T17:59:50Z","published":"2024-06-06T17:59:50Z","title":"GLACE: Global Local Accelerated Coordinate Encoding","summary":"  Scene coordinate regression (SCR) methods are a family of visual localization\nmethods that directly regress 2D-3D matches for camera pose estimation. They\nare effective in small-scale scenes but face significant challenges in\nlarge-scale scenes that are further amplified in the absence of ground truth 3D\npoint clouds for supervision. Here, the model can only rely on reprojection\nconstraints and needs to implicitly triangulate the points. The challenges stem\nfrom a fundamental dilemma: The network has to be invariant to observations of\nthe same landmark at different viewpoints and lighting conditions, etc., but at\nthe same time discriminate unrelated but similar observations. The latter\nbecomes more relevant and severe in larger scenes. In this work, we tackle this\nproblem by introducing the concept of co-visibility to the network. We propose\nGLACE, which integrates pre-trained global and local encodings and enables SCR\nto scale to large scenes with only a single small-sized network. Specifically,\nwe propose a novel feature diffusion technique that implicitly groups the\nreprojection constraints with co-visibility and avoids overfitting to trivial\nsolutions. Additionally, our position decoder parameterizes the output\npositions for large-scale scenes more effectively. Without using 3D models or\ndepth maps for supervision, our method achieves state-of-the-art results on\nlarge-scale scenes with a low-map-size model. On Cambridge landmarks, with a\nsingle model, we achieve 17% lower median position error than Poker, the\nensemble variant of the state-of-the-art SCR method ACE. Code is available at:\nhttps://github.com/cvg/glace.\n","authors":["Fangjinhua Wang","Xudong Jiang","Silvano Galliani","Christoph Vogel","Marc Pollefeys"],"pdf_url":"https://arxiv.org/pdf/2406.04340v1.pdf","comment":"Large-scale visual localization with a single optimizable MLP. CVPR\n  2024. Code: https://github.com/cvg/glace. Project page:\n  https://xjiangan.github.io/glace"},{"id":"http://arxiv.org/abs/2406.04338v1","updated":"2024-06-06T17:59:47Z","published":"2024-06-06T17:59:47Z","title":"Physics3D: Learning Physical Properties of 3D Gaussians via Video\n  Diffusion","summary":"  In recent years, there has been rapid development in 3D generation models,\nopening up new possibilities for applications such as simulating the dynamic\nmovements of 3D objects and customizing their behaviors. However, current 3D\ngenerative models tend to focus only on surface features such as color and\nshape, neglecting the inherent physical properties that govern the behavior of\nobjects in the real world. To accurately simulate physics-aligned dynamics, it\nis essential to predict the physical properties of materials and incorporate\nthem into the behavior prediction process. Nonetheless, predicting the diverse\nmaterials of real-world objects is still challenging due to the complex nature\nof their physical attributes. In this paper, we propose \\textbf{Physics3D}, a\nnovel method for learning various physical properties of 3D objects through a\nvideo diffusion model. Our approach involves designing a highly generalizable\nphysical simulation system based on a viscoelastic material model, which\nenables us to simulate a wide range of materials with high-fidelity\ncapabilities. Moreover, we distill the physical priors from a video diffusion\nmodel that contains more understanding of realistic object materials. Extensive\nexperiments demonstrate the effectiveness of our method with both elastic and\nplastic materials. Physics3D shows great potential for bridging the gap between\nthe physical world and virtual neural space, providing a better integration and\napplication of realistic physical principles in virtual environments. Project\npage: https://liuff19.github.io/Physics3D.\n","authors":["Fangfu Liu","Hanyang Wang","Shunyu Yao","Shengjun Zhang","Jie Zhou","Yueqi Duan"],"pdf_url":"https://arxiv.org/pdf/2406.04338v1.pdf","comment":"Project page: https://liuff19.github.io/Physics3D"},{"id":"http://arxiv.org/abs/2406.04339v1","updated":"2024-06-06T17:59:47Z","published":"2024-06-06T17:59:47Z","title":"RoboMamba: Multimodal State Space Model for Efficient Robot Reasoning\n  and Manipulation","summary":"  A fundamental objective in robot manipulation is to enable models to\ncomprehend visual scenes and execute actions. Although existing robot\nMultimodal Large Language Models (MLLMs) can handle a range of basic tasks,\nthey still face challenges in two areas: 1) inadequate reasoning ability to\ntackle complex tasks, and 2) high computational costs for MLLM fine-tuning and\ninference. The recently proposed state space model (SSM) known as Mamba\ndemonstrates promising capabilities in non-trivial sequence modeling with\nlinear inference complexity. Inspired by this, we introduce RoboMamba, an\nend-to-end robotic MLLM that leverages the Mamba model to deliver both robotic\nreasoning and action capabilities, while maintaining efficient fine-tuning and\ninference. Specifically, we first integrate the vision encoder with Mamba,\naligning visual data with language embedding through co-training, empowering\nour model with visual common sense and robot-related reasoning. To further\nequip RoboMamba with action pose prediction abilities, we explore an efficient\nfine-tuning strategy with a simple policy head. We find that once RoboMamba\npossesses sufficient reasoning capability, it can acquire manipulation skills\nwith minimal fine-tuning parameters (0.1\\% of the model) and time (20 minutes).\nIn experiments, RoboMamba demonstrates outstanding reasoning capabilities on\ngeneral and robotic evaluation benchmarks. Meanwhile, our model showcases\nimpressive pose prediction results in both simulation and real-world\nexperiments, achieving inference speeds 7 times faster than existing robot\nMLLMs. Our project web page: https://sites.google.com/view/robomamba-web\n","authors":["Jiaming Liu","Mengzhen Liu","Zhenyu Wang","Lily Lee","Kaichen Zhou","Pengju An","Senqiao Yang","Renrui Zhang","Yandong Guo","Shanghang Zhang"],"pdf_url":"https://arxiv.org/pdf/2406.04339v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.04337v1","updated":"2024-06-06T17:59:44Z","published":"2024-06-06T17:59:44Z","title":"Coherent Zero-Shot Visual Instruction Generation","summary":"  Despite the advances in text-to-image synthesis, particularly with diffusion\nmodels, generating visual instructions that require consistent representation\nand smooth state transitions of objects across sequential steps remains a\nformidable challenge. This paper introduces a simple, training-free framework\nto tackle the issues, capitalizing on the advancements in diffusion models and\nlarge language models (LLMs). Our approach systematically integrates text\ncomprehension and image generation to ensure visual instructions are visually\nappealing and maintain consistency and accuracy throughout the instruction\nsequence. We validate the effectiveness by testing multi-step instructions and\ncomparing the text alignment and consistency with several baselines. Our\nexperiments show that our approach can visualize coherent and visually pleasing\ninstructions\n","authors":["Quynh Phung","Songwei Ge","Jia-Bin Huang"],"pdf_url":"https://arxiv.org/pdf/2406.04337v1.pdf","comment":"https://instruct-vis-zero.github.io/"},{"id":"http://arxiv.org/abs/2406.04334v1","updated":"2024-06-06T17:59:34Z","published":"2024-06-06T17:59:34Z","title":"DeepStack: Deeply Stacking Visual Tokens is Surprisingly Simple and\n  Effective for LMMs","summary":"  Most large multimodal models (LMMs) are implemented by feeding visual tokens\nas a sequence into the first layer of a large language model (LLM). The\nresulting architecture is simple but significantly increases computation and\nmemory costs, as it has to handle a large number of additional tokens in its\ninput layer. This paper presents a new architecture DeepStack for LMMs.\nConsidering $N$ layers in the language and vision transformer of LMMs, we stack\nthe visual tokens into $N$ groups and feed each group to its aligned\ntransformer layer \\textit{from bottom to top}. Surprisingly, this simple method\ngreatly enhances the power of LMMs to model interactions among visual tokens\nacross layers but with minimal additional cost. We apply DeepStack to both\nlanguage and vision transformer in LMMs, and validate the effectiveness of\nDeepStack LMMs with extensive empirical results. Using the same context length,\nour DeepStack 7B and 13B parameters surpass their counterparts by \\textbf{2.7}\nand \\textbf{2.9} on average across \\textbf{9} benchmarks, respectively. Using\nonly one-fifth of the context length, DeepStack rivals closely to the\ncounterparts that use the full context length. These gains are particularly\npronounced on high-resolution tasks, e.g., \\textbf{4.2}, \\textbf{11.0}, and\n\\textbf{4.0} improvements on TextVQA, DocVQA, and InfoVQA compared to\nLLaVA-1.5-7B, respectively. We further apply DeepStack to vision transformer\nlayers, which brings us a similar amount of improvements, \\textbf{3.8} on\naverage compared with LLaVA-1.5-7B.\n","authors":["Lingchen Meng","Jianwei Yang","Rui Tian","Xiyang Dai","Zuxuan Wu","Jianfeng Gao","Yu-Gang Jiang"],"pdf_url":"https://arxiv.org/pdf/2406.04334v1.pdf","comment":"Project Page: https://deepstack-vl.github.io/"},{"id":"http://arxiv.org/abs/2406.04332v1","updated":"2024-06-06T17:59:23Z","published":"2024-06-06T17:59:23Z","title":"Coarse-To-Fine Tensor Trains for Compact Visual Representations","summary":"  The ability to learn compact, high-quality, and easy-to-optimize\nrepresentations for visual data is paramount to many applications such as novel\nview synthesis and 3D reconstruction. Recent work has shown substantial success\nin using tensor networks to design such compact and high-quality\nrepresentations. However, the ability to optimize tensor-based representations,\nand in particular, the highly compact tensor train representation, is still\nlacking. This has prevented practitioners from deploying the full potential of\ntensor networks for visual data. To this end, we propose 'Prolongation\nUpsampling Tensor Train (PuTT)', a novel method for learning tensor train\nrepresentations in a coarse-to-fine manner. Our method involves the prolonging\nor `upsampling' of a learned tensor train representation, creating a sequence\nof 'coarse-to-fine' tensor trains that are incrementally refined. We evaluate\nour representation along three axes: (1). compression, (2). denoising\ncapability, and (3). image completion capability. To assess these axes, we\nconsider the tasks of image fitting, 3D fitting, and novel view synthesis,\nwhere our method shows an improved performance compared to state-of-the-art\ntensor-based methods. For full results see our project webpage:\nhttps://sebulo.github.io/PuTT_website/\n","authors":["Sebastian Loeschcke","Dan Wang","Christian Leth-Espensen","Serge Belongie","Michael J. Kastoryano","Sagie Benaim"],"pdf_url":"https://arxiv.org/pdf/2406.04332v1.pdf","comment":"Project webpage: https://sebulo.github.io/PuTT_website/"},{"id":"http://arxiv.org/abs/2406.04333v1","updated":"2024-06-06T17:59:23Z","published":"2024-06-06T17:59:23Z","title":"BitsFusion: 1.99 bits Weight Quantization of Diffusion Model","summary":"  Diffusion-based image generation models have achieved great success in recent\nyears by showing the capability of synthesizing high-quality content. However,\nthese models contain a huge number of parameters, resulting in a significantly\nlarge model size. Saving and transferring them is a major bottleneck for\nvarious applications, especially those running on resource-constrained devices.\nIn this work, we develop a novel weight quantization method that quantizes the\nUNet from Stable Diffusion v1.5 to 1.99 bits, achieving a model with 7.9X\nsmaller size while exhibiting even better generation quality than the original\none. Our approach includes several novel techniques, such as assigning optimal\nbits to each layer, initializing the quantized model for better performance,\nand improving the training strategy to dramatically reduce quantization error.\nFurthermore, we extensively evaluate our quantized model across various\nbenchmark datasets and through human evaluation to demonstrate its superior\ngeneration quality.\n","authors":["Yang Sui","Yanyu Li","Anil Kag","Yerlan Idelbayev","Junli Cao","Ju Hu","Dhritiman Sagar","Bo Yuan","Sergey Tulyakov","Jian Ren"],"pdf_url":"https://arxiv.org/pdf/2406.04333v1.pdf","comment":"Project Page: https://snap-research.github.io/BitsFusion"},{"id":"http://arxiv.org/abs/2406.04330v1","updated":"2024-06-06T17:59:10Z","published":"2024-06-06T17:59:10Z","title":"Parameter-Inverted Image Pyramid Networks","summary":"  Image pyramids are commonly used in modern computer vision tasks to obtain\nmulti-scale features for precise understanding of images. However, image\npyramids process multiple resolutions of images using the same large-scale\nmodel, which requires significant computational cost. To overcome this issue,\nwe propose a novel network architecture known as the Parameter-Inverted Image\nPyramid Networks (PIIP). Our core idea is to use models with different\nparameter sizes to process different resolution levels of the image pyramid,\nthereby balancing computational efficiency and performance. Specifically, the\ninput to PIIP is a set of multi-scale images, where higher resolution images\nare processed by smaller networks. We further propose a feature interaction\nmechanism to allow features of different resolutions to complement each other\nand effectively integrate information from different spatial scales. Extensive\nexperiments demonstrate that the PIIP achieves superior performance in tasks\nsuch as object detection, segmentation, and image classification, compared to\ntraditional image pyramid methods and single-branch networks, while reducing\ncomputational cost. Notably, when applying our method on a large-scale vision\nfoundation model InternViT-6B, we improve its performance by 1%-2% on detection\nand segmentation with only 40%-60% of the original computation. These results\nvalidate the effectiveness of the PIIP approach and provide a new technical\ndirection for future vision computing tasks. Our code and models are available\nat https://github.com/OpenGVLab/PIIP.\n","authors":["Xizhou Zhu","Xue Yang","Zhaokai Wang","Hao Li","Wenhan Dou","Junqi Ge","Lewei Lu","Yu Qiao","Jifeng Dai"],"pdf_url":"https://arxiv.org/pdf/2406.04330v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.04325v1","updated":"2024-06-06T17:58:54Z","published":"2024-06-06T17:58:54Z","title":"ShareGPT4Video: Improving Video Understanding and Generation with Better\n  Captions","summary":"  We present the ShareGPT4Video series, aiming to facilitate the video\nunderstanding of large video-language models (LVLMs) and the video generation\nof text-to-video models (T2VMs) via dense and precise captions. The series\ncomprises: 1) ShareGPT4Video, 40K GPT4V annotated dense captions of videos with\nvarious lengths and sources, developed through carefully designed data\nfiltering and annotating strategy. 2) ShareCaptioner-Video, an efficient and\ncapable captioning model for arbitrary videos, with 4.8M high-quality aesthetic\nvideos annotated by it. 3) ShareGPT4Video-8B, a simple yet superb LVLM that\nreached SOTA performance on three advancing video benchmarks. To achieve this,\ntaking aside the non-scalable costly human annotators, we find using GPT4V to\ncaption video with a naive multi-frame or frame-concatenation input strategy\nleads to less detailed and sometimes temporal-confused results. We argue the\nchallenge of designing a high-quality video captioning strategy lies in three\naspects: 1) Inter-frame precise temporal change understanding. 2) Intra-frame\ndetailed content description. 3) Frame-number scalability for arbitrary-length\nvideos. To this end, we meticulously designed a differential video captioning\nstrategy, which is stable, scalable, and efficient for generating captions for\nvideos with arbitrary resolution, aspect ratios, and length. Based on it, we\nconstruct ShareGPT4Video, which contains 40K high-quality videos spanning a\nwide range of categories, and the resulting captions encompass rich world\nknowledge, object attributes, camera movements, and crucially, detailed and\nprecise temporal descriptions of events. Based on ShareGPT4Video, we further\ndevelop ShareCaptioner-Video, a superior captioner capable of efficiently\ngenerating high-quality captions for arbitrary videos...\n","authors":["Lin Chen","Xilin Wei","Jinsong Li","Xiaoyi Dong","Pan Zhang","Yuhang Zang","Zehui Chen","Haodong Duan","Bin Lin","Zhenyu Tang","Li Yuan","Yu Qiao","Dahua Lin","Feng Zhao","Jiaqi Wang"],"pdf_url":"https://arxiv.org/pdf/2406.04325v1.pdf","comment":"Project Page: https://sharegpt4video.github.io/"},{"id":"http://arxiv.org/abs/2310.12956v2","updated":"2024-06-06T17:58:45Z","published":"2023-10-19T17:55:06Z","title":"Eureka-Moments in Transformers: Multi-Step Tasks Reveal Softmax Induced\n  Optimization Problems","summary":"  In this work, we study rapid improvements of the training loss in\ntransformers when being confronted with multi-step decision tasks. We found\nthat transformers struggle to learn the intermediate task and both training and\nvalidation loss saturate for hundreds of epochs. When transformers finally\nlearn the intermediate task, they do this rapidly and unexpectedly. We call\nthese abrupt improvements Eureka-moments, since the transformer appears to\nsuddenly learn a previously incomprehensible concept. We designed synthetic\ntasks to study the problem in detail, but the leaps in performance can be\nobserved also for language modeling and in-context learning (ICL). We suspect\nthat these abrupt transitions are caused by the multi-step nature of these\ntasks. Indeed, we find connections and show that ways to improve on the\nsynthetic multi-step tasks can be used to improve the training of language\nmodeling and ICL. Using the synthetic data we trace the problem back to the\nSoftmax function in the self-attention block of transformers and show ways to\nalleviate the problem. These fixes reduce the required number of training\nsteps, lead to higher likelihood to learn the intermediate task, to higher\nfinal accuracy and training becomes more robust to hyper-parameters.\n","authors":["David T. Hoffmann","Simon Schrodi","Jelena Bratulić","Nadine Behrmann","Volker Fischer","Thomas Brox"],"pdf_url":"https://arxiv.org/pdf/2310.12956v2.pdf","comment":"Accepted at ICML 2024"},{"id":"http://arxiv.org/abs/2406.04324v1","updated":"2024-06-06T17:58:27Z","published":"2024-06-06T17:58:27Z","title":"SF-V: Single Forward Video Generation Model","summary":"  Diffusion-based video generation models have demonstrated remarkable success\nin obtaining high-fidelity videos through the iterative denoising process.\nHowever, these models require multiple denoising steps during sampling,\nresulting in high computational costs. In this work, we propose a novel\napproach to obtain single-step video generation models by leveraging\nadversarial training to fine-tune pre-trained video diffusion models. We show\nthat, through the adversarial training, the multi-steps video diffusion model,\ni.e., Stable Video Diffusion (SVD), can be trained to perform single forward\npass to synthesize high-quality videos, capturing both temporal and spatial\ndependencies in the video data. Extensive experiments demonstrate that our\nmethod achieves competitive generation quality of synthesized videos with\nsignificantly reduced computational overhead for the denoising process (i.e.,\naround $23\\times$ speedup compared with SVD and $6\\times$ speedup compared with\nexisting works, with even better generation quality), paving the way for\nreal-time video synthesis and editing. More visualization results are made\npublicly available at https://snap-research.github.io/SF-V.\n","authors":["Zhixing Zhang","Yanyu Li","Yushu Wu","Yanwu Xu","Anil Kag","Ivan Skorokhodov","Willi Menapace","Aliaksandr Siarohin","Junli Cao","Dimitris Metaxas","Sergey Tulyakov","Jian Ren"],"pdf_url":"https://arxiv.org/pdf/2406.04324v1.pdf","comment":"Project Page: https://snap-research.github.io/SF-V"},{"id":"http://arxiv.org/abs/2406.04322v1","updated":"2024-06-06T17:58:15Z","published":"2024-06-06T17:58:15Z","title":"DIRECT-3D: Learning Direct Text-to-3D Generation on Massive Noisy 3D\n  Data","summary":"  We present DIRECT-3D, a diffusion-based 3D generative model for creating\nhigh-quality 3D assets (represented by Neural Radiance Fields) from text\nprompts. Unlike recent 3D generative models that rely on clean and well-aligned\n3D data, limiting them to single or few-class generation, our model is directly\ntrained on extensive noisy and unaligned `in-the-wild' 3D assets, mitigating\nthe key challenge (i.e., data scarcity) in large-scale 3D generation. In\nparticular, DIRECT-3D is a tri-plane diffusion model that integrates two\ninnovations: 1) A novel learning framework where noisy data are filtered and\naligned automatically during the training process. Specifically, after an\ninitial warm-up phase using a small set of clean data, an iterative\noptimization is introduced in the diffusion process to explicitly estimate the\n3D pose of objects and select beneficial data based on conditional density. 2)\nAn efficient 3D representation that is achieved by disentangling object\ngeometry and color features with two separate conditional diffusion models that\nare optimized hierarchically. Given a prompt input, our model generates\nhigh-quality, high-resolution, realistic, and complex 3D objects with accurate\ngeometric details in seconds. We achieve state-of-the-art performance in both\nsingle-class generation and text-to-3D generation. We also demonstrate that\nDIRECT-3D can serve as a useful 3D geometric prior of objects, for example to\nalleviate the well-known Janus problem in 2D-lifting methods such as\nDreamFusion. The code and models are available for research purposes at:\nhttps://github.com/qihao067/direct3d.\n","authors":["Qihao Liu","Yi Zhang","Song Bai","Adam Kortylewski","Alan Yuille"],"pdf_url":"https://arxiv.org/pdf/2406.04322v1.pdf","comment":"Accepted to CVPR 2024; code: https://github.com/qihao067/direct3d;\n  project page: https://direct-3d.github.io/"},{"id":"http://arxiv.org/abs/2406.04323v1","updated":"2024-06-06T17:58:15Z","published":"2024-06-06T17:58:15Z","title":"ATraDiff: Accelerating Online Reinforcement Learning with Imaginary\n  Trajectories","summary":"  Training autonomous agents with sparse rewards is a long-standing problem in\nonline reinforcement learning (RL), due to low data efficiency. Prior work\novercomes this challenge by extracting useful knowledge from offline data,\noften accomplished through the learning of action distribution from offline\ndata and utilizing the learned distribution to facilitate online RL. However,\nsince the offline data are given and fixed, the extracted knowledge is\ninherently limited, making it difficult to generalize to new tasks. We propose\na novel approach that leverages offline data to learn a generative diffusion\nmodel, coined as Adaptive Trajectory Diffuser (ATraDiff). This model generates\nsynthetic trajectories, serving as a form of data augmentation and consequently\nenhancing the performance of online RL methods. The key strength of our\ndiffuser lies in its adaptability, allowing it to effectively handle varying\ntrajectory lengths and mitigate distribution shifts between online and offline\ndata. Because of its simplicity, ATraDiff seamlessly integrates with a wide\nspectrum of RL methods. Empirical evaluation shows that ATraDiff consistently\nachieves state-of-the-art performance across a variety of environments, with\nparticularly pronounced improvements in complicated settings. Our code and demo\nvideo are available at https://atradiff.github.io .\n","authors":["Qianlan Yang","Yu-Xiong Wang"],"pdf_url":"https://arxiv.org/pdf/2406.04323v1.pdf","comment":"ICML 2024 Accepted"},{"id":"http://arxiv.org/abs/2406.04321v1","updated":"2024-06-06T17:58:11Z","published":"2024-06-06T17:58:11Z","title":"VidMuse: A Simple Video-to-Music Generation Framework with\n  Long-Short-Term Modeling","summary":"  In this work, we systematically study music generation conditioned solely on\nthe video. First, we present a large-scale dataset comprising 190K video-music\npairs, including various genres such as movie trailers, advertisements, and\ndocumentaries. Furthermore, we propose VidMuse, a simple framework for\ngenerating music aligned with video inputs. VidMuse stands out by producing\nhigh-fidelity music that is both acoustically and semantically aligned with the\nvideo. By incorporating local and global visual cues, VidMuse enables the\ncreation of musically coherent audio tracks that consistently match the video\ncontent through Long-Short-Term modeling. Through extensive experiments,\nVidMuse outperforms existing models in terms of audio quality, diversity, and\naudio-visual alignment. The code and datasets will be available at\nhttps://github.com/ZeyueT/VidMuse/.\n","authors":["Zeyue Tian","Zhaoyang Liu","Ruibin Yuan","Jiahao Pan","Xiaoqiang Huang","Qifeng Liu","Xu Tan","Qifeng Chen","Wei Xue","Yike Guo"],"pdf_url":"https://arxiv.org/pdf/2406.04321v1.pdf","comment":"The code and datasets will be available at\n  https://github.com/ZeyueT/VidMuse/"},{"id":"http://arxiv.org/abs/2406.04318v1","updated":"2024-06-06T17:58:00Z","published":"2024-06-06T17:58:00Z","title":"Adaptive Sampling of k-Space in Magnetic Resonance for Rapid Pathology\n  Prediction","summary":"  Magnetic Resonance (MR) imaging, despite its proven diagnostic utility,\nremains an inaccessible imaging modality for disease surveillance at the\npopulation level. A major factor rendering MR inaccessible is lengthy scan\ntimes. An MR scanner collects measurements associated with the underlying\nanatomy in the Fourier space, also known as the k-space. Creating a\nhigh-fidelity image requires collecting large quantities of such measurements,\nincreasing the scan time. Traditionally to accelerate an MR scan, image\nreconstruction from under-sampled k-space data is the method of choice.\nHowever, recent works show the feasibility of bypassing image reconstruction\nand directly learning to detect disease directly from a sparser learned subset\nof the k-space measurements. In this work, we propose Adaptive Sampling for MR\n(ASMR), a sampling method that learns an adaptive policy to sequentially select\nk-space samples to optimize for target disease detection. On 6 out of 8\npathology classification tasks spanning the Knee, Brain, and Prostate MR scans,\nASMR reaches within 2% of the performance of a fully sampled classifier while\nusing only 8% of the k-space, as well as outperforming prior state-of-the-art\nwork in k-space sampling such as EMRT, LOUPE, and DPS.\n","authors":["Chen-Yu Yen","Raghav Singhal","Umang Sharma","Rajesh Ranganath","Sumit Chopra","Lerrel Pinto"],"pdf_url":"https://arxiv.org/pdf/2406.04318v1.pdf","comment":"ICML 2024. Project website at https://adaptive-sampling-mr.github.io"},{"id":"http://arxiv.org/abs/2406.04316v1","updated":"2024-06-06T17:57:20Z","published":"2024-06-06T17:57:20Z","title":"Omni6DPose: A Benchmark and Model for Universal 6D Object Pose\n  Estimation and Tracking","summary":"  6D Object Pose Estimation is a crucial yet challenging task in computer\nvision, suffering from a significant lack of large-scale datasets. This\nscarcity impedes comprehensive evaluation of model performance, limiting\nresearch advancements. Furthermore, the restricted number of available\ninstances or categories curtails its applications. To address these issues,\nthis paper introduces Omni6DPose, a substantial dataset characterized by its\ndiversity in object categories, large scale, and variety in object materials.\nOmni6DPose is divided into three main components: ROPE (Real 6D Object Pose\nEstimation Dataset), which includes 332K images annotated with over 1.5M\nannotations across 581 instances in 149 categories; SOPE(Simulated 6D Object\nPose Estimation Dataset), consisting of 475K images created in a mixed reality\nsetting with depth simulation, annotated with over 5M annotations across 4162\ninstances in the same 149 categories; and the manually aligned real scanned\nobjects used in both ROPE and SOPE. Omni6DPose is inherently challenging due to\nthe substantial variations and ambiguities. To address this challenge, we\nintroduce GenPose++, an enhanced version of the SOTA category-level pose\nestimation framework, incorporating two pivotal improvements: Semantic-aware\nfeature extraction and Clustering-based aggregation. Moreover, we provide a\ncomprehensive benchmarking analysis to evaluate the performance of previous\nmethods on this large-scale dataset in the realms of 6D object pose estimation\nand pose tracking.\n","authors":["Jiyao Zhang","Weiyao Huang","Bo Peng","Mingdong Wu","Fei Hu","Zijian Chen","Bo Zhao","Hao Dong"],"pdf_url":"https://arxiv.org/pdf/2406.04316v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.04314v1","updated":"2024-06-06T17:57:09Z","published":"2024-06-06T17:57:09Z","title":"Step-aware Preference Optimization: Aligning Preference with Denoising\n  Performance at Each Step","summary":"  Recently, Direct Preference Optimization (DPO) has extended its success from\naligning large language models (LLMs) to aligning text-to-image diffusion\nmodels with human preferences. Unlike most existing DPO methods that assume all\ndiffusion steps share a consistent preference order with the final generated\nimages, we argue that this assumption neglects step-specific denoising\nperformance and that preference labels should be tailored to each step's\ncontribution. To address this limitation, we propose Step-aware Preference\nOptimization (SPO), a novel post-training approach that independently evaluates\nand adjusts the denoising performance at each step, using a step-aware\npreference model and a step-wise resampler to ensure accurate step-aware\nsupervision. Specifically, at each denoising step, we sample a pool of images,\nfind a suitable win-lose pair, and, most importantly, randomly select a single\nimage from the pool to initialize the next denoising step. This step-wise\nresampler process ensures the next win-lose image pair comes from the same\nimage, making the win-lose comparison independent of the previous step. To\nassess the preferences at each step, we train a separate step-aware preference\nmodel that can be applied to both noisy and clean images. Our experiments with\nStable Diffusion v1.5 and SDXL demonstrate that SPO significantly outperforms\nthe latest Diffusion-DPO in aligning generated images with complex, detailed\nprompts and enhancing aesthetics, while also achieving more than 20x times\nfaster in training efficiency. Code and model:\nhttps://rockeycoss.github.io/spo.github.io/\n","authors":["Zhanhao Liang","Yuhui Yuan","Shuyang Gu","Bohan Chen","Tiankai Hang","Ji Li","Liang Zheng"],"pdf_url":"https://arxiv.org/pdf/2406.04314v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.04313v1","updated":"2024-06-06T17:57:04Z","published":"2024-06-06T17:57:04Z","title":"Improving Alignment and Robustness with Short Circuiting","summary":"  AI systems can take harmful actions and are highly vulnerable to adversarial\nattacks. We present an approach, inspired by recent advances in representation\nengineering, that \"short-circuits\" models as they respond with harmful outputs.\nExisting techniques aimed at improving alignment, such as refusal training, are\noften bypassed. Techniques such as adversarial training try to plug these holes\nby countering specific attacks. As an alternative to refusal training and\nadversarial training, short-circuiting directly controls the representations\nthat are responsible for harmful outputs in the first place. Our technique can\nbe applied to both text-only and multimodal language models to prevent the\ngeneration of harmful outputs without sacrificing utility -- even in the\npresence of powerful unseen attacks. Notably, while adversarial robustness in\nstandalone image recognition remains an open challenge, short-circuiting allows\nthe larger multimodal system to reliably withstand image \"hijacks\" that aim to\nproduce harmful content. Finally, we extend our approach to AI agents,\ndemonstrating considerable reductions in the rate of harmful actions when they\nare under attack. Our approach represents a significant step forward in the\ndevelopment of reliable safeguards to harmful behavior and adversarial attacks.\n","authors":["Andy Zou","Long Phan","Justin Wang","Derek Duenas","Maxwell Lin","Maksym Andriushchenko","Rowan Wang","Zico Kolter","Matt Fredrikson","Dan Hendrycks"],"pdf_url":"https://arxiv.org/pdf/2406.04313v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.04312v1","updated":"2024-06-06T17:56:40Z","published":"2024-06-06T17:56:40Z","title":"ReNO: Enhancing One-step Text-to-Image Models through Reward-based Noise\n  Optimization","summary":"  Text-to-Image (T2I) models have made significant advancements in recent\nyears, but they still struggle to accurately capture intricate details\nspecified in complex compositional prompts. While fine-tuning T2I models with\nreward objectives has shown promise, it suffers from \"reward hacking\" and may\nnot generalize well to unseen prompt distributions. In this work, we propose\nReward-based Noise Optimization (ReNO), a novel approach that enhances T2I\nmodels at inference by optimizing the initial noise based on the signal from\none or multiple human preference reward models. Remarkably, solving this\noptimization problem with gradient ascent for 50 iterations yields impressive\nresults on four different one-step models across two competitive benchmarks,\nT2I-CompBench and GenEval. Within a computational budget of 20-50 seconds,\nReNO-enhanced one-step models consistently surpass the performance of all\ncurrent open-source Text-to-Image models. Extensive user studies demonstrate\nthat our model is preferred nearly twice as often compared to the popular SDXL\nmodel and is on par with the proprietary Stable Diffusion 3 with 8B parameters.\nMoreover, given the same computational resources, a ReNO-optimized one-step\nmodel outperforms widely-used open-source models such as SDXL and\nPixArt-$\\alpha$, highlighting the efficiency and effectiveness of ReNO in\nenhancing T2I model performance at inference time. Code is available at\nhttps://github.com/ExplainableML/ReNO.\n","authors":["Luca Eyring","Shyamgopal Karthik","Karsten Roth","Alexey Dosovitskiy","Zeynep Akata"],"pdf_url":"https://arxiv.org/pdf/2406.04312v1.pdf","comment":"Preprint"},{"id":"http://arxiv.org/abs/2406.04309v1","updated":"2024-06-06T17:55:34Z","published":"2024-06-06T17:55:34Z","title":"ReFiNe: Recursive Field Networks for Cross-modal Multi-scene\n  Representation","summary":"  The common trade-offs of state-of-the-art methods for multi-shape\nrepresentation (a single model \"packing\" multiple objects) involve trading\nmodeling accuracy against memory and storage. We show how to encode multiple\nshapes represented as continuous neural fields with a higher degree of\nprecision than previously possible and with low memory usage. Key to our\napproach is a recursive hierarchical formulation that exploits object\nself-similarity, leading to a highly compressed and efficient shape latent\nspace. Thanks to the recursive formulation, our method supports spatial and\nglobal-to-local latent feature fusion without needing to initialize and\nmaintain auxiliary data structures, while still allowing for continuous field\nqueries to enable applications such as raytracing. In experiments on a set of\ndiverse datasets, we provide compelling qualitative results and demonstrate\nstate-of-the-art multi-scene reconstruction and compression results with a\nsingle network per dataset.\n","authors":["Sergey Zakharov","Katherine Liu","Adrien Gaidon","Rares Ambrus"],"pdf_url":"https://arxiv.org/pdf/2406.04309v1.pdf","comment":"SIGGRAPH 2024. Project Page:\n  https://zakharos.github.io/projects/refine/"},{"id":"http://arxiv.org/abs/2405.17398v2","updated":"2024-06-06T17:52:03Z","published":"2024-05-27T17:49:15Z","title":"Vista: A Generalizable Driving World Model with High Fidelity and\n  Versatile Controllability","summary":"  World models can foresee the outcomes of different actions, which is of\nparamount importance for autonomous driving. Nevertheless, existing driving\nworld models still have limitations in generalization to unseen environments,\nprediction fidelity of critical details, and action controllability for\nflexible application. In this paper, we present Vista, a generalizable driving\nworld model with high fidelity and versatile controllability. Based on a\nsystematic diagnosis of existing methods, we introduce several key ingredients\nto address these limitations. To accurately predict real-world dynamics at high\nresolution, we propose two novel losses to promote the learning of moving\ninstances and structural information. We also devise an effective latent\nreplacement approach to inject historical frames as priors for coherent\nlong-horizon rollouts. For action controllability, we incorporate a versatile\nset of controls from high-level intentions (command, goal point) to low-level\nmaneuvers (trajectory, angle, and speed) through an efficient learning\nstrategy. After large-scale training, the capabilities of Vista can seamlessly\ngeneralize to different scenarios. Extensive experiments on multiple datasets\nshow that Vista outperforms the most advanced general-purpose video generator\nin over 70% of comparisons and surpasses the best-performing driving world\nmodel by 55% in FID and 27% in FVD. Moreover, for the first time, we utilize\nthe capacity of Vista itself to establish a generalizable reward for real-world\naction evaluation without accessing the ground truth actions.\n","authors":["Shenyuan Gao","Jiazhi Yang","Li Chen","Kashyap Chitta","Yihang Qiu","Andreas Geiger","Jun Zhang","Hongyang Li"],"pdf_url":"https://arxiv.org/pdf/2405.17398v2.pdf","comment":"Code and model: https://github.com/OpenDriveLab/Vista, video demos:\n  https://vista-demo.github.io"},{"id":"http://arxiv.org/abs/2406.04303v1","updated":"2024-06-06T17:49:21Z","published":"2024-06-06T17:49:21Z","title":"Vision-LSTM: xLSTM as Generic Vision Backbone","summary":"  Transformers are widely used as generic backbones in computer vision, despite\ninitially introduced for natural language processing. Recently, the Long\nShort-Term Memory (LSTM) has been extended to a scalable and performant\narchitecture - the xLSTM - which overcomes long-standing LSTM limitations via\nexponential gating and parallelizable matrix memory structure. In this report,\nwe introduce Vision-LSTM (ViL), an adaption of the xLSTM building blocks to\ncomputer vision. ViL comprises a stack of xLSTM blocks where odd blocks process\nthe sequence of patch tokens from top to bottom while even blocks go from\nbottom to top. Experiments show that ViL holds promise to be further deployed\nas new generic backbone for computer vision architectures.\n","authors":["Benedikt Alkin","Maximilian Beck","Korbinian Pöppel","Sepp Hochreiter","Johannes Brandstetter"],"pdf_url":"https://arxiv.org/pdf/2406.04303v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.04301v1","updated":"2024-06-06T17:47:48Z","published":"2024-06-06T17:47:48Z","title":"Neural Surface Reconstruction from Sparse Views Using Epipolar Geometry","summary":"  This paper addresses the challenge of reconstructing surfaces from sparse\nview inputs, where ambiguity and occlusions due to missing information pose\nsignificant hurdles. We present a novel approach, named EpiS, that incorporates\nEpipolar information into the reconstruction process. Existing methods in\nsparse-view neural surface learning have mainly focused on mean and variance\nconsiderations using cost volumes for feature extraction. In contrast, our\nmethod aggregates coarse information from the cost volume into Epipolar\nfeatures extracted from multiple source views, enabling the generation of\nfine-grained Signal Distance Function (SDF)-aware features. Additionally, we\nemploy an attention mechanism along the line dimension to facilitate feature\nfusion based on the SDF feature. Furthermore, to address the information gaps\nin sparse conditions, we integrate depth information from monocular depth\nestimation using global and local regularization techniques. The global\nregularization utilizes a triplet loss function, while the local regularization\nemploys a derivative loss function. Extensive experiments demonstrate that our\napproach outperforms state-of-the-art methods, especially in cases with sparse\nand generalizable conditions.\n","authors":["Kaichen Zhou"],"pdf_url":"https://arxiv.org/pdf/2406.04301v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.01349v3","updated":"2024-06-06T17:39:50Z","published":"2024-06-03T14:13:13Z","title":"Unleashing Generalization of End-to-End Autonomous Driving with\n  Controllable Long Video Generation","summary":"  Using generative models to synthesize new data has become a de-facto standard\nin autonomous driving to address the data scarcity issue. Though existing\napproaches are able to boost perception models, we discover that these\napproaches fail to improve the performance of planning of end-to-end autonomous\ndriving models as the generated videos are usually less than 8 frames and the\nspatial and temporal inconsistencies are not negligible. To this end, we\npropose Delphi, a novel diffusion-based long video generation method with a\nshared noise modeling mechanism across the multi-views to increase spatial\nconsistency, and a feature-aligned module to achieves both precise\ncontrollability and temporal consistency. Our method can generate up to 40\nframes of video without loss of consistency which is about 5 times longer\ncompared with state-of-the-art methods. Instead of randomly generating new\ndata, we further design a sampling policy to let Delphi generate new data that\nare similar to those failure cases to improve the sample efficiency. This is\nachieved by building a failure-case driven framework with the help of\npre-trained visual language models. Our extensive experiment demonstrates that\nour Delphi generates a higher quality of long videos surpassing previous\nstate-of-the-art methods. Consequentially, with only generating 4% of the\ntraining dataset size, our framework is able to go beyond perception and\nprediction tasks, for the first time to the best of our knowledge, boost the\nplanning performance of the end-to-end autonomous driving model by a margin of\n25%.\n","authors":["Enhui Ma","Lijun Zhou","Tao Tang","Zhan Zhang","Dong Han","Junpeng Jiang","Kun Zhan","Peng Jia","Xianpeng Lang","Haiyang Sun","Di Lin","Kaicheng Yu"],"pdf_url":"https://arxiv.org/pdf/2406.01349v3.pdf","comment":"Project Page: https://westlake-autolab.github.io/delphi.github.io/, 8\n  figures"},{"id":"http://arxiv.org/abs/2406.04295v1","updated":"2024-06-06T17:39:09Z","published":"2024-06-06T17:39:09Z","title":"Everything to the Synthetic: Diffusion-driven Test-time Adaptation via\n  Synthetic-Domain Alignment","summary":"  Test-time adaptation (TTA) aims to enhance the performance of source-domain\npretrained models when tested on unknown shifted target domains. Traditional\nTTA methods primarily adapt model weights based on target data streams, making\nmodel performance sensitive to the amount and order of target data. Recently,\ndiffusion-driven TTA methods have demonstrated strong performance by using an\nunconditional diffusion model, which is also trained on the source domain to\ntransform target data into synthetic data as a source domain projection. This\nallows the source model to make predictions without weight adaptation. In this\npaper, we argue that the domains of the source model and the synthetic data in\ndiffusion-driven TTA methods are not aligned. To adapt the source model to the\nsynthetic domain of the unconditional diffusion model, we introduce a\nSynthetic-Domain Alignment (SDA) framework to fine-tune the source model with\nsynthetic data. Specifically, we first employ a conditional diffusion model to\ngenerate labeled samples, creating a synthetic dataset. Subsequently, we use\nthe aforementioned unconditional diffusion model to add noise to and denoise\neach sample before fine-tuning. This process mitigates the potential domain gap\nbetween the conditional and unconditional models. Extensive experiments across\nvarious models and benchmarks demonstrate that SDA achieves superior domain\nalignment and consistently outperforms existing diffusion-driven TTA methods.\nOur code is available at\nhttps://github.com/SHI-Labs/Diffusion-Driven-Test-Time-Adaptation-via-Synthetic-Domain-Alignment.\n","authors":["Jiayi Guo","Junhao Zhao","Chunjiang Ge","Chaoqun Du","Zanlin Ni","Shiji Song","Humphrey Shi","Gao Huang"],"pdf_url":"https://arxiv.org/pdf/2406.04295v1.pdf","comment":"GitHub:\n  https://github.com/SHI-Labs/Diffusion-Driven-Test-Time-Adaptation-via-Synthetic-Domain-Alignment"},{"id":"http://arxiv.org/abs/2406.04292v1","updated":"2024-06-06T17:37:47Z","published":"2024-06-06T17:37:47Z","title":"VISTA: Visualized Text Embedding For Universal Multi-Modal Retrieval","summary":"  Multi-modal retrieval becomes increasingly popular in practice. However, the\nexisting retrievers are mostly text-oriented, which lack the capability to\nprocess visual information. Despite the presence of vision-language models like\nCLIP, the current methods are severely limited in representing the text-only\nand image-only data. In this work, we present a new embedding model VISTA for\nuniversal multi-modal retrieval. Our work brings forth threefold technical\ncontributions. Firstly, we introduce a flexible architecture which extends a\npowerful text encoder with the image understanding capability by introducing\nvisual token embeddings. Secondly, we develop two data generation strategies,\nwhich bring high-quality composed image-text to facilitate the training of the\nembedding model. Thirdly, we introduce a multi-stage training algorithm, which\nfirst aligns the visual token embedding with the text encoder using massive\nweakly labeled data, and then develops multi-modal representation capability\nusing the generated composed image-text data. In our experiments, VISTA\nachieves superior performances across a variety of multi-modal retrieval tasks\nin both zero-shot and supervised settings. Our model, data, and source code are\navailable at https://github.com/FlagOpen/FlagEmbedding.\n","authors":["Junjie Zhou","Zheng Liu","Shitao Xiao","Bo Zhao","Yongping Xiong"],"pdf_url":"https://arxiv.org/pdf/2406.04292v1.pdf","comment":"Accepted to ACL 2024 main conference"},{"id":"http://arxiv.org/abs/2406.04287v1","updated":"2024-06-06T17:33:23Z","published":"2024-06-06T17:33:23Z","title":"SpectralZoom: Efficient Segmentation with an Adaptive Hyperspectral\n  Camera","summary":"  Hyperspectral image segmentation is crucial for many fields such as\nagriculture, remote sensing, biomedical imaging, battlefield sensing and\nastronomy. However, the challenge of hyper and multi spectral imaging is its\nlarge data footprint. We propose both a novel camera design and a vision\ntransformer-based (ViT) algorithm that alleviate both the captured data\nfootprint and the computational load for hyperspectral segmentation. Our camera\nis able to adaptively sample image regions or patches at different resolutions,\ninstead of capturing the entire hyperspectral cube at one high resolution. Our\nsegmentation algorithm works in concert with the camera, applying ViT-based\nsegmentation only to adaptively selected patches. We show results both in\nsimulation and on a real hardware platform demonstrating both accurate\nsegmentation results and reduced computational burden.\n","authors":["Jackson Arnold","Sophia Rossi","Chloe Petrosino","Ethan Mitchell","Sanjeev J. Koppal"],"pdf_url":"https://arxiv.org/pdf/2406.04287v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.04280v1","updated":"2024-06-06T17:26:40Z","published":"2024-06-06T17:26:40Z","title":"xMIL: Insightful Explanations for Multiple Instance Learning in\n  Histopathology","summary":"  Multiple instance learning (MIL) is an effective and widely used approach for\nweakly supervised machine learning. In histopathology, MIL models have achieved\nremarkable success in tasks like tumor detection, biomarker prediction, and\noutcome prognostication. However, MIL explanation methods are still lagging\nbehind, as they are limited to small bag sizes or disregard instance\ninteractions. We revisit MIL through the lens of explainable AI (XAI) and\nintroduce xMIL, a refined framework with more general assumptions. We\ndemonstrate how to obtain improved MIL explanations using layer-wise relevance\npropagation (LRP) and conduct extensive evaluation experiments on three toy\nsettings and four real-world histopathology datasets. Our approach consistently\noutperforms previous explanation attempts with particularly improved\nfaithfulness scores on challenging biomarker prediction tasks. Finally, we\nshowcase how xMIL explanations enable pathologists to extract insights from MIL\nmodels, representing a significant advance for knowledge discovery and model\ndebugging in digital histopathology.\n","authors":["Julius Hense","Mina Jamshidi Idaji","Oliver Eberle","Thomas Schnake","Jonas Dippel","Laure Ciernik","Oliver Buchstab","Andreas Mock","Frederick Klauschen","Klaus-Robert Müller"],"pdf_url":"https://arxiv.org/pdf/2406.04280v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.04277v1","updated":"2024-06-06T17:25:33Z","published":"2024-06-06T17:25:33Z","title":"VideoTetris: Towards Compositional Text-to-Video Generation","summary":"  Diffusion models have demonstrated great success in text-to-video (T2V)\ngeneration. However, existing methods may face challenges when handling complex\n(long) video generation scenarios that involve multiple objects or dynamic\nchanges in object numbers. To address these limitations, we propose\nVideoTetris, a novel framework that enables compositional T2V generation.\nSpecifically, we propose spatio-temporal compositional diffusion to precisely\nfollow complex textual semantics by manipulating and composing the attention\nmaps of denoising networks spatially and temporally. Moreover, we propose an\nenhanced video data preprocessing to enhance the training data regarding motion\ndynamics and prompt understanding, equipped with a new reference frame\nattention mechanism to improve the consistency of auto-regressive video\ngeneration. Extensive experiments demonstrate that our VideoTetris achieves\nimpressive qualitative and quantitative results in compositional T2V\ngeneration. Code is available at: https://github.com/YangLing0818/VideoTetris\n","authors":["Ye Tian","Ling Yang","Haotian Yang","Yuan Gao","Yufan Deng","Jingmin Chen","Xintao Wang","Zhaochen Yu","Xin Tao","Pengfei Wan","Di Zhang","Bin Cui"],"pdf_url":"https://arxiv.org/pdf/2406.04277v1.pdf","comment":"Code: https://github.com/YangLing0818/VideoTetris"},{"id":"http://arxiv.org/abs/2406.04273v1","updated":"2024-06-06T17:23:05Z","published":"2024-06-06T17:23:05Z","title":"ELFS: Enhancing Label-Free Coreset Selection via Clustering-based\n  Pseudo-Labeling","summary":"  High-quality human-annotated data is crucial for modern deep learning\npipelines, yet the human annotation process is both costly and time-consuming.\nGiven a constrained human labeling budget, selecting an informative and\nrepresentative data subset for labeling can significantly reduce human\nannotation effort. Well-performing state-of-the-art (SOTA) coreset selection\nmethods require ground-truth labels over the whole dataset, failing to reduce\nthe human labeling burden. Meanwhile, SOTA label-free coreset selection methods\ndeliver inferior performance due to poor geometry-based scores. In this paper,\nwe introduce ELFS, a novel label-free coreset selection method. ELFS employs\ndeep clustering to estimate data difficulty scores without ground-truth labels.\nFurthermore, ELFS uses a simple but effective double-end pruning method to\nmitigate bias on calculated scores, which further improves the performance on\nselected coresets. We evaluate ELFS on five vision benchmarks and show that\nELFS consistently outperforms SOTA label-free baselines. For instance, at a 90%\npruning rate, ELFS surpasses the best-performing baseline by 5.3% on CIFAR10\nand 7.1% on CIFAR100. Moreover, ELFS even achieves comparable performance to\nsupervised coreset selection at low pruning rates (e.g., 30% and 50%) on\nCIFAR10 and ImageNet-1K.\n","authors":["Haizhong Zheng","Elisa Tsai","Yifu Lu","Jiachen Sun","Brian R. Bartoldson","Bhavya Kailkhura","Atul Prakash"],"pdf_url":"https://arxiv.org/pdf/2406.04273v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.04264v1","updated":"2024-06-06T17:09:32Z","published":"2024-06-06T17:09:32Z","title":"MLVU: A Comprehensive Benchmark for Multi-Task Long Video Understanding","summary":"  The evaluation of Long Video Understanding (LVU) performance poses an\nimportant but challenging research problem. Despite previous efforts, the\nexisting video understanding benchmarks are severely constrained by several\nissues, especially the insufficient lengths of videos, a lack of diversity in\nvideo types and evaluation tasks, and the inappropriateness for evaluating LVU\nperformances. To address the above problems, we propose a new benchmark, called\nMLVU (Multi-task Long Video Understanding Benchmark), for the comprehensive and\nin-depth evaluation of LVU. MLVU presents the following critical values: 1) The\nsubstantial and flexible extension of video lengths, which enables the\nbenchmark to evaluate LVU performance across a wide range of durations. 2) The\ninclusion of various video genres, e.g., movies, surveillance footage,\negocentric videos, cartoons, game videos, etc., which reflects the models' LVU\nperformances in different scenarios. 3) The development of diversified\nevaluation tasks, which enables a comprehensive examination of MLLMs' key\nabilities in long-video understanding. The empirical study with 20 latest MLLMs\nreveals significant room for improvement in today's technique, as all existing\nmethods struggle with most of the evaluation tasks and exhibit severe\nperformance degradation when handling longer videos. Additionally, it suggests\nthat factors such as context length, image-understanding quality, and the\nchoice of LLM backbone can play critical roles in future advancements. We\nanticipate that MLVU will advance the research of long video understanding by\nproviding a comprehensive and in-depth analysis of MLLMs.\n","authors":["Junjie Zhou","Yan Shu","Bo Zhao","Boya Wu","Shitao Xiao","Xi Yang","Yongping Xiong","Bo Zhang","Tiejun Huang","Zheng Liu"],"pdf_url":"https://arxiv.org/pdf/2406.04264v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.04254v1","updated":"2024-06-06T17:00:10Z","published":"2024-06-06T17:00:10Z","title":"GeoGen: Geometry-Aware Generative Modeling via Signed Distance Functions","summary":"  We introduce a new generative approach for synthesizing 3D geometry and\nimages from single-view collections. Most existing approaches predict\nvolumetric density to render multi-view consistent images. By employing\nvolumetric rendering using neural radiance fields, they inherit a key\nlimitation: the generated geometry is noisy and unconstrained, limiting the\nquality and utility of the output meshes. To address this issue, we propose\nGeoGen, a new SDF-based 3D generative model trained in an end-to-end manner.\nInitially, we reinterpret the volumetric density as a Signed Distance Function\n(SDF). This allows us to introduce useful priors to generate valid meshes.\nHowever, those priors prevent the generative model from learning details,\nlimiting the applicability of the method to real-world scenarios. To alleviate\nthat problem, we make the transformation learnable and constrain the rendered\ndepth map to be consistent with the zero-level set of the SDF. Through the lens\nof adversarial training, we encourage the network to produce higher fidelity\ndetails on the output meshes. For evaluation, we introduce a synthetic dataset\nof human avatars captured from 360-degree camera angles, to overcome the\nchallenges presented by real-world datasets, which often lack 3D consistency\nand do not cover all camera angles. Our experiments on multiple datasets show\nthat GeoGen produces visually and quantitatively better geometry than the\nprevious generative models based on neural radiance fields.\n","authors":["Salvatore Esposito","Qingshan Xu","Kacper Kania","Charlie Hewitt","Octave Mariotti","Lohit Petikam","Julien Valentin","Arno Onken","Oisin Mac Aodha"],"pdf_url":"https://arxiv.org/pdf/2406.04254v1.pdf","comment":"Computer Vision and Pattern Recognition 2024"},{"id":"http://arxiv.org/abs/2406.04253v1","updated":"2024-06-06T16:58:00Z","published":"2024-06-06T16:58:00Z","title":"A Survey on 3D Human Avatar Modeling -- From Reconstruction to\n  Generation","summary":"  3D modeling has long been an important area in computer vision and computer\ngraphics. Recently, thanks to the breakthroughs in neural representations and\ngenerative models, we witnessed a rapid development of 3D modeling. 3D human\nmodeling, lying at the core of many real-world applications, such as gaming and\nanimation, has attracted significant attention. Over the past few years, a\nlarge body of work on creating 3D human avatars has been introduced, forming a\nnew and abundant knowledge base for 3D human modeling. The scale of the\nliterature makes it difficult for individuals to keep track of all the works.\nThis survey aims to provide a comprehensive overview of these emerging\ntechniques for 3D human avatar modeling, from both reconstruction and\ngeneration perspectives. Firstly, we review representative methods for 3D human\nreconstruction, including methods based on pixel-aligned implicit function,\nneural radiance field, and 3D Gaussian Splatting, etc. We then summarize\nrepresentative methods for 3D human generation, especially those using large\nlanguage models like CLIP, diffusion models, and various 3D representations,\nwhich demonstrate state-of-the-art performance. Finally, we discuss our\nreflection on existing methods and open challenges for 3D human avatar\nmodeling, shedding light on future research.\n","authors":["Ruihe Wang","Yukang Cao","Kai Han","Kwan-Yee K. Wong"],"pdf_url":"https://arxiv.org/pdf/2406.04253v1.pdf","comment":"30 pages, 21 figures"},{"id":"http://arxiv.org/abs/2406.04251v1","updated":"2024-06-06T16:55:07Z","published":"2024-06-06T16:55:07Z","title":"Localized Gaussian Point Management","summary":"  Point management is a critical component in optimizing 3D Gaussian Splatting\n(3DGS) models, as the point initiation (e.g., via structure from motion) is\ndistributionally inappropriate. Typically, the Adaptive Density Control (ADC)\nalgorithm is applied, leveraging view-averaged gradient magnitude thresholding\nfor point densification, opacity thresholding for pruning, and regular\nall-points opacity reset. However, we reveal that this strategy is limited in\ntackling intricate/special image regions (e.g., transparent) as it is unable to\nidentify all the 3D zones that require point densification, and lacking an\nappropriate mechanism to handle the ill-conditioned points with negative\nimpacts (occlusion due to false high opacity). To address these limitations, we\npropose a Localized Point Management (LPM) strategy, capable of identifying\nthose error-contributing zones in the highest demand for both point addition\nand geometry calibration. Zone identification is achieved by leveraging the\nunderlying multiview geometry constraints, with the guidance of image rendering\nerrors. We apply point densification in the identified zone, whilst resetting\nthe opacity of those points residing in front of these regions so that a new\nopportunity is created to correct ill-conditioned points. Serving as a\nversatile plugin, LPM can be seamlessly integrated into existing 3D Gaussian\nSplatting models. Experimental evaluation across both static 3D and dynamic 4D\nscenes validate the efficacy of our LPM strategy in boosting a variety of\nexisting 3DGS models both quantitatively and qualitatively. Notably, LPM\nimproves both vanilla 3DGS and SpaceTimeGS to achieve state-of-the-art\nrendering quality while retaining real-time speeds, outperforming on\nchallenging datasets such as Tanks & Temples and the Neural 3D Video Dataset.\n","authors":["Haosen Yang","Chenhao Zhang","Wenqing Wang","Marco Volino","Adrian Hilton","Li Zhang","Xiatian Zhu"],"pdf_url":"https://arxiv.org/pdf/2406.04251v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.04249v1","updated":"2024-06-06T16:52:42Z","published":"2024-06-06T16:52:42Z","title":"Conv-INR: Convolutional Implicit Neural Representation for Multimodal\n  Visual Signals","summary":"  Implicit neural representation (INR) has recently emerged as a promising\nparadigm for signal representations. Typically, INR is parameterized by a\nmultiplayer perceptron (MLP) which takes the coordinates as the inputs and\ngenerates corresponding attributes of a signal. However, MLP-based INRs face\ntwo critical issues: i) individually considering each coordinate while ignoring\nthe connections; ii) suffering from the spectral bias thus failing to learn\nhigh-frequency components. While target visual signals usually exhibit strong\nlocal structures and neighborhood dependencies, and high-frequency components\nare significant in these signals, the issues harm the representational capacity\nof INRs. This paper proposes Conv-INR, the first INR model fully based on\nconvolution. Due to the inherent attributes of convolution, Conv-INR can\nsimultaneously consider adjacent coordinates and learn high-frequency\ncomponents effectively. Compared to existing MLP-based INRs, Conv-INR has\nbetter representational capacity and trainability without requiring primary\nfunction expansion. We conduct extensive experiments on four tasks, including\nimage fitting, CT/MRI reconstruction, and novel view synthesis, Conv-INR all\nsignificantly surpasses existing MLP-based INRs, validating the effectiveness.\nFinally, we raise three reparameterization methods that can further enhance the\nperformance of the vanilla Conv-INR without introducing any extra inference\ncost.\n","authors":["Zhicheng Cai"],"pdf_url":"https://arxiv.org/pdf/2406.04249v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.04236v1","updated":"2024-06-06T16:35:36Z","published":"2024-06-06T16:35:36Z","title":"Understanding Information Storage and Transfer in Multi-modal Large\n  Language Models","summary":"  Understanding the mechanisms of information storage and transfer in\nTransformer-based models is important for driving model understanding progress.\nRecent work has studied these mechanisms for Large Language Models (LLMs),\nrevealing insights on how information is stored in a model's parameters and how\ninformation flows to and from these parameters in response to specific prompts.\nHowever, these studies have not yet been extended to Multi-modal Large Language\nModels (MLLMs). Given their expanding capabilities and real-world use, we start\nby studying one aspect of these models -- how MLLMs process information in a\nfactual visual question answering task. We use a constraint-based formulation\nwhich views a visual question as having a set of visual or textual constraints\nthat the model's generated answer must satisfy to be correct (e.g. What movie\ndirected by the director in this photo has won a Golden Globe?). Under this\nsetting, we contribute i) a method that extends causal information tracing from\npure language to the multi-modal setting, and ii) VQA-Constraints, a test-bed\nof 9.7K visual questions annotated with constraints. We use these tools to\nstudy two open-source MLLMs, LLaVa and multi-modal Phi-2. Our key findings show\nthat these MLLMs rely on MLP and self-attention blocks in much earlier layers\nfor information storage, compared to LLMs whose mid-layer MLPs are more\nimportant. We also show that a consistent small subset of visual tokens output\nby the vision encoder are responsible for transferring information from the\nimage to these causal blocks. We validate these mechanisms by introducing\nMultEdit, a model-editing algorithm that can correct errors and insert new\nlong-tailed information into MLLMs by targeting these causal blocks.\n","authors":["Samyadeep Basu","Martin Grayson","Cecily Morrison","Besmira Nushi","Soheil Feizi","Daniela Massiceti"],"pdf_url":"https://arxiv.org/pdf/2406.04236v1.pdf","comment":"20 pages"},{"id":"http://arxiv.org/abs/2406.04230v1","updated":"2024-06-06T16:30:41Z","published":"2024-06-06T16:30:41Z","title":"M3LEO: A Multi-Modal, Multi-Label Earth Observation Dataset Integrating\n  Interferometric SAR and RGB Data","summary":"  Satellite-based remote sensing has revolutionised the way we address global\nchallenges in a rapidly evolving world. Huge quantities of Earth Observation\n(EO) data are generated by satellite sensors daily, but processing these large\ndatasets for use in ML pipelines is technically and computationally\nchallenging. Specifically, different types of EO data are often hosted on a\nvariety of platforms, with differing availability for Python preprocessing\ntools. In addition, spatial alignment across data sources and data tiling can\npresent significant technical hurdles for novice users. While some preprocessed\nEO datasets exist, their content is often limited to optical or near-optical\nwavelength data, which is ineffective at night or in adverse weather\nconditions. Synthetic Aperture Radar (SAR), an active sensing technique based\non microwave length radiation, offers a viable alternative. However, the\napplication of machine learning to SAR has been limited due to a lack of\nML-ready data and pipelines, particularly for the full diversity of SAR data,\nincluding polarimetry, coherence and interferometry. We introduce M3LEO, a\nmulti-modal, multi-label EO dataset that includes polarimetric,\ninterferometric, and coherence SAR data derived from Sentinel-1, alongside\nSentinel-2 RGB imagery and a suite of labelled tasks for model evaluation.\nM3LEO spans 17.5TB and contains approximately 10M data chips across six\ngeographic regions. The dataset is complemented by a flexible PyTorch Lightning\nframework, with configuration management using Hydra. We provide tools to\nprocess any dataset available on popular platforms such as Google Earth Engine\nfor integration with our framework. Initial experiments validate the utility of\nour data and framework, showing that SAR imagery contains information\nadditional to that extractable from RGB data. Data at huggingface.co/M3LEO, and\ncode at github.com/spaceml-org/M3LEO.\n","authors":["Matthew J Allen","Francisco Dorr","Joseph Alejandro Gallego Mejia","Laura Martínez-Ferrer","Anna Jungbluth","Freddie Kalaitzis","Raúl Ramos-Pollán"],"pdf_url":"https://arxiv.org/pdf/2406.04230v1.pdf","comment":"9 pages, 2 figures"},{"id":"http://arxiv.org/abs/2406.04227v1","updated":"2024-06-06T16:28:04Z","published":"2024-06-06T16:28:04Z","title":"R-CONV: An Analytical Approach for Efficient Data Reconstruction via\n  Convolutional Gradients","summary":"  In the effort to learn from extensive collections of distributed data,\nfederated learning has emerged as a promising approach for preserving privacy\nby using a gradient-sharing mechanism instead of exchanging raw data. However,\nrecent studies show that private training data can be leaked through many\ngradient attacks. While previous analytical-based attacks have successfully\nreconstructed input data from fully connected layers, their effectiveness\ndiminishes when applied to convolutional layers. This paper introduces an\nadvanced data leakage method to efficiently exploit convolutional layers'\ngradients. We present a surprising finding: even with non-fully invertible\nactivation functions, such as ReLU, we can analytically reconstruct training\nsamples from the gradients. To the best of our knowledge, this is the first\nanalytical approach that successfully reconstructs convolutional layer inputs\ndirectly from the gradients, bypassing the need to reconstruct layers' outputs.\nPrior research has mainly concentrated on the weight constraints of convolution\nlayers, overlooking the significance of gradient constraints. Our findings\ndemonstrate that existing analytical methods used to estimate the risk of\ngradient attacks lack accuracy. In some layers, attacks can be launched with\nless than 5% of the reported constraints.\n","authors":["Tamer Ahmed Eltaras","Qutaibah Malluhi","Alessandro Savino","Stefano Di Carlo","Adnan Qayyum","Junaid Qadir"],"pdf_url":"https://arxiv.org/pdf/2406.04227v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.00892v2","updated":"2024-06-06T16:21:08Z","published":"2024-05-01T22:33:45Z","title":"Wake Vision: A Large-scale, Diverse Dataset and Benchmark Suite for\n  TinyML Person Detection","summary":"  Tiny machine learning (TinyML), which enables machine learning applications\non extremely low-power devices, suffers from limited size and quality of\nrelevant datasets. To address this issue, we introduce Wake Vision, a\nlarge-scale, diverse dataset tailored for person detection, the canonical task\nfor TinyML visual sensing. Wake Vision comprises over 6 million images,\nrepresenting a hundredfold increase compared to the previous standard, and has\nundergone thorough quality filtering. We provide two Wake Vision training sets:\nWake Vision (Large) and Wake Vision (Quality), a smaller set with\nhigher-quality labels. Our results demonstrate that using the Wake Vision\n(Quality) training set produces more accurate models than the Wake Vision\n(Large) training set, strongly suggesting that label quality is more important\nthan quantity in our setting. We find use for the large training set for\npre-training and knowledge distillation. To minimize label errors that can\nobscure true model performance, we manually label the validation and test sets,\nimproving the test set error rate from 7.8% in the prior standard to only 2.2%.\nIn addition to the dataset, we provide a collection of five detailed benchmark\nsets to facilitate the evaluation of model quality in challenging real world\nscenarios that are often ignored when focusing solely on overall accuracy.\nThese novel fine-grained benchmarks assess model performance on specific\nsegments of the test data, such as varying lighting conditions, distances from\nthe camera, and demographic characteristics of subjects. Our results\ndemonstrate that using Wake Vision for training results in a 2.49% increase in\naccuracy compared to the established dataset. We also show the importance of\ndataset quality for low-capacity models and the value of dataset size for\nhigh-capacity models. wakevision.ai\n","authors":["Colby Banbury","Emil Njor","Matthew Stewart","Pete Warden","Manjunath Kudlur","Nat Jeffries","Xenofon Fafoutis","Vijay Janapa Reddi"],"pdf_url":"https://arxiv.org/pdf/2405.00892v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.04221v1","updated":"2024-06-06T16:20:07Z","published":"2024-06-06T16:20:07Z","title":"Matching Anything by Segmenting Anything","summary":"  The robust association of the same objects across video frames in complex\nscenes is crucial for many applications, especially Multiple Object Tracking\n(MOT). Current methods predominantly rely on labeled domain-specific video\ndatasets, which limits the cross-domain generalization of learned similarity\nembeddings. We propose MASA, a novel method for robust instance association\nlearning, capable of matching any objects within videos across diverse domains\nwithout tracking labels. Leveraging the rich object segmentation from the\nSegment Anything Model (SAM), MASA learns instance-level correspondence through\nexhaustive data transformations. We treat the SAM outputs as dense object\nregion proposals and learn to match those regions from a vast image collection.\nWe further design a universal MASA adapter which can work in tandem with\nfoundational segmentation or detection models and enable them to track any\ndetected objects. Those combinations present strong zero-shot tracking ability\nin complex domains. Extensive tests on multiple challenging MOT and MOTS\nbenchmarks indicate that the proposed method, using only unlabeled static\nimages, achieves even better performance than state-of-the-art methods trained\nwith fully annotated in-domain video sequences, in zero-shot association.\nProject Page: https://matchinganything.github.io/\n","authors":["Siyuan Li","Lei Ke","Martin Danelljan","Luigi Piccinelli","Mattia Segu","Luc Van Gool","Fisher Yu"],"pdf_url":"https://arxiv.org/pdf/2406.04221v1.pdf","comment":"CVPR 2024 Highlight. code at: https://github.com/siyuanliii/masa"},{"id":"http://arxiv.org/abs/2402.12451v2","updated":"2024-06-06T16:13:43Z","published":"2024-02-19T19:01:01Z","title":"The Revolution of Multimodal Large Language Models: A Survey","summary":"  Connecting text and visual modalities plays an essential role in generative\nintelligence. For this reason, inspired by the success of large language\nmodels, significant research efforts are being devoted to the development of\nMultimodal Large Language Models (MLLMs). These models can seamlessly integrate\nvisual and textual modalities, while providing a dialogue-based interface and\ninstruction-following capabilities. In this paper, we provide a comprehensive\nreview of recent visual-based MLLMs, analyzing their architectural choices,\nmultimodal alignment strategies, and training techniques. We also conduct a\ndetailed analysis of these models across a wide range of tasks, including\nvisual grounding, image generation and editing, visual understanding, and\ndomain-specific applications. Additionally, we compile and describe training\ndatasets and evaluation benchmarks, conducting comparisons among existing\nmodels in terms of performance and computational requirements. Overall, this\nsurvey offers a comprehensive overview of the current state of the art, laying\nthe groundwork for future MLLMs.\n","authors":["Davide Caffagni","Federico Cocchi","Luca Barsellotti","Nicholas Moratelli","Sara Sarto","Lorenzo Baraldi","Lorenzo Baraldi","Marcella Cornia","Rita Cucchiara"],"pdf_url":"https://arxiv.org/pdf/2402.12451v2.pdf","comment":"ACL 2024 (Findings)"},{"id":"http://arxiv.org/abs/2406.04207v1","updated":"2024-06-06T16:04:30Z","published":"2024-06-06T16:04:30Z","title":"CDMamba: Remote Sensing Image Change Detection with Mamba","summary":"  Recently, the Mamba architecture based on state space models has demonstrated\nremarkable performance in a series of natural language processing tasks and has\nbeen rapidly applied to remote sensing change detection (CD) tasks. However,\nmost methods enhance the global receptive field by directly modifying the\nscanning mode of Mamba, neglecting the crucial role that local information\nplays in dense prediction tasks (e.g., CD). In this article, we propose a model\ncalled CDMamba, which effectively combines global and local features for\nhandling CD tasks. Specifically, the Scaled Residual ConvMamba (SRCM) block is\nproposed to utilize the ability of Mamba to extract global features and\nconvolution to enhance the local details, to alleviate the issue that current\nMamba-based methods lack detailed clues and are difficult to achieve fine\ndetection in dense prediction tasks. Furthermore, considering the\ncharacteristics of bi-temporal feature interaction required for CD, the\nAdaptive Global Local Guided Fusion (AGLGF) block is proposed to dynamically\nfacilitate the bi-temporal interaction guided by other temporal global/local\nfeatures. Our intuition is that more discriminative change features can be\nacquired with the guidance of other temporal features. Extensive experiments on\nthree datasets demonstrate that our proposed CDMamba outperforms the current\nstate-of-the-art methods. Our code will be open-sourced at\nhttps://github.com/zmoka-zht/CDMamba.\n","authors":["Haotian Zhang","Keyan Chen","Chenyang Liu","Hao Chen","Zhengxia Zou","Zhenwei Shi"],"pdf_url":"https://arxiv.org/pdf/2406.04207v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.04206v1","updated":"2024-06-06T16:04:06Z","published":"2024-06-06T16:04:06Z","title":"Diffusion-based image inpainting with internal learning","summary":"  Diffusion models are now the undisputed state-of-the-art for image generation\nand image restoration. However, they require large amounts of computational\npower for training and inference. In this paper, we propose lightweight\ndiffusion models for image inpainting that can be trained on a single image, or\na few images. We show that our approach competes with large state-of-the-art\nmodels in specific cases. We also show that training a model on a single image\nis particularly relevant for image acquisition modality that differ from the\nRGB images of standard learning databases. We show results in three different\ncontexts: texture images, line drawing images, and materials BRDF, for which we\nachieve state-of-the-art results in terms of realism, with a computational load\nthat is greatly reduced compared to concurrent methods.\n","authors":["Nicolas Cherel","Andrés Almansa","Yann Gousseau","Alasdair Newson"],"pdf_url":"https://arxiv.org/pdf/2406.04206v1.pdf","comment":"5 pages, 4 figures. EUSIPCO 2024"},{"id":"http://arxiv.org/abs/2406.04178v1","updated":"2024-06-06T15:35:41Z","published":"2024-06-06T15:35:41Z","title":"Encoding Semantic Priors into the Weights of Implicit Neural\n  Representation","summary":"  Implicit neural representation (INR) has recently emerged as a promising\nparadigm for signal representations, which takes coordinates as inputs and\ngenerates corresponding signal values. Since these coordinates contain no\nsemantic features, INR fails to take any semantic information into\nconsideration. However, semantic information has been proven critical in many\nvision tasks, especially for visual signal representation. This paper proposes\na reparameterization method termed as SPW, which encodes the semantic priors to\nthe weights of INR, thus making INR contain semantic information implicitly and\nenhancing its representational capacity. Specifically, SPW uses the Semantic\nNeural Network (SNN) to extract both low- and high-level semantic information\nof the target visual signal and generates the semantic vector, which is input\ninto the Weight Generation Network (WGN) to generate the weights of INR model.\nFinally, INR uses the generated weights with semantic priors to map the\ncoordinates to the signal values. After training, we only retain the generated\nweights while abandoning both SNN and WGN, thus SPW introduces no extra costs\nin inference. Experimental results show that SPW can improve the performance of\nvarious INR models significantly on various tasks, including image fitting, CT\nreconstruction, MRI reconstruction, and novel view synthesis. Further\nexperiments illustrate that model with SPW has lower weight redundancy and\nlearns more novel representations, validating the effectiveness of SPW.\n","authors":["Zhicheng Cai","Qiu Shen"],"pdf_url":"https://arxiv.org/pdf/2406.04178v1.pdf","comment":"ICME 2024"},{"id":"http://arxiv.org/abs/2406.04177v1","updated":"2024-06-06T15:35:25Z","published":"2024-06-06T15:35:25Z","title":"A Voxel-based Approach for Simulating Microbial Decomposition in Soil:\n  Comparison with LBM and Improvement of Morphological Models","summary":"  This study presents a new computational approach for simulating the microbial\ndecomposition of organic matter, from 3D micro-computed tomography (micro-CT)\nimages of soil. The method employs a valuated graph of connected voxels to\nsimulate transformation and diffusion processes involved in microbial\ndecomposition within the complex soil matrix. The resulting model can be\nadapted to simulate any diffusion-transformation processes in porous media. We\nimplemented parallelization strategies and explored different numerical\nmethods, including implicit, explicit, synchronous, and asynchronous schemes.\nTo validate our method, we compared simulation outputs with those provided by\nLBioS and by Mosaic models. LBioS uses a lattice-Boltzmann method for diffusion\nand Mosaic takes benefit of Pore Network Geometrical Modelling (PNGM) by means\nof geometrical primitives such as spheres and ellipsoids. This approach\nachieved comparable results to traditional LBM-based simulations, but required\nonly one-fourth of the computing time. Compared to Mosaic simulation, the\nproposed method is slower but more accurate and does not require any\ncalibration. Furthermore, we present a theoretical framework and an application\nexample to enhance PNGM-based simulations. This is accomplished by\napproximating the diffusional conductance coefficients using stochastic\ngradient descent and data generated by the current approach.\n","authors":["Mouad Klai","Olivier Monga","Mohamed Soufiane Jouini","Valérie Pot"],"pdf_url":"https://arxiv.org/pdf/2406.04177v1.pdf","comment":"Preprint submitted to IEEE Access"},{"id":"http://arxiv.org/abs/2406.00329v2","updated":"2024-06-06T15:27:12Z","published":"2024-06-01T07:08:45Z","title":"Whole Heart 3D+T Representation Learning Through Sparse 2D Cardiac MR\n  Images","summary":"  Cardiac Magnetic Resonance (CMR) imaging serves as the gold-standard for\nevaluating cardiac morphology and function. Typically, a multi-view CMR stack,\ncovering short-axis (SA) and 2/3/4-chamber long-axis (LA) views, is acquired\nfor a thorough cardiac assessment. However, efficiently streamlining the\ncomplex, high-dimensional 3D+T CMR data and distilling compact, coherent\nrepresentation remains a challenge. In this work, we introduce a whole-heart\nself-supervised learning framework that utilizes masked imaging modeling to\nautomatically uncover the correlations between spatial and temporal patches\nthroughout the cardiac stacks. This process facilitates the generation of\nmeaningful and well-clustered heart representations without relying on the\ntraditionally required, and often costly, labeled data. The learned heart\nrepresentation can be directly used for various downstream tasks. Furthermore,\nour method demonstrates remarkable robustness, ensuring consistent\nrepresentations even when certain CMR planes are missing/flawed. We train our\nmodel on 14,000 unlabeled CMR data from UK BioBank and evaluate it on 1,000\nannotated data. The proposed method demonstrates superior performance to\nbaselines in tasks that demand comprehensive 3D+T cardiac information, e.g.\ncardiac phenotype (ejection fraction and ventricle volume) prediction and\nmulti-plane/multi-frame CMR segmentation, highlighting its effectiveness in\nextracting comprehensive cardiac features that are both anatomically and\npathologically relevant.\n","authors":["Yundi Zhang","Chen Chen","Suprosanna Shit","Sophie Starck","Daniel Rueckert","Jiazhen Pan"],"pdf_url":"https://arxiv.org/pdf/2406.00329v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.07364v4","updated":"2024-06-06T15:24:15Z","published":"2023-12-12T15:33:08Z","title":"Collapse-Aware Triplet Decoupling for Adversarially Robust Image\n  Retrieval","summary":"  Adversarial training has achieved substantial performance in defending image\nretrieval against adversarial examples. However, existing studies in deep\nmetric learning (DML) still suffer from two major limitations: weak adversary\nand model collapse. In this paper, we address these two limitations by\nproposing Collapse-Aware TRIplet DEcoupling (CA-TRIDE). Specifically, TRIDE\nyields a stronger adversary by spatially decoupling the perturbation targets\ninto the anchor and the other candidates. Furthermore, CA prevents the\nconsequential model collapse, based on a novel metric, collapseness, which is\nincorporated into the optimization of perturbation. We also identify two\ndrawbacks of the existing robustness metric in image retrieval and propose a\nnew metric for a more reasonable robustness evaluation. Extensive experiments\non three datasets demonstrate that CA-TRIDE outperforms existing defense\nmethods in both conventional and new metrics. Codes are available at\nhttps://github.com/michaeltian108/CA-TRIDE.\n","authors":["Qiwei Tian","Chenhao Lin","Zhengyu Zhao","Qian Li","Chao Shen"],"pdf_url":"https://arxiv.org/pdf/2312.07364v4.pdf","comment":"Accepted by ICML2024"},{"id":"http://arxiv.org/abs/2405.05847v2","updated":"2024-06-06T15:22:31Z","published":"2024-05-09T15:34:15Z","title":"Learned feature representations are biased by complexity, learning\n  order, position, and more","summary":"  Representation learning, and interpreting learned representations, are key\nareas of focus in machine learning and neuroscience. Both fields generally use\nrepresentations as a means to understand or improve a system's computations. In\nthis work, however, we explore surprising dissociations between representation\nand computation that may pose challenges for such efforts. We create datasets\nin which we attempt to match the computational role that different features\nplay, while manipulating other properties of the features or the data. We train\nvarious deep learning architectures to compute these multiple abstract features\nabout their inputs. We find that their learned feature representations are\nsystematically biased towards representing some features more strongly than\nothers, depending upon extraneous properties such as feature complexity, the\norder in which features are learned, and the distribution of features over the\ninputs. For example, features that are simpler to compute or learned first tend\nto be represented more strongly and densely than features that are more complex\nor learned later, even if all features are learned equally well. We also\nexplore how these biases are affected by architectures, optimizers, and\ntraining regimes (e.g., in transformers, features decoded earlier in the output\nsequence also tend to be represented more strongly). Our results help to\ncharacterize the inductive biases of gradient-based representation learning.\nThese results also highlight a key challenge for interpretability $-$ or for\ncomparing the representations of models and brains $-$ disentangling extraneous\nbiases from the computationally important aspects of a system's internal\nrepresentations.\n","authors":["Andrew Kyle Lampinen","Stephanie C. Y. Chan","Katherine Hermann"],"pdf_url":"https://arxiv.org/pdf/2405.05847v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.04158v1","updated":"2024-06-06T15:18:59Z","published":"2024-06-06T15:18:59Z","title":"Sparse Multi-baseline SAR Cross-modal 3D Reconstruction of Vehicle\n  Targets","summary":"  Multi-baseline SAR 3D imaging faces significant challenges due to data\nsparsity. In recent years, deep learning techniques have achieved notable\nsuccess in enhancing the quality of sparse SAR 3D imaging. However, previous\nwork typically rely on full-aperture high-resolution radar images to supervise\nthe training of deep neural networks (DNNs), utilizing only single-modal\ninformation from radar data. Consequently, imaging performance is limited, and\nacquiring full-aperture data for multi-baseline SAR is costly and sometimes\nimpractical in real-world applications. In this paper, we propose a Cross-Modal\nReconstruction Network (CMR-Net), which integrates differentiable render and\ncross-modal supervision with optical images to reconstruct highly sparse\nmulti-baseline SAR 3D images of vehicle targets into visually structured and\nhigh-resolution images. We meticulously designed the network architecture and\ntraining strategies to enhance network generalization capability. Remarkably,\nCMR-Net, trained solely on simulated data, demonstrates high-resolution\nreconstruction capabilities on both publicly available simulation datasets and\nreal measured datasets, outperforming traditional sparse reconstruction\nalgorithms based on compressed sensing and other learning-based methods.\nAdditionally, using optical images as supervision provides a cost-effective way\nto build training datasets, reducing the difficulty of method dissemination.\nOur work showcases the broad prospects of deep learning in multi-baseline SAR\n3D imaging and offers a novel path for researching radar imaging based on\ncross-modal learning theory.\n","authors":["Da Li","Guoqiang Zhao","Houjun Sun","Jiacheng Bao"],"pdf_url":"https://arxiv.org/pdf/2406.04158v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.04155v1","updated":"2024-06-06T15:17:33Z","published":"2024-06-06T15:17:33Z","title":"Improving Physics-Augmented Continuum Neural Radiance Field-Based\n  Geometry-Agnostic System Identification with Lagrangian Particle Optimization","summary":"  Geometry-agnostic system identification is a technique for identifying the\ngeometry and physical properties of an object from video sequences without any\ngeometric assumptions. Recently, physics-augmented continuum neural radiance\nfields (PAC-NeRF) has demonstrated promising results for this technique by\nutilizing a hybrid Eulerian-Lagrangian representation, in which the geometry is\nrepresented by the Eulerian grid representations of NeRF, the physics is\ndescribed by a material point method (MPM), and they are connected via\nLagrangian particles. However, a notable limitation of PAC-NeRF is that its\nperformance is sensitive to the learning of the geometry from the first frames\nowing to its two-step optimization. First, the grid representations are\noptimized with the first frames of video sequences, and then the physical\nproperties are optimized through video sequences utilizing the fixed\nfirst-frame grid representations. This limitation can be critical when learning\nof the geometric structure is difficult, for example, in a few-shot (sparse\nview) setting. To overcome this limitation, we propose Lagrangian particle\noptimization (LPO), in which the positions and features of particles are\noptimized through video sequences in Lagrangian space. This method allows for\nthe optimization of the geometric structure across the entire video sequence\nwithin the physical constraints imposed by the MPM. The experimental results\ndemonstrate that the LPO is useful for geometric correction and physical\nidentification in sparse-view settings.\n","authors":["Takuhiro Kaneko"],"pdf_url":"https://arxiv.org/pdf/2406.04155v1.pdf","comment":"Accepted to CVPR 2024. Project page:\n  https://www.kecl.ntt.co.jp/people/kaneko.takuhiro/projects/lpo/"},{"id":"http://arxiv.org/abs/2212.13462v2","updated":"2024-06-06T15:12:31Z","published":"2022-12-27T12:09:16Z","title":"MVTN: Learning Multi-View Transformations for 3D Understanding","summary":"  Multi-view projection techniques have shown themselves to be highly effective\nin achieving top-performing results in the recognition of 3D shapes. These\nmethods involve learning how to combine information from multiple view-points.\nHowever, the camera view-points from which these views are obtained are often\nfixed for all shapes. To overcome the static nature of current multi-view\ntechniques, we propose learning these view-points. Specifically, we introduce\nthe Multi-View Transformation Network (MVTN), which uses differentiable\nrendering to determine optimal view-points for 3D shape recognition. As a\nresult, MVTN can be trained end-to-end with any multi-view network for 3D shape\nclassification. We integrate MVTN into a novel adaptive multi-view pipeline\nthat is capable of rendering both 3D meshes and point clouds. Our approach\ndemonstrates state-of-the-art performance in 3D classification and shape\nretrieval on several benchmarks (ModelNet40, ScanObjectNN, ShapeNet Core55).\nFurther analysis indicates that our approach exhibits improved robustness to\nocclusion compared to other methods. We also investigate additional aspects of\nMVTN, such as 2D pretraining and its use for segmentation. To support further\nresearch in this area, we have released MVTorch, a PyTorch library for 3D\nunderstanding and generation using multi-view projections.\n","authors":["Abdullah Hamdi","Faisal AlZahrani","Silvio Giancola","Bernard Ghanem"],"pdf_url":"https://arxiv.org/pdf/2212.13462v2.pdf","comment":"under review journal extension for the ICCV 2021 paper\n  arXiv:2011.13244"},{"id":"http://arxiv.org/abs/2309.06054v3","updated":"2024-06-06T15:09:52Z","published":"2023-09-12T08:45:25Z","title":"Breaking through the learning plateaus of in-context learning in\n  Transformer","summary":"  In-context learning, i.e., learning from context examples, is an impressive\nability of Transformer. Training Transformers to possess this in-context\nlearning skill is computationally intensive due to the occurrence of learning\nplateaus, which are periods within the training process where there is minimal\nor no enhancement in the model's in-context learning capability. To study the\nmechanism behind the learning plateaus, we conceptually seperate a component\nwithin the model's internal representation that is exclusively affected by the\nmodel's weights. We call this the \"weights component\", and the remainder is\nidentified as the \"context component\". By conducting meticulous and controlled\nexperiments on synthetic tasks, we note that the persistence of learning\nplateaus correlates with compromised functionality of the weights component.\nRecognizing the impaired performance of the weights component as a fundamental\nbehavior drives learning plateaus, we have developed three strategies to\nexpedite the learning of Transformers. The effectiveness of these strategies is\nfurther confirmed in natural language processing tasks. In conclusion, our\nresearch demonstrates the feasibility of cultivating a powerful in-context\nlearning ability within AI systems in an eco-friendly manner.\n","authors":["Jingwen Fu","Tao Yang","Yuwang Wang","Yan Lu","Nanning Zheng"],"pdf_url":"https://arxiv.org/pdf/2309.06054v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.04144v1","updated":"2024-06-06T15:08:41Z","published":"2024-06-06T15:08:41Z","title":"Redundancy-aware Action Spaces for Robot Learning","summary":"  Joint space and task space control are the two dominant action modes for\ncontrolling robot arms within the robot learning literature. Actions in joint\nspace provide precise control over the robot's pose, but tend to suffer from\ninefficient training; actions in task space boast data-efficient training but\nsacrifice the ability to perform tasks in confined spaces due to limited\ncontrol over the full joint configuration. This work analyses the criteria for\ndesigning action spaces for robot manipulation and introduces ER (End-effector\nRedundancy), a novel action space formulation that, by addressing the\nredundancies present in the manipulator, aims to combine the advantages of both\njoint and task spaces, offering fine-grained comprehensive control with\noveractuated robot arms whilst achieving highly efficient robot learning. We\npresent two implementations of ER, ERAngle (ERA) and ERJoint (ERJ), and we show\nthat ERJ in particular demonstrates superior performance across multiple\nsettings, especially when precise control over the robot configuration is\nrequired. We validate our results both in simulated and real robotic\nenvironments.\n","authors":["Pietro Mazzaglia","Nicholas Backshall","Xiao Ma","Stephen James"],"pdf_url":"https://arxiv.org/pdf/2406.04144v1.pdf","comment":"Published in the RA-L journal"},{"id":"http://arxiv.org/abs/2406.04138v1","updated":"2024-06-06T14:59:39Z","published":"2024-06-06T14:59:39Z","title":"The 3D-PC: a benchmark for visual perspective taking in humans and\n  machines","summary":"  Visual perspective taking (VPT) is the ability to perceive and reason about\nthe perspectives of others. It is an essential feature of human intelligence,\nwhich develops over the first decade of life and requires an ability to process\nthe 3D structure of visual scenes. A growing number of reports have indicated\nthat deep neural networks (DNNs) become capable of analyzing 3D scenes after\ntraining on large image datasets. We investigated if this emergent ability for\n3D analysis in DNNs is sufficient for VPT with the 3D perception challenge\n(3D-PC): a novel benchmark for 3D perception in humans and DNNs. The 3D-PC is\ncomprised of three 3D-analysis tasks posed within natural scene images: 1. a\nsimple test of object depth order, 2. a basic VPT task (VPT-basic), and 3.\nanother version of VPT (VPT-Strategy) designed to limit the effectiveness of\n\"shortcut\" visual strategies. We tested human participants (N=33) and linearly\nprobed or text-prompted over 300 DNNs on the challenge and found that nearly\nall of the DNNs approached or exceeded human accuracy in analyzing object depth\norder. Surprisingly, DNN accuracy on this task correlated with their object\nrecognition performance. In contrast, there was an extraordinary gap between\nDNNs and humans on VPT-basic. Humans were nearly perfect, whereas most DNNs\nwere near chance. Fine-tuning DNNs on VPT-basic brought them close to human\nperformance, but they, unlike humans, dropped back to chance when tested on\nVPT-perturb. Our challenge demonstrates that the training routines and\narchitectures of today's DNNs are well-suited for learning basic 3D properties\nof scenes and objects but are ill-suited for reasoning about these properties\nlike humans do. We release our 3D-PC datasets and code to help bridge this gap\nin 3D perception between humans and machines.\n","authors":["Drew Linsley","Peisen Zhou","Alekh Karkada Ashok","Akash Nagaraj","Gaurav Gaonkar","Francis E Lewis","Zygmunt Pizlo","Thomas Serre"],"pdf_url":"https://arxiv.org/pdf/2406.04138v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.08511v2","updated":"2024-06-06T14:56:47Z","published":"2023-09-15T16:17:54Z","title":"Generalised Diffusion Probabilistic Scale-Spaces","summary":"  Diffusion probabilistic models excel at sampling new images from learned\ndistributions. Originally motivated by drift-diffusion concepts from physics,\nthey apply image perturbations such as noise and blur in a forward process that\nresults in a tractable probability distribution. A corresponding learned\nreverse process generates images and can be conditioned on side information,\nwhich leads to a wide variety of practical applications. Most of the research\nfocus currently lies on practice-oriented extensions. In contrast, the\ntheoretical background remains largely unexplored, in particular the relations\nto drift-diffusion. In order to shed light on these connections to classical\nimage filtering, we propose a generalised scale-space theory for diffusion\nprobabilistic models. Moreover, we show conceptual and empirical connections to\ndiffusion and osmosis filters.\n","authors":["Pascal Peter"],"pdf_url":"https://arxiv.org/pdf/2309.08511v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.04129v1","updated":"2024-06-06T14:50:15Z","published":"2024-06-06T14:50:15Z","title":"LenslessFace: An End-to-End Optimized Lensless System for\n  Privacy-Preserving Face Verification","summary":"  Lensless cameras, innovatively replacing traditional lenses for ultra-thin,\nflat optics, encode light directly onto sensors, producing images that are not\nimmediately recognizable. This compact, lightweight, and cost-effective imaging\nsolution offers inherent privacy advantages, making it attractive for\nprivacy-sensitive applications like face verification. Typical lensless face\nverification adopts a two-stage process of reconstruction followed by\nverification, incurring privacy risks from reconstructed faces and high\ncomputational costs. This paper presents an end-to-end optimization approach\nfor privacy-preserving face verification directly on encoded lensless captures,\nensuring that the entire software pipeline remains encoded with no visible\nfaces as intermediate results. To achieve this, we propose several techniques\nto address unique challenges from the lensless setup which precludes\ntraditional face detection and alignment. Specifically, we propose a face\ncenter alignment scheme, an augmentation curriculum to build robustness against\nvariations, and a knowledge distillation method to smooth optimization and\nenhance performance. Evaluations under both simulation and real environment\ndemonstrate our method outperforms two-stage lensless verification while\nenhancing privacy and efficiency. Project website:\n\\url{lenslessface.github.io}.\n","authors":["Xin Cai","Hailong Zhang","Chenchen Wang","Wentao Liu","Jinwei Gu","Tianfan Xue"],"pdf_url":"https://arxiv.org/pdf/2406.04129v1.pdf","comment":"under review"},{"id":"http://arxiv.org/abs/2311.18610v2","updated":"2024-06-06T14:37:01Z","published":"2023-11-30T15:10:21Z","title":"DiffCAD: Weakly-Supervised Probabilistic CAD Model Retrieval and\n  Alignment from an RGB Image","summary":"  Perceiving 3D structures from RGB images based on CAD model primitives can\nenable an effective, efficient 3D object-based representation of scenes.\nHowever, current approaches rely on supervision from expensive annotations of\nCAD models associated with real images, and encounter challenges due to the\ninherent ambiguities in the task -- both in depth-scale ambiguity in monocular\nperception, as well as inexact matches of CAD database models to real\nobservations. We thus propose DiffCAD, the first weakly-supervised\nprobabilistic approach to CAD retrieval and alignment from an RGB image. We\nformulate this as a conditional generative task, leveraging diffusion to learn\nimplicit probabilistic models capturing the shape, pose, and scale of CAD\nobjects in an image. This enables multi-hypothesis generation of different\nplausible CAD reconstructions, requiring only a few hypotheses to characterize\nambiguities in depth/scale and inexact shape matches. Our approach is trained\nonly on synthetic data, leveraging monocular depth and mask estimates to enable\nrobust zero-shot adaptation to various real target domains. Despite being\ntrained solely on synthetic data, our multi-hypothesis approach can even\nsurpass the supervised state-of-the-art on the Scan2CAD dataset by 5.9% with 8\nhypotheses.\n","authors":["Daoyi Gao","Dávid Rozenberszki","Stefan Leutenegger","Angela Dai"],"pdf_url":"https://arxiv.org/pdf/2311.18610v2.pdf","comment":"SIGGRAPH 2024, Project page: https://daoyig.github.io/DiffCAD/"},{"id":"http://arxiv.org/abs/2406.04115v1","updated":"2024-06-06T14:36:02Z","published":"2024-06-06T14:36:02Z","title":"Global Parameterization-based Texture Space Optimization","summary":"  Texture mapping is a common technology in the area of computer graphics, it\nmaps the 3D surface space onto the 2D texture space. However, the loose texture\nspace will reduce the efficiency of data storage and GPU memory addressing in\nthe rendering process. Many of the existing methods focus on repacking given\ntextures, but they still suffer from high computational cost and hardly produce\na wholly tight texture space. In this paper, we propose a method to optimize\nthe texture space and produce a new texture mapping which is compact based on\nglobal parameterization. The proposed method is computationally robust and\nefficient. Experiments show the effectiveness of the proposed method and the\npotency in improving the storage and rendering efficiency.\n","authors":["Wei Chen","Yuxue Ren","Na Lei","Zhongxuan Luo","Xianfeng Gu"],"pdf_url":"https://arxiv.org/pdf/2406.04115v1.pdf","comment":"Preprint submitted to Comput. Math. Math. Phys"},{"id":"http://arxiv.org/abs/2406.04111v1","updated":"2024-06-06T14:28:43Z","published":"2024-06-06T14:28:43Z","title":"UrbanSARFloods: Sentinel-1 SLC-Based Benchmark Dataset for Urban and\n  Open-Area Flood Mapping","summary":"  Due to its cloud-penetrating capability and independence from solar\nillumination, satellite Synthetic Aperture Radar (SAR) is the preferred data\nsource for large-scale flood mapping, providing global coverage and including\nvarious land cover classes. However, most studies on large-scale SAR-derived\nflood mapping using deep learning algorithms have primarily focused on flooded\nopen areas, utilizing available open-access datasets (e.g., Sen1Floods11) and\nwith limited attention to urban floods. To address this gap, we introduce\n\\textbf{UrbanSARFloods}, a floodwater dataset featuring pre-processed\nSentinel-1 intensity data and interferometric coherence imagery acquired before\nand during flood events. It contains 8,879 $512\\times 512$ chips covering\n807,500 $km^2$ across 20 land cover classes and 5 continents, spanning 18 flood\nevents. We used UrbanSARFloods to benchmark existing state-of-the-art\nconvolutional neural networks (CNNs) for segmenting open and urban flood areas.\nOur findings indicate that prevalent approaches, including the Weighted\nCross-Entropy (WCE) loss and the application of transfer learning with\npretrained models, fall short in overcoming the obstacles posed by imbalanced\ndata and the constraints of a small training dataset. Urban flood detection\nremains challenging. Future research should explore strategies for addressing\nimbalanced data challenges and investigate transfer learning's potential for\nSAR-based large-scale flood mapping. Besides, expanding this dataset to include\nadditional flood events holds promise for enhancing its utility and\ncontributing to advancements in flood mapping techniques.\n","authors":["Jie Zhao","Zhitong Xiong","Xiao Xiang Zhu"],"pdf_url":"https://arxiv.org/pdf/2406.04111v1.pdf","comment":"Accepted by CVPR 2024 EarthVision Workshop"},{"id":"http://arxiv.org/abs/2406.04103v1","updated":"2024-06-06T14:20:21Z","published":"2024-06-06T14:20:21Z","title":"Multistep Distillation of Diffusion Models via Moment Matching","summary":"  We present a new method for making diffusion models faster to sample. The\nmethod distills many-step diffusion models into few-step models by matching\nconditional expectations of the clean data given noisy data along the sampling\ntrajectory. Our approach extends recently proposed one-step methods to the\nmulti-step case, and provides a new perspective by interpreting these\napproaches in terms of moment matching. By using up to 8 sampling steps, we\nobtain distilled models that outperform not only their one-step versions but\nalso their original many-step teacher models, obtaining new state-of-the-art\nresults on the Imagenet dataset. We also show promising results on a large\ntext-to-image model where we achieve fast generation of high resolution images\ndirectly in image space, without needing autoencoders or upsamplers.\n","authors":["Tim Salimans","Thomas Mensink","Jonathan Heek","Emiel Hoogeboom"],"pdf_url":"https://arxiv.org/pdf/2406.04103v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.04101v1","updated":"2024-06-06T14:16:03Z","published":"2024-06-06T14:16:03Z","title":"How Far Can We Compress Instant-NGP-Based NeRF?","summary":"  In recent years, Neural Radiance Field (NeRF) has demonstrated remarkable\ncapabilities in representing 3D scenes. To expedite the rendering process,\nlearnable explicit representations have been introduced for combination with\nimplicit NeRF representation, which however results in a large storage space\nrequirement. In this paper, we introduce the Context-based NeRF Compression\n(CNC) framework, which leverages highly efficient context models to provide a\nstorage-friendly NeRF representation. Specifically, we excavate both level-wise\nand dimension-wise context dependencies to enable probability prediction for\ninformation entropy reduction. Additionally, we exploit hash collision and\noccupancy grids as strong prior knowledge for better context modeling. To the\nbest of our knowledge, we are the first to construct and exploit context models\nfor NeRF compression. We achieve a size reduction of 100$\\times$ and 70$\\times$\nwith improved fidelity against the baseline Instant-NGP on Synthesic-NeRF and\nTanks and Temples datasets, respectively. Additionally, we attain 86.7\\% and\n82.3\\% storage size reduction against the SOTA NeRF compression method BiRF.\nOur code is available here: https://github.com/YihangChen-ee/CNC.\n","authors":["Yihang Chen","Qianyi Wu","Mehrtash Harandi","Jianfei Cai"],"pdf_url":"https://arxiv.org/pdf/2406.04101v1.pdf","comment":"Project Page: https://yihangchen-ee.github.io/project_cnc/ Code:\n  https://github.com/yihangchen-ee/cnc/. We further propose a 3DGS compression\n  method HAC, which is based on CNC:\n  https://yihangchen-ee.github.io/project_hac/"},{"id":"http://arxiv.org/abs/2406.04100v1","updated":"2024-06-06T14:15:15Z","published":"2024-06-06T14:15:15Z","title":"Class-Aware Cartilage Segmentation for Autonomous US-CT Registration in\n  Robotic Intercostal Ultrasound Imaging","summary":"  Ultrasound imaging has been widely used in clinical examinations owing to the\nadvantages of being portable, real-time, and radiation-free. Considering the\npotential of extensive deployment of autonomous examination systems in\nhospitals, robotic US imaging has attracted increased attention. However, due\nto the inter-patient variations, it is still challenging to have an optimal\npath for each patient, particularly for thoracic applications with limited\nacoustic windows, e.g., intercostal liver imaging. To address this problem, a\nclass-aware cartilage bone segmentation network with geometry-constraint\npost-processing is presented to capture patient-specific rib skeletons. Then, a\ndense skeleton graph-based non-rigid registration is presented to map the\nintercostal scanning path from a generic template to individual patients. By\nexplicitly considering the high-acoustic impedance bone structures, the\ntransferred scanning path can be precisely located in the intercostal space,\nenhancing the visibility of internal organs by reducing the acoustic shadow. To\nevaluate the proposed approach, the final path mapping performance is validated\non five distinct CTs and two volunteer US data, resulting in ten pairs of CT-US\ncombinations. Results demonstrate that the proposed graph-based registration\nmethod can robustly and precisely map the path from CT template to individual\npatients (Euclidean error: $2.21\\pm1.11~mm$).\n","authors":["Zhongliang Jiang","Yunfeng Kang","Yuan Bi","Xuesong Li","Chenyang Li","Nassir Navab"],"pdf_url":"https://arxiv.org/pdf/2406.04100v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.04090v1","updated":"2024-06-06T14:01:28Z","published":"2024-06-06T14:01:28Z","title":"Interpretable Lightweight Transformer via Unrolling of Learned Graph\n  Smoothness Priors","summary":"  We build interpretable and lightweight transformer-like neural networks by\nunrolling iterative optimization algorithms that minimize graph smoothness\npriors -- the quadratic graph Laplacian regularizer (GLR) and the $\\ell_1$-norm\ngraph total variation (GTV) -- subject to an interpolation constraint. The\ncrucial insight is that a normalized signal-dependent graph learning module\namounts to a variant of the basic self-attention mechanism in conventional\ntransformers. Unlike \"black-box\" transformers that require learning of large\nkey, query and value matrices to compute scaled dot products as affinities and\nsubsequent output embeddings, resulting in huge parameter sets, our unrolled\nnetworks employ shallow CNNs to learn low-dimensional features per node to\nestablish pairwise Mahalanobis distances and construct sparse similarity\ngraphs. At each layer, given a learned graph, the target interpolated signal is\nsimply a low-pass filtered output derived from the minimization of an assumed\ngraph smoothness prior, leading to a dramatic reduction in parameter count.\nExperiments for two image interpolation applications verify the restoration\nperformance, parameter efficiency and robustness to covariate shift of our\ngraph-based unrolled networks compared to conventional transformers.\n","authors":["Tam Thuc Do","Parham Eftekhar","Seyed Alireza Hosseini","Gene Cheung","Philip Chou"],"pdf_url":"https://arxiv.org/pdf/2406.04090v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.02265v2","updated":"2024-06-06T13:59:03Z","published":"2024-06-04T12:41:54Z","title":"Understanding Retrieval Robustness for Retrieval-Augmented Image\n  Captioning","summary":"  Recent advances in retrieval-augmented models for image captioning highlight\nthe benefit of retrieving related captions for efficient, lightweight models\nwith strong domain-transfer capabilities. While these models demonstrate the\nsuccess of retrieval augmentation, retrieval models are still far from perfect\nin practice: the retrieved information can sometimes mislead the model,\nresulting in incorrect generation and worse performance. In this paper, we\nanalyze the robustness of a retrieval-augmented captioning model SmallCap. Our\nanalysis shows that the model is sensitive to tokens that appear in the\nmajority of the retrieved captions, and the input attribution shows that those\ntokens are likely copied into the generated output. Given these findings, we\npropose to train the model by sampling retrieved captions from more diverse\nsets. This decreases the chance that the model learns to copy majority tokens,\nand improves both in-domain and cross-domain performance.\n","authors":["Wenyan Li","Jiaang Li","Rita Ramos","Raphael Tang","Desmond Elliott"],"pdf_url":"https://arxiv.org/pdf/2406.02265v2.pdf","comment":"9 pages, long paper at ACL 2024"},{"id":"http://arxiv.org/abs/2312.10104v3","updated":"2024-06-06T13:54:23Z","published":"2023-12-15T03:11:03Z","title":"Lever LM: Configuring In-Context Sequence to Lever Large Vision Language\n  Models","summary":"  As Archimedes famously said, ``Give me a lever long enough and a fulcrum on\nwhich to place it, and I shall move the world'', in this study, we propose to\nuse a tiny Language Model (LM), \\eg, a Transformer with 67M parameters, to\nlever much larger Vision-Language Models (LVLMs) with 9B parameters.\nSpecifically, we use this tiny \\textbf{Lever-LM} to configure effective\nin-context demonstration (ICD) sequences to improve the In-Context Learinng\n(ICL) performance of LVLMs. Previous studies show that diverse ICD\nconfigurations like the selection and ordering of the demonstrations heavily\naffect the ICL performance, highlighting the significance of configuring\neffective ICD sequences. Motivated by this and by re-considering the the\nprocess of configuring ICD sequence, we find this is a mirror process of human\nsentence composition and further assume that effective ICD configurations may\ncontain internal statistical patterns that can be captured by Lever-LM. Then a\ndataset with effective ICD sequences is constructed to train Lever-LM. After\ntraining, given novel queries, new ICD sequences are configured by the trained\nLever-LM to solve vision-language tasks through ICL. Experiments show that\nthese ICD sequences can improve the ICL performance of two LVLMs compared with\nsome strong baselines in Visual Question Answering and Image Captioning,\nvalidating that Lever-LM can really capture the statistical patterns for\nlevering LVLMs.\n","authors":["Xu Yang","Yingzhe Peng","Haoxuan Ma","Shuo Xu","Chi Zhang","Yucheng Han","Hanwang Zhang"],"pdf_url":"https://arxiv.org/pdf/2312.10104v3.pdf","comment":"17 pages, 6 figures"},{"id":"http://arxiv.org/abs/2310.06430v2","updated":"2024-06-06T13:52:04Z","published":"2023-10-10T08:54:14Z","title":"Conformal Prediction for Deep Classifier via Label Ranking","summary":"  Conformal prediction is a statistical framework that generates prediction\nsets containing ground-truth labels with a desired coverage guarantee. The\npredicted probabilities produced by machine learning models are generally\nmiscalibrated, leading to large prediction sets in conformal prediction. To\naddress this issue, we propose a novel algorithm named $\\textit{Sorted Adaptive\nPrediction Sets}$ (SAPS), which discards all the probability values except for\nthe maximum softmax probability. The key idea behind SAPS is to minimize the\ndependence of the non-conformity score on the probability values while\nretaining the uncertainty information. In this manner, SAPS can produce compact\nprediction sets and communicate instance-wise uncertainty. Extensive\nexperiments validate that SAPS not only lessens the prediction sets but also\nbroadly enhances the conditional coverage rate of prediction sets.\n","authors":["Jianguo Huang","Huajun Xi","Linjun Zhang","Huaxiu Yao","Yue Qiu","Hongxin Wei"],"pdf_url":"https://arxiv.org/pdf/2310.06430v2.pdf","comment":"Accepted by ICML 2024"},{"id":"http://arxiv.org/abs/2406.00670v2","updated":"2024-06-06T13:46:15Z","published":"2024-06-02T08:32:51Z","title":"Cascade-CLIP: Cascaded Vision-Language Embeddings Alignment for\n  Zero-Shot Semantic Segmentation","summary":"  Pre-trained vision-language models, e.g., CLIP, have been successfully\napplied to zero-shot semantic segmentation. Existing CLIP-based approaches\nprimarily utilize visual features from the last layer to align with text\nembeddings, while they neglect the crucial information in intermediate layers\nthat contain rich object details. However, we find that directly aggregating\nthe multi-level visual features weakens the zero-shot ability for novel\nclasses. The large differences between the visual features from different\nlayers make these features hard to align well with the text embeddings. We\nresolve this problem by introducing a series of independent decoders to align\nthe multi-level visual features with the text embeddings in a cascaded way,\nforming a novel but simple framework named Cascade-CLIP. Our Cascade-CLIP is\nflexible and can be easily applied to existing zero-shot semantic segmentation\nmethods. Experimental results show that our simple Cascade-CLIP achieves\nsuperior zero-shot performance on segmentation benchmarks, like COCO-Stuff,\nPascal-VOC, and Pascal-Context. Our code is available at:\nhttps://github.com/HVision-NKU/Cascade-CLIP\n","authors":["Yunheng Li","ZhongYu Li","Quansheng Zeng","Qibin Hou","Ming-Ming Cheng"],"pdf_url":"https://arxiv.org/pdf/2406.00670v2.pdf","comment":"Accepted by ICML 2024"},{"id":"http://arxiv.org/abs/2405.15769v2","updated":"2024-06-06T13:42:09Z","published":"2024-05-24T17:59:26Z","title":"FastDrag: Manipulate Anything in One Step","summary":"  Drag-based image editing using generative models provides precise control\nover image contents, enabling users to manipulate anything in an image with a\nfew clicks. However, prevailing methods typically adopt $n$-step iterations for\nlatent semantic optimization to achieve drag-based image editing, which is\ntime-consuming and limits practical applications. In this paper, we introduce a\nnovel one-step drag-based image editing method, i.e., FastDrag, to accelerate\nthe editing process. Central to our approach is a latent warpage function\n(LWF), which simulates the behavior of a stretched material to adjust the\nlocation of individual pixels within the latent space. This innovation achieves\none-step latent semantic optimization and hence significantly promotes editing\nspeeds. Meanwhile, null regions emerging after applying LWF are addressed by\nour proposed bilateral nearest neighbor interpolation (BNNI) strategy. This\nstrategy interpolates these regions using similar features from neighboring\nareas, thus enhancing semantic integrity. Additionally, a\nconsistency-preserving strategy is introduced to maintain the consistency\nbetween the edited and original images by adopting semantic information from\nthe original image, saved as key and value pairs in self-attention module\nduring diffusion inversion, to guide the diffusion sampling. Our FastDrag is\nvalidated on the DragBench dataset, demonstrating substantial improvements in\nprocessing time over existing methods, while achieving enhanced editing\nperformance. Project page: https://fastdrag-site.github.io/ .\n","authors":["Xuanjia Zhao","Jian Guan","Congyi Fan","Dongli Xu","Youtian Lin","Haiwei Pan","Pengming Feng"],"pdf_url":"https://arxiv.org/pdf/2405.15769v2.pdf","comment":"13 pages, 13 figures, Project page: https://fastdrag-site.github.io/"},{"id":"http://arxiv.org/abs/2402.04788v2","updated":"2024-06-06T13:38:13Z","published":"2024-02-07T12:28:32Z","title":"MLLM-as-a-Judge: Assessing Multimodal LLM-as-a-Judge with\n  Vision-Language Benchmark","summary":"  Multimodal Large Language Models (MLLMs) have gained significant attention\nrecently, showing remarkable potential in artificial general intelligence.\nHowever, assessing the utility of MLLMs presents considerable challenges,\nprimarily due to the absence of multimodal benchmarks that align with human\npreferences. Drawing inspiration from the concept of LLM-as-a-Judge within\nLLMs, this paper introduces a novel benchmark, termed MLLM-as-a-Judge, to\nassess the ability of MLLMs in assisting judges across diverse modalities,\nencompassing three distinct tasks: Scoring Evaluation, Pair Comparison, and\nBatch Ranking. Our study reveals that, while MLLMs demonstrate remarkable\nhuman-like discernment in Pair Comparison, there is a significant divergence\nfrom human preferences in Scoring Evaluation and Batch Ranking. Furthermore, a\ncloser examination reveals persistent challenges in the judgment capacities of\nLLMs, including diverse biases, hallucinatory responses, and inconsistencies in\njudgment, even in advanced models such as GPT-4V. These findings emphasize the\npressing need for enhancements and further research efforts to be undertaken\nbefore regarding MLLMs as fully reliable evaluators. In light of this, we\nadvocate for additional efforts dedicated to supporting the continuous\ndevelopment within the domain of MLLM functioning as judges. The code and\ndataset are publicly available at our project homepage:\n\\url{https://mllm-judge.github.io/}.\n","authors":["Dongping Chen","Ruoxi Chen","Shilin Zhang","Yinuo Liu","Yaochen Wang","Huichi Zhou","Qihui Zhang","Pan Zhou","Yao Wan","Lichao Sun"],"pdf_url":"https://arxiv.org/pdf/2402.04788v2.pdf","comment":"ICML 2024 (Oral)"},{"id":"http://arxiv.org/abs/2403.07746v2","updated":"2024-06-06T13:34:38Z","published":"2024-03-12T15:28:51Z","title":"Unleashing HyDRa: Hybrid Fusion, Depth Consistency and Radar for Unified\n  3D Perception","summary":"  Low-cost, vision-centric 3D perception systems for autonomous driving have\nmade significant progress in recent years, narrowing the gap to expensive\nLiDAR-based methods. The primary challenge in becoming a fully reliable\nalternative lies in robust depth prediction capabilities, as camera-based\nsystems struggle with long detection ranges and adverse lighting and weather\nconditions. In this work, we introduce HyDRa, a novel camera-radar fusion\narchitecture for diverse 3D perception tasks. Building upon the principles of\ndense BEV (Bird's Eye View)-based architectures, HyDRa introduces a hybrid\nfusion approach to combine the strengths of complementary camera and radar\nfeatures in two distinct representation spaces. Our Height Association\nTransformer module leverages radar features already in the perspective view to\nproduce more robust and accurate depth predictions. In the BEV, we refine the\ninitial sparse representation by a Radar-weighted Depth Consistency. HyDRa\nachieves a new state-of-the-art for camera-radar fusion of 64.2 NDS (+1.8) and\n58.4 AMOTA (+1.5) on the public nuScenes dataset. Moreover, our new\nsemantically rich and spatially accurate BEV features can be directly converted\ninto a powerful occupancy representation, beating all previous camera-based\nmethods on the Occ3D benchmark by an impressive 3.7 mIoU. Code and models are\navailable at https://github.com/phi-wol/hydra.\n","authors":["Philipp Wolters","Johannes Gilg","Torben Teepe","Fabian Herzog","Anouar Laouichi","Martin Hofmann","Gerhard Rigoll"],"pdf_url":"https://arxiv.org/pdf/2403.07746v2.pdf","comment":"10 pages, 4 figures Added eval on VoD"},{"id":"http://arxiv.org/abs/2402.03412v2","updated":"2024-06-06T13:30:40Z","published":"2024-02-05T16:11:04Z","title":"See More Details: Efficient Image Super-Resolution by Experts Mining","summary":"  Reconstructing high-resolution (HR) images from low-resolution (LR) inputs\nposes a significant challenge in image super-resolution (SR). While recent\napproaches have demonstrated the efficacy of intricate operations customized\nfor various objectives, the straightforward stacking of these disparate\noperations can result in a substantial computational burden, hampering their\npractical utility. In response, we introduce SeemoRe, an efficient SR model\nemploying expert mining. Our approach strategically incorporates experts at\ndifferent levels, adopting a collaborative methodology. At the macro scale, our\nexperts address rank-wise and spatial-wise informative features, providing a\nholistic understanding. Subsequently, the model delves into the subtleties of\nrank choice by leveraging a mixture of low-rank experts. By tapping into\nexperts specialized in distinct key factors crucial for accurate SR, our model\nexcels in uncovering intricate intra-feature details. This collaborative\napproach is reminiscent of the concept of \"see more\", allowing our model to\nachieve an optimal performance with minimal computational costs in efficient\nsettings. The source will be publicly made available at\nhttps://github.com/eduardzamfir/seemoredetails\n","authors":["Eduard Zamfir","Zongwei Wu","Nancy Mehta","Yulun Zhang","Radu Timofte"],"pdf_url":"https://arxiv.org/pdf/2402.03412v2.pdf","comment":"Accepted at ICML 2024"},{"id":"http://arxiv.org/abs/2405.16849v2","updated":"2024-06-06T13:27:38Z","published":"2024-05-27T05:49:12Z","title":"Sync4D: Video Guided Controllable Dynamics for Physics-Based 4D\n  Generation","summary":"  In this work, we introduce a novel approach for creating controllable\ndynamics in 3D-generated Gaussians using casually captured reference videos.\nOur method transfers the motion of objects from reference videos to a variety\nof generated 3D Gaussians across different categories, ensuring precise and\ncustomizable motion transfer. We achieve this by employing blend skinning-based\nnon-parametric shape reconstruction to extract the shape and motion of\nreference objects. This process involves segmenting the reference objects into\nmotion-related parts based on skinning weights and establishing shape\ncorrespondences with generated target shapes. To address shape and temporal\ninconsistencies prevalent in existing methods, we integrate physical\nsimulation, driving the target shapes with matched motion. This integration is\noptimized through a displacement loss to ensure reliable and genuine dynamics.\nOur approach supports diverse reference inputs, including humans, quadrupeds,\nand articulated objects, and can generate dynamics of arbitrary length,\nproviding enhanced fidelity and applicability. Unlike methods heavily reliant\non diffusion video generation models, our technique offers specific and\nhigh-quality motion transfer, maintaining both shape integrity and temporal\nconsistency.\n","authors":["Zhoujie Fu","Jiacheng Wei","Wenhao Shen","Chaoyue Song","Xiaofeng Yang","Fayao Liu","Xulei Yang","Guosheng Lin"],"pdf_url":"https://arxiv.org/pdf/2405.16849v2.pdf","comment":"Our project page: https://sync4dphys.github.io/"},{"id":"http://arxiv.org/abs/2401.13388v3","updated":"2024-06-06T13:25:09Z","published":"2024-01-24T11:36:44Z","title":"UNIMO-G: Unified Image Generation through Multimodal Conditional\n  Diffusion","summary":"  Existing text-to-image diffusion models primarily generate images from text\nprompts. However, the inherent conciseness of textual descriptions poses\nchallenges in faithfully synthesizing images with intricate details, such as\nspecific entities or scenes. This paper presents UNIMO-G, a simple multimodal\nconditional diffusion framework that operates on multimodal prompts with\ninterleaved textual and visual inputs, which demonstrates a unified ability for\nboth text-driven and subject-driven image generation. UNIMO-G comprises two\ncore components: a Multimodal Large Language Model (MLLM) for encoding\nmultimodal prompts, and a conditional denoising diffusion network for\ngenerating images based on the encoded multimodal input. We leverage a\ntwo-stage training strategy to effectively train the framework: firstly\npre-training on large-scale text-image pairs to develop conditional image\ngeneration capabilities, and then instruction tuning with multimodal prompts to\nachieve unified image generation proficiency. A well-designed data processing\npipeline involving language grounding and image segmentation is employed to\nconstruct multi-modal prompts. UNIMO-G excels in both text-to-image generation\nand zero-shot subject-driven synthesis, and is notably effective in generating\nhigh-fidelity images from complex multimodal prompts involving multiple image\nentities.\n","authors":["Wei Li","Xue Xu","Jiachen Liu","Xinyan Xiao"],"pdf_url":"https://arxiv.org/pdf/2401.13388v3.pdf","comment":"Accepted by ACL 2024, Main Conference, Long Paper"},{"id":"http://arxiv.org/abs/2406.04050v1","updated":"2024-06-06T13:17:24Z","published":"2024-06-06T13:17:24Z","title":"Semmeldetector: Application of Machine Learning in Commercial Bakeries","summary":"  The Semmeldetector, is a machine learning application that utilizes object\ndetection models to detect, classify and count baked goods in images. Our\napplication allows commercial bakers to track unsold baked goods, which allows\nthem to optimize production and increase resource efficiency. We compiled a\ndataset comprising 1151 images that distinguishes between 18 different types of\nbaked goods to train our detection models. To facilitate model training, we\nused a Copy-Paste augmentation pipeline to expand our dataset. We trained the\nstate-of-the-art object detection model YOLOv8 on our detection task. We tested\nthe impact of different training data, model scale, and online image\naugmentation pipelines on model performance. Our overall best performing model,\nachieved an AP@0.5 of 89.1% on our test set. Based on our results, we conclude\nthat machine learning can be a valuable tool even for unforeseen industries\nlike bakeries, even with very limited datasets.\n","authors":["Thomas H. Schmitt","Maximilian Bundscherer","Tobias Bocklet"],"pdf_url":"https://arxiv.org/pdf/2406.04050v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.04039v1","updated":"2024-06-06T13:05:32Z","published":"2024-06-06T13:05:32Z","title":"Shaping History: Advanced Machine Learning Techniques for the Analysis\n  and Dating of Cuneiform Tablets over Three Millennia","summary":"  Cuneiform tablets, emerging in ancient Mesopotamia around the late fourth\nmillennium BCE, represent one of humanity's earliest writing systems.\nCharacterized by wedge-shaped marks on clay tablets, these artifacts provided\ninsight into Mesopotamian civilization across various domains. Traditionally,\nthe analysis and dating of these tablets rely on subjective assessment of shape\nand writing style, leading to uncertainties in pinpointing their exact temporal\norigins. Recent advances in digitization have revolutionized the study of\ncuneiform by enhancing accessibility and analytical capabilities. Our research\nuniquely focuses on the silhouette of tablets as significant indicators of\ntheir historical periods, diverging from most studies that concentrate on\ntextual content. Utilizing an unprecedented dataset of over 94,000 images from\nthe Cuneiform Digital Library Initiative collection, we apply deep learning\nmethods to classify cuneiform tablets, covering over 3,000 years of history. By\nleveraging statistical, computational techniques, and generative modeling\nthrough Variational Auto-Encoders (VAEs), we achieve substantial advancements\nin the automatic classification of these ancient documents, focusing on the\ntablets' silhouettes as key predictors. Our classification approach begins with\na Decision Tree using height-to-width ratios and culminates with a ResNet50\nmodel, achieving a 61% macro F1-score for tablet silhouettes. Moreover, we\nintroduce novel VAE-powered tools to enhance explainability and enable\nresearchers to explore changes in tablet shapes across different eras and\ngenres. This research contributes to document analysis and diplomatics by\ndemonstrating the value of large-scale data analysis combined with statistical\nmethods. These insights offer valuable tools for historians and epigraphists,\nenriching our understanding of cuneiform tablets and the cultures that produced\nthem.\n","authors":["Danielle Kapon","Michael Fire","Shai Gordin"],"pdf_url":"https://arxiv.org/pdf/2406.04039v1.pdf","comment":"24 pages, 18 figures"},{"id":"http://arxiv.org/abs/2406.04032v1","updated":"2024-06-06T13:02:00Z","published":"2024-06-06T13:02:00Z","title":"Zero-Painter: Training-Free Layout Control for Text-to-Image Synthesis","summary":"  We present Zero-Painter, a novel training-free framework for\nlayout-conditional text-to-image synthesis that facilitates the creation of\ndetailed and controlled imagery from textual prompts. Our method utilizes\nobject masks and individual descriptions, coupled with a global text prompt, to\ngenerate images with high fidelity. Zero-Painter employs a two-stage process\ninvolving our novel Prompt-Adjusted Cross-Attention (PACA) and Region-Grouped\nCross-Attention (ReGCA) blocks, ensuring precise alignment of generated objects\nwith textual prompts and mask shapes. Our extensive experiments demonstrate\nthat Zero-Painter surpasses current state-of-the-art methods in preserving\ntextual details and adhering to mask shapes.\n","authors":["Marianna Ohanyan","Hayk Manukyan","Zhangyang Wang","Shant Navasardyan","Humphrey Shi"],"pdf_url":"https://arxiv.org/pdf/2406.04032v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.04031v1","updated":"2024-06-06T13:00:42Z","published":"2024-06-06T13:00:42Z","title":"Jailbreak Vision Language Models via Bi-Modal Adversarial Prompt","summary":"  In the realm of large vision language models (LVLMs), jailbreak attacks serve\nas a red-teaming approach to bypass guardrails and uncover safety implications.\nExisting jailbreaks predominantly focus on the visual modality, perturbing\nsolely visual inputs in the prompt for attacks. However, they fall short when\nconfronted with aligned models that fuse visual and textual features\nsimultaneously for generation. To address this limitation, this paper\nintroduces the Bi-Modal Adversarial Prompt Attack (BAP), which executes\njailbreaks by optimizing textual and visual prompts cohesively. Initially, we\nadversarially embed universally harmful perturbations in an image, guided by a\nfew-shot query-agnostic corpus (e.g., affirmative prefixes and negative\ninhibitions). This process ensures that image prompt LVLMs to respond\npositively to any harmful queries. Subsequently, leveraging the adversarial\nimage, we optimize textual prompts with specific harmful intent. In particular,\nwe utilize a large language model to analyze jailbreak failures and employ\nchain-of-thought reasoning to refine textual prompts through a\nfeedback-iteration manner. To validate the efficacy of our approach, we\nconducted extensive evaluations on various datasets and LVLMs, demonstrating\nthat our method significantly outperforms other methods by large margins\n(+29.03% in attack success rate on average). Additionally, we showcase the\npotential of our attacks on black-box commercial LVLMs, such as Gemini and\nChatGLM.\n","authors":["Zonghao Ying","Aishan Liu","Tianyuan Zhang","Zhengmin Yu","Siyuan Liang","Xianglong Liu","Dacheng Tao"],"pdf_url":"https://arxiv.org/pdf/2406.04031v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.04002v1","updated":"2024-06-06T12:22:56Z","published":"2024-06-06T12:22:56Z","title":"3rd Place Solution for PVUW Challenge 2024: Video Panoptic Segmentation","summary":"  Video panoptic segmentation is an advanced task that extends panoptic\nsegmentation by applying its concept to video sequences. In the hope of\naddressing the challenge of video panoptic segmentation in diverse conditions,\nWe utilize DVIS++ as our baseline model and enhance it by introducing a\ncomprehensive approach centered on the query-wise ensemble, supplemented by\nadditional techniques. Our proposed approach achieved a VPQ score of 57.01 on\nthe VIPSeg test set, and ranked 3rd in the VPS track of the 3rd Pixel-level\nVideo Understanding in the Wild Challenge.\n","authors":["Ruipu Wu","Jifei Che","Han Li","Chengjing Wu","Ting Liu","Luoqi Liu"],"pdf_url":"https://arxiv.org/pdf/2406.04002v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.03999v1","updated":"2024-06-06T12:17:57Z","published":"2024-06-06T12:17:57Z","title":"Unveiling the Dynamics of Information Interplay in Supervised Learning","summary":"  In this paper, we use matrix information theory as an analytical tool to\nanalyze the dynamics of the information interplay between data representations\nand classification head vectors in the supervised learning process.\nSpecifically, inspired by the theory of Neural Collapse, we introduce matrix\nmutual information ratio (MIR) and matrix entropy difference ratio (HDR) to\nassess the interactions of data representation and class classification heads\nin supervised learning, and we determine the theoretical optimal values for MIR\nand HDR when Neural Collapse happens. Our experiments show that MIR and HDR can\neffectively explain many phenomena occurring in neural networks, for example,\nthe standard supervised training dynamics, linear mode connectivity, and the\nperformance of label smoothing and pruning. Additionally, we use MIR and HDR to\ngain insights into the dynamics of grokking, which is an intriguing phenomenon\nobserved in supervised training, where the model demonstrates generalization\ncapabilities long after it has learned to fit the training data. Furthermore,\nwe introduce MIR and HDR as loss terms in supervised and semi-supervised\nlearning to optimize the information interactions among samples and\nclassification heads. The empirical results provide evidence of the method's\neffectiveness, demonstrating that the utilization of MIR and HDR not only aids\nin comprehending the dynamics throughout the training process but can also\nenhances the training procedure itself.\n","authors":["Kun Song","Zhiquan Tan","Bochao Zou","Huimin Ma","Weiran Huang"],"pdf_url":"https://arxiv.org/pdf/2406.03999v1.pdf","comment":"Accepted by ICML 2024"},{"id":"http://arxiv.org/abs/2203.00387v2","updated":"2024-06-06T12:13:19Z","published":"2022-03-01T12:13:46Z","title":"Motion-aware Dynamic Graph Neural Network for Video Compressive Sensing","summary":"  Video snapshot compressive imaging (SCI) utilizes a 2D detector to capture\nsequential video frames and compress them into a single measurement. Various\nreconstruction methods have been developed to recover the high-speed video\nframes from the snapshot measurement. However, most existing reconstruction\nmethods are incapable of efficiently capturing long-range spatial and temporal\ndependencies, which are critical for video processing. In this paper, we\npropose a flexible and robust approach based on the graph neural network (GNN)\nto efficiently model non-local interactions between pixels in space and time\nregardless of the distance. Specifically, we develop a motion-aware dynamic GNN\nfor better video representation, i.e., represent each node as the aggregation\nof relative neighbors under the guidance of frame-by-frame motions, which\nconsists of motion-aware dynamic sampling, cross-scale node sampling, global\nknowledge integration, and graph aggregation. Extensive results on both\nsimulation and real data demonstrate both the effectiveness and efficiency of\nthe proposed approach, and the visualization illustrates the intrinsic dynamic\nsampling operations of our proposed model for boosting the video SCI\nreconstruction results. The code and model will be released.\n","authors":["Ruiying Lu","Ziheng Cheng","Bo Chen","Xin Yuan"],"pdf_url":"https://arxiv.org/pdf/2203.00387v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.03984v1","updated":"2024-06-06T11:57:25Z","published":"2024-06-06T11:57:25Z","title":"LNQ Challenge 2023: Learning Mediastinal Lymph Node Segmentation with a\n  Probabilistic Lymph Node Atlas","summary":"  The evaluation of lymph node metastases plays a crucial role in achieving\nprecise cancer staging, influencing subsequent decisions regarding treatment\noptions. Lymph node detection poses challenges due to the presence of unclear\nboundaries and the diverse range of sizes and morphological characteristics,\nmaking it a resource-intensive process. As part of the LNQ 2023 MICCAI\nchallenge, we propose the use of anatomical priors as a tool to address the\nchallenges that persist in mediastinal lymph node segmentation in combination\nwith the partial annotation of the challenge training data. The model ensemble\nusing all suggested modifications yields a Dice score of 0.6033 and segments\n57% of the ground truth lymph nodes, compared to 27% when training on CT only.\nSegmentation accuracy is improved significantly by incorporating a\nprobabilistic lymph node atlas in loss weighting and post-processing. The\nlargest performance gains are achieved by oversampling fully annotated data to\naccount for the partial annotation of the challenge training data, as well as\nadding additional data augmentation to address the high heterogeneity of the CT\nimages and lymph node appearance. Our code is available at\nhttps://github.com/MICAI-IMI-UzL/LNQ2023.\n","authors":["Sofija Engelson","Jan Ehrhardt","Timo Kepp","Joshua Niemeijer","Heinz Handels"],"pdf_url":"https://arxiv.org/pdf/2406.03984v1.pdf","comment":"Accepted for publication at the Journal of Machine Learning for\n  Biomedical Imaging (MELBA) https://melba-journal.org/2024:009"},{"id":"http://arxiv.org/abs/2403.09871v2","updated":"2024-06-06T11:42:24Z","published":"2024-03-14T21:01:06Z","title":"ThermoHands: A Benchmark for 3D Hand Pose Estimation from Egocentric\n  Thermal Images","summary":"  In this work, we present ThermoHands, a new benchmark for thermal image-based\negocentric 3D hand pose estimation, aimed at overcoming challenges like varying\nlighting conditions and obstructions (e.g., handwear). The benchmark includes a\nmulti-view and multi-spectral dataset collected from 28 subjects performing\nhand-object and hand-virtual interactions under diverse scenarios, accurately\nannotated with 3D hand poses through an automated process. We introduce a new\nbaseline method, TherFormer, utilizing dual transformer modules for effective\negocentric 3D hand pose estimation in thermal imagery. Our experimental results\nhighlight TherFormer's leading performance and affirm thermal imaging's\neffectiveness in enabling robust 3D hand pose estimation in adverse conditions.\n","authors":["Fangqiang Ding","Yunzhou Zhu","Xiangyu Wen","Gaowen Liu","Chris Xiaoxuan Lu"],"pdf_url":"https://arxiv.org/pdf/2403.09871v2.pdf","comment":"15 pages, 6 figures, 4 tables"},{"id":"http://arxiv.org/abs/2406.03961v1","updated":"2024-06-06T11:13:44Z","published":"2024-06-06T11:13:44Z","title":"LDM-RSIC: Exploring Distortion Prior with Latent Diffusion Models for\n  Remote Sensing Image Compression","summary":"  Deep learning-based image compression algorithms typically focus on designing\nencoding and decoding networks and improving the accuracy of entropy model\nestimation to enhance the rate-distortion (RD) performance. However, few\nalgorithms leverage the compression distortion prior from existing compression\nalgorithms to improve RD performance. In this paper, we propose a latent\ndiffusion model-based remote sensing image compression (LDM-RSIC) method, which\naims to enhance the final decoding quality of RS images by utilizing the\ngenerated distortion prior from a LDM. Our approach consists of two stages. In\nthe first stage, a self-encoder learns prior from the high-quality input image.\nIn the second stage, the prior is generated through an LDM, conditioned on the\ndecoded image of an existing learning-based image compression algorithm, to be\nused as auxiliary information for generating the texture-rich enhanced image.\nTo better utilize the prior, a channel attention and gate-based dynamic feature\nattention module (DFAM) is embedded into a Transformer-based multi-scale\nenhancement network (MEN) for image enhancement. Extensive experiments\ndemonstrate the proposed LDM-RSIC significantly outperforms existing\nstate-of-the-art traditional and learning-based image compression algorithms in\nterms of both subjective perception and objective metrics. Additionally, we use\nthe LDM-based scheme to improve the traditional image compression algorithm\nJPEG2000 and obtain 32.00% bit savings on the DOTA testing set. The code will\nbe available at https://github.com/mlkk518/LDM-RSIC.\n","authors":["Junhui Li","Jutao Li","Xingsong Hou","Huake Wang","Yutao Zhang","Yujie Dun","Wenke Sun"],"pdf_url":"https://arxiv.org/pdf/2406.03961v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.14792v2","updated":"2024-06-06T10:52:14Z","published":"2023-12-22T16:06:43Z","title":"The Rate-Distortion-Perception-Classification Tradeoff: Joint Source\n  Coding and Modulation via Inverse-Domain GANs","summary":"  The joint source-channel coding (JSCC) framework leverages deep learning to\nlearn from data the best codes for source and channel coding. When the output\nsignal, rather than being binary, is directly mapped onto the IQ domain\n(complex-valued), we call the resulting framework joint source coding and\nmodulation (JSCM). We consider a JSCM scenario and show the existence of a\nstrict tradeoff between channel rate, distortion, perception, and\nclassification accuracy, a tradeoff that we name RDPC. We then propose two\nimage compression methods to navigate that tradeoff: the RDPCO algorithm which,\nunder simple assumptions, directly solves the optimization problem\ncharacterizing the tradeoff, and an algorithm based on an inverse-domain\ngenerative adversarial network (ID-GAN), which is more general and achieves\nextreme compression. Simulation results corroborate the theoretical findings,\nshowing that both algorithms exhibit the RDPC tradeoff. They also demonstrate\nthat the proposed ID-GAN algorithm effectively balances image distortion,\nperception, and classification accuracy, and significantly outperforms\ntraditional separation-based methods and recent deep JSCM architectures in\nterms of one or more of these metrics.\n","authors":["Junli Fang","João F. C. Mota","Baoshan Lu","Weicheng Zhang","Xuemin Hong"],"pdf_url":"https://arxiv.org/pdf/2312.14792v2.pdf","comment":"Paper accepted in IEEE Transactions on Signal Processing"},{"id":"http://arxiv.org/abs/2402.02500v2","updated":"2024-06-06T10:32:40Z","published":"2024-02-04T14:18:45Z","title":"Point Cloud Matters: Rethinking the Impact of Different Observation\n  Spaces on Robot Learning","summary":"  In robot learning, the observation space is crucial due to the distinct\ncharacteristics of different modalities, which can potentially become a\nbottleneck alongside policy design. In this study, we explore the influence of\nvarious observation spaces on robot learning, focusing on three predominant\nmodalities: RGB, RGB-D, and point cloud. We introduce OBSBench, a benchmark\ncomprising two simulators and 125 tasks, along with standardized pipelines for\nvarious encoders and policy baselines. Extensive experiments on diverse\ncontact-rich manipulation tasks reveal a notable trend: point cloud-based\nmethods, even those with the simplest designs, frequently outperform their RGB\nand RGB-D counterparts. This trend persists in both scenarios: training from\nscratch and utilizing pre-training. Furthermore, our findings demonstrate that\npoint cloud observations often yield better policy performance and\nsignificantly stronger generalization capabilities across various geometric and\nvisual conditions. These outcomes suggest that the 3D point cloud is a valuable\nobservation modality for intricate robotic tasks. We also suggest that\nincorporating both appearance and coordinate information can enhance the\nperformance of point cloud methods. We hope our work provides valuable insights\nand guidance for designing more generalizable and robust robotic models. Codes\nare available at https://github.com/HaoyiZhu/PointCloudMatters.\n","authors":["Haoyi Zhu","Yating Wang","Di Huang","Weicai Ye","Wanli Ouyang","Tong He"],"pdf_url":"https://arxiv.org/pdf/2402.02500v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.00773v2","updated":"2024-06-06T10:08:22Z","published":"2024-06-02T15:20:59Z","title":"Diffusion Tuning: Transferring Diffusion Models via Chain of Forgetting","summary":"  Diffusion models have significantly advanced the field of generative\nmodeling. However, training a diffusion model is computationally expensive,\ncreating a pressing need to adapt off-the-shelf diffusion models for downstream\ngeneration tasks. Current fine-tuning methods focus on parameter-efficient\ntransfer learning but overlook the fundamental transfer characteristics of\ndiffusion models. In this paper, we investigate the transferability of\ndiffusion models and observe a monotonous chain of forgetting trend of\ntransferability along the reverse process. Based on this observation and novel\ntheoretical insights, we present Diff-Tuning, a frustratingly simple transfer\napproach that leverages the chain of forgetting tendency. Diff-Tuning\nencourages the fine-tuned model to retain the pre-trained knowledge at the end\nof the denoising chain close to the generated data while discarding the other\nnoise side. We conduct comprehensive experiments to evaluate Diff-Tuning,\nincluding the transfer of pre-trained Diffusion Transformer models to eight\ndownstream generations and the adaptation of Stable Diffusion to five control\nconditions with ControlNet. Diff-Tuning achieves a 26% improvement over\nstandard fine-tuning and enhances the convergence speed of ControlNet by 24%.\nNotably, parameter-efficient transfer learning techniques for diffusion models\ncan also benefit from Diff-Tuning.\n","authors":["Jincheng Zhong","Xingzhuo Guo","Jiaxiang Dong","Mingsheng Long"],"pdf_url":"https://arxiv.org/pdf/2406.00773v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.03919v1","updated":"2024-06-06T10:02:06Z","published":"2024-06-06T10:02:06Z","title":"Vectorized Conditional Neural Fields: A Framework for Solving\n  Time-dependent Parametric Partial Differential Equations","summary":"  Transformer models are increasingly used for solving Partial Differential\nEquations (PDEs). Several adaptations have been proposed, all of which suffer\nfrom the typical problems of Transformers, such as quadratic memory and time\ncomplexity. Furthermore, all prevalent architectures for PDE solving lack at\nleast one of several desirable properties of an ideal surrogate model, such as\n(i) generalization to PDE parameters not seen during training, (ii) spatial and\ntemporal zero-shot super-resolution, (iii) continuous temporal extrapolation,\n(iv) support for 1D, 2D, and 3D PDEs, and (v) efficient inference for longer\ntemporal rollouts. To address these limitations, we propose Vectorized\nConditional Neural Fields (VCNeFs), which represent the solution of\ntime-dependent PDEs as neural fields. Contrary to prior methods, however,\nVCNeFs compute, for a set of multiple spatio-temporal query points, their\nsolutions in parallel and model their dependencies through attention\nmechanisms. Moreover, VCNeF can condition the neural field on both the initial\nconditions and the parameters of the PDEs. An extensive set of experiments\ndemonstrates that VCNeFs are competitive with and often outperform existing\nML-based surrogate models.\n","authors":["Jan Hagnberger","Marimuthu Kalimuthu","Daniel Musekamp","Mathias Niepert"],"pdf_url":"https://arxiv.org/pdf/2406.03919v1.pdf","comment":"Accepted for publication at the 41st International Conference on\n  Machine Learning (ICML) 2024"},{"id":"http://arxiv.org/abs/2406.03917v1","updated":"2024-06-06T09:57:56Z","published":"2024-06-06T09:57:56Z","title":"Frequency-based Matcher for Long-tailed Semantic Segmentation","summary":"  The successful application of semantic segmentation technology in the real\nworld has been among the most exciting achievements in the computer vision\ncommunity over the past decade. Although the long-tailed phenomenon has been\ninvestigated in many fields, e.g., classification and object detection, it has\nnot received enough attention in semantic segmentation and has become a\nnon-negligible obstacle to applying semantic segmentation technology in\nautonomous driving and virtual reality. Therefore, in this work, we focus on a\nrelatively under-explored task setting, long-tailed semantic segmentation\n(LTSS). We first establish three representative datasets from different\naspects, i.e., scene, object, and human. We further propose a dual-metric\nevaluation system and construct the LTSS benchmark to demonstrate the\nperformance of semantic segmentation methods and long-tailed solutions. We also\npropose a transformer-based algorithm to improve LTSS, frequency-based matcher,\nwhich solves the oversuppression problem by one-to-many matching and\nautomatically determines the number of matching queries for each class. Given\nthe comprehensiveness of this work and the importance of the issues revealed,\nthis work aims to promote the empirical study of semantic segmentation tasks.\nOur datasets, codes, and models will be publicly available.\n","authors":["Shan Li","Lu Yang","Pu Cao","Liulei Li","Huadong Ma"],"pdf_url":"https://arxiv.org/pdf/2406.03917v1.pdf","comment":"Accepted for publication as a Regular paper in the IEEE Transactions\n  on Multimedia"},{"id":"http://arxiv.org/abs/2312.01616v5","updated":"2024-06-06T09:57:54Z","published":"2023-12-04T04:14:09Z","title":"SchurVINS: Schur Complement-Based Lightweight Visual Inertial Navigation\n  System","summary":"  Accuracy and computational efficiency are the most important metrics to\nVisual Inertial Navigation System (VINS). The existing VINS algorithms with\neither high accuracy or low computational complexity, are difficult to provide\nthe high precision localization in resource-constrained devices. To this end,\nwe propose a novel filter-based VINS framework named SchurVINS, which could\nguarantee both high accuracy by building a complete residual model and low\ncomputational complexity with Schur complement. Technically, we first formulate\nthe full residual model where Gradient, Hessian and observation covariance are\nexplicitly modeled. Then Schur complement is employed to decompose the full\nmodel into ego-motion residual model and landmark residual model. Finally,\nExtended Kalman Filter (EKF) update is implemented in these two models with\nhigh efficiency. Experiments on EuRoC and TUM-VI datasets show that our method\nnotably outperforms state-of-the-art (SOTA) methods in both accuracy and\ncomputational complexity. The experimental code of SchurVINS is available at\nhttps://github.com/bytedance/SchurVINS.\n","authors":["Yunfei Fan","Tianyu Zhao","Guidong Wang"],"pdf_url":"https://arxiv.org/pdf/2312.01616v5.pdf","comment":"Accepted by CVPR2024"},{"id":"http://arxiv.org/abs/2406.03916v1","updated":"2024-06-06T09:56:49Z","published":"2024-06-06T09:56:49Z","title":"ArMeme: Propagandistic Content in Arabic Memes","summary":"  With the rise of digital communication, memes have become a significant\nmedium for cultural and political expression that is often used to mislead\naudiences. Identification of such misleading and persuasive multimodal content\nhas become more important among various stakeholders, including social media\nplatforms, policymakers, and the broader society as they often cause harm to\nindividuals, organizations, and/or society. While there has been effort to\ndevelop AI-based automatic systems for resource-rich languages (e.g., English),\nit is relatively little to none for medium to low resource languages. In this\nstudy, we focused on developing an Arabic memes dataset with manual annotations\nof propagandistic content. We annotated ~6K Arabic memes collected from various\nsocial media platforms, which is a first resource for Arabic multimodal\nresearch. We provide a comprehensive analysis aiming to develop computational\ntools for their detection. We will make them publicly available for the\ncommunity.\n","authors":["Firoj Alam","Abul Hasnat","Fatema Ahmed","Md Arid Hasan","Maram Hasanain"],"pdf_url":"https://arxiv.org/pdf/2406.03916v1.pdf","comment":"disinformation, misinformation, factuality, harmfulness, fake news,\n  propaganda, multimodality, text, images"},{"id":"http://arxiv.org/abs/2210.04288v3","updated":"2024-06-06T09:45:27Z","published":"2022-10-09T15:42:36Z","title":"CoopHash: Cooperative Learning of Multipurpose Descriptor and\n  Contrastive Pair Generator via Variational MCMC Teaching for Supervised Image\n  Hashing","summary":"  Leveraging supervised information can lead to superior retrieval performance\nin the image hashing domain but the performance degrades significantly without\nenough labeled data. One effective solution to boost performance is to employ\ngenerative models, such as Generative Adversarial Networks (GANs), to generate\nsynthetic data in an image hashing model. However, GAN-based methods are\ndifficult to train, which prevents the hashing approaches from jointly training\nthe generative models and the hash functions. This limitation results in\nsub-optimal retrieval performance. To overcome this limitation, we propose a\nnovel framework, the generative cooperative hashing network, which is based on\nenergy-based cooperative learning. This framework jointly learns a powerful\ngenerative representation of the data and a robust hash function via two\ncomponents: a top-down contrastive pair generator that synthesizes contrastive\nimages and a bottom-up multipurpose descriptor that simultaneously represents\nthe images from multiple perspectives, including probability density, hash\ncode, latent code, and category. The two components are jointly learned via a\nnovel likelihood-based cooperative learning scheme. We conduct experiments on\nseveral real-world datasets and show that the proposed method outperforms the\ncompeting hashing supervised methods, achieving up to 10\\% relative improvement\nover the current state-of-the-art supervised hashing methods, and exhibits a\nsignificantly better performance in out-of-distribution retrieval.\n","authors":["Khoa D. Doan","Jianwen Xie","Yaxuan Zhu","Yang Zhao","Ping Li"],"pdf_url":"https://arxiv.org/pdf/2210.04288v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.03907v1","updated":"2024-06-06T09:41:39Z","published":"2024-06-06T09:41:39Z","title":"Exploring the Zero-Shot Capabilities of Vision-Language Models for\n  Improving Gaze Following","summary":"  Contextual cues related to a person's pose and interactions with objects and\nother people in the scene can provide valuable information for gaze following.\nWhile existing methods have focused on dedicated cue extraction methods, in\nthis work we investigate the zero-shot capabilities of Vision-Language Models\n(VLMs) for extracting a wide array of contextual cues to improve gaze following\nperformance. We first evaluate various VLMs, prompting strategies, and\nin-context learning (ICL) techniques for zero-shot cue recognition performance.\nWe then use these insights to extract contextual cues for gaze following, and\ninvestigate their impact when incorporated into a state of the art model for\nthe task. Our analysis indicates that BLIP-2 is the overall top performing VLM\nand that ICL can improve performance. We also observe that VLMs are sensitive\nto the choice of the text prompt although ensembling over multiple text prompts\ncan provide more robust performance. Additionally, we discover that using the\nentire image along with an ellipse drawn around the target person is the most\neffective strategy for visual prompting. For gaze following, incorporating the\nextracted cues results in better generalization performance, especially when\nconsidering a larger set of cues, highlighting the potential of this approach.\n","authors":["Anshul Gupta","Pierre Vuillecard","Arya Farkhondeh","Jean-Marc Odobez"],"pdf_url":"https://arxiv.org/pdf/2406.03907v1.pdf","comment":"Accepted at the GAZE Workshop at CVPR 2024"},{"id":"http://arxiv.org/abs/2406.03903v1","updated":"2024-06-06T09:38:06Z","published":"2024-06-06T09:38:06Z","title":"Data-Centric Label Smoothing for Explainable Glaucoma Screening from Eye\n  Fundus Images","summary":"  As current computing capabilities increase, modern machine learning and\ncomputer vision system tend to increase in complexity, mostly by means of\nlarger models and advanced optimization strategies. Although often neglected,\nin many problems there is also much to be gained by considering potential\nimprovements in understanding and better leveraging already-available training\ndata, including annotations. This so-called data-centric approach can lead to\nsubstantial performance increases, sometimes beyond what can be achieved by\nlarger models. In this paper we adopt such an approach for the task of\njustifiable glaucoma screening from retinal images. In particular, we focus on\nhow to combine information from multiple annotators of different skills into a\ntailored label smoothing scheme that allows us to better employ a large\ncollection of fundus images, instead of discarding samples suffering from\ninter-rater variability. Internal validation results indicate that our bespoke\nlabel smoothing approach surpasses the performance of a standard resnet50 model\nand also the same model trained with conventional label smoothing techniques,\nin particular for the multi-label scenario of predicting clinical reasons of\nglaucoma likelihood in a highly imbalanced screening context. Our code is made\navailable at github.com/agaldran/justraigs .\n","authors":["Adrian Galdran","Miguel A. González Ballester"],"pdf_url":"https://arxiv.org/pdf/2406.03903v1.pdf","comment":"Accepted to ISBI 2024 (Challenges), 2nd position in the JustRAIGS\n  challenge (https://justraigs.grand-challenge.org/)"},{"id":"http://arxiv.org/abs/2406.03902v1","updated":"2024-06-06T09:37:56Z","published":"2024-06-06T09:37:56Z","title":"C^2RV: Cross-Regional and Cross-View Learning for Sparse-View CBCT\n  Reconstruction","summary":"  Cone beam computed tomography (CBCT) is an important imaging technology\nwidely used in medical scenarios, such as diagnosis and preoperative planning.\nUsing fewer projection views to reconstruct CT, also known as sparse-view\nreconstruction, can reduce ionizing radiation and further benefit\ninterventional radiology. Compared with sparse-view reconstruction for\ntraditional parallel/fan-beam CT, CBCT reconstruction is more challenging due\nto the increased dimensionality caused by the measurement process based on\ncone-shaped X-ray beams. As a 2D-to-3D reconstruction problem, although\nimplicit neural representations have been introduced to enable efficient\ntraining, only local features are considered and different views are processed\nequally in previous works, resulting in spatial inconsistency and poor\nperformance on complicated anatomies. To this end, we propose C^2RV by\nleveraging explicit multi-scale volumetric representations to enable\ncross-regional learning in the 3D space. Additionally, the scale-view\ncross-attention module is introduced to adaptively aggregate multi-scale and\nmulti-view features. Extensive experiments demonstrate that our C^2RV achieves\nconsistent and significant improvement over previous state-of-the-art methods\non datasets with diverse anatomy.\n","authors":["Yiqun Lin","Jiewen Yang","Hualiang Wang","Xinpeng Ding","Wei Zhao","Xiaomeng Li"],"pdf_url":"https://arxiv.org/pdf/2406.03902v1.pdf","comment":"Accepted to CVPR 2024"},{"id":"http://arxiv.org/abs/2406.03901v1","updated":"2024-06-06T09:37:46Z","published":"2024-06-06T09:37:46Z","title":"Polyp and Surgical Instrument Segmentation with Double Encoder-Decoder\n  Networks","summary":"  This paper describes a solution for the MedAI competition, in which\nparticipants were required to segment both polyps and surgical instruments from\nendoscopic images. Our approach relies on a double encoder-decoder neural\nnetwork which we have previously applied for polyp segmentation, but with a\nseries of enhancements: a more powerful encoder architecture, an improved\noptimization procedure, and the post-processing of segmentations based on\ntempered model ensembling. Experimental results show that our method produces\nsegmentations that show a good agreement with manual delineations provided by\nmedical experts.\n","authors":["Adrian Galdran"],"pdf_url":"https://arxiv.org/pdf/2406.03901v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.03879v1","updated":"2024-06-06T09:14:32Z","published":"2024-06-06T09:14:32Z","title":"Decay Pruning Method: Smooth Pruning With a Self-Rectifying Procedure","summary":"  Current structured pruning methods often result in considerable accuracy\ndrops due to abrupt network changes and loss of information from pruned\nstructures. To address these issues, we introduce the Decay Pruning Method\n(DPM), a novel smooth pruning approach with a self-rectifying mechanism. DPM\nconsists of two key components: (i) Smooth Pruning: It converts conventional\nsingle-step pruning into multi-step smooth pruning, gradually reducing\nredundant structures to zero over N steps with ongoing optimization. (ii)\nSelf-Rectifying: This procedure further enhances the aforementioned process by\nrectifying sub-optimal pruning based on gradient information. Our approach\ndemonstrates strong generalizability and can be easily integrated with various\nexisting pruning methods. We validate the effectiveness of DPM by integrating\nit with three popular pruning methods: OTOv2, Depgraph, and Gate Decorator.\nExperimental results show consistent improvements in performance compared to\nthe original pruning methods, along with further reductions of FLOPs in most\nscenarios.\n","authors":["Minghao Yang","Linlin Gao","Pengyuan Li","Wenbo Li","Yihong Dong","Zhiying Cui"],"pdf_url":"https://arxiv.org/pdf/2406.03879v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.03877v1","updated":"2024-06-06T09:12:30Z","published":"2024-06-06T09:12:30Z","title":"Bench2Drive: Towards Multi-Ability Benchmarking of Closed-Loop\n  End-To-End Autonomous Driving","summary":"  In an era marked by the rapid scaling of foundation models, autonomous\ndriving technologies are approaching a transformative threshold where\nend-to-end autonomous driving (E2E-AD) emerges due to its potential of scaling\nup in the data-driven manner. However, existing E2E-AD methods are mostly\nevaluated under the open-loop log-replay manner with L2 errors and collision\nrate as metrics (e.g., in nuScenes), which could not fully reflect the driving\nperformance of algorithms as recently acknowledged in the community. For those\nE2E-AD methods evaluated under the closed-loop protocol, they are tested in\nfixed routes (e.g., Town05Long and Longest6 in CARLA) with the driving score as\nmetrics, which is known for high variance due to the unsmoothed metric function\nand large randomness in the long route. Besides, these methods usually collect\ntheir own data for training, which makes algorithm-level fair comparison\ninfeasible.\n  To fulfill the paramount need of comprehensive, realistic, and fair testing\nenvironments for Full Self-Driving (FSD), we present Bench2Drive, the first\nbenchmark for evaluating E2E-AD systems' multiple abilities in a closed-loop\nmanner. Bench2Drive's official training data consists of 2 million fully\nannotated frames, collected from 10000 short clips uniformly distributed under\n44 interactive scenarios (cut-in, overtaking, detour, etc), 23 weathers (sunny,\nfoggy, rainy, etc), and 12 towns (urban, village, university, etc) in CARLA v2.\nIts evaluation protocol requires E2E-AD models to pass 44 interactive scenarios\nunder different locations and weathers which sums up to 220 routes and thus\nprovides a comprehensive and disentangled assessment about their driving\ncapability under different situations. We implement state-of-the-art E2E-AD\nmodels and evaluate them in Bench2Drive, providing insights regarding current\nstatus and future directions.\n","authors":["Xiaosong Jia","Zhenjie Yang","Qifeng Li","Zhiyuan Zhang","Junchi Yan"],"pdf_url":"https://arxiv.org/pdf/2406.03877v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.03873v1","updated":"2024-06-06T09:04:48Z","published":"2024-06-06T09:04:48Z","title":"Quantum Implicit Neural Representations","summary":"  Implicit neural representations have emerged as a powerful paradigm to\nrepresent signals such as images and sounds. This approach aims to utilize\nneural networks to parameterize the implicit function of the signal. However,\nwhen representing implicit functions, traditional neural networks such as\nReLU-based multilayer perceptrons face challenges in accurately modeling\nhigh-frequency components of signals. Recent research has begun to explore the\nuse of Fourier Neural Networks (FNNs) to overcome this limitation. In this\npaper, we propose Quantum Implicit Representation Network (QIREN), a novel\nquantum generalization of FNNs. Furthermore, through theoretical analysis, we\ndemonstrate that QIREN possesses a quantum advantage over classical FNNs.\nLastly, we conducted experiments in signal representation, image\nsuperresolution, and image generation tasks to show the superior performance of\nQIREN compared to state-of-the-art (SOTA) models. Our work not only\nincorporates quantum advantages into implicit neural representations but also\nuncovers a promising application direction for Quantum Neural Networks.\n","authors":["Jiaming Zhao","Wenbo Qiao","Peng Zhang","Hui Gao"],"pdf_url":"https://arxiv.org/pdf/2406.03873v1.pdf","comment":"This paper was accepted by icml 2024"},{"id":"http://arxiv.org/abs/2402.04356v3","updated":"2024-06-06T08:56:56Z","published":"2024-02-06T19:42:18Z","title":"Bidirectional Autoregressive Diffusion Model for Dance Generation","summary":"  Dance serves as a powerful medium for expressing human emotions, but the\nlifelike generation of dance is still a considerable challenge. Recently,\ndiffusion models have showcased remarkable generative abilities across various\ndomains. They hold promise for human motion generation due to their adaptable\nmany-to-many nature. Nonetheless, current diffusion-based motion generation\nmodels often create entire motion sequences directly and unidirectionally,\nlacking focus on the motion with local and bidirectional enhancement. When\nchoreographing high-quality dance movements, people need to take into account\nnot only the musical context but also the nearby music-aligned dance motions.\nTo authentically capture human behavior, we propose a Bidirectional\nAutoregressive Diffusion Model (BADM) for music-to-dance generation, where a\nbidirectional encoder is built to enforce that the generated dance is\nharmonious in both the forward and backward directions. To make the generated\ndance motion smoother, a local information decoder is built for local motion\nenhancement. The proposed framework is able to generate new motions based on\nthe input conditions and nearby motions, which foresees individual motion\nslices iteratively and consolidates all predictions. To further refine the\nsynchronicity between the generated dance and the beat, the beat information is\nincorporated as an input to generate better music-aligned dance movements.\nExperimental results demonstrate that the proposed model achieves\nstate-of-the-art performance compared to existing unidirectional approaches on\nthe prominent benchmark for music-to-dance generation.\n","authors":["Canyu Zhang","Youbao Tang","Ning Zhang","Ruei-Sung Lin","Mei Han","Jing Xiao","Song Wang"],"pdf_url":"https://arxiv.org/pdf/2402.04356v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.03866v1","updated":"2024-06-06T08:53:01Z","published":"2024-06-06T08:53:01Z","title":"LLplace: The 3D Indoor Scene Layout Generation and Editing via Large\n  Language Model","summary":"  Designing 3D indoor layouts is a crucial task with significant applications\nin virtual reality, interior design, and automated space planning. Existing\nmethods for 3D layout design either rely on diffusion models, which utilize\nspatial relationship priors, or heavily leverage the inferential capabilities\nof proprietary Large Language Models (LLMs), which require extensive prompt\nengineering and in-context exemplars via black-box trials. These methods often\nface limitations in generalization and dynamic scene editing. In this paper, we\nintroduce LLplace, a novel 3D indoor scene layout designer based on lightweight\nfine-tuned open-source LLM Llama3. LLplace circumvents the need for spatial\nrelationship priors and in-context exemplars, enabling efficient and credible\nroom layout generation based solely on user inputs specifying the room type and\ndesired objects. We curated a new dialogue dataset based on the 3D-Front\ndataset, expanding the original data volume and incorporating dialogue data for\nadding and removing objects. This dataset can enhance the LLM's spatial\nunderstanding. Furthermore, through dialogue, LLplace activates the LLM's\ncapability to understand 3D layouts and perform dynamic scene editing, enabling\nthe addition and removal of objects. Our approach demonstrates that LLplace can\neffectively generate and edit 3D indoor layouts interactively and outperform\nexisting methods in delivering high-quality 3D design solutions. Code and\ndataset will be released.\n","authors":["Yixuan Yang","Junru Lu","Zixiang Zhao","Zhen Luo","James J. Q. Yu","Victor Sanchez","Feng Zheng"],"pdf_url":"https://arxiv.org/pdf/2406.03866v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.03865v1","updated":"2024-06-06T08:51:26Z","published":"2024-06-06T08:51:26Z","title":"Semantic Similarity Score for Measuring Visual Similarity at Semantic\n  Level","summary":"  Semantic communication, as a revolutionary communication architecture, is\nconsidered a promising novel communication paradigm. Unlike traditional\nsymbol-based error-free communication systems, semantic-based visual\ncommunication systems extract, compress, transmit, and reconstruct images at\nthe semantic level. However, widely used image similarity evaluation metrics,\nwhether pixel-based MSE or PSNR or structure-based MS-SSIM, struggle to\naccurately measure the loss of semantic-level information of the source during\nsystem transmission. This presents challenges in evaluating the performance of\nvisual semantic communication systems, especially when comparing them with\ntraditional communication systems. To address this, we propose a semantic\nevaluation metric -- SeSS (Semantic Similarity Score), based on Scene Graph\nGeneration and graph matching, which shifts the similarity scores between\nimages into semantic-level graph matching scores. Meanwhile, semantic\nsimilarity scores for tens of thousands of image pairs are manually annotated\nto fine-tune the hyperparameters in the graph matching algorithm, aligning the\nmetric more closely with human semantic perception. The performance of the SeSS\nis tested on different datasets, including (1)images transmitted by traditional\nand semantic communication systems at different compression rates, (2)images\ntransmitted by traditional and semantic communication systems at different\nsignal-to-noise ratios, (3)images generated by large-scale model with different\nnoise levels introduced, and (4)cases of images subjected to certain special\ntransformations. The experiments demonstrate the effectiveness of SeSS,\nindicating that the metric can measure the semantic-level differences in\nsemantic-level information of images and can be used for evaluation in visual\nsemantic communication systems.\n","authors":["Senran Fan","Zhicheng Bao","Chen Dong","Haotai Liang","Xiaodong Xu","Ping Zhang"],"pdf_url":"https://arxiv.org/pdf/2406.03865v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.03859v1","updated":"2024-06-06T08:46:00Z","published":"2024-06-06T08:46:00Z","title":"From operculum and body tail movements to different coupling of physical\n  activity and respiratory frequency in farmed gilthead sea bream and European\n  sea bass. Insights on aquaculture biosensing","summary":"  The AEFishBIT tri-axial accelerometer was externally attached to the\noperculum to assess the divergent activity and respiratory patterns of two\nmarine farmed fish, the gilthead sea bream (Sparus aurata) and European sea\nbass (Dicentrarchus labrax). Analysis of raw data from exercised fish\nhighlighted the large amplitude of operculum aperture and body tail movements\nin European sea bass, which were overall more stable at low-medium exercise\nintensity levels. Cosinor analysis in free-swimming fish (on-board data\nprocessing) highlighted a pronounced daily rhythmicity of locomotor activity\nand respiratory frequency in both gilthead sea bream and European sea bass.\nAcrophases of activity and respiration were coupled in gilthead sea bream,\nacting feeding time (once daily at 11:00 h) as a main synchronizing factor. By\ncontrast, locomotor activity and respiratory frequency were out of phase in\nEuropean sea bass with activity acrophase on early morning and respiration\nacrophase on the afternoon. The daily range of activity and respiration\nvariation was also higher in European sea bass, probably as part of the\nadaptation of this fish species to act as a fast swimming predator. In any\ncase, lower locomotor activity and enhanced respiration were associated with\nlarger body weight in both fish species. This agrees with the notion that\nselection for fast growth in farming conditions is accompanied by a lower\nactivity profile, which may favor an efficient feed conversion for growth\npurposes. Therefore, the use of behavioral monitoring is becoming a reliable\nand large-scale promising tool for selecting more efficient farmed fish,\nallowing researchers and farmers to establish stricter criteria of welfare for\nmore sustainable and ethical fish production.\n","authors":["Miguel A. Ferrer","Josep A. Calduch-Giner","Moises Díaz","Javier Sosa","Enrique Rosell-Moll","Judith Santana Abril","Graciela Santana Sosa","Tomás Bautista Delgado","Cristina Carmona","Juan Antonio Martos-Sitcha","Enric Cabruja","Juan Manuel Afonso","Aurelio Vega","Manuel Lozano","Juan Antonio Montiel-Nelson","Jaume Pérez-Sánchez"],"pdf_url":"https://arxiv.org/pdf/2406.03859v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.03857v1","updated":"2024-06-06T08:42:36Z","published":"2024-06-06T08:42:36Z","title":"MuJo: Multimodal Joint Feature Space Learning for Human Activity\n  Recognition","summary":"  Human Activity Recognition is a longstanding problem in AI with applications\nin a broad range of areas: from healthcare, sports and fitness, security, and\nhuman computer interaction to robotics. The performance of HAR in real-world\nsettings is strongly dependent on the type and quality of the input signal that\ncan be acquired. Given an unobstructed, high-quality camera view of a scene,\ncomputer vision systems, in particular in conjunction with foundational models\n(e.g., CLIP), can today fairly reliably distinguish complex activities. On the\nother hand, recognition using modalities such as wearable sensors (which are\noften more broadly available, e.g, in mobile phones and smartwatches) is a more\ndifficult problem, as the signals often contain less information and labeled\ntraining data is more difficult to acquire. In this work, we show how we can\nimprove HAR performance across different modalities using multimodal\ncontrastive pretraining. Our approach MuJo (Multimodal Joint Feature Space\nLearning), learns a multimodal joint feature space with video, language, pose,\nand IMU sensor data. The proposed approach combines contrastive and multitask\nlearning methods and analyzes different multitasking strategies for learning a\ncompact shared representation. A large dataset with parallel video, language,\npose, and sensor data points is also introduced to support the research, along\nwith an analysis of the robustness of the multimodal joint space for\nmodal-incomplete and low-resource data. On the MM-Fit dataset, our model\nachieves an impressive Macro F1-Score of up to 0.992 with only 2% of the train\ndata and 0.999 when using all available training data for classification tasks.\nMoreover, in the scenario where the MM-Fit dataset is unseen, we demonstrate a\ngeneralization performance of up to 0.638.\n","authors":["Stefan Gerd Fritsch","Cennet Oguz","Vitor Fortes Rey","Lala Ray","Maximilian Kiefer-Emmanouilidis","Paul Lukowicz"],"pdf_url":"https://arxiv.org/pdf/2406.03857v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.01900v2","updated":"2024-06-06T08:38:10Z","published":"2024-06-04T02:05:57Z","title":"Follow-Your-Emoji: Fine-Controllable and Expressive Freestyle Portrait\n  Animation","summary":"  We present Follow-Your-Emoji, a diffusion-based framework for portrait\nanimation, which animates a reference portrait with target landmark sequences.\nThe main challenge of portrait animation is to preserve the identity of the\nreference portrait and transfer the target expression to this portrait while\nmaintaining temporal consistency and fidelity. To address these challenges,\nFollow-Your-Emoji equipped the powerful Stable Diffusion model with two\nwell-designed technologies. Specifically, we first adopt a new explicit motion\nsignal, namely expression-aware landmark, to guide the animation process. We\ndiscover this landmark can not only ensure the accurate motion alignment\nbetween the reference portrait and target motion during inference but also\nincrease the ability to portray exaggerated expressions (i.e., large pupil\nmovements) and avoid identity leakage. Then, we propose a facial fine-grained\nloss to improve the model's ability of subtle expression perception and\nreference portrait appearance reconstruction by using both expression and\nfacial masks. Accordingly, our method demonstrates significant performance in\ncontrolling the expression of freestyle portraits, including real humans,\ncartoons, sculptures, and even animals. By leveraging a simple and effective\nprogressive generation strategy, we extend our model to stable long-term\nanimation, thus increasing its potential application value. To address the lack\nof a benchmark for this field, we introduce EmojiBench, a comprehensive\nbenchmark comprising diverse portrait images, driving videos, and landmarks. We\nshow extensive evaluations on EmojiBench to verify the superiority of\nFollow-Your-Emoji.\n","authors":["Yue Ma","Hongyu Liu","Hongfa Wang","Heng Pan","Yingqing He","Junkun Yuan","Ailing Zeng","Chengfei Cai","Heung-Yeung Shum","Wei Liu","Qifeng Chen"],"pdf_url":"https://arxiv.org/pdf/2406.01900v2.pdf","comment":"Project Page: https://follow-your-emoji.github.io/"},{"id":"http://arxiv.org/abs/1912.12095v2","updated":"2024-06-06T08:28:13Z","published":"2019-12-27T13:48:03Z","title":"One Point, One Object: Simultaneous 3D Object Segmentation and 6-DOF\n  Pose Estimation","summary":"  We propose a single-shot method for simultaneous 3D object segmentation and\n6-DOF pose estimation in pure 3D point clouds scenes based on a consensus that\n\\emph{one point only belongs to one object}, i.e., each point has the potential\npower to predict the 6-DOF pose of its corresponding object. Unlike the\nrecently proposed methods of the similar task, which rely on 2D detectors to\npredict the projection of 3D corners of the 3D bounding boxes and the 6-DOF\npose must be estimated by a PnP like spatial transformation method, ours is\nconcise enough not to require additional spatial transformation between\ndifferent dimensions. Due to the lack of training data for many objects, the\nrecently proposed 2D detection methods try to generate training data by using\nrendering engine and achieve good results. However, rendering in 3D space along\nwith 6-DOF is relatively difficult. Therefore, we propose an augmented reality\ntechnology to generate the training data in semi-virtual reality 3D space. The\nkey component of our method is a multi-task CNN architecture that can\nsimultaneously predicts the 3D object segmentation and 6-DOF pose estimation in\npure 3D point clouds.\n  For experimental evaluation, we generate expanded training data for two\nstate-of-the-arts 3D object datasets \\cite{PLCHF}\\cite{TLINEMOD} by using\nAugmented Reality technology (AR). We evaluate our proposed method on the two\ndatasets. The results show that our method can be well generalized into\nmultiple scenarios and provide performance comparable to or better than the\nstate-of-the-arts.\n","authors":["Hongsen Liu"],"pdf_url":"https://arxiv.org/pdf/1912.12095v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.01287v2","updated":"2024-06-06T08:26:55Z","published":"2024-02-02T10:23:03Z","title":"Spiking CenterNet: A Distillation-boosted Spiking Neural Network for\n  Object Detection","summary":"  In the era of AI at the edge, self-driving cars, and climate change, the need\nfor energy-efficient, small, embedded AI is growing. Spiking Neural Networks\n(SNNs) are a promising approach to address this challenge, with their\nevent-driven information flow and sparse activations. We propose Spiking\nCenterNet for object detection on event data. It combines an SNN CenterNet\nadaptation with an efficient M2U-Net-based decoder. Our model significantly\noutperforms comparable previous work on Prophesee's challenging GEN1 Automotive\nDetection Dataset while using less than half the energy. Distilling the\nknowledge of a non-spiking teacher into our SNN further increases performance.\nTo the best of our knowledge, our work is the first approach that takes\nadvantage of knowledge distillation in the field of spiking object detection.\n","authors":["Lennard Bodden","Franziska Schwaiger","Duc Bach Ha","Lars Kreuzberg","Sven Behnke"],"pdf_url":"https://arxiv.org/pdf/2402.01287v2.pdf","comment":"8 pages, 5 figures. Accepted at IJCNN 2024"},{"id":"http://arxiv.org/abs/2403.06189v2","updated":"2024-06-06T08:19:12Z","published":"2024-03-10T12:11:34Z","title":"Harmonious Group Choreography with Trajectory-Controllable Diffusion","summary":"  Creating group choreography from music has gained attention in cultural\nentertainment and virtual reality, aiming to coordinate visually cohesive and\ndiverse group movements. Despite increasing interest, recent works face\nchallenges in achieving aesthetically appealing choreography, primarily for two\nkey issues: multi-dancer collision and single-dancer foot slide. To address\nthese issues, we propose a Trajectory-Controllable Diffusion (TCDiff), a novel\napproach that harnesses non-overlapping trajectories to facilitate coherent\ndance movements. Specifically, to tackle dancer collisions, we introduce a\nDance-Beat Navigator capable of generating trajectories for multiple dancers\nbased on the music, complemented by a Distance-Consistency loss to maintain\nappropriate spacing among trajectories within a reasonable threshold. To\nmitigate foot sliding, we present a Footwork Adaptor that utilizes trajectory\ndisplacement from adjacent frames to enable flexible footwork, coupled with a\nRelative Forward-Kinematic loss to adjust the positioning of individual\ndancers' root nodes and joints. Extensive experiments demonstrate that our\nmethod achieves state-of-the-art results.\n","authors":["Yuqin Dai","Wanlu Zhu","Ronghui Li","Zeping Ren","Xiangzheng Zhou","Xiu Li","Jun Li","Jian Yang"],"pdf_url":"https://arxiv.org/pdf/2403.06189v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.03835v1","updated":"2024-06-06T08:12:38Z","published":"2024-06-06T08:12:38Z","title":"Monocular Localization with Semantics Map for Autonomous Vehicles","summary":"  Accurate and robust localization remains a significant challenge for\nautonomous vehicles. The cost of sensors and limitations in local computational\nefficiency make it difficult to scale to large commercial applications.\nTraditional vision-based approaches focus on texture features that are\nsusceptible to changes in lighting, season, perspective, and appearance.\nAdditionally, the large storage size of maps with descriptors and complex\noptimization processes hinder system performance. To balance efficiency and\naccuracy, we propose a novel lightweight visual semantic localization algorithm\nthat employs stable semantic features instead of low-level texture features.\nFirst, semantic maps are constructed offline by detecting semantic objects,\nsuch as ground markers, lane lines, and poles, using cameras or LiDAR sensors.\nThen, online visual localization is performed through data association of\nsemantic features and map objects. We evaluated our proposed localization\nframework in the publicly available KAIST Urban dataset and in scenarios\nrecorded by ourselves. The experimental results demonstrate that our method is\na reliable and practical localization solution in various autonomous driving\nlocalization tasks.\n","authors":["Jixiang Wan","Xudong Zhang","Shuzhou Dong","Yuwei Zhang","Yuchen Yang","Ruoxi Wu","Ye Jiang","Jijunnan Li","Jinquan Lin","Ming Yang"],"pdf_url":"https://arxiv.org/pdf/2406.03835v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.05001v3","updated":"2024-06-06T08:01:55Z","published":"2023-06-08T07:45:24Z","title":"COURIER: Contrastive User Intention Reconstruction for Large-Scale\n  Visual Recommendation","summary":"  With the advancement of multimedia internet, the impact of visual\ncharacteristics on the decision of users to click or not within the online\nretail industry is increasingly significant. Thus, incorporating visual\nfeatures is a promising direction for further performance improvements in\nclick-through rate (CTR). However, experiments on our production system\nrevealed that simply injecting the image embeddings trained with established\npre-training methods only has marginal improvements. We believe that the main\nadvantage of existing image feature pre-training methods lies in their\neffectiveness for cross-modal predictions. However, this differs significantly\nfrom the task of CTR prediction in recommendation systems. In recommendation\nsystems, other modalities of information (such as text) can be directly used as\nfeatures in downstream models. Even if the performance of cross-modal\nprediction tasks is excellent, it is challenging to provide significant\ninformation gain for the downstream models. We argue that a visual feature\npre-training method tailored for recommendation is necessary for further\nimprovements beyond existing modality features. To this end, we propose an\neffective user intention reconstruction module to mine visual features related\nto user interests from behavior histories, which constructs a many-to-one\ncorrespondence. We further propose a contrastive training method to learn the\nuser intentions and prevent the collapse of embedding vectors. We conduct\nextensive experimental evaluations on public datasets and our production system\nto verify that our method can learn users' visual interests. Our method\nachieves $0.46\\%$ improvement in offline AUC and $0.88\\%$ improvement in Taobao\nGMV (Cross Merchandise Volume) with p-value$<$0.01.\n","authors":["Jia-Qi Yang","Chenglei Dai","Dan OU","Dongshuai Li","Ju Huang","De-Chuan Zhan","Xiaoyi Zeng","Yang Yang"],"pdf_url":"https://arxiv.org/pdf/2306.05001v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.20607v2","updated":"2024-06-06T07:51:34Z","published":"2024-05-31T03:47:44Z","title":"Textual Inversion and Self-supervised Refinement for Radiology Report\n  Generation","summary":"  Existing mainstream approaches follow the encoder-decoder paradigm for\ngenerating radiology reports. They focus on improving the network structure of\nencoders and decoders, which leads to two shortcomings: overlooking the\nmodality gap and ignoring report content constraints. In this paper, we\nproposed Textual Inversion and Self-supervised Refinement (TISR) to address the\nabove two issues. Specifically, textual inversion can project text and image\ninto the same space by representing images as pseudo words to eliminate the\ncross-modeling gap. Subsequently, self-supervised refinement refines these\npseudo words through contrastive loss computation between images and texts,\nenhancing the fidelity of generated reports to images. Notably, TISR is\northogonal to most existing methods, plug-and-play. We conduct experiments on\ntwo widely-used public datasets and achieve significant improvements on various\nbaselines, which demonstrates the effectiveness and generalization of TISR. The\ncode will be available soon.\n","authors":["Yuanjiang Luo","Hongxiang Li","Xuan Wu","Meng Cao","Xiaoshuang Huang","Zhihong Zhu","Peixi Liao","Hu Chen","Yi Zhang"],"pdf_url":"https://arxiv.org/pdf/2405.20607v2.pdf","comment":"This paper has been early accepted by MICCAI 2024!"},{"id":"http://arxiv.org/abs/2406.03818v1","updated":"2024-06-06T07:49:02Z","published":"2024-06-06T07:49:02Z","title":"Amortized Equation Discovery in Hybrid Dynamical Systems","summary":"  Hybrid dynamical systems are prevalent in science and engineering to express\ncomplex systems with continuous and discrete states. To learn the laws of\nsystems, all previous methods for equation discovery in hybrid systems follow a\ntwo-stage paradigm, i.e. they first group time series into small cluster\nfragments and then discover equations in each fragment separately through\nmethods in non-hybrid systems. Although effective, these methods do not fully\ntake advantage of the commonalities in the shared dynamics of multiple\nfragments that are driven by the same equations. Besides, the two-stage\nparadigm breaks the interdependence between categorizing and representing\ndynamics that jointly form hybrid systems. In this paper, we reformulate the\nproblem and propose an end-to-end learning framework, i.e. Amortized Equation\nDiscovery (AMORE), to jointly categorize modes and discover equations\ncharacterizing the dynamics of each mode by all segments of the mode.\nExperiments on four hybrid and six non-hybrid systems show that our method\noutperforms previous methods on equation discovery, segmentation, and\nforecasting.\n","authors":["Yongtuo Liu","Sara Magliacane","Miltiadis Kofinas","Efstratios Gavves"],"pdf_url":"https://arxiv.org/pdf/2406.03818v1.pdf","comment":"24 pages, 5 figures, accepted by International Conference on Machine\n  Learning (ICML) 2024"},{"id":"http://arxiv.org/abs/2404.14745v2","updated":"2024-06-06T07:46:24Z","published":"2024-04-23T04:54:32Z","title":"TAAT: Think and Act from Arbitrary Texts in Text2Motion","summary":"  Text2Motion aims to generate human motions from texts. Existing datasets rely\non the assumption that texts include action labels (such as \"walk, bend, and\npick up\"), which is not flexible for practical scenarios. This paper redefines\nthis problem with a more realistic assumption that the texts are arbitrary.\nSpecifically, arbitrary texts include existing action texts composed of action\nlabels (e.g., A person walks and bends to pick up something), and introduce\nscene texts without explicit action labels (e.g., A person notices his wallet\non the ground ahead).\n  To bridge the gaps between this realistic setting and existing datasets, we\nexpand the action texts on the HumanML3D dataset to more scene texts, thereby\ncreating a new HumanML3D++ dataset including arbitrary texts. In this\nchallenging dataset, we benchmark existing state-of-the-art methods and propose\na novel two-stage framework to extract action labels from arbitrary texts by\nthe Large Language Model (LLM) and then generate motions from action labels.\nExtensive experiments are conducted under different application scenarios to\nvalidate the effectiveness of the proposed framework on existing and proposed\ndatasets. The results indicate that Text2Motion in this realistic setting is\nvery challenging, fostering new research in this practical direction. Our\ndataset and code will be released.\n","authors":["Runqi Wang","Caoyuan Ma","Guopeng Li","Zheng Wang"],"pdf_url":"https://arxiv.org/pdf/2404.14745v2.pdf","comment":"Updated errors in author information"},{"id":"http://arxiv.org/abs/2406.03262v2","updated":"2024-06-06T07:20:10Z","published":"2024-06-05T13:40:07Z","title":"ADer: A Comprehensive Benchmark for Multi-class Visual Anomaly Detection","summary":"  Visual anomaly detection aims to identify anomalous regions in images through\nunsupervised learning paradigms, with increasing application demand and value\nin fields such as industrial inspection and medical lesion detection. Despite\nsignificant progress in recent years, there is a lack of comprehensive\nbenchmarks to adequately evaluate the performance of various mainstream methods\nacross different datasets under the practical multi-class setting. The absence\nof standardized experimental setups can lead to potential biases in training\nepochs, resolution, and metric results, resulting in erroneous conclusions.\nThis paper addresses this issue by proposing a comprehensive visual anomaly\ndetection benchmark, \\textbf{\\textit{ADer}}, which is a modular framework that\nis highly extensible for new methods. The benchmark includes multiple datasets\nfrom industrial and medical domains, implementing fifteen state-of-the-art\nmethods and nine comprehensive metrics. Additionally, we have open-sourced the\nGPU-assisted \\href{https://pypi.org/project/ADEval}{ADEval} package to address\nthe slow evaluation problem of metrics like time-consuming mAU-PRO on\nlarge-scale data, significantly reducing evaluation time by more than\n\\textit{1000-fold}. Through extensive experimental results, we objectively\nreveal the strengths and weaknesses of different methods and provide insights\ninto the challenges and future directions of multi-class visual anomaly\ndetection. We hope that \\textbf{\\textit{ADer}} will become a valuable resource\nfor researchers and practitioners in the field, promoting the development of\nmore robust and generalizable anomaly detection systems. Full codes have been\nattached in Appendix and open-sourced at\n\\url{https://github.com/zhangzjn/ader}.\n","authors":["Jiangning Zhang","Haoyang He","Zhenye Gan","Qingdong He","Yuxuan Cai","Zhucun Xue","Yabiao Wang","Chengjie Wang","Lei Xie","Yong Liu"],"pdf_url":"https://arxiv.org/pdf/2406.03262v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.03799v1","updated":"2024-06-06T07:12:50Z","published":"2024-06-06T07:12:50Z","title":"Enhanced Semantic Segmentation Pipeline for WeatherProof Dataset\n  Challenge","summary":"  This report describes the winning solution to the WeatherProof Dataset\nChallenge (CVPR 2024 UG2+ Track 3). Details regarding the challenge are\navailable at https://cvpr2024ug2challenge.github.io/track3.html. We propose an\nenhanced semantic segmentation pipeline for this challenge. Firstly, we improve\nsemantic segmentation models, using backbone pretrained with Depth Anything to\nimprove UperNet model and SETRMLA model, and adding language guidance based on\nboth weather and category information to InternImage model. Secondly, we\nintroduce a new dataset WeatherProofExtra with wider viewing angle and employ\ndata augmentation methods, including adverse weather and super-resolution.\nFinally, effective training strategies and ensemble method are applied to\nimprove final performance further. Our solution is ranked 1st on the final\nleaderboard. Code will be available at\nhttps://github.com/KaneiGi/WeatherProofChallenge.\n","authors":["Nan Zhang","Xidan Zhang","Jianing Wei","Fangjun Wang","Zhiming Tan"],"pdf_url":"https://arxiv.org/pdf/2406.03799v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.03793v1","updated":"2024-06-06T07:05:20Z","published":"2024-06-06T07:05:20Z","title":"Low-Rank Similarity Mining for Multimodal Dataset Distillation","summary":"  Though dataset distillation has witnessed rapid development in recent years,\nthe distillation of multimodal data, e.g., image-text pairs, poses unique and\nunder-explored challenges. Unlike unimodal data, image-text contrastive\nlearning (ITC) data lack inherent categorization and should instead place\ngreater emphasis on modality correspondence. In this work, we propose Low-Rank\nSimilarity Mining (LoRS) for multimodal dataset distillation, that concurrently\ndistills a ground truth similarity matrix with image-text pairs, and leverages\nlow-rank factorization for efficiency and scalability. The proposed approach\nbrings significant improvement to the existing algorithms, marking a\nsignificant contribution to the field of visual-language dataset distillation.\nWe advocate adopting LoRS as a foundational synthetic data setup for image-text\ndataset distillation. Our code is available at\nhttps://github.com/silicx/LoRS_Distill.\n","authors":["Yue Xu","Zhilin Lin","Yusong Qiu","Cewu Lu","Yong-Lu Li"],"pdf_url":"https://arxiv.org/pdf/2406.03793v1.pdf","comment":"Accepted at ICML 2024"},{"id":"http://arxiv.org/abs/2406.02881v2","updated":"2024-06-06T06:59:46Z","published":"2024-06-05T02:59:08Z","title":"Inv-Adapter: ID Customization Generation via Image Inversion and\n  Lightweight Adapter","summary":"  The remarkable advancement in text-to-image generation models significantly\nboosts the research in ID customization generation. However, existing\npersonalization methods cannot simultaneously satisfy high fidelity and\nhigh-efficiency requirements. Their main bottleneck lies in the prompt image\nencoder, which produces weak alignment signals with the text-to-image model and\nsignificantly increased model size. Towards this end, we propose a lightweight\nInv-Adapter, which first extracts diffusion-domain representations of ID images\nutilizing a pre-trained text-to-image model via DDIM image inversion, without\nadditional image encoder. Benefiting from the high alignment of the extracted\nID prompt features and the intermediate features of the text-to-image model, we\nthen embed them efficiently into the base text-to-image model by carefully\ndesigning a lightweight attention adapter. We conduct extensive experiments to\nassess ID fidelity, generation loyalty, speed, and training parameters, all of\nwhich show that the proposed Inv-Adapter is highly competitive in ID\ncustomization generation and model scale.\n","authors":["Peng Xing","Ning Wang","Jianbo Ouyang","Zechao Li"],"pdf_url":"https://arxiv.org/pdf/2406.02881v2.pdf","comment":"technical report"},{"id":"http://arxiv.org/abs/2406.03776v1","updated":"2024-06-06T06:40:19Z","published":"2024-06-06T06:40:19Z","title":"XL-HeadTags: Leveraging Multimodal Retrieval Augmentation for the\n  Multilingual Generation of News Headlines and Tags","summary":"  Millions of news articles published online daily can overwhelm readers.\nHeadlines and entity (topic) tags are essential for guiding readers to decide\nif the content is worth their time. While headline generation has been\nextensively studied, tag generation remains largely unexplored, yet it offers\nreaders better access to topics of interest. The need for conciseness in\ncapturing readers' attention necessitates improved content selection strategies\nfor identifying salient and relevant segments within lengthy articles, thereby\nguiding language models effectively. To address this, we propose to leverage\nauxiliary information such as images and captions embedded in the articles to\nretrieve relevant sentences and utilize instruction tuning with variations to\ngenerate both headlines and tags for news articles in a multilingual context.\nTo make use of the auxiliary information, we have compiled a dataset named\nXL-HeadTags, which includes 20 languages across 6 diverse language families.\nThrough extensive evaluation, we demonstrate the effectiveness of our\nplug-and-play multimodal-multilingual retrievers for both tasks. Additionally,\nwe have developed a suite of tools for processing and evaluating multilingual\ntexts, significantly contributing to the research community by enabling more\naccurate and efficient analysis across languages.\n","authors":["Faisal Tareque Shohan","Mir Tafseer Nayeem","Samsul Islam","Abu Ubaida Akash","Shafiq Joty"],"pdf_url":"https://arxiv.org/pdf/2406.03776v1.pdf","comment":"ACL 2024 camera ready"},{"id":"http://arxiv.org/abs/2310.00165v2","updated":"2024-06-06T06:33:47Z","published":"2023-09-29T22:09:07Z","title":"SCoRe: Submodular Combinatorial Representation Learning","summary":"  In this paper we introduce the SCoRe (Submodular Combinatorial Representation\nLearning) framework, a novel approach in representation learning that addresses\ninter-class bias and intra-class variance. SCoRe provides a new combinatorial\nviewpoint to representation learning, by introducing a family of loss functions\nbased on set-based submodular information measures. We develop two novel\ncombinatorial formulations for loss functions, using the Total Information and\nTotal Correlation, that naturally minimize intra-class variance and inter-class\nbias. Several commonly used metric/contrastive learning loss functions like\nsupervised contrastive loss, orthogonal projection loss, and N-pairs loss, are\nall instances of SCoRe, thereby underlining the versatility and applicability\nof SCoRe in a broad spectrum of learning scenarios. Novel objectives in SCoRe\nnaturally model class-imbalance with up to 7.6\\% improvement in classification\non CIFAR-10-LT, CIFAR-100-LT, MedMNIST, 2.1% on ImageNet-LT, and 19.4% in\nobject detection on IDD and LVIS (v1.0), demonstrating its effectiveness over\nexisting approaches.\n","authors":["Anay Majee","Suraj Kothawade","Krishnateja Killamsetty","Rishabh Iyer"],"pdf_url":"https://arxiv.org/pdf/2310.00165v2.pdf","comment":"Accepted to ICML 2024"},{"id":"http://arxiv.org/abs/2405.16488v2","updated":"2024-06-06T06:26:59Z","published":"2024-05-26T08:54:43Z","title":"Partial train and isolate, mitigate backdoor attack","summary":"  Neural networks are widely known to be vulnerable to backdoor attacks, a\nmethod that poisons a portion of the training data to make the target model\nperform well on normal data sets, while outputting attacker-specified or random\ncategories on the poisoned samples. Backdoor attacks are full of threats.\nPoisoned samples are becoming more and more similar to corresponding normal\nsamples, and even the human eye cannot easily distinguish them. On the other\nhand, the accuracy of models carrying backdoors on normal samples is no\ndifferent from that of clean models.In this article, by observing the\ncharacteristics of backdoor attacks, We provide a new model training method\n(PT) that freezes part of the model to train a model that can isolate\nsuspicious samples. Then, on this basis, a clean model is fine-tuned to resist\nbackdoor attacks.\n","authors":["Yong Li","Han Gao"],"pdf_url":"https://arxiv.org/pdf/2405.16488v2.pdf","comment":"9 pages, 2 figures"},{"id":"http://arxiv.org/abs/2406.00307v2","updated":"2024-06-06T06:08:45Z","published":"2024-06-01T05:41:12Z","title":"HENASY: Learning to Assemble Scene-Entities for Egocentric\n  Video-Language Model","summary":"  Current video-language models (VLMs) rely extensively on instance-level\nalignment between video and language modalities, which presents two major\nlimitations: (1) visual reasoning disobeys the natural perception that humans\ndo in first-person perspective, leading to a lack of reasoning interpretation;\nand (2) learning is limited in capturing inherent fine-grained relationships\nbetween two modalities.\n  In this paper, we take an inspiration from human perception and explore a\ncompositional approach for egocentric video representation. We introduce HENASY\n(Hierarchical ENtities ASsemblY), which includes a spatiotemporal token\ngrouping mechanism to explicitly assemble dynamically evolving scene entities\nthrough time and model their relationship for video representation. By\nleveraging compositional structure understanding, HENASY possesses strong\ninterpretability via visual grounding with free-form text queries. We further\nexplore a suite of multi-grained contrastive losses to facilitate\nentity-centric understandings. This comprises three alignment types:\nvideo-narration, noun-entity, verb-entities alignments.\n  Our method demonstrates strong interpretability in both quantitative and\nqualitative experiments; while maintaining competitive performances on five\ndownstream tasks via zero-shot transfer or as video/text representation,\nincluding video/text retrieval, action recognition, multi-choice query, natural\nlanguage query, and moments query.\n","authors":["Khoa Vo","Thinh Phan","Kashu Yamazaki","Minh Tran","Ngan Le"],"pdf_url":"https://arxiv.org/pdf/2406.00307v2.pdf","comment":"under submission"},{"id":"http://arxiv.org/abs/2305.12659v2","updated":"2024-06-06T05:57:51Z","published":"2023-05-22T03:03:29Z","title":"UVOSAM: A Mask-free Paradigm for Unsupervised Video Object Segmentation\n  via Segment Anything Model","summary":"  The current state-of-the-art methods for unsupervised video object\nsegmentation (UVOS) require extensive training on video datasets with mask\nannotations, limiting their effectiveness in handling challenging scenarios.\nHowever, the Segment Anything Model (SAM) introduces a new prompt-driven\nparadigm for image segmentation, offering new possibilities. In this study, we\ninvestigate SAM's potential for UVOS through different prompt strategies. We\nthen propose UVOSAM, a mask-free paradigm for UVOS that utilizes the STD-Net\ntracker. STD-Net incorporates a spatial-temporal decoupled deformable attention\nmechanism to establish an effective correlation between intra- and inter-frame\nfeatures, remarkably enhancing the quality of box prompts in complex video\nscenes. Extensive experiments on the DAVIS2017-unsupervised and\nYoutubeVIS19\\&21 datasets demonstrate the superior performance of UVOSAM\nwithout mask supervision compared to existing mask-supervised methods, as well\nas its ability to generalize to weakly-annotated video datasets. Code can be\nfound at https://github.com/alibaba/UVOSAM.\n","authors":["Zhenghao Zhang","Shengfan Zhang","Zhichao Wei","Zuozhuo Dai","Siyu Zhu"],"pdf_url":"https://arxiv.org/pdf/2305.12659v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.03095v2","updated":"2024-06-06T05:28:27Z","published":"2024-06-05T09:36:15Z","title":"EgoSurgery-Tool: A Dataset of Surgical Tool and Hand Detection from\n  Egocentric Open Surgery Videos","summary":"  Surgical tool detection is a fundamental task for understanding egocentric\nopen surgery videos. However, detecting surgical tools presents significant\nchallenges due to their highly imbalanced class distribution, similar shapes\nand similar textures, and heavy occlusion. The lack of a comprehensive\nlarge-scale dataset compounds these challenges. In this paper, we introduce\nEgoSurgery-Tool, an extension of the existing EgoSurgery-Phase dataset, which\ncontains real open surgery videos captured using an egocentric camera attached\nto the surgeon's head, along with phase annotations. EgoSurgery-Tool has been\ndensely annotated with surgical tools and comprises over 49K surgical tool\nbounding boxes across 15 categories, constituting a large-scale surgical tool\ndetection dataset. EgoSurgery-Tool also provides annotations for hand detection\nwith over 46K hand-bounding boxes, capturing hand-object interactions that are\ncrucial for understanding activities in egocentric open surgery.\nEgoSurgery-Tool is superior to existing datasets due to its larger scale,\ngreater variety of surgical tools, more annotations, and denser scenes. We\nconduct a comprehensive analysis of EgoSurgery-Tool using nine popular object\ndetectors to assess their effectiveness in both surgical tool and hand\ndetection. The dataset will be released at\nhttps://github.com/Fujiry0/EgoSurgery.\n","authors":["Ryo Fujii","Hideo Saito","Hiroki Kajita"],"pdf_url":"https://arxiv.org/pdf/2406.03095v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.03441v6","updated":"2024-06-06T05:27:59Z","published":"2023-12-06T11:50:14Z","title":"UFineBench: Towards Text-based Person Retrieval with Ultra-fine\n  Granularity","summary":"  Existing text-based person retrieval datasets often have relatively\ncoarse-grained text annotations. This hinders the model to comprehend the\nfine-grained semantics of query texts in real scenarios. To address this\nproblem, we contribute a new benchmark named \\textbf{UFineBench} for text-based\nperson retrieval with ultra-fine granularity.\n  Firstly, we construct a new \\textbf{dataset} named UFine6926. We collect a\nlarge number of person images and manually annotate each image with two\ndetailed textual descriptions, averaging 80.8 words each. The average word\ncount is three to four times that of the previous datasets. In addition of\nstandard in-domain evaluation, we also propose a special \\textbf{evaluation\nparadigm} more representative of real scenarios. It contains a new evaluation\nset with cross domains, cross textual granularity and cross textual styles,\nnamed UFine3C, and a new evaluation metric for accurately measuring retrieval\nability, named mean Similarity Distribution (mSD). Moreover, we propose CFAM, a\nmore efficient \\textbf{algorithm} especially designed for text-based person\nretrieval with ultra fine-grained texts. It achieves fine granularity mining by\nadopting a shared cross-modal granularity decoder and hard negative match\nmechanism.\n  With standard in-domain evaluation, CFAM establishes competitive performance\nacross various datasets, especially on our ultra fine-grained UFine6926.\nFurthermore, by evaluating on UFine3C, we demonstrate that training on our\nUFine6926 significantly improves generalization to real scenarios compared with\nother coarse-grained datasets. The dataset and code will be made publicly\navailable at \\url{https://github.com/Zplusdragon/UFineBench}.\n","authors":["Jialong Zuo","Hanyu Zhou","Ying Nie","Feng Zhang","Tianyu Guo","Nong Sang","Yunhe Wang","Changxin Gao"],"pdf_url":"https://arxiv.org/pdf/2312.03441v6.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.02918v2","updated":"2024-06-06T05:22:45Z","published":"2024-06-05T04:13:03Z","title":"U-KAN Makes Strong Backbone for Medical Image Segmentation and\n  Generation","summary":"  U-Net has become a cornerstone in various visual applications such as image\nsegmentation and diffusion probability models. While numerous innovative\ndesigns and improvements have been introduced by incorporating transformers or\nMLPs, the networks are still limited to linearly modeling patterns as well as\nthe deficient interpretability. To address these challenges, our intuition is\ninspired by the impressive results of the Kolmogorov-Arnold Networks (KANs) in\nterms of accuracy and interpretability, which reshape the neural network\nlearning via the stack of non-linear learnable activation functions derived\nfrom the Kolmogorov-Anold representation theorem. Specifically, in this paper,\nwe explore the untapped potential of KANs in improving backbones for vision\ntasks. We investigate, modify and re-design the established U-Net pipeline by\nintegrating the dedicated KAN layers on the tokenized intermediate\nrepresentation, termed U-KAN. Rigorous medical image segmentation benchmarks\nverify the superiority of U-KAN by higher accuracy even with less computation\ncost. We further delved into the potential of U-KAN as an alternative U-Net\nnoise predictor in diffusion models, demonstrating its applicability in\ngenerating task-oriented model architectures. These endeavours unveil valuable\ninsights and sheds light on the prospect that with U-KAN, you can make strong\nbackbone for medical image segmentation and generation. Project page:\nhttps://yes-ukan.github.io/\n","authors":["Chenxin Li","Xinyu Liu","Wuyang Li","Cheng Wang","Hengyu Liu","Yixuan Yuan"],"pdf_url":"https://arxiv.org/pdf/2406.02918v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.19732v3","updated":"2024-06-06T04:59:27Z","published":"2024-05-30T06:24:14Z","title":"Two Optimizers Are Better Than One: LLM Catalyst Empowers Gradient-Based\n  Optimization for Prompt Tuning","summary":"  Learning a skill generally relies on both practical experience by doer and\ninsightful high-level guidance by instructor. Will this strategy also work well\nfor solving complex non-convex optimization problems? Here, a common\ngradient-based optimizer acts like a disciplined doer, making locally optimal\nupdate at each step. Recent methods utilize large language models (LLMs) to\noptimize solutions for concrete problems by inferring from natural language\ninstructions, akin to a high-level instructor. In this paper, we show that\nthese two optimizers are complementary to each other, suggesting a\ncollaborative optimization approach. The gradient-based optimizer and LLM-based\noptimizer are combined in an interleaved manner. We instruct LLMs using task\ndescriptions and timely optimization trajectories recorded during\ngradient-based optimization. Inferred results from LLMs are used as restarting\npoints for the next stage of gradient optimization. By leveraging both the\nlocally rigorous gradient-based optimizer and the high-level deductive\nLLM-based optimizer, our combined optimization method consistently yields\nimprovements over competitive baseline prompt tuning methods. Our results\ndemonstrate the synergistic effect of conventional gradient-based optimization\nand the inference ability of LLMs. The code is released at\nhttps://github.com/guozix/LLM-catalyst.\n","authors":["Zixian Guo","Ming Liu","Zhilong Ji","Jinfeng Bai","Yiwen Guo","Wangmeng Zuo"],"pdf_url":"https://arxiv.org/pdf/2405.19732v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.03747v1","updated":"2024-06-06T04:57:29Z","published":"2024-06-06T04:57:29Z","title":"Instance Segmentation and Teeth Classification in Panoramic X-rays","summary":"  Teeth segmentation and recognition are critical in various dental\napplications and dental diagnosis. Automatic and accurate segmentation\napproaches have been made possible by integrating deep learning models.\nAlthough teeth segmentation has been studied in the past, only some techniques\nwere able to effectively classify and segment teeth simultaneously. This\narticle offers a pipeline of two deep learning models, U-Net and YOLOv8, which\nresults in BB-UNet, a new architecture for the classification and segmentation\nof teeth on panoramic X-rays that is efficient and reliable. We have improved\nthe quality and reliability of teeth segmentation by utilising the YOLOv8 and\nU-Net capabilities. The proposed networks have been evaluated using the mean\naverage precision (mAP) and dice coefficient for YOLOv8 and BB-UNet,\nrespectively. We have achieved a 3\\% increase in mAP score for teeth\nclassification compared to existing methods, and a 10-15\\% increase in dice\ncoefficient for teeth segmentation compared to U-Net across different\ncategories of teeth. A new Dental dataset was created based on UFBA-UESC\ndataset with Bounding-Box and Polygon annotations of 425 dental panoramic\nX-rays. The findings of this research pave the way for a wider adoption of\nobject detection models in the field of dental diagnosis.\n","authors":["Devichand Budagam","Ayush Kumar","Sayan Ghosh","Anuj Shrivastav","Azamat Zhanatuly Imanbayev","Iskander Rafailovich Akhmetov","Dmitrii Kaplun","Sergey Antonov","Artem Rychenkov","Gleb Cyganov","Aleksandr Sinitca"],"pdf_url":"https://arxiv.org/pdf/2406.03747v1.pdf","comment":"submtted to Expert Systems with Applications Journal"},{"id":"http://arxiv.org/abs/2406.03744v1","updated":"2024-06-06T04:44:10Z","published":"2024-06-06T04:44:10Z","title":"ReDistill: Residual Encoded Distillation for Peak Memory Reduction","summary":"  The expansion of neural network sizes and the enhancement of image resolution\nthrough modern camera sensors result in heightened memory and power demands for\nneural networks. Reducing peak memory, which is the maximum memory consumed\nduring the execution of a neural network, is critical to deploy neural networks\non edge devices with limited memory budget. A naive approach to reducing peak\nmemory is aggressive down-sampling of feature maps via pooling with large\nstride, which often results in unacceptable degradation in network performance.\nTo mitigate this problem, we propose residual encoded distillation (ReDistill)\nfor peak memory reduction in a teacher-student framework, in which a student\nnetwork with less memory is derived from the teacher network using aggressive\npooling. We apply our distillation method to multiple problems in computer\nvision including image classification and diffusion based image generation. For\nimage classification, our method yields 2x-3.2x measured peak memory on an edge\nGPU with negligible degradation in accuracy for most CNN based architectures.\nAdditionally, our method yields improved test accuracy for tiny vision\ntransformer (ViT) based models distilled from large CNN based teacher\narchitectures. For diffusion-based image generation, our proposed distillation\nmethod yields a denoising network with 4x lower theoretical peak memory while\nmaintaining decent diversity and fidelity for image generation. Experiments\ndemonstrate our method's superior performance compared to other feature-based\nand response-based distillation methods.\n","authors":["Fang Chen","Gourav Datta","Mujahid Al Rafi","Hyeran Jeon","Meng Tang"],"pdf_url":"https://arxiv.org/pdf/2406.03744v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.17814v2","updated":"2024-06-06T04:23:06Z","published":"2024-05-28T04:18:00Z","title":"FAIntbench: A Holistic and Precise Benchmark for Bias Evaluation in\n  Text-to-Image Models","summary":"  The rapid development and reduced barriers to entry for Text-to-Image (T2I)\nmodels have raised concerns about the biases in their outputs, but existing\nresearch lacks a holistic definition and evaluation framework of biases,\nlimiting the enhancement of debiasing techniques. To address this issue, we\nintroduce FAIntbench, a holistic and precise benchmark for biases in T2I\nmodels. In contrast to existing benchmarks that evaluate bias in limited\naspects, FAIntbench evaluate biases from four dimensions: manifestation of\nbias, visibility of bias, acquired attributes, and protected attributes. We\napplied FAIntbench to evaluate seven recent large-scale T2I models and\nconducted human evaluation, whose results demonstrated the effectiveness of\nFAIntbench in identifying various biases. Our study also revealed new research\nquestions about biases, including the side-effect of distillation. The findings\npresented here are preliminary, highlighting the potential of FAIntbench to\nadvance future research aimed at mitigating the biases in T2I models. Our\nbenchmark is publicly available to ensure the reproducibility.\n","authors":["Hanjun Luo","Ziye Deng","Ruizhe Chen","Zuozhu Liu"],"pdf_url":"https://arxiv.org/pdf/2405.17814v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.02343v2","updated":"2024-06-06T04:15:44Z","published":"2024-06-04T14:19:50Z","title":"Cluster-Aware Similarity Diffusion for Instance Retrieval","summary":"  Diffusion-based re-ranking is a common method used for retrieving instances\nby performing similarity propagation in a nearest neighbor graph. However,\nexisting techniques that construct the affinity graph based on pairwise\ninstances can lead to the propagation of misinformation from outliers and other\nmanifolds, resulting in inaccurate results. To overcome this issue, we propose\na novel Cluster-Aware Similarity (CAS) diffusion for instance retrieval. The\nprimary concept of CAS is to conduct similarity diffusion within local\nclusters, which can reduce the influence from other manifolds explicitly. To\nobtain a symmetrical and smooth similarity matrix, our Bidirectional Similarity\nDiffusion strategy introduces an inverse constraint term to the optimization\nobjective of local cluster diffusion. Additionally, we have optimized a\nNeighbor-guided Similarity Smoothing approach to ensure similarity consistency\namong the local neighbors of each instance. Evaluations in instance retrieval\nand object re-identification validate the effectiveness of the proposed CAS,\nour code is publicly available.\n","authors":["Jifei Luo","Hantao Yao","Changsheng Xu"],"pdf_url":"https://arxiv.org/pdf/2406.02343v2.pdf","comment":"This paper has been accepted by ICML2024"},{"id":"http://arxiv.org/abs/2210.17180v2","updated":"2024-06-06T04:15:29Z","published":"2022-10-31T09:54:28Z","title":"Automated Dominative Subspace Mining for Efficient Neural Architecture\n  Search","summary":"  Neural Architecture Search (NAS) aims to automatically find effective\narchitectures within a predefined search space. However, the search space is\noften extremely large. As a result, directly searching in such a large search\nspace is non-trivial and also very time-consuming. To address the above issues,\nin each search step, we seek to limit the search space to a small but effective\nsubspace to boost both the search performance and search efficiency. To this\nend, we propose a novel Neural Architecture Search method via Dominative\nSubspace Mining (DSM-NAS) that finds promising architectures in automatically\nmined subspaces. Specifically, we first perform a global search, i.e .,\ndominative subspace mining, to find a good subspace from a set of candidates.\nThen, we perform a local search within the mined subspace to find effective\narchitectures. More critically, we further boost search performance by taking\nwell-designed/ searched architectures to initialize candidate subspaces.\nExperimental results demonstrate that DSM-NAS not only reduces the search cost\nbut also discovers better architectures than state-of-the-art methods in\nvarious benchmark search spaces.\n","authors":["Yaofo Chen","Yong Guo","Daihai Liao","Fanbing Lv","Hengjie Song","James Tin-Yau Kwok","Mingkui Tan"],"pdf_url":"https://arxiv.org/pdf/2210.17180v2.pdf","comment":"Published in IEEE TCSVT"},{"id":"http://arxiv.org/abs/2402.17316v3","updated":"2024-06-06T04:08:24Z","published":"2024-02-27T08:47:19Z","title":"Towards Robust and Efficient Cloud-Edge Elastic Model Adaptation via\n  Selective Entropy Distillation","summary":"  The conventional deep learning paradigm often involves training a deep model\non a server and then deploying the model or its distilled ones to\nresource-limited edge devices. Usually, the models shall remain fixed once\ndeployed (at least for some period) due to the potential high cost of model\nadaptation for both the server and edge sides. However, in many real-world\nscenarios, the test environments may change dynamically (known as distribution\nshifts), which often results in degraded performance. Thus, one has to adapt\nthe edge models promptly to attain promising performance. Moreover, with the\nincreasing data collected at the edge, this paradigm also fails to further\nadapt the cloud model for better performance. To address these, we encounter\ntwo primary challenges: 1) the edge model has limited computation power and may\nonly support forward propagation; 2) the data transmission budget between cloud\nand edge devices is limited in latency-sensitive scenarios. In this paper, we\nestablish a Cloud-Edge Elastic Model Adaptation (CEMA) paradigm in which the\nedge models only need to perform forward propagation and the edge models can be\nadapted online. In our CEMA, to reduce the communication burden, we devise two\ncriteria to exclude unnecessary samples from uploading to the cloud, i.e.,\ndynamic unreliable and low-informative sample exclusion. Based on the uploaded\nsamples, we update and distribute the affine parameters of normalization layers\nby distilling from the stronger foundation model to the edge model with a\nsample replay strategy. Extensive experimental results on ImageNet-C and\nImageNet-R verify the effectiveness of our CEMA.\n","authors":["Yaofo Chen","Shuaicheng Niu","Yaowei Wang","Shoukai Xu","Hengjie Song","Mingkui Tan"],"pdf_url":"https://arxiv.org/pdf/2402.17316v3.pdf","comment":"Published in ICLR 2024"},{"id":"http://arxiv.org/abs/2405.14156v2","updated":"2024-06-06T03:58:29Z","published":"2024-05-23T04:08:23Z","title":"Unveiling the Tapestry of Consistency in Large Vision-Language Models","summary":"  Large vision-language models (LVLMs) have recently achieved rapid progress,\nexhibiting great perception and reasoning abilities concerning visual\ninformation. However, when faced with prompts in different sizes of solution\nspaces, LVLMs fail to always give consistent answers regarding the same\nknowledge point. This inconsistency of answers between different solution\nspaces is prevalent in LVLMs and erodes trust. To this end, we provide a\nmulti-modal benchmark ConBench, to intuitively analyze how LVLMs perform when\nthe solution space of a prompt revolves around a knowledge point. Based on the\nConBench tool, we are the first to reveal the tapestry and get the following\nfindings: (1) In the discriminate realm, the larger the solution space of the\nprompt, the lower the accuracy of the answers. (2) Establish the relationship\nbetween the discriminative and generative realms: the accuracy of the\ndiscriminative question type exhibits a strong positive correlation with its\nConsistency with the caption. (3) Compared to open-source models, closed-source\nmodels exhibit a pronounced bias advantage in terms of Consistency. Eventually,\nwe ameliorate the consistency of LVLMs by trigger-based diagnostic refinement,\nindirectly improving the performance of their caption. We hope this paper will\naccelerate the research community in better evaluating their models and\nencourage future advancements in the consistency domain.\n","authors":["Yuan Zhang","Fei Xiao","Tao Huang","Chun-Kai Fan","Hongyuan Dong","Jiawen Li","Jiacong Wang","Kuan Cheng","Shanghang Zhang","Haoyuan Guo"],"pdf_url":"https://arxiv.org/pdf/2405.14156v2.pdf","comment":"This project is available at\n  https://github.com/foundation-multimodal-models/ConBench"},{"id":"http://arxiv.org/abs/2406.03728v1","updated":"2024-06-06T03:57:08Z","published":"2024-06-06T03:57:08Z","title":"Evaluating Durability: Benchmark Insights into Multimodal Watermarking","summary":"  With the development of large models, watermarks are increasingly employed to\nassert copyright, verify authenticity, or monitor content distribution. As\napplications become more multimodal, the utility of watermarking techniques\nbecomes even more critical. The effectiveness and reliability of these\nwatermarks largely depend on their robustness to various disturbances. However,\nthe robustness of these watermarks in real-world scenarios, particularly under\nperturbations and corruption, is not well understood. To highlight the\nsignificance of robustness in watermarking techniques, our study evaluated the\nrobustness of watermarked content generated by image and text generation models\nagainst common real-world image corruptions and text perturbations. Our results\ncould pave the way for the development of more robust watermarking techniques\nin the future. Our project website can be found at\n\\url{https://mmwatermark-robustness.github.io/}.\n","authors":["Jielin Qiu","William Han","Xuandong Zhao","Shangbang Long","Christos Faloutsos","Lei Li"],"pdf_url":"https://arxiv.org/pdf/2406.03728v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.03723v1","updated":"2024-06-06T03:37:39Z","published":"2024-06-06T03:37:39Z","title":"Gear-NeRF: Free-Viewpoint Rendering and Tracking with Motion-aware\n  Spatio-Temporal Sampling","summary":"  Extensions of Neural Radiance Fields (NeRFs) to model dynamic scenes have\nenabled their near photo-realistic, free-viewpoint rendering. Although these\nmethods have shown some potential in creating immersive experiences, two\ndrawbacks limit their ubiquity: (i) a significant reduction in reconstruction\nquality when the computing budget is limited, and (ii) a lack of semantic\nunderstanding of the underlying scenes. To address these issues, we introduce\nGear-NeRF, which leverages semantic information from powerful image\nsegmentation models. Our approach presents a principled way for learning a\nspatio-temporal (4D) semantic embedding, based on which we introduce the\nconcept of gears to allow for stratified modeling of dynamic regions of the\nscene based on the extent of their motion. Such differentiation allows us to\nadjust the spatio-temporal sampling resolution for each region in proportion to\nits motion scale, achieving more photo-realistic dynamic novel view synthesis.\nAt the same time, almost for free, our approach enables free-viewpoint tracking\nof objects of interest - a functionality not yet achieved by existing\nNeRF-based methods. Empirical studies validate the effectiveness of our method,\nwhere we achieve state-of-the-art rendering and tracking performance on\nmultiple challenging datasets.\n","authors":["Xinhang Liu","Yu-Wing Tai","Chi-Keung Tang","Pedro Miraldo","Suhas Lohit","Moitreya Chatterjee"],"pdf_url":"https://arxiv.org/pdf/2406.03723v1.pdf","comment":"Paper accepted to IEEE/CVF CVPR 2024 (Spotlight). Work done when XL\n  was an intern at MERL. Project Page Link:\n  https://merl.com/research/highlights/gear-nerf"},{"id":"http://arxiv.org/abs/2406.03721v1","updated":"2024-06-06T03:34:42Z","published":"2024-06-06T03:34:42Z","title":"Attribute-Aware Implicit Modality Alignment for Text Attribute Person\n  Search","summary":"  Text attribute person search aims to find specific pedestrians through given\ntextual attributes, which is very meaningful in the scene of searching for\ndesignated pedestrians through witness descriptions. The key challenge is the\nsignificant modality gap between textual attributes and images. Previous\nmethods focused on achieving explicit representation and alignment through\nunimodal pre-trained models. Nevertheless, the absence of inter-modality\ncorrespondence in these models may lead to distortions in the local information\nof intra-modality. Moreover, these methods only considered the alignment of\ninter-modality and ignored the differences between different attribute\ncategories. To mitigate the above problems, we propose an Attribute-Aware\nImplicit Modality Alignment (AIMA) framework to learn the correspondence of\nlocal representations between textual attributes and images and combine global\nrepresentation matching to narrow the modality gap. Firstly, we introduce the\nCLIP model as the backbone and design prompt templates to transform attribute\ncombinations into structured sentences. This facilitates the model's ability to\nbetter understand and match image details. Next, we design a Masked Attribute\nPrediction (MAP) module that predicts the masked attributes after the\ninteraction of image and masked textual attribute features through multi-modal\ninteraction, thereby achieving implicit local relationship alignment. Finally,\nwe propose an Attribute-IoU Guided Intra-Modal Contrastive (A-IoU IMC) loss,\naligning the distribution of different textual attributes in the embedding\nspace with their IoU distribution, achieving better semantic arrangement.\nExtensive experiments on the Market-1501 Attribute, PETA, and PA100K datasets\nshow that the performance of our proposed method significantly surpasses the\ncurrent state-of-the-art methods.\n","authors":["Xin Wang","Fangfang Liu","Zheng Li","Caili Guo"],"pdf_url":"https://arxiv.org/pdf/2406.03721v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.03720v1","updated":"2024-06-06T03:31:41Z","published":"2024-06-06T03:31:41Z","title":"JIGMARK: A Black-Box Approach for Enhancing Image Watermarks against\n  Diffusion Model Edits","summary":"  In this study, we investigate the vulnerability of image watermarks to\ndiffusion-model-based image editing, a challenge exacerbated by the\ncomputational cost of accessing gradient information and the closed-source\nnature of many diffusion models. To address this issue, we introduce JIGMARK.\nThis first-of-its-kind watermarking technique enhances robustness through\ncontrastive learning with pairs of images, processed and unprocessed by\ndiffusion models, without needing a direct backpropagation of the diffusion\nprocess. Our evaluation reveals that JIGMARK significantly surpasses existing\nwatermarking solutions in resilience to diffusion-model edits, demonstrating a\nTrue Positive Rate more than triple that of leading baselines at a 1% False\nPositive Rate while preserving image quality. At the same time, it consistently\nimproves the robustness against other conventional perturbations (like JPEG,\nblurring, etc.) and malicious watermark attacks over the state-of-the-art,\noften by a large margin. Furthermore, we propose the Human Aligned Variation\n(HAV) score, a new metric that surpasses traditional similarity measures in\nquantifying the number of image derivatives from image editing.\n","authors":["Minzhou Pan","Yi Zeng","Xue Lin","Ning Yu","Cho-Jui Hsieh","Peter Henderson","Ruoxi Jia"],"pdf_url":"https://arxiv.org/pdf/2406.03720v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.12424v4","updated":"2024-06-06T03:02:25Z","published":"2024-02-19T16:34:50Z","title":"Tables as Texts or Images: Evaluating the Table Reasoning Ability of\n  LLMs and MLLMs","summary":"  In this paper, we investigate the effectiveness of various LLMs in\ninterpreting tabular data through different prompting strategies and data\nformats. Our analyses extend across six benchmarks for table-related tasks such\nas question-answering and fact-checking. We introduce for the first time the\nassessment of LLMs' performance on image-based table representations.\nSpecifically, we compare five text-based and three image-based table\nrepresentations, demonstrating the role of representation and prompting on LLM\nperformance. Our study provides insights into the effective use of LLMs on\ntable-related tasks.\n","authors":["Naihao Deng","Zhenjie Sun","Ruiqi He","Aman Sikka","Yulong Chen","Lin Ma","Yue Zhang","Rada Mihalcea"],"pdf_url":"https://arxiv.org/pdf/2402.12424v4.pdf","comment":"Accepted to ACL 2024 Findings"},{"id":"http://arxiv.org/abs/2404.13874v2","updated":"2024-06-06T02:53:37Z","published":"2024-04-22T04:49:22Z","title":"VALOR-EVAL: Holistic Coverage and Faithfulness Evaluation of Large\n  Vision-Language Models","summary":"  Large Vision-Language Models (LVLMs) suffer from hallucination issues,\nwherein the models generate plausible-sounding but factually incorrect outputs,\nundermining their reliability. A comprehensive quantitative evaluation is\nnecessary to identify and understand the extent of hallucinations in these\nmodels. However, existing benchmarks are often limited in scope, focusing\nmainly on object hallucinations. Furthermore, current evaluation methods\nstruggle to effectively address the subtle semantic distinctions between model\noutputs and reference data, as well as the balance between hallucination and\ninformativeness. To address these issues, we introduce a multi-dimensional\nbenchmark covering objects, attributes, and relations, with challenging images\nselected based on associative biases. Moreover, we propose a large language\nmodel (LLM)-based two-stage evaluation framework that generalizes the popular\nCHAIR metric and incorporates both faithfulness and coverage into the\nevaluation. Experiments on 10 established LVLMs demonstrate that our evaluation\nmetric is more comprehensive and better correlated with humans than existing\nwork when evaluating on our challenging human-annotated benchmark dataset. Our\nwork also highlights the critical balance between faithfulness and coverage of\nmodel outputs, and encourages future works to address hallucinations in LVLMs\nwhile keeping their outputs informative.\n","authors":["Haoyi Qiu","Wenbo Hu","Zi-Yi Dou","Nanyun Peng"],"pdf_url":"https://arxiv.org/pdf/2404.13874v2.pdf","comment":"ACL 2024 Findings"},{"id":"http://arxiv.org/abs/2406.03702v1","updated":"2024-06-06T02:51:57Z","published":"2024-06-06T02:51:57Z","title":"DSNet: A Novel Way to Use Atrous Convolutions in Semantic Segmentation","summary":"  Atrous convolutions are employed as a method to increase the receptive field\nin semantic segmentation tasks. However, in previous works of semantic\nsegmentation, it was rarely employed in the shallow layers of the model. We\nrevisit the design of atrous convolutions in modern convolutional neural\nnetworks (CNNs), and demonstrate that the concept of using large kernels to\napply atrous convolutions could be a more powerful paradigm. We propose three\nguidelines to apply atrous convolutions more efficiently. Following these\nguidelines, we propose DSNet, a Dual-Branch CNN architecture, which\nincorporates atrous convolutions in the shallow layers of the model\narchitecture, as well as pretraining the nearly entire encoder on ImageNet to\nachieve better performance. To demonstrate the effectiveness of our approach,\nour models achieve a new state-of-the-art trade-off between accuracy and speed\non ADE20K, Cityscapes and BDD datasets. Specifically, DSNet achieves 40.0% mIOU\nwith inference speed of 179.2 FPS on ADE20K, and 80.4% mIOU with speed of 81.9\nFPS on Cityscapes. Source code and models are available at Github:\nhttps://github.com/takaniwa/DSNet.\n","authors":["Zilu Guo","Liuyang Bian","Xuan Huang","Hu Wei","Jingyu Li","Huasheng Ni"],"pdf_url":"https://arxiv.org/pdf/2406.03702v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.04406v3","updated":"2024-06-06T02:51:17Z","published":"2023-10-06T17:55:11Z","title":"Language Agent Tree Search Unifies Reasoning Acting and Planning in\n  Language Models","summary":"  While language models (LMs) have shown potential across a range of\ndecision-making tasks, their reliance on simple acting processes limits their\nbroad deployment as autonomous agents. In this paper, we introduce Language\nAgent Tree Search (LATS) -- the first general framework that synergizes the\ncapabilities of LMs in reasoning, acting, and planning. By leveraging the\nin-context learning ability of LMs, we integrate Monte Carlo Tree Search into\nLATS to enable LMs as agents, along with LM-powered value functions and\nself-reflections for proficient exploration and enhanced decision-making. A key\nfeature of our approach is the incorporation of an environment for external\nfeedback, which offers a more deliberate and adaptive problem-solving mechanism\nthat surpasses the constraints of existing techniques. Our experimental\nevaluation across diverse domains, including programming, interactive\nquestion-answering (QA), web navigation, and math, validates the effectiveness\nand generality of LATS in decision-making while maintaining competitive or\nimproved reasoning performance. Notably, LATS achieves state-of-the-art pass@1\naccuracy (92.7%) for programming on HumanEval with GPT-4 and demonstrates\ngradient-free performance (average score of 75.9) comparable to gradient-based\nfine-tuning for web navigation on WebShop with GPT-3.5. Code can be found at\nhttps://github.com/lapisrocks/LanguageAgentTreeSearch\n","authors":["Andy Zhou","Kai Yan","Michal Shlapentokh-Rothman","Haohan Wang","Yu-Xiong Wang"],"pdf_url":"https://arxiv.org/pdf/2310.04406v3.pdf","comment":"Code at https://github.com/lapisrocks/LanguageAgentTreeSearch"},{"id":"http://arxiv.org/abs/2306.06209v3","updated":"2024-06-06T02:44:11Z","published":"2023-05-11T10:05:57Z","title":"Backdoor Attack with Sparse and Invisible Trigger","summary":"  Deep neural networks (DNNs) are vulnerable to backdoor attacks, where the\nadversary manipulates a small portion of training data such that the victim\nmodel predicts normally on the benign samples but classifies the triggered\nsamples as the target class. The backdoor attack is an emerging yet threatening\ntraining-phase threat, leading to serious risks in DNN-based applications. In\nthis paper, we revisit the trigger patterns of existing backdoor attacks. We\nreveal that they are either visible or not sparse and therefore are not\nstealthy enough. More importantly, it is not feasible to simply combine\nexisting methods to design an effective sparse and invisible backdoor attack.\nTo address this problem, we formulate the trigger generation as a bi-level\noptimization problem with sparsity and invisibility constraints and propose an\neffective method to solve it. The proposed method is dubbed sparse and\ninvisible backdoor attack (SIBA). We conduct extensive experiments on benchmark\ndatasets under different settings, which verify the effectiveness of our attack\nand its resistance to existing backdoor defenses. The codes for reproducing\nmain experiments are available at \\url{https://github.com/YinghuaGao/SIBA}.\n","authors":["Yinghua Gao","Yiming Li","Xueluan Gong","Zhifeng Li","Shu-Tao Xia","Qian Wang"],"pdf_url":"https://arxiv.org/pdf/2306.06209v3.pdf","comment":"This paper was accepted by IEEE Transactions on Information Forensics\n  and Security (TIFS). The first two authors contributed equally to this work.\n  14 pages"},{"id":"http://arxiv.org/abs/2406.03697v1","updated":"2024-06-06T02:32:41Z","published":"2024-06-06T02:32:41Z","title":"Superpoint Gaussian Splatting for Real-Time High-Fidelity Dynamic Scene\n  Reconstruction","summary":"  Rendering novel view images in dynamic scenes is a crucial yet challenging\ntask. Current methods mainly utilize NeRF-based methods to represent the static\nscene and an additional time-variant MLP to model scene deformations, resulting\nin relatively low rendering quality as well as slow inference speed. To tackle\nthese challenges, we propose a novel framework named Superpoint Gaussian\nSplatting (SP-GS). Specifically, our framework first employs explicit 3D\nGaussians to reconstruct the scene and then clusters Gaussians with similar\nproperties (e.g., rotation, translation, and location) into superpoints.\nEmpowered by these superpoints, our method manages to extend 3D Gaussian\nsplatting to dynamic scenes with only a slight increase in computational\nexpense. Apart from achieving state-of-the-art visual quality and real-time\nrendering under high resolutions, the superpoint representation provides a\nstronger manipulation capability. Extensive experiments demonstrate the\npracticality and effectiveness of our approach on both synthetic and real-world\ndatasets. Please see our project page at\nhttps://dnvtmf.github.io/SP_GS.github.io.\n","authors":["Diwen Wan","Ruijie Lu","Gang Zeng"],"pdf_url":"https://arxiv.org/pdf/2406.03697v1.pdf","comment":"Accepted by ICML 2024"},{"id":"http://arxiv.org/abs/2406.03694v1","updated":"2024-06-06T02:22:43Z","published":"2024-06-06T02:22:43Z","title":"Untrained Neural Nets for Snapshot Compressive Imaging: Theory and\n  Algorithms","summary":"  Snapshot compressive imaging (SCI) recovers high-dimensional (3D) data cubes\nfrom a single 2D measurement, enabling diverse applications like video and\nhyperspectral imaging to go beyond standard techniques in terms of acquisition\nspeed and efficiency. In this paper, we focus on SCI recovery algorithms that\nemploy untrained neural networks (UNNs), such as deep image prior (DIP), to\nmodel source structure. Such UNN-based methods are appealing as they have the\npotential of avoiding the computationally intensive retraining required for\ndifferent source models and different measurement scenarios. We first develop a\ntheoretical framework for characterizing the performance of such UNN-based\nmethods. The theoretical framework, on the one hand, enables us to optimize the\nparameters of data-modulating masks, and on the other hand, provides a\nfundamental connection between the number of data frames that can be recovered\nfrom a single measurement to the parameters of the untrained NN. We also employ\nthe recently proposed bagged-deep-image-prior (bagged-DIP) idea to develop SCI\nBagged Deep Video Prior (SCI-BDVP) algorithms that address the common\nchallenges faced by standard UNN solutions. Our experimental results show that\nin video SCI our proposed solution achieves state-of-the-art among UNN methods,\nand in the case of noisy measurements, it even outperforms supervised\nsolutions.\n","authors":["Mengyu Zhao","Xi Chen","Xin Yuan","Shirin Jalali"],"pdf_url":"https://arxiv.org/pdf/2406.03694v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.03688v1","updated":"2024-06-06T02:19:18Z","published":"2024-06-06T02:19:18Z","title":"Shadow and Light: Digitally Reconstructed Radiographs for Disease\n  Classification","summary":"  In this paper, we introduce DRR-RATE, a large-scale synthetic chest X-ray\ndataset derived from the recently released CT-RATE dataset. DRR-RATE comprises\nof 50,188 frontal Digitally Reconstructed Radiographs (DRRs) from 21,304 unique\npatients. Each image is paired with a corresponding radiology text report and\nbinary labels for 18 pathology classes. Given the controllable nature of DRR\ngeneration, it facilitates the inclusion of lateral view images and images from\nany desired viewing position. This opens up avenues for research into new and\nnovel multimodal applications involving paired CT, X-ray images from various\nviews, text, and binary labels. We demonstrate the applicability of DRR-RATE\nalongside existing large-scale chest X-ray resources, notably the CheXpert\ndataset and CheXnet model. Experiments demonstrate that CheXnet, when trained\nand tested on the DRR-RATE dataset, achieves sufficient to high AUC scores for\nthe six common pathologies cited in common literature: Atelectasis,\nCardiomegaly, Consolidation, Lung Lesion, Lung Opacity, and Pleural Effusion.\nAdditionally, CheXnet trained on the CheXpert dataset can accurately identify\nseveral pathologies, even when operating out of distribution. This confirms\nthat the generated DRR images effectively capture the essential pathology\nfeatures from CT images. The dataset and labels are publicly accessible at\nhttps://huggingface.co/datasets/farrell236/DRR-RATE.\n","authors":["Benjamin Hou","Qingqing Zhu","Tejas Sudarshan Mathai","Qiao Jin","Zhiyong Lu","Ronald M. Summers"],"pdf_url":"https://arxiv.org/pdf/2406.03688v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.03051v2","updated":"2024-06-06T02:18:41Z","published":"2024-06-05T08:26:44Z","title":"Adapter-X: A Novel General Parameter-Efficient Fine-Tuning Framework for\n  Vision","summary":"  Parameter-efficient fine-tuning (PEFT) has become increasingly important as\nfoundation models continue to grow in both popularity and size. Adapter has\nbeen particularly well-received due to their potential for parameter reduction\nand adaptability across diverse tasks. However, striking a balance between high\nefficiency and robust generalization across tasks remains a challenge for\nadapter-based methods. We analyze existing methods and find that: 1) parameter\nsharing is the key to reducing redundancy; 2) more tunable parameters, dynamic\nallocation, and block-specific design are keys to improving performance.\nUnfortunately, no previous work considers all these factors. Inspired by this\ninsight, we introduce a novel framework named Adapter-X. First, a Sharing\nMixture of Adapters (SMoA) module is proposed to fulfill token-level dynamic\nallocation, increased tunable parameters, and inter-block sharing at the same\ntime. Second, some block-specific designs like Prompt Generator (PG) are\nintroduced to further enhance the ability of adaptation. Extensive experiments\nacross 2D image and 3D point cloud modalities demonstrate that Adapter-X\nrepresents a significant milestone as it is the first to outperform full\nfine-tuning in both 2D image and 3D point cloud modalities with significantly\nfewer parameters, i.e., only 0.20% and 1.88% of original trainable parameters\nfor 2D and 3D classification tasks. Our code will be publicly available.\n","authors":["Minglei Li","Peng Ye","Yongqi Huang","Lin Zhang","Tao Chen","Tong He","Jiayuan Fan","Wanli Ouyang"],"pdf_url":"https://arxiv.org/pdf/2406.03051v2.pdf","comment":null}],"Information Retrieval":[{"id":"http://arxiv.org/abs/2406.04331v1","updated":"2024-06-06T17:59:10Z","published":"2024-06-06T17:59:10Z","title":"PaCE: Parsimonious Concept Engineering for Large Language Models","summary":"  Large Language Models (LLMs) are being used for a wide variety of tasks.\nWhile they are capable of generating human-like responses, they can also\nproduce undesirable output including potentially harmful information, racist or\nsexist language, and hallucinations. Alignment methods are designed to reduce\nsuch undesirable output, via techniques such as fine-tuning, prompt\nengineering, and representation engineering. However, existing methods face\nseveral challenges: some require costly fine-tuning for every alignment task;\nsome do not adequately remove undesirable concepts, failing alignment; some\nremove benign concepts, lowering the linguistic capabilities of LLMs. To\naddress these issues, we propose Parsimonious Concept Engineering (PaCE), a\nnovel activation engineering framework for alignment. First, to sufficiently\nmodel the concepts, we construct a large-scale concept dictionary in the\nactivation space, in which each atom corresponds to a semantic concept. Then,\ngiven any alignment task, we instruct a concept partitioner to efficiently\nannotate the concepts as benign or undesirable. Finally, at inference time, we\ndecompose the LLM activations along the concept dictionary via sparse coding,\nto accurately represent the activation as a linear combination of the benign\nand undesirable components. By removing the latter ones from the activation, we\nreorient the behavior of LLMs towards alignment goals. We conduct experiments\non tasks such as response detoxification, faithfulness enhancement, and\nsentiment revising, and show that PaCE achieves state-of-the-art alignment\nperformance while maintaining linguistic capabilities.\n","authors":["Jinqi Luo","Tianjiao Ding","Kwan Ho Ryan Chan","Darshan Thaker","Aditya Chattopadhyay","Chris Callison-Burch","René Vidal"],"pdf_url":"https://arxiv.org/pdf/2406.04331v1.pdf","comment":"26 pages, 17 figures, 5 tables, dataset and code at\n  https://github.com/peterljq/Parsimonious-Concept-Engineering"},{"id":"http://arxiv.org/abs/2406.04298v1","updated":"2024-06-06T17:42:37Z","published":"2024-06-06T17:42:37Z","title":"Measuring and Addressing Indexical Bias in Information Retrieval","summary":"  Information Retrieval (IR) systems are designed to deliver relevant content,\nbut traditional systems may not optimize rankings for fairness, neutrality, or\nthe balance of ideas. Consequently, IR can often introduce indexical biases, or\nbiases in the positional order of documents. Although indexical bias can\ndemonstrably affect people's opinion, voting patterns, and other behaviors,\nthese issues remain understudied as the field lacks reliable metrics and\nprocedures for automatically measuring indexical bias. Towards this end, we\nintroduce the PAIR framework, which supports automatic bias audits for ranked\ndocuments or entire IR systems. After introducing DUO, the first\ngeneral-purpose automatic bias metric, we run an extensive evaluation of 8 IR\nsystems on a new corpus of 32k synthetic and 4.7k natural documents, with 4k\nqueries spanning 1.4k controversial issue topics. A human behavioral study\nvalidates our approach, showing that our bias metric can help predict when and\nhow indexical bias will shift a reader's opinion.\n","authors":["Caleb Ziems","William Held","Jane Dwivedi-Yu","Diyi Yang"],"pdf_url":"https://arxiv.org/pdf/2406.04298v1.pdf","comment":"ACL 2024"},{"id":"http://arxiv.org/abs/2406.04292v1","updated":"2024-06-06T17:37:47Z","published":"2024-06-06T17:37:47Z","title":"VISTA: Visualized Text Embedding For Universal Multi-Modal Retrieval","summary":"  Multi-modal retrieval becomes increasingly popular in practice. However, the\nexisting retrievers are mostly text-oriented, which lack the capability to\nprocess visual information. Despite the presence of vision-language models like\nCLIP, the current methods are severely limited in representing the text-only\nand image-only data. In this work, we present a new embedding model VISTA for\nuniversal multi-modal retrieval. Our work brings forth threefold technical\ncontributions. Firstly, we introduce a flexible architecture which extends a\npowerful text encoder with the image understanding capability by introducing\nvisual token embeddings. Secondly, we develop two data generation strategies,\nwhich bring high-quality composed image-text to facilitate the training of the\nembedding model. Thirdly, we introduce a multi-stage training algorithm, which\nfirst aligns the visual token embedding with the text encoder using massive\nweakly labeled data, and then develops multi-modal representation capability\nusing the generated composed image-text data. In our experiments, VISTA\nachieves superior performances across a variety of multi-modal retrieval tasks\nin both zero-shot and supervised settings. Our model, data, and source code are\navailable at https://github.com/FlagOpen/FlagEmbedding.\n","authors":["Junjie Zhou","Zheng Liu","Shitao Xiao","Bo Zhao","Yongping Xiong"],"pdf_url":"https://arxiv.org/pdf/2406.04292v1.pdf","comment":"Accepted to ACL 2024 main conference"},{"id":"http://arxiv.org/abs/2402.15838v3","updated":"2024-06-06T17:32:45Z","published":"2024-02-24T15:31:59Z","title":"ListT5: Listwise Reranking with Fusion-in-Decoder Improves Zero-shot\n  Retrieval","summary":"  We propose ListT5, a novel reranking approach based on Fusion-in-Decoder\n(FiD) that handles multiple candidate passages at both train and inference\ntime. We also introduce an efficient inference framework for listwise ranking\nbased on m-ary tournament sort with output caching. We evaluate and compare our\nmodel on the BEIR benchmark for zero-shot retrieval task, demonstrating that\nListT5 (1) outperforms the state-of-the-art RankT5 baseline with a notable +1.3\ngain in the average NDCG@10 score, (2) has an efficiency comparable to\npointwise ranking models and surpasses the efficiency of previous listwise\nranking models, and (3) overcomes the lost-in-the-middle problem of previous\nlistwise rerankers. Our code, model checkpoints, and the evaluation framework\nare fully open-sourced at \\url{https://github.com/soyoung97/ListT5}.\n","authors":["Soyoung Yoon","Eunbi Choi","Jiyeon Kim","Hyeongu Yun","Yireun Kim","Seung-won Hwang"],"pdf_url":"https://arxiv.org/pdf/2402.15838v3.pdf","comment":"Accepted to ACL 2024 main (long)"},{"id":"http://arxiv.org/abs/2406.04257v1","updated":"2024-06-06T17:03:51Z","published":"2024-06-06T17:03:51Z","title":"Data Measurements for Decentralized Data Markets","summary":"  Decentralized data markets can provide more equitable forms of data\nacquisition for machine learning. However, to realize practical marketplaces,\nefficient techniques for seller selection need to be developed. We propose and\nbenchmark federated data measurements to allow a data buyer to find sellers\nwith relevant and diverse datasets. Diversity and relevance measures enable a\nbuyer to make relative comparisons between sellers without requiring\nintermediate brokers and training task-dependent models.\n","authors":["Charles Lu","Mohammad Mohammadi Amiri","Ramesh Raskar"],"pdf_url":"https://arxiv.org/pdf/2406.04257v1.pdf","comment":"20 pages, 11 figures"},{"id":"http://arxiv.org/abs/2308.07876v3","updated":"2024-06-06T14:46:44Z","published":"2023-08-15T16:41:53Z","title":"Leveraging Codebook Knowledge with NLI and ChatGPT for Zero-Shot\n  Political Relation Classification","summary":"  Is it possible accurately classify political relations within evolving event\nontologies without extensive annotations? This study investigates zero-shot\nlearning methods that use expert knowledge from existing annotation codebook,\nand evaluates the performance of advanced ChatGPT (GPT-3.5/4) and a natural\nlanguage inference (NLI)-based model called ZSP. ChatGPT uses codebook's\nlabeled summaries as prompts, whereas ZSP breaks down the classification task\ninto context, event mode, and class disambiguation to refine task-specific\nhypotheses. This decomposition enhances interpretability, efficiency, and\nadaptability to schema changes. The experiments reveal ChatGPT's strengths and\nlimitations, and crucially show ZSP's outperformance of dictionary-based\nmethods and its competitive edge over some supervised models. These findings\naffirm the value of ZSP for validating event records and advancing ontology\ndevelopment. Our study underscores the efficacy of leveraging transfer learning\nand existing domain expertise to enhance research efficiency and scalability.\n","authors":["Yibo Hu","Erick Skorupa Parolin","Latifur Khan","Patrick T. Brandt","Javier Osorio","Vito J. D'Orazio"],"pdf_url":"https://arxiv.org/pdf/2308.07876v3.pdf","comment":"ACL 2024"},{"id":"http://arxiv.org/abs/2406.00083v2","updated":"2024-06-06T13:38:42Z","published":"2024-06-03T02:25:33Z","title":"BadRAG: Identifying Vulnerabilities in Retrieval Augmented Generation of\n  Large Language Models","summary":"  Large Language Models (LLMs) are constrained by outdated information and a\ntendency to generate incorrect data, commonly referred to as \"hallucinations.\"\nRetrieval-Augmented Generation (RAG) addresses these limitations by combining\nthe strengths of retrieval-based methods and generative models. This approach\ninvolves retrieving relevant information from a large, up-to-date dataset and\nusing it to enhance the generation process, leading to more accurate and\ncontextually appropriate responses. Despite its benefits, RAG introduces a new\nattack surface for LLMs, particularly because RAG databases are often sourced\nfrom public data, such as the web. In this paper, we propose \\TrojRAG{} to\nidentify the vulnerabilities and attacks on retrieval parts (RAG database) and\ntheir indirect attacks on generative parts (LLMs). Specifically, we identify\nthat poisoning several customized content passages could achieve a retrieval\nbackdoor, where the retrieval works well for clean queries but always returns\ncustomized poisoned adversarial queries. Triggers and poisoned passages can be\nhighly customized to implement various attacks. For example, a trigger could be\na semantic group like \"The Republican Party, Donald Trump, etc.\" Adversarial\npassages can be tailored to different contents, not only linked to the triggers\nbut also used to indirectly attack generative LLMs without modifying them.\nThese attacks can include denial-of-service attacks on RAG and semantic\nsteering attacks on LLM generations conditioned by the triggers. Our\nexperiments demonstrate that by just poisoning 10 adversarial passages can\ninduce 98.2\\% success rate to retrieve the adversarial passages. Then, these\npassages can increase the reject ratio of RAG-based GPT-4 from 0.01\\% to 74.6\\%\nor increase the rate of negative responses from 0.22\\% to 72\\% for targeted\nqueries.\n","authors":["Jiaqi Xue","Mengxin Zheng","Yebowen Hu","Fei Liu","Xun Chen","Qian Lou"],"pdf_url":"https://arxiv.org/pdf/2406.00083v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.03986v1","updated":"2024-06-06T12:00:41Z","published":"2024-06-06T12:00:41Z","title":"On The Persona-based Summarization of Domain-Specific Documents","summary":"  In an ever-expanding world of domain-specific knowledge, the increasing\ncomplexity of consuming, and storing information necessitates the generation of\nsummaries from large information repositories. However, every persona of a\ndomain has different requirements of information and hence their summarization.\nFor example, in the healthcare domain, a persona-based (such as Doctor, Nurse,\nPatient etc.) approach is imperative to deliver targeted medical information\nefficiently. Persona-based summarization of domain-specific information by\nhumans is a high cognitive load task and is generally not preferred. The\nsummaries generated by two different humans have high variability and do not\nscale in cost and subject matter expertise as domains and personas grow.\nFurther, AI-generated summaries using generic Large Language Models (LLMs) may\nnot necessarily offer satisfactory accuracy for different domains unless they\nhave been specifically trained on domain-specific data and can also be very\nexpensive to use in day-to-day operations. Our contribution in this paper is\ntwo-fold: 1) We present an approach to efficiently fine-tune a domain-specific\nsmall foundation LLM using a healthcare corpus and also show that we can\neffectively evaluate the summarization quality using AI-based critiquing. 2) We\nfurther show that AI-based critiquing has good concordance with Human-based\ncritiquing of the summaries. Hence, such AI-based pipelines to generate\ndomain-specific persona-based summaries can be easily scaled to other domains\nsuch as legal, enterprise documents, education etc. in a very efficient and\ncost-effective manner.\n","authors":["Ankan Mullick","Sombit Bose","Rounak Saha","Ayan Kumar Bhowmick","Pawan Goyal","Niloy Ganguly","Prasenjit Dey","Ravi Kokku"],"pdf_url":"https://arxiv.org/pdf/2406.03986v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.03933v1","updated":"2024-06-06T10:17:52Z","published":"2024-06-06T10:17:52Z","title":"Beyond Similarity: Personalized Federated Recommendation with Composite\n  Aggregation","summary":"  Federated recommendation aims to collect global knowledge by aggregating\nlocal models from massive devices, to provide recommendations while ensuring\nprivacy. Current methods mainly leverage aggregation functions invented by\nfederated vision community to aggregate parameters from similar clients, e.g.,\nclustering aggregation. Despite considerable performance, we argue that it is\nsuboptimal to apply them to federated recommendation directly. This is mainly\nreflected in the disparate model architectures. Different from structured\nparameters like convolutional neural networks in federated vision, federated\nrecommender models usually distinguish itself by employing one-to-one item\nembedding table. Such a discrepancy induces the challenging embedding skew\nissue, which continually updates the trained embeddings but ignores the\nnon-trained ones during aggregation, thus failing to predict future items\naccurately. To this end, we propose a personalized Federated recommendation\nmodel with Composite Aggregation (FedCA), which not only aggregates similar\nclients to enhance trained embeddings, but also aggregates complementary\nclients to update non-trained embeddings. Besides, we formulate the overall\nlearning process into a unified optimization algorithm to jointly learn the\nsimilarity and complementarity. Extensive experiments on several real-world\ndatasets substantiate the effectiveness of our proposed model. The source codes\nare available at https://github.com/hongleizhang/FedCA.\n","authors":["Honglei Zhang","Haoxuan Li","Jundong Chen","Sen Cui","Kunda Yan","Abudukelimu Wuerkaixi","Xin Zhou","Zhiqi Shen","Yidong Li"],"pdf_url":"https://arxiv.org/pdf/2406.03933v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2210.04288v3","updated":"2024-06-06T09:45:27Z","published":"2022-10-09T15:42:36Z","title":"CoopHash: Cooperative Learning of Multipurpose Descriptor and\n  Contrastive Pair Generator via Variational MCMC Teaching for Supervised Image\n  Hashing","summary":"  Leveraging supervised information can lead to superior retrieval performance\nin the image hashing domain but the performance degrades significantly without\nenough labeled data. One effective solution to boost performance is to employ\ngenerative models, such as Generative Adversarial Networks (GANs), to generate\nsynthetic data in an image hashing model. However, GAN-based methods are\ndifficult to train, which prevents the hashing approaches from jointly training\nthe generative models and the hash functions. This limitation results in\nsub-optimal retrieval performance. To overcome this limitation, we propose a\nnovel framework, the generative cooperative hashing network, which is based on\nenergy-based cooperative learning. This framework jointly learns a powerful\ngenerative representation of the data and a robust hash function via two\ncomponents: a top-down contrastive pair generator that synthesizes contrastive\nimages and a bottom-up multipurpose descriptor that simultaneously represents\nthe images from multiple perspectives, including probability density, hash\ncode, latent code, and category. The two components are jointly learned via a\nnovel likelihood-based cooperative learning scheme. We conduct experiments on\nseveral real-world datasets and show that the proposed method outperforms the\ncompeting hashing supervised methods, achieving up to 10\\% relative improvement\nover the current state-of-the-art supervised hashing methods, and exhibits a\nsignificantly better performance in out-of-distribution retrieval.\n","authors":["Khoa D. Doan","Jianwen Xie","Yaxuan Zhu","Yang Zhao","Ping Li"],"pdf_url":"https://arxiv.org/pdf/2210.04288v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.03892v1","updated":"2024-06-06T09:26:48Z","published":"2024-06-06T09:26:48Z","title":"Polyhedral Conic Classifier for CTR Prediction","summary":"  This paper introduces a novel approach for click-through rate (CTR)\nprediction within industrial recommender systems, addressing the inherent\nchallenges of numerical imbalance and geometric asymmetry. These challenges\nstem from imbalanced datasets, where positive (click) instances occur less\nfrequently than negatives (non-clicks), and geometrically asymmetric\ndistributions, where positive samples exhibit visually coherent patterns while\nnegatives demonstrate greater diversity. To address these challenges, we have\nused a deep neural network classifier that uses the polyhedral conic functions.\nThis classifier is similar to the one-class classifiers in spirit and it\nreturns compact polyhedral acceptance regions to separate the positive class\nsamples from the negative samples that have diverse distributions. Extensive\nexperiments have been conducted to test the proposed approach using\nstate-of-the-art (SOTA) CTR prediction models on four public datasets, namely\nCriteo, Avazu, MovieLens and Frappe. The experimental evaluations highlight the\nsuperiority of our proposed approach over Binary Cross Entropy (BCE) Loss,\nwhich is widely used in CTR prediction tasks.\n","authors":["Beyza Turkmen","Ramazan Tarik Turksoy","Hasan Saribas","Hakan Cevikalp"],"pdf_url":"https://arxiv.org/pdf/2406.03892v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.03858v1","updated":"2024-06-06T08:45:36Z","published":"2024-06-06T08:45:36Z","title":"Reducing the climate impact of data portals: a case study","summary":"  The carbon footprint share of the information and communication technology\n(ICT) sector has steadily increased in the past decade and is predicted to make\nup as much as 23 \\% of global emissions in 2030. This shows a pressing need for\ndevelopers, including the information retrieval community, to make their code\nmore energy-efficient. In this project proposal, we discuss techniques to\nreduce the energy footprint of the MaRDI (Mathematical Research Data\nInitiative) Portal, a MediaWiki-based knowledge base. In future work, we plan\nto implement these changes and provide concrete measurements on the gain in\nenergy efficiency. Researchers developing similar knowledge bases can adapt our\nmeasures to reduce their environmental footprint. In this way, we are working\non mitigating the climate impact of Information Retrieval research.\n","authors":["Noah Gießing","Madhurima Deb","Ankit Satpute","Moritz Schubotz","Olaf Teschke"],"pdf_url":"https://arxiv.org/pdf/2406.03858v1.pdf","comment":"4 pages"},{"id":"http://arxiv.org/abs/2310.04400v2","updated":"2024-06-06T08:36:57Z","published":"2023-10-06T17:50:38Z","title":"On the Embedding Collapse when Scaling up Recommendation Models","summary":"  Recent advances in foundation models have led to a promising trend of\ndeveloping large recommendation models to leverage vast amounts of available\ndata. Still, mainstream models remain embarrassingly small in size and na\\\"ive\nenlarging does not lead to sufficient performance gain, suggesting a deficiency\nin the model scalability. In this paper, we identify the embedding collapse\nphenomenon as the inhibition of scalability, wherein the embedding matrix tends\nto occupy a low-dimensional subspace. Through empirical and theoretical\nanalysis, we demonstrate a \\emph{two-sided effect} of feature interaction\nspecific to recommendation models. On the one hand, interacting with collapsed\nembeddings restricts embedding learning and exacerbates the collapse issue. On\nthe other hand, interaction is crucial in mitigating the fitting of spurious\nfeatures as a scalability guarantee. Based on our analysis, we propose a simple\nyet effective multi-embedding design incorporating embedding-set-specific\ninteraction modules to learn embedding sets with large diversity and thus\nreduce collapse. Extensive experiments demonstrate that this proposed design\nprovides consistent scalability and effective collapse mitigation for various\nrecommendation models. Code is available at this repository:\nhttps://github.com/thuml/Multi-Embedding.\n","authors":["Xingzhuo Guo","Junwei Pan","Ximei Wang","Baixu Chen","Jie Jiang","Mingsheng Long"],"pdf_url":"https://arxiv.org/pdf/2310.04400v2.pdf","comment":"ICML 2024 Accepted"},{"id":"http://arxiv.org/abs/2402.17447v2","updated":"2024-06-06T07:41:21Z","published":"2024-02-27T12:03:56Z","title":"Deep Learning Based Named Entity Recognition Models for Recipes","summary":"  Food touches our lives through various endeavors, including flavor,\nnourishment, health, and sustainability. Recipes are cultural capsules\ntransmitted across generations via unstructured text. Automated protocols for\nrecognizing named entities, the building blocks of recipe text, are of immense\nvalue for various applications ranging from information extraction to novel\nrecipe generation. Named entity recognition is a technique for extracting\ninformation from unstructured or semi-structured data with known labels.\nStarting with manually-annotated data of 6,611 ingredient phrases, we created\nan augmented dataset of 26,445 phrases cumulatively. Simultaneously, we\nsystematically cleaned and analyzed ingredient phrases from RecipeDB, the\ngold-standard recipe data repository, and annotated them using the Stanford\nNER. Based on the analysis, we sampled a subset of 88,526 phrases using a\nclustering-based approach while preserving the diversity to create the\nmachine-annotated dataset. A thorough investigation of NER approaches on these\nthree datasets involving statistical, fine-tuning of deep learning-based\nlanguage models and few-shot prompting on large language models (LLMs) provides\ndeep insights. We conclude that few-shot prompting on LLMs has abysmal\nperformance, whereas the fine-tuned spaCy-transformer emerges as the best model\nwith macro-F1 scores of 95.9%, 96.04%, and 95.71% for the manually-annotated,\naugmented, and machine-annotated datasets, respectively.\n","authors":["Mansi Goel","Ayush Agarwal","Shubham Agrawal","Janak Kapuriya","Akhil Vamshi Konam","Rishabh Gupta","Shrey Rastogi"," Niharika","Ganesh Bagler"],"pdf_url":"https://arxiv.org/pdf/2402.17447v2.pdf","comment":"13 pages, 6 main figures and 2 in appendices, and 3 main tables;\n  Accepted for publication in LREC-COLING 2024"},{"id":"http://arxiv.org/abs/2405.02664v2","updated":"2024-06-06T07:39:00Z","published":"2024-05-04T13:25:06Z","title":"MedPromptExtract (Medical Data Extraction Tool): Anonymization and\n  Hi-fidelity Automated data extraction using NLP and prompt engineering","summary":"  A major roadblock in the seamless digitization of medical records remains the\nlack of interoperability of existing records. Extracting relevant medical\ninformation required for further treatment planning or even research is a time\nconsuming labour intensive task involving expenditure of valuable time of\ndoctors. In this demo paper we present, MedPromptExtract an automated tool\nusing a combination of semi supervised learning, large language models, natural\nlanguage processing and prompt engineering to convert unstructured medical\nrecords to structured data which is amenable for further analysis.\n","authors":["Roomani Srivastava","Suraj Prasad","Lipika Bhat","Sarvesh Deshpande","Barnali Das","Kshitij Jadhav"],"pdf_url":"https://arxiv.org/pdf/2405.02664v2.pdf","comment":"4 pages, 3 figures, pre-print sumitted to CIKM 2024"},{"id":"http://arxiv.org/abs/2406.03776v1","updated":"2024-06-06T06:40:19Z","published":"2024-06-06T06:40:19Z","title":"XL-HeadTags: Leveraging Multimodal Retrieval Augmentation for the\n  Multilingual Generation of News Headlines and Tags","summary":"  Millions of news articles published online daily can overwhelm readers.\nHeadlines and entity (topic) tags are essential for guiding readers to decide\nif the content is worth their time. While headline generation has been\nextensively studied, tag generation remains largely unexplored, yet it offers\nreaders better access to topics of interest. The need for conciseness in\ncapturing readers' attention necessitates improved content selection strategies\nfor identifying salient and relevant segments within lengthy articles, thereby\nguiding language models effectively. To address this, we propose to leverage\nauxiliary information such as images and captions embedded in the articles to\nretrieve relevant sentences and utilize instruction tuning with variations to\ngenerate both headlines and tags for news articles in a multilingual context.\nTo make use of the auxiliary information, we have compiled a dataset named\nXL-HeadTags, which includes 20 languages across 6 diverse language families.\nThrough extensive evaluation, we demonstrate the effectiveness of our\nplug-and-play multimodal-multilingual retrievers for both tasks. Additionally,\nwe have developed a suite of tools for processing and evaluating multilingual\ntexts, significantly contributing to the research community by enabling more\naccurate and efficient analysis across languages.\n","authors":["Faisal Tareque Shohan","Mir Tafseer Nayeem","Samsul Islam","Abu Ubaida Akash","Shafiq Joty"],"pdf_url":"https://arxiv.org/pdf/2406.03776v1.pdf","comment":"ACL 2024 camera ready"},{"id":"http://arxiv.org/abs/2404.10496v3","updated":"2024-06-06T06:01:49Z","published":"2024-04-16T12:10:01Z","title":"Spiral of Silences: How is Large Language Model Killing Information\n  Retrieval? -- A Case Study on Open Domain Question Answering","summary":"  The practice of Retrieval-Augmented Generation (RAG), which integrates Large\nLanguage Models (LLMs) with retrieval systems, has become increasingly\nprevalent. However, the repercussions of LLM-derived content infiltrating the\nweb and influencing the retrieval-generation feedback loop are largely\nuncharted territories. In this study, we construct and iteratively run a\nsimulation pipeline to deeply investigate the short-term and long-term effects\nof LLM text on RAG systems. Taking the trending Open Domain Question Answering\n(ODQA) task as a point of entry, our findings reveal a potential digital\n\"Spiral of Silence\" effect, with LLM-generated text consistently outperforming\nhuman-authored content in search rankings, thereby diminishing the presence and\nimpact of human contributions online. This trend risks creating an imbalanced\ninformation ecosystem, where the unchecked proliferation of erroneous\nLLM-generated content may result in the marginalization of accurate\ninformation. We urge the academic community to take heed of this potential\nissue, ensuring a diverse and authentic digital information landscape.\n","authors":["Xiaoyang Chen","Ben He","Hongyu Lin","Xianpei Han","Tianshu Wang","Boxi Cao","Le Sun","Yingfei Sun"],"pdf_url":"https://arxiv.org/pdf/2404.10496v3.pdf","comment":"Accepted to ACL2024"},{"id":"http://arxiv.org/abs/2406.03248v2","updated":"2024-06-06T04:31:37Z","published":"2024-06-05T13:23:23Z","title":"Large Language Models as Evaluators for Recommendation Explanations","summary":"  The explainability of recommender systems has attracted significant attention\nin academia and industry. Many efforts have been made for explainable\nrecommendations, yet evaluating the quality of the explanations remains a\nchallenging and unresolved issue. In recent years, leveraging LLMs as\nevaluators presents a promising avenue in Natural Language Processing tasks\n(e.g., sentiment classification, information extraction), as they perform\nstrong capabilities in instruction following and common-sense reasoning.\nHowever, evaluating recommendation explanatory texts is different from these\nNLG tasks, as its criteria are related to human perceptions and are usually\nsubjective. In this paper, we investigate whether LLMs can serve as evaluators\nof recommendation explanations. To answer the question, we utilize real user\nfeedback on explanations given from previous work and additionally collect\nthird-party annotations and LLM evaluations. We design and apply a 3-level meta\nevaluation strategy to measure the correlation between evaluator labels and the\nground truth provided by users. Our experiments reveal that LLMs, such as GPT4,\ncan provide comparable evaluations with appropriate prompts and settings. We\nalso provide further insights into combining human labels with the LLM\nevaluation process and utilizing ensembles of multiple heterogeneous LLM\nevaluators to enhance the accuracy and stability of evaluations. Our study\nverifies that utilizing LLMs as evaluators can be an accurate, reproducible and\ncost-effective solution for evaluating recommendation explanation texts. Our\ncode is available at https://github.com/Xiaoyu-SZ/LLMasEvaluator.\n","authors":["Xiaoyu Zhang","Yishan Li","Jiayin Wang","Bowen Sun","Weizhi Ma","Peijie Sun","Min Zhang"],"pdf_url":"https://arxiv.org/pdf/2406.03248v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.03721v1","updated":"2024-06-06T03:34:42Z","published":"2024-06-06T03:34:42Z","title":"Attribute-Aware Implicit Modality Alignment for Text Attribute Person\n  Search","summary":"  Text attribute person search aims to find specific pedestrians through given\ntextual attributes, which is very meaningful in the scene of searching for\ndesignated pedestrians through witness descriptions. The key challenge is the\nsignificant modality gap between textual attributes and images. Previous\nmethods focused on achieving explicit representation and alignment through\nunimodal pre-trained models. Nevertheless, the absence of inter-modality\ncorrespondence in these models may lead to distortions in the local information\nof intra-modality. Moreover, these methods only considered the alignment of\ninter-modality and ignored the differences between different attribute\ncategories. To mitigate the above problems, we propose an Attribute-Aware\nImplicit Modality Alignment (AIMA) framework to learn the correspondence of\nlocal representations between textual attributes and images and combine global\nrepresentation matching to narrow the modality gap. Firstly, we introduce the\nCLIP model as the backbone and design prompt templates to transform attribute\ncombinations into structured sentences. This facilitates the model's ability to\nbetter understand and match image details. Next, we design a Masked Attribute\nPrediction (MAP) module that predicts the masked attributes after the\ninteraction of image and masked textual attribute features through multi-modal\ninteraction, thereby achieving implicit local relationship alignment. Finally,\nwe propose an Attribute-IoU Guided Intra-Modal Contrastive (A-IoU IMC) loss,\naligning the distribution of different textual attributes in the embedding\nspace with their IoU distribution, achieving better semantic arrangement.\nExtensive experiments on the Market-1501 Attribute, PETA, and PA100K datasets\nshow that the performance of our proposed method significantly surpasses the\ncurrent state-of-the-art methods.\n","authors":["Xin Wang","Fangfang Liu","Zheng Li","Caili Guo"],"pdf_url":"https://arxiv.org/pdf/2406.03721v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.09889v2","updated":"2024-06-06T01:59:40Z","published":"2024-04-15T15:55:01Z","title":"Is Table Retrieval a Solved Problem? Exploring Join-Aware Multi-Table\n  Retrieval","summary":"  Retrieving relevant tables containing the necessary information to accurately\nanswer a given question over tables is critical to open-domain\nquestion-answering (QA) systems. Previous methods assume the answer to such a\nquestion can be found either in a single table or multiple tables identified\nthrough question decomposition or rewriting. However, neither of these\napproaches is sufficient, as many questions require retrieving multiple tables\nand joining them through a join plan that cannot be discerned from the user\nquery itself. If the join plan is not considered in the retrieval stage, the\nsubsequent steps of reasoning and answering based on those retrieved tables are\nlikely to be incorrect. To address this problem, we introduce a method that\nuncovers useful join relations for any query and database during table\nretrieval. We use a novel re-ranking method formulated as a mixed-integer\nprogram that considers not only table-query relevance but also table-table\nrelevance that requires inferring join relationships. Our method outperforms\nthe state-of-the-art approaches for table retrieval by up to 9.3% in F1 score\nand for end-to-end QA by up to 5.4% in accuracy.\n","authors":["Peter Baile Chen","Yi Zhang","Dan Roth"],"pdf_url":"https://arxiv.org/pdf/2404.09889v2.pdf","comment":"ACL 2024 camera ready"}],"Machine Learning":[{"id":"http://arxiv.org/abs/2406.04344v1","updated":"2024-06-06T17:59:56Z","published":"2024-06-06T17:59:56Z","title":"Verbalized Machine Learning: Revisiting Machine Learning with Language\n  Models","summary":"  Motivated by the large progress made by large language models (LLMs), we\nintroduce the framework of verbalized machine learning (VML). In contrast to\nconventional machine learning models that are typically optimized over a\ncontinuous parameter space, VML constrains the parameter space to be\nhuman-interpretable natural language. Such a constraint leads to a new\nperspective of function approximation, where an LLM with a text prompt can be\nviewed as a function parameterized by the text prompt. Guided by this\nperspective, we revisit classical machine learning problems, such as regression\nand classification, and find that these problems can be solved by an\nLLM-parameterized learner and optimizer. The major advantages of VML include\n(1) easy encoding of inductive bias: prior knowledge about the problem and\nhypothesis class can be encoded in natural language and fed into the\nLLM-parameterized learner; (2) automatic model class selection: the optimizer\ncan automatically select a concrete model class based on data and verbalized\nprior knowledge, and it can update the model class during training; and (3)\ninterpretable learner updates: the LLM-parameterized optimizer can provide\nexplanations for why each learner update is performed. We conduct several\nstudies to empirically evaluate the effectiveness of VML, and hope that VML can\nserve as a stepping stone to stronger interpretability and trustworthiness in\nML.\n","authors":["Tim Z. Xiao","Robert Bamler","Bernhard Schölkopf","Weiyang Liu"],"pdf_url":"https://arxiv.org/pdf/2406.04344v1.pdf","comment":"Technical Report v1 (92 pages, 15 figures)"},{"id":"http://arxiv.org/abs/2406.04336v1","updated":"2024-06-06T17:59:41Z","published":"2024-06-06T17:59:41Z","title":"On the Expressive Power of Spectral Invariant Graph Neural Networks","summary":"  Incorporating spectral information to enhance Graph Neural Networks (GNNs)\nhas shown promising results but raises a fundamental challenge due to the\ninherent ambiguity of eigenvectors. Various architectures have been proposed to\naddress this ambiguity, referred to as spectral invariant architectures.\nNotable examples include GNNs and Graph Transformers that use spectral\ndistances, spectral projection matrices, or other invariant spectral features.\nHowever, the potential expressive power of these spectral invariant\narchitectures remains largely unclear. The goal of this work is to gain a deep\ntheoretical understanding of the expressive power obtainable when using\nspectral features. We first introduce a unified message-passing framework for\ndesigning spectral invariant GNNs, called Eigenspace Projection GNN (EPNN). A\ncomprehensive analysis shows that EPNN essentially unifies all prior spectral\ninvariant architectures, in that they are either strictly less expressive or\nequivalent to EPNN. A fine-grained expressiveness hierarchy among different\narchitectures is also established. On the other hand, we prove that EPNN itself\nis bounded by a recently proposed class of Subgraph GNNs, implying that all\nthese spectral invariant architectures are strictly less expressive than 3-WL.\nFinally, we discuss whether using spectral features can gain additional\nexpressiveness when combined with more expressive GNNs.\n","authors":["Bohang Zhang","Lingxiao Zhao","Haggai Maron"],"pdf_url":"https://arxiv.org/pdf/2406.04336v1.pdf","comment":"31 pages; 3 figures; to appear in ICML 2024"},{"id":"http://arxiv.org/abs/2403.00720v2","updated":"2024-06-06T17:59:38Z","published":"2024-03-01T18:12:46Z","title":"Subhomogeneous Deep Equilibrium Models","summary":"  Implicit-depth neural networks have grown as powerful alternatives to\ntraditional networks in various applications in recent years. However, these\nmodels often lack guarantees of existence and uniqueness, raising stability,\nperformance, and reproducibility issues. In this paper, we present a new\nanalysis of the existence and uniqueness of fixed points for implicit-depth\nneural networks based on the concept of subhomogeneous operators and the\nnonlinear Perron-Frobenius theory. Compared to previous similar analyses, our\ntheory allows for weaker assumptions on the parameter matrices, thus yielding a\nmore flexible framework for well-defined implicit networks. We illustrate the\nperformance of the resulting subhomogeneous networks on feedforward,\nconvolutional, and graph neural network examples.\n","authors":["Pietro Sittoni","Francesco Tudisco"],"pdf_url":"https://arxiv.org/pdf/2403.00720v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.04332v1","updated":"2024-06-06T17:59:23Z","published":"2024-06-06T17:59:23Z","title":"Coarse-To-Fine Tensor Trains for Compact Visual Representations","summary":"  The ability to learn compact, high-quality, and easy-to-optimize\nrepresentations for visual data is paramount to many applications such as novel\nview synthesis and 3D reconstruction. Recent work has shown substantial success\nin using tensor networks to design such compact and high-quality\nrepresentations. However, the ability to optimize tensor-based representations,\nand in particular, the highly compact tensor train representation, is still\nlacking. This has prevented practitioners from deploying the full potential of\ntensor networks for visual data. To this end, we propose 'Prolongation\nUpsampling Tensor Train (PuTT)', a novel method for learning tensor train\nrepresentations in a coarse-to-fine manner. Our method involves the prolonging\nor `upsampling' of a learned tensor train representation, creating a sequence\nof 'coarse-to-fine' tensor trains that are incrementally refined. We evaluate\nour representation along three axes: (1). compression, (2). denoising\ncapability, and (3). image completion capability. To assess these axes, we\nconsider the tasks of image fitting, 3D fitting, and novel view synthesis,\nwhere our method shows an improved performance compared to state-of-the-art\ntensor-based methods. For full results see our project webpage:\nhttps://sebulo.github.io/PuTT_website/\n","authors":["Sebastian Loeschcke","Dan Wang","Christian Leth-Espensen","Serge Belongie","Michael J. Kastoryano","Sagie Benaim"],"pdf_url":"https://arxiv.org/pdf/2406.04332v1.pdf","comment":"Project webpage: https://sebulo.github.io/PuTT_website/"},{"id":"http://arxiv.org/abs/2406.04329v1","updated":"2024-06-06T17:59:10Z","published":"2024-06-06T17:59:10Z","title":"Simplified and Generalized Masked Diffusion for Discrete Data","summary":"  Masked (or absorbing) diffusion is actively explored as an alternative to\nautoregressive models for generative modeling of discrete data. However,\nexisting work in this area has been hindered by unnecessarily complex model\nformulations and unclear relationships between different perspectives, leading\nto suboptimal parameterization, training objectives, and ad hoc adjustments to\ncounteract these issues. In this work, we aim to provide a simple and general\nframework that unlocks the full potential of masked diffusion models. We show\nthat the continuous-time variational objective of masked diffusion models is a\nsimple weighted integral of cross-entropy losses. Our framework also enables\ntraining generalized masked diffusion models with state-dependent masking\nschedules. When evaluated by perplexity, our models trained on OpenWebText\nsurpass prior diffusion language models at GPT-2 scale and demonstrate superior\nperformance on 4 out of 5 zero-shot language modeling tasks. Furthermore, our\nmodels vastly outperform previous discrete diffusion models on pixel-level\nimage modeling, achieving 2.78~(CIFAR-10) and 3.42 (ImageNet 64$\\times$64) bits\nper dimension that are comparable or better than autoregressive models of\nsimilar sizes.\n","authors":["Jiaxin Shi","Kehang Han","Zhe Wang","Arnaud Doucet","Michalis K. Titsias"],"pdf_url":"https://arxiv.org/pdf/2406.04329v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.04331v1","updated":"2024-06-06T17:59:10Z","published":"2024-06-06T17:59:10Z","title":"PaCE: Parsimonious Concept Engineering for Large Language Models","summary":"  Large Language Models (LLMs) are being used for a wide variety of tasks.\nWhile they are capable of generating human-like responses, they can also\nproduce undesirable output including potentially harmful information, racist or\nsexist language, and hallucinations. Alignment methods are designed to reduce\nsuch undesirable output, via techniques such as fine-tuning, prompt\nengineering, and representation engineering. However, existing methods face\nseveral challenges: some require costly fine-tuning for every alignment task;\nsome do not adequately remove undesirable concepts, failing alignment; some\nremove benign concepts, lowering the linguistic capabilities of LLMs. To\naddress these issues, we propose Parsimonious Concept Engineering (PaCE), a\nnovel activation engineering framework for alignment. First, to sufficiently\nmodel the concepts, we construct a large-scale concept dictionary in the\nactivation space, in which each atom corresponds to a semantic concept. Then,\ngiven any alignment task, we instruct a concept partitioner to efficiently\nannotate the concepts as benign or undesirable. Finally, at inference time, we\ndecompose the LLM activations along the concept dictionary via sparse coding,\nto accurately represent the activation as a linear combination of the benign\nand undesirable components. By removing the latter ones from the activation, we\nreorient the behavior of LLMs towards alignment goals. We conduct experiments\non tasks such as response detoxification, faithfulness enhancement, and\nsentiment revising, and show that PaCE achieves state-of-the-art alignment\nperformance while maintaining linguistic capabilities.\n","authors":["Jinqi Luo","Tianjiao Ding","Kwan Ho Ryan Chan","Darshan Thaker","Aditya Chattopadhyay","Chris Callison-Burch","René Vidal"],"pdf_url":"https://arxiv.org/pdf/2406.04331v1.pdf","comment":"26 pages, 17 figures, 5 tables, dataset and code at\n  https://github.com/peterljq/Parsimonious-Concept-Engineering"},{"id":"http://arxiv.org/abs/2406.04327v1","updated":"2024-06-06T17:59:09Z","published":"2024-06-06T17:59:09Z","title":"Causal Estimation of Memorisation Profiles","summary":"  Understanding memorisation in language models has practical and societal\nimplications, e.g., studying models' training dynamics or preventing copyright\ninfringements. Prior work defines memorisation as the causal effect of training\nwith an instance on the model's ability to predict that instance. This\ndefinition relies on a counterfactual: the ability to observe what would have\nhappened had the model not seen that instance. Existing methods struggle to\nprovide computationally efficient and accurate estimates of this\ncounterfactual. Further, they often estimate memorisation for a model\narchitecture rather than for a specific model instance. This paper fills an\nimportant gap in the literature, proposing a new, principled, and efficient\nmethod to estimate memorisation based on the difference-in-differences design\nfrom econometrics. Using this method, we characterise a model's memorisation\nprofile--its memorisation trends across training--by only observing its\nbehaviour on a small set of instances throughout training. In experiments with\nthe Pythia model suite, we find that memorisation (i) is stronger and more\npersistent in larger models, (ii) is determined by data order and learning\nrate, and (iii) has stable trends across model sizes, thus making memorisation\nin larger models predictable from smaller ones.\n","authors":["Pietro Lesci","Clara Meister","Thomas Hofmann","Andreas Vlachos","Tiago Pimentel"],"pdf_url":"https://arxiv.org/pdf/2406.04327v1.pdf","comment":"Published at the ACL 2024 Conference (main)"},{"id":"http://arxiv.org/abs/2406.04328v1","updated":"2024-06-06T17:59:09Z","published":"2024-06-06T17:59:09Z","title":"The Brain's Bitter Lesson: Scaling Speech Decoding With Self-Supervised\n  Learning","summary":"  The past few years have produced a series of spectacular advances in the\ndecoding of speech from brain activity. The engine of these advances has been\nthe acquisition of labelled data, with increasingly large datasets acquired\nfrom single subjects. However, participants exhibit anatomical and other\nindividual differences, and datasets use varied scanners and task designs. As a\nresult, prior work has struggled to leverage data from multiple subjects,\nmultiple datasets, multiple tasks, and unlabelled datasets. In turn, the field\nhas not benefited from the rapidly growing number of open neural data\nrepositories to exploit large-scale data and deep learning. To address this, we\ndevelop an initial set of neuroscience-inspired self-supervised objectives,\ntogether with a neural architecture, for representation learning from\nheterogeneous and unlabelled neural recordings. Experimental results show that\nrepresentations learned with these objectives generalise across subjects,\ndatasets, and tasks, and are also learned faster than using only labelled data.\nIn addition, we set new benchmarks for two foundational speech decoding tasks.\nTaken together, these methods now unlock the potential for training speech\ndecoding models with orders of magnitude more existing data.\n","authors":["Dulhan Jayalath","Gilad Landau","Brendan Shillingford","Mark Woolrich","Oiwi Parker Jones"],"pdf_url":"https://arxiv.org/pdf/2406.04328v1.pdf","comment":"10 pages, 4 figures, under review"},{"id":"http://arxiv.org/abs/2310.12956v2","updated":"2024-06-06T17:58:45Z","published":"2023-10-19T17:55:06Z","title":"Eureka-Moments in Transformers: Multi-Step Tasks Reveal Softmax Induced\n  Optimization Problems","summary":"  In this work, we study rapid improvements of the training loss in\ntransformers when being confronted with multi-step decision tasks. We found\nthat transformers struggle to learn the intermediate task and both training and\nvalidation loss saturate for hundreds of epochs. When transformers finally\nlearn the intermediate task, they do this rapidly and unexpectedly. We call\nthese abrupt improvements Eureka-moments, since the transformer appears to\nsuddenly learn a previously incomprehensible concept. We designed synthetic\ntasks to study the problem in detail, but the leaps in performance can be\nobserved also for language modeling and in-context learning (ICL). We suspect\nthat these abrupt transitions are caused by the multi-step nature of these\ntasks. Indeed, we find connections and show that ways to improve on the\nsynthetic multi-step tasks can be used to improve the training of language\nmodeling and ICL. Using the synthetic data we trace the problem back to the\nSoftmax function in the self-attention block of transformers and show ways to\nalleviate the problem. These fixes reduce the required number of training\nsteps, lead to higher likelihood to learn the intermediate task, to higher\nfinal accuracy and training becomes more robust to hyper-parameters.\n","authors":["David T. Hoffmann","Simon Schrodi","Jelena Bratulić","Nadine Behrmann","Volker Fischer","Thomas Brox"],"pdf_url":"https://arxiv.org/pdf/2310.12956v2.pdf","comment":"Accepted at ICML 2024"},{"id":"http://arxiv.org/abs/2406.04323v1","updated":"2024-06-06T17:58:15Z","published":"2024-06-06T17:58:15Z","title":"ATraDiff: Accelerating Online Reinforcement Learning with Imaginary\n  Trajectories","summary":"  Training autonomous agents with sparse rewards is a long-standing problem in\nonline reinforcement learning (RL), due to low data efficiency. Prior work\novercomes this challenge by extracting useful knowledge from offline data,\noften accomplished through the learning of action distribution from offline\ndata and utilizing the learned distribution to facilitate online RL. However,\nsince the offline data are given and fixed, the extracted knowledge is\ninherently limited, making it difficult to generalize to new tasks. We propose\na novel approach that leverages offline data to learn a generative diffusion\nmodel, coined as Adaptive Trajectory Diffuser (ATraDiff). This model generates\nsynthetic trajectories, serving as a form of data augmentation and consequently\nenhancing the performance of online RL methods. The key strength of our\ndiffuser lies in its adaptability, allowing it to effectively handle varying\ntrajectory lengths and mitigate distribution shifts between online and offline\ndata. Because of its simplicity, ATraDiff seamlessly integrates with a wide\nspectrum of RL methods. Empirical evaluation shows that ATraDiff consistently\nachieves state-of-the-art performance across a variety of environments, with\nparticularly pronounced improvements in complicated settings. Our code and demo\nvideo are available at https://atradiff.github.io .\n","authors":["Qianlan Yang","Yu-Xiong Wang"],"pdf_url":"https://arxiv.org/pdf/2406.04323v1.pdf","comment":"ICML 2024 Accepted"},{"id":"http://arxiv.org/abs/2406.04321v1","updated":"2024-06-06T17:58:11Z","published":"2024-06-06T17:58:11Z","title":"VidMuse: A Simple Video-to-Music Generation Framework with\n  Long-Short-Term Modeling","summary":"  In this work, we systematically study music generation conditioned solely on\nthe video. First, we present a large-scale dataset comprising 190K video-music\npairs, including various genres such as movie trailers, advertisements, and\ndocumentaries. Furthermore, we propose VidMuse, a simple framework for\ngenerating music aligned with video inputs. VidMuse stands out by producing\nhigh-fidelity music that is both acoustically and semantically aligned with the\nvideo. By incorporating local and global visual cues, VidMuse enables the\ncreation of musically coherent audio tracks that consistently match the video\ncontent through Long-Short-Term modeling. Through extensive experiments,\nVidMuse outperforms existing models in terms of audio quality, diversity, and\naudio-visual alignment. The code and datasets will be available at\nhttps://github.com/ZeyueT/VidMuse/.\n","authors":["Zeyue Tian","Zhaoyang Liu","Ruibin Yuan","Jiahao Pan","Xiaoqiang Huang","Qifeng Liu","Xu Tan","Qifeng Chen","Wei Xue","Yike Guo"],"pdf_url":"https://arxiv.org/pdf/2406.04321v1.pdf","comment":"The code and datasets will be available at\n  https://github.com/ZeyueT/VidMuse/"},{"id":"http://arxiv.org/abs/2406.04320v1","updated":"2024-06-06T17:58:09Z","published":"2024-06-06T17:58:09Z","title":"Chimera: Effectively Modeling Multivariate Time Series with\n  2-Dimensional State Space Models","summary":"  Modeling multivariate time series is a well-established problem with a wide\nrange of applications from healthcare to financial markets. Traditional State\nSpace Models (SSMs) are classical approaches for univariate time series\nmodeling due to their simplicity and expressive power to represent linear\ndependencies. They, however, have fundamentally limited expressive power to\ncapture non-linear dependencies, are slow in practice, and fail to model the\ninter-variate information flow. Despite recent attempts to improve the\nexpressive power of SSMs by using deep structured SSMs, the existing methods\nare either limited to univariate time series, fail to model complex patterns\n(e.g., seasonal patterns), fail to dynamically model the dependencies of\nvariate and time dimensions, and/or are input-independent. We present Chimera\nthat uses two input-dependent 2-D SSM heads with different discretization\nprocesses to learn long-term progression and seasonal patterns. To improve the\nefficiency of complex 2D recurrence, we present a fast training using a new\n2-dimensional parallel selective scan. We further present and discuss\n2-dimensional Mamba and Mamba-2 as the spacial cases of our 2D SSM. Our\nexperimental evaluation shows the superior performance of Chimera on extensive\nand diverse benchmarks, including ECG and speech time series classification,\nlong-term and short-term time series forecasting, and time series anomaly\ndetection.\n","authors":["Ali Behrouz","Michele Santacatterina","Ramin Zabih"],"pdf_url":"https://arxiv.org/pdf/2406.04320v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.04318v1","updated":"2024-06-06T17:58:00Z","published":"2024-06-06T17:58:00Z","title":"Adaptive Sampling of k-Space in Magnetic Resonance for Rapid Pathology\n  Prediction","summary":"  Magnetic Resonance (MR) imaging, despite its proven diagnostic utility,\nremains an inaccessible imaging modality for disease surveillance at the\npopulation level. A major factor rendering MR inaccessible is lengthy scan\ntimes. An MR scanner collects measurements associated with the underlying\nanatomy in the Fourier space, also known as the k-space. Creating a\nhigh-fidelity image requires collecting large quantities of such measurements,\nincreasing the scan time. Traditionally to accelerate an MR scan, image\nreconstruction from under-sampled k-space data is the method of choice.\nHowever, recent works show the feasibility of bypassing image reconstruction\nand directly learning to detect disease directly from a sparser learned subset\nof the k-space measurements. In this work, we propose Adaptive Sampling for MR\n(ASMR), a sampling method that learns an adaptive policy to sequentially select\nk-space samples to optimize for target disease detection. On 6 out of 8\npathology classification tasks spanning the Knee, Brain, and Prostate MR scans,\nASMR reaches within 2% of the performance of a fully sampled classifier while\nusing only 8% of the k-space, as well as outperforming prior state-of-the-art\nwork in k-space sampling such as EMRT, LOUPE, and DPS.\n","authors":["Chen-Yu Yen","Raghav Singhal","Umang Sharma","Rajesh Ranganath","Sumit Chopra","Lerrel Pinto"],"pdf_url":"https://arxiv.org/pdf/2406.04318v1.pdf","comment":"ICML 2024. Project website at https://adaptive-sampling-mr.github.io"},{"id":"http://arxiv.org/abs/2406.04317v1","updated":"2024-06-06T17:57:49Z","published":"2024-06-06T17:57:49Z","title":"Regularized KL-Divergence for Well-Defined Function-Space Variational\n  Inference in Bayesian neural networks","summary":"  Bayesian neural networks (BNN) promise to combine the predictive performance\nof neural networks with principled uncertainty modeling important for\nsafety-critical systems and decision making. However, posterior uncertainty\nestimates depend on the choice of prior, and finding informative priors in\nweight-space has proven difficult. This has motivated variational inference\n(VI) methods that pose priors directly on the function generated by the BNN\nrather than on weights. In this paper, we address a fundamental issue with such\nfunction-space VI approaches pointed out by Burt et al. (2020), who showed that\nthe objective function (ELBO) is negative infinite for most priors of interest.\nOur solution builds on generalized VI (Knoblauch et al., 2019) with the\nregularized KL divergence (Quang, 2019) and is, to the best of our knowledge,\nthe first well-defined variational objective for function-space inference in\nBNNs with Gaussian process (GP) priors. Experiments show that our method\nincorporates the properties specified by the GP prior on synthetic and small\nreal-world data sets, and provides competitive uncertainty estimates for\nregression, classification and out-of-distribution detection compared to BNN\nbaselines with both function and weight-space priors.\n","authors":["Tristan Cinquin","Robert Bamler"],"pdf_url":"https://arxiv.org/pdf/2406.04317v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.04313v1","updated":"2024-06-06T17:57:04Z","published":"2024-06-06T17:57:04Z","title":"Improving Alignment and Robustness with Short Circuiting","summary":"  AI systems can take harmful actions and are highly vulnerable to adversarial\nattacks. We present an approach, inspired by recent advances in representation\nengineering, that \"short-circuits\" models as they respond with harmful outputs.\nExisting techniques aimed at improving alignment, such as refusal training, are\noften bypassed. Techniques such as adversarial training try to plug these holes\nby countering specific attacks. As an alternative to refusal training and\nadversarial training, short-circuiting directly controls the representations\nthat are responsible for harmful outputs in the first place. Our technique can\nbe applied to both text-only and multimodal language models to prevent the\ngeneration of harmful outputs without sacrificing utility -- even in the\npresence of powerful unseen attacks. Notably, while adversarial robustness in\nstandalone image recognition remains an open challenge, short-circuiting allows\nthe larger multimodal system to reliably withstand image \"hijacks\" that aim to\nproduce harmful content. Finally, we extend our approach to AI agents,\ndemonstrating considerable reductions in the rate of harmful actions when they\nare under attack. Our approach represents a significant step forward in the\ndevelopment of reliable safeguards to harmful behavior and adversarial attacks.\n","authors":["Andy Zou","Long Phan","Justin Wang","Derek Duenas","Maxwell Lin","Maksym Andriushchenko","Rowan Wang","Zico Kolter","Matt Fredrikson","Dan Hendrycks"],"pdf_url":"https://arxiv.org/pdf/2406.04313v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.04309v1","updated":"2024-06-06T17:55:34Z","published":"2024-06-06T17:55:34Z","title":"ReFiNe: Recursive Field Networks for Cross-modal Multi-scene\n  Representation","summary":"  The common trade-offs of state-of-the-art methods for multi-shape\nrepresentation (a single model \"packing\" multiple objects) involve trading\nmodeling accuracy against memory and storage. We show how to encode multiple\nshapes represented as continuous neural fields with a higher degree of\nprecision than previously possible and with low memory usage. Key to our\napproach is a recursive hierarchical formulation that exploits object\nself-similarity, leading to a highly compressed and efficient shape latent\nspace. Thanks to the recursive formulation, our method supports spatial and\nglobal-to-local latent feature fusion without needing to initialize and\nmaintain auxiliary data structures, while still allowing for continuous field\nqueries to enable applications such as raytracing. In experiments on a set of\ndiverse datasets, we provide compelling qualitative results and demonstrate\nstate-of-the-art multi-scene reconstruction and compression results with a\nsingle network per dataset.\n","authors":["Sergey Zakharov","Katherine Liu","Adrien Gaidon","Rares Ambrus"],"pdf_url":"https://arxiv.org/pdf/2406.04309v1.pdf","comment":"SIGGRAPH 2024. Project Page:\n  https://zakharos.github.io/projects/refine/"},{"id":"http://arxiv.org/abs/2406.04308v1","updated":"2024-06-06T17:55:02Z","published":"2024-06-06T17:55:02Z","title":"Approximation-Aware Bayesian Optimization","summary":"  High-dimensional Bayesian optimization (BO) tasks such as molecular design\noften require 10,000 function evaluations before obtaining meaningful results.\nWhile methods like sparse variational Gaussian processes (SVGPs) reduce\ncomputational requirements in these settings, the underlying approximations\nresult in suboptimal data acquisitions that slow the progress of optimization.\nIn this paper we modify SVGPs to better align with the goals of BO: targeting\ninformed data acquisition rather than global posterior fidelity. Using the\nframework of utility-calibrated variational inference, we unify GP\napproximation and data acquisition into a joint optimization problem, thereby\nensuring optimal decisions under a limited computational budget. Our approach\ncan be used with any decision-theoretic acquisition function and is compatible\nwith trust region methods like TuRBO. We derive efficient joint objectives for\nthe expected improvement and knowledge gradient acquisition functions in both\nthe standard and batch BO settings. Our approach outperforms standard SVGPs on\nhigh-dimensional benchmark tasks in control and molecular design.\n","authors":["Natalie Maus","Kyurae Kim","Geoff Pleiss","David Eriksson","John P. Cunningham","Jacob R. Gardner"],"pdf_url":"https://arxiv.org/pdf/2406.04308v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.04306v1","updated":"2024-06-06T17:53:34Z","published":"2024-06-06T17:53:34Z","title":"Semantically Diverse Language Generation for Uncertainty Estimation in\n  Language Models","summary":"  Large language models (LLMs) can suffer from hallucinations when generating\ntext. These hallucinations impede various applications in society and industry\nby making LLMs untrustworthy. Current LLMs generate text in an autoregressive\nfashion by predicting and appending text tokens. When an LLM is uncertain about\nthe semantic meaning of the next tokens to generate, it is likely to start\nhallucinating. Thus, it has been suggested that hallucinations stem from\npredictive uncertainty. We introduce Semantically Diverse Language Generation\n(SDLG) to quantify predictive uncertainty in LLMs. SDLG steers the LLM to\ngenerate semantically diverse yet likely alternatives for an initially\ngenerated text. This approach provides a precise measure of aleatoric semantic\nuncertainty, detecting whether the initial text is likely to be hallucinated.\nExperiments on question-answering tasks demonstrate that SDLG consistently\noutperforms existing methods while being the most computationally efficient,\nsetting a new standard for uncertainty estimation in LLMs.\n","authors":["Lukas Aichberger","Kajetan Schweighofer","Mykyta Ielanskyi","Sepp Hochreiter"],"pdf_url":"https://arxiv.org/pdf/2406.04306v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2208.10790v5","updated":"2024-06-06T17:50:16Z","published":"2022-08-23T07:50:52Z","title":"Event-Triggered Time-Varying Bayesian Optimization","summary":"  We consider the problem of sequentially optimizing a time-varying objective\nfunction using time-varying Bayesian optimization (TVBO). To cope with stale\ndata arising from time variations, current approaches to TVBO require prior\nknowledge of a constant rate of change. However, in practice, the rate of\nchange is usually unknown. We propose an event-triggered algorithm, ET-GP-UCB,\nthat treats the optimization problem as static until it detects changes in the\nobjective function and then resets the dataset. This allows the algorithm to\nadapt online to realized temporal changes without the need for exact prior\nknowledge. The event trigger is based on probabilistic uniform error bounds\nused in Gaussian process regression. We derive regret bounds of adaptive resets\nwithout exact prior knowledge on the temporal changes, and show in numerical\nexperiments that ET-GP-UCB outperforms state-of-the-art algorithms on both\nsynthetic and real-world data. The results demonstrate that ET-GP-UCB is\nreadily applicable to various settings without extensive hyperparameter tuning.\n","authors":["Paul Brunzema","Alexander von Rohr","Friedrich Solowjow","Sebastian Trimpe"],"pdf_url":"https://arxiv.org/pdf/2208.10790v5.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.04303v1","updated":"2024-06-06T17:49:21Z","published":"2024-06-06T17:49:21Z","title":"Vision-LSTM: xLSTM as Generic Vision Backbone","summary":"  Transformers are widely used as generic backbones in computer vision, despite\ninitially introduced for natural language processing. Recently, the Long\nShort-Term Memory (LSTM) has been extended to a scalable and performant\narchitecture - the xLSTM - which overcomes long-standing LSTM limitations via\nexponential gating and parallelizable matrix memory structure. In this report,\nwe introduce Vision-LSTM (ViL), an adaption of the xLSTM building blocks to\ncomputer vision. ViL comprises a stack of xLSTM blocks where odd blocks process\nthe sequence of patch tokens from top to bottom while even blocks go from\nbottom to top. Experiments show that ViL holds promise to be further deployed\nas new generic backbone for computer vision architectures.\n","authors":["Benedikt Alkin","Maximilian Beck","Korbinian Pöppel","Sepp Hochreiter","Johannes Brandstetter"],"pdf_url":"https://arxiv.org/pdf/2406.04303v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.04302v1","updated":"2024-06-06T17:48:24Z","published":"2024-06-06T17:48:24Z","title":"Representational Alignment Supports Effective Machine Teaching","summary":"  A good teacher should not only be knowledgeable; but should be able to\ncommunicate in a way that the student understands -- to share the student's\nrepresentation of the world. In this work, we integrate insights from machine\nteaching and pragmatic communication with the burgeoning literature on\nrepresentational alignment to characterize a utility curve defining a\nrelationship between representational alignment and teacher capability for\npromoting student learning. To explore the characteristics of this utility\ncurve, we design a supervised learning environment that disentangles\nrepresentational alignment from teacher accuracy. We conduct extensive\ncomputational experiments with machines teaching machines, complemented by a\nseries of experiments in which machines teach humans. Drawing on our findings\nthat improved representational alignment with a student improves student\nlearning outcomes (i.e., task accuracy), we design a classroom matching\nprocedure that assigns students to teachers based on the utility curve. If we\nare to design effective machine teachers, it is not enough to build teachers\nthat are accurate -- we want teachers that can align, representationally, to\ntheir students too.\n","authors":["Ilia Sucholutsky","Katherine M. Collins","Maya Malaviya","Nori Jacoby","Weiyang Liu","Theodore R. Sumers","Michalis Korakakis","Umang Bhatt","Mark Ho","Joshua B. Tenenbaum","Brad Love","Zachary A. Pardos","Adrian Weller","Thomas L. Griffiths"],"pdf_url":"https://arxiv.org/pdf/2406.04302v1.pdf","comment":"Preprint"},{"id":"http://arxiv.org/abs/2402.12991v2","updated":"2024-06-06T17:46:48Z","published":"2024-02-20T13:20:39Z","title":"TRAP: Targeted Random Adversarial Prompt Honeypot for Black-Box\n  Identification","summary":"  Large Language Model (LLM) services and models often come with legal rules on\nwho can use them and how they must use them. Assessing the compliance of the\nreleased LLMs is crucial, as these rules protect the interests of the LLM\ncontributor and prevent misuse. In this context, we describe the novel\nfingerprinting problem of Black-box Identity Verification (BBIV). The goal is\nto determine whether a third-party application uses a certain LLM through its\nchat function. We propose a method called Targeted Random Adversarial Prompt\n(TRAP) that identifies the specific LLM in use. We repurpose adversarial\nsuffixes, originally proposed for jailbreaking, to get a pre-defined answer\nfrom the target LLM, while other models give random answers. TRAP detects the\ntarget LLMs with over 95% true positive rate at under 0.2% false positive rate\neven after a single interaction. TRAP remains effective even if the LLM has\nminor changes that do not significantly alter the original function.\n","authors":["Martin Gubri","Dennis Ulmer","Hwaran Lee","Sangdoo Yun","Seong Joon Oh"],"pdf_url":"https://arxiv.org/pdf/2402.12991v2.pdf","comment":"Accepted at ACL 2024 (findings)"},{"id":"http://arxiv.org/abs/2401.06688v2","updated":"2024-06-06T17:45:39Z","published":"2024-01-12T16:52:41Z","title":"Don't Rank, Combine! Combining Machine Translation Hypotheses Using\n  Quality Estimation","summary":"  Neural machine translation systems estimate probabilities of target sentences\ngiven source sentences, yet these estimates may not align with human\npreferences. This work introduces QE-fusion, a method that synthesizes\ntranslations using a quality estimation metric (QE), which correlates better\nwith human judgments. QE-fusion leverages a pool of candidates sampled from a\nmodel, combining spans from different candidates using a QE metric such as\nCometKiwi. We compare QE-fusion against beam search and recent reranking\ntechniques, such as Minimum Bayes Risk decoding or QE-reranking. Our method\nconsistently improves translation quality in terms of COMET and BLEURT scores\nwhen applied to large language models (LLMs) used for translation (PolyLM,\nXGLM, Llama2, Mistral, ALMA, and Tower) and to multilingual translation models\n(NLLB), over five language pairs. Notably, QE-fusion exhibits larger\nimprovements for LLMs due to their ability to generate diverse outputs. We\ndemonstrate that our approach generates novel translations in over half of the\ncases and consistently outperforms other methods across varying numbers of\ncandidates (5-200). Furthermore, we empirically establish that QE-fusion scales\nlinearly with the number of candidates in the pool.\n","authors":["Giorgos Vernikos","Andrei Popescu-Belis"],"pdf_url":"https://arxiv.org/pdf/2401.06688v2.pdf","comment":"Accepted at ACL 2024"},{"id":"http://arxiv.org/abs/2406.04299v1","updated":"2024-06-06T17:45:00Z","published":"2024-06-06T17:45:00Z","title":"NoisyGL: A Comprehensive Benchmark for Graph Neural Networks under Label\n  Noise","summary":"  Graph Neural Networks (GNNs) exhibit strong potential in node classification\ntask through a message-passing mechanism. However, their performance often\nhinges on high-quality node labels, which are challenging to obtain in\nreal-world scenarios due to unreliable sources or adversarial attacks.\nConsequently, label noise is common in real-world graph data, negatively\nimpacting GNNs by propagating incorrect information during training. To address\nthis issue, the study of Graph Neural Networks under Label Noise (GLN) has\nrecently gained traction. However, due to variations in dataset selection, data\nsplitting, and preprocessing techniques, the community currently lacks a\ncomprehensive benchmark, which impedes deeper understanding and further\ndevelopment of GLN. To fill this gap, we introduce NoisyGL in this paper, the\nfirst comprehensive benchmark for graph neural networks under label noise.\nNoisyGL enables fair comparisons and detailed analyses of GLN methods on noisy\nlabeled graph data across various datasets, with unified experimental settings\nand interface. Our benchmark has uncovered several important insights that were\nmissed in previous research, and we believe these findings will be highly\nbeneficial for future studies. We hope our open-source benchmark library will\nfoster further advancements in this field. The code of the benchmark can be\nfound in https://github.com/eaglelab-zju/NoisyGL.\n","authors":["Zhonghao Wang","Danyu Sun","Sheng Zhou","Haobo Wang","Jiapei Fan","Longtao Huang","Jiajun Bu"],"pdf_url":"https://arxiv.org/pdf/2406.04299v1.pdf","comment":"Submitted to the 38th Conference on Neural Information Processing\n  Systems (NeurIPS 2024) Track on Datasets and Benchmarks"},{"id":"http://arxiv.org/abs/2403.07974v2","updated":"2024-06-06T17:41:21Z","published":"2024-03-12T17:58:04Z","title":"LiveCodeBench: Holistic and Contamination Free Evaluation of Large\n  Language Models for Code","summary":"  Large Language Models (LLMs) applied to code-related applications have\nemerged as a prominent field, attracting significant interest from both\nacademia and industry. However, as new and improved LLMs are developed,\nexisting evaluation benchmarks (e.g., HumanEval, MBPP) are no longer sufficient\nfor assessing their capabilities. In this work, we propose LiveCodeBench, a\ncomprehensive and contamination-free evaluation of LLMs for code, which\ncontinuously collects new problems over time from contests across three\ncompetition platforms, namely LeetCode, AtCoder, and CodeForces. Notably, our\nbenchmark also focuses on a broader range of code related capabilities, such as\nself-repair, code execution, and test output prediction, beyond just code\ngeneration. Currently, LiveCodeBench hosts four hundred high-quality coding\nproblems that were published between May 2023 and May 2024. We have evaluated\n18 base LLMs and 34 instruction-tuned LLMs on LiveCodeBench. We present\nempirical findings on contamination, holistic performance comparisons,\npotential overfitting in existing benchmarks as well as individual model\ncomparisons. We will release all prompts and model completions for further\ncommunity analysis, along with a general toolkit for adding new scenarios and\nmodel\n","authors":["Naman Jain","King Han","Alex Gu","Wen-Ding Li","Fanjia Yan","Tianjun Zhang","Sida Wang","Armando Solar-Lezama","Koushik Sen","Ion Stoica"],"pdf_url":"https://arxiv.org/pdf/2403.07974v2.pdf","comment":"Website - https://livecodebench.github.io/"},{"id":"http://arxiv.org/abs/2402.09470v2","updated":"2024-06-06T17:39:53Z","published":"2024-02-12T08:16:10Z","title":"Rolling Diffusion Models","summary":"  Diffusion models have recently been increasingly applied to temporal data\nsuch as video, fluid mechanics simulations, or climate data. These methods\ngenerally treat subsequent frames equally regarding the amount of noise in the\ndiffusion process. This paper explores Rolling Diffusion: a new approach that\nuses a sliding window denoising process. It ensures that the diffusion process\nprogressively corrupts through time by assigning more noise to frames that\nappear later in a sequence, reflecting greater uncertainty about the future as\nthe generation process unfolds. Empirically, we show that when the temporal\ndynamics are complex, Rolling Diffusion is superior to standard diffusion. In\nparticular, this result is demonstrated in a video prediction task using the\nKinetics-600 video dataset and in a chaotic fluid dynamics forecasting\nexperiment.\n","authors":["David Ruhe","Jonathan Heek","Tim Salimans","Emiel Hoogeboom"],"pdf_url":"https://arxiv.org/pdf/2402.09470v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.04291v1","updated":"2024-06-06T17:37:39Z","published":"2024-06-06T17:37:39Z","title":"Stratified Prediction-Powered Inference for Hybrid Language Model\n  Evaluation","summary":"  Prediction-powered inference (PPI) is a method that improves statistical\nestimates based on limited human-labeled data. PPI achieves this by combining\nsmall amounts of human-labeled data with larger amounts of data labeled by a\nreasonably accurate -- but potentially biased -- automatic system, in a way\nthat results in tighter confidence intervals for certain parameters of interest\n(e.g., the mean performance of a language model). In this paper, we propose a\nmethod called Stratified Prediction-Powered Inference (StratPPI), in which we\nshow that the basic PPI estimates can be considerably improved by employing\nsimple data stratification strategies. Without making any assumptions on the\nunderlying automatic labeling system or data distribution, we derive an\nalgorithm for computing provably valid confidence intervals for population\nparameters (such as averages) that is based on stratified sampling. In\nparticular, we show both theoretically and empirically that, with appropriate\nchoices of stratification and sample allocation, our approach can provide\nsubstantially tighter confidence intervals than unstratified approaches.\nSpecifically, StratPPI is expected to improve in cases where the performance of\nthe autorater varies across different conditional distributions of the target\ndata.\n","authors":["Adam Fisch","Joshua Maynez","R. Alex Hofer","Bhuwan Dhingra","Amir Globerson","William W. Cohen"],"pdf_url":"https://arxiv.org/pdf/2406.04291v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.16467v2","updated":"2024-06-06T17:31:07Z","published":"2024-01-29T18:45:30Z","title":"ReGAL: Refactoring Programs to Discover Generalizable Abstractions","summary":"  While large language models (LLMs) are increasingly being used for program\nsynthesis, they lack the global view needed to develop useful abstractions;\nthey generally predict programs one at a time, often repeating the same\nfunctionality. Generating redundant code from scratch is both inefficient and\nerror-prone. To address this, we propose Refactoring for Generalizable\nAbstraction Learning (ReGAL), a gradient-free method for learning a library of\nreusable functions via code refactorization, i.e., restructuring code without\nchanging its execution output. ReGAL learns from a small set of existing\nprograms, iteratively verifying and refining its abstractions via execution. We\nfind that the shared function libraries discovered by ReGAL make programs\neasier to predict across diverse domains. On five datasets -- LOGO graphics\ngeneration, Date reasoning, TextCraft (a Minecraft-based text-game) MATH, and\nTabMWP -- both open-source and proprietary LLMs improve in accuracy when\npredicting programs with ReGAL functions. For CodeLlama-13B, ReGAL results in\nabsolute accuracy increases of 11.5% on LOGO, 26.1% on date understanding, and\n8.1% on TextCraft, outperforming GPT-3.5 in two of three domains. Our analysis\nreveals ReGAL's abstractions encapsulate frequently-used subroutines as well as\nenvironment dynamics.\n","authors":["Elias Stengel-Eskin","Archiki Prasad","Mohit Bansal"],"pdf_url":"https://arxiv.org/pdf/2401.16467v2.pdf","comment":"ICML 2024 Camera-Ready; First two authors contributed equally; Code:\n  https://github.com/esteng/regal_program_learning"},{"id":"http://arxiv.org/abs/2406.04284v1","updated":"2024-06-06T17:28:56Z","published":"2024-06-06T17:28:56Z","title":"What is Dataset Distillation Learning?","summary":"  Dataset distillation has emerged as a strategy to overcome the hurdles\nassociated with large datasets by learning a compact set of synthetic data that\nretains essential information from the original dataset. While distilled data\ncan be used to train high performing models, little is understood about how the\ninformation is stored. In this study, we posit and answer three questions about\nthe behavior, representativeness, and point-wise information content of\ndistilled data. We reveal distilled data cannot serve as a substitute for real\ndata during training outside the standard evaluation setting for dataset\ndistillation. Additionally, the distillation process retains high task\nperformance by compressing information related to the early training dynamics\nof real models. Finally, we provide an framework for interpreting distilled\ndata and reveal that individual distilled data points contain meaningful\nsemantic information. This investigation sheds light on the intricate nature of\ndistilled data, providing a better understanding on how they can be effectively\nutilized.\n","authors":["William Yang","Ye Zhu","Zhiwei Deng","Olga Russakovsky"],"pdf_url":"https://arxiv.org/pdf/2406.04284v1.pdf","comment":"ICML 2024"},{"id":"http://arxiv.org/abs/2406.04280v1","updated":"2024-06-06T17:26:40Z","published":"2024-06-06T17:26:40Z","title":"xMIL: Insightful Explanations for Multiple Instance Learning in\n  Histopathology","summary":"  Multiple instance learning (MIL) is an effective and widely used approach for\nweakly supervised machine learning. In histopathology, MIL models have achieved\nremarkable success in tasks like tumor detection, biomarker prediction, and\noutcome prognostication. However, MIL explanation methods are still lagging\nbehind, as they are limited to small bag sizes or disregard instance\ninteractions. We revisit MIL through the lens of explainable AI (XAI) and\nintroduce xMIL, a refined framework with more general assumptions. We\ndemonstrate how to obtain improved MIL explanations using layer-wise relevance\npropagation (LRP) and conduct extensive evaluation experiments on three toy\nsettings and four real-world histopathology datasets. Our approach consistently\noutperforms previous explanation attempts with particularly improved\nfaithfulness scores on challenging biomarker prediction tasks. Finally, we\nshowcase how xMIL explanations enable pathologists to extract insights from MIL\nmodels, representing a significant advance for knowledge discovery and model\ndebugging in digital histopathology.\n","authors":["Julius Hense","Mina Jamshidi Idaji","Oliver Eberle","Thomas Schnake","Jonas Dippel","Laure Ciernik","Oliver Buchstab","Andreas Mock","Frederick Klauschen","Klaus-Robert Müller"],"pdf_url":"https://arxiv.org/pdf/2406.04280v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.04276v1","updated":"2024-06-06T17:25:07Z","published":"2024-06-06T17:25:07Z","title":"Generative AI-in-the-loop: Integrating LLMs and GPTs into the Next\n  Generation Networks","summary":"  In recent years, machine learning (ML) techniques have created numerous\nopportunities for intelligent mobile networks and have accelerated the\nautomation of network operations. However, complex network tasks may involve\nvariables and considerations even beyond the capacity of traditional ML\nalgorithms. On the other hand, large language models (LLMs) have recently\nemerged, demonstrating near-human-level performance in cognitive tasks across\nvarious fields. However, they remain prone to hallucinations and often lack\ncommon sense in basic tasks. Therefore, they are regarded as assistive tools\nfor humans. In this work, we propose the concept of \"generative AI-in-the-loop\"\nand utilize the semantic understanding, context awareness, and reasoning\nabilities of LLMs to assist humans in handling complex or unforeseen situations\nin mobile communication networks. We believe that combining LLMs and ML models\nallows both to leverage their respective capabilities and achieve better\nresults than either model alone. To support this idea, we begin by analyzing\nthe capabilities of LLMs and compare them with traditional ML algorithms. We\nthen explore potential LLM-based applications in line with the requirements of\nnext-generation networks. We further examine the integration of ML and LLMs,\ndiscussing how they can be used together in mobile networks. Unlike existing\nstudies, our research emphasizes the fusion of LLMs with traditional ML-driven\nnext-generation networks and serves as a comprehensive refinement of existing\nsurveys. Finally, we provide a case study to enhance ML-based network intrusion\ndetection with synthesized data generated by LLMs. Our case study further\ndemonstrates the advantages of our proposed idea.\n","authors":["Han Zhang","Akram Bin Sediq","Ali Afana","Melike Erol-Kantarci"],"pdf_url":"https://arxiv.org/pdf/2406.04276v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.04274v1","updated":"2024-06-06T17:23:49Z","published":"2024-06-06T17:23:49Z","title":"Self-Play with Adversarial Critic: Provable and Scalable Offline\n  Alignment for Language Models","summary":"  This work studies the challenge of aligning large language models (LLMs) with\noffline preference data. We focus on alignment by Reinforcement Learning from\nHuman Feedback (RLHF) in particular. While popular preference optimization\nmethods exhibit good empirical performance in practice, they are not\ntheoretically guaranteed to converge to the optimal policy and can provably\nfail when the data coverage is sparse by classical offline reinforcement\nlearning (RL) results. On the other hand, a recent line of work has focused on\ntheoretically motivated preference optimization methods with provable\nguarantees, but these are not computationally efficient for large-scale\napplications like LLM alignment. To bridge this gap, we propose SPAC, a new\noffline preference optimization method with self-play, inspired by the\non-average pessimism technique from the offline RL literature, to be the first\nprovable and scalable approach to LLM alignment. We both provide theoretical\nanalysis for its convergence under single-policy concentrability for the\ngeneral function approximation setting and demonstrate its competitive\nempirical performance for LLM alignment on a 7B Mistral model with Open LLM\nLeaderboard evaluations.\n","authors":["Xiang Ji","Sanjeev Kulkarni","Mengdi Wang","Tengyang Xie"],"pdf_url":"https://arxiv.org/pdf/2406.04274v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.06031v2","updated":"2024-06-06T17:20:07Z","published":"2024-02-08T20:07:47Z","title":"An operator learning perspective on parameter-to-observable maps","summary":"  Computationally efficient surrogates for parametrized physical models play a\ncrucial role in science and engineering. Operator learning provides data-driven\nsurrogates that map between function spaces. However, instead of full-field\nmeasurements, often the available data are only finite-dimensional\nparametrizations of model inputs or finite observables of model outputs.\nBuilding on Fourier Neural Operators, this paper introduces the Fourier Neural\nMappings (FNMs) framework that is able to accommodate such finite-dimensional\nvector inputs or outputs. The paper develops universal approximation theorems\nfor the method. Moreover, in many applications the underlying\nparameter-to-observable (PtO) map is defined implicitly through an\ninfinite-dimensional operator, such as the solution operator of a partial\ndifferential equation. A natural question is whether it is more data-efficient\nto learn the PtO map end-to-end or first learn the solution operator and\nsubsequently compute the observable from the full-field solution. A theoretical\nanalysis of Bayesian nonparametric regression of linear functionals, which is\nof independent interest, suggests that the end-to-end approach can actually\nhave worse sample complexity. Extending beyond the theory, numerical results\nfor the FNM approximation of three nonlinear PtO maps demonstrate the benefits\nof the operator learning perspective that this paper adopts.\n","authors":["Daniel Zhengyu Huang","Nicholas H. Nelsen","Margaret Trautner"],"pdf_url":"https://arxiv.org/pdf/2402.06031v2.pdf","comment":"63 pages, 10 figures, 1 table"},{"id":"http://arxiv.org/abs/2406.04268v1","updated":"2024-06-06T17:15:02Z","published":"2024-06-06T17:15:02Z","title":"Open-Endedness is Essential for Artificial Superhuman Intelligence","summary":"  In recent years there has been a tremendous surge in the general capabilities\nof AI systems, mainly fuelled by training foundation models on internetscale\ndata. Nevertheless, the creation of openended, ever self-improving AI remains\nelusive. In this position paper, we argue that the ingredients are now in place\nto achieve openendedness in AI systems with respect to a human observer.\nFurthermore, we claim that such open-endedness is an essential property of any\nartificial superhuman intelligence (ASI). We begin by providing a concrete\nformal definition of open-endedness through the lens of novelty and\nlearnability. We then illustrate a path towards ASI via open-ended systems\nbuilt on top of foundation models, capable of making novel, humanrelevant\ndiscoveries. We conclude by examining the safety implications of\ngenerally-capable openended AI. We expect that open-ended foundation models\nwill prove to be an increasingly fertile and safety-critical area of research\nin the near future.\n","authors":["Edward Hughes","Michael Dennis","Jack Parker-Holder","Feryal Behbahani","Aditi Mavalankar","Yuge Shi","Tom Schaul","Tim Rocktaschel"],"pdf_url":"https://arxiv.org/pdf/2406.04268v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.04267v1","updated":"2024-06-06T17:14:44Z","published":"2024-06-06T17:14:44Z","title":"Transformers need glasses! Information over-squashing in language tasks","summary":"  We study how information propagates in decoder-only Transformers, which are\nthe architectural backbone of most existing frontier large language models\n(LLMs). We rely on a theoretical signal propagation analysis -- specifically,\nwe analyse the representations of the last token in the final layer of the\nTransformer, as this is the representation used for next-token prediction. Our\nanalysis reveals a representational collapse phenomenon: we prove that certain\ndistinct sequences of inputs to the Transformer can yield arbitrarily close\nrepresentations in the final token. This effect is exacerbated by the\nlow-precision floating-point formats frequently used in modern LLMs. As a\nresult, the model is provably unable to respond to these sequences in different\nways -- leading to errors in, e.g., tasks involving counting or copying.\nFurther, we show that decoder-only Transformer language models can lose\nsensitivity to specific tokens in the input, which relates to the well-known\nphenomenon of over-squashing in graph neural networks. We provide empirical\nevidence supporting our claims on contemporary LLMs. Our theory also points to\nsimple solutions towards ameliorating these issues.\n","authors":["Federico Barbero","Andrea Banino","Steven Kapturowski","Dharshan Kumaran","João G. M. Araújo","Alex Vitvitskyi","Razvan Pascanu","Petar Veličković"],"pdf_url":"https://arxiv.org/pdf/2406.04267v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.00258v2","updated":"2024-06-06T17:14:03Z","published":"2024-02-01T01:06:32Z","title":"Multi-group Learning for Hierarchical Groups","summary":"  The multi-group learning model formalizes the learning scenario in which a\nsingle predictor must generalize well on multiple, possibly overlapping\nsubgroups of interest. We extend the study of multi-group learning to the\nnatural case where the groups are hierarchically structured. We design an\nalgorithm for this setting that outputs an interpretable and deterministic\ndecision tree predictor with near-optimal sample complexity. We then conduct an\nempirical evaluation of our algorithm and find that it achieves attractive\ngeneralization properties on real datasets with hierarchical group structure.\n","authors":["Samuel Deng","Daniel Hsu"],"pdf_url":"https://arxiv.org/pdf/2402.00258v2.pdf","comment":"Accepted in International Conference on Machine Learning 2024 (ICML\n  2024)"},{"id":"http://arxiv.org/abs/2406.04261v1","updated":"2024-06-06T17:05:09Z","published":"2024-06-06T17:05:09Z","title":"Simulating, Fast and Slow: Learning Policies for Black-Box Optimization","summary":"  In recent years, solving optimization problems involving black-box simulators\nhas become a point of focus for the machine learning community due to their\nubiquity in science and engineering. The simulators describe a forward process\n$f_{\\mathrm{sim}}: (\\psi, x) \\rightarrow y$ from simulation parameters $\\psi$\nand input data $x$ to observations $y$, and the goal of the optimization\nproblem is to find parameters $\\psi$ that minimize a desired loss function.\nSophisticated optimization algorithms typically require gradient information\nregarding the forward process, $f_{\\mathrm{sim}}$, with respect to the\nparameters $\\psi$. However, obtaining gradients from black-box simulators can\noften be prohibitively expensive or, in some cases, impossible. Furthermore, in\nmany applications, practitioners aim to solve a set of related problems. Thus,\nstarting the optimization ``ab initio\", i.e. from scratch, each time might be\ninefficient if the forward model is expensive to evaluate. To address those\nchallenges, this paper introduces a novel method for solving classes of similar\nblack-box optimization problems by learning an active learning policy that\nguides a differentiable surrogate's training and uses the surrogate's gradients\nto optimize the simulation parameters with gradient descent. After training the\npolicy, downstream optimization of problems involving black-box simulators\nrequires up to $\\sim$90\\% fewer expensive simulator calls compared to baselines\nsuch as local surrogate-based approaches, numerical optimization, and Bayesian\nmethods.\n","authors":["Fabio Valerio Massoli","Tim Bakker","Thomas Hehn","Tribhuvanesh Orekondy","Arash Behboodi"],"pdf_url":"https://arxiv.org/pdf/2406.04261v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.12621v2","updated":"2024-06-06T17:04:41Z","published":"2024-02-20T01:04:21Z","title":"Reflect-RL: Two-Player Online RL Fine-Tuning for LMs","summary":"  As language models (LMs) demonstrate their capabilities in various fields,\ntheir application to tasks requiring multi-round interactions has become\nincreasingly popular. These tasks usually have complex dynamics, so supervised\nfine-tuning (SFT) on a limited offline dataset does not yield good performance.\nHowever, only a few works attempted to directly train the LMs within\ninteractive decision-making environments. We aim to create an effective\napproach to fine-tune LMs with online reinforcement learning (RL) in these\nenvironments. We propose Reflect-RL, a two-player system to fine-tune an LM\nusing SFT and online RL, where a frozen reflection model (player) assists the\npolicy model (player). To generate data for the warm-up SFT stage, we use\nnegative example generation to enhance the error-correction ability of the\nreflection model. Furthermore, we designed single-prompt action enumeration and\napplied curriculum learning to allow the policy model to learn more\nefficiently. Empirically, we verify that Reflect-RL outperforms SFT and online\nRL without reflection. Testing results indicate GPT-2 XL 1.56B fine-tuned with\nReflect-RL outperforms larger open-source LMs, such as Mistral 7B. The\nbenchmarks, dataset, and code involved in this work are publicly available:\nhttps://github.com/zhourunlong/Reflect-RL.\n","authors":["Runlong Zhou","Simon S. Du","Beibin Li"],"pdf_url":"https://arxiv.org/pdf/2402.12621v2.pdf","comment":"ACL 2024"},{"id":"http://arxiv.org/abs/2406.04257v1","updated":"2024-06-06T17:03:51Z","published":"2024-06-06T17:03:51Z","title":"Data Measurements for Decentralized Data Markets","summary":"  Decentralized data markets can provide more equitable forms of data\nacquisition for machine learning. However, to realize practical marketplaces,\nefficient techniques for seller selection need to be developed. We propose and\nbenchmark federated data measurements to allow a data buyer to find sellers\nwith relevant and diverse datasets. Diversity and relevance measures enable a\nbuyer to make relative comparisons between sellers without requiring\nintermediate brokers and training task-dependent models.\n","authors":["Charles Lu","Mohammad Mohammadi Amiri","Ramesh Raskar"],"pdf_url":"https://arxiv.org/pdf/2406.04257v1.pdf","comment":"20 pages, 11 figures"},{"id":"http://arxiv.org/abs/2406.04250v1","updated":"2024-06-06T16:54:20Z","published":"2024-06-06T16:54:20Z","title":"Online learning of quantum processes","summary":"  Among recent insights into learning quantum states, online learning and\nshadow tomography procedures are notable for their ability to accurately\npredict expectation values even of adaptively chosen observables. In contrast\nto the state case, quantum process learning tasks with a similarly adaptive\nnature have received little attention. In this work, we investigate online\nlearning tasks for quantum processes. Whereas online learning is infeasible for\ngeneral quantum channels, we show that channels of bounded gate complexity as\nwell as Pauli channels can be online learned in the regret and mistake-bounded\nmodels of online learning. In fact, we can online learn probabilistic mixtures\nof any exponentially large set of known channels. We also provide a provably\nsample-efficient shadow tomography procedure for Pauli channels. Our results\nextend beyond quantum channels to non-Markovian multi-time processes, with\nfavorable regret and mistake bounds, as well as a shadow tomography procedure.\nWe complement our online learning upper bounds with mistake as well as\ncomputational lower bounds. On the technical side, we make use of the\nmultiplicative weights update algorithm, classical adaptive data analysis, and\nBell sampling, as well as tools from the theory of quantum combs for multi-time\nquantum processes. Our work initiates a study of online learning for classes of\nquantum channels and, more generally, non-Markovian quantum processes. Given\nthe importance of online learning for state shadow tomography, this may serve\nas a step towards quantum channel variants of adaptive shadow tomography.\n","authors":["Asad Raza","Matthias C. Caro","Jens Eisert","Sumeet Khatri"],"pdf_url":"https://arxiv.org/pdf/2406.04250v1.pdf","comment":"14 + 72 pages, 6 figures"},{"id":"http://arxiv.org/abs/2312.14922v3","updated":"2024-06-06T16:48:20Z","published":"2023-12-22T18:55:25Z","title":"Learning from higher-order statistics, efficiently: hypothesis tests,\n  random features, and neural networks","summary":"  Neural networks excel at discovering statistical patterns in high-dimensional\ndata sets. In practice, higher-order cumulants, which quantify the non-Gaussian\ncorrelations between three or more variables, are particularly important for\nthe performance of neural networks. But how efficient are neural networks at\nextracting features from higher-order cumulants? We study this question in the\nspiked cumulant model, where the statistician needs to recover a privileged\ndirection or \"spike\" from the order-$p\\ge 4$ cumulants of $d$-dimensional\ninputs. Existing literature established the presence of a wide\nstatistical-to-computational gap in this problem. We deepen this line of work\nby finding an exact formula for the likelihood ratio norm which proves that\nstatistical distinguishability requires $n\\gtrsim d$ samples, while\ndistinguishing the two distributions in polynomial time requires $n \\gtrsim\nd^2$ samples for a wide class of algorithms, i.e. those covered by the\nlow-degree conjecture. Numerical experiments show that neural networks do\nindeed learn to distinguish the two distributions with quadratic sample\ncomplexity, while \"lazy\" methods like random features are not better than\nrandom guessing in this regime. Our results show that neural networks extract\ninformation from higher-ordercorrelations in the spiked cumulant model\nefficiently, and reveal a large gap in the amount of data required by neural\nnetworks and random features to learn from higher-order cumulants.\n","authors":["Eszter Székely","Lorenzo Bardone","Federica Gerace","Sebastian Goldt"],"pdf_url":"https://arxiv.org/pdf/2312.14922v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.04245v1","updated":"2024-06-06T16:44:08Z","published":"2024-06-06T16:44:08Z","title":"Online learning of a panoply of quantum objects","summary":"  In many quantum tasks, there is an unknown quantum object that one wishes to\nlearn. An online strategy for this task involves adaptively refining a\nhypothesis to reproduce such an object or its measurement statistics. A common\nevaluation metric for such a strategy is its regret, or roughly the accumulated\nerrors in hypothesis statistics. We prove a sublinear regret bound for learning\nover general subsets of positive semidefinite matrices via the\nregularized-follow-the-leader algorithm and apply it to various settings where\none wishes to learn quantum objects. For concrete applications, we present a\nsublinear regret bound for learning quantum states, effects, channels,\ninteractive measurements, strategies, co-strategies, and the collection of\ninner products of pure states. Our bound applies to many other quantum objects\nwith compact, convex representations. In proving our regret bound, we establish\nvarious matrix analysis results useful in quantum information theory. This\nincludes a generalization of Pinsker's inequality for arbitrary positive\nsemidefinite operators with possibly different traces, which may be of\nindependent interest and applicable to more general classes of divergences.\n","authors":["Akshay Bansal","Ian George","Soumik Ghosh","Jamie Sikora","Alice Zheng"],"pdf_url":"https://arxiv.org/pdf/2406.04245v1.pdf","comment":"34 pages. Comments welcome"},{"id":"http://arxiv.org/abs/2406.04240v1","updated":"2024-06-06T16:39:00Z","published":"2024-06-06T16:39:00Z","title":"Hypernetworks for Personalizing ASR to Atypical Speech","summary":"  Parameter-efficient fine-tuning (PEFT) for personalizing automatic speech\nrecognition (ASR) has recently shown promise for adapting general population\nmodels to atypical speech. However, these approaches assume a priori knowledge\nof the atypical speech disorder being adapted for -- the diagnosis of which\nrequires expert knowledge that is not always available. Even given this\nknowledge, data scarcity and high inter/intra-speaker variability further limit\nthe effectiveness of traditional fine-tuning. To circumvent these challenges,\nwe first identify the minimal set of model parameters required for ASR\nadaptation. Our analysis of each individual parameter's effect on adaptation\nperformance allows us to reduce Word Error Rate (WER) by half while adapting\n0.03\\% of all weights. Alleviating the need for cohort-specific models, we next\npropose the novel use of a meta-learned hypernetwork to generate highly\nindividualized, utterance-level adaptations on-the-fly for a diverse set of\natypical speech characteristics. Evaluating adaptation at the global, cohort\nand individual-level, we show that hypernetworks generalize better to\nout-of-distribution speakers, while maintaining an overall relative WER\nreduction of 75.2% using 0.1% of the full parameter budget.\n","authors":["Max Mueller-Eberstein","Dianna Yee","Karren Yang","Gautam Varma Mantena","Colin Lea"],"pdf_url":"https://arxiv.org/pdf/2406.04240v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.04239v1","updated":"2024-06-06T16:38:53Z","published":"2024-06-06T16:38:53Z","title":"Solving Inverse Problems in Protein Space Using Diffusion-Based Priors","summary":"  The interaction of a protein with its environment can be understood and\ncontrolled via its 3D structure. Experimental methods for protein structure\ndetermination, such as X-ray crystallography or cryogenic electron microscopy,\nshed light on biological processes but introduce challenging inverse problems.\nLearning-based approaches have emerged as accurate and efficient methods to\nsolve these inverse problems for 3D structure determination, but are\nspecialized for a predefined type of measurement. Here, we introduce a\nversatile framework to turn raw biophysical measurements of varying types into\n3D atomic models. Our method combines a physics-based forward model of the\nmeasurement process with a pretrained generative model providing a\ntask-agnostic, data-driven prior. Our method outperforms posterior sampling\nbaselines on both linear and non-linear inverse problems. In particular, it is\nthe first diffusion-based method for refining atomic models from cryo-EM\ndensity maps.\n","authors":["Axel Levy","Eric R. Chan","Sara Fridovich-Keil","Frédéric Poitevin","Ellen D. Zhong","Gordon Wetzstein"],"pdf_url":"https://arxiv.org/pdf/2406.04239v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.17673v2","updated":"2024-06-06T16:35:51Z","published":"2024-03-26T13:02:43Z","title":"How Private are DP-SGD Implementations?","summary":"  We demonstrate a substantial gap between the privacy guarantees of the\nAdaptive Batch Linear Queries (ABLQ) mechanism under different types of batch\nsampling: (i) Shuffling, and (ii) Poisson subsampling; the typical analysis of\nDifferentially Private Stochastic Gradient Descent (DP-SGD) follows by\ninterpreting it as a post-processing of ABLQ. While shuffling-based DP-SGD is\nmore commonly used in practical implementations, it has not been amenable to\neasy privacy analysis, either analytically or even numerically. On the other\nhand, Poisson subsampling-based DP-SGD is challenging to scalably implement,\nbut has a well-understood privacy analysis, with multiple open-source\nnumerically tight privacy accountants available. This has led to a common\npractice of using shuffling-based DP-SGD in practice, but using the privacy\nanalysis for the corresponding Poisson subsampling version. Our result shows\nthat there can be a substantial gap between the privacy analysis when using the\ntwo types of batch sampling, and thus advises caution in reporting privacy\nparameters for DP-SGD.\n","authors":["Lynn Chua","Badih Ghazi","Pritish Kamath","Ravi Kumar","Pasin Manurangsi","Amer Sinha","Chiyuan Zhang"],"pdf_url":"https://arxiv.org/pdf/2403.17673v2.pdf","comment":"Proceedings of ICML 2024"},{"id":"http://arxiv.org/abs/2308.03312v8","updated":"2024-06-06T16:35:20Z","published":"2023-08-07T05:40:58Z","title":"Exploiting Code Symmetries for Learning Program Semantics","summary":"  This paper tackles the challenge of teaching code semantics to Large Language\nModels (LLMs) for program analysis by incorporating code symmetries into the\nmodel architecture. We introduce a group-theoretic framework that defines code\nsymmetries as semantics-preserving transformations, where forming a code\nsymmetry group enables precise and efficient reasoning of code semantics. Our\nsolution, SymC, develops a novel variant of self-attention that is provably\nequivariant to code symmetries from the permutation group defined over the\nprogram dependence graph. SymC obtains superior performance on five program\nanalysis tasks, outperforming state-of-the-art code models without any\npre-training. Our results suggest that code LLMs that encode the code\nstructural prior via the code symmetry group generalize better and faster.\n","authors":["Kexin Pei","Weichen Li","Qirui Jin","Shuyang Liu","Scott Geng","Lorenzo Cavallaro","Junfeng Yang","Suman Jana"],"pdf_url":"https://arxiv.org/pdf/2308.03312v8.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.04229v1","updated":"2024-06-06T16:29:25Z","published":"2024-06-06T16:29:25Z","title":"The CLRS-Text Algorithmic Reasoning Language Benchmark","summary":"  Eliciting reasoning capabilities from language models (LMs) is a critical\ndirection on the path towards building intelligent systems. Most recent studies\ndedicated to reasoning focus on out-of-distribution performance on\nprocedurally-generated synthetic benchmarks, bespoke-built to evaluate specific\nskills only. This trend makes results hard to transfer across publications,\nslowing down progress. Three years ago, a similar issue was identified and\nrectified in the field of neural algorithmic reasoning, with the advent of the\nCLRS benchmark. CLRS is a dataset generator comprising graph execution traces\nof classical algorithms from the Introduction to Algorithms textbook. Inspired\nby this, we propose CLRS-Text -- a textual version of these algorithmic traces.\nOut of the box, CLRS-Text is capable of procedurally generating trace data for\nthirty diverse, challenging algorithmic tasks across any desirable input\ndistribution, while offering a standard pipeline in which any additional\nalgorithmic tasks may be created in the benchmark. We fine-tune and evaluate\nvarious LMs as generalist executors on this benchmark, validating prior work\nand revealing a novel, interesting challenge for the LM reasoning community.\nOur code is available at\nhttps://github.com/google-deepmind/clrs/tree/master/clrs/_src/clrs_text.\n","authors":["Larisa Markeeva","Sean McLeish","Borja Ibarz","Wilfried Bounsi","Olga Kozlova","Alex Vitvitskyi","Charles Blundell","Tom Goldstein","Avi Schwarzschild","Petar Veličković"],"pdf_url":"https://arxiv.org/pdf/2406.04229v1.pdf","comment":"Preprint, under review. Comments welcome"},{"id":"http://arxiv.org/abs/2406.04227v1","updated":"2024-06-06T16:28:04Z","published":"2024-06-06T16:28:04Z","title":"R-CONV: An Analytical Approach for Efficient Data Reconstruction via\n  Convolutional Gradients","summary":"  In the effort to learn from extensive collections of distributed data,\nfederated learning has emerged as a promising approach for preserving privacy\nby using a gradient-sharing mechanism instead of exchanging raw data. However,\nrecent studies show that private training data can be leaked through many\ngradient attacks. While previous analytical-based attacks have successfully\nreconstructed input data from fully connected layers, their effectiveness\ndiminishes when applied to convolutional layers. This paper introduces an\nadvanced data leakage method to efficiently exploit convolutional layers'\ngradients. We present a surprising finding: even with non-fully invertible\nactivation functions, such as ReLU, we can analytically reconstruct training\nsamples from the gradients. To the best of our knowledge, this is the first\nanalytical approach that successfully reconstructs convolutional layer inputs\ndirectly from the gradients, bypassing the need to reconstruct layers' outputs.\nPrior research has mainly concentrated on the weight constraints of convolution\nlayers, overlooking the significance of gradient constraints. Our findings\ndemonstrate that existing analytical methods used to estimate the risk of\ngradient attacks lack accuracy. In some layers, attacks can be launched with\nless than 5% of the reported constraints.\n","authors":["Tamer Ahmed Eltaras","Qutaibah Malluhi","Alessandro Savino","Stefano Di Carlo","Adnan Qayyum","Junaid Qadir"],"pdf_url":"https://arxiv.org/pdf/2406.04227v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.08841v3","updated":"2024-06-06T16:21:09Z","published":"2023-08-17T08:00:20Z","title":"Machine Learning-Assisted Discovery of Flow Reactor Designs","summary":"  Additive manufacturing has enabled the fabrication of advanced reactor\ngeometries, permitting larger, more complex design spaces. Identifying\npromising configurations within such spaces presents a significant challenge\nfor current approaches. Furthermore, existing parameterisations of reactor\ngeometries are low-dimensional with expensive optimisation limiting more\ncomplex solutions. To address this challenge, we establish a machine\nlearning-assisted approach for the design of the next-generation of chemical\nreactors, combining the application of high-dimensional parameterisations,\ncomputational fluid dynamics, and multi-fidelity Bayesian optimisation. We\nassociate the development of mixing-enhancing vortical flow structures in novel\ncoiled reactors with performance, and use our approach to identify key\ncharacteristics of optimal designs. By appealing to the principles of flow\ndynamics, we rationalise the selection of novel design features that lead to\nexperimental plug flow performance improvements of 60% over conventional\ndesigns. Our results demonstrate that coupling advanced manufacturing\ntechniques with `augmented-intelligence' approaches can lead to superior design\nperformance and, consequently, emissions-reduction and sustainability.\n","authors":["Tom Savage","Nausheen Basha","Jonathan McDonough","James Krassowski","Omar K Matar","Ehecatl Antonio del Rio Chanona"],"pdf_url":"https://arxiv.org/pdf/2308.08841v3.pdf","comment":"11 pages, 9 figures, as accepted Nature Chemical Engineering"},{"id":"http://arxiv.org/abs/2406.04219v1","updated":"2024-06-06T16:18:20Z","published":"2024-06-06T16:18:20Z","title":"Multi-Agent Imitation Learning: Value is Easy, Regret is Hard","summary":"  We study a multi-agent imitation learning (MAIL) problem where we take the\nperspective of a learner attempting to coordinate a group of agents based on\ndemonstrations of an expert doing so. Most prior work in MAIL essentially\nreduces the problem to matching the behavior of the expert within the support\nof the demonstrations. While doing so is sufficient to drive the value gap\nbetween the learner and the expert to zero under the assumption that agents are\nnon-strategic, it does not guarantee robustness to deviations by strategic\nagents. Intuitively, this is because strategic deviations can depend on a\ncounterfactual quantity: the coordinator's recommendations outside of the state\ndistribution their recommendations induce. In response, we initiate the study\nof an alternative objective for MAIL in Markov Games we term the regret gap\nthat explicitly accounts for potential deviations by agents in the group. We\nfirst perform an in-depth exploration of the relationship between the value and\nregret gaps. First, we show that while the value gap can be efficiently\nminimized via a direct extension of single-agent IL algorithms, even value\nequivalence can lead to an arbitrarily large regret gap. This implies that\nachieving regret equivalence is harder than achieving value equivalence in\nMAIL. We then provide a pair of efficient reductions to no-regret online convex\noptimization that are capable of minimizing the regret gap (a) under a coverage\nassumption on the expert (MALICE) or (b) with access to a queryable expert\n(BLADES).\n","authors":["Jingwu Tang","Gokul Swamy","Fei Fang","Zhiwei Steven Wu"],"pdf_url":"https://arxiv.org/pdf/2406.04219v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.04216v1","updated":"2024-06-06T16:15:34Z","published":"2024-06-06T16:15:34Z","title":"What Do Language Models Learn in Context? The Structured Task Hypothesis","summary":"  Large language models (LLMs) exhibit an intriguing ability to learn a novel\ntask from in-context examples presented in a demonstration, termed in-context\nlearning (ICL). Understandably, a swath of research has been dedicated to\nuncovering the theories underpinning ICL. One popular hypothesis explains ICL\nby task selection. LLMs identify the task based on the demonstration and\ngeneralize it to the prompt. Another popular hypothesis is that ICL is a form\nof meta-learning, i.e., the models learn a learning algorithm at pre-training\ntime and apply it to the demonstration. Finally, a third hypothesis argues that\nLLMs use the demonstration to select a composition of tasks learned during\npre-training to perform ICL. In this paper, we empirically explore these three\nhypotheses that explain LLMs' ability to learn in context with a suite of\nexperiments derived from common text classification tasks. We invalidate the\nfirst two hypotheses with counterexamples and provide evidence in support of\nthe last hypothesis. Our results suggest an LLM could learn a novel task in\ncontext via composing tasks learned during pre-training.\n","authors":["Jiaoda Li","Yifan Hou","Mrinmaya Sachan","Ryan Cotterell"],"pdf_url":"https://arxiv.org/pdf/2406.04216v1.pdf","comment":"This work is published in ACL 2024"},{"id":"http://arxiv.org/abs/2406.04215v1","updated":"2024-06-06T16:14:54Z","published":"2024-06-06T16:14:54Z","title":"mCSQA: Multilingual Commonsense Reasoning Dataset with Unified Creation\n  Strategy by Language Models and Humans","summary":"  It is very challenging to curate a dataset for language-specific knowledge\nand common sense in order to evaluate natural language understanding\ncapabilities of language models. Due to the limitation in the availability of\nannotators, most current multilingual datasets are created through translation,\nwhich cannot evaluate such language-specific aspects. Therefore, we propose\nMultilingual CommonsenseQA (mCSQA) based on the construction process of CSQA\nbut leveraging language models for a more efficient construction, e.g., by\nasking LM to generate questions/answers, refine answers and verify QAs followed\nby reduced human efforts for verification. Constructed dataset is a benchmark\nfor cross-lingual language-transfer capabilities of multilingual LMs, and\nexperimental results showed high language-transfer capabilities for questions\nthat LMs could easily solve, but lower transfer capabilities for questions\nrequiring deep knowledge or commonsense. This highlights the necessity of\nlanguage-specific datasets for evaluation and training. Finally, our method\ndemonstrated that multilingual LMs could create QA including language-specific\nknowledge, significantly reducing the dataset creation cost compared to manual\ncreation. The datasets are available at\nhttps://huggingface.co/datasets/yusuke1997/mCSQA.\n","authors":["Yusuke Sakai","Hidetaka Kamigaito","Taro Watanabe"],"pdf_url":"https://arxiv.org/pdf/2406.04215v1.pdf","comment":"Accepted at Findings of ACL 2024"},{"id":"http://arxiv.org/abs/2403.13872v2","updated":"2024-06-06T16:13:41Z","published":"2024-03-20T15:27:17Z","title":"Spatial-Temporal Graph Representation Learning for Tactical Networks\n  Future State Prediction","summary":"  Resource allocation in tactical ad-hoc networks presents unique challenges\ndue to their dynamic and multi-hop nature. Accurate prediction of future\nnetwork connectivity is essential for effective resource allocation in such\nenvironments. In this paper, we introduce the Spatial-Temporal Graph\nEncoder-Decoder (STGED) framework for Tactical Communication Networks that\nleverages both spatial and temporal features of network states to learn latent\ntactical behaviors effectively. STGED hierarchically utilizes graph-based\nattention mechanism to spatially encode a series of communication network\nstates, leverages a recurrent neural network to temporally encode the evolution\nof states, and a fully-connected feed-forward network to decode the\nconnectivity in the future state. Through extensive experiments, we demonstrate\nthat STGED consistently outperforms baseline models by large margins across\ndifferent time-steps input, achieving an accuracy of up to 99.2\\% for the\nfuture state prediction task of tactical communication networks.\n","authors":["Liu Junhua","Albrethsen Justin","Goh Lincoln","Yau David","Lim Kwan Hui"],"pdf_url":"https://arxiv.org/pdf/2403.13872v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.03437v2","updated":"2024-06-06T16:13:41Z","published":"2024-06-05T16:33:30Z","title":"Transfer Learning for Latent Variable Network Models","summary":"  We study transfer learning for estimation in latent variable network models.\nIn our setting, the conditional edge probability matrices given the latent\nvariables are represented by $P$ for the source and $Q$ for the target. We wish\nto estimate $Q$ given two kinds of data: (1) edge data from a subgraph induced\nby an $o(1)$ fraction of the nodes of $Q$, and (2) edge data from all of $P$.\nIf the source $P$ has no relation to the target $Q$, the estimation error must\nbe $\\Omega(1)$. However, we show that if the latent variables are shared, then\nvanishing error is possible. We give an efficient algorithm that utilizes the\nordering of a suitably defined graph distance. Our algorithm achieves $o(1)$\nerror and does not assume a parametric form on the source or target networks.\nNext, for the specific case of Stochastic Block Models we prove a minimax lower\nbound and show that a simple algorithm achieves this rate. Finally, we\nempirically demonstrate our algorithm's use on real-world and simulated graph\ntransfer problems.\n","authors":["Akhil Jalan","Arya Mazumdar","Soumendu Sundar Mukherjee","Purnamrita Sarkar"],"pdf_url":"https://arxiv.org/pdf/2406.03437v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.02290v2","updated":"2024-06-06T16:09:31Z","published":"2024-06-04T13:05:47Z","title":"A Study of Optimizations for Fine-tuning Large Language Models","summary":"  Fine-tuning large language models is a popular choice among users trying to\nadapt them for specific applications. However, fine-tuning these models is a\ndemanding task because the user has to examine several factors, such as\nresource budget, runtime, model size and context length among others. A\nspecific challenge is that fine-tuning is memory intensive, imposing\nconstraints on the required hardware memory and context length of training data\nthat can be handled. In this work, we share a detailed study on a variety of\nfine-tuning optimizations across different fine-tuning scenarios. In\nparticular, we assess Gradient Checkpointing, Low-Rank Adaptation, DeepSpeed's\nZero Redundancy Optimizer and FlashAttention. With a focus on memory and\nruntime, we examine the impact of different optimization combinations on GPU\nmemory usage and execution runtime during fine-tuning phase. We provide our\nrecommendation on the best default optimization for balancing memory and\nruntime across diverse model sizes. We share effective strategies for\nfine-tuning very large models with tens or hundreds of billions of parameters\nand enabling large context lengths during fine-tuning. Furthermore, we propose\nthe appropriate optimization mixtures for fine-tuning under GPU resource\nlimitations.\n","authors":["Arjun Singh","Nikhil Pandey","Anup Shirgaonkar","Pavan Manoj","Vijay Aski"],"pdf_url":"https://arxiv.org/pdf/2406.02290v2.pdf","comment":"10 pages, 4 figures. Revised text for clarity, updated references"},{"id":"http://arxiv.org/abs/2406.04208v1","updated":"2024-06-06T16:05:45Z","published":"2024-06-06T16:05:45Z","title":"Aligning Agents like Large Language Models","summary":"  Training agents to behave as desired in complex 3D environments from\nhigh-dimensional sensory information is challenging. Imitation learning from\ndiverse human behavior provides a scalable approach for training an agent with\na sensible behavioral prior, but such an agent may not perform the specific\nbehaviors of interest when deployed. To address this issue, we draw an analogy\nbetween the undesirable behaviors of imitation learning agents and the\nunhelpful responses of unaligned large language models (LLMs). We then\ninvestigate how the procedure for aligning LLMs can be applied to aligning\nagents in a 3D environment from pixels. For our analysis, we utilize an\nacademically illustrative part of a modern console game in which the human\nbehavior distribution is multi-modal, but we want our agent to imitate a single\nmode of this behavior. We demonstrate that we can align our agent to\nconsistently perform the desired mode, while providing insights and advice for\nsuccessfully applying this approach to training agents. Project webpage at\nhttps://adamjelley.github.io/aligning-agents-like-llms .\n","authors":["Adam Jelley","Yuhan Cao","Dave Bignell","Sam Devlin","Tabish Rashid"],"pdf_url":"https://arxiv.org/pdf/2406.04208v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.09109v2","updated":"2024-06-06T16:01:08Z","published":"2023-11-15T16:56:49Z","title":"Does Pre-trained Language Model Actually Infer Unseen Links in Knowledge\n  Graph Completion?","summary":"  Knowledge graphs (KGs) consist of links that describe relationships between\nentities. Due to the difficulty of manually enumerating all relationships\nbetween entities, automatically completing them is essential for KGs. Knowledge\nGraph Completion (KGC) is a task that infers unseen relationships between\nentities in a KG. Traditional embedding-based KGC methods, such as RESCAL,\nTransE, DistMult, ComplEx, RotatE, HAKE, HousE, etc., infer missing links using\nonly the knowledge from training data. In contrast, the recent Pre-trained\nLanguage Model (PLM)-based KGC utilizes knowledge obtained during pre-training.\nTherefore, PLM-based KGC can estimate missing links between entities by reusing\nmemorized knowledge from pre-training without inference. This approach is\nproblematic because building KGC models aims to infer unseen links between\nentities. However, conventional evaluations in KGC do not consider inference\nand memorization abilities separately. Thus, a PLM-based KGC method, which\nachieves high performance in current KGC evaluations, may be ineffective in\npractical applications. To address this issue, we analyze whether PLM-based KGC\nmethods make inferences or merely access memorized knowledge. For this purpose,\nwe propose a method for constructing synthetic datasets specified in this\nanalysis and conclude that PLMs acquire the inference abilities required for\nKGC through pre-training, even though the performance improvements mostly come\nfrom textual information of entities and relations.\n","authors":["Yusuke Sakai","Hidetaka Kamigaito","Katsuhiko Hayashi","Taro Watanabe"],"pdf_url":"https://arxiv.org/pdf/2311.09109v2.pdf","comment":"Accepted at NAACL 2024 main oral, 15 pages, 10 figures"},{"id":"http://arxiv.org/abs/2406.04201v1","updated":"2024-06-06T15:59:17Z","published":"2024-06-06T15:59:17Z","title":"Towards Principled Superhuman AI for Multiplayer Symmetric Games","summary":"  Multiplayer games, when the number of players exceeds two, present unique\nchallenges that fundamentally distinguish them from the extensively studied\ntwo-player zero-sum games. These challenges arise from the non-uniqueness of\nequilibria and the risk of agents performing highly suboptimally when adopting\nequilibrium strategies. While a line of recent works developed learning systems\nsuccessfully achieving human-level or even superhuman performance in popular\nmultiplayer games such as Mahjong, Poker, and Diplomacy, two critical questions\nremain unaddressed: (1) What is the correct solution concept that AI agents\nshould find? and (2) What is the general algorithmic framework that provably\nsolves all games within this class? This paper takes the first step towards\nsolving these unique challenges of multiplayer games by provably addressing\nboth questions in multiplayer symmetric normal-form games. We also demonstrate\nthat many meta-algorithms developed in prior practical systems for multiplayer\ngames can fail to achieve even the basic goal of obtaining agent's equal share\nof the total reward.\n","authors":["Jiawei Ge","Yuanhao Wang","Wenzhe Li","Chi Jin"],"pdf_url":"https://arxiv.org/pdf/2406.04201v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.10727v2","updated":"2024-06-06T15:52:17Z","published":"2024-02-16T14:40:22Z","title":"Predictive Uncertainty Quantification via Risk Decompositions for\n  Strictly Proper Scoring Rules","summary":"  Uncertainty quantification in predictive modeling often relies on ad hoc\nmethods as there is no universally accepted formal framework for that. This\npaper introduces a theoretical approach to understanding uncertainty through\nstatistical risks, distinguishing between aleatoric (data-related) and\nepistemic (model-related) uncertainties. We explain how to split pointwise risk\ninto Bayes risk and excess risk. In particular, we show that excess risk,\nrelated to epistemic uncertainty, aligns with Bregman divergences. To turn\nconsidered risk measures into actual uncertainty estimates, we suggest using\nthe Bayesian approach by approximating the risks with the help of posterior\ndistributions. We tested our method on image datasets, evaluating its\nperformance in detecting out-of-distribution and misclassified data using the\nAUROC metric. Our results confirm the effectiveness of the considered approach\nand offer practical guidance for estimating uncertainty in real-world\napplications.\n","authors":["Nikita Kotelevskii","Maxim Panov"],"pdf_url":"https://arxiv.org/pdf/2402.10727v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.05835v2","updated":"2024-06-06T15:51:35Z","published":"2024-04-08T20:02:19Z","title":"Parameter-Adaptive Approximate MPC: Tuning Neural-Network Controllers\n  without Retraining","summary":"  Model Predictive Control (MPC) is a method to control nonlinear systems with\nguaranteed stability and constraint satisfaction but suffers from high\ncomputation times. Approximate MPC (AMPC) with neural networks (NNs) has\nemerged to address this limitation, enabling deployment on resource-constrained\nembedded systems. However, when tuning AMPCs for real-world systems, large\ndatasets need to be regenerated and the NN needs to be retrained at every\ntuning step. This work introduces a novel, parameter-adaptive AMPC architecture\ncapable of online tuning without recomputing large datasets and retraining. By\nincorporating local sensitivities of nonlinear programs, the proposed method\nnot only mimics optimal MPC inputs but also adjusts to known changes in\nphysical parameters of the model using linear predictions while still\nguaranteeing stability. We showcase the effectiveness of parameter-adaptive\nAMPC by controlling the swing-ups of two different real cartpole systems with a\nseverely resource-constrained microcontroller (MCU). We use the same NN across\nboth system instances that have different parameters. This work not only\nrepresents the first experimental demonstration of AMPC for fast-moving systems\non low-cost MCUs to the best of our knowledge, but also showcases\ngeneralization across system instances and variations through our\nparameter-adaptation method. Taken together, these contributions represent a\nmarked step toward the practical application of AMPC in real-world systems.\n","authors":["Henrik Hose","Alexander Gräfe","Sebastian Trimpe"],"pdf_url":"https://arxiv.org/pdf/2404.05835v2.pdf","comment":"Accepted to L4DC 2024"},{"id":"http://arxiv.org/abs/2402.14490v3","updated":"2024-06-06T15:51:21Z","published":"2024-02-22T12:27:38Z","title":"Imbalanced Data Clustering using Equilibrium K-Means","summary":"  Centroid-based clustering algorithms, such as hard K-means (HKM) and fuzzy\nK-means (FKM), have suffered from learning bias towards large clusters. Their\ncentroids tend to be crowded in large clusters, compromising performance when\nthe true underlying data groups vary in size (i.e., imbalanced data). To\naddress this, we propose a new clustering objective function based on the\nBoltzmann operator, which introduces a novel centroid repulsion mechanism,\nwhere data points surrounding the centroids repel other centroids. Larger\nclusters repel more, effectively mitigating the issue of large cluster learning\nbias. The proposed new algorithm, called equilibrium K-means (EKM), is simple,\nalternating between two steps; resource-saving, with the same time and space\ncomplexity as FKM; and scalable to large datasets via batch learning. We\nsubstantially evaluate the performance of EKM on synthetic and real-world\ndatasets. The results show that EKM performs competitively on balanced data and\nsignificantly outperforms benchmark algorithms on imbalanced data. Deep\nclustering experiments demonstrate that EKM is a better alternative to HKM and\nFKM on imbalanced data as more discriminative representation can be obtained.\nAdditionally, we reformulate HKM, FKM, and EKM in a general form of gradient\ndescent and demonstrate how this general form facilitates a uniform study of\nK-means algorithms.\n","authors":["Yudong He"],"pdf_url":"https://arxiv.org/pdf/2402.14490v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2112.14734v2","updated":"2024-06-06T15:50:45Z","published":"2021-12-29T18:42:15Z","title":"Sequential memory improves sample and memory efficiency in Episodic\n  Control","summary":"  State of the art deep reinforcement learning algorithms are sample\ninefficient due to the large number of episodes they require to achieve\nasymptotic performance. Episodic Reinforcement Learning (ERL) algorithms,\ninspired by the mammalian hippocampus, typically use extended memory systems to\nbootstrap learning from past events to overcome this sample-inefficiency\nproblem. However, such memory augmentations are often used as mere buffers,\nfrom which isolated past experiences are drawn to learn from in an offline\nfashion (e.g., replay). Here, we demonstrate that including a bias in the\nacquired memory content derived from the order of episodic sampling improves\nboth the sample and memory efficiency of an episodic control algorithm. We test\nour Sequential Episodic Control (SEC) model in a foraging task to show that\nstoring and using integrated episodes as event sequences leads to faster\nlearning with fewer memory requirements as opposed to a standard ERL benchmark,\nModel-Free Episodic Control, that buffers isolated events only. We also study\nthe effect of memory constraints and forgetting on the sequential and\nnon-sequential version of the SEC algorithm. Furthermore, we discuss how a\nhippocampal-like fast memory system could bootstrap slow cortical and\nsubcortical learning subserving habit formation in the mammalian brain.\n","authors":["Ismael T. Freire","Adrián F. Amil","Paul F. M. J. Verschure"],"pdf_url":"https://arxiv.org/pdf/2112.14734v2.pdf","comment":"21 pages, 8 figures"},{"id":"http://arxiv.org/abs/2405.11684v2","updated":"2024-06-06T15:41:38Z","published":"2024-05-19T22:04:11Z","title":"Learning Regularities from Data using Spiking Functions: A Theory","summary":"  Deep neural networks trained in an end-to-end manner are proven to be\nefficient in a wide range of machine learning tasks. However, there is one\ndrawback of end-to-end learning: The learned features and information are\nimplicitly represented in neural network parameters, which cannot be used as\nregularities, concepts or knowledge to explicitly represent the data\nprobability distribution. To resolve this issue, we propose in this paper a new\nmachine learning theory, which defines in mathematics what are regularities.\nBriefly, regularities are concise representations of the non-random features,\nor 'non-randomness' in the data probability distribution. Combining this with\ninformation theory, we claim that regularities can also be regarded as a small\namount of information encoding a large amount of information. Our theory is\nbased on spiking functions. That is, if a function can react to, or spike on\nspecific data samples more frequently than random noise inputs, we say that\nsuch a function discovers non-randomness from the data distribution. Also, we\nsay that the discovered non-randomness is encoded into regularities if the\nfunction is simple enough. Our theory also discusses applying multiple spiking\nfunctions to the same data distribution. In this process, we claim that the\n'best' regularities, or the optimal spiking functions, are those who can\ncapture the largest amount of information from the data distribution, and then\nencode the captured information in the most concise way. Theorems and\nhypotheses are provided to describe in mathematics what are 'best' regularities\nand optimal spiking functions. Finally, we propose a machine learning approach,\nwhich can potentially obtain the optimal spiking functions regarding the given\ndataset in practice.\n","authors":["Canlin Zhang","Xiuwen Liu"],"pdf_url":"https://arxiv.org/pdf/2405.11684v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.04184v1","updated":"2024-06-06T15:40:29Z","published":"2024-06-06T15:40:29Z","title":"Shield Synthesis for LTL Modulo Theories","summary":"  In recent years, Machine Learning (ML) models have achieved remarkable\nsuccess in various domains. However, these models also tend to demonstrate\nunsafe behaviors, precluding their deployment in safety-critical systems. To\ncope with this issue, ample research focuses on developing methods that\nguarantee the safe behaviour of a given ML model. A prominent example is\nshielding which incorporates an external component (a \"shield\") that blocks\nunwanted behavior. Despite significant progress, shielding suffers from a main\nsetback: it is currently geared towards properties encoded solely in\npropositional logics (e.g., LTL) and is unsuitable for richer logics. This, in\nturn, limits the widespread applicability of shielding in many real-world\nsystems. In this work, we address this gap, and extend shielding to LTL modulo\ntheories, by building upon recent advances in reactive synthesis modulo\ntheories. This allowed us to develop a novel approach for generating shields\nconforming to complex safety specifications in these more expressive, logics.\nWe evaluated our shields and demonstrate their ability to handle rich data with\ntemporal dynamics. To the best of our knowledge, this is the first approach for\nsynthesizing shields for such expressivity.\n","authors":["Andoni Rodriguez","Guy Amir","Davide Corsi","Cesar Sanchez","Guy Katz"],"pdf_url":"https://arxiv.org/pdf/2406.04184v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.20250v2","updated":"2024-06-06T15:31:08Z","published":"2024-05-30T17:02:18Z","title":"Entropy annealing for policy mirror descent in continuous time and space","summary":"  Entropy regularization has been extensively used in policy optimization\nalgorithms to regularize the optimization landscape and accelerate convergence;\nhowever, it comes at the cost of introducing an additional regularization bias.\nThis work quantifies the impact of entropy regularization on the convergence of\npolicy gradient methods for stochastic exit time control problems. We analyze a\ncontinuous-time policy mirror descent dynamics, which updates the policy based\non the gradient of an entropy-regularized value function and adjusts the\nstrength of entropy regularization as the algorithm progresses. We prove that\nwith a fixed entropy level, the dynamics converges exponentially to the optimal\nsolution of the regularized problem. We further show that when the entropy\nlevel decays at suitable polynomial rates, the annealed flow converges to the\nsolution of the unregularized problem at a rate of $\\mathcal O(1/S)$ for\ndiscrete action spaces and, under suitable conditions, at a rate of $\\mathcal\nO(1/\\sqrt{S})$ for general action spaces, with $S$ being the gradient flow\ntime. This paper explains how entropy regularization improves policy\noptimization, even with the true gradient, from the perspective of convergence\nrate.\n","authors":["Deven Sethi","David Šiška","Yufei Zhang"],"pdf_url":"https://arxiv.org/pdf/2405.20250v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.04170v1","updated":"2024-06-06T15:27:52Z","published":"2024-06-06T15:27:52Z","title":"Element-wise Multiplication Based Physics-informed Neural Networks","summary":"  As a promising framework for resolving partial differential equations (PDEs),\nphysics-informed neural networks (PINNs) have received widespread attention\nfrom industrial and scientific fields. However, lack of expressive ability and\ninitialization pathology issues are found to prevent the application of PINNs\nin complex PDEs. In this work, we propose Element-wise Multiplication Based\nPhysics-informed Neural Networks (EM-PINNs) to resolve these issues. The\nelement-wise multiplication operation is adopted to transform features into\nhigh-dimensional, non-linear spaces, which effectively enhance the expressive\ncapability of PINNs. Benefiting from element-wise multiplication operation,\nEM-PINNs can eliminate the initialization pathologies of PINNs. The proposed\nstructure is verified on various benchmarks. The results show that EM-PINNs\nhave strong expressive ability.\n","authors":["Feilong Jiang","Xiaonan Hou","Min Xia"],"pdf_url":"https://arxiv.org/pdf/2406.04170v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.00329v2","updated":"2024-06-06T15:27:12Z","published":"2024-06-01T07:08:45Z","title":"Whole Heart 3D+T Representation Learning Through Sparse 2D Cardiac MR\n  Images","summary":"  Cardiac Magnetic Resonance (CMR) imaging serves as the gold-standard for\nevaluating cardiac morphology and function. Typically, a multi-view CMR stack,\ncovering short-axis (SA) and 2/3/4-chamber long-axis (LA) views, is acquired\nfor a thorough cardiac assessment. However, efficiently streamlining the\ncomplex, high-dimensional 3D+T CMR data and distilling compact, coherent\nrepresentation remains a challenge. In this work, we introduce a whole-heart\nself-supervised learning framework that utilizes masked imaging modeling to\nautomatically uncover the correlations between spatial and temporal patches\nthroughout the cardiac stacks. This process facilitates the generation of\nmeaningful and well-clustered heart representations without relying on the\ntraditionally required, and often costly, labeled data. The learned heart\nrepresentation can be directly used for various downstream tasks. Furthermore,\nour method demonstrates remarkable robustness, ensuring consistent\nrepresentations even when certain CMR planes are missing/flawed. We train our\nmodel on 14,000 unlabeled CMR data from UK BioBank and evaluate it on 1,000\nannotated data. The proposed method demonstrates superior performance to\nbaselines in tasks that demand comprehensive 3D+T cardiac information, e.g.\ncardiac phenotype (ejection fraction and ventricle volume) prediction and\nmulti-plane/multi-frame CMR segmentation, highlighting its effectiveness in\nextracting comprehensive cardiac features that are both anatomically and\npathologically relevant.\n","authors":["Yundi Zhang","Chen Chen","Suprosanna Shit","Sophie Starck","Daniel Rueckert","Jiazhen Pan"],"pdf_url":"https://arxiv.org/pdf/2406.00329v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.03668v2","updated":"2024-06-06T15:24:16Z","published":"2023-12-06T18:34:42Z","title":"Integrating Pre-Trained Speech and Language Models for End-to-End Speech\n  Recognition","summary":"  Advances in machine learning have made it possible to perform various text\nand speech processing tasks, such as automatic speech recognition (ASR), in an\nend-to-end (E2E) manner. E2E approaches utilizing pre-trained models are\ngaining attention for conserving training data and resources. However, most of\ntheir applications in ASR involve only one of either a pre-trained speech or a\nlanguage model. This paper proposes integrating a pre-trained speech\nrepresentation model and a large language model (LLM) for E2E ASR. The proposed\nmodel enables the optimization of the entire ASR process, including acoustic\nfeature extraction and acoustic and language modeling, by combining pre-trained\nmodels with a bridge network and also enables the application of remarkable\ndevelopments in LLM utilization, such as parameter-efficient domain adaptation\nand inference optimization. Experimental results demonstrate that the proposed\nmodel achieves a performance comparable to that of modern E2E ASR models by\nutilizing powerful pre-training models with the proposed integrated approach.\n","authors":["Yukiya Hono","Koh Mitsuda","Tianyu Zhao","Kentaro Mitsui","Toshiaki Wakatsuki","Kei Sawada"],"pdf_url":"https://arxiv.org/pdf/2312.03668v2.pdf","comment":"17 pages, 4 figures, 9 tables, accepted for Findings of ACL 2024. The\n  model is available at https://huggingface.co/rinna/nue-asr"},{"id":"http://arxiv.org/abs/2406.04165v1","updated":"2024-06-06T15:22:33Z","published":"2024-06-06T15:22:33Z","title":"Repurposing Language Models into Embedding Models: Finding the\n  Compute-Optimal Recipe","summary":"  Text embeddings are essential for many tasks, such as document retrieval,\nclustering, and semantic similarity assessment. In this paper, we study how to\ncontrastively train text embedding models in a compute-optimal fashion, given a\nsuite of pre-trained decoder-only language models. Our innovation is an\nalgorithm that produces optimal configurations of model sizes, data quantities,\nand fine-tuning methods for text-embedding models at different computational\nbudget levels. The resulting recipe, which we obtain through extensive\nexperiments, can be used by practitioners to make informed design choices for\ntheir embedding models. Specifically, our findings suggest that full\nfine-tuning and low-rank adaptation fine-tuning produce optimal models at lower\nand higher computational budgets respectively.\n","authors":["Alicja Ziarko","Albert Q. Jiang","Bartosz Piotrowski","Wenda Li","Mateja Jamnik","Piotr Miłoś"],"pdf_url":"https://arxiv.org/pdf/2406.04165v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.05847v2","updated":"2024-06-06T15:22:31Z","published":"2024-05-09T15:34:15Z","title":"Learned feature representations are biased by complexity, learning\n  order, position, and more","summary":"  Representation learning, and interpreting learned representations, are key\nareas of focus in machine learning and neuroscience. Both fields generally use\nrepresentations as a means to understand or improve a system's computations. In\nthis work, however, we explore surprising dissociations between representation\nand computation that may pose challenges for such efforts. We create datasets\nin which we attempt to match the computational role that different features\nplay, while manipulating other properties of the features or the data. We train\nvarious deep learning architectures to compute these multiple abstract features\nabout their inputs. We find that their learned feature representations are\nsystematically biased towards representing some features more strongly than\nothers, depending upon extraneous properties such as feature complexity, the\norder in which features are learned, and the distribution of features over the\ninputs. For example, features that are simpler to compute or learned first tend\nto be represented more strongly and densely than features that are more complex\nor learned later, even if all features are learned equally well. We also\nexplore how these biases are affected by architectures, optimizers, and\ntraining regimes (e.g., in transformers, features decoded earlier in the output\nsequence also tend to be represented more strongly). Our results help to\ncharacterize the inductive biases of gradient-based representation learning.\nThese results also highlight a key challenge for interpretability $-$ or for\ncomparing the representations of models and brains $-$ disentangling extraneous\nbiases from the computationally important aspects of a system's internal\nrepresentations.\n","authors":["Andrew Kyle Lampinen","Stephanie C. Y. Chan","Katherine Hermann"],"pdf_url":"https://arxiv.org/pdf/2405.05847v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.04163v1","updated":"2024-06-06T15:20:37Z","published":"2024-06-06T15:20:37Z","title":"Essentially Sharp Estimates on the Entropy Regularization Error in\n  Discrete Discounted Markov Decision Processes","summary":"  We study the error introduced by entropy regularization of infinite-horizon\ndiscrete discounted Markov decision processes. We show that this error\ndecreases exponentially in the inverse regularization strength both in a\nweighted KL-divergence and in value with a problem-specific exponent. We\nprovide a lower bound matching our upper bound up to a polynomial factor. Our\nproof relies on the correspondence of the solutions of entropy-regularized\nMarkov decision processes with gradient flows of the unregularized reward with\nrespect to a Riemannian metric common in natural policy gradient methods.\nFurther, this correspondence allows us to identify the limit of the gradient\nflow as the generalized maximum entropy optimal policy, thereby characterizing\nthe implicit bias of the Kakade gradient flow which corresponds to a\ntime-continuous version of the natural policy gradient method. We use this to\nshow that for entropy-regularized natural policy gradient methods the overall\nerror decays exponentially in the square root of the number of iterations\nimproving existing sublinear guarantees.\n","authors":["Johannes Müller","Semih Cayci"],"pdf_url":"https://arxiv.org/pdf/2406.04163v1.pdf","comment":"25 pages, 1 figure"},{"id":"http://arxiv.org/abs/2406.04156v1","updated":"2024-06-06T15:17:51Z","published":"2024-06-06T15:17:51Z","title":"Pointer-Guided Pre-Training: Infusing Large Language Models with\n  Paragraph-Level Contextual Awareness","summary":"  We introduce \"pointer-guided segment ordering\" (SO), a novel pre-training\ntechnique aimed at enhancing the contextual understanding of paragraph-level\ntext representations in large language models. Our methodology leverages a\nself-attention-driven pointer network to restore the original sequence of\nshuffled text segments, addressing the challenge of capturing the structural\ncoherence and contextual dependencies within documents. This pre-training\napproach is complemented by a fine-tuning methodology that incorporates dynamic\nsampling, augmenting the diversity of training instances and improving sample\nefficiency for various downstream applications. We evaluate our method on a\ndiverse set of datasets, demonstrating its efficacy in tasks requiring\nsequential text classification across scientific literature and financial\nreporting domains. Our experiments show that pointer-guided pre-training\nsignificantly enhances the model's ability to understand complex document\nstructures, leading to state-of-the-art performance in downstream\nclassification tasks.\n","authors":["Lars Hillebrand","Prabhupad Pradhan","Christian Bauckhage","Rafet Sifa"],"pdf_url":"https://arxiv.org/pdf/2406.04156v1.pdf","comment":"17 pages, 3 figures, 5 tables, accepted at ECML-PKDD 2024"},{"id":"http://arxiv.org/abs/2406.04155v1","updated":"2024-06-06T15:17:33Z","published":"2024-06-06T15:17:33Z","title":"Improving Physics-Augmented Continuum Neural Radiance Field-Based\n  Geometry-Agnostic System Identification with Lagrangian Particle Optimization","summary":"  Geometry-agnostic system identification is a technique for identifying the\ngeometry and physical properties of an object from video sequences without any\ngeometric assumptions. Recently, physics-augmented continuum neural radiance\nfields (PAC-NeRF) has demonstrated promising results for this technique by\nutilizing a hybrid Eulerian-Lagrangian representation, in which the geometry is\nrepresented by the Eulerian grid representations of NeRF, the physics is\ndescribed by a material point method (MPM), and they are connected via\nLagrangian particles. However, a notable limitation of PAC-NeRF is that its\nperformance is sensitive to the learning of the geometry from the first frames\nowing to its two-step optimization. First, the grid representations are\noptimized with the first frames of video sequences, and then the physical\nproperties are optimized through video sequences utilizing the fixed\nfirst-frame grid representations. This limitation can be critical when learning\nof the geometric structure is difficult, for example, in a few-shot (sparse\nview) setting. To overcome this limitation, we propose Lagrangian particle\noptimization (LPO), in which the positions and features of particles are\noptimized through video sequences in Lagrangian space. This method allows for\nthe optimization of the geometric structure across the entire video sequence\nwithin the physical constraints imposed by the MPM. The experimental results\ndemonstrate that the LPO is useful for geometric correction and physical\nidentification in sparse-view settings.\n","authors":["Takuhiro Kaneko"],"pdf_url":"https://arxiv.org/pdf/2406.04155v1.pdf","comment":"Accepted to CVPR 2024. Project page:\n  https://www.kecl.ntt.co.jp/people/kaneko.takuhiro/projects/lpo/"},{"id":"http://arxiv.org/abs/2406.04153v1","updated":"2024-06-06T15:17:00Z","published":"2024-06-06T15:17:00Z","title":"Learned Feature Importance Scores for Automated Feature Engineering","summary":"  Feature engineering has demonstrated substantial utility for many machine\nlearning workflows, such as in the small data regime or when distribution\nshifts are severe. Thus automating this capability can relieve much manual\neffort and improve model performance. Towards this, we propose AutoMAN, or\nAutomated Mask-based Feature Engineering, an automated feature engineering\nframework that achieves high accuracy, low latency, and can be extended to\nheterogeneous and time-varying data. AutoMAN is based on effectively exploring\nthe candidate transforms space, without explicitly manifesting transformed\nfeatures. This is achieved by learning feature importance masks, which can be\nextended to support other modalities such as time series. AutoMAN learns\nfeature transform importance end-to-end, incorporating a dataset's task target\ndirectly into feature engineering, resulting in state-of-the-art performance\nwith significantly lower latency compared to alternatives.\n","authors":["Yihe Dong","Sercan Arik","Nathanael Yoder","Tomas Pfister"],"pdf_url":"https://arxiv.org/pdf/2406.04153v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.04148v1","updated":"2024-06-06T15:13:48Z","published":"2024-06-06T15:13:48Z","title":"Fast Redescription Mining Using Locality-Sensitive Hashing","summary":"  Redescription mining is a data analysis technique that has found applications\nin diverse fields. The most used redescription mining approaches involve two\nphases: finding matching pairs among data attributes and extending the pairs.\nThis process is relatively efficient when the number of attributes remains\nlimited and when the attributes are Boolean, but becomes almost intractable\nwhen the data consist of many numerical attributes. In this paper, we present\nnew algorithms that perform the matching and extension orders of magnitude\nfaster than the existing approaches. Our algorithms are based on\nlocality-sensitive hashing with a tailored approach to handle the\ndiscretisation of numerical attributes as used in redescription mining.\n","authors":["Maiju Karjalainen","Esther Galbrun","Pauli Miettinen"],"pdf_url":"https://arxiv.org/pdf/2406.04148v1.pdf","comment":"20 pages, 4 figures, to appear at ECML-PKDD 2024"},{"id":"http://arxiv.org/abs/2406.03145v2","updated":"2024-06-06T15:12:55Z","published":"2024-06-05T11:00:27Z","title":"E(n) Equivariant Message Passing Cellular Networks","summary":"  This paper introduces E(n) Equivariant Message Passing Cellular Networks\n(EMPCNs), an extension of E(n) Equivariant Graph Neural Networks to\nCW-complexes. Our approach addresses two aspects of geometric message passing\nnetworks: 1) enhancing their expressiveness by incorporating arbitrary cells,\nand 2) achieving this in a computationally efficient way with a decoupled\nEMPCNs technique. We demonstrate that EMPCNs achieve close to state-of-the-art\nperformance on multiple tasks without the need for steerability, including\nmany-body predictions and motion capture. Moreover, ablation studies confirm\nthat decoupled EMPCNs exhibit stronger generalization capabilities than their\nnon-topologically informed counterparts. These findings show that EMPCNs can be\nused as a scalable and expressive framework for higher-order message passing in\ngeometric and topological graphs\n","authors":["Veljko Kovač","Erik J. Bekkers","Pietro Liò","Floor Eijkelboom"],"pdf_url":"https://arxiv.org/pdf/2406.03145v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.15082v2","updated":"2024-06-06T15:11:29Z","published":"2024-02-23T03:59:18Z","title":"PEMT: Multi-Task Correlation Guided Mixture-of-Experts Enables\n  Parameter-Efficient Transfer Learning","summary":"  Parameter-efficient fine-tuning (PEFT) has emerged as an effective method for\nadapting pre-trained language models to various tasks efficiently. Recently,\nthere has been a growing interest in transferring knowledge from one or\nmultiple tasks to the downstream target task to achieve performance\nimprovements. However, current approaches typically either train adapters on\nindividual tasks or distill shared knowledge from source tasks, failing to\nfully exploit task-specific knowledge and the correlation between source and\ntarget tasks. To overcome these limitations, we propose PEMT, a novel\nparameter-efficient fine-tuning framework based on multi-task transfer\nlearning. PEMT extends the mixture-of-experts (MoE) framework to capture the\ntransferable knowledge as a weighted combination of adapters trained on source\ntasks. These weights are determined by a gated unit, measuring the correlation\nbetween the target and each source task using task description prompt vectors.\nTo fully exploit the task-specific knowledge, we also propose the Task Sparsity\nLoss to improve the sparsity of the gated unit. We conduct experiments on a\nbroad range of tasks over 17 datasets. The experimental results demonstrate our\nPEMT yields stable improvements over full fine-tuning, and state-of-the-art\nPEFT and knowledge transferring methods on various tasks. The results highlight\nthe effectiveness of our method which is capable of sufficiently exploiting the\nknowledge and correlation features across multiple tasks.\n","authors":["Zhisheng Lin","Han Fu","Chenghao Liu","Zhuo Li","Jianling Sun"],"pdf_url":"https://arxiv.org/pdf/2402.15082v2.pdf","comment":"Accepted to Findings of the ACL 2024"},{"id":"http://arxiv.org/abs/2309.06054v3","updated":"2024-06-06T15:09:52Z","published":"2023-09-12T08:45:25Z","title":"Breaking through the learning plateaus of in-context learning in\n  Transformer","summary":"  In-context learning, i.e., learning from context examples, is an impressive\nability of Transformer. Training Transformers to possess this in-context\nlearning skill is computationally intensive due to the occurrence of learning\nplateaus, which are periods within the training process where there is minimal\nor no enhancement in the model's in-context learning capability. To study the\nmechanism behind the learning plateaus, we conceptually seperate a component\nwithin the model's internal representation that is exclusively affected by the\nmodel's weights. We call this the \"weights component\", and the remainder is\nidentified as the \"context component\". By conducting meticulous and controlled\nexperiments on synthetic tasks, we note that the persistence of learning\nplateaus correlates with compromised functionality of the weights component.\nRecognizing the impaired performance of the weights component as a fundamental\nbehavior drives learning plateaus, we have developed three strategies to\nexpedite the learning of Transformers. The effectiveness of these strategies is\nfurther confirmed in natural language processing tasks. In conclusion, our\nresearch demonstrates the feasibility of cultivating a powerful in-context\nlearning ability within AI systems in an eco-friendly manner.\n","authors":["Jingwen Fu","Tao Yang","Yuwang Wang","Yan Lu","Nanning Zheng"],"pdf_url":"https://arxiv.org/pdf/2309.06054v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.04144v1","updated":"2024-06-06T15:08:41Z","published":"2024-06-06T15:08:41Z","title":"Redundancy-aware Action Spaces for Robot Learning","summary":"  Joint space and task space control are the two dominant action modes for\ncontrolling robot arms within the robot learning literature. Actions in joint\nspace provide precise control over the robot's pose, but tend to suffer from\ninefficient training; actions in task space boast data-efficient training but\nsacrifice the ability to perform tasks in confined spaces due to limited\ncontrol over the full joint configuration. This work analyses the criteria for\ndesigning action spaces for robot manipulation and introduces ER (End-effector\nRedundancy), a novel action space formulation that, by addressing the\nredundancies present in the manipulator, aims to combine the advantages of both\njoint and task spaces, offering fine-grained comprehensive control with\noveractuated robot arms whilst achieving highly efficient robot learning. We\npresent two implementations of ER, ERAngle (ERA) and ERJoint (ERJ), and we show\nthat ERJ in particular demonstrates superior performance across multiple\nsettings, especially when precise control over the robot configuration is\nrequired. We validate our results both in simulated and real robotic\nenvironments.\n","authors":["Pietro Mazzaglia","Nicholas Backshall","Xiao Ma","Stephen James"],"pdf_url":"https://arxiv.org/pdf/2406.04144v1.pdf","comment":"Published in the RA-L journal"},{"id":"http://arxiv.org/abs/2406.04143v1","updated":"2024-06-06T15:08:16Z","published":"2024-06-06T15:08:16Z","title":"Do Language Models Understand Morality? Towards a Robust Detection of\n  Moral Content","summary":"  The task of detecting moral values in text has significant implications in\nvarious fields, including natural language processing, social sciences, and\nethical decision-making. Previously proposed supervised models often suffer\nfrom overfitting, leading to hyper-specialized moral classifiers that struggle\nto perform well on data from different domains. To address this issue, we\nintroduce novel systems that leverage abstract concepts and common-sense\nknowledge acquired from Large Language Models and Natural Language Inference\nmodels during previous stages of training on multiple data sources. By doing\nso, we aim to develop versatile and robust methods for detecting moral values\nin real-world scenarios. Our approach uses the GPT 3.5 model as a zero-shot\nready-made unsupervised multi-label classifier for moral values detection,\neliminating the need for explicit training on labeled data. We compare it with\na smaller NLI-based zero-shot model. The results show that the NLI approach\nachieves competitive results compared to the Davinci model. Furthermore, we\nconduct an in-depth investigation of the performance of supervised systems in\nthe context of cross-domain multi-label moral value detection. This involves\ntraining supervised models on different domains to explore their effectiveness\nin handling data from different sources and comparing their performance with\nthe unsupervised methods. Our contributions encompass a thorough analysis of\nboth supervised and unsupervised methodologies for cross-domain value\ndetection. We introduce the Davinci model as a state-of-the-art zero-shot\nunsupervised moral values classifier, pushing the boundaries of moral value\ndetection without the need for explicit training on labeled data. Additionally,\nwe perform a comparative evaluation of our approach with the supervised models,\nshedding light on their respective strengths and weaknesses.\n","authors":["Luana Bulla","Aldo Gangemi","Misael Mongiovì"],"pdf_url":"https://arxiv.org/pdf/2406.04143v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.04142v1","updated":"2024-06-06T15:08:06Z","published":"2024-06-06T15:08:06Z","title":"Stochastic Polyak Step-sizes and Momentum: Convergence Guarantees and\n  Practical Performance","summary":"  Stochastic gradient descent with momentum, also known as Stochastic Heavy\nBall method (SHB), is one of the most popular algorithms for solving\nlarge-scale stochastic optimization problems in various machine learning tasks.\nIn practical scenarios, tuning the step-size and momentum parameters of the\nmethod is a prohibitively expensive and time-consuming process. In this work,\ninspired by the recent advantages of stochastic Polyak step-size in the\nperformance of stochastic gradient descent (SGD), we propose and explore new\nPolyak-type variants suitable for the update rule of the SHB method. In\nparticular, using the Iterate Moving Average (IMA) viewpoint of SHB, we propose\nand analyze three novel step-size selections: MomSPS$_{\\max}$, MomDecSPS, and\nMomAdaSPS. For MomSPS$_{\\max}$, we provide convergence guarantees for SHB to a\nneighborhood of the solution for convex and smooth problems (without assuming\ninterpolation). If interpolation is also satisfied, then using MomSPS$_{\\max}$,\nSHB converges to the true solution at a fast rate matching the deterministic\nHB. The other two variants, MomDecSPS and MomAdaSPS, are the first adaptive\nstep-sizes for SHB that guarantee convergence to the exact minimizer without\nprior knowledge of the problem parameters and without assuming interpolation.\nThe convergence analysis of SHB is tight and obtains the convergence guarantees\nof SGD with stochastic Polyak step-sizes as a special case. We supplement our\nanalysis with experiments that validate the theory and demonstrate the\neffectiveness and robustness of the new algorithms.\n","authors":["Dimitris Oikonomou","Nicolas Loizou"],"pdf_url":"https://arxiv.org/pdf/2406.04142v1.pdf","comment":"39 pages, 20 Figures"},{"id":"http://arxiv.org/abs/2405.12684v2","updated":"2024-06-06T15:06:54Z","published":"2024-05-21T11:19:50Z","title":"Model Free Prediction with Uncertainty Assessment","summary":"  Deep nonparametric regression, characterized by the utilization of deep\nneural networks to learn target functions, has emerged as a focus of research\nattention in recent years. Despite considerable progress in understanding\nconvergence rates, the absence of asymptotic properties hinders rigorous\nstatistical inference. To address this gap, we propose a novel framework that\ntransforms the deep estimation paradigm into a platform conducive to\nconditional mean estimation, leveraging the conditional diffusion model.\nTheoretically, we develop an end-to-end convergence rate for the conditional\ndiffusion model and establish the asymptotic normality of the generated\nsamples. Consequently, we are equipped to construct confidence regions,\nfacilitating robust statistical inference. Furthermore, through numerical\nexperiments, we empirically validate the efficacy of our proposed methodology.\n","authors":["Yuling Jiao","Lican Kang","Jin Liu","Heng Peng","Heng Zuo"],"pdf_url":"https://arxiv.org/pdf/2405.12684v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.04137v1","updated":"2024-06-06T14:57:52Z","published":"2024-06-06T14:57:52Z","title":"Optimal Batched Linear Bandits","summary":"  We introduce the E$^4$ algorithm for the batched linear bandit problem,\nincorporating an Explore-Estimate-Eliminate-Exploit framework. With a proper\nchoice of exploration rate, we prove E$^4$ achieves the finite-time minimax\noptimal regret with only $O(\\log\\log T)$ batches, and the asymptotically\noptimal regret with only $3$ batches as $T\\rightarrow\\infty$, where $T$ is the\ntime horizon. We further prove a lower bound on the batch complexity of linear\ncontextual bandits showing that any asymptotically optimal algorithm must\nrequire at least $3$ batches in expectation as $T\\rightarrow\\infty$, which\nindicates E$^4$ achieves the asymptotic optimality in regret and batch\ncomplexity simultaneously. To the best of our knowledge, E$^4$ is the first\nalgorithm for linear bandits that simultaneously achieves the minimax and\nasymptotic optimality in regret with the corresponding optimal batch\ncomplexities. In addition, we show that with another choice of exploration rate\nE$^4$ achieves an instance-dependent regret bound requiring at most $O(\\log T)$\nbatches, and maintains the minimax optimality and asymptotic optimality. We\nconduct thorough experiments to evaluate our algorithm on randomly generated\ninstances and the challenging \\textit{End of Optimism} instances\n\\citep{lattimore2017end} which were shown to be hard to learn for optimism\nbased algorithms. Empirical results show that E$^4$ consistently outperforms\nbaseline algorithms with respect to regret minimization, batch complexity, and\ncomputational efficiency.\n","authors":["Xuanfei Ren","Tianyuan Jin","Pan Xu"],"pdf_url":"https://arxiv.org/pdf/2406.04137v1.pdf","comment":"26 pages, 6 figures, 4 tables. To appear in the proceedings of the\n  41st International Conference on Machine Learning (ICML 2024)"},{"id":"http://arxiv.org/abs/2406.04136v1","updated":"2024-06-06T14:57:48Z","published":"2024-06-06T14:57:48Z","title":"Legal Judgment Reimagined: PredEx and the Rise of Intelligent AI\n  Interpretation in Indian Courts","summary":"  In the era of Large Language Models (LLMs), predicting judicial outcomes\nposes significant challenges due to the complexity of legal proceedings and the\nscarcity of expert-annotated datasets. Addressing this, we introduce\n\\textbf{Pred}iction with \\textbf{Ex}planation (\\texttt{PredEx}), the largest\nexpert-annotated dataset for legal judgment prediction and explanation in the\nIndian context, featuring over 15,000 annotations. This groundbreaking corpus\nsignificantly enhances the training and evaluation of AI models in legal\nanalysis, with innovations including the application of instruction tuning to\nLLMs. This method has markedly improved the predictive accuracy and explanatory\ndepth of these models for legal judgments. We employed various\ntransformer-based models, tailored for both general and Indian legal contexts.\nThrough rigorous lexical, semantic, and expert assessments, our models\neffectively leverage \\texttt{PredEx} to provide precise predictions and\nmeaningful explanations, establishing it as a valuable benchmark for both the\nlegal profession and the NLP community.\n","authors":["Shubham Kumar Nigam","Anurag Sharma","Danush Khanna","Noel Shallum","Kripabandhu Ghosh","Arnab Bhattacharya"],"pdf_url":"https://arxiv.org/pdf/2406.04136v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.08511v2","updated":"2024-06-06T14:56:47Z","published":"2023-09-15T16:17:54Z","title":"Generalised Diffusion Probabilistic Scale-Spaces","summary":"  Diffusion probabilistic models excel at sampling new images from learned\ndistributions. Originally motivated by drift-diffusion concepts from physics,\nthey apply image perturbations such as noise and blur in a forward process that\nresults in a tractable probability distribution. A corresponding learned\nreverse process generates images and can be conditioned on side information,\nwhich leads to a wide variety of practical applications. Most of the research\nfocus currently lies on practice-oriented extensions. In contrast, the\ntheoretical background remains largely unexplored, in particular the relations\nto drift-diffusion. In order to shed light on these connections to classical\nimage filtering, we propose a generalised scale-space theory for diffusion\nprobabilistic models. Moreover, we show conceptual and empirical connections to\ndiffusion and osmosis filters.\n","authors":["Pascal Peter"],"pdf_url":"https://arxiv.org/pdf/2309.08511v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.10890v2","updated":"2024-06-06T14:55:40Z","published":"2024-02-16T18:45:58Z","title":"When is Tree Search Useful for LLM Planning? It Depends on the\n  Discriminator","summary":"  In this paper, we examine how large language models (LLMs) solve multi-step\nproblems under a language agent framework with three components: a generator, a\ndiscriminator, and a planning method. We investigate the practical utility of\ntwo advanced planning methods, iterative correction and tree search. We present\na comprehensive analysis of how discrimination accuracy affects the overall\nperformance of agents when using these two methods or a simpler method,\nre-ranking. Experiments on two tasks, text-to-SQL parsing and mathematical\nreasoning, show that: (1) advanced planning methods demand discriminators with\nat least 90% accuracy to achieve significant improvements over re-ranking; (2)\ncurrent LLMs' discrimination abilities have not met the needs of advanced\nplanning methods to achieve such improvements; (3) with LLM-based\ndiscriminators, advanced planning methods may not adequately balance accuracy\nand efficiency. For example, compared to the other two methods, tree search is\nat least 10--20 times slower but leads to negligible performance gains, which\nhinders its real-world applications. Code and data are available at\nhttps://github.com/OSU-NLP-Group/llm-planning-eval.\n","authors":["Ziru Chen","Michael White","Raymond Mooney","Ali Payani","Yu Su","Huan Sun"],"pdf_url":"https://arxiv.org/pdf/2402.10890v2.pdf","comment":"ACL 2024 main"},{"id":"http://arxiv.org/abs/2308.07876v3","updated":"2024-06-06T14:46:44Z","published":"2023-08-15T16:41:53Z","title":"Leveraging Codebook Knowledge with NLI and ChatGPT for Zero-Shot\n  Political Relation Classification","summary":"  Is it possible accurately classify political relations within evolving event\nontologies without extensive annotations? This study investigates zero-shot\nlearning methods that use expert knowledge from existing annotation codebook,\nand evaluates the performance of advanced ChatGPT (GPT-3.5/4) and a natural\nlanguage inference (NLI)-based model called ZSP. ChatGPT uses codebook's\nlabeled summaries as prompts, whereas ZSP breaks down the classification task\ninto context, event mode, and class disambiguation to refine task-specific\nhypotheses. This decomposition enhances interpretability, efficiency, and\nadaptability to schema changes. The experiments reveal ChatGPT's strengths and\nlimitations, and crucially show ZSP's outperformance of dictionary-based\nmethods and its competitive edge over some supervised models. These findings\naffirm the value of ZSP for validating event records and advancing ontology\ndevelopment. Our study underscores the efficacy of leveraging transfer learning\nand existing domain expertise to enhance research efficiency and scalability.\n","authors":["Yibo Hu","Erick Skorupa Parolin","Latifur Khan","Patrick T. Brandt","Javier Osorio","Vito J. D'Orazio"],"pdf_url":"https://arxiv.org/pdf/2308.07876v3.pdf","comment":"ACL 2024"},{"id":"http://arxiv.org/abs/2403.02271v2","updated":"2024-06-06T14:43:30Z","published":"2024-03-04T17:58:09Z","title":"RIFF: Learning to Rephrase Inputs for Few-shot Fine-tuning of Language\n  Models","summary":"  Pre-trained Language Models (PLMs) can be accurately fine-tuned for\ndownstream text processing tasks. Recently, researchers have introduced several\nparameter-efficient fine-tuning methods that optimize input prompts or adjust a\nsmall number of model parameters (e.g LoRA). In this study, we explore the\nimpact of altering the input text of the original task in conjunction with\nparameter-efficient fine-tuning methods. To most effectively rewrite the input\ntext, we train a few-shot paraphrase model with a Maximum-Marginal Likelihood\nobjective. Using six few-shot text classification datasets, we show that\nenriching data with paraphrases at train and test time enhances the performance\nbeyond what can be achieved with parameter-efficient fine-tuning alone. The\ncode used for our experiments can be found at\nhttps://github.com/SaeedNajafi/RIFF.\n","authors":["Saeed Najafi","Alona Fyshe"],"pdf_url":"https://arxiv.org/pdf/2403.02271v2.pdf","comment":"Final Version (Findings of ACL2024)"},{"id":"http://arxiv.org/abs/2405.18353v2","updated":"2024-06-06T14:32:38Z","published":"2024-05-28T16:52:52Z","title":"Simulating infinite-dimensional nonlinear diffusion bridges","summary":"  The diffusion bridge is a type of diffusion process that conditions on\nhitting a specific state within a finite time period. It has broad applications\nin fields such as Bayesian inference, financial mathematics, control theory,\nand shape analysis. However, simulating the diffusion bridge for natural data\ncan be challenging due to both the intractability of the drift term and\ncontinuous representations of the data. Although several methods are available\nto simulate finite-dimensional diffusion bridges, infinite-dimensional cases\nremain unresolved. In the paper, we present a solution to this problem by\nmerging score-matching techniques with operator learning, enabling a direct\napproach to score-matching for the infinite-dimensional bridge. We construct\nthe score to be discretization invariant, which is natural given the underlying\nspatially continuous process. We conduct a series of experiments, ranging from\nsynthetic examples with closed-form solutions to the stochastic nonlinear\nevolution of real-world biological shape data, and our method demonstrates high\nefficacy, particularly due to its ability to adapt to any resolution without\nextra training.\n","authors":["Gefan Yang","Elizabeth Louise Baker","Michael L. Severinsen","Christy Anna Hipsley","Stefan Sommer"],"pdf_url":"https://arxiv.org/pdf/2405.18353v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.09639v3","updated":"2024-06-06T14:31:03Z","published":"2023-10-14T18:42:56Z","title":"DPZero: Private Fine-Tuning of Language Models without Backpropagation","summary":"  The widespread practice of fine-tuning large language models (LLMs) on\ndomain-specific data faces two major challenges in memory and privacy. First,\nas the size of LLMs continues to grow, the memory demands of gradient-based\ntraining methods via backpropagation become prohibitively high. Second, given\nthe tendency of LLMs to memorize training data, it is important to protect\npotentially sensitive information in the fine-tuning data from being\nregurgitated. Zeroth-order methods, which rely solely on forward passes,\nsubstantially reduce memory consumption during training. However, directly\ncombining them with standard differentially private gradient descent suffers\nmore as model size grows. To bridge this gap, we introduce DPZero, a novel\nprivate zeroth-order algorithm with nearly dimension-independent rates. The\nmemory efficiency of DPZero is demonstrated in privately fine-tuning RoBERTa\nand OPT on several downstream tasks. Our code is available at\nhttps://github.com/Liang137/DPZero.\n","authors":["Liang Zhang","Bingcong Li","Kiran Koshy Thekumparampil","Sewoong Oh","Niao He"],"pdf_url":"https://arxiv.org/pdf/2310.09639v3.pdf","comment":"ICML 2024"},{"id":"http://arxiv.org/abs/2406.04112v1","updated":"2024-06-06T14:29:49Z","published":"2024-06-06T14:29:49Z","title":"Compressible Dynamics in Deep Overparameterized Low-Rank Learning &\n  Adaptation","summary":"  While overparameterization in machine learning models offers great benefits\nin terms of optimization and generalization, it also leads to increased\ncomputational requirements as model sizes grow. In this work, we show that by\nleveraging the inherent low-dimensional structures of data and compressible\ndynamics within the model parameters, we can reap the benefits of\noverparameterization without the computational burdens. In practice, we\ndemonstrate the effectiveness of this approach for deep low-rank matrix\ncompletion as well as fine-tuning language models. Our approach is grounded in\ntheoretical findings for deep overparameterized low-rank matrix recovery, where\nwe show that the learning dynamics of each weight matrix are confined to an\ninvariant low-dimensional subspace. Consequently, we can construct and train\ncompact, highly compressed factorizations possessing the same benefits as their\noverparameterized counterparts. In the context of deep matrix completion, our\ntechnique substantially improves training efficiency while retaining the\nadvantages of overparameterization. For language model fine-tuning, we propose\na method called \"Deep LoRA\", which improves the existing low-rank adaptation\n(LoRA) technique, leading to reduced overfitting and a simplified\nhyperparameter setup, while maintaining comparable efficiency. We validate the\neffectiveness of Deep LoRA on natural language tasks, particularly when\nfine-tuning with limited data.\n","authors":["Can Yaras","Peng Wang","Laura Balzano","Qing Qu"],"pdf_url":"https://arxiv.org/pdf/2406.04112v1.pdf","comment":"Accepted at ICML'24 (Oral)"},{"id":"http://arxiv.org/abs/2405.07460v2","updated":"2024-06-06T14:23:48Z","published":"2024-05-13T04:35:14Z","title":"HoneyBee: A Scalable Modular Framework for Creating Multimodal Oncology\n  Datasets with Foundational Embedding Models","summary":"  Developing accurate machine learning models for oncology requires\nlarge-scale, high-quality multimodal datasets. However, creating such datasets\nremains challenging due to the complexity and heterogeneity of medical data. To\naddress this challenge, we introduce HoneyBee, a scalable modular framework for\nbuilding multimodal oncology datasets that leverages foundation models to\ngenerate representative embeddings. HoneyBee integrates various data\nmodalities, including clinical diagnostic and pathology imaging data, medical\nnotes, reports, records, and molecular data. It employs data preprocessing\ntechniques and foundation models to generate embeddings that capture the\nessential features and relationships within the raw medical data. The generated\nembeddings are stored in a structured format using Hugging Face datasets and\nPyTorch dataloaders for accessibility. Vector databases enable efficient\nquerying and retrieval for machine learning applications. We demonstrate the\neffectiveness of HoneyBee through experiments assessing the quality and\nrepresentativeness of these embeddings. The framework is designed to be\nextensible to other medical domains and aims to accelerate oncology research by\nproviding high-quality, machine learning-ready datasets. HoneyBee is an ongoing\nopen-source effort, and the code, datasets, and models are available at the\nproject repository.\n","authors":["Aakash Tripathi","Asim Waqas","Yasin Yilmaz","Ghulam Rasool"],"pdf_url":"https://arxiv.org/pdf/2405.07460v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.04105v1","updated":"2024-06-06T14:21:15Z","published":"2024-06-06T14:21:15Z","title":"From Tissue Plane to Organ World: A Benchmark Dataset for Multimodal\n  Biomedical Image Registration using Deep Co-Attention Networks","summary":"  Correlating neuropathology with neuroimaging findings provides a multiscale\nview of pathologic changes in the human organ spanning the meso- to\nmicro-scales, and is an emerging methodology expected to shed light on numerous\ndisease states. To gain the most information from this multimodal, multiscale\napproach, it is desirable to identify precisely where a histologic tissue\nsection was taken from within the organ in order to correlate with the tissue\nfeatures in exactly the same organ region. Histology-to-organ registration\nposes an extra challenge, as any given histologic section can capture only a\nsmall portion of a human organ. Making use of the capabilities of\nstate-of-the-art deep learning models, we unlock the potential to address and\nsolve such intricate challenges. Therefore, we create the ATOM benchmark\ndataset, sourced from diverse institutions, with the primary objective of\ntransforming this challenge into a machine learning problem and delivering\noutstanding outcomes that enlighten the biomedical community. The performance\nof our RegisMCAN model demonstrates the potential of deep learning to\naccurately predict where a subregion extracted from an organ image was obtained\nfrom within the overall 3D volume. The code and dataset can be found at:\nhttps://github.com/haizailache999/Image-Registration/tree/main\n","authors":["Yifeng Wang","Weipeng Li","Thomas Pearce","Haohan Wang"],"pdf_url":"https://arxiv.org/pdf/2406.04105v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.04103v1","updated":"2024-06-06T14:20:21Z","published":"2024-06-06T14:20:21Z","title":"Multistep Distillation of Diffusion Models via Moment Matching","summary":"  We present a new method for making diffusion models faster to sample. The\nmethod distills many-step diffusion models into few-step models by matching\nconditional expectations of the clean data given noisy data along the sampling\ntrajectory. Our approach extends recently proposed one-step methods to the\nmulti-step case, and provides a new perspective by interpreting these\napproaches in terms of moment matching. By using up to 8 sampling steps, we\nobtain distilled models that outperform not only their one-step versions but\nalso their original many-step teacher models, obtaining new state-of-the-art\nresults on the Imagenet dataset. We also show promising results on a large\ntext-to-image model where we achieve fast generation of high resolution images\ndirectly in image space, without needing autoencoders or upsamplers.\n","authors":["Tim Salimans","Thomas Mensink","Jonathan Heek","Emiel Hoogeboom"],"pdf_url":"https://arxiv.org/pdf/2406.04103v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.04099v1","updated":"2024-06-06T14:15:12Z","published":"2024-06-06T14:15:12Z","title":"Enhancing Weather Predictions: Super-Resolution via Deep Diffusion\n  Models","summary":"  This study investigates the application of deep-learning diffusion models for\nthe super-resolution of weather data, a novel approach aimed at enhancing the\nspatial resolution and detail of meteorological variables. Leveraging the\ncapabilities of diffusion models, specifically the SR3 and ResDiff\narchitectures, we present a methodology for transforming low-resolution weather\ndata into high-resolution outputs. Our experiments, conducted using the\nWeatherBench dataset, focus on the super-resolution of the two-meter\ntemperature variable, demonstrating the models' ability to generate detailed\nand accurate weather maps. The results indicate that the ResDiff model, further\nimproved by incorporating physics-based modifications, significantly\noutperforms traditional SR3 methods in terms of Mean Squared Error (MSE),\nStructural Similarity Index (SSIM), and Peak Signal-to-Noise Ratio (PSNR). This\nresearch highlights the potential of diffusion models in meteorological\napplications, offering insights into their effectiveness, challenges, and\nprospects for future advancements in weather prediction and climate analysis.\n","authors":["Jan Martinů","Petr Šimánek"],"pdf_url":"https://arxiv.org/pdf/2406.04099v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.04098v1","updated":"2024-06-06T14:13:38Z","published":"2024-06-06T14:13:38Z","title":"A Large-Scale Neutral Comparison Study of Survival Models on\n  Low-Dimensional Data","summary":"  This work presents the first large-scale neutral benchmark experiment focused\non single-event, right-censored, low-dimensional survival data. Benchmark\nexperiments are essential in methodological research to scientifically compare\nnew and existing model classes through proper empirical evaluation. Existing\nbenchmarks in the survival literature are often narrow in scope, focusing, for\nexample, on high-dimensional data. Additionally, they may lack appropriate\ntuning or evaluation procedures, or are qualitative reviews, rather than\nquantitative comparisons. This comprehensive study aims to fill the gap by\nneutrally evaluating a broad range of methods and providing generalizable\nconclusions. We benchmark 18 models, ranging from classical statistical\napproaches to many common machine learning methods, on 32 publicly available\ndatasets. The benchmark tunes for both a discrimination measure and a proper\nscoring rule to assess performance in different settings. Evaluating on 8\nsurvival metrics, we assess discrimination, calibration, and overall predictive\nperformance of the tested models. Using discrimination measures, we find that\nno method significantly outperforms the Cox model. However, (tuned) Accelerated\nFailure Time models were able to achieve significantly better results with\nrespect to overall predictive performance as measured by the right-censored\nlog-likelihood. Machine learning methods that performed comparably well include\nOblique Random Survival Forests under discrimination, and Cox-based\nlikelihood-boosting under overall predictive performance. We conclude that for\npredictive purposes in the standard survival analysis setting of\nlow-dimensional, right-censored data, the Cox Proportional Hazards model\nremains a simple and robust method, sufficient for practitioners.\n","authors":["Lukas Burk","John Zobolas","Bernd Bischl","Andreas Bender","Marvin N. Wright","Raphael Sonabend"],"pdf_url":"https://arxiv.org/pdf/2406.04098v1.pdf","comment":"42 pages, 28 figures"},{"id":"http://arxiv.org/abs/2310.02721v2","updated":"2024-06-06T14:11:29Z","published":"2023-10-04T10:52:51Z","title":"Leveraging Temporal Graph Networks Using Module Decoupling","summary":"  Modern approaches for learning on dynamic graphs have adopted the use of\nbatches instead of applying updates one by one. The use of batches allows these\ntechniques to become helpful in streaming scenarios where updates to graphs are\nreceived at extreme speeds. Using batches, however, forces the models to update\ninfrequently, which results in the degradation of their performance. In this\nwork, we suggest a decoupling strategy that enables the models to update\nfrequently while using batches. By decoupling the core modules of temporal\ngraph networks and implementing them using a minimal number of learnable\nparameters, we have developed the Lightweight Decoupled Temporal Graph Network\n(LDTGN), an exceptionally efficient model for learning on dynamic graphs. LDTG\nwas validated on various dynamic graph benchmarks, providing comparable or\nstate-of-the-art results with significantly higher throughput than previous\nart. Notably, our method outperforms previous approaches by more than 20\\% on\nbenchmarks that require rapid model update rates, such as USLegis or UNTrade.\nThe code to reproduce our experiments is available at\n\\href{https://orfeld415.github.io/module-decoupling}{this http url}.\n","authors":["Or Feldman","Chaim Baskin"],"pdf_url":"https://arxiv.org/pdf/2310.02721v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.04093v1","updated":"2024-06-06T14:10:12Z","published":"2024-06-06T14:10:12Z","title":"Scaling and evaluating sparse autoencoders","summary":"  Sparse autoencoders provide a promising unsupervised approach for extracting\ninterpretable features from a language model by reconstructing activations from\na sparse bottleneck layer. Since language models learn many concepts,\nautoencoders need to be very large to recover all relevant features. However,\nstudying the properties of autoencoder scaling is difficult due to the need to\nbalance reconstruction and sparsity objectives and the presence of dead\nlatents. We propose using k-sparse autoencoders [Makhzani and Frey, 2013] to\ndirectly control sparsity, simplifying tuning and improving the\nreconstruction-sparsity frontier. Additionally, we find modifications that\nresult in few dead latents, even at the largest scales we tried. Using these\ntechniques, we find clean scaling laws with respect to autoencoder size and\nsparsity. We also introduce several new metrics for evaluating feature quality\nbased on the recovery of hypothesized features, the explainability of\nactivation patterns, and the sparsity of downstream effects. These metrics all\ngenerally improve with autoencoder size. To demonstrate the scalability of our\napproach, we train a 16 million latent autoencoder on GPT-4 activations for 40\nbillion tokens. We release training code and autoencoders for open-source\nmodels, as well as a visualizer.\n","authors":["Leo Gao","Tom Dupré la Tour","Henk Tillman","Gabriel Goh","Rajan Troll","Alec Radford","Ilya Sutskever","Jan Leike","Jeffrey Wu"],"pdf_url":"https://arxiv.org/pdf/2406.04093v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.04090v1","updated":"2024-06-06T14:01:28Z","published":"2024-06-06T14:01:28Z","title":"Interpretable Lightweight Transformer via Unrolling of Learned Graph\n  Smoothness Priors","summary":"  We build interpretable and lightweight transformer-like neural networks by\nunrolling iterative optimization algorithms that minimize graph smoothness\npriors -- the quadratic graph Laplacian regularizer (GLR) and the $\\ell_1$-norm\ngraph total variation (GTV) -- subject to an interpolation constraint. The\ncrucial insight is that a normalized signal-dependent graph learning module\namounts to a variant of the basic self-attention mechanism in conventional\ntransformers. Unlike \"black-box\" transformers that require learning of large\nkey, query and value matrices to compute scaled dot products as affinities and\nsubsequent output embeddings, resulting in huge parameter sets, our unrolled\nnetworks employ shallow CNNs to learn low-dimensional features per node to\nestablish pairwise Mahalanobis distances and construct sparse similarity\ngraphs. At each layer, given a learned graph, the target interpolated signal is\nsimply a low-pass filtered output derived from the minimization of an assumed\ngraph smoothness prior, leading to a dramatic reduction in parameter count.\nExperiments for two image interpolation applications verify the restoration\nperformance, parameter efficiency and robustness to covariate shift of our\ngraph-based unrolled networks compared to conventional transformers.\n","authors":["Tam Thuc Do","Parham Eftekhar","Seyed Alireza Hosseini","Gene Cheung","Philip Chou"],"pdf_url":"https://arxiv.org/pdf/2406.04090v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.16422v2","updated":"2024-06-06T14:00:36Z","published":"2023-07-31T06:11:57Z","title":"Statistically Optimal Generative Modeling with Maximum Deviation from\n  the Empirical Distribution","summary":"  This paper explores the problem of generative modeling, aiming to simulate\ndiverse examples from an unknown distribution based on observed examples. While\nrecent studies have focused on quantifying the statistical precision of popular\nalgorithms, there is a lack of mathematical evaluation regarding the\nnon-replication of observed examples and the creativity of the generative\nmodel. We present theoretical insights into this aspect, demonstrating that the\nWasserstein GAN, constrained to left-invertible push-forward maps, generates\ndistributions that avoid replication and significantly deviate from the\nempirical distribution. Importantly, we show that left-invertibility achieves\nthis without compromising the statistical optimality of the resulting\ngenerator. Our most important contribution provides a finite-sample lower bound\non the Wasserstein-1 distance between the generative distribution and the\nempirical one. We also establish a finite-sample upper bound on the distance\nbetween the generative distribution and the true data-generating one. Both\nbounds are explicit and show the impact of key parameters such as sample size,\ndimensions of the ambient and latent spaces, noise level, and smoothness\nmeasured by the Lipschitz constant.\n","authors":["Elen Vardanyan","Sona Hunanyan","Tigran Galstyan","Arshak Minasyan","Arnak Dalalyan"],"pdf_url":"https://arxiv.org/pdf/2307.16422v2.pdf","comment":"ICML 2024"},{"id":"http://arxiv.org/abs/2406.04089v1","updated":"2024-06-06T13:59:51Z","published":"2024-06-06T13:59:51Z","title":"On Limitation of Transformer for Learning HMMs","summary":"  Despite the remarkable success of Transformer-based architectures in various\nsequential modeling tasks, such as natural language processing, computer\nvision, and robotics, their ability to learn basic sequential models, like\nHidden Markov Models (HMMs), is still unclear. This paper investigates the\nperformance of Transformers in learning HMMs and their variants through\nextensive experimentation and compares them to Recurrent Neural Networks\n(RNNs). We show that Transformers consistently underperform RNNs in both\ntraining speed and testing accuracy across all tested HMM models. There are\neven challenging HMM instances where Transformers struggle to learn, while RNNs\ncan successfully do so. Our experiments further reveal the relation between the\ndepth of Transformers and the longest sequence length it can effectively learn,\nbased on the types and the complexity of HMMs. To address the limitation of\ntransformers in modeling HMMs, we demonstrate that a variant of the\nChain-of-Thought (CoT), called $\\textit{block CoT}$ in the training phase, can\nhelp transformers to reduce the evaluation error and to learn longer sequences\nat a cost of increasing the training time. Finally, we complement our empirical\nfindings by theoretical results proving the expressiveness of transformers in\napproximating HMMs with logarithmic depth.\n","authors":["Jiachen Hu","Qinghua Liu","Chi Jin"],"pdf_url":"https://arxiv.org/pdf/2406.04089v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.04088v1","updated":"2024-06-06T13:58:41Z","published":"2024-06-06T13:58:41Z","title":"Deterministic Uncertainty Propagation for Improved Model-Based Offline\n  Reinforcement Learning","summary":"  Current approaches to model-based offline Reinforcement Learning (RL) often\nincorporate uncertainty-based reward penalization to address the distributional\nshift problem. While these approaches have achieved some success, we argue that\nthis penalization introduces excessive conservatism, potentially resulting in\nsuboptimal policies through underestimation. We identify as an important cause\nof over-penalization the lack of a reliable uncertainty estimator capable of\npropagating uncertainties in the Bellman operator. The common approach to\ncalculating the penalty term relies on sampling-based uncertainty estimation,\nresulting in high variance. To address this challenge, we propose a novel\nmethod termed Moment Matching Offline Model-Based Policy Optimization (MOMBO).\nMOMBO learns a Q-function using moment matching, which allows us to\ndeterministically propagate uncertainties through the Q-function. We evaluate\nMOMBO's performance across various environments and demonstrate empirically\nthat MOMBO is a more stable and sample-efficient approach.\n","authors":["Abdullah Akgül","Manuel Haußmann","Melih Kandemir"],"pdf_url":"https://arxiv.org/pdf/2406.04088v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.18680v2","updated":"2024-06-06T13:58:20Z","published":"2024-03-27T15:22:16Z","title":"Non-Linear Inference Time Intervention: Improving LLM Truthfulness","summary":"  In this work, we explore LLM's internal representation space to identify\nattention heads that contain the most truthful and accurate information. We\nfurther developed the Inference Time Intervention (ITI) framework, which lets\nbias LLM without the need for fine-tuning. The improvement manifests in\nintroducing a non-linear multi-token probing and multi-token intervention:\nNon-Linear ITI (NL-ITI), which significantly enhances performance on evaluation\nbenchmarks. NL-ITI is tested on diverse multiple-choice datasets, including\nTruthfulQA, on which we report over 16% relative MC1 (accuracy of model\npointing to the correct answer) improvement with respect to the baseline ITI\nresults. Moreover, we achieved a 10% relative improvement over the recently\nreleased Truth Forest (TrFf) method that also focused on ITI improvement.\n","authors":["Jakub Hoscilowicz","Adam Wiacek","Jan Chojnacki","Adam Cieslak","Leszek Michon","Vitalii Urbanevych","Artur Janicki"],"pdf_url":"https://arxiv.org/pdf/2403.18680v2.pdf","comment":"Accepted on Interspeech 2024 Conference. Code is available at\n  https://github.com/Samsung/NL-ITI"},{"id":"http://arxiv.org/abs/2406.02126v2","updated":"2024-06-06T13:57:09Z","published":"2024-06-04T09:10:14Z","title":"CityLight: A Universal Model Towards Real-world City-scale Traffic\n  Signal Control Coordination","summary":"  Traffic signal control (TSC) is a promising low-cost measure to enhance\ntransportation efficiency without affecting existing road infrastructure. While\nvarious reinforcement learning-based TSC methods have been proposed and\nexperimentally outperform conventional rule-based methods, none of them has\nbeen deployed in the real world. An essential gap lies in the\noversimplification of the scenarios in terms of intersection heterogeneity and\nroad network intricacy. To make TSC applicable in urban traffic management, we\ntarget TSC coordination in city-scale high-authenticity road networks, aiming\nto solve the three unique and important challenges: city-level scalability,\nheterogeneity of real-world intersections, and effective coordination among\nintricate neighbor connections. Since optimizing multiple agents in a\nparameter-sharing paradigm can boost the training efficiency and help achieve\nscalability, we propose our method, CityLight, based on the well-acknowledged\noptimization framework, parameter-sharing MAPPO. To ensure the unified policy\nnetwork can learn to fit large-scale heterogeneous intersections and tackle the\nintricate between-neighbor coordination, CityLight proposes a universal\nrepresentation module that consists of two key designs: heterogeneous\nintersection alignment and neighborhood impact alignment for coordination. To\nfurther boost coordination, CityLight adopts neighborhood-integrated rewards to\ntransition from achieving local optimal to global optimal. Extensive\nexperiments on datasets with hundreds to tens of thousands of real-world\nintersections and authentic traffic demands validate the surprising\neffectiveness and generalizability of CityLight, with an overall performance\ngain of 11.66% and a 22.59% improvement in transfer scenarios in terms of\nthroughput.\n","authors":["Jinwei Zeng","Chao Yu","Xinyi Yang","Wenxuan Ao","Jian Yuan","Yong Li","Yu Wang","Huazhong Yang"],"pdf_url":"https://arxiv.org/pdf/2406.02126v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.10104v3","updated":"2024-06-06T13:54:23Z","published":"2023-12-15T03:11:03Z","title":"Lever LM: Configuring In-Context Sequence to Lever Large Vision Language\n  Models","summary":"  As Archimedes famously said, ``Give me a lever long enough and a fulcrum on\nwhich to place it, and I shall move the world'', in this study, we propose to\nuse a tiny Language Model (LM), \\eg, a Transformer with 67M parameters, to\nlever much larger Vision-Language Models (LVLMs) with 9B parameters.\nSpecifically, we use this tiny \\textbf{Lever-LM} to configure effective\nin-context demonstration (ICD) sequences to improve the In-Context Learinng\n(ICL) performance of LVLMs. Previous studies show that diverse ICD\nconfigurations like the selection and ordering of the demonstrations heavily\naffect the ICL performance, highlighting the significance of configuring\neffective ICD sequences. Motivated by this and by re-considering the the\nprocess of configuring ICD sequence, we find this is a mirror process of human\nsentence composition and further assume that effective ICD configurations may\ncontain internal statistical patterns that can be captured by Lever-LM. Then a\ndataset with effective ICD sequences is constructed to train Lever-LM. After\ntraining, given novel queries, new ICD sequences are configured by the trained\nLever-LM to solve vision-language tasks through ICL. Experiments show that\nthese ICD sequences can improve the ICL performance of two LVLMs compared with\nsome strong baselines in Visual Question Answering and Image Captioning,\nvalidating that Lever-LM can really capture the statistical patterns for\nlevering LVLMs.\n","authors":["Xu Yang","Yingzhe Peng","Haoxuan Ma","Shuo Xu","Chi Zhang","Yucheng Han","Hanwang Zhang"],"pdf_url":"https://arxiv.org/pdf/2312.10104v3.pdf","comment":"17 pages, 6 figures"},{"id":"http://arxiv.org/abs/2310.06430v2","updated":"2024-06-06T13:52:04Z","published":"2023-10-10T08:54:14Z","title":"Conformal Prediction for Deep Classifier via Label Ranking","summary":"  Conformal prediction is a statistical framework that generates prediction\nsets containing ground-truth labels with a desired coverage guarantee. The\npredicted probabilities produced by machine learning models are generally\nmiscalibrated, leading to large prediction sets in conformal prediction. To\naddress this issue, we propose a novel algorithm named $\\textit{Sorted Adaptive\nPrediction Sets}$ (SAPS), which discards all the probability values except for\nthe maximum softmax probability. The key idea behind SAPS is to minimize the\ndependence of the non-conformity score on the probability values while\nretaining the uncertainty information. In this manner, SAPS can produce compact\nprediction sets and communicate instance-wise uncertainty. Extensive\nexperiments validate that SAPS not only lessens the prediction sets but also\nbroadly enhances the conditional coverage rate of prediction sets.\n","authors":["Jianguo Huang","Huajun Xi","Linjun Zhang","Huaxiu Yao","Yue Qiu","Hongxin Wei"],"pdf_url":"https://arxiv.org/pdf/2310.06430v2.pdf","comment":"Accepted by ICML 2024"},{"id":"http://arxiv.org/abs/2406.04081v1","updated":"2024-06-06T13:51:39Z","published":"2024-06-06T13:51:39Z","title":"Bootstrapping Expectiles in Reinforcement Learning","summary":"  Many classic Reinforcement Learning (RL) algorithms rely on a Bellman\noperator, which involves an expectation over the next states, leading to the\nconcept of bootstrapping. To introduce a form of pessimism, we propose to\nreplace this expectation with an expectile. In practice, this can be very\nsimply done by replacing the $L_2$ loss with a more general expectile loss for\nthe critic. Introducing pessimism in RL is desirable for various reasons, such\nas tackling the overestimation problem (for which classic solutions are double\nQ-learning or the twin-critic approach of TD3) or robust RL (where transitions\nare adversarial). We study empirically these two cases. For the overestimation\nproblem, we show that the proposed approach, ExpectRL, provides better results\nthan a classic twin-critic. On robust RL benchmarks, involving changes of the\nenvironment, we show that our approach is more robust than classic RL\nalgorithms. We also introduce a variation of ExpectRL combined with domain\nrandomization which is competitive with state-of-the-art robust RL agents.\nEventually, we also extend \\ExpectRL with a mechanism for choosing\nautomatically the expectile value, that is the degree of pessimism\n","authors":["Pierre Clavier","Emmanuel Rachelson","Erwan Le Pennec","Matthieu Geist"],"pdf_url":"https://arxiv.org/pdf/2406.04081v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.18334v2","updated":"2024-06-06T13:50:26Z","published":"2024-02-28T13:54:57Z","title":"Learning to Generate Instruction Tuning Datasets for Zero-Shot Task\n  Adaptation","summary":"  We introduce Bonito, an open-source model for conditional task generation\nthat converts unannotated text into task-specific training datasets for\ninstruction tuning. We aim to enable zero-shot task adaptation of large\nlanguage models on users' specialized, private data. We train Bonito by\nfine-tuning a pretrained large language model on a new large-scale dataset with\n1.65M examples created by remixing existing instruction tuning datasets into\nmeta-templates. The meta-templates for a dataset produce training examples\nwhere the input is the unannotated text and the task attribute and the output\nconsists of the instruction and the response. We use Bonito to generate\nsynthetic tasks for seven datasets from specialized domains with unannotated\ntext across three task types -- yes-no question answering, extractive question\nanswering, and natural language inference -- and adapt language models. We show\nthat Bonito significantly improves the average performance of pretrained and\ninstruction tuned models over the de facto self supervised baseline. For\nexample, adapting Mistral-Instruct-v2 and instruction tuned variants of Mistral\nand Llama2 with Bonito improves the strong zero-shot performance by 22.1 F1\npoints whereas the next word prediction objective undoes some of the benefits\nof instruction tuning and reduces the average performance by 0.8 F1 points. We\nconduct additional experiments with Bonito to understand the effects of the\ndomain, the size of the training set, and the choice of alternative synthetic\ntask generators. Overall, we show that learning with synthetic instruction\ntuning datasets is an effective way to adapt language models to new domains.\nThe model, dataset, and code are available at\nhttps://github.com/BatsResearch/bonito.\n","authors":["Nihal V. Nayak","Yiyang Nan","Avi Trost","Stephen H. Bach"],"pdf_url":"https://arxiv.org/pdf/2402.18334v2.pdf","comment":"ACL Findings 2024"},{"id":"http://arxiv.org/abs/2406.00083v2","updated":"2024-06-06T13:38:42Z","published":"2024-06-03T02:25:33Z","title":"BadRAG: Identifying Vulnerabilities in Retrieval Augmented Generation of\n  Large Language Models","summary":"  Large Language Models (LLMs) are constrained by outdated information and a\ntendency to generate incorrect data, commonly referred to as \"hallucinations.\"\nRetrieval-Augmented Generation (RAG) addresses these limitations by combining\nthe strengths of retrieval-based methods and generative models. This approach\ninvolves retrieving relevant information from a large, up-to-date dataset and\nusing it to enhance the generation process, leading to more accurate and\ncontextually appropriate responses. Despite its benefits, RAG introduces a new\nattack surface for LLMs, particularly because RAG databases are often sourced\nfrom public data, such as the web. In this paper, we propose \\TrojRAG{} to\nidentify the vulnerabilities and attacks on retrieval parts (RAG database) and\ntheir indirect attacks on generative parts (LLMs). Specifically, we identify\nthat poisoning several customized content passages could achieve a retrieval\nbackdoor, where the retrieval works well for clean queries but always returns\ncustomized poisoned adversarial queries. Triggers and poisoned passages can be\nhighly customized to implement various attacks. For example, a trigger could be\na semantic group like \"The Republican Party, Donald Trump, etc.\" Adversarial\npassages can be tailored to different contents, not only linked to the triggers\nbut also used to indirectly attack generative LLMs without modifying them.\nThese attacks can include denial-of-service attacks on RAG and semantic\nsteering attacks on LLM generations conditioned by the triggers. Our\nexperiments demonstrate that by just poisoning 10 adversarial passages can\ninduce 98.2\\% success rate to retrieve the adversarial passages. Then, these\npassages can increase the reject ratio of RAG-based GPT-4 from 0.01\\% to 74.6\\%\nor increase the rate of negative responses from 0.22\\% to 72\\% for targeted\nqueries.\n","authors":["Jiaqi Xue","Mengxin Zheng","Yebowen Hu","Fei Liu","Xun Chen","Qian Lou"],"pdf_url":"https://arxiv.org/pdf/2406.00083v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.04071v1","updated":"2024-06-06T13:36:41Z","published":"2024-06-06T13:36:41Z","title":"Dynamic angular synchronization under smoothness constraints","summary":"  Given an undirected measurement graph $\\mathcal{H} = ([n], \\mathcal{E})$, the\nclassical angular synchronization problem consists of recovering unknown angles\n$\\theta_1^*,\\dots,\\theta_n^*$ from a collection of noisy pairwise measurements\nof the form $(\\theta_i^* - \\theta_j^*) \\mod 2\\pi$, for all $\\{i,j\\} \\in\n\\mathcal{E}$. This problem arises in a variety of applications, including\ncomputer vision, time synchronization of distributed networks, and ranking from\npairwise comparisons. In this paper, we consider a dynamic version of this\nproblem where the angles, and also the measurement graphs evolve over $T$ time\npoints. Assuming a smoothness condition on the evolution of the latent angles,\nwe derive three algorithms for joint estimation of the angles over all time\npoints. Moreover, for one of the algorithms, we establish non-asymptotic\nrecovery guarantees for the mean-squared error (MSE) under different\nstatistical models. In particular, we show that the MSE converges to zero as\n$T$ increases under milder conditions than in the static setting. This includes\nthe setting where the measurement graphs are highly sparse and disconnected,\nand also when the measurement noise is large and can potentially increase with\n$T$. We complement our theoretical results with experiments on synthetic data.\n","authors":["Ernesto Araya","Mihai Cucuringu","Hemant Tyagi"],"pdf_url":"https://arxiv.org/pdf/2406.04071v1.pdf","comment":"40 pages, 9 figures"},{"id":"http://arxiv.org/abs/2406.04070v1","updated":"2024-06-06T13:34:43Z","published":"2024-06-06T13:34:43Z","title":"Batch-in-Batch: a new adversarial training framework for initial\n  perturbation and sample selection","summary":"  Adversarial training methods commonly generate independent initial\nperturbation for adversarial samples from a simple uniform distribution, and\nobtain the training batch for the classifier without selection. In this work,\nwe propose a simple yet effective training framework called Batch-in-Batch (BB)\nto enhance models robustness. It involves specifically a joint construction of\ninitial values that could simultaneously generates $m$ sets of perturbations\nfrom the original batch set to provide more diversity for adversarial samples;\nand also includes various sample selection strategies that enable the trained\nmodels to have smoother losses and avoid overconfident outputs. Through\nextensive experiments on three benchmark datasets (CIFAR-10, SVHN, CIFAR-100)\nwith two networks (PreActResNet18 and WideResNet28-10) that are used in both\nthe single-step (Noise-Fast Gradient Sign Method, N-FGSM) and multi-step\n(Projected Gradient Descent, PGD-10) adversarial training, we show that models\ntrained within the BB framework consistently have higher adversarial accuracy\nacross various adversarial settings, notably achieving over a 13% improvement\non the SVHN dataset with an attack radius of 8/255 compared to the N-FGSM\nbaseline model. Furthermore, experimental analysis of the efficiency of both\nthe proposed initial perturbation method and sample selection strategies\nvalidates our insights. Finally, we show that our framework is cost-effective\nin terms of computational resources, even with a relatively large value of $m$.\n","authors":["Yinting Wu","Pai Peng","Bo Cai","Le Li"," ."],"pdf_url":"https://arxiv.org/pdf/2406.04070v1.pdf","comment":"29 pages, 11 figures"},{"id":"http://arxiv.org/abs/2406.04068v1","updated":"2024-06-06T13:33:45Z","published":"2024-06-06T13:33:45Z","title":"Reassessing How to Compare and Improve the Calibration of Machine\n  Learning Models","summary":"  A machine learning model is calibrated if its predicted probability for an\noutcome matches the observed frequency for that outcome conditional on the\nmodel prediction. This property has become increasingly important as the impact\nof machine learning models has continued to spread to various domains. As a\nresult, there are now a dizzying number of recent papers on measuring and\nimproving the calibration of (specifically deep learning) models. In this work,\nwe reassess the reporting of calibration metrics in the recent literature. We\nshow that there exist trivial recalibration approaches that can appear\nseemingly state-of-the-art unless calibration and prediction metrics (i.e. test\naccuracy) are accompanied by additional generalization metrics such as negative\nlog-likelihood. We then derive a calibration-based decomposition of Bregman\ndivergences that can be used to both motivate a choice of calibration metric\nbased on a generalization metric, and to detect trivial calibration. Finally,\nwe apply these ideas to develop a new extension to reliability diagrams that\ncan be used to jointly visualize calibration as well as the estimated\ngeneralization error of a model.\n","authors":["Muthu Chidambaram","Rong Ge"],"pdf_url":"https://arxiv.org/pdf/2406.04068v1.pdf","comment":"20 pages, 7 figures"},{"id":"http://arxiv.org/abs/2305.15577v3","updated":"2024-06-06T13:31:45Z","published":"2023-05-24T21:23:58Z","title":"Minimizing $f$-Divergences by Interpolating Velocity Fields","summary":"  Many machine learning problems can be seen as approximating a \\textit{target}\ndistribution using a \\textit{particle} distribution by minimizing their\nstatistical discrepancy. Wasserstein Gradient Flow can move particles along a\npath that minimizes the $f$-divergence between the target and particle\ndistributions. To move particles, we need to calculate the corresponding\nvelocity fields derived from a density ratio function between these two\ndistributions. Previous works estimated such density ratio functions and then\ndifferentiated the estimated ratios. These approaches may suffer from\noverfitting, leading to a less accurate estimate of the velocity fields.\nInspired by non-parametric curve fitting, we directly estimate these velocity\nfields using interpolation techniques. We prove that our estimators are\nconsistent under mild conditions. We validate their effectiveness using novel\napplications on domain adaptation and missing data imputation.\n","authors":["Song Liu","Jiahao Yu","Jack Simons","Mingxuan Yi","Mark Beaumont"],"pdf_url":"https://arxiv.org/pdf/2305.15577v3.pdf","comment":"This manuscript is an extended version of the ICML2024 version. The\n  code for reproducing our results can be found at\n  https://github.com/anewgithubname/gradest2"},{"id":"http://arxiv.org/abs/2406.01799v2","updated":"2024-06-06T13:29:48Z","published":"2024-06-03T21:40:59Z","title":"Online Control in Population Dynamics","summary":"  The study of population dynamics originated with early sociological works but\nhas since extended into many fields, including biology, epidemiology,\nevolutionary game theory, and economics. Most studies on population dynamics\nfocus on the problem of prediction rather than control. Existing mathematical\nmodels for control in population dynamics are often restricted to specific,\nnoise-free dynamics, while real-world population changes can be complex and\nadversarial.\n  To address this gap, we propose a new framework based on the paradigm of\nonline control. We first characterize a set of linear dynamical systems that\ncan naturally model evolving populations. We then give an efficient\ngradient-based controller for these systems, with near-optimal regret bounds\nwith respect to a broad class of linear policies. Our empirical evaluations\ndemonstrate the effectiveness of the proposed algorithm for control in\npopulation dynamics even for non-linear models such as SIR and replicator\ndynamics.\n","authors":["Noah Golowich","Elad Hazan","Zhou Lu","Dhruv Rohatgi","Y. Jennifer Sun"],"pdf_url":"https://arxiv.org/pdf/2406.01799v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.04056v1","updated":"2024-06-06T13:25:14Z","published":"2024-06-06T13:25:14Z","title":"Bisimulation Metrics are Optimal Transport Distances, and Can be\n  Computed Efficiently","summary":"  We propose a new framework for formulating optimal transport distances\nbetween Markov chains. Previously known formulations studied couplings between\nthe entire joint distribution induced by the chains, and derived solutions via\na reduction to dynamic programming (DP) in an appropriately defined Markov\ndecision process. This formulation has, however, not led to particularly\nefficient algorithms so far, since computing the associated DP operators\nrequires fully solving a static optimal transport problem, and these operators\nneed to be applied numerous times during the overall optimization process. In\nthis work, we develop an alternative perspective by considering couplings\nbetween a flattened version of the joint distributions that we call discounted\noccupancy couplings, and show that calculating optimal transport distances in\nthe full space of joint distributions can be equivalently formulated as solving\na linear program (LP) in this reduced space. This LP formulation allows us to\nport several algorithmic ideas from other areas of optimal transport theory. In\nparticular, our formulation makes it possible to introduce an appropriate\nnotion of entropy regularization into the optimization problem, which in turn\nenables us to directly calculate optimal transport distances via a\nSinkhorn-like method we call Sinkhorn Value Iteration (SVI). We show both\ntheoretically and empirically that this method converges quickly to an optimal\ncoupling, essentially at the same computational cost of running vanilla\nSinkhorn in each pair of states. Along the way, we point out that our optimal\ntransport distance exactly matches the common notion of bisimulation metrics\nbetween Markov chains, and thus our results also apply to computing such\nmetrics, and in fact our algorithm turns out to be significantly more efficient\nthan the best known methods developed so far for this purpose.\n","authors":["Sergio Calo","Anders Jonsson","Gergely Neu","Ludovic Schwartz","Javier Segovia"],"pdf_url":"https://arxiv.org/pdf/2406.04056v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.10195v3","updated":"2024-06-06T13:22:25Z","published":"2023-10-16T09:04:28Z","title":"AdaLomo: Low-memory Optimization with Adaptive Learning Rate","summary":"  Large language models have achieved remarkable success, but their extensive\nparameter size necessitates substantial memory for training, thereby setting a\nhigh threshold. While the recently proposed low-memory optimization (LOMO)\nreduces memory footprint, its optimization technique, akin to stochastic\ngradient descent, is sensitive to hyper-parameters and exhibits suboptimal\nconvergence, failing to match the performance of the prevailing optimizer for\nlarge language models, AdamW. Through empirical analysis of the Adam optimizer,\nwe found that, compared to momentum, the adaptive learning rate is more\ncritical for bridging the gap. Building on this insight, we introduce the\nlow-memory optimization with adaptive learning rate (AdaLomo), which offers an\nadaptive learning rate for each parameter. To maintain memory efficiency, we\nemploy non-negative matrix factorization for the second-order moment estimation\nin the optimizer state. Additionally, we suggest the use of a grouped update\nnormalization to stabilize convergence. Our experiments with instruction-tuning\nand further pre-training demonstrate that AdaLomo achieves results on par with\nAdamW, while significantly reducing memory requirements, thereby lowering the\nhardware barrier to training large language models. The code is accessible at\nhttps://github.com/OpenLMLab/LOMO.\n","authors":["Kai Lv","Hang Yan","Qipeng Guo","Haijun Lv","Xipeng Qiu"],"pdf_url":"https://arxiv.org/pdf/2310.10195v3.pdf","comment":"ACL 2024 camera ready version"},{"id":"http://arxiv.org/abs/2406.04055v1","updated":"2024-06-06T13:21:28Z","published":"2024-06-06T13:21:28Z","title":"Leveraging SPD Matrices on Riemannian Manifolds in Quantum Classical\n  Hybrid Models for Structural Health Monitoring","summary":"  Realtime finite element modeling of bridges assists modern structural health\nmonitoring systems by providing comprehensive insights into structural\nintegrity. This capability is essential for ensuring the safe operation of\nbridges and preventing sudden catastrophic failures. However, FEM computational\ncost and the need for realtime analysis pose significant challenges.\nAdditionally, the input data is a 7 dimensional vector, while the output is a\n1017 dimensional vector, making accurate and efficient analysis particularly\ndifficult. In this study, we propose a novel hybrid quantum classical\nMultilayer Perceptron pipeline leveraging Symmetric Positive Definite matrices\nand Riemannian manifolds for effective data representation. To maintain the\nintegrity of the qubit structure, we utilize SPD matrices, ensuring data\nrepresentation is well aligned with the quantum computational framework.\nAdditionally, the method leverages polynomial feature expansion to capture\nnonlinear relationships within the data. The proposed pipeline combines\nclassical fully connected neural network layers with quantum circuit layers to\nenhance model performance and efficiency. Our experiments focused on various\nconfigurations of such hybrid models to identify the optimal structure for\naccurate and efficient realtime analysis. The best performing model achieved a\nMean Squared Error of 0.00031, significantly outperforming traditional methods.\n","authors":["Azadeh Alavi","Sanduni Jayasinghe"],"pdf_url":"https://arxiv.org/pdf/2406.04055v1.pdf","comment":"3 pages, 1 figure"},{"id":"http://arxiv.org/abs/2406.04052v1","updated":"2024-06-06T13:17:44Z","published":"2024-06-06T13:17:44Z","title":"Multivector Neurons: Better and Faster O(n)-Equivariant Clifford Graph\n  Neural Networks","summary":"  Most current deep learning models equivariant to $O(n)$ or $SO(n)$ either\nconsider mostly scalar information such as distances and angles or have a very\nhigh computational complexity. In this work, we test a few novel message\npassing graph neural networks (GNNs) based on Clifford multivectors, structured\nsimilarly to other prevalent equivariant models in geometric deep learning. Our\napproach leverages efficient invariant scalar features while simultaneously\nperforming expressive learning on multivector representations, particularly\nthrough the use of the equivariant geometric product operator. By integrating\nthese elements, our methods outperform established efficient baseline models on\nan N-Body simulation task and protein denoising task while maintaining a high\nefficiency. In particular, we push the state-of-the-art error on the N-body\ndataset to 0.0035 (averaged over 3 runs); an 8% improvement over recent\nmethods. Our implementation is available on Github.\n","authors":["Cong Liu","David Ruhe","Patrick Forré"],"pdf_url":"https://arxiv.org/pdf/2406.04052v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.04047v1","updated":"2024-06-06T13:15:37Z","published":"2024-06-06T13:15:37Z","title":"Slicing Mutual Information Generalization Bounds for Neural Networks","summary":"  The ability of machine learning (ML) algorithms to generalize well to unseen\ndata has been studied through the lens of information theory, by bounding the\ngeneralization error with the input-output mutual information (MI), i.e., the\nMI between the training data and the learned hypothesis. Yet, these bounds have\nlimited practicality for modern ML applications (e.g., deep learning), due to\nthe difficulty of evaluating MI in high dimensions. Motivated by recent\nfindings on the compressibility of neural networks, we consider algorithms that\noperate by slicing the parameter space, i.e., trained on random\nlower-dimensional subspaces. We introduce new, tighter information-theoretic\ngeneralization bounds tailored for such algorithms, demonstrating that slicing\nimproves generalization. Our bounds offer significant computational and\nstatistical advantages over standard MI bounds, as they rely on scalable\nalternative measures of dependence, i.e., disintegrated mutual information and\n$k$-sliced mutual information. Then, we extend our analysis to algorithms whose\nparameters do not need to exactly lie on random subspaces, by leveraging\nrate-distortion theory. This strategy yields generalization bounds that\nincorporate a distortion term measuring model compressibility under slicing,\nthereby tightening existing bounds without compromising performance or\nrequiring model compression. Building on this, we propose a regularization\nscheme enabling practitioners to control generalization through\ncompressibility. Finally, we empirically validate our results and achieve the\ncomputation of non-vacuous information-theoretic generalization bounds for\nneural networks, a task that was previously out of reach.\n","authors":["Kimia Nadjahi","Kristjan Greenewald","Rickard Brüel Gabrielsson","Justin Solomon"],"pdf_url":"https://arxiv.org/pdf/2406.04047v1.pdf","comment":"Accepted at ICML 2024"},{"id":"http://arxiv.org/abs/2406.02875v2","updated":"2024-06-06T13:13:35Z","published":"2024-06-05T02:50:27Z","title":"Leveraging KANs For Enhanced Deep Koopman Operator Discovery","summary":"  Multi-layer perceptrons (MLP's) have been extensively utilized in discovering\nDeep Koopman operators for linearizing nonlinear dynamics. With the emergence\nof Kolmogorov-Arnold Networks (KANs) as a more efficient and accurate\nalternative to the MLP Neural Network, we propose a comparison of the\nperformance of each network type in the context of learning Koopman operators\nwith control. In this work, we propose a KANs-based deep Koopman framework with\napplications to an orbital Two-Body Problem (2BP) and the pendulum for\ndata-driven discovery of linear system dynamics. KANs were found to be superior\nin nearly all aspects of training; learning 31 times faster, being 15 times\nmore parameter efficiency, and predicting 1.25 times more accurately as\ncompared to the MLP Deep Neural Networks (DNNs) in the case of the 2BP. Thus,\nKANs shows potential for being an efficient tool in the development of Deep\nKoopman Theory.\n","authors":["George Nehma","Madhur Tiwari"],"pdf_url":"https://arxiv.org/pdf/2406.02875v2.pdf","comment":"6 pages, 4 figures, 2 tables"},{"id":"http://arxiv.org/abs/2406.04043v1","updated":"2024-06-06T13:13:29Z","published":"2024-06-06T13:13:29Z","title":"Energy-based Epistemic Uncertainty for Graph Neural Networks","summary":"  In domains with interdependent data, such as graphs, quantifying the\nepistemic uncertainty of a Graph Neural Network (GNN) is challenging as\nuncertainty can arise at different structural scales. Existing techniques\nneglect this issue or only distinguish between structure-aware and\nstructure-agnostic uncertainty without combining them into a single measure. We\npropose GEBM, an energy-based model (EBM) that provides high-quality\nuncertainty estimates by aggregating energy at different structural levels that\nnaturally arise from graph diffusion. In contrast to logit-based EBMs, we\nprovably induce an integrable density in the data space by regularizing the\nenergy function. We introduce an evidential interpretation of our EBM that\nsignificantly improves the predictive robustness of the GNN. Our framework is a\nsimple and effective post hoc method applicable to any pre-trained GNN that is\nsensitive to various distribution shifts. It consistently achieves the best\nseparation of in-distribution and out-of-distribution data on 6 out of 7\nanomaly types while having the best average rank over shifts on \\emph{all}\ndatasets.\n","authors":["Dominik Fuchsgruber","Tom Wollschläger","Stephan Günnemann"],"pdf_url":"https://arxiv.org/pdf/2406.04043v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.04041v1","updated":"2024-06-06T13:10:37Z","published":"2024-06-06T13:10:37Z","title":"Linear Opinion Pooling for Uncertainty Quantification on Graphs","summary":"  We address the problem of uncertainty quantification for graph-structured\ndata, or, more specifically, the problem to quantify the predictive uncertainty\nin (semi-supervised) node classification. Key questions in this regard concern\nthe distinction between two different types of uncertainty, aleatoric and\nepistemic, and how to support uncertainty quantification by leveraging the\nstructural information provided by the graph topology. Challenging assumptions\nand postulates of state-of-the-art methods, we propose a novel approach that\nrepresents (epistemic) uncertainty in terms of mixtures of Dirichlet\ndistributions and refers to the established principle of linear opinion pooling\nfor propagating information between neighbored nodes in the graph. The\neffectiveness of this approach is demonstrated in a series of experiments on a\nvariety of graph-structured datasets.\n","authors":["Clemens Damke","Eyke Hüllermeier"],"pdf_url":"https://arxiv.org/pdf/2406.04041v1.pdf","comment":"Accepted for the 40th Conference on Uncertainty in Artificial\n  Intelligence (UAI 2024). Implementation available at\n  https://github.com/Cortys/gpn-extensions"},{"id":"http://arxiv.org/abs/2406.04039v1","updated":"2024-06-06T13:05:32Z","published":"2024-06-06T13:05:32Z","title":"Shaping History: Advanced Machine Learning Techniques for the Analysis\n  and Dating of Cuneiform Tablets over Three Millennia","summary":"  Cuneiform tablets, emerging in ancient Mesopotamia around the late fourth\nmillennium BCE, represent one of humanity's earliest writing systems.\nCharacterized by wedge-shaped marks on clay tablets, these artifacts provided\ninsight into Mesopotamian civilization across various domains. Traditionally,\nthe analysis and dating of these tablets rely on subjective assessment of shape\nand writing style, leading to uncertainties in pinpointing their exact temporal\norigins. Recent advances in digitization have revolutionized the study of\ncuneiform by enhancing accessibility and analytical capabilities. Our research\nuniquely focuses on the silhouette of tablets as significant indicators of\ntheir historical periods, diverging from most studies that concentrate on\ntextual content. Utilizing an unprecedented dataset of over 94,000 images from\nthe Cuneiform Digital Library Initiative collection, we apply deep learning\nmethods to classify cuneiform tablets, covering over 3,000 years of history. By\nleveraging statistical, computational techniques, and generative modeling\nthrough Variational Auto-Encoders (VAEs), we achieve substantial advancements\nin the automatic classification of these ancient documents, focusing on the\ntablets' silhouettes as key predictors. Our classification approach begins with\na Decision Tree using height-to-width ratios and culminates with a ResNet50\nmodel, achieving a 61% macro F1-score for tablet silhouettes. Moreover, we\nintroduce novel VAE-powered tools to enhance explainability and enable\nresearchers to explore changes in tablet shapes across different eras and\ngenres. This research contributes to document analysis and diplomatics by\ndemonstrating the value of large-scale data analysis combined with statistical\nmethods. These insights offer valuable tools for historians and epigraphists,\nenriching our understanding of cuneiform tablets and the cultures that produced\nthem.\n","authors":["Danielle Kapon","Michael Fire","Shai Gordin"],"pdf_url":"https://arxiv.org/pdf/2406.04039v1.pdf","comment":"24 pages, 18 figures"},{"id":"http://arxiv.org/abs/2406.04038v1","updated":"2024-06-06T13:04:43Z","published":"2024-06-06T13:04:43Z","title":"Road Network Representation Learning with the Third Law of Geography","summary":"  Road network representation learning aims to learn compressed and effective\nvectorized representations for road segments that are applicable to numerous\ntasks. In this paper, we identify the limitations of existing methods,\nparticularly their overemphasis on the distance effect as outlined in the First\nLaw of Geography. In response, we propose to endow road network representation\nwith the principles of the recent Third Law of Geography. To this end, we\npropose a novel graph contrastive learning framework that employs geographic\nconfiguration-aware graph augmentation and spectral negative sampling, ensuring\nthat road segments with similar geographic configurations yield similar\nrepresentations, and vice versa, aligning with the principles stated in the\nThird Law. The framework further fuses the Third Law with the First Law through\na dual contrastive learning objective to effectively balance the implications\nof both laws. We evaluate our framework on two real-world datasets across three\ndownstream tasks. The results show that the integration of the Third Law\nsignificantly improves the performance of road segment representations in\ndownstream tasks.\n","authors":["Haicang Zhou","Weiming Huang","Yile Chen","Tiantian He","Gao Cong","Yew-Soon Ong"],"pdf_url":"https://arxiv.org/pdf/2406.04038v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.04035v1","updated":"2024-06-06T13:03:51Z","published":"2024-06-06T13:03:51Z","title":"Spatio-temporal Early Prediction based on Multi-objective Reinforcement\n  Learning","summary":"  Accuracy and timeliness are indeed often conflicting goals in prediction\ntasks. Premature predictions may yield a higher rate of false alarms, whereas\ndelaying predictions to gather more information can render them too late to be\nuseful. In applications such as wildfires, crimes, and traffic jams, timely\npredictions are vital for safeguarding human life and property. Consequently,\nfinding a balance between accuracy and timeliness is crucial. In this paper, we\npropose a spatio-temporal early prediction model based on Multi-Objective\nreinforcement learning that can either implement an optimal policy given a\npreference or infer the preference based on a small number of samples. The\nmodel addresses two primary challenges: 1) enhancing the accuracy of early\npredictions and 2) providing the optimal policy for determining the most\nsuitable prediction time for each area. Our method demonstrates superior\nperformance on three large-scale real-world datasets, surpassing existing\nmethods in early spatio-temporal prediction tasks.\n","authors":["Wei Shao","Yufan Kang","Ziyan Peng","Xiao Xiao","Lei Wang","Yuhui Yang","Flora D Salim"],"pdf_url":"https://arxiv.org/pdf/2406.04035v1.pdf","comment":"Conference"},{"id":"http://arxiv.org/abs/2406.04029v1","updated":"2024-06-06T12:59:46Z","published":"2024-06-06T12:59:46Z","title":"Pre-trained Transformer Uncovers Meaningful Patterns in Human Mobility\n  Data","summary":"  We empirically demonstrate that a transformer pre-trained on country-scale\nunlabeled human mobility data learns embeddings capable, through fine-tuning,\nof developing a deep understanding of the target geography and its\ncorresponding mobility patterns. Utilizing an adaptation framework, we evaluate\nthe performance of our pre-trained embeddings in encapsulating a broad spectrum\nof concepts directly and indirectly related to human mobility. This includes\nbasic notions, such as geographic location and distance, and extends to more\ncomplex constructs, such as administrative divisions and land cover. Our\nextensive empirical analysis reveals a substantial performance boost gained\nfrom pre-training, reaching up to 38% in tasks such as tree-cover regression.\nWe attribute this result to the ability of the pre-training to uncover\nmeaningful patterns hidden in the raw data, beneficial for modeling relevant\nhigh-level concepts. The pre-trained embeddings emerge as robust\nrepresentations of regions and trajectories, potentially valuable for a wide\nrange of downstream applications.\n","authors":["Alameen Najjar"],"pdf_url":"https://arxiv.org/pdf/2406.04029v1.pdf","comment":"10 pages, 12 figures, 14 tables"},{"id":"http://arxiv.org/abs/2406.03154v2","updated":"2024-06-06T12:58:17Z","published":"2024-06-05T11:30:16Z","title":"Detecting Model Misspecification in Amortized Bayesian Inference with\n  Neural Networks: An Extended Investigation","summary":"  Recent advances in probabilistic deep learning enable efficient amortized\nBayesian inference in settings where the likelihood function is only implicitly\ndefined by a simulation program (simulation-based inference; SBI). But how\nfaithful is such inference if the simulation represents reality somewhat\ninaccurately, that is, if the true system behavior at test time deviates from\nthe one seen during training? We conceptualize the types of such model\nmisspecification arising in SBI and systematically investigate how the\nperformance of neural posterior approximators gradually deteriorates as a\nconsequence, making inference results less and less trustworthy. To notify\nusers about this problem, we propose a new misspecification measure that can be\ntrained in an unsupervised fashion (i.e., without training data from the true\ndistribution) and reliably detects model misspecification at test time. Our\nexperiments clearly demonstrate the utility of our new measure both on toy\nexamples with an analytical ground-truth and on representative scientific tasks\nin cell biology, cognitive decision making, disease outbreak dynamics, and\ncomputer vision. We show how the proposed misspecification test warns users\nabout suspicious outputs, raises an alarm when predictions are not trustworthy,\nand guides model designers in their search for better simulators.\n","authors":["Marvin Schmitt","Paul-Christian Bürkner","Ullrich Köthe","Stefan T. Radev"],"pdf_url":"https://arxiv.org/pdf/2406.03154v2.pdf","comment":"Extended version of the conference paper\n  https://doi.org/10.1007/978-3-031-54605-1_35. arXiv admin note: text overlap\n  with arXiv:2112.08866"},{"id":"http://arxiv.org/abs/2402.12343v4","updated":"2024-06-06T12:54:48Z","published":"2024-02-19T18:16:51Z","title":"Emulated Disalignment: Safety Alignment for Large Language Models May\n  Backfire!","summary":"  Large language models (LLMs) undergo safety alignment to ensure safe\nconversations with humans. However, this paper introduces a training-free\nattack method capable of reversing safety alignment, converting the outcomes of\nstronger alignment into greater potential for harm by accessing only LLM output\ntoken distributions. Specifically, our method achieves this reversal by\ncontrasting the output token distribution of a safety-aligned language model\n(e.g., Llama-2-chat) against its pre-trained version (e.g., Llama-2), so that\nthe token predictions are shifted towards the opposite direction of safety\nalignment. We name this method emulated disalignment (ED) because sampling from\nthis contrastive distribution provably emulates the result of fine-tuning to\nminimize a safety reward. Our experiments with ED across three evaluation\ndatasets and four model families (Llama-1, Llama-2, Mistral, and Alpaca) show\nthat ED doubles the harmfulness of pre-trained models and outperforms strong\nbaselines, achieving the highest harmful rates in 43 out of 48 evaluation\nsubsets by a large margin. Eventually, given ED's reliance on language model\noutput token distributions, which particularly compromises open-source models,\nour findings highlight the need to reassess the open accessibility of language\nmodels, even if they have been safety-aligned. Code is available at\nhttps://github.com/ZHZisZZ/emulated-disalignment.\n","authors":["Zhanhui Zhou","Jie Liu","Zhichen Dong","Jiaheng Liu","Chao Yang","Wanli Ouyang","Yu Qiao"],"pdf_url":"https://arxiv.org/pdf/2402.12343v4.pdf","comment":"ACL 2024"},{"id":"http://arxiv.org/abs/2303.07139v2","updated":"2024-06-06T12:46:20Z","published":"2023-03-13T14:05:19Z","title":"Comparing statistical and machine learning methods for time series\n  forecasting in data-driven logistics -- A simulation study","summary":"  Many planning and decision activities in logistics and supply chain\nmanagement are based on forecasts of multiple time dependent factors.\nTherefore, the quality of planning depends on the quality of the forecasts. We\ncompare various forecasting methods in terms of out of the box forecasting\nperformance on a broad set of simulated time series. We simulate various linear\nand non-linear time series and look at the one step forecast performance of\nstatistical learning methods.\n","authors":["Lena Schmid","Moritz Roidl","Markus Pauly"],"pdf_url":"https://arxiv.org/pdf/2303.07139v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.14461v2","updated":"2024-06-06T12:45:52Z","published":"2024-04-22T05:08:53Z","title":"Competition Report: Finding Universal Jailbreak Backdoors in Aligned\n  LLMs","summary":"  Large language models are aligned to be safe, preventing users from\ngenerating harmful content like misinformation or instructions for illegal\nactivities. However, previous work has shown that the alignment process is\nvulnerable to poisoning attacks. Adversaries can manipulate the safety training\ndata to inject backdoors that act like a universal sudo command: adding the\nbackdoor string to any prompt enables harmful responses from models that,\notherwise, behave safely. Our competition, co-located at IEEE SaTML 2024,\nchallenged participants to find universal backdoors in several large language\nmodels. This report summarizes the key findings and promising ideas for future\nresearch.\n","authors":["Javier Rando","Francesco Croce","Kryštof Mitka","Stepan Shabalin","Maksym Andriushchenko","Nicolas Flammarion","Florian Tramèr"],"pdf_url":"https://arxiv.org/pdf/2404.14461v2.pdf","comment":"Competition Report"},{"id":"http://arxiv.org/abs/2406.04012v1","updated":"2024-06-06T12:38:59Z","published":"2024-06-06T12:38:59Z","title":"Variational inference, Mixture of Gaussians, Bayesian Machine Learning","summary":"  Variational inference (VI) is a popular approach in Bayesian inference, that\nlooks for the best approximation of the posterior distribution within a\nparametric family, minimizing a loss that is typically the (reverse)\nKullback-Leibler (KL) divergence. Despite its empirical success, the\ntheoretical properties of VI have only received attention recently, and mostly\nwhen the parametric family is the one of Gaussians. This work aims to\ncontribute to the theoretical study of VI in the non-Gaussian case by\ninvestigating the setting of Mixture of Gaussians with fixed covariance and\nconstant weights. In this view, VI over this specific family can be casted as\nthe minimization of a Mollified relative entropy, i.e. the KL between the\nconvolution (with respect to a Gaussian kernel) of an atomic measure supported\non Diracs, and the target distribution. The support of the atomic measure\ncorresponds to the localization of the Gaussian components. Hence, solving\nvariational inference becomes equivalent to optimizing the positions of the\nDiracs (the particles), which can be done through gradient descent and takes\nthe form of an interacting particle system. We study two sources of error of\nvariational inference in this context when optimizing the mollified relative\nentropy. The first one is an optimization result, that is a descent lemma\nestablishing that the algorithm decreases the objective at each iteration. The\nsecond one is an approximation error, that upper bounds the objective between\nan optimal finite mixture and the target distribution.\n","authors":["Tom Huix","Anna Korba","Alain Durmus","Eric Moulines"],"pdf_url":"https://arxiv.org/pdf/2406.04012v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.06844v3","updated":"2024-06-06T12:38:50Z","published":"2023-06-12T03:35:45Z","title":"Provably Efficient Bayesian Optimization with Unknown Gaussian Process\n  Hyperparameter Estimation","summary":"  Gaussian process (GP) based Bayesian optimization (BO) is a powerful method\nfor optimizing black-box functions efficiently. The practical performance and\ntheoretical guarantees of this approach depend on having the correct GP\nhyperparameter values, which are usually unknown in advance and need to be\nestimated from the observed data. However, in practice, these estimations could\nbe incorrect due to biased data sampling strategies used in BO. This can lead\nto degraded performance and break the sub-linear global convergence guarantee\nof BO. To address this issue, we propose a new BO method that can sub-linearly\nconverge to the objective function's global optimum even when the true GP\nhyperparameters are unknown in advance and need to be estimated from the\nobserved data. Our method uses a multi-armed bandit technique (EXP3) to add\nrandom data points to the BO process, and employs a novel training loss\nfunction for the GP hyperparameter estimation process that ensures consistent\nestimation. We further provide theoretical analysis of our proposed method.\nFinally, we demonstrate empirically that our method outperforms existing\napproaches on various synthetic and real-world problems.\n","authors":["Huong Ha","Vu Nguyen","Hung Tran-The","Hongyu Zhang","Xiuzhen Zhang","Anton van den Hengel"],"pdf_url":"https://arxiv.org/pdf/2306.06844v3.pdf","comment":"25 pages, 5 figures"},{"id":"http://arxiv.org/abs/2302.05372v3","updated":"2024-06-06T12:32:02Z","published":"2023-02-10T16:50:40Z","title":"Towards Minimax Optimality of Model-based Robust Reinforcement Learning","summary":"  We study the sample complexity of obtaining an $\\epsilon$-optimal policy in\n\\emph{Robust} discounted Markov Decision Processes (RMDPs), given only access\nto a generative model of the nominal kernel. This problem is widely studied in\nthe non-robust case, and it is known that any planning approach applied to an\nempirical MDP estimated with $\\tilde{\\mathcal{O}}(\\frac{H^3 \\mid S \\mid\\mid A\n\\mid}{\\epsilon^2})$ samples provides an $\\epsilon$-optimal policy, which is\nminimax optimal. Results in the robust case are much more scarce. For $sa$-\n(resp $s$-)rectangular uncertainty sets, the best known sample complexity is\n$\\tilde{\\mathcal{O}}(\\frac{H^4 \\mid S \\mid^2\\mid A \\mid}{\\epsilon^2})$ (resp.\n$\\tilde{\\mathcal{O}}(\\frac{H^4 \\mid S \\mid^2\\mid A \\mid^2}{\\epsilon^2})$), for\nspecific algorithms and when the uncertainty set is based on the total\nvariation (TV), the KL or the Chi-square divergences. In this paper, we\nconsider uncertainty sets defined with an $L_p$-ball (recovering the TV case),\nand study the sample complexity of \\emph{any} planning algorithm (with high\naccuracy guarantee on the solution) applied to an empirical RMDP estimated\nusing the generative model. In the general case, we prove a sample complexity\nof $\\tilde{\\mathcal{O}}(\\frac{H^4 \\mid S \\mid\\mid A \\mid}{\\epsilon^2})$ for\nboth the $sa$- and $s$-rectangular cases (improvements of $\\mid S \\mid$ and\n$\\mid S \\mid\\mid A \\mid$ respectively). When the size of the uncertainty is\nsmall enough, we improve the sample complexity to\n$\\tilde{\\mathcal{O}}(\\frac{H^3 \\mid S \\mid\\mid A \\mid }{\\epsilon^2})$,\nrecovering the lower-bound for the non-robust case for the first time and a\nrobust lower-bound when the size of the uncertainty is small enough.\n","authors":["Pierre Clavier","Erwan Le Pennec","Matthieu Geist"],"pdf_url":"https://arxiv.org/pdf/2302.05372v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.07245v2","updated":"2024-06-06T12:30:49Z","published":"2024-03-12T02:05:06Z","title":"Dataset Condensation for Time Series Classification via Dual Domain\n  Matching","summary":"  Time series data has been demonstrated to be crucial in various research\nfields. The management of large quantities of time series data presents\nchallenges in terms of deep learning tasks, particularly for training a deep\nneural network. Recently, a technique named \\textit{Dataset Condensation} has\nemerged as a solution to this problem. This technique generates a smaller\nsynthetic dataset that has comparable performance to the full real dataset in\ndownstream tasks such as classification. However, previous methods are\nprimarily designed for image and graph datasets, and directly adapting them to\nthe time series dataset leads to suboptimal performance due to their inability\nto effectively leverage the rich information inherent in time series data,\nparticularly in the frequency domain. In this paper, we propose a novel\nframework named Dataset \\textit{\\textbf{Cond}}ensation for\n\\textit{\\textbf{T}}ime \\textit{\\textbf{S}}eries\n\\textit{\\textbf{C}}lassification via Dual Domain Matching (\\textbf{CondTSC})\nwhich focuses on the time series classification dataset condensation task.\nDifferent from previous methods, our proposed framework aims to generate a\ncondensed dataset that matches the surrogate objectives in both the time and\nfrequency domains. Specifically, CondTSC incorporates multi-view data\naugmentation, dual domain training, and dual surrogate objectives to enhance\nthe dataset condensation process in the time and frequency domains. Through\nextensive experiments, we demonstrate the effectiveness of our proposed\nframework, which outperforms other baselines and learns a condensed synthetic\ndataset that exhibits desirable characteristics such as conforming to the\ndistribution of the original data.\n","authors":["Zhanyu Liu","Ke Hao","Guanjie Zheng","Yanwei Yu"],"pdf_url":"https://arxiv.org/pdf/2403.07245v2.pdf","comment":"Accepted by KDD 2024 research track"},{"id":"http://arxiv.org/abs/2405.09005v2","updated":"2024-06-06T12:29:48Z","published":"2024-05-15T00:13:18Z","title":"Cons-training tensor networks","summary":"  In this study, we introduce a novel family of tensor networks, termed\n\\textit{constrained matrix product states} (MPS), designed to incorporate\nexactly arbitrary discrete linear constraints, including inequalities, into\nsparse block structures. These tensor networks are particularly tailored for\nmodeling distributions with support strictly over the feasible space, offering\nbenefits such as reducing the search space in optimization problems,\nalleviating overfitting, improving training efficiency, and decreasing model\nsize. Central to our approach is the concept of a quantum region, an extension\nof quantum numbers traditionally used in U(1) symmetric tensor networks,\nadapted to capture any linear constraint, including the unconstrained scenario.\nWe further develop a novel canonical form for these new MPS, which allow for\nthe merging and factorization of tensor blocks according to quantum region\nfusion rules and permit optimal truncation schemes. Utilizing this canonical\nform, we apply an unsupervised training strategy to optimize arbitrary\nobjective functions subject to discrete linear constraints. Our method's\nefficacy is demonstrated by solving the quadratic knapsack problem, achieving\nsuperior performance compared to a leading nonlinear integer programming\nsolver. Additionally, we analyze the complexity and scalability of our\napproach, demonstrating its potential in addressing complex constrained\ncombinatorial optimization problems.\n","authors":["Javier Lopez-Piqueres","Jing Chen"],"pdf_url":"https://arxiv.org/pdf/2405.09005v2.pdf","comment":"v2: mostly improved Fig 1 and 13 for clarity, improved exposition of\n  ideas, and fixed a couple of transcription bugs in the pseudo algo. 3"},{"id":"http://arxiv.org/abs/2402.06700v4","updated":"2024-06-06T12:29:23Z","published":"2024-02-09T07:45:26Z","title":"Entropy-Regularized Token-Level Policy Optimization for Language Agent\n  Reinforcement","summary":"  Large Language Models (LLMs) have shown promise as intelligent agents in\ninteractive decision-making tasks. Traditional approaches often depend on\nmeticulously designed prompts, high-quality examples, or additional reward\nmodels for in-context learning, supervised fine-tuning, or RLHF. Reinforcement\nlearning (RL) presents a dynamic alternative for LLMs to overcome these\ndependencies by engaging directly with task-specific environments. Nonetheless,\nit faces significant hurdles: 1) instability stemming from the exponentially\nvast action space requiring exploration; 2) challenges in assigning token-level\ncredit based on action-level reward signals, resulting in discord between\nmaximizing rewards and accurately modeling corpus data. In response to these\nchallenges, we introduce Entropy-Regularized Token-level Policy Optimization\n(ETPO), an entropy-augmented RL method tailored for optimizing LLMs at the\ntoken level. At the heart of ETPO is our novel per-token soft Bellman update,\ndesigned to harmonize the RL process with the principles of language modeling.\nThis methodology decomposes the Q-function update from a coarse action-level\nview to a more granular token-level perspective, backed by theoretical proof of\noptimization consistency. Crucially, this decomposition renders linear time\ncomplexity in action exploration. We assess the effectiveness of ETPO within a\nsimulated environment that models data science code generation as a series of\nmulti-step interactive tasks; results underline ETPO's potential as a robust\nmethod for refining the interactive decision-making capabilities of language\nagents. For a more detailed preliminary work describing our motivation for\ntoken-level decomposition and applying it in PPO methods, please refer to\narXiv:2405.15821.\n","authors":["Muning Wen","Junwei Liao","Cheng Deng","Jun Wang","Weinan Zhang","Ying Wen"],"pdf_url":"https://arxiv.org/pdf/2402.06700v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.03169v2","updated":"2024-06-06T12:22:18Z","published":"2024-02-05T16:38:30Z","title":"A Random Matrix Approach to Low-Multilinear-Rank Tensor Approximation","summary":"  This work presents a comprehensive understanding of the estimation of a\nplanted low-rank signal from a general spiked tensor model near the\ncomputational threshold. Relying on standard tools from the theory of large\nrandom matrices, we characterize the large-dimensional spectral behavior of the\nunfoldings of the data tensor and exhibit relevant signal-to-noise ratios\ngoverning the detectability of the principal directions of the signal. These\nresults allow to accurately predict the reconstruction performance of truncated\nmultilinear SVD (MLSVD) in the non-trivial regime. This is particularly\nimportant since it serves as an initialization of the higher-order orthogonal\niteration (HOOI) scheme, whose convergence to the best low-multilinear-rank\napproximation depends entirely on its initialization. We give a sufficient\ncondition for the convergence of HOOI and show that the number of iterations\nbefore convergence tends to $1$ in the large-dimensional limit.\n","authors":["Hugo Lebeau","Florent Chatelain","Romain Couillet"],"pdf_url":"https://arxiv.org/pdf/2402.03169v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.18924v2","updated":"2024-06-06T12:20:48Z","published":"2023-10-29T07:32:32Z","title":"Remaining useful life prediction of Lithium-ion batteries using\n  spatio-temporal multimodal attention networks","summary":"  Lithium-ion batteries are widely used in various applications, including\nelectric vehicles and renewable energy storage. The prediction of the remaining\nuseful life (RUL) of batteries is crucial for ensuring reliable and efficient\noperation, as well as reducing maintenance costs. However, determining the life\ncycle of batteries in real-world scenarios is challenging, and existing methods\nhave limitations in predicting the number of cycles iteratively. In addition,\nexisting works often oversimplify the datasets, neglecting important features\nof the batteries such as temperature, internal resistance, and material type.\nTo address these limitations, this paper proposes a two-stage RUL prediction\nscheme for Lithium-ion batteries using a spatio-temporal multimodal attention\nnetwork (ST-MAN). The proposed ST-MAN is to capture the complex spatio-temporal\ndependencies in the battery data, including the features that are often\nneglected in existing works. Despite operating without prior knowledge of\nend-of-life (EOL) events, our method consistently achieves lower error rates,\nboasting mean absolute error (MAE) and mean square error (MSE) of 0.0275 and\n0.0014, respectively, compared to existing convolutional neural networks (CNN)\nand long short-term memory (LSTM)-based methods. The proposed method has the\npotential to improve the reliability and efficiency of battery operations and\nis applicable in various industries.\n","authors":["Sungho Suh","Dhruv Aditya Mittal","Hymalai Bello","Bo Zhou","Mayank Shekhar Jha","Paul Lukowicz"],"pdf_url":"https://arxiv.org/pdf/2310.18924v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.03999v1","updated":"2024-06-06T12:17:57Z","published":"2024-06-06T12:17:57Z","title":"Unveiling the Dynamics of Information Interplay in Supervised Learning","summary":"  In this paper, we use matrix information theory as an analytical tool to\nanalyze the dynamics of the information interplay between data representations\nand classification head vectors in the supervised learning process.\nSpecifically, inspired by the theory of Neural Collapse, we introduce matrix\nmutual information ratio (MIR) and matrix entropy difference ratio (HDR) to\nassess the interactions of data representation and class classification heads\nin supervised learning, and we determine the theoretical optimal values for MIR\nand HDR when Neural Collapse happens. Our experiments show that MIR and HDR can\neffectively explain many phenomena occurring in neural networks, for example,\nthe standard supervised training dynamics, linear mode connectivity, and the\nperformance of label smoothing and pruning. Additionally, we use MIR and HDR to\ngain insights into the dynamics of grokking, which is an intriguing phenomenon\nobserved in supervised training, where the model demonstrates generalization\ncapabilities long after it has learned to fit the training data. Furthermore,\nwe introduce MIR and HDR as loss terms in supervised and semi-supervised\nlearning to optimize the information interactions among samples and\nclassification heads. The empirical results provide evidence of the method's\neffectiveness, demonstrating that the utilization of MIR and HDR not only aids\nin comprehending the dynamics throughout the training process but can also\nenhances the training procedure itself.\n","authors":["Kun Song","Zhiquan Tan","Bochao Zou","Huimin Ma","Weiran Huang"],"pdf_url":"https://arxiv.org/pdf/2406.03999v1.pdf","comment":"Accepted by ICML 2024"},{"id":"http://arxiv.org/abs/2406.03997v1","updated":"2024-06-06T12:17:05Z","published":"2024-06-06T12:17:05Z","title":"HackAtari: Atari Learning Environments for Robust and Continual\n  Reinforcement Learning","summary":"  Artificial agents' adaptability to novelty and alignment with intended\nbehavior is crucial for their effective deployment. Reinforcement learning (RL)\nleverages novelty as a means of exploration, yet agents often struggle to\nhandle novel situations, hindering generalization. To address these issues, we\npropose HackAtari, a framework introducing controlled novelty to the most\ncommon RL benchmark, the Atari Learning Environment. HackAtari allows us to\ncreate novel game scenarios (including simplification for curriculum learning),\nto swap the game elements' colors, as well as to introduce different reward\nsignals for the agent. We demonstrate that current agents trained on the\noriginal environments include robustness failures, and evaluate HackAtari's\nefficacy in enhancing RL agents' robustness and aligning behavior through\nexperiments using C51 and PPO. Overall, HackAtari can be used to improve the\nrobustness of current and future RL algorithms, allowing Neuro-Symbolic RL,\ncurriculum RL, causal RL, as well as LLM-driven RL. Our work underscores the\nsignificance of developing interpretable in RL agents.\n","authors":["Quentin Delfosse","Jannis Blüml","Bjarne Gregori","Kristian Kersting"],"pdf_url":"https://arxiv.org/pdf/2406.03997v1.pdf","comment":"9 main pages, 4 pages references, 19 pages of appendix"},{"id":"http://arxiv.org/abs/2402.06733v3","updated":"2024-06-06T12:16:55Z","published":"2024-02-09T19:09:19Z","title":"NICE: To Optimize In-Context Examples or Not?","summary":"  Recent work shows that in-context learning and optimization of in-context\nexamples (ICE) can significantly improve the accuracy of large language models\n(LLMs) on a wide range of tasks, leading to an apparent consensus that ICE\noptimization is crucial for better performance. However, most of these studies\nassume a fixed or no instruction provided in the prompt. We challenge this\nconsensus by investigating the necessity of optimizing ICE when task-specific\ninstructions are provided and find that there are many tasks for which it\nyields diminishing returns. In particular, using a diverse set of tasks and a\nsystematically created instruction set with gradually added details, we find\nthat as the prompt instruction becomes more detailed, the returns on ICE\noptimization diminish. To characterize this behavior, we introduce a\ntask-specific metric called Normalized Invariability to Choice of Examples\n(NICE) that quantifies the learnability of tasks from a given instruction, and\nprovides a heuristic to help decide whether to optimize instructions or ICE for\na new task. Given a task, the proposed metric can reliably predict the utility\nof optimizing ICE compared to using random ICE. Our code is available at\nhttps://github.com/microsoft/nice-icl.\n","authors":["Pragya Srivastava","Satvik Golechha","Amit Deshpande","Amit Sharma"],"pdf_url":"https://arxiv.org/pdf/2402.06733v3.pdf","comment":"Accepted as a full paper (9 pages) at ACL 2024 (Main)"},{"id":"http://arxiv.org/abs/2206.07438v3","updated":"2024-06-06T12:14:36Z","published":"2022-06-15T10:23:19Z","title":"Multi-Objective Hyperparameter Optimization in Machine Learning -- An\n  Overview","summary":"  Hyperparameter optimization constitutes a large part of typical modern\nmachine learning workflows. This arises from the fact that machine learning\nmethods and corresponding preprocessing steps often only yield optimal\nperformance when hyperparameters are properly tuned. But in many applications,\nwe are not only interested in optimizing ML pipelines solely for predictive\naccuracy; additional metrics or constraints must be considered when determining\nan optimal configuration, resulting in a multi-objective optimization problem.\nThis is often neglected in practice, due to a lack of knowledge and readily\navailable software implementations for multi-objective hyperparameter\noptimization. In this work, we introduce the reader to the basics of\nmulti-objective hyperparameter optimization and motivate its usefulness in\napplied ML. Furthermore, we provide an extensive survey of existing\noptimization strategies, both from the domain of evolutionary algorithms and\nBayesian optimization. We illustrate the utility of MOO in several specific ML\napplications, considering objectives such as operating conditions, prediction\ntime, sparseness, fairness, interpretability and robustness.\n","authors":["Florian Karl","Tobias Pielok","Julia Moosbauer","Florian Pfisterer","Stefan Coors","Martin Binder","Lennart Schneider","Janek Thomas","Jakob Richter","Michel Lang","Eduardo C. Garrido-Merchán","Juergen Branke","Bernd Bischl"],"pdf_url":"https://arxiv.org/pdf/2206.07438v3.pdf","comment":"Published at ACM TELO"},{"id":"http://arxiv.org/abs/2402.10571v2","updated":"2024-06-06T12:02:37Z","published":"2024-02-16T10:55:38Z","title":"Direct Preference Optimization with an Offset","summary":"  Direct preference optimization (DPO) is a successful fine-tuning strategy for\naligning large language models with human preferences without the need to train\na reward model or employ reinforcement learning. DPO, as originally formulated,\nrelies on binary preference data and fine-tunes a language model to increase\nthe likelihood of a preferred response over a dispreferred response. However,\nnot all preference pairs are equal. Sometimes, the preferred response is only\nslightly better than the dispreferred one. In other cases, the preference is\nmuch stronger. For instance, if a response contains harmful or toxic content,\nthe annotator will have a strong preference for that response. In this paper,\nwe propose a generalization of DPO, termed DPO with an offset (ODPO), that does\nnot treat every preference pair equally during fine-tuning. Intuitively, ODPO\nrequires the difference between the likelihood of the preferred and\ndispreferred response to be greater than an offset value. The offset is\ndetermined based on the extent to which one response is preferred over another.\nOur experiments on various tasks suggest that ODPO significantly outperforms\nDPO in aligning language models, especially when the number of preference pairs\nis limited.\n","authors":["Afra Amini","Tim Vieira","Ryan Cotterell"],"pdf_url":"https://arxiv.org/pdf/2402.10571v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.03980v1","updated":"2024-06-06T11:51:12Z","published":"2024-06-06T11:51:12Z","title":"Position: Embracing Negative Results in Machine Learning","summary":"  Publications proposing novel machine learning methods are often primarily\nrated by exhibited predictive performance on selected problems. In this\nposition paper we argue that predictive performance alone is not a good\nindicator for the worth of a publication. Using it as such even fosters\nproblems like inefficiencies of the machine learning research community as a\nwhole and setting wrong incentives for researchers. We therefore put out a call\nfor the publication of \"negative\" results, which can help alleviate some of\nthese problems and improve the scientific output of the machine learning\nresearch community. To substantiate our position, we present the advantages of\npublishing negative results and provide concrete measures for the community to\nmove towards a paradigm where their publication is normalized.\n","authors":["Florian Karl","Lukas Malte Kemeter","Gabriel Dax","Paulina Sierak"],"pdf_url":"https://arxiv.org/pdf/2406.03980v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.03978v1","updated":"2024-06-06T11:42:33Z","published":"2024-06-06T11:42:33Z","title":"Mini Honor of Kings: A Lightweight Environment for Multi-Agent\n  Reinforcement Learning","summary":"  Games are widely used as research environments for multi-agent reinforcement\nlearning (MARL), but they pose three significant challenges: limited\ncustomization, high computational demands, and oversimplification. To address\nthese issues, we introduce the first publicly available map editor for the\npopular mobile game Honor of Kings and design a lightweight environment, Mini\nHonor of Kings (Mini HoK), for researchers to conduct experiments. Mini HoK is\nhighly efficient, allowing experiments to be run on personal PCs or laptops\nwhile still presenting sufficient challenges for existing MARL algorithms. We\nhave tested our environment on common MARL algorithms and demonstrated that\nthese algorithms have yet to find optimal solutions within this environment.\nThis facilitates the dissemination and advancement of MARL methods within the\nresearch community. Additionally, we hope that more researchers will leverage\nthe Honor of Kings map editor to develop innovative and scientifically valuable\nnew maps. Our code and user manual are available at:\nhttps://github.com/tencent-ailab/mini-hok.\n","authors":["Lin Liu","Jian Zhao","Cheng Hu","Zhengtao Cao","Youpeng Zhao","Zhenbin Ye","Meng Meng","Wenjun Wang","Zhaofeng He","Houqiang Li","Xia Lin","Lanxiao Huang"],"pdf_url":"https://arxiv.org/pdf/2406.03978v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.09871v2","updated":"2024-06-06T11:42:24Z","published":"2024-03-14T21:01:06Z","title":"ThermoHands: A Benchmark for 3D Hand Pose Estimation from Egocentric\n  Thermal Images","summary":"  In this work, we present ThermoHands, a new benchmark for thermal image-based\negocentric 3D hand pose estimation, aimed at overcoming challenges like varying\nlighting conditions and obstructions (e.g., handwear). The benchmark includes a\nmulti-view and multi-spectral dataset collected from 28 subjects performing\nhand-object and hand-virtual interactions under diverse scenarios, accurately\nannotated with 3D hand poses through an automated process. We introduce a new\nbaseline method, TherFormer, utilizing dual transformer modules for effective\negocentric 3D hand pose estimation in thermal imagery. Our experimental results\nhighlight TherFormer's leading performance and affirm thermal imaging's\neffectiveness in enabling robust 3D hand pose estimation in adverse conditions.\n","authors":["Fangqiang Ding","Yunzhou Zhu","Xiangyu Wen","Gaowen Liu","Chris Xiaoxuan Lu"],"pdf_url":"https://arxiv.org/pdf/2403.09871v2.pdf","comment":"15 pages, 6 figures, 4 tables"},{"id":"http://arxiv.org/abs/2402.00759v3","updated":"2024-06-06T11:25:34Z","published":"2024-02-01T16:49:27Z","title":"Building Expressive and Tractable Probabilistic Generative Models: A\n  Review","summary":"  We present a comprehensive survey of the advancements and techniques in the\nfield of tractable probabilistic generative modeling, primarily focusing on\nProbabilistic Circuits (PCs). We provide a unified perspective on the inherent\ntrade-offs between expressivity and tractability, highlighting the design\nprinciples and algorithmic extensions that have enabled building expressive and\nefficient PCs, and provide a taxonomy of the field. We also discuss recent\nefforts to build deep and hybrid PCs by fusing notions from deep neural models,\nand outline the challenges and open questions that can guide future research in\nthis evolving field.\n","authors":["Sahil Sidheekh","Sriraam Natarajan"],"pdf_url":"https://arxiv.org/pdf/2402.00759v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.17234v3","updated":"2024-06-06T11:18:17Z","published":"2024-05-27T14:50:42Z","title":"Benchmarking General Purpose In-Context Learning","summary":"  In-context learning (ICL) is becoming increasingly appealing to the AI\ncommunity due to its flexibility, generality, sample efficiency, and exemption\nfrom artificial optimization skills. It is desirable to further enhance the\ngenerality and capability of ICL, which gives rise to the concept of\ngeneral-purpose in-context learning (GPICL). We aim to extend ICL to address a\nbroader range of tasks with an extended learning horizon and higher improvement\npotential, albeit with relatively limited zero-shot generalization. To this\nend, we introduce two lightweight but insightful benchmarks specifically\ncrafted to train and evaluate GPICL functionalities. Each benchmark includes a\nvast number of tasks characterized by significant task variance, featuring\nminimal transferable knowledge among tasks. These tasks are designed to\nfacilitate lifelong in-context learning through continuous generation and\ninteraction. These features pose significant challenges for models that rely on\ncontext or interactions to improve their proficiency, including language\nmodels, decision models, and world models. Our experiments reveal that the\nscale of parameters alone may not be crucial for ICL or GPICL, suggesting\nalternative approaches such as increasing the scale of contexts and memory\nstates.\n","authors":["Fan Wang","Chuan Lin","Yang Cao","Yu Kang"],"pdf_url":"https://arxiv.org/pdf/2405.17234v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2207.12264v3","updated":"2024-06-06T11:17:31Z","published":"2022-07-25T15:35:48Z","title":"Dynamics and triggers of misinformation on vaccines","summary":"  The Covid-19 pandemic has sparked renewed attention on the prevalence of\nmisinformation online, whether intentional or not, underscoring the potential\nrisks posed to individuals' quality of life associated with the dissemination\nof misconceptions and enduring myths on health-related subjects. In this study,\nwe analyze 6 years (2016-2021) of Italian vaccine debate across diverse social\nmedia platforms (Facebook, Instagram, Twitter, YouTube), encompassing all major\nnews sources - both questionable and reliable. We first use the symbolic\ntransfer entropy analysis of news production time-series to dynamically\ndetermine which category of sources, questionable or reliable, causally drives\nthe agenda on vaccines. Then, leveraging deep learning models capable to\naccurately classify vaccine-related content based on the conveyed stance and\ndiscussed topic, respectively, we evaluate the focus on various topics by news\nsources promoting opposing views and compare the resulting user engagement.\nAside from providing valuable resources for further investigation of\nvaccine-related misinformation, particularly in a language (Italian) that\nreceives less attention in scientific research compared to languages like\nEnglish, our study uncovers misinformation not as a parasite of the news\necosystem that merely opposes the perspectives offered by mainstream media, but\nas an autonomous force capable of even overwhelming the production of\nvaccine-related content from the latter. While the pervasiveness of\nmisinformation is evident in the significantly higher engagement of\nquestionable sources compared to reliable ones, our findings underscore the\nimportance of consistent and thorough pro-vax coverage. This is especially\ncrucial in addressing the most sensitive topics where the risk of\nmisinformation spreading and potentially exacerbating negative attitudes toward\nvaccines among the users involved is higher.\n","authors":["Emanuele Brugnoli","Marco Delmastro"],"pdf_url":"https://arxiv.org/pdf/2207.12264v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.07891v3","updated":"2024-06-06T11:07:17Z","published":"2024-02-12T18:54:02Z","title":"Label-Efficient Model Selection for Text Generation","summary":"  Model selection for a given target task can be costly, as it may entail\nextensive annotation of the quality of outputs of different models. We\nintroduce DiffUse, an efficient method to make an informed decision between\ncandidate text generation models based on preference annotations. DiffUse\nreduces the required amount of annotations, thus saving valuable time and\nresources in performing evaluation. DiffUse intelligently selects instances by\nclustering embeddings that represent the semantic differences between model\noutputs. Thus, it is able to identify a subset of examples that are more\ninformative for preference decisions. Our method is model-agnostic, and can be\napplied to any text generation model for selecting between models, prompts and\nconfigurations. Moreover, we propose a practical iterative approach for\ndynamically determining how many instances to annotate. In a series of\nexperiments over hundreds of model pairs, we demonstrate that DiffUse can\ndramatically reduce the required number of annotations -- by up to 75% -- while\nmaintaining high evaluation reliability.\n","authors":["Shir Ashury-Tahan","Ariel Gera","Benjamin Sznajder","Leshem Choshen","Liat Ein-Dor","Eyal Shnarch"],"pdf_url":"https://arxiv.org/pdf/2402.07891v3.pdf","comment":"Accepted to ACL (main conference)"}],"Multimedia":[{"id":"http://arxiv.org/abs/2406.04321v1","updated":"2024-06-06T17:58:11Z","published":"2024-06-06T17:58:11Z","title":"VidMuse: A Simple Video-to-Music Generation Framework with\n  Long-Short-Term Modeling","summary":"  In this work, we systematically study music generation conditioned solely on\nthe video. First, we present a large-scale dataset comprising 190K video-music\npairs, including various genres such as movie trailers, advertisements, and\ndocumentaries. Furthermore, we propose VidMuse, a simple framework for\ngenerating music aligned with video inputs. VidMuse stands out by producing\nhigh-fidelity music that is both acoustically and semantically aligned with the\nvideo. By incorporating local and global visual cues, VidMuse enables the\ncreation of musically coherent audio tracks that consistently match the video\ncontent through Long-Short-Term modeling. Through extensive experiments,\nVidMuse outperforms existing models in terms of audio quality, diversity, and\naudio-visual alignment. The code and datasets will be available at\nhttps://github.com/ZeyueT/VidMuse/.\n","authors":["Zeyue Tian","Zhaoyang Liu","Ruibin Yuan","Jiahao Pan","Xiaoqiang Huang","Qifeng Liu","Xu Tan","Qifeng Chen","Wei Xue","Yike Guo"],"pdf_url":"https://arxiv.org/pdf/2406.04321v1.pdf","comment":"The code and datasets will be available at\n  https://github.com/ZeyueT/VidMuse/"},{"id":"http://arxiv.org/abs/2406.04309v1","updated":"2024-06-06T17:55:34Z","published":"2024-06-06T17:55:34Z","title":"ReFiNe: Recursive Field Networks for Cross-modal Multi-scene\n  Representation","summary":"  The common trade-offs of state-of-the-art methods for multi-shape\nrepresentation (a single model \"packing\" multiple objects) involve trading\nmodeling accuracy against memory and storage. We show how to encode multiple\nshapes represented as continuous neural fields with a higher degree of\nprecision than previously possible and with low memory usage. Key to our\napproach is a recursive hierarchical formulation that exploits object\nself-similarity, leading to a highly compressed and efficient shape latent\nspace. Thanks to the recursive formulation, our method supports spatial and\nglobal-to-local latent feature fusion without needing to initialize and\nmaintain auxiliary data structures, while still allowing for continuous field\nqueries to enable applications such as raytracing. In experiments on a set of\ndiverse datasets, we provide compelling qualitative results and demonstrate\nstate-of-the-art multi-scene reconstruction and compression results with a\nsingle network per dataset.\n","authors":["Sergey Zakharov","Katherine Liu","Adrien Gaidon","Rares Ambrus"],"pdf_url":"https://arxiv.org/pdf/2406.04309v1.pdf","comment":"SIGGRAPH 2024. Project Page:\n  https://zakharos.github.io/projects/refine/"},{"id":"http://arxiv.org/abs/2402.12451v2","updated":"2024-06-06T16:13:43Z","published":"2024-02-19T19:01:01Z","title":"The Revolution of Multimodal Large Language Models: A Survey","summary":"  Connecting text and visual modalities plays an essential role in generative\nintelligence. For this reason, inspired by the success of large language\nmodels, significant research efforts are being devoted to the development of\nMultimodal Large Language Models (MLLMs). These models can seamlessly integrate\nvisual and textual modalities, while providing a dialogue-based interface and\ninstruction-following capabilities. In this paper, we provide a comprehensive\nreview of recent visual-based MLLMs, analyzing their architectural choices,\nmultimodal alignment strategies, and training techniques. We also conduct a\ndetailed analysis of these models across a wide range of tasks, including\nvisual grounding, image generation and editing, visual understanding, and\ndomain-specific applications. Additionally, we compile and describe training\ndatasets and evaluation benchmarks, conducting comparisons among existing\nmodels in terms of performance and computational requirements. Overall, this\nsurvey offers a comprehensive overview of the current state of the art, laying\nthe groundwork for future MLLMs.\n","authors":["Davide Caffagni","Federico Cocchi","Luca Barsellotti","Nicholas Moratelli","Sara Sarto","Lorenzo Baraldi","Lorenzo Baraldi","Marcella Cornia","Rita Cucchiara"],"pdf_url":"https://arxiv.org/pdf/2402.12451v2.pdf","comment":"ACL 2024 (Findings)"},{"id":"http://arxiv.org/abs/2406.03723v1","updated":"2024-06-06T03:37:39Z","published":"2024-06-06T03:37:39Z","title":"Gear-NeRF: Free-Viewpoint Rendering and Tracking with Motion-aware\n  Spatio-Temporal Sampling","summary":"  Extensions of Neural Radiance Fields (NeRFs) to model dynamic scenes have\nenabled their near photo-realistic, free-viewpoint rendering. Although these\nmethods have shown some potential in creating immersive experiences, two\ndrawbacks limit their ubiquity: (i) a significant reduction in reconstruction\nquality when the computing budget is limited, and (ii) a lack of semantic\nunderstanding of the underlying scenes. To address these issues, we introduce\nGear-NeRF, which leverages semantic information from powerful image\nsegmentation models. Our approach presents a principled way for learning a\nspatio-temporal (4D) semantic embedding, based on which we introduce the\nconcept of gears to allow for stratified modeling of dynamic regions of the\nscene based on the extent of their motion. Such differentiation allows us to\nadjust the spatio-temporal sampling resolution for each region in proportion to\nits motion scale, achieving more photo-realistic dynamic novel view synthesis.\nAt the same time, almost for free, our approach enables free-viewpoint tracking\nof objects of interest - a functionality not yet achieved by existing\nNeRF-based methods. Empirical studies validate the effectiveness of our method,\nwhere we achieve state-of-the-art rendering and tracking performance on\nmultiple challenging datasets.\n","authors":["Xinhang Liu","Yu-Wing Tai","Chi-Keung Tang","Pedro Miraldo","Suhas Lohit","Moitreya Chatterjee"],"pdf_url":"https://arxiv.org/pdf/2406.03723v1.pdf","comment":"Paper accepted to IEEE/CVF CVPR 2024 (Spotlight). Work done when XL\n  was an intern at MERL. Project Page Link:\n  https://merl.com/research/highlights/gear-nerf"},{"id":"http://arxiv.org/abs/2406.03720v1","updated":"2024-06-06T03:31:41Z","published":"2024-06-06T03:31:41Z","title":"JIGMARK: A Black-Box Approach for Enhancing Image Watermarks against\n  Diffusion Model Edits","summary":"  In this study, we investigate the vulnerability of image watermarks to\ndiffusion-model-based image editing, a challenge exacerbated by the\ncomputational cost of accessing gradient information and the closed-source\nnature of many diffusion models. To address this issue, we introduce JIGMARK.\nThis first-of-its-kind watermarking technique enhances robustness through\ncontrastive learning with pairs of images, processed and unprocessed by\ndiffusion models, without needing a direct backpropagation of the diffusion\nprocess. Our evaluation reveals that JIGMARK significantly surpasses existing\nwatermarking solutions in resilience to diffusion-model edits, demonstrating a\nTrue Positive Rate more than triple that of leading baselines at a 1% False\nPositive Rate while preserving image quality. At the same time, it consistently\nimproves the robustness against other conventional perturbations (like JPEG,\nblurring, etc.) and malicious watermark attacks over the state-of-the-art,\noften by a large margin. Furthermore, we propose the Human Aligned Variation\n(HAV) score, a new metric that surpasses traditional similarity measures in\nquantifying the number of image derivatives from image editing.\n","authors":["Minzhou Pan","Yi Zeng","Xue Lin","Ning Yu","Cho-Jui Hsieh","Peter Henderson","Ruoxi Jia"],"pdf_url":"https://arxiv.org/pdf/2406.03720v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.03701v1","updated":"2024-06-06T02:50:59Z","published":"2024-06-06T02:50:59Z","title":"Recognizing Everything from All Modalities at Once: Grounded Multimodal\n  Universal Information Extraction","summary":"  In the field of information extraction (IE), tasks across a wide range of\nmodalities and their combinations have been traditionally studied in isolation,\nleaving a gap in deeply recognizing and analyzing cross-modal information. To\naddress this, this work for the first time introduces the concept of grounded\nMultimodal Universal Information Extraction (MUIE), providing a unified task\nframework to analyze any IE tasks over various modalities, along with their\nfine-grained groundings. To tackle MUIE, we tailor a multimodal large language\nmodel (MLLM), Reamo, capable of extracting and grounding information from all\nmodalities, i.e., recognizing everything from all modalities at once. Reamo is\nupdated via varied tuning strategies, equipping it with powerful capabilities\nfor information recognition and fine-grained multimodal grounding. To address\nthe absence of a suitable benchmark for grounded MUIE, we curate a\nhigh-quality, diverse, and challenging test set, which encompasses IE tasks\nacross 9 common modality combinations with the corresponding multimodal\ngroundings. The extensive comparison of Reamo with existing MLLMs integrated\ninto pipeline approaches demonstrates its advantages across all evaluation\ndimensions, establishing a strong benchmark for the follow-up research. Our\nresources are publicly released at https://haofei.vip/MUIE.\n","authors":["Meishan Zhang","Hao Fei","Bin Wang","Shengqiong Wu","Yixin Cao","Fei Li","Min Zhang"],"pdf_url":"https://arxiv.org/pdf/2406.03701v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.14667v2","updated":"2024-06-06T02:10:23Z","published":"2023-12-22T13:03:23Z","title":"Token-Level Contrastive Learning with Modality-Aware Prompting for\n  Multimodal Intent Recognition","summary":"  Multimodal intent recognition aims to leverage diverse modalities such as\nexpressions, body movements and tone of speech to comprehend user's intent,\nconstituting a critical task for understanding human language and behavior in\nreal-world multimodal scenarios. Nevertheless, the majority of existing methods\nignore potential correlations among different modalities and own limitations in\neffectively learning semantic features from nonverbal modalities. In this\npaper, we introduce a token-level contrastive learning method with\nmodality-aware prompting (TCL-MAP) to address the above challenges. To\nestablish an optimal multimodal semantic environment for text modality, we\ndevelop a modality-aware prompting module (MAP), which effectively aligns and\nfuses features from text, video and audio modalities with similarity-based\nmodality alignment and cross-modality attention mechanism. Based on the\nmodality-aware prompt and ground truth labels, the proposed token-level\ncontrastive learning framework (TCL) constructs augmented samples and employs\nNT-Xent loss on the label token. Specifically, TCL capitalizes on the optimal\ntextual semantic insights derived from intent labels to guide the learning\nprocesses of other modalities in return. Extensive experiments show that our\nmethod achieves remarkable improvements compared to state-of-the-art methods.\nAdditionally, ablation analyses demonstrate the superiority of the\nmodality-aware prompt over the handcrafted prompt, which holds substantial\nsignificance for multimodal prompt learning. The codes are released at\nhttps://github.com/thuiar/TCL-MAP.\n","authors":["Qianrui Zhou","Hua Xu","Hao Li","Hanlei Zhang","Xiaohan Zhang","Yifan Wang","Kai Gao"],"pdf_url":"https://arxiv.org/pdf/2312.14667v2.pdf","comment":"Accepted by AAAI 2024 (Main Track, Long Paper)"},{"id":"http://arxiv.org/abs/2309.10740v2","updated":"2024-06-06T01:32:00Z","published":"2023-09-19T16:36:33Z","title":"ConsistencyTTA: Accelerating Diffusion-Based Text-to-Audio Generation\n  with Consistency Distillation","summary":"  Diffusion models are instrumental in text-to-audio (TTA) generation.\nUnfortunately, they suffer from slow inference due to an excessive number of\nqueries to the underlying denoising network per generation. To address this\nbottleneck, we introduce ConsistencyTTA, a framework requiring only a single\nnon-autoregressive network query, thereby accelerating TTA by hundreds of\ntimes. We achieve so by proposing \"CFG-aware latent consistency model,\" which\nadapts consistency generation into a latent space and incorporates\nclassifier-free guidance (CFG) into model training. Moreover, unlike diffusion\nmodels, ConsistencyTTA can be finetuned closed-loop with audio-space text-aware\nmetrics, such as CLAP score, to further enhance the generations. Our objective\nand subjective evaluation on the AudioCaps dataset shows that compared to\ndiffusion-based counterparts, ConsistencyTTA reduces inference computation by\n400x while retaining generation quality and diversity.\n","authors":["Yatong Bai","Trung Dang","Dung Tran","Kazuhito Koishida","Somayeh Sojoudi"],"pdf_url":"https://arxiv.org/pdf/2309.10740v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.07640v2","updated":"2024-06-06T00:26:26Z","published":"2024-02-12T13:27:22Z","title":"CMFeed: A Benchmark Dataset for Controllable Multimodal Feedback\n  Synthesis","summary":"  The Controllable Multimodal Feedback Synthesis (CMFeed) dataset enables the\ngeneration of sentiment-controlled feedback from multimodal inputs. It contains\nimages, text, human comments, comments' metadata and sentiment labels. Existing\ndatasets for related tasks such as multimodal summarization, visual question\nanswering, visual dialogue, and sentiment-aware text generation do not\nincorporate training models using human-generated outputs and their metadata, a\ngap that CMFeed addresses. This capability is critical for developing feedback\nsystems that understand and replicate human-like spontaneous responses. Based\non the CMFeed dataset, we define a novel task of controllable feedback\nsynthesis to generate context-aware feedback aligned with the desired\nsentiment. We propose a benchmark feedback synthesis system comprising encoder,\ndecoder, and controllability modules. It employs transformer and Faster R-CNN\nnetworks to extract features and generate sentiment-specific feedback,\nachieving a sentiment classification accuracy of 77.23%, which is 18.82% higher\nthan models not leveraging the dataset's unique controllability features.\nAdditionally, we incorporate a similarity module for relevance assessment\nthrough rank-based metrics.\n","authors":["Puneet Kumar","Sarthak Malik","Balasubramanian Raman","Xiaobai Li"],"pdf_url":"https://arxiv.org/pdf/2402.07640v2.pdf","comment":null}]}}
\ No newline at end of file
diff --git a/favicon.ico b/favicon.ico
new file mode 100644
index 0000000000000000000000000000000000000000..7f5166c7afa0cda370aafaf91ba8d66cdeff74e5
GIT binary patch
literal 15086
zcmeHO33yaRwyq9g34stc4G>5efuJa|8H50X3WB1d4n)8Z5JVvktB7$yKtv!Kb`lau
zdP!$Nc9B&OSrq|+r=TLJ;K~kR2@vF;|J<9Kba%RAH(}<z`M#I>4!8Q=syhFvI(6#Q
zsX{4}Dx;b;Q+$T2oQ6t8Dy7213w{SH^#k7p^C{m4`z!S>3p8dKR#E*)@?QIEpg)}c
zg{r6iyXi3T|KFt>>TsDWWe%JEG<m`Z;&-3^R0ug1f<^}$q5Q=P?#uie-}>eI;m)t_
z#K0v35xKK9{I2==#4YqlUA%3Xh?MkH#4d|P#d8&Xs_&s!ylR8}jrLpHV^;bsWZSYa
zH!TUx_B8jZuJHA{>W61Pj6tR~6PcNrNKa44y<ax-x_)V}OWlBT0T`Px7TNY}*zI;?
zWn^JY)))ky4lEHv(5WD_*xdquU*8ck(q|ykPW+@)bE-C$rN!lK{RRgcVzzm<##`_U
zhBRv$;y#E&mknJ|`AlUeIZXK`M0_2g$!Ip4kuom@NlD3tvX9Ccg@E$`PIbJ+;%m9!
z<G~o6H5!>2nHqn=Z;*^b+JZFPz5nh)ES%qX=#z&q(>zlfC!?&@YSrplEEcre-mb8`
zuXubZFMZU1X@6u`GYT;qc;sp5g4h(J-Ri$pM?zXcp{_ZWm%P<t#lxR;fx2$gDB*dT
z-+m<>AnhXUA$><WLApjNA(g#c7XMh?0~WKTNN+d>^4n&yX>&2hmV*Ry0tB<smelLF
ztRFHgMKX(=h8&WgVVQw@_BMsOVbpSXo@0Ik=~dE0q{g&!VYIhlK_u#FczLLMOi)ko
z+Z57n(iM_JDs#0Ax^C!-1?B}BcHt@g_Dc3Eh}sc_4%<2)dUZ4gEg6IkTRNc3wK5L1
z6{?)5f*}isz;3of^glU085Y(z<Pg1^YMF|L2O1XD6WkV32U49XP;J{o4SyMGej3!O
zHBg^!hWca^)bbCZX3l{c@B&m*+OSZ{)`NA{lQ<?~=R0ohh!k^5A$GAB-B&+}N~ay>
zDc2*Jr;1N^DfmP%o?7)3+HY@<`rp+@r%jzP%QCA_hD^#Z(KZo(JM=fG&)C8vq&l}j
zJwF&~<0nw3PebMBv;9AHx_+HHM>2k2y$bcquTUQ>g6h^CsupeGV1<^;S|Zt!?1a7Z
z#?J81^LLBW9d_fL={n^r&=JYsxAQ)IUfZ*@bwK9T6E8jvRhMmd?FO}(eVmu4<Ggka
zN-&m0Qby!Ce-0`$i+Xi0!o%ue+*@xUD=Q20&GR)q^1tFcc0&for^wLqi<X-H+iiBl
z#>AjvwHnm*c+SZaI_{G2dio}E$k=@I4b)RlA>{TDjCf@P9^3fXP4&AYX4kyP&|*&u
zL_Z&m!0N<4WeU`#OS-PO!zb88j|}~hyr;2|GQa;071I}ibplcL)3QG6j4NJuzfD_B
zC=*%^D*iPcyDJ`}Kd)TT$K}u=sD1mO_U@%EJplFd&rlaHy4N$2?^n)?N2!-$3x0El
zpcL=UvTj#WH?}W2Bm5luU4G~0LM>eiHE<x*gAYQrYe#$37OG7fouZ<ko_rE&?p&yo
zCyTcs<FJ?VMnpipMqVCl%uPv=##(hlptfu+uJ3~V+i#)z_SJEY8wa&^E!0<ELH+Op
z)Q>+x9X!aoTcDDYxjsAu_3X1y$Bwy|VK(IpqYlX$aVwJaeLK?Nm$>FoH>BT1SA+z+
zU=vI)(6%2wtjj0%MZP(b^zV%uI__S*bQz3sFxr#nAAdmI&v6?@p6=F4Uu9a$c0#M_
zYms0O{AbSSdL;Y>%a9?ohv$p;r=yM;d1*uX{*gzhC!9-CPjph+lrr*t+6=DYwBtv~
zyR_+Lw$R}Ly?yB);gOO8)vups_cUD>9g)5^F#gq3Fz(vLe!d?nI$Cc_+LU_I&i?)M
zIch^KE+zVltlyC|I^G%I@#DH)3(vSXsLPkV$8N|bbwuZ+4Uu2klyA~U=gyHYb#inm
z@gHOTMt<~hZ2FpM@D?7T<1$AF4AA)*-@JVaMzK}WhO}jjts%pUoNtelKmDVdPWxH2
zUPby@A3Nh09x~3tJ0qiLUVDpO%84zIy3&TL?uk4TCquO+|J<8Kuls0W!BE?_7q^=R
zR>LM4G8ykZJsq(+)^#i|_{CpsPVA>jf&X*X4WnPYb(?4W24BG<cICna&vQY(Nf}SL
zjkW~;^jT0XN{Rn#!5;v%dTnw2F?h*S?8%NDFg0ri)4X}b^+C$8-{qT>k+NDH$2J`E
zf`8mZuHWQ;fpoJ;{E)k7hv&^N8NXnQkB3QdfAN5etrc7{H)-GHn^uNpi|M>0e#!TL
zUf<(*vuE`rpX~9(?-?@G**>`PqAzOd-d)F5zrMZ>JL<XQxi*{lO~cQ!n*ZwBj{Fu`
zPo2`&;BhzI^F<PkXB({|^>y9R#yRq)UYk01*0I&Dt@{+N_~~bu_)WvlvL5G&ri-7^
z->MF^<`&@J!8Sr^LszWytV3LjOg($**cvs`*CSW_T%%0<rr~E<k87UPdvlyVcO_;~
z+9dH1Sx0(2qxYG+F<_sW-hEfc_BefP@ueNdN9ypMiQhE*EbDgu3icyMpqel?^>(R|
z7fJph+p5D@i1_zn8yvAoUifk!XnKY+uH-m5_PtS7-tn7OM)r*E%1GHa#zNgmoALcE
z#4q!>Kk2^KMLx2D%Xms3%l^vv?dd6HJdB}Qx1PEh0yX;<V&^EBZkW(`MPt+n<=pJk
z$Nm1v6JBrL%bXQIqD4!nXNbKtHor(~w4ChSo9}9Y>DK9ZoqOLJHETi*8l?M+!q*#o
zC6zI-cj$nK1`W|ZyFL7`-F)mM@LV85j)p*Dx<aiO|CwQ)fvke~&&#7;MztE%p`Pxc
z@5_b1JAEh#vwRc%mwMtGHgAQ-4I3e0Y64QNsd#8pCrCWeS1k%U8HDE-4??0j5ic%z
z5pj#+FxN7-F#aYn#3{BZ2+au9eZ#wGCm74t_Tw_IJ<Ul&7Qrj>N;%mZkENXqy&csb
zsD_E}O+<E9w#I9oc^*1#?1ZwHxDWO+1)dGWphbhU7_RhnmW@QbM*LHr(=5}_WM4tt
zUG@eeNOc&?l>J!}{T|s1i|rJAB99{pc8M?U{DQvC+q9AQaBsmro~7V_${$@ebz$tS
zD0VAxxY*^fnl6;TP##r}S4F+U^$_|)D9T<bF4y0;bcEGh6t6X6&+-)P5}%CR$#vK<
zu8~jt5(j;O)PPv(*MoYP``Tfnpb`=pZ%ARhgmS&Zc-NpIP*EMYu5$j|OZzyEbq<jV
z_UIA=e{$`UFk8$TKds-ac1*QS#j9DbVzO;AlFUh%xnL$*?&3UmN%wDSgOs!sP0t;%
zOJ2qyJ}Vv-&a<!GCGoM2knE(Pq%NeEB#D_fB}Fpdo%A~CBhnuvH?f5ZxfKwzJVuN4
z8yG|m;jhg<=H)D}&q~@r7r##8rIDYv#=F*cwQ~X~FXuF7#?mF9Xh}q!AM3b7@BB`1
z$$BAim*XVy0nU?Tth`8-PgTZ07yc7=ThaN0A&We-GqN%4gW>LDq~u-ur`m{J5sTMs
zuVdouiO8@OoM~_@pI-BHF}c0L<ZGE)Kh@IW^qLM4vw`1YwP1Go9Q0fnjgroS#dcLV
zS3x^}O=tfba*Ga?&n@poFZznur9Atw+sEBBJutA_*?)bP_eHHAYat+)dnzwdrPGxV
zZ;ywZIc1nL(EG#QJQw!eE_=hvwM-M9FD5TY)^u0&_kHvG2t7i(?PHR2*daV~?M~ZJ
z+3_sL$98<>>wncBZML_?6w4IMOrMFU?NJb4z)d@OeCL^Ns63wI<ni3ebD5Irm*N}m
z5ae+_=7k*!&!fJ3;xDIy|Fs0|&*?_c|Nr<8^nlUta!)Iq)R9zpPm6cCiv2OtNj>}D
zBv}ESDELfpe#mbj`F_{^oZh>Z^W}G7ZeV`Y?soZMN5fs)bg~^4FI2?vWy3K&V>+50
zU`*>4B=PI|ujbte-Xj>la6GD=q@VAS<lMgAv3lsbsVma0>BzQb<MA!rv)_VL`<L37
zY3Cgh+Iv~|2d)~3M&C1L!MjY-e$d$<L~o47+>E(Mou3Mu)rQ&jISBtLywuzSE(YL*
zRWHzQDO&Hv*NR_EzfbO-(Ql3ZI2xH2{XUVbt8zbPa`t3YJ<0PO*Cc+F#GZd3NgVut
zNOB${@er3J{oZq9PuOfW&DRq@LnzyljUgWmDclQ1?vKPg+dRz&)b3@`ACx*V>tj&<
zQh{G_jjc<}=nbyZ(YJA*)|_Wd4{~4Ik$L+3y{kb@W`7DE$|U3Y$hJq3Zlm8!pKa`-
zv1q-oHJX3j0-ZkZ<kYuvPa<k}6#A|0kH@w?hDkY-bi6hzdTj22vIn>?7)E-Ue)=q)
z1McS8?eE9wOY)5BEYBN$`Hivk&!HwUHvc$vb{y}ht!;-idz!|3=!&7Jc7pgyNTLgZ
zL&}654a0<m6Od!iLHrx9pmN6usKjaXuTSWEpX&vCQ`!3%B=3ZNJj%1^cr=M>gZHLP
z#fT3_pz0|%<5&U~4Z|;Ccy3ZZ)RDU<D_7}zFX3hEaUJ(yk{8~_H;JRC&PQmwrl|Hs
z@LzE9&?}5F`#;8UUd-5(W7~2kLt+}smT`}}fP7M+va`AW+gTKcGx}uu4BkPZ{>XWm
zuW1r%$T_63N0z;(oDoczz`f8=o@319zR0eVoPCet-frwzJsvA%1%sSn_Upy_AU<-J
z`Se6X?t8y0>UX)zFl-nU@1AMxy7s@^n~*a*<Lp%UOTTUSl=Fn+?@R1Q@D7Wk43tCo
z@{Xj&Th7U)O+ycizLBTAld+LI9HU1=rKRb0&*yS(c7fXNLm!Oc9JjD<MMjCs$vNcI
zsegUGA=tKU(=kXL?Op}j7(bqI1TXoEo#`J0wP|aS-U*Hk8}zu4jIE>Gj(PbecRHl3
z)RAxU<Q_x4rqAuaL?L$c*&5X6Jw^Pb9XXekc_sbqssH7@l##LqF5$n3b7e&F*oIpy
zSNXOt=j_(cocT+8BA?7V(KF#=@Q}Q`lW(s5BmB8HRx+mErs2ezv$&`a4L%%CEmq#7
z3Gc$t<{Ev$yovARTd`QL@SjZGndUWj+7y<Jf7MwQ;l((UDKZkKTD5rh@jLyy$jiLE
zlW$rq%5i)ya$ffP2h_(t@IT11-J8?i{Njv3&I;r_ry*^F*djxI=H;Dyb2?*?`LOFt
zu3NPk3-p1%#Ii<lyyU)xv|;qW$VwYvdgBde9g}k!c_;S4as2bB7b9r1X8ORpKu7$(
zQYhDnXFl7hhcbo+2GMPq*T!$-DeoNhUT}Xv9qLJ4_VK(&_OtEww!`zo2Oy|3ZI|3%
za>QWpboZ1o5#BsAxUFp)gfC+3&x=I=7_IiV()^N6pLIfV24e(_kM$kW2W1{+TyObG
z%18SuI2`rMz##ABo2)s!CwhC^NWA&#Ye>jRK*XU4w{bZGSA|Oz&~HsYEykEkKg?pY
zXufJvMiN@@Z_T@=EZKu=$l!rI`}>%4?++b|p?{Y+CP#nP?M%$mP|pP*d{r3U&v{>q
zi@lfm9`4_JKPsK8gz6`1fO`u_9Kqm!&w+bjW;{vaQU+P+dv)2-r6{&=nx(CzzLj}D
zrv-UL^G+<+sG)<QVxQt2G}?qg0qiH1DOm5D9&CzPcK$yB+Mfls1$Zat9VBKYVuo!7
z?*67A-b@r#&Q!s3s|RR)zn$xj+)EZeRs2N*W8qU`h~JYol9rH`k+zUz&6c&pF>xM6
z?TJFFEmf0C{C}YwOAfki>*iPz<?_mD{9;zdV2*i?hI#mk5h$DIrQLRFtq6t0^LlOQ
zrTL-a+nY1Y7_)ecll(=c={GYDHek&FWSg@wa@k0fyIjsg%Z{~5&dXkBo^_rUZyUXI
zH2=RyV2S!z^LR~k-q9Jgexsee1xfY-G7c}TcnS4?s_)ujB6<AViF@fp#@Q1YA9vy<
z@q%J8f&btOiBm{eBAiqhr{MoJEA~f=#(r8?`9kG&)z4Pf^77jw-B9Zq9BZ(y=INS7
z83g49q1V=4h}|3uzduX%hpL{higC-wVc@2L2tFT-`oGr4gk=+;#cQ|q*7ZmBqGw1)
z3P|qFGnKto7Jat$(R4k<o`MJVJ%Cy#YawoZ940NBgeglWBm8UH<;9`&ZE4dMPp9ol
zWWF!wa|j;cnNh;x1hhTa7I*x52dZ4CqP6Rv?~kgvRndQ8ESkJE66(ucw1c#}vd&#5
zT_o+;j#jS^N0Tq_bk#HcH*JLff|f9S`@Md@ZQVNk{C&w1-UnJu`%Qm&AJ^=xJ7~{!
zd7S<|Xd>QOx^KY$zohMj<o=SZe{#Q&cg;0hoH?F;vr|o-|2o?7-8=PjTDjvc>$PFW
zCBJEte(!S2disF0r>^Ov+jX9@#tC1!$1G3B{B^EFJGawD(vNmcn=A5eJ=^~ChPFSD
z`yJZd2HtPb^0MEMZ)+A3XVDr_*beuF%KQ@h`>O7b$(~E@NRv#Gm$SfJ`dkb8=(^#^
zpZemTUj}!jMxoMTuBTTxc8<P;=0jXXV5S|FR#8reRN+bmJi7Z)#BGS<8uA_@lG<}$
z_cr%v->|>VvOmu-_V5+=E%E4@&<i714M*KS>Z01YYUJsU+fLnv2}>u?uI6Cm+L8KM
z<F}jxOX_l6iz2n5?$u^rm{9XfO}x4CP0Y=niz$nyBH}35Z>Ac2-+N1Mj9EDb0eKE%
zBWGqVy3+V)V<crI70w@)*dtUWx5}P~pCa~f{%OCB_pZ-!SIg~bzrVvCV$sVLGYG!D
F_!mjUQ#=3w

literal 0
HcmV?d00001

diff --git a/index.css b/index.css
new file mode 100644
index 00000000..9ded9d94
--- /dev/null
+++ b/index.css
@@ -0,0 +1,355 @@
+:root {
+    /* Palette: Nord (https://www.nordtheme.com)*/
+    --nord00: #2e3440;
+    --nord01: #3b4252;
+    --nord02: #434c5e;
+    --nord03: #4c566a;
+    --nord04: #d8dee9;
+    --nord05: #e5e9f0;
+    --nord06: #eceff4;
+    --nord07: #8fbcbb;
+    --nord08: #88c0d0;
+    --nord09: #81a1c1;
+    --nord0A: #5e81ac;
+    --nord0B: #bf616a;
+    --nord0C: #d08770;
+    --nord0D: #ebcb8b;
+    --nord0E: #a3be8c;
+    --nord0F: #b48ead;
+
+
+    /* Typograph */
+    --font-family-default: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, Oxygen-Sans, Ubuntu, Cantarell, "Helvetica Neue",
+    sans-serif;
+    --font-size-scaler: 62.5%;
+    --font-size-m: 1.6rem;
+    --font-size-s: 1.4rem;
+
+    /* Components */
+    --body-color: var(--nord06);
+    --body-bg: var(--nord00);
+
+    --header-title: var(--nord06);
+    --header-container: var(--nord00);
+    --header-title-preffix: var(--nord0F);
+
+    --chip-font: var(--nord08);
+    --chip-color: var(--nord0B);
+
+    --icons: var(--nord06);
+    --icons-hover: var(--nord0F);
+
+    --day-container: var(--nord01);
+    --date: var(--nord09);
+
+    --summary: var(--nord0E);
+    --summary-hover: var(--nord0F);
+
+    --details-open: var(--nord02);
+    --details-content: var(--nord05);
+    --details-a: var(--nord07);
+    --details-a-hover: var(--nord0F);
+
+    --highlight-title: var(--nord0B);
+    --highlight-author: var(--nord0B);
+
+    --article-summary-hover-color: var(--nord0D);
+    --article-summary-color: var(--nord04);
+
+    --article-title-color: var(--nord05);
+    --article-title-hover-color: var(--nord0E);
+
+    --accordion-content-rail-color: var(--nord01);
+    --accordion-content-hover-rail-color: var(--nord0D);
+    --accordion-title-marker-color: var(--nord01);
+    --accordion-title-hover-marker-color: var(--nord0E);
+
+    --footer-color: var(--nord04);
+    --footer-link-hover-color: var(--nord0D);
+}
+
+[data-theme="light"] {
+    /* Theme design */
+
+    --color-primary: var(--nord07);
+    --color-primary-second: var(--nord00);
+    --color-info: var(--nord0A);
+    --color-success: var(--nord0E);
+    --color-warning: var(--nord0C);
+    --color-danger: var(--nord0B);
+
+    --color-text: var(--nord00);
+    --color-hover: var(--nord0D);
+    --color-shadow: var(--nord03);
+
+    --color-primary-h: var(--nord09);
+    --color-primary-s: var(--nord08);
+    --color-primary-l: var(--nord07);
+
+    --color-contrast-higher-h: var(--nord01);
+    --color-contrast-higher-l: var(--nord02);
+    --color-contrast-higher-s: var(--nord03);
+
+    --color-content: white;
+
+    --background: var(--nord06);
+    --background-content: var(--nord05);
+    --background-color: var(--nord04);
+
+    /* Components */
+
+    --chip-font: var(--nord06);
+    --chip-color: var(--nord09);
+
+    --body-color: var(--background-color);
+    --body-bg: var(--background);
+
+    --header-title: var(--color-shadow);
+    --header-container: var(--background);
+    --header-title-preffix: var(--color-primary-h);
+
+    --icons: var(--color-shadow);
+    --icons-hover: var(--color-hover);
+
+    --day-container: var(--background-content);
+    --date: var(--color-primary-l);
+
+    --summary: var(--color-info);
+    --summary-hover: var(--color-success);
+
+    --details-open: var(--color-content);
+    --details-content: var(--color-text);
+    --details-a: var(--color-primary-h);
+    --details-a-hover: var(--color-hover);
+
+    --highlight-title: var(--color-danger);
+    --highlight-author: var(--color-warning);
+
+    --article-summary-color: var(--color-text);
+    --article-summary-hover-color: var(--color-primary-s);
+
+    --article-title-color: var(--color-primary);
+    --article-title-hover-color: var(--color-success);
+
+    --accordion-content-rail-color: var(--color-warning);
+    --accordion-content-hover-rail-color: var(--color-warning);
+    --accordion-title-marker-color: var(--color-success);
+    --accordion-title-hover-marker-color: var(--color-success);
+
+    --footer-color: var(--color-text);
+    --footer-link-hover-color: var(--color-hover);
+}
+
+html {
+    font-size: var(--font-size-scaler);
+}
+
+body {
+    background-color: var(--body-bg);
+    font-family: var(--font-family-default);
+    color: var(--body-color);
+    margin: 0;
+    padding-top: 16px;
+    display: grid;
+}
+
+.header-container {
+    width: 90%;
+    max-width: 1200px;
+    background: var(--header-container);
+    margin: 0 auto;
+}
+
+.header-title {
+    font-size: 32px;
+    font-weight: bold;
+    color: var(--header-title);
+    margin: 0;
+    padding-bottom: 14px;
+}
+
+.header-title-preffix {
+    color: var(--header-title-preffix);
+}
+
+.icons {
+    color: var(--icons);
+    padding-bottom: 16px;
+}
+
+.icons a {
+    color: var(--icons);
+    text-decoration: none;
+}
+
+.icons a:hover {
+    color: var(--icons-hover);
+}
+
+.day-container {
+    padding: 16px 16px 16px 16px;
+    background: var(--day-container);
+    width: 90%;
+    max-width: 1200px;
+    margin: 0 auto;
+    margin-bottom: 8px;
+    border-radius: 10px;
+}
+
+.date {
+    font-size: 24px;
+    font-weight: 700;
+    margin: 0;
+    color: var(--date);
+}
+
+p {
+    margin: 0;
+}
+
+summary {
+    font-weight: 600;
+    color: var(--summary);
+}
+
+summary:hover {
+    text-decoration: underline;
+    cursor: pointer;
+    color: var(--summary-hover);
+}
+
+details {
+    --border-color: transparent;
+
+    padding: 2px 4px;
+    font-size: 20px;
+    border: 1px solid var(--border-color);
+    border-radius: 4px;
+}
+
+details[open] {
+    background-color: var(--details-open);
+    margin-bottom: 8px;
+}
+
+.details-content {
+    padding: 12px 3px;
+    gap: 16px;
+    color: var(--details-content);
+}
+
+details a {
+    color: var(--details-a);
+}
+
+details a:hover {
+    color: var(--details-a-hover);
+}
+
+footer {
+    margin: 0 auto;
+    color: var(--footer-color);
+    font-size: var(--font-size-s);
+    display: flex;
+    padding: 0 16px;
+    justify-content: space-between;
+}
+
+.description {
+    margin: 0 auto;
+    color: var(--footer-color);
+    font-size: var(--font-size-s);
+    display: flex;
+    padding: 0 16px;
+    text-align: center;
+}
+
+.highlight-author {
+    color: var(--highlight-author);
+    font-weight: bold;
+}
+
+.highlight-title {
+    color: var(--highlight-title);
+    font-weight: bold;
+}
+
+.channel-description {
+    text-align: center;
+    font-size: var(--font-size-scaler);
+}
+
+.article-summary-link {
+    color: var(--article-summary-color);
+    font-size: var(--font-size-s);
+    text-decoration: none;
+}
+
+.article-summary-link:hover {
+    color: var(--article-summary-hover-color);
+    --accordion-content-rail-color: var(--accordion-content-hover-rail-color);
+}
+
+.article-summary-box-outer {
+    display: block;
+    padding: 4px 8px 8px 4px;
+}
+
+.article-summary-box-inner {
+    padding-left: 8px;
+    border-left: 1px solid var(--accordion-content-rail-color);
+    font-size: var(--font-size-m);
+}
+
+.article-expander {
+    padding: 10px 4px;
+    border-radius: 4px;
+}
+
+.article-authors {
+    font-size: var(--font-size-m);
+    padding: 0.25em 1em;
+}
+
+.article-authors a {
+    text-decoration: none;
+}
+
+.article-expander-title {
+    font-size: var(--font-size-m);
+    font-weight: 600;
+}
+
+.article-expander-title:hover {
+    cursor: pointer;
+}
+
+.article-expander-title::marker {
+    color: var(--accordion-title-marker-color);
+}
+
+.article-expander-title:hover::marker {
+    color: var(--accordion-title-hover-marker-color);
+}
+
+/* for switcher */
+.theme-switch {
+    display: inline-block;
+    position: relative;
+}
+
+.theme-switch input {
+    display: none;
+}
+
+/* chip */
+.chip {
+    font-size: 90%;
+    align-items: center;
+    color: var(--chip-font);
+    background: var(--chip-color);
+    border-radius: 5rem;
+    display: inline-flex;
+    padding: .2rem .4rem;
+    vertical-align: middle;
+}
\ No newline at end of file
diff --git a/index.html b/index.html
new file mode 100644
index 00000000..357e4fff
--- /dev/null
+++ b/index.html
@@ -0,0 +1,105183 @@
+<!DOCTYPE html>
+<html lang="en">
+
+<head>
+    <title>MyArxiv</title>
+    <meta charset="utf-8"/>
+    <meta http-equiv="X-UA-Compatible" content="IE=edge"/>
+    <meta name="robots" content="noindex, nofollow"/>
+    <meta name="viewport" content="width=device-width, initial-scale=1"/>
+    <link rel="shortcut icon" type="image/x-icon" href="favicon.ico"/>
+    <link href="index.css" rel="stylesheet"/>
+    <link href="https://cdn.jsdelivr.net/npm/remixicon@2.5.0/fonts/remixicon.css" rel="stylesheet">
+    <link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/katex@0.15.1/dist/katex.min.css"
+          integrity="sha384-R4558gYOUz8mP9YWpZJjofhk+zx0AS11p36HnD2ZKj/6JR5z27gSSULCNHIRReVs" crossorigin="anonymous">
+    <link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/katex@0.15.1/dist/katex.min.css"
+          integrity="sha384-R4558gYOUz8mP9YWpZJjofhk+zx0AS11p36HnD2ZKj/6JR5z27gSSULCNHIRReVs" crossorigin="anonymous">
+    <script defer src="https://cdn.jsdelivr.net/npm/katex@0.15.1/dist/katex.min.js"
+            integrity="sha384-z1fJDqw8ZApjGO3/unPWUPsIymfsJmyrDVWC8Tv/a1HeOtGmkwNd/7xUS0Xcnvsx"
+            crossorigin="anonymous"></script>
+    <script defer src="https://cdn.jsdelivr.net/npm/katex@0.15.1/dist/contrib/auto-render.min.js"
+            integrity="sha384-+XBljXPPiv+OzfbB3cVmLHf4hdUFHlWNZN5spNQ7rmHTXpd7WvJum6fIACpNNfIR"
+            crossorigin="anonymous"></script>
+    <script>
+        document.addEventListener("DOMContentLoaded", function () {
+            renderMathInElement(document.body, {
+                // customised options
+                // • auto-render specific keys, e.g.:
+                delimiters: [
+                    {left: '$$', right: '$$', display: true},
+                    {left: '$', right: '$', display: false},
+                    {left: '\\(', right: '\\)', display: false},
+                    {left: '\\[', right: '\\]', display: true},
+                    {left: "\\begin{equation}", right: "\\end{equation}", display: true},
+                    {left: "\\begin{align}", right: "\\end{align}", display: true},
+                    {left: "\\begin{alignat}", right: "\\end{alignat}", display: true},
+                    {left: "\\begin{gather}", right: "\\end{gather}", display: true},
+                    {left: "\\begin{CD}", right: "\\end{CD}", display: true},
+                ],
+                // • rendering keys, e.g.:
+                throwOnError: false
+            });
+        });
+    </script>
+</head>
+
+<body>
+<section class="header-container">
+    <div style="display:flex; justify-content:space-between; align-items:flex-end;">
+        <div>
+            <div class="header-title">
+                MyArxiv
+            </div>
+        </div>
+
+        <div class=icons>
+            <label class="theme-switch" for="checkbox">
+                <input type="checkbox" id="checkbox"/>
+                <i id="theme-icon" class="ri-moon-line" style="font-size: 32px" rel="noopener noreferrer"></i>
+            </label>
+        </div>
+    </div>
+</section>
+
+    <section class="day-container">
+        <div class="date">
+            <time datetime="2024-06-06T00:00:00Z">2024-06-06</time>
+        </div>
+            <article>
+                <details>
+                    <Summary>
+                        Computation and Language <span class="chip" style="font-size: 60%">150</span>
+                    </Summary>
+                    <div class="details-content">
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Verbalized Machine Learning: Revisiting Machine Learning with Language
+  Models 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.04344v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.04344v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Tim Z. Xiao, Robert Bamler, Bernhard Schölkopf, Weiyang Liu
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Motivated by the large progress made by large language models (LLMs), we
+introduce the framework of verbalized machine learning (VML). In contrast to
+conventional machine learning models that are typically optimized over a
+continuous parameter space, VML constrains the parameter space to be
+human-interpretable natural language. Such a constraint leads to a new
+perspective of function approximation, where an LLM with a text prompt can be
+viewed as a function parameterized by the text prompt. Guided by this
+perspective, we revisit classical machine learning problems, such as regression
+and classification, and find that these problems can be solved by an
+LLM-parameterized learner and optimizer. The major advantages of VML include
+(1) easy encoding of inductive bias: prior knowledge about the problem and
+hypothesis class can be encoded in natural language and fed into the
+LLM-parameterized learner; (2) automatic model class selection: the optimizer
+can automatically select a concrete model class based on data and verbalized
+prior knowledge, and it can update the model class during training; and (3)
+interpretable learner updates: the LLM-parameterized optimizer can provide
+explanations for why each learner update is performed. We conduct several
+studies to empirically evaluate the effectiveness of VML, and hope that VML can
+serve as a stepping stone to stronger interpretability and trustworthiness in
+ML.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Technical Report v1 (92 pages, 15 figures)</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ PaCE: Parsimonious Concept Engineering for Large Language Models 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.04331v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.04331v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Jinqi Luo, Tianjiao Ding, Kwan Ho Ryan Chan, Darshan Thaker, Aditya Chattopadhyay, Chris Callison-Burch, René Vidal
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Large Language Models (LLMs) are being used for a wide variety of tasks.
+While they are capable of generating human-like responses, they can also
+produce undesirable output including potentially harmful information, racist or
+sexist language, and hallucinations. Alignment methods are designed to reduce
+such undesirable output, via techniques such as fine-tuning, prompt
+engineering, and representation engineering. However, existing methods face
+several challenges: some require costly fine-tuning for every alignment task;
+some do not adequately remove undesirable concepts, failing alignment; some
+remove benign concepts, lowering the linguistic capabilities of LLMs. To
+address these issues, we propose Parsimonious Concept Engineering (PaCE), a
+novel activation engineering framework for alignment. First, to sufficiently
+model the concepts, we construct a large-scale concept dictionary in the
+activation space, in which each atom corresponds to a semantic concept. Then,
+given any alignment task, we instruct a concept partitioner to efficiently
+annotate the concepts as benign or undesirable. Finally, at inference time, we
+decompose the LLM activations along the concept dictionary via sparse coding,
+to accurately represent the activation as a linear combination of the benign
+and undesirable components. By removing the latter ones from the activation, we
+reorient the behavior of LLMs towards alignment goals. We conduct experiments
+on tasks such as response detoxification, faithfulness enhancement, and
+sentiment revising, and show that PaCE achieves state-of-the-art alignment
+performance while maintaining linguistic capabilities.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>26 pages, 17 figures, 5 tables, dataset and code at
+  https://github.com/peterljq/Parsimonious-Concept-Engineering</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Improving Alignment and Robustness with Short Circuiting 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.04313v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.04313v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Andy Zou, Long Phan, Justin Wang, Derek Duenas, Maxwell Lin, Maksym Andriushchenko, Rowan Wang, Zico Kolter, Matt Fredrikson, Dan Hendrycks
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  AI systems can take harmful actions and are highly vulnerable to adversarial
+attacks. We present an approach, inspired by recent advances in representation
+engineering, that "short-circuits" models as they respond with harmful outputs.
+Existing techniques aimed at improving alignment, such as refusal training, are
+often bypassed. Techniques such as adversarial training try to plug these holes
+by countering specific attacks. As an alternative to refusal training and
+adversarial training, short-circuiting directly controls the representations
+that are responsible for harmful outputs in the first place. Our technique can
+be applied to both text-only and multimodal language models to prevent the
+generation of harmful outputs without sacrificing utility -- even in the
+presence of powerful unseen attacks. Notably, while adversarial robustness in
+standalone image recognition remains an open challenge, short-circuiting allows
+the larger multimodal system to reliably withstand image "hijacks" that aim to
+produce harmful content. Finally, we extend our approach to AI agents,
+demonstrating considerable reductions in the rate of harmful actions when they
+are under attack. Our approach represents a significant step forward in the
+development of reliable safeguards to harmful behavior and adversarial attacks.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Measuring and Addressing Indexical Bias in Information Retrieval <span class="chip">ACL 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.04298v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.04298v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Caleb Ziems, William Held, Jane Dwivedi-Yu, Diyi Yang
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Information Retrieval (IR) systems are designed to deliver relevant content,
+but traditional systems may not optimize rankings for fairness, neutrality, or
+the balance of ideas. Consequently, IR can often introduce indexical biases, or
+biases in the positional order of documents. Although indexical bias can
+demonstrably affect people's opinion, voting patterns, and other behaviors,
+these issues remain understudied as the field lacks reliable metrics and
+procedures for automatically measuring indexical bias. Towards this end, we
+introduce the PAIR framework, which supports automatic bias audits for ranked
+documents or entire IR systems. After introducing DUO, the first
+general-purpose automatic bias metric, we run an extensive evaluation of 8 IR
+systems on a new corpus of 32k synthetic and 4.7k natural documents, with 4k
+queries spanning 1.4k controversial issue topics. A human behavioral study
+validates our approach, showing that our bias metric can help predict when and
+how indexical bias will shift a reader's opinion.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>ACL 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ VISTA: Visualized Text Embedding For Universal Multi-Modal Retrieval <span class="chip">ACL 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.04292v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.04292v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Junjie Zhou, Zheng Liu, Shitao Xiao, Bo Zhao, Yongping Xiong
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Multi-modal retrieval becomes increasingly popular in practice. However, the
+existing retrievers are mostly text-oriented, which lack the capability to
+process visual information. Despite the presence of vision-language models like
+CLIP, the current methods are severely limited in representing the text-only
+and image-only data. In this work, we present a new embedding model VISTA for
+universal multi-modal retrieval. Our work brings forth threefold technical
+contributions. Firstly, we introduce a flexible architecture which extends a
+powerful text encoder with the image understanding capability by introducing
+visual token embeddings. Secondly, we develop two data generation strategies,
+which bring high-quality composed image-text to facilitate the training of the
+embedding model. Thirdly, we introduce a multi-stage training algorithm, which
+first aligns the visual token embedding with the text encoder using massive
+weakly labeled data, and then develops multi-modal representation capability
+using the generated composed image-text data. In our experiments, VISTA
+achieves superior performances across a variety of multi-modal retrieval tasks
+in both zero-shot and supervised settings. Our model, data, and source code are
+available at https://github.com/FlagOpen/FlagEmbedding.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted to ACL 2024 main conference</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ What Languages are Easy to Language-Model? A Perspective from Learning
+  Probabilistic Regular Languages <span class="chip">ACL 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.04289v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.04289v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Nadav Borenstein, Anej Svete, Robin Chan, Josef Valvoda, Franz Nowak, Isabelle Augenstein, Eleanor Chodroff, Ryan Cotterell
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  What can large language models learn? By definition, language models (LM) are
+distributions over strings. Therefore, an intuitive way of addressing the above
+question is to formalize it as a matter of learnability of classes of
+distributions over strings. While prior work in this direction focused on
+assessing the theoretical limits, in contrast, we seek to understand the
+empirical learnability. Unlike prior empirical work, we evaluate neural LMs on
+their home turf-learning probabilistic languages-rather than as classifiers of
+formal languages. In particular, we investigate the learnability of regular LMs
+(RLMs) by RNN and Transformer LMs. We empirically test the learnability of RLMs
+as a function of various complexity parameters of the RLM and the hidden state
+size of the neural LM. We find that the RLM rank, which corresponds to the size
+of linear space spanned by the logits of its conditional distributions, and the
+expected length of sampled strings are strong and significant predictors of
+learnability for both RNNs and Transformers. Several other predictors also
+reach significance, but with differing patterns between RNNs and Transformers.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted to ACL 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ ABEX: Data Augmentation for Low-Resource NLU via Expanding Abstract
+  Descriptions <span class="chip">ACL 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.04286v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.04286v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Sreyan Ghosh, Utkarsh Tyagi, Sonal Kumar, C. K. Evuru, S Ramaneswaran, S Sakshi, Dinesh Manocha
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  We present ABEX, a novel and effective generative data augmentation
+methodology for low-resource Natural Language Understanding (NLU) tasks. ABEX
+is based on ABstract-and-EXpand, a novel paradigm for generating diverse forms
+of an input document -- we first convert a document into its concise, abstract
+description and then generate new documents based on expanding the resultant
+abstraction. To learn the task of expanding abstract descriptions, we first
+train BART on a large-scale synthetic dataset with abstract-document pairs.
+Next, to generate abstract descriptions for a document, we propose a simple,
+controllable, and training-free method based on editing AMR graphs. ABEX brings
+the best of both worlds: by expanding from abstract representations, it
+preserves the original semantic properties of the documents, like style and
+meaning, thereby maintaining alignment with the original label and data
+distribution. At the same time, the fundamental process of elaborating on
+abstract descriptions facilitates diverse generations. We demonstrate the
+effectiveness of ABEX on 4 NLU tasks spanning 12 datasets and 4 low-resource
+settings. ABEX outperforms all our baselines qualitatively with improvements of
+0.04% - 38.8%. Qualitatively, ABEX outperforms all prior methods from
+literature in terms of context and length diversity.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>ACL 2024 Main Conference. Code and data:
+  https://github.com/Sreyan88/ABEX</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Characterizing Similarities and Divergences in Conversational Tones in
+  Humans and LLMs by Sampling with People <span class="chip">ACL 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.04278v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.04278v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Dun-Ming Huang, Pol Van Rijn, Ilia Sucholutsky, Raja Marjieh, Nori Jacoby
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Conversational tones -- the manners and attitudes in which speakers
+communicate -- are essential to effective communication. Amidst the increasing
+popularization of Large Language Models (LLMs) over recent years, it becomes
+necessary to characterize the divergences in their conversational tones
+relative to humans. However, existing investigations of conversational
+modalities rely on pre-existing taxonomies or text corpora, which suffer from
+experimenter bias and may not be representative of real-world distributions for
+the studies' psycholinguistic domains. Inspired by methods from cognitive
+science, we propose an iterative method for simultaneously eliciting
+conversational tones and sentences, where participants alternate between two
+tasks: (1) one participant identifies the tone of a given sentence and (2) a
+different participant generates a sentence based on that tone. We run 100
+iterations of this process with human participants and GPT-4, then obtain a
+dataset of sentences and frequent conversational tones. In an additional
+experiment, humans and GPT-4 annotated all sentences with all tones. With data
+from 1,339 human participants, 33,370 human judgments, and 29,900 GPT-4
+queries, we show how our approach can be used to create an interpretable
+geometric representation of relations between conversational tones in humans
+and GPT-4. This work demonstrates how combining ideas from machine learning and
+cognitive science can address challenges in human-computer interactions.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted to Main Conference at ACL 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Self-Play with Adversarial Critic: Provable and Scalable Offline
+  Alignment for Language Models 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.04274v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.04274v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Xiang Ji, Sanjeev Kulkarni, Mengdi Wang, Tengyang Xie
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  This work studies the challenge of aligning large language models (LLMs) with
+offline preference data. We focus on alignment by Reinforcement Learning from
+Human Feedback (RLHF) in particular. While popular preference optimization
+methods exhibit good empirical performance in practice, they are not
+theoretically guaranteed to converge to the optimal policy and can provably
+fail when the data coverage is sparse by classical offline reinforcement
+learning (RL) results. On the other hand, a recent line of work has focused on
+theoretically motivated preference optimization methods with provable
+guarantees, but these are not computationally efficient for large-scale
+applications like LLM alignment. To bridge this gap, we propose SPAC, a new
+offline preference optimization method with self-play, inspired by the
+on-average pessimism technique from the offline RL literature, to be the first
+provable and scalable approach to LLM alignment. We both provide theoretical
+analysis for its convergence under single-policy concentrability for the
+general function approximation setting and demonstrate its competitive
+empirical performance for LLM alignment on a 7B Mistral model with Open LLM
+Leaderboard evaluations.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Buffer of Thoughts: Thought-Augmented Reasoning with Large Language
+  Models 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.04271v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.04271v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Ling Yang, Zhaochen Yu, Tianjun Zhang, Shiyi Cao, Minkai Xu, Wentao Zhang, Joseph E. Gonzalez, Bin Cui
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  We introduce Buffer of Thoughts (BoT), a novel and versatile
+thought-augmented reasoning approach for enhancing accuracy, efficiency and
+robustness of large language models (LLMs). Specifically, we propose
+meta-buffer to store a series of informative high-level thoughts, namely
+thought-template, distilled from the problem-solving processes across various
+tasks. Then for each problem, we retrieve a relevant thought-template and
+adaptively instantiate it with specific reasoning structures to conduct
+efficient reasoning. To guarantee the scalability and stability, we further
+propose buffer-manager to dynamically update the meta-buffer, thus enhancing
+the capacity of meta-buffer as more tasks are solved. We conduct extensive
+experiments on 10 challenging reasoning-intensive tasks, and achieve
+significant performance improvements over previous SOTA methods: 11% on Game of
+24, 20% on Geometric Shapes and 51% on Checkmate-in-One. Further analysis
+demonstrate the superior generalization ability and model robustness of our
+BoT, while requiring only 12% of the cost of multi-query prompting methods
+(e.g., tree/graph of thoughts) on average. Notably, we find that our
+Llama3-8B+BoT has the potential to surpass Llama3-70B model. Our project is
+available at: https://github.com/YangLing0818/buffer-of-thought-llm
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Project: https://github.com/YangLing0818/buffer-of-thought-llm</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ <span class="highlight-title">Transformer</span>s need glasses! Information over-squashing in language tasks 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.04267v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.04267v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Federico Barbero, Andrea Banino, Steven Kapturowski, Dharshan Kumaran, João G. M. Araújo, Alex Vitvitskyi, Razvan Pascanu, Petar Veličković
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  We study how information propagates in decoder-only Transformers, which are
+the architectural backbone of most existing frontier large language models
+(LLMs). We rely on a theoretical signal propagation analysis -- specifically,
+we analyse the representations of the last token in the final layer of the
+Transformer, as this is the representation used for next-token prediction. Our
+analysis reveals a representational collapse phenomenon: we prove that certain
+distinct sequences of inputs to the Transformer can yield arbitrarily close
+representations in the final token. This effect is exacerbated by the
+low-precision floating-point formats frequently used in modern LLMs. As a
+result, the model is provably unable to respond to these sequences in different
+ways -- leading to errors in, e.g., tasks involving counting or copying.
+Further, we show that decoder-only Transformer language models can lose
+sensitivity to specific tokens in the input, which relates to the well-known
+phenomenon of over-squashing in graph neural networks. We provide empirical
+evidence supporting our claims on contemporary LLMs. Our theory also points to
+simple solutions towards ameliorating these issues.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ MLVU: A Comprehensive Benchmark for Multi-Task Long Video Understanding 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.04264v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.04264v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Junjie Zhou, Yan Shu, Bo Zhao, Boya Wu, Shitao Xiao, Xi Yang, Yongping Xiong, Bo Zhang, Tiejun Huang, Zheng Liu
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  The evaluation of Long Video Understanding (LVU) performance poses an
+important but challenging research problem. Despite previous efforts, the
+existing video understanding benchmarks are severely constrained by several
+issues, especially the insufficient lengths of videos, a lack of diversity in
+video types and evaluation tasks, and the inappropriateness for evaluating LVU
+performances. To address the above problems, we propose a new benchmark, called
+MLVU (Multi-task Long Video Understanding Benchmark), for the comprehensive and
+in-depth evaluation of LVU. MLVU presents the following critical values: 1) The
+substantial and flexible extension of video lengths, which enables the
+benchmark to evaluate LVU performance across a wide range of durations. 2) The
+inclusion of various video genres, e.g., movies, surveillance footage,
+egocentric videos, cartoons, game videos, etc., which reflects the models' LVU
+performances in different scenarios. 3) The development of diversified
+evaluation tasks, which enables a comprehensive examination of MLLMs' key
+abilities in long-video understanding. The empirical study with 20 latest MLLMs
+reveals significant room for improvement in today's technique, as all existing
+methods struggle with most of the evaluation tasks and exhibit severe
+performance degradation when handling longer videos. Additionally, it suggests
+that factors such as context length, image-understanding quality, and the
+choice of LLM backbone can play critical roles in future advancements. We
+anticipate that MLVU will advance the research of long video understanding by
+providing a comprehensive and in-depth analysis of MLLMs.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Benchmark Data Contamination of Large Language Models: A <span class="highlight-title">Survey</span> 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.04244v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.04244v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Cheng Xu, Shuhao Guan, Derek Greene, M-Tahar Kechadi
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  The rapid development of Large Language Models (LLMs) like GPT-4, Claude-3,
+and Gemini has transformed the field of natural language processing. However,
+it has also resulted in a significant issue known as Benchmark Data
+Contamination (BDC). This occurs when language models inadvertently incorporate
+evaluation benchmark information from their training data, leading to
+inaccurate or unreliable performance during the evaluation phase of the
+process. This paper reviews the complex challenge of BDC in LLM evaluation and
+explores alternative assessment methods to mitigate the risks associated with
+traditional benchmarks. The paper also examines challenges and future
+directions in mitigating BDC risks, highlighting the complexity of the issue
+and the need for innovative solutions to ensure the reliability of LLM
+evaluation in real-world applications.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>31 pages, 7 figures, 3 tables</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Hypernetworks for Personalizing ASR to Atypical Speech 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.04240v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.04240v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Max Mueller-Eberstein, Dianna Yee, Karren Yang, Gautam Varma Mantena, Colin Lea
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Parameter-efficient fine-tuning (PEFT) for personalizing automatic speech
+recognition (ASR) has recently shown promise for adapting general population
+models to atypical speech. However, these approaches assume a priori knowledge
+of the atypical speech disorder being adapted for -- the diagnosis of which
+requires expert knowledge that is not always available. Even given this
+knowledge, data scarcity and high inter/intra-speaker variability further limit
+the effectiveness of traditional fine-tuning. To circumvent these challenges,
+we first identify the minimal set of model parameters required for ASR
+adaptation. Our analysis of each individual parameter's effect on adaptation
+performance allows us to reduce Word Error Rate (WER) by half while adapting
+0.03\% of all weights. Alleviating the need for cohort-specific models, we next
+propose the novel use of a meta-learned hypernetwork to generate highly
+individualized, utterance-level adaptations on-the-fly for a diverse set of
+atypical speech characteristics. Evaluating adaptation at the global, cohort
+and individual-level, we show that hypernetworks generalize better to
+out-of-distribution speakers, while maintaining an overall relative WER
+reduction of 75.2% using 0.1% of the full parameter budget.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ FairytaleQA Translated: Enabling Educational Question and Answer
+  Generation in Less-Resourced Languages 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.04233v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.04233v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Bernardo Leite, Tomás Freitas Osório, Henrique Lopes Cardoso
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Question Answering (QA) datasets are crucial in assessing reading
+comprehension skills for both machines and humans. While numerous datasets have
+been developed in English for this purpose, a noticeable void exists in
+less-resourced languages. To alleviate this gap, our paper introduces
+machine-translated versions of FairytaleQA, a renowned QA dataset designed to
+assess and enhance narrative comprehension skills in young children. By
+employing fine-tuned, modest-scale models, we establish benchmarks for both
+Question Generation (QG) and QA tasks within the translated datasets. In
+addition, we present a case study proposing a model for generating
+question-answer pairs, with an evaluation incorporating quality metrics such as
+question well-formedness, answerability, relevance, and children suitability.
+Our evaluation prioritizes quantifying and describing error cases, along with
+providing directions for future work. This paper contributes to the advancement
+of QA and QG research in less-resourced languages, promoting accessibility and
+inclusivity in the development of these models for reading comprehension. The
+code and data is publicly available at
+github.com/bernardoleite/fairytaleqa-translated.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Preprint - Accepted for publication at ECTEL 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ The CLRS-Text Algorithmic Reasoning Language Benchmark 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.04229v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.04229v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Larisa Markeeva, Sean McLeish, Borja Ibarz, Wilfried Bounsi, Olga Kozlova, Alex Vitvitskyi, Charles Blundell, Tom Goldstein, Avi Schwarzschild, Petar Veličković
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Eliciting reasoning capabilities from language models (LMs) is a critical
+direction on the path towards building intelligent systems. Most recent studies
+dedicated to reasoning focus on out-of-distribution performance on
+procedurally-generated synthetic benchmarks, bespoke-built to evaluate specific
+skills only. This trend makes results hard to transfer across publications,
+slowing down progress. Three years ago, a similar issue was identified and
+rectified in the field of neural algorithmic reasoning, with the advent of the
+CLRS benchmark. CLRS is a dataset generator comprising graph execution traces
+of classical algorithms from the Introduction to Algorithms textbook. Inspired
+by this, we propose CLRS-Text -- a textual version of these algorithmic traces.
+Out of the box, CLRS-Text is capable of procedurally generating trace data for
+thirty diverse, challenging algorithmic tasks across any desirable input
+distribution, while offering a standard pipeline in which any additional
+algorithmic tasks may be created in the benchmark. We fine-tune and evaluate
+various LMs as generalist executors on this benchmark, validating prior work
+and revealing a novel, interesting challenge for the LM reasoning community.
+Our code is available at
+https://github.com/google-deepmind/clrs/tree/master/clrs/_src/clrs_text.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Preprint, under review. Comments welcome</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ BEADs: Bias Evaluation Across Domains 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.04220v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.04220v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Shaina Raza, Mizanur Rahman, Michael R. Zhang
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Recent improvements in large language models (LLMs) have significantly
+enhanced natural language processing (NLP) applications. However, these models
+can also inherit and perpetuate biases from their training data. Addressing
+this issue is crucial, yet many existing datasets do not offer evaluation
+across diverse NLP tasks. To tackle this, we introduce the Bias Evaluations
+Across Domains (BEADs) dataset, designed to support a wide range of NLP tasks,
+including text classification, bias entity recognition, bias quantification,
+and benign language generation. BEADs uses AI-driven annotation combined with
+experts' verification to provide reliable labels. This method overcomes the
+limitations of existing datasets that typically depend on crowd-sourcing,
+expert-only annotations with limited bias evaluations, or unverified AI
+labeling. Our empirical analysis shows that BEADs is effective in detecting and
+reducing biases across different language models, with smaller models
+fine-tuned on BEADs often outperforming LLMs in bias classification tasks.
+However, these models may still exhibit biases towards certain demographics.
+Fine-tuning LLMs with our benign language data also reduces biases while
+preserving the models' knowledge. Our findings highlight the importance of
+comprehensive bias evaluation and the potential of targeted fine-tuning for
+reducing the bias of LLMs. We are making BEADs publicly available at
+https://huggingface.co/datasets/shainar/BEAD
+  Warning: This paper contains examples that may be considered offensive.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>under review</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Rethinking LLM and Linguistic Steganalysis: An Efficient Detection of
+  Strongly Concealed Stego 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.04218v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.04218v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Yifan Tang, Yihao Wang, Ru Zhang, Jianyi Liu
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  To detect stego (steganographic text) in complex scenarios, linguistic
+steganalysis (LS) with various motivations has been proposed and achieved
+excellent performance. However, with the development of generative
+steganography, some stegos have strong concealment, especially after the
+emergence of LLMs-based steganography, the existing LS has low detection or
+even cannot detect them. We designed a novel LS with two modes called LSGC. In
+the generation mode, we created an LS-task "description" and used the
+generation ability of LLM to explain whether texts to be detected are stegos.
+On this basis, we rethought the principle of LS and LLMs, and proposed the
+classification mode. In this mode, LSGC deleted the LS-task "description" and
+changed the "causalLM" LLMs to the "sequenceClassification" architecture. The
+LS features can be extracted by only one pass of the model, and a linear layer
+with initialization weights is added to obtain the classification probability.
+Experiments on strongly concealed stegos show that LSGC significantly improves
+detection and reaches SOTA performance. Additionally, LSGC in classification
+mode greatly reduces training time while maintaining high performance.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ What Do Language Models Learn in Context? The Structured Task Hypothesis <span class="chip">ACL 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.04216v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.04216v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Jiaoda Li, Yifan Hou, Mrinmaya Sachan, Ryan Cotterell
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Large language models (LLMs) exhibit an intriguing ability to learn a novel
+task from in-context examples presented in a demonstration, termed in-context
+learning (ICL). Understandably, a swath of research has been dedicated to
+uncovering the theories underpinning ICL. One popular hypothesis explains ICL
+by task selection. LLMs identify the task based on the demonstration and
+generalize it to the prompt. Another popular hypothesis is that ICL is a form
+of meta-learning, i.e., the models learn a learning algorithm at pre-training
+time and apply it to the demonstration. Finally, a third hypothesis argues that
+LLMs use the demonstration to select a composition of tasks learned during
+pre-training to perform ICL. In this paper, we empirically explore these three
+hypotheses that explain LLMs' ability to learn in context with a suite of
+experiments derived from common text classification tasks. We invalidate the
+first two hypotheses with counterexamples and provide evidence in support of
+the last hypothesis. Our results suggest an LLM could learn a novel task in
+context via composing tasks learned during pre-training.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>This work is published in ACL 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ mCSQA: Multilingual Commonsense Reasoning <span class="highlight-title">Dataset</span> with Unified Creation
+  Strategy by Language Models and Humans <span class="chip">ACL 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.04215v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.04215v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Yusuke Sakai, Hidetaka Kamigaito, Taro Watanabe
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  It is very challenging to curate a dataset for language-specific knowledge
+and common sense in order to evaluate natural language understanding
+capabilities of language models. Due to the limitation in the availability of
+annotators, most current multilingual datasets are created through translation,
+which cannot evaluate such language-specific aspects. Therefore, we propose
+Multilingual CommonsenseQA (mCSQA) based on the construction process of CSQA
+but leveraging language models for a more efficient construction, e.g., by
+asking LM to generate questions/answers, refine answers and verify QAs followed
+by reduced human efforts for verification. Constructed dataset is a benchmark
+for cross-lingual language-transfer capabilities of multilingual LMs, and
+experimental results showed high language-transfer capabilities for questions
+that LMs could easily solve, but lower transfer capabilities for questions
+requiring deep knowledge or commonsense. This highlights the necessity of
+language-specific datasets for evaluation and training. Finally, our method
+demonstrated that multilingual LMs could create QA including language-specific
+knowledge, significantly reducing the dataset creation cost compared to manual
+creation. The datasets are available at
+https://huggingface.co/datasets/yusuke1997/mCSQA.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted at Findings of ACL 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ ValueBench: Towards Comprehensively Evaluating Value Orientations and
+  Understanding of Large Language Models <span class="chip">ACL 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.04214v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.04214v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Yuanyi Ren, Haoran Ye, Hanjun Fang, Xin Zhang, Guojie Song
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Large Language Models (LLMs) are transforming diverse fields and gaining
+increasing influence as human proxies. This development underscores the urgent
+need for evaluating value orientations and understanding of LLMs to ensure
+their responsible integration into public-facing applications. This work
+introduces ValueBench, the first comprehensive psychometric benchmark for
+evaluating value orientations and value understanding in LLMs. ValueBench
+collects data from 44 established psychometric inventories, encompassing 453
+multifaceted value dimensions. We propose an evaluation pipeline grounded in
+realistic human-AI interactions to probe value orientations, along with novel
+tasks for evaluating value understanding in an open-ended value space. With
+extensive experiments conducted on six representative LLMs, we unveil their
+shared and distinctive value orientations and exhibit their ability to
+approximate expert conclusions in value-related extraction and generation
+tasks. ValueBench is openly accessible at
+https://github.com/Value4AI/ValueBench.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted at ACL 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Legal Documents Drafting with Fine-Tuned <span class="highlight-title">Pre-Train</span>ed Large Language
+  Model 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.04202v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.04202v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Chun-Hsien Lin, Pu-Jen Cheng
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  With the development of large-scale Language Models (LLM), fine-tuning
+pre-trained LLM has become a mainstream paradigm for solving downstream tasks
+of natural language processing. However, training a language model in the legal
+field requires a large number of legal documents so that the language model can
+learn legal terminology and the particularity of the format of legal documents.
+The typical NLP approaches usually rely on many manually annotated data sets
+for training. However, in the legal field application, it is difficult to
+obtain a large number of manually annotated data sets, which restricts the
+typical method applied to the task of drafting legal documents. The
+experimental results of this paper show that not only can we leverage a large
+number of annotation-free legal documents without Chinese word segmentation to
+fine-tune a large-scale language model, but more importantly, it can fine-tune
+a pre-trained LLM on the local computer to achieve the generating legal
+document drafts task, and at the same time achieve the protection of
+information privacy and to improve information security issues.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>12th International Conference on Software Engineering & Trends (SE
+  2024), April 27 ~ 28, 2024, Copenhagen, Denmark Volume Editors : David C.
+  Wyld, Dhinaharan Nagamalai (Eds) ISBN : 978-1-923107-24-3</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ DICE: Detecting In-distribution Contamination in LLM's Fine-tuning Phase
+  for Math Reasoning 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.04197v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.04197v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Shangqing Tu, Kejian Zhu, Yushi Bai, Zijun Yao, Lei Hou, Juanzi Li
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  The advancement of large language models (LLMs) relies on evaluation using
+public benchmarks, but data contamination can lead to overestimated
+performance. Previous researches focus on detecting contamination by
+determining whether the model has seen the exact same data during training. In
+this work, we argue that even training on data similar to benchmark data
+inflates performance on in-distribution tasks without improving overall
+capacity, which we called In-distribution contamination. To effectively detect
+in-distribution contamination, we propose DICE, a novel method that leverages
+the internal states of LLMs to locate-then-detect the contamination. DICE first
+identifies the most sensitive layer to contamination, then trains a classifier
+based on the internal states of that layer. Experiments reveal DICE's high
+accuracy in detecting in-distribution contamination across various LLMs and
+math reasoning datasets. We also show the generalization capability of the
+trained DICE detector, which is able to detect contamination across multiple
+benchmarks with similar distributions. Additionally, we find that the DICE
+detection scores are positively correlated with the performance of ten LLMs
+fine-tuned by either us or other organizations on four math reasoning datasets
+(with $R^2$ values between 0.6 and 0.75). This indicates that the
+in-distribution contamination problem potentially lead to an overestimation of
+the true capabilities of many existing models. The code and data are available
+at https://github.com/THU-KEG/DICE.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>13 pages, 7 figures</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Confabulation: The Surprising Value of Large Language Model
+  Hallucinations <span class="chip">ACL2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.04175v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.04175v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Peiqi Sui, Eamon Duede, Sophie Wu, Richard Jean So
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  This paper presents a systematic defense of large language model (LLM)
+hallucinations or 'confabulations' as a potential resource instead of a
+categorically negative pitfall. The standard view is that confabulations are
+inherently problematic and AI research should eliminate this flaw. In this
+paper, we argue and empirically demonstrate that measurable semantic
+characteristics of LLM confabulations mirror a human propensity to utilize
+increased narrativity as a cognitive resource for sense-making and
+communication. In other words, it has potential value. Specifically, we analyze
+popular hallucination benchmarks and reveal that hallucinated outputs display
+increased levels of narrativity and semantic coherence relative to veridical
+outputs. This finding reveals a tension in our usually dismissive
+understandings of confabulation. It suggests, counter-intuitively, that the
+tendency for LLMs to confabulate may be intimately associated with a positive
+capacity for coherent narrative-text generation.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Forthcoming at ACL2024 main conference. 1 figure</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Pointer-Guided <span class="highlight-title">Pre-Train</span>ing: Infusing Large Language Models with
+  Paragraph-Level Contextual Awareness <span class="chip">ECML-PKDD 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.04156v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.04156v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Lars Hillebrand, Prabhupad Pradhan, Christian Bauckhage, Rafet Sifa
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  We introduce "pointer-guided segment ordering" (SO), a novel pre-training
+technique aimed at enhancing the contextual understanding of paragraph-level
+text representations in large language models. Our methodology leverages a
+self-attention-driven pointer network to restore the original sequence of
+shuffled text segments, addressing the challenge of capturing the structural
+coherence and contextual dependencies within documents. This pre-training
+approach is complemented by a fine-tuning methodology that incorporates dynamic
+sampling, augmenting the diversity of training instances and improving sample
+efficiency for various downstream applications. We evaluate our method on a
+diverse set of datasets, demonstrating its efficacy in tasks requiring
+sequential text classification across scientific literature and financial
+reporting domains. Our experiments show that pointer-guided pre-training
+significantly enhances the model's ability to understand complex document
+structures, leading to state-of-the-art performance in downstream
+classification tasks.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>17 pages, 3 figures, 5 tables, accepted at ECML-PKDD 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ AgentGym: Evolving Large Language Model-based Agents across Diverse
+  Environments 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.04151v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.04151v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Zhiheng Xi, Yiwen Ding, Wenxiang Chen, Boyang Hong, Honglin Guo, Junzhe Wang, Dingwen Yang, Chenyang Liao, Xin Guo, Wei He, Songyang Gao, Lu Chen, Rui Zheng, Yicheng Zou, Tao Gui, Qi Zhang, Xipeng Qiu, Xuanjing Huang, Zuxuan Wu, Yu-Gang Jiang
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Building generalist agents that can handle diverse tasks and evolve
+themselves across different environments is a long-term goal in the AI
+community. Large language models (LLMs) are considered a promising foundation
+to build such agents due to their generalized capabilities. Current approaches
+either have LLM-based agents imitate expert-provided trajectories step-by-step,
+requiring human supervision, which is hard to scale and limits environmental
+exploration; or they let agents explore and learn in isolated environments,
+resulting in specialist agents with limited generalization. In this paper, we
+take the first step towards building generally-capable LLM-based agents with
+self-evolution ability. We identify a trinity of ingredients: 1) diverse
+environments for agent exploration and learning, 2) a trajectory set to equip
+agents with basic capabilities and prior knowledge, and 3) an effective and
+scalable evolution method. We propose AgentGym, a new framework featuring a
+variety of environments and tasks for broad, real-time, uni-format, and
+concurrent agent exploration. AgentGym also includes a database with expanded
+instructions, a benchmark suite, and high-quality trajectories across
+environments. Next, we propose a novel method, AgentEvol, to investigate the
+potential of agent self-evolution beyond previously seen data across tasks and
+environments. Experimental results show that the evolved agents can achieve
+results comparable to SOTA models. We release the AgentGym suite, including the
+platform, dataset, benchmark, checkpoints, and algorithm implementations. The
+AgentGym suite is available on https://github.com/WooooDyy/AgentGym.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Project site: https://agentgym.github.io</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Towards Understanding Task-agnostic Debiasing Through the Lenses of
+  Intrinsic Bias and Forgetfulness 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.04146v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.04146v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Guangliang Liu, Milad Afshari, Xitong Zhang, Zhiyu Xue, Avrajit Ghosh, Bidhan Bashyal, Rongrong Wang, Kristen Johnson
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  While task-agnostic debiasing provides notable generalizability and reduced
+reliance on downstream data, its impact on language modeling ability and the
+risk of relearning social biases from downstream task-specific data remain as
+the two most significant challenges when debiasing Pretrained Language Models
+(PLMs). The impact on language modeling ability can be alleviated given a
+high-quality and long-contextualized debiasing corpus, but there remains a
+deficiency in understanding the specifics of relearning biases. We empirically
+ascertain that the effectiveness of task-agnostic debiasing hinges on the
+quantitative bias level of both the task-specific data used for downstream
+applications and the debiased model. We empirically show that the lower bound
+of the bias level of the downstream fine-tuned model can be approximated by the
+bias level of the debiased model, in most practical cases. To gain more
+in-depth understanding about how the parameters of PLMs change during
+fine-tuning due to the forgetting issue of PLMs, we propose a novel framework
+which can Propagate Socially-fair Debiasing to Downstream Fine-tuning,
+ProSocialTuning. Our proposed framework can push the fine-tuned model to
+approach the bias lower bound during downstream fine-tuning, indicating that
+the ineffectiveness of debiasing can be alleviated by overcoming the forgetting
+issue through regularizing successfully debiased attention heads based on the
+PLMs' bias levels from stages of pretraining and debiasing.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Every Answer Matters: Evaluating Commonsense with Probabilistic Measures <span class="chip">ACL 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.04145v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.04145v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Qi Cheng, Michael Boratko, Pranay Kumar Yelugam, Tim O'Gorman, Nalini Singh, Andrew McCallum, Xiang Lorraine Li
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Large language models have demonstrated impressive performance on commonsense
+tasks; however, these tasks are often posed as multiple-choice questions,
+allowing models to exploit systematic biases. Commonsense is also inherently
+probabilistic with multiple correct answers. The purpose of "boiling water"
+could be making tea and cooking, but it also could be killing germs. Existing
+tasks do not capture the probabilistic nature of common sense. To this end, we
+present commonsense frame completion (CFC), a new generative task that
+evaluates common sense via multiple open-ended generations. We also propose a
+method of probabilistic evaluation that strongly correlates with human
+judgments. Humans drastically outperform strong language model baselines on our
+dataset, indicating this approach is both a challenging and useful evaluation
+of machine common sense.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>ACL 2024 Camera Ready</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Do Language Models Understand Morality? Towards a Robust Detection of
+  Moral Content 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.04143v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.04143v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Luana Bulla, Aldo Gangemi, Misael Mongiovì
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  The task of detecting moral values in text has significant implications in
+various fields, including natural language processing, social sciences, and
+ethical decision-making. Previously proposed supervised models often suffer
+from overfitting, leading to hyper-specialized moral classifiers that struggle
+to perform well on data from different domains. To address this issue, we
+introduce novel systems that leverage abstract concepts and common-sense
+knowledge acquired from Large Language Models and Natural Language Inference
+models during previous stages of training on multiple data sources. By doing
+so, we aim to develop versatile and robust methods for detecting moral values
+in real-world scenarios. Our approach uses the GPT 3.5 model as a zero-shot
+ready-made unsupervised multi-label classifier for moral values detection,
+eliminating the need for explicit training on labeled data. We compare it with
+a smaller NLI-based zero-shot model. The results show that the NLI approach
+achieves competitive results compared to the Davinci model. Furthermore, we
+conduct an in-depth investigation of the performance of supervised systems in
+the context of cross-domain multi-label moral value detection. This involves
+training supervised models on different domains to explore their effectiveness
+in handling data from different sources and comparing their performance with
+the unsupervised methods. Our contributions encompass a thorough analysis of
+both supervised and unsupervised methodologies for cross-domain value
+detection. We introduce the Davinci model as a state-of-the-art zero-shot
+unsupervised moral values classifier, pushing the boundaries of moral value
+detection without the need for explicit training on labeled data. Additionally,
+we perform a comparative evaluation of our approach with the supervised models,
+shedding light on their respective strengths and weaknesses.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Legal Judgment Reimagined: PredEx and the Rise of Intelligent AI
+  Interpretation in Indian Courts 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.04136v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.04136v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Shubham Kumar Nigam, Anurag Sharma, Danush Khanna, Noel Shallum, Kripabandhu Ghosh, Arnab Bhattacharya
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  In the era of Large Language Models (LLMs), predicting judicial outcomes
+poses significant challenges due to the complexity of legal proceedings and the
+scarcity of expert-annotated datasets. Addressing this, we introduce
+\textbf{Pred}iction with \textbf{Ex}planation (\texttt{PredEx}), the largest
+expert-annotated dataset for legal judgment prediction and explanation in the
+Indian context, featuring over 15,000 annotations. This groundbreaking corpus
+significantly enhances the training and evaluation of AI models in legal
+analysis, with innovations including the application of instruction tuning to
+LLMs. This method has markedly improved the predictive accuracy and explanatory
+depth of these models for legal judgments. We employed various
+transformer-based models, tailored for both general and Indian legal contexts.
+Through rigorous lexical, semantic, and expert assessments, our models
+effectively leverage \texttt{PredEx} to provide precise predictions and
+meaningful explanations, establishing it as a valuable benchmark for both the
+legal profession and the NLP community.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Are We Done with MMLU? 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.04127v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.04127v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Aryo Pradipta Gema, Joshua Ong Jun Leang, Giwon Hong, Alessio Devoto, Alberto Carlo Maria Mancino, Rohit Saxena, Xuanli He, Yu Zhao, Xiaotang Du, Mohammad Reza Ghasemi Madani, Claire Barale, Robert McHardy, Joshua Harris, Jean Kaddour, Emile van Krieken, Pasquale Minervini
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Maybe not. We identify and analyse errors in the popular Massive Multitask
+Language Understanding (MMLU) benchmark. Even though MMLU is widely adopted,
+our analysis demonstrates numerous ground truth errors that obscure the true
+capabilities of LLMs. For example, we find that 57% of the analysed questions
+in the Virology subset contain errors. To address this issue, we introduce a
+comprehensive framework for identifying dataset errors using a novel error
+taxonomy. Then, we create MMLU-Redux, which is a subset of 3,000 manually
+re-annotated questions across 30 MMLU subjects. Using MMLU-Redux, we
+demonstrate significant discrepancies with the model performance metrics that
+were originally reported. Our results strongly advocate for revising MMLU's
+error-ridden questions to enhance its future utility and reliability as a
+benchmark. Therefore, we open up MMLU-Redux for additional annotation
+https://huggingface.co/datasets/edinburgh-dawg/mmlu-redux.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Promoting Fairness and Diversity in Speech <span class="highlight-title">Dataset</span>s for Mental Health
+  and Neurological Disorders Research 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.04116v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.04116v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Eleonora Mancini, Ana Tanevska, Andrea Galassi, Alessio Galatolo, Federico Ruggeri, Paolo Torroni
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Current research in machine learning and artificial intelligence is largely
+centered on modeling and performance evaluation, less so on data collection.
+However, recent research demonstrated that limitations and biases in data may
+negatively impact trustworthiness and reliability. These aspects are
+particularly impactful on sensitive domains such as mental health and
+neurological disorders, where speech data are used to develop AI applications
+aimed at improving the health of patients and supporting healthcare providers.
+In this paper, we chart the landscape of available speech datasets for this
+domain, to highlight possible pitfalls and opportunities for improvement and
+promote fairness and diversity. We present a comprehensive list of desiderata
+for building speech datasets for mental health and neurological disorders and
+distill it into a checklist focused on ethical concerns to foster more
+responsible research.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>34 pages</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Uncovering Limitations of Large Language Models in Information Seeking
+  from Tables <span class="chip">ACL 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.04113v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.04113v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Chaoxu Pang, Yixuan Cao, Chunhao Yang, Ping Luo
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Tables are recognized for their high information density and widespread
+usage, serving as essential sources of information. Seeking information from
+tables (TIS) is a crucial capability for Large Language Models (LLMs), serving
+as the foundation of knowledge-based Q&A systems. However, this field presently
+suffers from an absence of thorough and reliable evaluation. This paper
+introduces a more reliable benchmark for Table Information Seeking (TabIS). To
+avoid the unreliable evaluation caused by text similarity-based metrics, TabIS
+adopts a single-choice question format (with two options per question) instead
+of a text generation format. We establish an effective pipeline for generating
+options, ensuring their difficulty and quality. Experiments conducted on 12
+LLMs reveal that while the performance of GPT-4-turbo is marginally
+satisfactory, both other proprietary and open-source models perform
+inadequately. Further analysis shows that LLMs exhibit a poor understanding of
+table structures, and struggle to balance between TIS performance and
+robustness against pseudo-relevant tables (common in retrieval-augmented
+systems). These findings uncover the limitations and potential challenges of
+LLMs in seeking information from tables. We release our data and code to
+facilitate further research in this field.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Findings of ACL 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Intention and Face in Dialog 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.04109v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.04109v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Adil Soubki, Owen Rambow
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  The notion of face described by Brown and Levinson (1987) has been studied in
+great detail, but a critical aspect of the framework, that which focuses on how
+intentions mediate the planning of turns which impose upon face, has received
+far less attention. We present an analysis of three computational systems
+trained for classifying both intention and politeness, focusing on how the
+former influences the latter. In politeness theory, agents attend to the desire
+to have their wants appreciated (positive face), and a complementary desire to
+act unimpeded and maintain freedom (negative face). Similar to speech acts,
+utterances can perform so-called face acts which can either raise or threaten
+the positive or negative face of the speaker or hearer. We begin by using an
+existing corpus to train a model which classifies face acts, achieving a new
+SoTA in the process. We then observe that every face act has an underlying
+intention that motivates it and perform additional experiments integrating
+dialog act annotations to provide these intentions by proxy. Our analysis finds
+that dialog acts improve performance on face act detection for minority classes
+and points to a close relationship between aspects of face and intent.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Explainability and Hate Speech: Structured Explanations Make Social
+  Media Moderators Faster <span class="chip">ACL 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.04106v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.04106v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Agostina Calabrese, Leonardo Neves, Neil Shah, Maarten W. Bos, Björn Ross, Mirella Lapata, Francesco Barbieri
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Content moderators play a key role in keeping the conversation on social
+media healthy. While the high volume of content they need to judge represents a
+bottleneck to the moderation pipeline, no studies have explored how models
+could support them to make faster decisions. There is, by now, a vast body of
+research into detecting hate speech, sometimes explicitly motivated by a desire
+to help improve content moderation, but published research using real content
+moderators is scarce. In this work we investigate the effect of explanations on
+the speed of real-world moderators. Our experiments show that while generic
+explanations do not affect their speed and are often ignored, structured
+explanations lower moderators' decision making time by 7.4%.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>11 pages, 14 figures, to be published at ACL 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Ask LLMs Directly, "What shapes your bias?": Measuring Social Bias in
+  Large Language Models <span class="chip">ACL 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.04064v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.04064v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Jisu Shin, Hoyun Song, Huije Lee, Soyeong Jeong, Jong C. Park
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Social bias is shaped by the accumulation of social perceptions towards
+targets across various demographic identities. To fully understand such social
+bias in large language models (LLMs), it is essential to consider the composite
+of social perceptions from diverse perspectives among identities. Previous
+studies have either evaluated biases in LLMs by indirectly assessing the
+presence of sentiments towards demographic identities in the generated text or
+measuring the degree of alignment with given stereotypes. These methods have
+limitations in directly quantifying social biases at the level of distinct
+perspectives among identities. In this paper, we aim to investigate how social
+perceptions from various viewpoints contribute to the development of social
+bias in LLMs. To this end, we propose a novel strategy to intuitively quantify
+these social perceptions and suggest metrics that can evaluate the social
+biases within LLMs by aggregating diverse social perceptions. The experimental
+results show the quantitative demonstration of the social attitude in LLMs by
+examining social perception. The analysis we conducted shows that our proposed
+metrics capture the multi-dimensional aspects of social bias, enabling a
+fine-grained and comprehensive investigation of bias in LLMs.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Findings of ACL 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ The syntax-semantics interface in a child's path: A study of 3- to
+  11-year-olds' elicited production of Mandarin recursive relative clauses 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.04025v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.04025v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Caimei Yang, Qihang Yang, Xingzhi Su, Chenxi Fu, Xiaoyi Wang, Ying Yan, Zaijiang Man
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  There have been apparently conflicting claims over the syntax-semantics
+relationship in child acquisition. However, few of them have assessed the
+child's path toward the acquisition of recursive relative clauses (RRCs). The
+authors of the current paper did experiments to investigate 3- to 11-year-olds'
+most-structured elicited production of eight Mandarin RRCs in a 4 (syntactic
+types)*2 (semantic conditions) design. The four syntactic types were RRCs with
+a subject-gapped RC embedded in an object-gapped RC (SORRCs), RRCs with an
+object-gapped RC embedded in another object-gapped RC (OORRCs), RRCs with an
+object-gapped RC embedded in a subject-gapped RC (OSRRCs), and RRCs with a
+subject-gapped RC embedded in another subject-gapped RC (SSRRCs). Each
+syntactic type was put in two conditions differing in internal semantics:
+irreversible internal semantics (IIS) and reversible internal semantics (RIS).
+For example, "the balloon that [the girl that _ eats the banana] holds _" is
+SORRCs in the IIS condition; "the monkey that [the dog that _ bites the pig]
+hits_" is SORRCs in the RIS condition. For each target, the participants were
+provided with a speech-visual stimulus constructing a condition of irreversible
+external semantics (IES). The results showed that SSRRCs, OSRRCs and SORRCs in
+the IIS-IES condition were produced two years earlier than their counterparts
+in the RIS-IES condition. Thus, a 2-stage development path is proposed: the
+language acquisition device starts with the interface between (irreversible)
+syntax and IIS, and ends with the interface between syntax and IES, both
+abiding by the syntax-semantic interface principle.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ American Sign Language Handshapes Reflect Pressures for Communicative
+  Efficiency <span class="chip">ACL 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.04024v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.04024v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Kayo Yin, Terry Regier, Dan Klein
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Communicative efficiency is a prominent theory in linguistics and cognitive
+science. While numerous studies have shown how the pressure to save energy is
+reflected in the form of spoken languages, few have explored this phenomenon in
+signed languages. In this paper, we show how handshapes in American Sign
+Language (ASL) reflect these efficiency pressures and we present new evidence
+of communicative efficiency in the visual-gestural modality.
+  We focus on handshapes that are used in both native ASL signs and signs
+borrowed from English to compare efficiency pressures from both ASL and
+English. First, we design new methodologies to quantify the articulatory effort
+required to produce handshapes as well as the perceptual effort needed to
+recognize them. Then, we compare correlations between communicative effort and
+usage statistics in ASL and English. Our findings reveal that frequent ASL
+handshapes are easier to produce and that pressures for communicative
+efficiency mostly come from ASL usage, not from English lexical borrowing.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted to ACL 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Assessing LLMs for Zero-shot Abstractive Summarization Through the Lens
+  of Relevance Paraphrasing 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.03993v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.03993v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Hadi Askari, Anshuman Chhabra, Muhao Chen, Prasant Mohapatra
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Large Language Models (LLMs) have achieved state-of-the-art performance at
+zero-shot generation of abstractive summaries for given articles. However,
+little is known about the robustness of such a process of zero-shot
+summarization. To bridge this gap, we propose relevance paraphrasing, a simple
+strategy that can be used to measure the robustness of LLMs as summarizers. The
+relevance paraphrasing approach identifies the most relevant sentences that
+contribute to generating an ideal summary, and then paraphrases these inputs to
+obtain a minimally perturbed dataset. Then, by evaluating model performance for
+summarization on both the original and perturbed datasets, we can assess the
+LLM's one aspect of robustness. We conduct extensive experiments with relevance
+paraphrasing on 4 diverse datasets, as well as 4 LLMs of different sizes
+(GPT-3.5-Turbo, Llama-2-13B, Mistral-7B, and Dolly-v2-7B). Our results indicate
+that LLMs are not consistent summarizers for the minimally perturbed articles,
+necessitating further improvements.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ On The Persona-based Summarization of Domain-Specific Documents 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.03986v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.03986v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Ankan Mullick, Sombit Bose, Rounak Saha, Ayan Kumar Bhowmick, Pawan Goyal, Niloy Ganguly, Prasenjit Dey, Ravi Kokku
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  In an ever-expanding world of domain-specific knowledge, the increasing
+complexity of consuming, and storing information necessitates the generation of
+summaries from large information repositories. However, every persona of a
+domain has different requirements of information and hence their summarization.
+For example, in the healthcare domain, a persona-based (such as Doctor, Nurse,
+Patient etc.) approach is imperative to deliver targeted medical information
+efficiently. Persona-based summarization of domain-specific information by
+humans is a high cognitive load task and is generally not preferred. The
+summaries generated by two different humans have high variability and do not
+scale in cost and subject matter expertise as domains and personas grow.
+Further, AI-generated summaries using generic Large Language Models (LLMs) may
+not necessarily offer satisfactory accuracy for different domains unless they
+have been specifically trained on domain-specific data and can also be very
+expensive to use in day-to-day operations. Our contribution in this paper is
+two-fold: 1) We present an approach to efficiently fine-tune a domain-specific
+small foundation LLM using a healthcare corpus and also show that we can
+effectively evaluate the summarization quality using AI-based critiquing. 2) We
+further show that AI-based critiquing has good concordance with Human-based
+critiquing of the summaries. Hence, such AI-based pipelines to generate
+domain-specific persona-based summaries can be easily scaled to other domains
+such as legal, enterprise documents, education etc. in a very efficient and
+cost-effective manner.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ A + B: A General Generator-Reader Framework for Optimizing LLMs to
+  Unleash Synergy Potential <span class="chip">ACL'24</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.03963v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.03963v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Wei Tang, Yixin Cao, Jiahao Ying, Bo Wang, Yuyue Zhao, Yong Liao, Pengyuan Zhou
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Retrieval-Augmented Generation (RAG) is an effective solution to supplement
+necessary knowledge to large language models (LLMs). Targeting its bottleneck
+of retriever performance, "generate-then-read" pipeline is proposed to replace
+the retrieval stage with generation from the LLM itself. Although promising,
+this research direction is underexplored and still cannot work in the scenario
+when source knowledge is given. In this paper, we formalize a general "A + B"
+framework with varying combinations of foundation models and types for
+systematic investigation. We explore the efficacy of the base and chat versions
+of LLMs and found their different functionalities suitable for generator A and
+reader B, respectively. Their combinations consistently outperform single
+models, especially in complex scenarios. Furthermore, we extend the application
+of the "A + B" framework to scenarios involving source documents through
+continuous learning, enabling the direct integration of external knowledge into
+LLMs. This approach not only facilitates effective acquisition of new knowledge
+but also addresses the challenges of safety and helpfulness post-adaptation.
+The paper underscores the versatility of the "A + B" framework, demonstrating
+its potential to enhance the practical application of LLMs across various
+domains.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted to ACL'24 (Findings)</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Tox-BART: Leveraging Toxicity Attributes for Explanation Generation of
+  Implicit Hate Speech <span class="chip">ACL</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.03953v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.03953v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Neemesh Yadav, Sarah Masud, Vikram Goyal, Vikram Goyal, Md Shad Akhtar, Tanmoy Chakraborty
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Employing language models to generate explanations for an incoming implicit
+hate post is an active area of research. The explanation is intended to make
+explicit the underlying stereotype and aid content moderators. The training
+often combines top-k relevant knowledge graph (KG) tuples to provide world
+knowledge and improve performance on standard metrics. Interestingly, our study
+presents conflicting evidence for the role of the quality of KG tuples in
+generating implicit explanations. Consequently, simpler models incorporating
+external toxicity signals outperform KG-infused models. Compared to the
+KG-based setup, we observe a comparable performance for SBIC (LatentHatred)
+datasets with a performance variation of +0.44 (+0.49), +1.83 (-1.56), and
+-4.59 (+0.77) in BLEU, ROUGE-L, and BERTScore. Further human evaluation and
+error analysis reveal that our proposed setup produces more precise
+explanations than zero-shot GPT-3.5, highlighting the intricate nature of the
+task.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>17 Pages, 5 Figures, 13 Tables, ACL Findings 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ UltraMedical: Building Specialized Generalists in Biomedicine 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.03949v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.03949v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Kaiyan Zhang, Sihang Zeng, Ermo Hua, Ning Ding, Zhang-Ren Chen, Zhiyuan Ma, Haoxin Li, Ganqu Cui, Biqing Qi, Xuekai Zhu, Xingtai Lv, Hu Jinfang, Zhiyuan Liu, Bowen Zhou
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Large Language Models (LLMs) have demonstrated remarkable capabilities across
+various domains and are moving towards more specialized areas. Recent advanced
+proprietary models such as GPT-4 and Gemini have achieved significant
+advancements in biomedicine, which have also raised privacy and security
+challenges. The construction of specialized generalists hinges largely on
+high-quality datasets, enhanced by techniques like supervised fine-tuning and
+reinforcement learning from human or AI feedback, and direct preference
+optimization. However, these leading technologies (e.g., preference learning)
+are still significantly limited in the open source community due to the
+scarcity of specialized data. In this paper, we present the UltraMedical
+collections, which consist of high-quality manual and synthetic datasets in the
+biomedicine domain, featuring preference annotations across multiple advanced
+LLMs. By utilizing these datasets, we fine-tune a suite of specialized medical
+models based on Llama-3 series, demonstrating breathtaking capabilities across
+various medical benchmarks. Moreover, we develop powerful reward models skilled
+in biomedical and general reward benchmark, enhancing further online preference
+learning within the biomedical LLM community.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Datasets and models are available at
+  https://github.com/TsinghuaC3I/UltraMedical</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Culturally Aware and Adapted NLP: A Taxonomy and a <span class="highlight-title">Survey</span> of the State
+  of the Art 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.03930v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.03930v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Chen Cecilia Liu, Iryna Gurevych, Anna Korhonen
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  The surge of interest in culturally aware and adapted Natural Language
+Processing (NLP) has inspired much recent research. However, the lack of common
+understanding of the concept of "culture" has made it difficult to evaluate
+progress in this emerging area. Drawing on prior research in NLP and related
+fields, we propose an extensive taxonomy of elements of culture that can
+provide a systematic framework for analyzing and understanding research
+progress. Using the taxonomy, we survey existing resources and models for
+culturally aware and adapted NLP, providing an overview of the state of the art
+and the research gaps that still need to be filled.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ ArMeme: Propagandistic Content in Arabic Memes 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.03916v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.03916v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Firoj Alam, Abul Hasnat, Fatema Ahmed, Md Arid Hasan, Maram Hasanain
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  With the rise of digital communication, memes have become a significant
+medium for cultural and political expression that is often used to mislead
+audiences. Identification of such misleading and persuasive multimodal content
+has become more important among various stakeholders, including social media
+platforms, policymakers, and the broader society as they often cause harm to
+individuals, organizations, and/or society. While there has been effort to
+develop AI-based automatic systems for resource-rich languages (e.g., English),
+it is relatively little to none for medium to low resource languages. In this
+study, we focused on developing an Arabic memes dataset with manual annotations
+of propagandistic content. We annotated ~6K Arabic memes collected from various
+social media platforms, which is a first resource for Arabic multimodal
+research. We provide a comprehensive analysis aiming to develop computational
+tools for their detection. We will make them publicly available for the
+community.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>disinformation, misinformation, factuality, harmfulness, fake news,
+  propaganda, multimodality, text, images</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ HeSum: a Novel <span class="highlight-title">Dataset</span> for Abstractive Text Summarization in Hebrew 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.03897v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.03897v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Tzuf Paz-Argaman, Itai Mondshine, Asaf Achi Mordechai, Reut Tsarfaty
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  While large language models (LLMs) excel in various natural language tasks in
+English, their performance in lower-resourced languages like Hebrew, especially
+for generative tasks such as abstractive summarization, remains unclear. The
+high morphological richness in Hebrew adds further challenges due to the
+ambiguity in sentence comprehension and the complexities in meaning
+construction. In this paper, we address this resource and evaluation gap by
+introducing HeSum, a novel benchmark specifically designed for abstractive text
+summarization in Modern Hebrew. HeSum consists of 10,000 article-summary pairs
+sourced from Hebrew news websites written by professionals. Linguistic analysis
+confirms HeSum's high abstractness and unique morphological challenges. We show
+that HeSum presents distinct difficulties for contemporary state-of-the-art
+LLMs, establishing it as a valuable testbed for generative language technology
+in Hebrew, and MRLs generative challenges in general.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ How Good is Zero-Shot MT Evaluation for Low Resource Indian Languages? 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.03893v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.03893v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Anushka Singh, Ananya B. Sai, Raj Dabre, Ratish Puduppully, Anoop Kunchukuttan, Mitesh M Khapra
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  While machine translation evaluation has been studied primarily for
+high-resource languages, there has been a recent interest in evaluation for
+low-resource languages due to the increasing availability of data and models.
+In this paper, we focus on a zero-shot evaluation setting focusing on
+low-resource Indian languages, namely Assamese, Kannada, Maithili, and Punjabi.
+We collect sufficient Multi-Dimensional Quality Metrics (MQM) and Direct
+Assessment (DA) annotations to create test sets and meta-evaluate a plethora of
+automatic evaluation metrics. We observe that even for learned metrics, which
+are known to exhibit zero-shot performance, the Kendall Tau and Pearson
+correlations with human annotations are only as high as 0.32 and 0.45.
+Synthetic data approaches show mixed results and overall do not help close the
+gap by much for these languages. This indicates that there is still a long way
+to go for low-resource evaluation.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Spontaneous Speech-Based Suicide Risk Detection Using Whisper and Large
+  Language Models 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.03882v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.03882v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Ziyun Cui, Chang Lei, Wen Wu, Yinan Duan, Diyang Qu, Ji Wu, Runsen Chen, Chao Zhang
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  The early detection of suicide risk is important since it enables the
+intervention to prevent potential suicide attempts. This paper studies the
+automatic detection of suicide risk based on spontaneous speech from
+adolescents, and collects a Mandarin dataset with 15 hours of suicide speech
+from more than a thousand adolescents aged from ten to eighteen for our
+experiments. To leverage the diverse acoustic and linguistic features embedded
+in spontaneous speech, both the Whisper speech model and textual large language
+models (LLMs) are used for suicide risk detection. Both all-parameter
+finetuning and parameter-efficient finetuning approaches are used to adapt the
+pre-trained models for suicide risk detection, and multiple audio-text fusion
+approaches are evaluated to combine the representations of Whisper and the LLM.
+The proposed system achieves a detection accuracy of 0.807 and an F1-score of
+0.846 on the test set with 119 subjects, indicating promising potential for
+real suicide risk detection applications.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted by Interspeech 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Evaluating the IWSLT2023 Speech Translation Tasks: Human Annotations,
+  Automatic Metrics, and Segmentation <span class="chip">LREC</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.03881v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.03881v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Matthias Sperber, Ondřej Bojar, Barry Haddow, Dávid Javorský, Xutai Ma, Matteo Negri, Jan Niehues, Peter Polák, Elizabeth Salesky, Katsuhito Sudoh, Marco Turchi
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Human evaluation is a critical component in machine translation system
+development and has received much attention in text translation research.
+However, little prior work exists on the topic of human evaluation for speech
+translation, which adds additional challenges such as noisy data and
+segmentation mismatches. We take first steps to fill this gap by conducting a
+comprehensive human evaluation of the results of several shared tasks from the
+last International Workshop on Spoken Language Translation (IWSLT 2023). We
+propose an effective evaluation strategy based on automatic resegmentation and
+direct assessment with segment context. Our analysis revealed that: 1) the
+proposed evaluation strategy is robust and scores well-correlated with other
+types of human judgements; 2) automatic metrics are usually, but not always,
+well-correlated with direct assessment scores; and 3) COMET as a slightly
+stronger automatic metric than chrF, despite the segmentation noise introduced
+by the resegmentation step systems. We release the collected human-annotated
+data in order to encourage further investigation.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>LREC-COLING2024 publication (with corrections for Table 3)</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Decoder-only Streaming <span class="highlight-title">Transformer</span> for Simultaneous Translation <span class="chip">ACL 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.03878v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.03878v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Shoutao Guo, Shaolei Zhang, Yang Feng
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Simultaneous Machine Translation (SiMT) generates translation while reading
+source tokens, essentially producing the target prefix based on the source
+prefix. To achieve good performance, it leverages the relationship between
+source and target prefixes to exact a policy to guide the generation of
+translations. Although existing SiMT methods primarily focus on the
+Encoder-Decoder architecture, we explore the potential of Decoder-only
+architecture, owing to its superior performance in various tasks and its
+inherent compatibility with SiMT. However, directly applying the Decoder-only
+architecture to SiMT poses challenges in terms of training and inference. To
+alleviate the above problems, we propose the first Decoder-only SiMT model,
+named Decoder-only Streaming Transformer (DST). Specifically, DST separately
+encodes the positions of the source and target prefixes, ensuring that the
+position of the target prefix remains unaffected by the expansion of the source
+prefix. Furthermore, we propose a Streaming Self-Attention (SSA) mechanism
+tailored for the Decoder-only architecture. It is capable of obtaining
+translation policy by assessing the sufficiency of input source information and
+integrating with the soft-attention mechanism to generate translations.
+Experiments demonstrate that our approach achieves state-of-the-art performance
+on three translation tasks.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted to ACL 2024. 14 pages, 10 Tables, 5 Figures</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ BLSP-Emo: Towards Empathetic Large Speech-Language Models 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.03872v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.03872v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Chen Wang, Minpeng Liao, Zhongqiang Huang, Junhong Wu, Chengqing Zong, Jiajun Zhang
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  The recent release of GPT-4o showcased the potential of end-to-end multimodal
+models, not just in terms of low latency but also in their ability to
+understand and generate expressive speech with rich emotions. While the details
+are unknown to the open research community, it likely involves significant
+amounts of curated data and compute, neither of which is readily accessible. In
+this paper, we present BLSP-Emo (Bootstrapped Language-Speech Pretraining with
+Emotion support), a novel approach to developing an end-to-end speech-language
+model capable of understanding both semantics and emotions in speech and
+generate empathetic responses. BLSP-Emo utilizes existing speech recognition
+(ASR) and speech emotion recognition (SER) datasets through a two-stage
+process. The first stage focuses on semantic alignment, following recent work
+on pretraining speech-language models using ASR data. The second stage performs
+emotion alignment with the pretrained speech-language model on an emotion-aware
+continuation task constructed from SER data. Our experiments demonstrate that
+the BLSP-Emo model excels in comprehending speech and delivering empathetic
+responses, both in instruction-following tasks and conversations.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Recovering document annotations for sentence-level bitext <span class="chip">ACL 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.03869v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.03869v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Rachel Wicks, Matt Post, Philipp Koehn
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Data availability limits the scope of any given task. In machine translation,
+historical models were incapable of handling longer contexts, so the lack of
+document-level datasets was less noticeable. Now, despite the emergence of
+long-sequence methods, we remain within a sentence-level paradigm and without
+data to adequately approach context-aware machine translation. Most large-scale
+datasets have been processed through a pipeline that discards document-level
+metadata. In this work, we reconstruct document-level information for three
+(ParaCrawl, News Commentary, and Europarl) large datasets in German, French,
+Spanish, Italian, Polish, and Portuguese (paired with English). We then
+introduce a document-level filtering technique as an alternative to traditional
+bitext filtering. We present this filtering with analysis to show that this
+method prefers context-consistent translations rather than those that may have
+been sentence-level machine translated. Last we train models on these longer
+contexts and demonstrate improvement in document-level translation without
+degradation of sentence-level translation. We release our dataset, ParaDocs,
+and resulting models as a resource to the community.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>ACL 2024 Findings</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ MuJo: Multimodal Joint Feature Space Learning for Human Activity
+  Recognition 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.03857v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.03857v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Stefan Gerd Fritsch, Cennet Oguz, Vitor Fortes Rey, Lala Ray, Maximilian Kiefer-Emmanouilidis, Paul Lukowicz
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Human Activity Recognition is a longstanding problem in AI with applications
+in a broad range of areas: from healthcare, sports and fitness, security, and
+human computer interaction to robotics. The performance of HAR in real-world
+settings is strongly dependent on the type and quality of the input signal that
+can be acquired. Given an unobstructed, high-quality camera view of a scene,
+computer vision systems, in particular in conjunction with foundational models
+(e.g., CLIP), can today fairly reliably distinguish complex activities. On the
+other hand, recognition using modalities such as wearable sensors (which are
+often more broadly available, e.g, in mobile phones and smartwatches) is a more
+difficult problem, as the signals often contain less information and labeled
+training data is more difficult to acquire. In this work, we show how we can
+improve HAR performance across different modalities using multimodal
+contrastive pretraining. Our approach MuJo (Multimodal Joint Feature Space
+Learning), learns a multimodal joint feature space with video, language, pose,
+and IMU sensor data. The proposed approach combines contrastive and multitask
+learning methods and analyzes different multitasking strategies for learning a
+compact shared representation. A large dataset with parallel video, language,
+pose, and sensor data points is also introduced to support the research, along
+with an analysis of the robustness of the multimodal joint space for
+modal-incomplete and low-resource data. On the MM-Fit dataset, our model
+achieves an impressive Macro F1-Score of up to 0.992 with only 2% of the train
+data and 0.999 when using all available training data for classification tasks.
+Moreover, in the scenario where the MM-Fit dataset is unseen, we demonstrate a
+generalization performance of up to 0.638.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Performance of large language models in numerical vs. semantic medical
+  knowledge: Benchmarking on evidence-based Q&As 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.03855v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.03855v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Eden Avnat, Michal Levy, Daniel Herstain, Elia Yanko, Daniel Ben Joya, Michal Tzuchman Katz, Dafna Eshel, Sahar Laros, Yael Dagan, Shahar Barami, Joseph Mermelstein, Shahar Ovadia, Noam Shomron, Varda Shalev, Raja-Elie E. Abdulnour
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Clinical problem-solving requires processing of semantic medical knowledge
+such as illness scripts and numerical medical knowledge of diagnostic tests for
+evidence-based decision-making. As large language models (LLMs) show promising
+results in many aspects of language-based clinical practice, their ability to
+generate non-language evidence-based answers to clinical questions is
+inherently limited by tokenization. Therefore, we evaluated LLMs' performance
+on two question types: numeric (correlating findings) and semantic
+(differentiating entities) while examining differences within and between LLMs
+in medical aspects and comparing their performance to humans. To generate
+straightforward multi-choice questions and answers (QAs) based on
+evidence-based medicine (EBM), we used a comprehensive medical knowledge graph
+(encompassed data from more than 50,00 peer-reviewed articles) and created the
+"EBMQA". EBMQA contains 105,000 QAs labeled with medical and non-medical topics
+and classified into numerical or semantic questions. We benchmarked this
+dataset using more than 24,500 QAs on two state-of-the-art LLMs: Chat-GPT4 and
+Claude3-Opus. We evaluated the LLMs accuracy on semantic and numerical question
+types and according to sub-labeled topics. For validation, six medical experts
+were tested on 100 numerical EBMQA questions. We found that both LLMs excelled
+more in semantic than numerical QAs, with Claude3 surpassing GPT4 in numerical
+QAs. However, both LLMs showed inter and intra gaps in different medical
+aspects and remained inferior to humans. Thus, their medical advice should be
+addressed carefully.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Speculative Decoding via Early-exiting for Faster LLM Inference with
+  Thompson Sampling Control Mechanism <span class="chip">ACL 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.03853v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.03853v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Jiahao Liu, Qifan Wang, Jingang Wang, Xunliang Cai
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  The recent advancements in large language models (LLMs) have been
+extraordinary, yet the escalating inference costs associated with them present
+challenges in real-world applications. To address these challenges, we propose
+a novel approach called Early-exiting Speculative Decoding (EESD) with lossless
+acceleration. Specifically, EESD utilizes a segment of the LLM to generate
+draft tokens, incorporating Early-exiting structures after the first N layers.
+To enhance the quality of draft tokens, a self-distillation method is
+integrated. This early-exiting design not only reduces deployment and training
+costs but also significantly accelerates the token generation speed. Moreover,
+we introduce a novel sampling mechanism that leverages Thompson Sampling to
+regulate the generation processes, automatically determining the quantity of
+draft tokens in each round. The original LLM is then employed to validate these
+draft tokens through a single forward pass, and thus guarantees that the final
+output text maintains a distribution consistent with vanilla auto-regressive
+decoding. The experimental results on both 13B and 70B models demonstrate that
+our approach decodes tokens at a markedly accelerated rate compared to prior
+methods, showing the effectiveness of our approach.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted by ACL 2024 (Findings)</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Lean Workbook: A large-scale Lean problem set formalized from natural
+  language math problems 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.03847v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.03847v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Huaiyuan Ying, Zijian Wu, Yihan Geng, Jiayu Wang, Dahua Lin, Kai Chen
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Large language models have demonstrated impressive capabilities across
+various natural language processing tasks, especially in solving mathematical
+problems. However, large language models are not good at math theorem proving
+using formal languages like Lean. A significant challenge in this area is the
+scarcity of training data available in these formal languages. To address this
+issue, we propose a novel pipeline that iteratively generates and filters
+synthetic data to translate natural language mathematical problems into Lean 4
+statements, and vice versa. Our results indicate that the synthetic data
+pipeline can provide useful training data and improve the performance of LLMs
+in translating and understanding complex mathematical problems and proofs. Our
+final dataset contains about 57K formal-informal question pairs along with
+searched proof from the math contest forum and 21 new IMO questions. We
+open-source our code at https://github.com/InternLM/InternLM-Math and our data
+at https://huggingface.co/datasets/InternLM/Lean-Workbook.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Chaos with Keywords: Exposing Large Language Models Sycophancy to
+  Misleading Keywords and Evaluating Defense Strategies <span class="chip">ACL 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.03827v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.03827v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Aswin RRV, Nemika Tyagi, Md Nayem Uddin, Neeraj Varshney, Chitta Baral
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  This study explores the sycophantic tendencies of Large Language Models
+(LLMs), where these models tend to provide answers that match what users want
+to hear, even if they are not entirely correct. The motivation behind this
+exploration stems from the common behavior observed in individuals searching
+the internet for facts with partial or misleading knowledge. Similar to using
+web search engines, users may recall fragments of misleading keywords and
+submit them to an LLM, hoping for a comprehensive response. Our empirical
+analysis of several LLMs shows the potential danger of these models amplifying
+misinformation when presented with misleading keywords. Additionally, we
+thoroughly assess four existing hallucination mitigation strategies to reduce
+LLMs sycophantic behavior. Our experiments demonstrate the effectiveness of
+these strategies for generating factually correct statements. Furthermore, our
+analyses delve into knowledge-probing experiments on factual keywords and
+different categories of sycophancy mitigation.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>To be published in Findings of ACL 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ ReST-MCTS*: LLM Self-Training via Process Reward Guided Tree Search 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.03816v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.03816v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Dan Zhang, Sining Zhoubian, Yisong Yue, Yuxiao Dong, Jie Tang
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Recent methodologies in LLM self-training mostly rely on LLM generating
+responses and filtering those with correct output answers as training data.
+This approach often yields a low-quality fine-tuning training set (e.g.,
+incorrect plans or intermediate reasoning). In this paper, we develop a
+reinforced self-training approach, called ReST-MCTS*, based on integrating
+process reward guidance with tree search MCTS* for collecting higher-quality
+reasoning traces as well as per-step value to train policy and reward models.
+ReST-MCTS* circumvents the per-step manual annotation typically used to train
+process rewards by tree-search-based reinforcement learning: Given oracle final
+correct answers, ReST-MCTS* is able to infer the correct process rewards by
+estimating the probability this step can help lead to the correct answer. These
+inferred rewards serve dual purposes: they act as value targets for further
+refining the process reward model and also facilitate the selection of
+high-quality traces for policy model self-training. We first show that the
+tree-search policy in ReST-MCTS* achieves higher accuracy compared with prior
+LLM reasoning baselines such as Best-of-N and Tree-of-Thought, within the same
+search budget. We then show that by using traces searched by this tree-search
+policy as training data, we can continuously enhance the three language models
+for multiple iterations, and outperform other self-training algorithms such as
+ReST$^\text{EM}$ and Self-Rewarding LM.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>29 pages</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Improving Zero-Shot Chinese-English Code-Switching ASR with kNN-CTC and
+  Gated Monolingual Datastores 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.03814v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.03814v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Jiaming Zhou, Shiwan Zhao, Hui Wang, Tian-Hao Zhang, Haoqin Sun, Xuechen Wang, Yong Qin
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  The kNN-CTC model has proven to be effective for monolingual automatic speech
+recognition (ASR). However, its direct application to multilingual scenarios
+like code-switching, presents challenges. Although there is potential for
+performance improvement, a kNN-CTC model utilizing a single bilingual datastore
+can inadvertently introduce undesirable noise from the alternative language. To
+address this, we propose a novel kNN-CTC-based code-switching ASR (CS-ASR)
+framework that employs dual monolingual datastores and a gated datastore
+selection mechanism to reduce noise interference. Our method selects the
+appropriate datastore for decoding each frame, ensuring the injection of
+language-specific information into the ASR process. We apply this framework to
+cutting-edge CTC-based models, developing an advanced CS-ASR system. Extensive
+experiments demonstrate the remarkable effectiveness of our gated datastore
+mechanism in enhancing the performance of zero-shot Chinese-English CS-ASR.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Tool-Planner: Dynamic Solution Tree Planning for Large Language Model
+  with Tool Clustering 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.03807v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.03807v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Yanming Liu, Xinyue Peng, Yuwei Zhang, Jiannan Cao, Xuhong Zhang, Sheng Cheng, Xun Wang, Jianwei Yin, Tianyu Du
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Large language models (LLMs) have demonstrated exceptional reasoning
+capabilities, enabling them to solve various complex problems. Recently, this
+ability has been applied to the paradigm of tool learning. Tool learning
+involves providing examples of tool usage and their corresponding functions,
+allowing LLMs to formulate plans and demonstrate the process of invoking and
+executing each tool. LLMs can address tasks that they cannot complete
+independently, thereby enhancing their potential across different tasks.
+However, this approach faces two key challenges. First, redundant error
+correction leads to unstable planning and long execution time. Additionally,
+designing a correct plan among multiple tools is also a challenge in tool
+learning. To address these issues, we propose Tool-Planner, a task-processing
+framework based on toolkits. Tool-Planner groups tools based on the API
+functions with the same function into a toolkit and allows LLMs to implement
+planning across the various toolkits. When a tool error occurs, the language
+model can reselect and adjust tools based on the toolkit. Experiments show that
+our approach demonstrates a high pass and win rate across different datasets
+and optimizes the planning scheme for tool learning in models such as GPT-4 and
+Claude 3, showcasing the potential of our method.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>46pages first version</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Light-PEFT: Lightening Parameter-Efficient Fine-Tuning via Early Pruning <span class="chip">ACL 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.03792v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.03792v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Naibin Gu, Peng Fu, Xiyu Liu, Bowen Shen, Zheng Lin, Weiping Wang
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Parameter-efficient fine-tuning (PEFT) has emerged as the predominant
+technique for fine-tuning in the era of large language models. However,
+existing PEFT methods still have inadequate training efficiency. Firstly, the
+utilization of large-scale foundation models during the training process is
+excessively redundant for certain fine-tuning tasks. Secondly, as the model
+size increases, the growth in trainable parameters of empirically added PEFT
+modules becomes non-negligible and redundant, leading to inefficiency. To
+achieve task-specific efficient fine-tuning, we propose the Light-PEFT
+framework, which includes two methods: Masked Early Pruning of the Foundation
+Model and Multi-Granularity Early Pruning of PEFT. The Light-PEFT framework
+allows for the simultaneous estimation of redundant parameters in both the
+foundation model and PEFT modules during the early stage of training. These
+parameters can then be pruned for more efficient fine-tuning. We validate our
+approach on GLUE, SuperGLUE, QA tasks, and various models. With Light-PEFT,
+parameters of the foundation model can be pruned by up to over 40%, while still
+controlling trainable parameters to be only 25% of the original PEFT method.
+Compared to utilizing the PEFT method directly, Light-PEFT achieves training
+and inference speedup, reduces memory usage, and maintains comparable
+performance and the plug-and-play feature of PEFT.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Findings of ACL 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ End-to-End Trainable Soft Retriever for Low-resource Relation Extraction 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.03790v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.03790v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Kohei Makino, Makoto Miwa, Yutaka Sasaki
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  This study addresses a crucial challenge in instance-based relation
+extraction using text generation models: end-to-end training in target relation
+extraction task is not applicable to retrievers due to the non-differentiable
+nature of instance selection. We propose a novel End-to-end TRAinable Soft
+K-nearest neighbor retriever (ETRASK) by the neural prompting method that
+utilizes a soft, differentiable selection of the $k$ nearest instances. This
+approach enables the end-to-end training of retrievers in target tasks. On the
+TACRED benchmark dataset with a low-resource setting where the training data
+was reduced to 10\%, our method achieved a state-of-the-art F1 score of 71.5\%.
+Moreover, ETRASK consistently improved the baseline model by adding instances
+for all settings. These results highlight the efficacy of our approach in
+enhancing relation extraction performance, especially in resource-constrained
+environments. Our findings offer a promising direction for future research with
+extraction and the broader application of text generation in natural language
+processing.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>preprint</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ XL-HeadTags: Leveraging Multimodal Retrieval Augmentation for the
+  Multilingual Generation of News Headlines and Tags <span class="chip">ACL 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.03776v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.03776v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Faisal Tareque Shohan, Mir Tafseer Nayeem, Samsul Islam, Abu Ubaida Akash, Shafiq Joty
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Millions of news articles published online daily can overwhelm readers.
+Headlines and entity (topic) tags are essential for guiding readers to decide
+if the content is worth their time. While headline generation has been
+extensively studied, tag generation remains largely unexplored, yet it offers
+readers better access to topics of interest. The need for conciseness in
+capturing readers' attention necessitates improved content selection strategies
+for identifying salient and relevant segments within lengthy articles, thereby
+guiding language models effectively. To address this, we propose to leverage
+auxiliary information such as images and captions embedded in the articles to
+retrieve relevant sentences and utilize instruction tuning with variations to
+generate both headlines and tags for news articles in a multilingual context.
+To make use of the auxiliary information, we have compiled a dataset named
+XL-HeadTags, which includes 20 languages across 6 diverse language families.
+Through extensive evaluation, we demonstrate the effectiveness of our
+plug-and-play multimodal-multilingual retrievers for both tasks. Additionally,
+we have developed a suite of tools for processing and evaluating multilingual
+texts, significantly contributing to the research community by enabling more
+accurate and efficient analysis across languages.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>ACL 2024 camera ready</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ TRAP: Targeted Random Adversarial <span class="highlight-title">Prompt</span> Honeypot for Black-Box
+  Identification <span class="chip">ACL 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2402.12991v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2402.12991v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Martin Gubri, Dennis Ulmer, Hwaran Lee, Sangdoo Yun, Seong Joon Oh
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Large Language Model (LLM) services and models often come with legal rules on
+who can use them and how they must use them. Assessing the compliance of the
+released LLMs is crucial, as these rules protect the interests of the LLM
+contributor and prevent misuse. In this context, we describe the novel
+fingerprinting problem of Black-box Identity Verification (BBIV). The goal is
+to determine whether a third-party application uses a certain LLM through its
+chat function. We propose a method called Targeted Random Adversarial Prompt
+(TRAP) that identifies the specific LLM in use. We repurpose adversarial
+suffixes, originally proposed for jailbreaking, to get a pre-defined answer
+from the target LLM, while other models give random answers. TRAP detects the
+target LLMs with over 95% true positive rate at under 0.2% false positive rate
+even after a single interaction. TRAP remains effective even if the LLM has
+minor changes that do not significantly alter the original function.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted at ACL 2024 (findings)</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Don't Rank, Combine! Combining Machine Translation Hypotheses Using
+  Quality Estimation <span class="chip">ACL 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2401.06688v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2401.06688v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Giorgos Vernikos, Andrei Popescu-Belis
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Neural machine translation systems estimate probabilities of target sentences
+given source sentences, yet these estimates may not align with human
+preferences. This work introduces QE-fusion, a method that synthesizes
+translations using a quality estimation metric (QE), which correlates better
+with human judgments. QE-fusion leverages a pool of candidates sampled from a
+model, combining spans from different candidates using a QE metric such as
+CometKiwi. We compare QE-fusion against beam search and recent reranking
+techniques, such as Minimum Bayes Risk decoding or QE-reranking. Our method
+consistently improves translation quality in terms of COMET and BLEURT scores
+when applied to large language models (LLMs) used for translation (PolyLM,
+XGLM, Llama2, Mistral, ALMA, and Tower) and to multilingual translation models
+(NLLB), over five language pairs. Notably, QE-fusion exhibits larger
+improvements for LLMs due to their ability to generate diverse outputs. We
+demonstrate that our approach generates novel translations in over half of the
+cases and consistently outperforms other methods across varying numbers of
+candidates (5-200). Furthermore, we empirically establish that QE-fusion scales
+linearly with the number of candidates in the pool.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted at ACL 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ LiveCodeBench: Holistic and Contamination Free Evaluation of Large
+  Language Models for Code 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2403.07974v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2403.07974v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Naman Jain, King Han, Alex Gu, Wen-Ding Li, Fanjia Yan, Tianjun Zhang, Sida Wang, Armando Solar-Lezama, Koushik Sen, Ion Stoica
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Large Language Models (LLMs) applied to code-related applications have
+emerged as a prominent field, attracting significant interest from both
+academia and industry. However, as new and improved LLMs are developed,
+existing evaluation benchmarks (e.g., HumanEval, MBPP) are no longer sufficient
+for assessing their capabilities. In this work, we propose LiveCodeBench, a
+comprehensive and contamination-free evaluation of LLMs for code, which
+continuously collects new problems over time from contests across three
+competition platforms, namely LeetCode, AtCoder, and CodeForces. Notably, our
+benchmark also focuses on a broader range of code related capabilities, such as
+self-repair, code execution, and test output prediction, beyond just code
+generation. Currently, LiveCodeBench hosts four hundred high-quality coding
+problems that were published between May 2023 and May 2024. We have evaluated
+18 base LLMs and 34 instruction-tuned LLMs on LiveCodeBench. We present
+empirical findings on contamination, holistic performance comparisons,
+potential overfitting in existing benchmarks as well as individual model
+comparisons. We will release all prompts and model completions for further
+community analysis, along with a general toolkit for adding new scenarios and
+model
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Website - https://livecodebench.github.io/</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ ReGAL: Refactoring Programs to Discover Generalizable Abstractions <span class="chip">ICML 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2401.16467v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2401.16467v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Elias Stengel-Eskin, Archiki Prasad, Mohit Bansal
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  While large language models (LLMs) are increasingly being used for program
+synthesis, they lack the global view needed to develop useful abstractions;
+they generally predict programs one at a time, often repeating the same
+functionality. Generating redundant code from scratch is both inefficient and
+error-prone. To address this, we propose Refactoring for Generalizable
+Abstraction Learning (ReGAL), a gradient-free method for learning a library of
+reusable functions via code refactorization, i.e., restructuring code without
+changing its execution output. ReGAL learns from a small set of existing
+programs, iteratively verifying and refining its abstractions via execution. We
+find that the shared function libraries discovered by ReGAL make programs
+easier to predict across diverse domains. On five datasets -- LOGO graphics
+generation, Date reasoning, TextCraft (a Minecraft-based text-game) MATH, and
+TabMWP -- both open-source and proprietary LLMs improve in accuracy when
+predicting programs with ReGAL functions. For CodeLlama-13B, ReGAL results in
+absolute accuracy increases of 11.5% on LOGO, 26.1% on date understanding, and
+8.1% on TextCraft, outperforming GPT-3.5 in two of three domains. Our analysis
+reveals ReGAL's abstractions encapsulate frequently-used subroutines as well as
+environment dynamics.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>ICML 2024 Camera-Ready; First two authors contributed equally; Code:
+  https://github.com/esteng/regal_program_learning</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ RECAP: Retrieval-Augmented Audio Captioning <span class="chip">ICASSP 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2309.09836v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2309.09836v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Sreyan Ghosh, Sonal Kumar, Chandra Kiran Reddy Evuru, Ramani Duraiswami, Dinesh Manocha
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  We present RECAP (REtrieval-Augmented Audio CAPtioning), a novel and
+effective audio captioning system that generates captions conditioned on an
+input audio and other captions similar to the audio retrieved from a datastore.
+Additionally, our proposed method can transfer to any domain without the need
+for any additional fine-tuning. To generate a caption for an audio sample, we
+leverage an audio-text model CLAP to retrieve captions similar to it from a
+replaceable datastore, which are then used to construct a prompt. Next, we feed
+this prompt to a GPT-2 decoder and introduce cross-attention layers between the
+CLAP encoder and GPT-2 to condition the audio for caption generation.
+Experiments on two benchmark datasets, Clotho and AudioCaps, show that RECAP
+achieves competitive performance in in-domain settings and significant
+improvements in out-of-domain settings. Additionally, due to its capability to
+exploit a large text-captions-only datastore in a training-free fashion, RECAP
+shows unique capabilities of captioning novel audio events never seen during
+training and compositional audios with multiple events. To promote research in
+this space, we also release 150,000+ new weakly labeled captions for AudioSet,
+AudioCaps, and Clotho.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>ICASSP 2024. Code and data: https://github.com/Sreyan88/RECAP</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Language Models Don't Learn the Physical Manifestation of Language <span class="chip">ACL 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2402.11349v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2402.11349v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Bruce W. Lee, JaeHyuk Lim
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  We argue that language-only models don't learn the physical manifestation of
+language. We present an empirical investigation of visual-auditory properties
+of language through a series of tasks, termed H-Test. These tasks highlight a
+fundamental gap between human linguistic understanding and the sensory-deprived
+linguistic understanding of LLMs. In support of our hypothesis, 1. deliberate
+reasoning (Chain-of-Thought), 2. few-shot examples, or 3. stronger LLM from the
+same model family (LLaMA 2 13B -> LLaMA 2 70B) has no significant effect on
+H-Test performance.
+  We bring in the philosophical case of Mary, who learns about the world in a
+sensory-deprived environment as a useful conceptual framework to understand how
+language-only models learn about the world (Jackson, 1986). Our experiments
+show that some of the strongest proprietary LLMs stay near random chance
+baseline accuracy of 50%, highlighting the limitations of linguistic knowledge
+acquired in the absence of sensory experience. Our code and data are available
+at <github.com/brucewlee/h-test>.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>ACL 2024 Main</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Reflect-RL: Two-Player Online RL Fine-Tuning for LMs <span class="chip">ACL 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2402.12621v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2402.12621v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Runlong Zhou, Simon S. Du, Beibin Li
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  As language models (LMs) demonstrate their capabilities in various fields,
+their application to tasks requiring multi-round interactions has become
+increasingly popular. These tasks usually have complex dynamics, so supervised
+fine-tuning (SFT) on a limited offline dataset does not yield good performance.
+However, only a few works attempted to directly train the LMs within
+interactive decision-making environments. We aim to create an effective
+approach to fine-tune LMs with online reinforcement learning (RL) in these
+environments. We propose Reflect-RL, a two-player system to fine-tune an LM
+using SFT and online RL, where a frozen reflection model (player) assists the
+policy model (player). To generate data for the warm-up SFT stage, we use
+negative example generation to enhance the error-correction ability of the
+reflection model. Furthermore, we designed single-prompt action enumeration and
+applied curriculum learning to allow the policy model to learn more
+efficiently. Empirically, we verify that Reflect-RL outperforms SFT and online
+RL without reflection. Testing results indicate GPT-2 XL 1.56B fine-tuned with
+Reflect-RL outperforms larger open-source LMs, such as Mistral 7B. The
+benchmarks, dataset, and code involved in this work are publicly available:
+https://github.com/zhourunlong/Reflect-RL.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>ACL 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ FanOutQA: A Multi-Hop, Multi-Document Question Answering Benchmark for
+  Large Language Models <span class="chip">ACL 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2402.14116v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2402.14116v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Andrew Zhu, Alyssa Hwang, Liam Dugan, Chris Callison-Burch
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  One type of question that is commonly found in day-to-day scenarios is
+``fan-out'' questions, complex multi-hop, multi-document reasoning questions
+that require finding information about a large number of entities. However,
+there exist few resources to evaluate this type of question-answering
+capability among large language models. To evaluate complex reasoning in LLMs
+more fully, we present FanOutQA, a high-quality dataset of fan-out
+question-answer pairs and human-annotated decompositions with English Wikipedia
+as the knowledge base. We formulate three benchmark settings across our dataset
+and benchmark 7 LLMs, including GPT-4, LLaMA 2, Claude-2.1, and Mixtral-8x7B,
+finding that contemporary models still have room to improve reasoning over
+inter-document dependencies in a long context. We provide our dataset and
+open-source tools to run models to encourage evaluation at https://fanoutqa.com
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>18 pages, 2 figures. ACL 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Sparsity-Accelerated Training for Large Language Models <span class="chip">ACL 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.01392v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.01392v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Da Ma, Lu Chen, Pengyu Wang, Hongshen Xu, Hanqi Li, Liangtai Sun, Su Zhu, Shuai Fan, Kai Yu
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Large language models (LLMs) have demonstrated proficiency across various
+natural language processing (NLP) tasks but often require additional training,
+such as continual pre-training and supervised fine-tuning. However, the costs
+associated with this, primarily due to their large parameter count, remain
+high. This paper proposes leveraging \emph{sparsity} in pre-trained LLMs to
+expedite this training process. By observing sparsity in activated neurons
+during forward iterations, we identify the potential for computational
+speed-ups by excluding inactive neurons. We address associated challenges by
+extending existing neuron importance evaluation metrics and introducing a
+ladder omission rate scheduler. Our experiments on Llama-2 demonstrate that
+Sparsity-Accelerated Training (SAT) achieves comparable or superior performance
+to standard training while significantly accelerating the process.
+Specifically, SAT achieves a $45\%$ throughput improvement in continual
+pre-training and saves $38\%$ training time in supervised fine-tuning in
+practice. It offers a simple, hardware-agnostic, and easily deployable
+framework for additional LLM training. Our code is available at
+https://github.com/OpenDFM/SAT.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted to ACL 2024 Findings</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Bridging the Empirical-Theoretical Gap in Neural Network Formal Language
+  Learning Using Minimum Description Length 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2402.10013v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2402.10013v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Nur Lan, Emmanuel Chemla, Roni Katzir
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Neural networks offer good approximation to many tasks but consistently fail
+to reach perfect generalization, even when theoretical work shows that such
+perfect solutions can be expressed by certain architectures. Using the task of
+formal language learning, we focus on one simple formal language and show that
+the theoretically correct solution is in fact not an optimum of commonly used
+objectives -- even with regularization techniques that according to common
+wisdom should lead to simple weights and good generalization (L1, L2) or other
+meta-heuristics (early-stopping, dropout). On the other hand, replacing
+standard targets with the Minimum Description Length objective (MDL) results in
+the correct solution being an optimum.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>9 pages, 5 figures, 3 appendix pages</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ The Revolution of Multimodal Large Language Models: A <span class="highlight-title">Survey</span> <span class="chip">ACL 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2402.12451v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2402.12451v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Davide Caffagni, Federico Cocchi, Luca Barsellotti, Nicholas Moratelli, Sara Sarto, Lorenzo Baraldi, Lorenzo Baraldi, Marcella Cornia, Rita Cucchiara
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Connecting text and visual modalities plays an essential role in generative
+intelligence. For this reason, inspired by the success of large language
+models, significant research efforts are being devoted to the development of
+Multimodal Large Language Models (MLLMs). These models can seamlessly integrate
+visual and textual modalities, while providing a dialogue-based interface and
+instruction-following capabilities. In this paper, we provide a comprehensive
+review of recent visual-based MLLMs, analyzing their architectural choices,
+multimodal alignment strategies, and training techniques. We also conduct a
+detailed analysis of these models across a wide range of tasks, including
+visual grounding, image generation and editing, visual understanding, and
+domain-specific applications. Additionally, we compile and describe training
+datasets and evaluation benchmarks, conducting comparisons among existing
+models in terms of performance and computational requirements. Overall, this
+survey offers a comprehensive overview of the current state of the art, laying
+the groundwork for future MLLMs.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>ACL 2024 (Findings)</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ A <span class="highlight-title">Survey</span> on Multilingual Large Language Models: Corpora, Alignment, and
+  Bias 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2404.00929v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2404.00929v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Yuemei Xu, Ling Hu, Jiayi Zhao, Zihan Qiu, Yuqi Ye, Hanwen Gu
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Based on the foundation of Large Language Models (LLMs), Multilingual Large
+Language Models (MLLMs) have been developed to address the challenges of
+multilingual natural language processing tasks, hoping to achieve knowledge
+transfer from high-resource to low-resource languages. However, significant
+limitations and challenges still exist, such as language imbalance,
+multilingual alignment, and inherent bias. In this paper, we aim to provide a
+comprehensive analysis of MLLMs, delving deeply into discussions surrounding
+these critical issues. First of all, we start by presenting an overview of
+MLLMs, covering their evolution, key techniques, and multilingual capacities.
+Secondly, we explore widely utilized multilingual corpora for MLLMs' training
+and multilingual datasets oriented for downstream tasks that are crucial for
+enhancing the cross-lingual capability of MLLMs. Thirdly, we survey the
+existing studies on multilingual representations and investigate whether the
+current MLLMs can learn a universal language representation. Fourthly, we
+discuss bias on MLLMs including its category and evaluation metrics, and
+summarize the existing debiasing techniques. Finally, we discuss existing
+challenges and point out promising research directions. By demonstrating these
+aspects, this paper aims to facilitate a deeper understanding of MLLMs and
+their potentiality in various domains.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ NormAd: A Benchmark for Measuring the Cultural Adaptability of Large
+  Language Models 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2404.12464v4">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2404.12464v4.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Abhinav Rao, Akhila Yerukola, Vishwa Shah, Katharina Reinecke, Maarten Sap
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  The integration of Large Language Models (LLMs) into various global cultures
+fundamentally presents a cultural challenge: LLMs must navigate interactions,
+respect social norms, and avoid transgressing cultural boundaries. However, it
+is still unclear if LLMs can adapt their outputs to diverse cultural norms. Our
+study focuses on this aspect. We introduce NormAd, a novel dataset, which
+includes 2.6k stories that represent social and cultural norms from 75
+countries, to assess the ability of LLMs to adapt to different granular levels
+of socio-cultural contexts such as the country of origin, its associated
+cultural values, and prevalent social norms. Our study reveals that LLMs
+struggle with cultural reasoning across all contextual granularities, showing
+stronger adaptability to English-centric cultures over those from the Global
+South. Even with explicit social norms, the top-performing model,
+Mistral-7b-Instruct, achieves only 81.8\% accuracy, lagging behind the 95.6\%
+achieved by humans. Evaluation on NormAd further reveals that LLMs struggle to
+adapt to stories involving gift-giving across cultures. Due to inherent
+agreement or sycophancy biases, LLMs find it considerably easier to assess the
+social acceptability of stories that adhere to cultural norms than those that
+deviate from them. Our benchmark measures the cultural adaptability (or lack
+thereof) of LLMs, emphasizing the potential to make these technologies more
+equitable and useful for global audiences. We release the NormAd dataset and
+its associated code on GitHub.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Preprint. In Review</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Does <span class="highlight-title">Pre-train</span>ed Language Model Actually Infer Unseen Links in Knowledge
+  Graph Completion? <span class="chip">NAACL 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2311.09109v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2311.09109v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Yusuke Sakai, Hidetaka Kamigaito, Katsuhiko Hayashi, Taro Watanabe
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Knowledge graphs (KGs) consist of links that describe relationships between
+entities. Due to the difficulty of manually enumerating all relationships
+between entities, automatically completing them is essential for KGs. Knowledge
+Graph Completion (KGC) is a task that infers unseen relationships between
+entities in a KG. Traditional embedding-based KGC methods, such as RESCAL,
+TransE, DistMult, ComplEx, RotatE, HAKE, HousE, etc., infer missing links using
+only the knowledge from training data. In contrast, the recent Pre-trained
+Language Model (PLM)-based KGC utilizes knowledge obtained during pre-training.
+Therefore, PLM-based KGC can estimate missing links between entities by reusing
+memorized knowledge from pre-training without inference. This approach is
+problematic because building KGC models aims to infer unseen links between
+entities. However, conventional evaluations in KGC do not consider inference
+and memorization abilities separately. Thus, a PLM-based KGC method, which
+achieves high performance in current KGC evaluations, may be ineffective in
+practical applications. To address this issue, we analyze whether PLM-based KGC
+methods make inferences or merely access memorized knowledge. For this purpose,
+we propose a method for constructing synthetic datasets specified in this
+analysis and conclude that PLMs acquire the inference abilities required for
+KGC through pre-training, even though the performance improvements mostly come
+from textual information of entities and relations.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted at NAACL 2024 main oral, 15 pages, 10 figures</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Integrating <span class="highlight-title">Pre-Train</span>ed Speech and Language Models for End-to-End Speech
+  Recognition <span class="chip">ACL 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2312.03668v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2312.03668v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Yukiya Hono, Koh Mitsuda, Tianyu Zhao, Kentaro Mitsui, Toshiaki Wakatsuki, Kei Sawada
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Advances in machine learning have made it possible to perform various text
+and speech processing tasks, such as automatic speech recognition (ASR), in an
+end-to-end (E2E) manner. E2E approaches utilizing pre-trained models are
+gaining attention for conserving training data and resources. However, most of
+their applications in ASR involve only one of either a pre-trained speech or a
+language model. This paper proposes integrating a pre-trained speech
+representation model and a large language model (LLM) for E2E ASR. The proposed
+model enables the optimization of the entire ASR process, including acoustic
+feature extraction and acoustic and language modeling, by combining pre-trained
+models with a bridge network and also enables the application of remarkable
+developments in LLM utilization, such as parameter-efficient domain adaptation
+and inference optimization. Experimental results demonstrate that the proposed
+model achieves a performance comparable to that of modern E2E ASR models by
+utilizing powerful pre-training models with the proposed integrated approach.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>17 pages, 4 figures, 9 tables, accepted for Findings of ACL 2024. The
+  model is available at https://huggingface.co/rinna/nue-asr</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Adam: Dense Retrieval Distillation with Adaptive Dark Examples 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2212.10192v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2212.10192v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Chongyang Tao, Chang Liu, Tao Shen, Can Xu, Xiubo Geng, Binxing Jiao, Daxin Jiang
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  To improve the performance of the dual-encoder retriever, one effective
+approach is knowledge distillation from the cross-encoder ranker. Existing
+works construct the candidate passages following the supervised learning
+setting where a query is paired with a positive passage and a batch of
+negatives. However, through empirical observation, we find that even the hard
+negatives from advanced methods are still too trivial for the teacher to
+distinguish, preventing the teacher from transferring abundant dark knowledge
+to the student through its soft label. To alleviate this issue, we propose
+ADAM, a knowledge distillation framework that can better transfer the dark
+knowledge held in the teacher with Adaptive Dark exAMples. Different from
+previous works that only rely on one positive and hard negatives as candidate
+passages, we create dark examples that all have moderate relevance to the query
+through mixing-up and masking in discrete space. Furthermore, as the quality of
+knowledge held in different training instances varies as measured by the
+teacher's confidence score, we propose a self-paced distillation strategy that
+adaptively concentrates on a subset of high-quality instances to conduct our
+dark-example-based knowledge distillation to help the student learn better. We
+conduct experiments on two widely-used benchmarks and verify the effectiveness
+of our method.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>13 pages, 3 figures</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Multi-modal Stance Detection: New <span class="highlight-title">Dataset</span>s and Model <span class="chip">ACL'24</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2402.14298v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2402.14298v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Bin Liang, Ang Li, Jingqian Zhao, Lin Gui, Min Yang, Yue Yu, Kam-Fai Wong, Ruifeng Xu
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Stance detection is a challenging task that aims to identify public opinion
+from social media platforms with respect to specific targets. Previous work on
+stance detection largely focused on pure texts. In this paper, we study
+multi-modal stance detection for tweets consisting of texts and images, which
+are prevalent in today's fast-growing social media platforms where people often
+post multi-modal messages. To this end, we create five new multi-modal stance
+detection datasets of different domains based on Twitter, in which each example
+consists of a text and an image. In addition, we propose a simple yet effective
+Targeted Multi-modal Prompt Tuning framework (TMPT), where target information
+is leveraged to learn multi-modal stance features from textual and visual
+modalities. Experimental results on our five benchmark datasets show that the
+proposed TMPT achieves state-of-the-art performance in multi-modal stance
+detection.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>ACL'24 Findings</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ PEMT: Multi-Task Correlation Guided Mixture-of-Experts Enables
+  Parameter-Efficient Transfer Learning <span class="chip">ACL 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2402.15082v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2402.15082v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Zhisheng Lin, Han Fu, Chenghao Liu, Zhuo Li, Jianling Sun
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Parameter-efficient fine-tuning (PEFT) has emerged as an effective method for
+adapting pre-trained language models to various tasks efficiently. Recently,
+there has been a growing interest in transferring knowledge from one or
+multiple tasks to the downstream target task to achieve performance
+improvements. However, current approaches typically either train adapters on
+individual tasks or distill shared knowledge from source tasks, failing to
+fully exploit task-specific knowledge and the correlation between source and
+target tasks. To overcome these limitations, we propose PEMT, a novel
+parameter-efficient fine-tuning framework based on multi-task transfer
+learning. PEMT extends the mixture-of-experts (MoE) framework to capture the
+transferable knowledge as a weighted combination of adapters trained on source
+tasks. These weights are determined by a gated unit, measuring the correlation
+between the target and each source task using task description prompt vectors.
+To fully exploit the task-specific knowledge, we also propose the Task Sparsity
+Loss to improve the sparsity of the gated unit. We conduct experiments on a
+broad range of tasks over 17 datasets. The experimental results demonstrate our
+PEMT yields stable improvements over full fine-tuning, and state-of-the-art
+PEFT and knowledge transferring methods on various tasks. The results highlight
+the effectiveness of our method which is capable of sufficiently exploiting the
+knowledge and correlation features across multiple tasks.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted to Findings of the ACL 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Breaking through the learning plateaus of in-context learning in
+  <span class="highlight-title">Transformer</span> 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2309.06054v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2309.06054v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Jingwen Fu, Tao Yang, Yuwang Wang, Yan Lu, Nanning Zheng
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  In-context learning, i.e., learning from context examples, is an impressive
+ability of Transformer. Training Transformers to possess this in-context
+learning skill is computationally intensive due to the occurrence of learning
+plateaus, which are periods within the training process where there is minimal
+or no enhancement in the model's in-context learning capability. To study the
+mechanism behind the learning plateaus, we conceptually seperate a component
+within the model's internal representation that is exclusively affected by the
+model's weights. We call this the "weights component", and the remainder is
+identified as the "context component". By conducting meticulous and controlled
+experiments on synthetic tasks, we note that the persistence of learning
+plateaus correlates with compromised functionality of the weights component.
+Recognizing the impaired performance of the weights component as a fundamental
+behavior drives learning plateaus, we have developed three strategies to
+expedite the learning of Transformers. The effectiveness of these strategies is
+further confirmed in natural language processing tasks. In conclusion, our
+research demonstrates the feasibility of cultivating a powerful in-context
+learning ability within AI systems in an eco-friendly manner.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ When is Tree Search Useful for LLM Planning? It Depends on the
+  Discriminator <span class="chip">ACL 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2402.10890v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2402.10890v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Ziru Chen, Michael White, Raymond Mooney, Ali Payani, Yu Su, Huan Sun
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  In this paper, we examine how large language models (LLMs) solve multi-step
+problems under a language agent framework with three components: a generator, a
+discriminator, and a planning method. We investigate the practical utility of
+two advanced planning methods, iterative correction and tree search. We present
+a comprehensive analysis of how discrimination accuracy affects the overall
+performance of agents when using these two methods or a simpler method,
+re-ranking. Experiments on two tasks, text-to-SQL parsing and mathematical
+reasoning, show that: (1) advanced planning methods demand discriminators with
+at least 90% accuracy to achieve significant improvements over re-ranking; (2)
+current LLMs' discrimination abilities have not met the needs of advanced
+planning methods to achieve such improvements; (3) with LLM-based
+discriminators, advanced planning methods may not adequately balance accuracy
+and efficiency. For example, compared to the other two methods, tree search is
+at least 10--20 times slower but leads to negligible performance gains, which
+hinders its real-world applications. Code and data are available at
+https://github.com/OSU-NLP-Group/llm-planning-eval.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>ACL 2024 main</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Leveraging Codebook Knowledge with NLI and Chat<span class="highlight-title">GPT</span> for Zero-Shot
+  Political Relation Classification <span class="chip">ACL 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2308.07876v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2308.07876v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Yibo Hu, Erick Skorupa Parolin, Latifur Khan, Patrick T. Brandt, Javier Osorio, Vito J. D'Orazio
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Is it possible accurately classify political relations within evolving event
+ontologies without extensive annotations? This study investigates zero-shot
+learning methods that use expert knowledge from existing annotation codebook,
+and evaluates the performance of advanced ChatGPT (GPT-3.5/4) and a natural
+language inference (NLI)-based model called ZSP. ChatGPT uses codebook's
+labeled summaries as prompts, whereas ZSP breaks down the classification task
+into context, event mode, and class disambiguation to refine task-specific
+hypotheses. This decomposition enhances interpretability, efficiency, and
+adaptability to schema changes. The experiments reveal ChatGPT's strengths and
+limitations, and crucially show ZSP's outperformance of dictionary-based
+methods and its competitive edge over some supervised models. These findings
+affirm the value of ZSP for validating event records and advancing ontology
+development. Our study underscores the efficacy of leveraging transfer learning
+and existing domain expertise to enhance research efficiency and scalability.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>ACL 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ RIFF: Learning to Rephrase Inputs for Few-shot Fine-tuning of Language
+  Models <span class="chip">ACL2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2403.02271v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2403.02271v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Saeed Najafi, Alona Fyshe
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Pre-trained Language Models (PLMs) can be accurately fine-tuned for
+downstream text processing tasks. Recently, researchers have introduced several
+parameter-efficient fine-tuning methods that optimize input prompts or adjust a
+small number of model parameters (e.g LoRA). In this study, we explore the
+impact of altering the input text of the original task in conjunction with
+parameter-efficient fine-tuning methods. To most effectively rewrite the input
+text, we train a few-shot paraphrase model with a Maximum-Marginal Likelihood
+objective. Using six few-shot text classification datasets, we show that
+enriching data with paraphrases at train and test time enhances the performance
+beyond what can be achieved with parameter-efficient fine-tuning alone. The
+code used for our experiments can be found at
+https://github.com/SaeedNajafi/RIFF.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Final Version (Findings of ACL2024)</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ T-RAG: Lessons from the LLM Trenches 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2402.07483v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2402.07483v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Masoomali Fatehkia, Ji Kim Lucas, Sanjay Chawla
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Large Language Models (LLM) have shown remarkable language capabilities
+fueling attempts to integrate them into applications across a wide range of
+domains. An important application area is question answering over private
+enterprise documents where the main considerations are data security, which
+necessitates applications that can be deployed on-prem, limited computational
+resources and the need for a robust application that correctly responds to
+queries. Retrieval-Augmented Generation (RAG) has emerged as the most prominent
+framework for building LLM-based applications. While building a RAG is
+relatively straightforward, making it robust and a reliable application
+requires extensive customization and relatively deep knowledge of the
+application domain. We share our experiences building and deploying an LLM
+application for question answering over private organizational documents. Our
+application combines the use of RAG with a finetuned open-source LLM.
+Additionally, our system, which we call Tree-RAG (T-RAG), uses a tree structure
+to represent entity hierarchies within the organization. This is used to
+generate a textual description to augment the context when responding to user
+queries pertaining to entities within the organization's hierarchy. Our
+evaluations, including a Needle in a Haystack test, show that this combination
+performs better than a simple RAG or finetuning implementation. Finally, we
+share some lessons learned based on our experiences building an LLM application
+for real-world use.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Added Needle in a Haystack analysis for T-RAG</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Understanding and Patching Compositional Reasoning in LLMs <span class="chip">ACL'2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2402.14328v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2402.14328v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Zhaoyi Li, Gangwei Jiang, Hong Xie, Linqi Song, Defu Lian, Ying Wei
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  LLMs have marked a revolutonary shift, yet they falter when faced with
+compositional reasoning tasks. Our research embarks on a quest to uncover the
+root causes of compositional reasoning failures of LLMs, uncovering that most
+of them stem from the improperly generated or leveraged implicit reasoning
+results. Inspired by our empirical findings, we resort to Logit Lens and an
+intervention experiment to dissect the inner hidden states of LLMs. This deep
+dive reveals that implicit reasoning results indeed surface within middle
+layers and play a causative role in shaping the final explicit reasoning
+results. Our exploration further locates multi-head self-attention (MHSA)
+modules within these layers, which emerge as the linchpins in accurate
+generation and leveraing of implicit reasoning results. Grounded on the above
+findings, we develop CREME, a lightweight method to patch errors in
+compositional reasoning via editing the located MHSA modules. Our empirical
+evidence stands testament to CREME's effectiveness, paving the way for
+autonomously and continuously enhancing compositional reasoning capabilities in
+language models.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted by ACL'2024 Findings</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Understanding Retrieval Robustness for Retrieval-Augmented Image
+  Captioning <span class="chip">ACL 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.02265v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.02265v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Wenyan Li, Jiaang Li, Rita Ramos, Raphael Tang, Desmond Elliott
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Recent advances in retrieval-augmented models for image captioning highlight
+the benefit of retrieving related captions for efficient, lightweight models
+with strong domain-transfer capabilities. While these models demonstrate the
+success of retrieval augmentation, retrieval models are still far from perfect
+in practice: the retrieved information can sometimes mislead the model,
+resulting in incorrect generation and worse performance. In this paper, we
+analyze the robustness of a retrieval-augmented captioning model SmallCap. Our
+analysis shows that the model is sensitive to tokens that appear in the
+majority of the retrieved captions, and the input attribution shows that those
+tokens are likely copied into the generated output. Given these findings, we
+propose to train the model by sampling retrieved captions from more diverse
+sets. This decreases the chance that the model learns to copy majority tokens,
+and improves both in-domain and cross-domain performance.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>9 pages, long paper at ACL 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Non-Linear Inference Time Intervention: Improving LLM Truthfulness 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2403.18680v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2403.18680v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Jakub Hoscilowicz, Adam Wiacek, Jan Chojnacki, Adam Cieslak, Leszek Michon, Vitalii Urbanevych, Artur Janicki
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  In this work, we explore LLM's internal representation space to identify
+attention heads that contain the most truthful and accurate information. We
+further developed the Inference Time Intervention (ITI) framework, which lets
+bias LLM without the need for fine-tuning. The improvement manifests in
+introducing a non-linear multi-token probing and multi-token intervention:
+Non-Linear ITI (NL-ITI), which significantly enhances performance on evaluation
+benchmarks. NL-ITI is tested on diverse multiple-choice datasets, including
+TruthfulQA, on which we report over 16% relative MC1 (accuracy of model
+pointing to the correct answer) improvement with respect to the baseline ITI
+results. Moreover, we achieved a 10% relative improvement over the recently
+released Truth Forest (TrFf) method that also focused on ITI improvement.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted on Interspeech 2024 Conference. Code is available at
+  https://github.com/Samsung/NL-ITI</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Knowledge-to-SQL: Enhancing SQL Generation with Data Expert LLM <span class="chip">ACL2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2402.11517v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2402.11517v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Zijin Hong, Zheng Yuan, Hao Chen, Qinggang Zhang, Feiran Huang, Xiao Huang
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Generating accurate SQL queries for user questions (text-to-SQL) has been a
+long-standing challenge since it requires a deep understanding of both the
+user's question and the corresponding database schema in order to retrieve the
+desired content accurately. Existing methods rely on the comprehensive
+capability of large language models (LLMs) to generate the SQL. However, some
+necessary knowledge is not explicitly included in the database schema and user
+question or has been learned by LLMs. Thus, the generated SQL of the
+knowledge-insufficient questions may be inaccurate, negatively influencing the
+text-to-SQL models' performance and robustness. To address this challenge, we
+propose the Knowledge-to-SQL framework, which employs tailored Data Expert LLM
+(DELLM) to provide helpful knowledge for all text-to-SQL models. Specifically,
+we introduce the detailed implementation of DELLM regarding table reading and
+the basic fine-tuning process. We further propose a Preference Learning via
+Database Feedback (PLDBF) strategy, refining the DELLM to generate more helpful
+knowledge for LLMs. Extensive experiments verify that DELLM can enhance the
+state-of-the-art approaches for text-to-SQL tasks. The corresponding code of
+DELLM is released for further research.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted to ACL2024 Findings</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Lever LM: Configuring In-Context Sequence to Lever Large Vision Language
+  Models 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2312.10104v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2312.10104v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Xu Yang, Yingzhe Peng, Haoxuan Ma, Shuo Xu, Chi Zhang, Yucheng Han, Hanwang Zhang
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  As Archimedes famously said, ``Give me a lever long enough and a fulcrum on
+which to place it, and I shall move the world'', in this study, we propose to
+use a tiny Language Model (LM), \eg, a Transformer with 67M parameters, to
+lever much larger Vision-Language Models (LVLMs) with 9B parameters.
+Specifically, we use this tiny \textbf{Lever-LM} to configure effective
+in-context demonstration (ICD) sequences to improve the In-Context Learinng
+(ICL) performance of LVLMs. Previous studies show that diverse ICD
+configurations like the selection and ordering of the demonstrations heavily
+affect the ICL performance, highlighting the significance of configuring
+effective ICD sequences. Motivated by this and by re-considering the the
+process of configuring ICD sequence, we find this is a mirror process of human
+sentence composition and further assume that effective ICD configurations may
+contain internal statistical patterns that can be captured by Lever-LM. Then a
+dataset with effective ICD sequences is constructed to train Lever-LM. After
+training, given novel queries, new ICD sequences are configured by the trained
+Lever-LM to solve vision-language tasks through ICL. Experiments show that
+these ICD sequences can improve the ICL performance of two LVLMs compared with
+some strong baselines in Visual Question Answering and Image Captioning,
+validating that Lever-LM can really capture the statistical patterns for
+levering LVLMs.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>17 pages, 6 figures</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Learning to Generate Instruction Tuning <span class="highlight-title">Dataset</span>s for Zero-Shot Task
+  Adaptation <span class="chip">ACL</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2402.18334v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2402.18334v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Nihal V. Nayak, Yiyang Nan, Avi Trost, Stephen H. Bach
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  We introduce Bonito, an open-source model for conditional task generation
+that converts unannotated text into task-specific training datasets for
+instruction tuning. We aim to enable zero-shot task adaptation of large
+language models on users' specialized, private data. We train Bonito by
+fine-tuning a pretrained large language model on a new large-scale dataset with
+1.65M examples created by remixing existing instruction tuning datasets into
+meta-templates. The meta-templates for a dataset produce training examples
+where the input is the unannotated text and the task attribute and the output
+consists of the instruction and the response. We use Bonito to generate
+synthetic tasks for seven datasets from specialized domains with unannotated
+text across three task types -- yes-no question answering, extractive question
+answering, and natural language inference -- and adapt language models. We show
+that Bonito significantly improves the average performance of pretrained and
+instruction tuned models over the de facto self supervised baseline. For
+example, adapting Mistral-Instruct-v2 and instruction tuned variants of Mistral
+and Llama2 with Bonito improves the strong zero-shot performance by 22.1 F1
+points whereas the next word prediction objective undoes some of the benefits
+of instruction tuning and reduces the average performance by 0.8 F1 points. We
+conduct additional experiments with Bonito to understand the effects of the
+domain, the size of the training set, and the choice of alternative synthetic
+task generators. Overall, we show that learning with synthetic instruction
+tuning datasets is an effective way to adapt language models to new domains.
+The model, dataset, and code are available at
+https://github.com/BatsResearch/bonito.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>ACL Findings 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ TAXI: Evaluating Categorical Knowledge Editing for Language Models <span class="chip">ACL 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2404.15004v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2404.15004v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Derek Powell, Walter Gerych, Thomas Hartvigsen
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Humans rarely learn one fact in isolation. Instead, learning a new fact
+induces knowledge of other facts about the world. For example, in learning a
+korat is a type of cat, you also infer it is a mammal and has claws, ensuring
+your model of the world is consistent. Knowledge editing aims to inject new
+facts into language models to improve their factuality, but current benchmarks
+fail to evaluate consistency, which is critical to ensure efficient, accurate,
+and generalizable edits. We manually create TAXI, a new benchmark dataset
+specifically created to evaluate consistency in categorical knowledge edits.
+TAXI contains 11,120 multiple-choice queries for 976 edits spanning 41
+categories (e.g., Dogs), 164 subjects (e.g., Labrador), and 183 properties
+(e.g., is a mammal). We then use TAXI to evaluate popular editors' categorical
+consistency, measuring how often editing a subject's category appropriately
+edits its properties. We find that 1) the editors achieve marginal, yet
+non-random consistency, 2) their consistency far underperforms human baselines,
+and 3) consistency is more achievable when editing atypical subjects Our code
+and data are available at https://github.com/derekpowell/taxi.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted to ACL 2024 (Findings)</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Multipath parsing in the brain <span class="chip">ACL2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2401.18046v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2401.18046v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Berta Franzluebbers, Donald Dunagan, Miloš Stanojević, Jan Buys, John T. Hale
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Humans understand sentences word-by-word, in the order that they hear them.
+This incrementality entails resolving temporary ambiguities about syntactic
+relationships. We investigate how humans process these syntactic ambiguities by
+correlating predictions from incremental generative dependency parsers with
+timecourse data from people undergoing functional neuroimaging while listening
+to an audiobook. In particular, we compare competing hypotheses regarding the
+number of developing syntactic analyses in play during word-by-word
+comprehension: one vs more than one. This comparison involves evaluating
+syntactic surprisal from a state-of-the-art dependency parser with LLM-adapted
+encodings against an existing fMRI dataset. In both English and Chinese data,
+we find evidence for multipath parsing. Brain regions associated with this
+multipath effect include bilateral superior temporal gyrus.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted at ACL2024, main conference. 15 pages</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Towards Better Question Generation in QA-based Event Extraction <span class="chip">ACL2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.10517v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.10517v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Zijin Hong, Jian Liu
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Event Extraction (EE) is an essential information extraction task that aims
+to extract event-related information from unstructured texts. The paradigm of
+this task has shifted from conventional classification-based methods to more
+contemporary question-answering-based (QA-based) approaches. However, in
+QA-based EE, the quality of the questions dramatically affects the extraction
+accuracy, and how to generate high-quality questions for QA-based EE remains a
+challenge. In this work, to tackle this challenge, we suggest four criteria to
+evaluate the quality of a question and propose a reinforcement learning method,
+RLQG, for QA-based EE that can generate generalizable, high-quality, and
+context-dependent questions and provides clear guidance to QA models. The
+extensive experiments conducted on ACE and RAMS datasets have strongly
+validated our approach's effectiveness, which also demonstrates its robustness
+in scenarios with limited training data. The corresponding code of RLQG is
+released for further research.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted to ACL2024 Findings</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ BadRAG: Identifying Vulnerabilities in Retrieval Augmented Generation of
+  Large Language Models 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.00083v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.00083v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Jiaqi Xue, Mengxin Zheng, Yebowen Hu, Fei Liu, Xun Chen, Qian Lou
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Large Language Models (LLMs) are constrained by outdated information and a
+tendency to generate incorrect data, commonly referred to as "hallucinations."
+Retrieval-Augmented Generation (RAG) addresses these limitations by combining
+the strengths of retrieval-based methods and generative models. This approach
+involves retrieving relevant information from a large, up-to-date dataset and
+using it to enhance the generation process, leading to more accurate and
+contextually appropriate responses. Despite its benefits, RAG introduces a new
+attack surface for LLMs, particularly because RAG databases are often sourced
+from public data, such as the web. In this paper, we propose \TrojRAG{} to
+identify the vulnerabilities and attacks on retrieval parts (RAG database) and
+their indirect attacks on generative parts (LLMs). Specifically, we identify
+that poisoning several customized content passages could achieve a retrieval
+backdoor, where the retrieval works well for clean queries but always returns
+customized poisoned adversarial queries. Triggers and poisoned passages can be
+highly customized to implement various attacks. For example, a trigger could be
+a semantic group like "The Republican Party, Donald Trump, etc." Adversarial
+passages can be tailored to different contents, not only linked to the triggers
+but also used to indirectly attack generative LLMs without modifying them.
+These attacks can include denial-of-service attacks on RAG and semantic
+steering attacks on LLM generations conditioned by the triggers. Our
+experiments demonstrate that by just poisoning 10 adversarial passages can
+induce 98.2\% success rate to retrieve the adversarial passages. Then, these
+passages can increase the reject ratio of RAG-based GPT-4 from 0.01\% to 74.6\%
+or increase the rate of negative responses from 0.22\% to 72\% for targeted
+queries.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ A Comprehensive Evaluation of Quantization Strategies for Large Language
+  Models <span class="chip">ACL 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2402.16775v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2402.16775v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Renren Jin, Jiangcun Du, Wuwei Huang, Wei Liu, Jian Luan, Bin Wang, Deyi Xiong
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Increasing the number of parameters in large language models (LLMs) usually
+improves performance in downstream tasks but raises compute and memory costs,
+making deployment difficult in resource-limited settings. Quantization
+techniques, which reduce the bits needed for model weights or activations with
+minimal performance loss, have become popular due to the rise of LLMs. However,
+most quantization studies use pre-trained LLMs, and the impact of quantization
+on instruction-tuned LLMs and the relationship between perplexity and benchmark
+performance of quantized LLMs are not well understood. Evaluation of quantized
+LLMs is often limited to language modeling and a few classification tasks,
+leaving their performance on other benchmarks unclear. To address these gaps,
+we propose a structured evaluation framework consisting of three critical
+dimensions: (1) knowledge \& capacity, (2) alignment, and (3) efficiency, and
+conduct extensive experiments across ten diverse benchmarks. Our experimental
+results indicate that LLMs with 4-bit quantization can retain performance
+comparable to their non-quantized counterparts, and perplexity can serve as a
+proxy metric for quantized LLMs on most benchmarks. Furthermore, quantized LLMs
+with larger parameter scales can outperform smaller LLMs. Despite the memory
+savings achieved through quantization, it can also slow down the inference
+speed of LLMs. Consequently, substantial engineering efforts and hardware
+support are imperative to achieve a balanced optimization of decoding speed and
+memory consumption in the context of quantized LLMs.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>ACL 2024 Findings</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ MLLM-as-a-Judge: Assessing Multimodal LLM-as-a-Judge with
+  Vision-Language Benchmark <span class="chip">ICML 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2402.04788v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2402.04788v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Dongping Chen, Ruoxi Chen, Shilin Zhang, Yinuo Liu, Yaochen Wang, Huichi Zhou, Qihui Zhang, Pan Zhou, Yao Wan, Lichao Sun
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Multimodal Large Language Models (MLLMs) have gained significant attention
+recently, showing remarkable potential in artificial general intelligence.
+However, assessing the utility of MLLMs presents considerable challenges,
+primarily due to the absence of multimodal benchmarks that align with human
+preferences. Drawing inspiration from the concept of LLM-as-a-Judge within
+LLMs, this paper introduces a novel benchmark, termed MLLM-as-a-Judge, to
+assess the ability of MLLMs in assisting judges across diverse modalities,
+encompassing three distinct tasks: Scoring Evaluation, Pair Comparison, and
+Batch Ranking. Our study reveals that, while MLLMs demonstrate remarkable
+human-like discernment in Pair Comparison, there is a significant divergence
+from human preferences in Scoring Evaluation and Batch Ranking. Furthermore, a
+closer examination reveals persistent challenges in the judgment capacities of
+LLMs, including diverse biases, hallucinatory responses, and inconsistencies in
+judgment, even in advanced models such as GPT-4V. These findings emphasize the
+pressing need for enhancements and further research efforts to be undertaken
+before regarding MLLMs as fully reliable evaluators. In light of this, we
+advocate for additional efforts dedicated to supporting the continuous
+development within the domain of MLLM functioning as judges. The code and
+dataset are publicly available at our project homepage:
+\url{https://mllm-judge.github.io/}.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>ICML 2024 (Oral)</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ On the Trade-off between Redundancy and Local Coherence in Summarization 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2205.10192v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2205.10192v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Ronald Cardenas, Matthias Galle, Shay B. Cohen
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Extractive summaries are usually presented as lists of sentences with no
+expected cohesion between them and with plenty of redundant information if not
+accounted for. In this paper, we investigate the trade-offs incurred when
+aiming to control for inter-sentential cohesion and redundancy in extracted
+summaries, and their impact on their informativeness. As case study, we focus
+on the summarization of long, highly redundant documents and consider two
+optimization scenarios, reward-guided and with no supervision. In the
+reward-guided scenario, we compare systems that control for redundancy and
+cohesion during sentence scoring. In the unsupervised scenario, we introduce
+two systems that aim to control all three properties -- informativeness,
+redundancy, and cohesion -- in a principled way. Both systems implement a
+psycholinguistic theory that simulates how humans keep track of relevant
+content units and how cohesion and non-redundancy constraints are applied in
+short-term memory during reading. Extensive automatic and human evaluations
+reveal that systems optimizing for -- among other properties -- cohesion are
+capable of better organizing content in summaries compared to systems that
+optimize only for redundancy, while maintaining comparable informativeness. We
+find that the proposed unsupervised systems manage to extract highly cohesive
+summaries across varying levels of document redundancy, although sacrificing
+informativeness in the process. Finally, we lay evidence as to how simulated
+cognitive processes impact the trade-off between the analyzed summary
+properties.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted to JAIR</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Full Parameter Fine-tuning for Large Language Models with Limited
+  Resources <span class="chip">ACL 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2306.09782v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2306.09782v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Kai Lv, Yuqing Yang, Tengxiao Liu, Qinghui Gao, Qipeng Guo, Xipeng Qiu
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Large Language Models (LLMs) have revolutionized Natural Language Processing
+(NLP) but demand massive GPU resources for training. Lowering the threshold for
+LLMs training would encourage greater participation from researchers,
+benefiting both academia and society. While existing approaches have focused on
+parameter-efficient fine-tuning, which tunes or adds a small number of
+parameters, few have addressed the challenge of tuning the full parameters of
+LLMs with limited resources. In this work, we propose a new optimizer,
+LOw-Memory Optimization (LOMO), which fuses the gradient computation and the
+parameter update in one step to reduce memory usage. By integrating LOMO with
+existing memory saving techniques, we reduce memory usage to 10.8% compared to
+the standard approach (DeepSpeed solution). Consequently, our approach enables
+the full parameter fine-tuning of a 65B model on a single machine with 8 RTX
+3090, each with 24GB memory.Code and data are available at
+https://github.com/OpenLMLab/LOMO.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>ACL 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ AdaLomo: Low-memory Optimization with Adaptive Learning Rate <span class="chip">ACL 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2310.10195v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2310.10195v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Kai Lv, Hang Yan, Qipeng Guo, Haijun Lv, Xipeng Qiu
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Large language models have achieved remarkable success, but their extensive
+parameter size necessitates substantial memory for training, thereby setting a
+high threshold. While the recently proposed low-memory optimization (LOMO)
+reduces memory footprint, its optimization technique, akin to stochastic
+gradient descent, is sensitive to hyper-parameters and exhibits suboptimal
+convergence, failing to match the performance of the prevailing optimizer for
+large language models, AdamW. Through empirical analysis of the Adam optimizer,
+we found that, compared to momentum, the adaptive learning rate is more
+critical for bridging the gap. Building on this insight, we introduce the
+low-memory optimization with adaptive learning rate (AdaLomo), which offers an
+adaptive learning rate for each parameter. To maintain memory efficiency, we
+employ non-negative matrix factorization for the second-order moment estimation
+in the optimizer state. Additionally, we suggest the use of a grouped update
+normalization to stabilize convergence. Our experiments with instruction-tuning
+and further pre-training demonstrate that AdaLomo achieves results on par with
+AdamW, while significantly reducing memory requirements, thereby lowering the
+hardware barrier to training large language models. The code is accessible at
+https://github.com/OpenLMLab/LOMO.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>ACL 2024 camera ready version</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ OlympiadBench: A Challenging Benchmark for Promoting AGI with
+  Olympiad-Level Bilingual Multimodal Scientific Problems <span class="chip">ACL 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2402.14008v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2402.14008v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Chaoqun He, Renjie Luo, Yuzhuo Bai, Shengding Hu, Zhen Leng Thai, Junhao Shen, Jinyi Hu, Xu Han, Yujie Huang, Yuxiang Zhang, Jie Liu, Lei Qi, Zhiyuan Liu, Maosong Sun
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Recent advancements have seen Large Language Models (LLMs) and Large
+Multimodal Models (LMMs) surpassing general human capabilities in various
+tasks, approaching the proficiency level of human experts across multiple
+domains. With traditional benchmarks becoming less challenging for these
+models, new rigorous challenges are essential to gauge their advanced
+abilities. In this work, we present OlympiadBench, an Olympiad-level bilingual
+multimodal scientific benchmark, featuring 8,476 problems from Olympiad-level
+mathematics and physics competitions, including the Chinese college entrance
+exam. Each problem is detailed with expert-level annotations for step-by-step
+reasoning. Evaluating top-tier models on OlympiadBench, we implement a
+comprehensive assessment methodology to accurately evaluate model responses.
+Notably, the best-performing model, GPT-4V, attains an average score of 17.97%
+on OlympiadBench, with a mere 10.74% in physics, highlighting the benchmark
+rigor and the intricacy of physical reasoning. Our analysis orienting GPT-4V
+points out prevalent issues with hallucinations, knowledge omissions, and
+logical fallacies. We hope that our challenging benchmark can serve as a
+valuable resource for helping future AGI research endeavors. The data and
+evaluation code are available at \url{https://github.com/OpenBMB/OlympiadBench}
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted by ACL 2024 (main), update</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ WatME: Towards Lossless Watermarking Through Lexical Redundancy <span class="chip">ACL 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2311.09832v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2311.09832v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Liang Chen, Yatao Bian, Yang Deng, Deng Cai, Shuaiyi Li, Peilin Zhao, Kam-fai Wong
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Text watermarking has emerged as a pivotal technique for identifying
+machine-generated text. However, existing methods often rely on arbitrary
+vocabulary partitioning during decoding to embed watermarks, which compromises
+the availability of suitable tokens and significantly degrades the quality of
+responses. This study assesses the impact of watermarking on different
+capabilities of large language models (LLMs) from a cognitive science lens. Our
+finding highlights a significant disparity; knowledge recall and logical
+reasoning are more adversely affected than language generation. These results
+suggest a more profound effect of watermarking on LLMs than previously
+understood. To address these challenges, we introduce Watermarking with Mutual
+Exclusion (WatME), a novel approach leveraging linguistic prior knowledge of
+inherent lexical redundancy in LLM vocabularies to seamlessly integrate
+watermarks. Specifically, WatME dynamically optimizes token usage during the
+decoding process by applying a mutually exclusive rule to the identified
+lexical redundancies. This strategy effectively prevents the unavailability of
+appropriate tokens and preserves the expressive power of LLMs. We provide both
+theoretical analysis and empirical evidence showing that WatME effectively
+preserves the diverse capabilities of LLMs while ensuring watermark
+detectability.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted to ACL 2024 main conference</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Tree-Planted <span class="highlight-title">Transformer</span>s: Unidirectional <span class="highlight-title">Transformer</span> Language Models
+  with Implicit Syntactic Supervision <span class="chip">ACL 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2402.12691v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2402.12691v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Ryo Yoshida, Taiga Someya, Yohei Oseki
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Syntactic Language Models (SLMs) can be trained efficiently to reach
+relatively high performance; however, they have trouble with inference
+efficiency due to the explicit generation of syntactic structures. In this
+paper, we propose a new method dubbed tree-planting: instead of explicitly
+generating syntactic structures, we "plant" trees into attention weights of
+unidirectional Transformer LMs to implicitly reflect syntactic structures of
+natural language. Specifically, unidirectional Transformer LMs trained with
+tree-planting will be called Tree-Planted Transformers (TPT), which inherit the
+training efficiency from SLMs without changing the inference efficiency of
+their underlying Transformer LMs. Targeted syntactic evaluations on the
+SyntaxGym benchmark demonstrated that TPTs, despite the lack of explicit
+generation of syntactic structures, significantly outperformed not only vanilla
+Transformer LMs but also various SLMs that generate hundreds of syntactic
+structures in parallel. This result suggests that TPTs can learn human-like
+syntactic knowledge as data-efficiently as SLMs while maintaining the modeling
+space of Transformer LMs unchanged.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted by ACL 2024 (Findings)</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Multi-Task Inference: Can Large Language Models Follow Multiple
+  Instructions at Once? 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2402.11597v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2402.11597v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Guijin Son, Sangwon Baek, Sangdae Nam, Ilgyun Jeong, Seungone Kim
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Large language models (LLMs) are typically prompted to follow a single
+instruction per inference call. In this work, we analyze whether LLMs also hold
+the capability to handle multiple instructions simultaneously, denoted as
+Multi-Task Inference. For this purpose, we introduce the MTI Bench(Multi-Task
+Inference Benchmark), a comprehensive evaluation benchmark encompassing 5,000
+instances across 25 tasks. Each task in the MTI Bench involves 2 to 3
+sub-tasks. As expected, we first demonstrate that Multi-Task Inference reduces
+the total inference time by 1.46 times in average since it does not require
+multiple inference calls. Interestingly, contrary to the expectation that LLMs
+would perform better when tasks are divided, we find that state-of-the-art
+LLMs, such as Llama-2-Chat-70B and GPT-4, show up to 7.3% and 12.4% improved
+performance with Multi-Task Inference compared to Single-Task Inference on the
+MTI Bench. We release the MTI Bench dataset and our code at this link
+https://github.com/guijinSON/MTI-Bench.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>acl 2024 (main)</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Emulated Disalignment: Safety Alignment for Large Language Models May
+  Backfire! <span class="chip">ACL 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2402.12343v4">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2402.12343v4.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Zhanhui Zhou, Jie Liu, Zhichen Dong, Jiaheng Liu, Chao Yang, Wanli Ouyang, Yu Qiao
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Large language models (LLMs) undergo safety alignment to ensure safe
+conversations with humans. However, this paper introduces a training-free
+attack method capable of reversing safety alignment, converting the outcomes of
+stronger alignment into greater potential for harm by accessing only LLM output
+token distributions. Specifically, our method achieves this reversal by
+contrasting the output token distribution of a safety-aligned language model
+(e.g., Llama-2-chat) against its pre-trained version (e.g., Llama-2), so that
+the token predictions are shifted towards the opposite direction of safety
+alignment. We name this method emulated disalignment (ED) because sampling from
+this contrastive distribution provably emulates the result of fine-tuning to
+minimize a safety reward. Our experiments with ED across three evaluation
+datasets and four model families (Llama-1, Llama-2, Mistral, and Alpaca) show
+that ED doubles the harmfulness of pre-trained models and outperforms strong
+baselines, achieving the highest harmful rates in 43 out of 48 evaluation
+subsets by a large margin. Eventually, given ED's reliance on language model
+output token distributions, which particularly compromises open-source models,
+our findings highlight the need to reassess the open accessibility of language
+models, even if they have been safety-aligned. Code is available at
+https://github.com/ZHZisZZ/emulated-disalignment.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>ACL 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Automating <span class="highlight-title">Dataset</span> Updates Towards Reliable and Timely Evaluation of
+  Large Language Models 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2402.11894v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2402.11894v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Jiahao Ying, Yixin Cao, Yushi Bai, Qianru Sun, Bo Wang, Wei Tang, Zhaojun Ding, Yizhe Yang, Xuanjing Huang, Shuicheng Yan
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Large language models (LLMs) have achieved impressive performance across
+various natural language benchmarks, prompting a continual need to curate more
+difficult datasets for larger LLMs, which is costly and time-consuming. In this
+paper, we propose to automate dataset updating and provide systematic analysis
+regarding its effectiveness in dealing with benchmark leakage issue, difficulty
+control, and stability. Thus, once the current benchmark has been mastered or
+leaked, we can update it for timely and reliable evaluation. There are two
+updating strategies: 1) mimicking strategy to generate similar samples based on
+original data, preserving stylistic and contextual essence, and 2) extending
+strategy that further expands existing samples at varying cognitive levels by
+adapting Bloom's taxonomy of educational objectives. Extensive experiments on
+updated MMLU and BIG-Bench demonstrate the stability of the proposed strategies
+and find that the mimicking strategy can effectively alleviate issues of
+overestimation from benchmark leakage. In cases where the efficient mimicking
+strategy fails, our extending strategy still shows promising results.
+Additionally, by controlling the difficulty, we can better discern the models'
+performance and enable fine-grained analysis neither too difficult nor too easy
+an exam can fairly judge students' learning status. To the best of our
+knowledge, we are the first to automate updating benchmarks for reliable and
+timely evaluation. Our demo leaderboard can be found at
+https://yingjiahao14.github.io/Automating-DatasetUpdates/.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ PLaD: Preference-based Large Language Model Distillation with
+  Pseudo-Preference Pairs <span class="chip">ACL 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.02886v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.02886v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Rongzhi Zhang, Jiaming Shen, Tianqi Liu, Haorui Wang, Zhen Qin, Feng Han, Jialu Liu, Simon Baumgartner, Michael Bendersky, Chao Zhang
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Large Language Models (LLMs) have exhibited impressive capabilities in
+various tasks, yet their vast parameter sizes restrict their applicability in
+resource-constrained settings. Knowledge distillation (KD) offers a viable
+solution by transferring expertise from large teacher models to compact student
+models. However, traditional KD techniques face specific challenges when
+applied to LLMs, including restricted access to LLM outputs, significant
+teacher-student capacity gaps, and the inherited mis-calibration issue. In this
+work, we present PLaD, a novel preference-based LLM distillation framework.
+PLaD exploits the teacher-student capacity discrepancy to generate
+pseudo-preference pairs where teacher outputs are preferred over student
+outputs. Then, PLaD leverages a ranking loss to re-calibrate student's
+estimation of sequence likelihood, which steers the student's focus towards
+understanding the relative quality of outputs instead of simply imitating the
+teacher. PLaD bypasses the need for access to teacher LLM's internal states,
+tackles the student's expressivity limitations, and mitigates the student
+mis-calibration issue. Through extensive experiments on two sequence generation
+tasks and with various LLMs, we demonstrate the effectiveness of our proposed
+PLaD framework.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Findings of ACL 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Competition Report: Finding Universal Jailbreak Backdoors in Aligned
+  LLMs 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2404.14461v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2404.14461v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Javier Rando, Francesco Croce, Kryštof Mitka, Stepan Shabalin, Maksym Andriushchenko, Nicolas Flammarion, Florian Tramèr
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Large language models are aligned to be safe, preventing users from
+generating harmful content like misinformation or instructions for illegal
+activities. However, previous work has shown that the alignment process is
+vulnerable to poisoning attacks. Adversaries can manipulate the safety training
+data to inject backdoors that act like a universal sudo command: adding the
+backdoor string to any prompt enables harmful responses from models that,
+otherwise, behave safely. Our competition, co-located at IEEE SaTML 2024,
+challenged participants to find universal backdoors in several large language
+models. This report summarizes the key findings and promising ideas for future
+research.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Competition Report</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Looking Right is Sometimes Right: Investigating the Capabilities of
+  Decoder-only LLMs for Sequence Labeling <span class="chip">ACL 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2401.14556v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2401.14556v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        David Dukić, Jan Šnajder
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Pre-trained language models based on masked language modeling (MLM) excel in
+natural language understanding (NLU) tasks. While fine-tuned MLM-based encoders
+consistently outperform causal language modeling decoders of comparable size,
+recent decoder-only large language models (LLMs) perform on par with smaller
+MLM-based encoders. Although their performance improves with scale, LLMs fall
+short of achieving state-of-the-art results in information extraction (IE)
+tasks, many of which are formulated as sequence labeling (SL). We hypothesize
+that LLMs' poor SL performance stems from causal masking, which prevents the
+model from attending to tokens on the right of the current token. Yet, how
+exactly and to what extent LLMs' performance on SL can be improved remains
+unclear. We explore techniques for improving the SL performance of open LLMs on
+IE tasks by applying layer-wise removal of the causal mask (CM) during LLM
+fine-tuning. This approach yields performance gains competitive with
+state-of-the-art SL models, matching or outperforming the results of CM removal
+from all blocks. Our findings hold for diverse SL tasks, demonstrating that
+open LLMs with layer-dependent CM removal outperform strong MLM-based encoders
+and even instruction-tuned LLMs.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted at ACL 2024 Findings</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ MELA: Multilingual Evaluation of Linguistic Acceptability <span class="chip">ACL 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2311.09033v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2311.09033v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Ziyin Zhang, Yikang Liu, Weifang Huang, Junyu Mao, Rui Wang, Hai Hu
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  In this work, we present the largest benchmark to date on linguistic
+acceptability: Multilingual Evaluation of Linguistic Acceptability -- MELA,
+with 46K samples covering 10 languages from a diverse set of language families.
+We establish LLM baselines on this benchmark, and investigate cross-lingual
+transfer in acceptability judgements with XLM-R. In pursuit of multilingual
+interpretability, we conduct probing experiments with fine-tuned XLM-R to
+explore the process of syntax capability acquisition. Our results show that
+GPT-4o exhibits a strong multilingual ability, outperforming fine-tuned XLM-R,
+while open-source multilingual models lag behind by a noticeable gap.
+Cross-lingual transfer experiments show that transfer in acceptability judgment
+is non-trivial: 500 Icelandic fine-tuning examples lead to 23 MCC performance
+in a completely unrelated language -- Chinese. Results of our probing
+experiments indicate that training on MELA improves the performance of XLM-R on
+syntax-related tasks. Our data is available at
+https://github.com/sjtu-compling/MELA.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>ACL 2024 camera-ready</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Beyond Traditional Benchmarks: Analyzing Behaviors of Open LLMs on
+  Data-to-Text Generation <span class="chip">ACL 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2401.10186v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2401.10186v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Zdeněk Kasner, Ondřej Dušek
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  We analyze the behaviors of open large language models (LLMs) on the task of
+data-to-text (D2T) generation, i.e., generating coherent and relevant text from
+structured data. To avoid the issue of LLM training data contamination with
+standard benchmarks, we design Quintd - a tool for collecting novel structured
+data records from public APIs. We find that open LLMs (Llama 2, Mistral, and
+Zephyr) can generate fluent and coherent texts in zero-shot settings from data
+in common formats collected with Quintd. However, we show that the semantic
+accuracy of the outputs is a major issue: both according to human annotators
+and our reference-free metric based on GPT-4, more than 80% of the outputs of
+open LLMs contain at least one semantic error. We publicly release the code,
+data, and model outputs.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted to ACL 2024 Main Conference</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ KMMLU: Measuring Massive Multitask Language Understanding in Korean 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2402.11548v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2402.11548v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Guijin Son, Hanwool Lee, Sungdong Kim, Seungone Kim, Niklas Muennighoff, Taekyoon Choi, Cheonbok Park, Kang Min Yoo, Stella Biderman
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  We propose KMMLU, a new Korean benchmark with 35,030 expert-level
+multiple-choice questions across 45 subjects ranging from humanities to STEM.
+While prior Korean benchmarks are translated from existing English benchmarks,
+KMMLU is collected from original Korean exams, capturing linguistic and
+cultural aspects of the Korean language. We test 27 public and proprietary LLMs
+and observe the best public model to score 50.5%, leaving significant room for
+improvement. This model was primarily trained for English and Chinese, not
+Korean. Current LLMs tailored to Korean, such as Polyglot-Ko, perform far
+worse. Surprisingly, even the most capable proprietary LLMs, e.g., GPT-4 and
+HyperCLOVA X do not exceed 60%. This suggests that further work is needed to
+improve LLMs for Korean, and we believe KMMLU offers the appropriate tool to
+track this progress. We make our dataset publicly available on the Hugging Face
+Hub and integrate the benchmark into EleutherAI's Language Model Evaluation
+Harness.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Under Review</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Why Can Large Language Models Generate Correct Chain-of-Thoughts? 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2310.13571v4">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2310.13571v4.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Rasul Tutunov, Antoine Grosnit, Juliusz Ziomek, Jun Wang, Haitham Bou-Ammar
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  This paper delves into the capabilities of large language models (LLMs),
+specifically focusing on advancing the theoretical comprehension of
+chain-of-thought prompting. We investigate how LLMs can be effectively induced
+to generate a coherent chain of thoughts. To achieve this, we introduce a
+two-level hierarchical graphical model tailored for natural language
+generation. Within this framework, we establish a compelling geometrical
+convergence rate that gauges the likelihood of an LLM-generated chain of
+thoughts compared to those originating from the true language. Our findings
+provide a theoretical justification for the ability of LLMs to produce the
+correct sequence of thoughts (potentially) explaining performance gains in
+tasks demanding reasoning skills.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ NICE: To Optimize In-Context Examples or Not? <span class="chip">ACL 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2402.06733v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2402.06733v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Pragya Srivastava, Satvik Golechha, Amit Deshpande, Amit Sharma
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Recent work shows that in-context learning and optimization of in-context
+examples (ICE) can significantly improve the accuracy of large language models
+(LLMs) on a wide range of tasks, leading to an apparent consensus that ICE
+optimization is crucial for better performance. However, most of these studies
+assume a fixed or no instruction provided in the prompt. We challenge this
+consensus by investigating the necessity of optimizing ICE when task-specific
+instructions are provided and find that there are many tasks for which it
+yields diminishing returns. In particular, using a diverse set of tasks and a
+systematically created instruction set with gradually added details, we find
+that as the prompt instruction becomes more detailed, the returns on ICE
+optimization diminish. To characterize this behavior, we introduce a
+task-specific metric called Normalized Invariability to Choice of Examples
+(NICE) that quantifies the learnability of tasks from a given instruction, and
+provides a heuristic to help decide whether to optimize instructions or ICE for
+a new task. Given a task, the proposed metric can reliably predict the utility
+of optimizing ICE compared to using random ICE. Our code is available at
+https://github.com/microsoft/nice-icl.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted as a full paper (9 pages) at ACL 2024 (Main)</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ SAPT: A Shared Attention Framework for Parameter-Efficient Continual
+  Learning of Large Language Models <span class="chip">ACL 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2401.08295v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2401.08295v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Weixiang Zhao, Shilong Wang, Yulin Hu, Yanyan Zhao, Bing Qin, Xuanyu Zhang, Qing Yang, Dongliang Xu, Wanxiang Che
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  The continual learning (CL) ability is vital for deploying large language
+models (LLMs) in the dynamic world. Existing methods devise the learning module
+to acquire task-specific knowledge with parameter-efficient tuning (PET) block
+and the selection module to pick out the corresponding one for the testing
+input, aiming at handling the challenges of catastrophic forgetting and
+knowledge transfer in CL. However, these methods tend to address only one of
+the challenges, ignoring the potential of aligning the two modules to
+effectively address catastrophic forgetting and knowledge transfer
+simultaneously. To this end, we propose a novel Shared Attention Framework
+(SAPT), to align the PET learning and selection via the Shared Attentive
+Learning \& Selection module. Extensive Experiments on two CL benchmarks
+demonstrate the superiority of SAPT. Moreover, SAPT consistently demonstrates
+its superiority when we scale it to different model sizes (from 770M to 13B),
+different model architectures (T5 and LLaMA-2) and unseen tasks.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>To appear at ACL 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Direct Preference Optimization with an Offset 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2402.10571v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2402.10571v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Afra Amini, Tim Vieira, Ryan Cotterell
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Direct preference optimization (DPO) is a successful fine-tuning strategy for
+aligning large language models with human preferences without the need to train
+a reward model or employ reinforcement learning. DPO, as originally formulated,
+relies on binary preference data and fine-tunes a language model to increase
+the likelihood of a preferred response over a dispreferred response. However,
+not all preference pairs are equal. Sometimes, the preferred response is only
+slightly better than the dispreferred one. In other cases, the preference is
+much stronger. For instance, if a response contains harmful or toxic content,
+the annotator will have a strong preference for that response. In this paper,
+we propose a generalization of DPO, termed DPO with an offset (ODPO), that does
+not treat every preference pair equally during fine-tuning. Intuitively, ODPO
+requires the difference between the likelihood of the preferred and
+dispreferred response to be greater than an offset value. The offset is
+determined based on the extent to which one response is preferred over another.
+Our experiments on various tasks suggest that ODPO significantly outperforms
+DPO in aligning language models, especially when the number of preference pairs
+is limited.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Addressing Order Sensitivity of In-Context Demonstration Examples in
+  Causal Language Models 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2402.15637v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2402.15637v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Yanzheng Xiang, Hanqi Yan, Lin Gui, Yulan He
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  In-context learning has become a popular paradigm in natural language
+processing. However, its performance can be significantly influenced by the
+order of in-context demonstration examples. In this paper, we found that causal
+language models (CausalLMs) are more sensitive to this order compared to prefix
+language models (PrefixLMs). We attribute this phenomenon to the
+auto-regressive attention masks within CausalLMs, which restrict each token
+from accessing information from subsequent tokens. This results in different
+receptive fields for samples at different positions, thereby leading to
+representation disparities across positions. To tackle this challenge, we
+introduce an unsupervised fine-tuning method, termed the Information-Augmented
+and Consistency-Enhanced approach. This approach utilizes contrastive learning
+to align representations of in-context examples across different positions and
+introduces a consistency loss to ensure similar representations for inputs with
+different permutations. This enhances the model's predictive consistency across
+permutations. Experimental results on five benchmarks suggest that our proposed
+method can reduce the sensitivity of CausalLMs to the order of in-context
+examples and exhibit robust generalizability, particularly when demonstrations
+are sourced from a candidate pool different from that used in the training
+phase, or when the number of in-context examples differs from what is used
+during training.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ RA-ISF: Learning to Answer and Understand from Retrieval Augmentation
+  via Iterative Self-Feedback 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2403.06840v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2403.06840v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Yanming Liu, Xinyue Peng, Xuhong Zhang, Weihao Liu, Jianwei Yin, Jiannan Cao, Tianyu Du
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Large language models (LLMs) demonstrate exceptional performance in numerous
+tasks but still heavily rely on knowledge stored in their parameters. Moreover,
+updating this knowledge incurs high training costs. Retrieval-augmented
+generation (RAG) methods address this issue by integrating external knowledge.
+The model can answer questions it couldn't previously by retrieving knowledge
+relevant to the query. This approach improves performance in certain scenarios
+for specific tasks. However, if irrelevant texts are retrieved, it may impair
+model performance. In this paper, we propose Retrieval Augmented Iterative
+Self-Feedback (RA-ISF), a framework that iteratively decomposes tasks and
+processes them in three submodules to enhance the model's problem-solving
+capabilities. Experiments show that our method outperforms existing benchmarks,
+performing well on models like GPT3.5, Llama2, significantly enhancing factual
+reasoning capabilities and reducing hallucinations.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>20 pages, multiple figures. Providing second version RA-ISF</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ ERA-CoT: Improving Chain-of-Thought through Entity Relationship Analysis 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2403.06932v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2403.06932v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Yanming Liu, Xinyue Peng, Tianyu Du, Jianwei Yin, Weihao Liu, Xuhong Zhang
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Large language models (LLMs) have achieved commendable accomplishments in
+various natural language processing tasks. However, LLMs still encounter
+significant challenges when dealing with complex scenarios involving multiple
+entities. These challenges arise from the presence of implicit relationships
+that demand multi-step reasoning. In this paper, we propose a novel approach
+ERA-CoT, which aids LLMs in understanding context by capturing relationships
+between entities and supports the reasoning of diverse tasks through
+Chain-of-Thoughts (CoT). Experimental results show that ERA-CoT demonstrates
+the superior performance of our proposed method compared to current CoT
+prompting methods, achieving a significant improvement of an average of 5.1\%
+on GPT3.5 compared to previous SOTA baselines. Our analysis indicates that
+ERA-CoT increases the LLM's understanding of entity relationships,
+significantly improves the accuracy of question answering, and enhances the
+reasoning ability of LLMs.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>15 pages, second version of ERA-CoT</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Exploring and steering the moral compass of Large Language Models 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.17345v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.17345v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Alejandro Tlaie
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Large Language Models (LLMs) have become central to advancing automation and
+decision-making across various sectors, raising significant ethical questions.
+This study proposes a comprehensive comparative analysis of the most advanced
+LLMs to assess their moral profiles. We subjected several state-of-the-art
+models to a selection of ethical dilemmas and found that all the proprietary
+ones are mostly utilitarian and all of the open-weights ones align mostly with
+values-based ethics. Furthermore, when using the Moral Foundations
+Questionnaire, all models we probed - except for Llama 2-7B - displayed a
+strong liberal bias. Lastly, in order to causally intervene in one of the
+studied models, we propose a novel similarity-specific activation steering
+technique. Using this method, we were able to reliably steer the model's moral
+compass to different ethical schools. All of these results showcase that there
+is an ethical dimension in already deployed LLMs, an aspect that is generally
+overlooked.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Bias in News Summarization: Measures, Pitfalls and Corpora <span class="chip">ACL 24</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2309.08047v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2309.08047v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Julius Steen, Katja Markert
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Summarization is an important application of large language models (LLMs).
+Most previous evaluation of summarization models has focused on their content
+selection, faithfulness, grammaticality and coherence. However, it is well
+known that LLMs can reproduce and reinforce harmful social biases. This raises
+the question: Do biases affect model outputs in a constrained setting like
+summarization? To help answer this question, we first motivate and introduce a
+number of definitions for biased behaviours in summarization models, along with
+practical operationalizations. Since we find that biases inherent to input
+documents can confound bias analysis in summaries, we propose a method to
+generate input documents with carefully controlled demographic attributes. This
+allows us to study summarizer behavior in a controlled setting, while still
+working with realistic input documents. We measure gender bias in English
+summaries generated by both purpose-built summarization models and general
+purpose chat models as a case study. We find content selection in single
+document summarization to be largely unaffected by gender bias, while
+hallucinations exhibit evidence of bias. To demonstrate the generality of our
+approach, we additionally investigate racial bias, including intersectional
+settings.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Findings of ACL 24 Camera Ready</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Auto Arena of LLMs: Automating LLM Evaluations with Agent Peer-battles
+  and Committee Discussions 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20267v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20267v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Ruochen Zhao, Wenxuan Zhang, Yew Ken Chia, Deli Zhao, Lidong Bing
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  As LLMs evolve on a daily basis, there is an urgent need for a trustworthy
+evaluation method that can provide robust evaluation results in a timely
+fashion. Currently, as static benchmarks are prone to contamination concerns,
+users tend to trust human voting platforms, such as Chatbot Arena. However,
+human annotations require extensive manual efforts. To provide an automatic,
+robust, and trustworthy evaluation framework, we innovatively propose the
+Auto-Arena of LLMs, which automates the entire evaluation process with LLM
+agents. Firstly, an examiner LLM devises queries. Then, a pair of candidate
+LLMs engage in a multi-round peer-battle around the query, during which the
+LLM's true performance gaps become visible. Finally, a committee of LLM judges
+collectively discuss and determine the winner, which alleviates bias and
+promotes fairness. In our extensive experiment on the 17 newest LLMs,
+Auto-Arena shows the highest correlation with human preferences, providing a
+promising alternative to human evaluation platforms.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ $Se^2$: Sequential Example Selection for In-Context Learning <span class="chip">ACL 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2402.13874v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2402.13874v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Haoyu Liu, Jianfeng Liu, Shaohan Huang, Yuefeng Zhan, Hao Sun, Weiwei Deng, Furu Wei, Qi Zhang
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  The remarkable capability of large language models (LLMs) for in-context
+learning (ICL) needs to be activated by demonstration examples. Prior work has
+extensively explored the selection of examples for ICL, predominantly following
+the "select then organize" paradigm, such approaches often neglect the internal
+relationships between examples and exist an inconsistency between the training
+and inference. In this paper, we formulate the problem as a $Se$quential
+$Se$lection problem and introduce $Se^2$, a sequential-aware method that
+leverages the LLM's feedback on varying context, aiding in capturing
+inter-relationships and sequential information among examples, significantly
+enriching the contextuality and relevance of ICL prompts. Meanwhile, we utilize
+beam search to seek and construct example sequences, enhancing both quality and
+diversity. Extensive experiments across 23 NLP tasks from 8 distinct categories
+illustrate that $Se^2$ markedly surpasses competitive baselines and achieves
+42\% relative improvement over random selection. Further in-depth analysis
+shows the effectiveness of proposed strategies, highlighting $Se^2$'s
+exceptional stability and adaptability across various scenarios. Code available
+at https://github.com/microsoft/LMOps.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted by ACL 2024 Findings</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Label-Efficient Model Selection for Text Generation <span class="chip">ACL</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2402.07891v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2402.07891v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Shir Ashury-Tahan, Ariel Gera, Benjamin Sznajder, Leshem Choshen, Liat Ein-Dor, Eyal Shnarch
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Model selection for a given target task can be costly, as it may entail
+extensive annotation of the quality of outputs of different models. We
+introduce DiffUse, an efficient method to make an informed decision between
+candidate text generation models based on preference annotations. DiffUse
+reduces the required amount of annotations, thus saving valuable time and
+resources in performing evaluation. DiffUse intelligently selects instances by
+clustering embeddings that represent the semantic differences between model
+outputs. Thus, it is able to identify a subset of examples that are more
+informative for preference decisions. Our method is model-agnostic, and can be
+applied to any text generation model for selecting between models, prompts and
+configurations. Moreover, we propose a practical iterative approach for
+dynamically determining how many instances to annotate. In a series of
+experiments over hundreds of model pairs, we demonstrate that DiffUse can
+dramatically reduce the required number of annotations -- by up to 75% -- while
+maintaining high evaluation reliability.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted to ACL (main conference)</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Through the Lens of Split Vote: Exploring Disagreement, Difficulty and
+  Calibration in Legal Case Outcome Classification 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2402.07214v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2402.07214v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Shanshan Xu, T. Y. S. S Santosh, Oana Ichim, Barbara Plank, Matthias Grabmair
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  In legal decisions, split votes (SV) occur when judges cannot reach a
+unanimous decision, posing a difficulty for lawyers who must navigate diverse
+legal arguments and opinions. In high-stakes domains, understanding the
+alignment of perceived difficulty between humans and AI systems is crucial to
+build trust. However, existing NLP calibration methods focus on a classifier's
+awareness of predictive performance, measured against the human majority class,
+overlooking inherent human label variation (HLV). This paper explores split
+votes as naturally observable human disagreement and value pluralism. We
+collect judges' vote distributions from the European Court of Human Rights
+(ECHR), and present SV-ECHR, a case outcome classification (COC) dataset with
+SV information. We build a taxonomy of disagreement with SV-specific
+subcategories. We further assess the alignment of perceived difficulty between
+models and humans, as well as confidence- and human-calibration of COC models.
+We observe limited alignment with the judge vote distribution. To our
+knowledge, this is the first systematic exploration of calibration to human
+judgements in legal NLP. Our study underscores the necessity for further
+research on measuring and enhancing model calibration considering HLV in legal
+decision tasks.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ It is Simple Sometimes: A Study On Improving Aspect-Based Sentiment
+  Analysis Performance <span class="chip">ACL 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20703v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20703v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Laura Cabello, Uchenna Akujuobi
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Aspect-Based Sentiment Analysis (ABSA) involves extracting opinions from
+textual data about specific entities and their corresponding aspects through
+various complementary subtasks. Several prior research has focused on
+developing ad hoc designs of varying complexities for these subtasks. In this
+paper, we present a generative framework extensible to any ABSA subtask. We
+build upon the instruction tuned model proposed by Scaria et al. (2023), who
+present an instruction-based model with task descriptions followed by
+in-context examples on ABSA subtasks. We propose PFInstruct, an extension to
+this instruction learning paradigm by appending an NLP-related task prefix to
+the task description. This simple approach leads to improved performance across
+all tested SemEval subtasks, surpassing previous state-of-the-art (SOTA) on the
+ATE subtask (Rest14) by +3.28 F1-score, and on the AOOE subtask by an average
+of +5.43 F1-score across SemEval datasets. Furthermore, we explore the impact
+of the prefix-enhanced prompt quality on the ABSA subtasks and find that even a
+noisy prefix enhances model performance compared to the baseline. Our method
+also achieves competitive results on a biomedical domain dataset (ERSA).
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted to ACL 2024 Findings</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Mercury: A Code Efficiency Benchmark for LLM Code Synthesis 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2402.07844v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2402.07844v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Mingzhe Du, Anh Tuan Luu, Bin Ji, Qian Liu, See-Kiong Ng
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Amidst the recent strides in evaluating Large Language Models for Code (Code
+LLMs), existing benchmarks have mainly focused on the functional correctness of
+generated code, neglecting the importance of their computational efficiency. To
+fill the gap, we present Mercury, the first code efficiency benchmark for Code
+LLMs. It comprises 1,889 Python tasks, each accompanied by adequate solutions
+that serve as real-world efficiency baselines, enabling a comprehensive
+analysis of the runtime distribution. Based on the distribution, we introduce a
+new metric Beyond, which computes a runtime-percentile-weighted Pass score to
+reflect functional correctness and code efficiency simultaneously. On Mercury,
+leading Code LLMs can achieve 65% on Pass, while less than 50% on Beyond. Given
+that an ideal Beyond score would be aligned with the Pass score, it indicates
+that while Code LLMs exhibit impressive capabilities in generating functionally
+correct code, there remains a notable gap in their efficiency. Finally, our
+empirical experiments reveal that Direct Preference Optimization (DPO) serves
+as a robust baseline for enhancing code efficiency compared with Supervised
+Fine Tuning (SFT), which paves a promising avenue for future exploration of
+efficient code generation. Our code and data are available on GitHub:
+https://github.com/Elfsong/Mercury.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ GRASP: A novel benchmark for evaluating language GRounding And Situated
+  Physics understanding in multimodal language models 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2311.09048v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2311.09048v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Serwan Jassim, Mario Holubar, Annika Richter, Cornelius Wolff, Xenia Ohmer, Elia Bruni
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  This paper presents GRASP, a novel benchmark to evaluate the language
+grounding and physical understanding capabilities of video-based multimodal
+large language models (LLMs). This evaluation is accomplished via a two-tier
+approach leveraging Unity simulations. The first level tests for language
+grounding by assessing a model's ability to relate simple textual descriptions
+with visual information. The second level evaluates the model's understanding
+of "Intuitive Physics" principles, such as object permanence and continuity. In
+addition to releasing the benchmark, we use it to evaluate several
+state-of-the-art multimodal LLMs. Our evaluation reveals significant
+shortcomings in the language grounding and intuitive physics capabilities of
+these models. Although they exhibit at least some grounding capabilities,
+particularly for colors and shapes, these capabilities depend heavily on the
+prompting strategy. At the same time, all models perform below or at the chance
+level of 50% in the Intuitive Physics tests, while human subjects are on
+average 80% correct. These identified limitations underline the importance of
+using benchmarks like GRASP to monitor the progress of future models in
+developing these competencies.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ CoGenesis: A Framework Collaborating Large and Small Language Models for
+  Secure Context-Aware Instruction Following <span class="chip">ACL 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2403.03129v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2403.03129v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Kaiyan Zhang, Jianyu Wang, Ermo Hua, Biqing Qi, Ning Ding, Bowen Zhou
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  With the advancement of language models (LMs), their exposure to private data
+is increasingly inevitable, and their deployment (especially for smaller ones)
+on personal devices, such as PCs and smartphones, has become a prevailing
+trend. In contexts laden with user information, enabling models to both
+safeguard user privacy and execute commands efficiently emerges as an essential
+research imperative. In this paper, we propose CoGenesis, a collaborative
+generation framework integrating large (hosted on cloud infrastructure) and
+small models (deployed on local devices) to address privacy concerns logically.
+Initially, we design a pipeline to create personalized writing instruction
+datasets enriched with extensive context details as the testbed of this
+research issue. Subsequently, we introduce two variants of CoGenesis based on
+sketch and logits respectively. Our experimental findings, based on our
+synthesized dataset and two additional open-source datasets, indicate that: 1)
+Large-scale models perform well when provided with user context but struggle in
+the absence of such context. 2) While specialized smaller models fine-tuned on
+the synthetic dataset show promise, they still lag behind their larger
+counterparts. 3) Our CoGenesis framework, utilizing mixed-scale models,
+showcases competitive performance, providing a feasible solution to privacy
+issues.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted to ACL 2024 (Main Conference)</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Structured Voronoi Sampling <span class="chip">NeurIPS 2023</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2306.03061v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2306.03061v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Afra Amini, Li Du, Ryan Cotterell
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Gradient-based sampling algorithms have demonstrated their effectiveness in
+text generation, especially in the context of controlled text generation.
+However, there exists a lack of theoretically grounded and principled
+approaches for this task. In this paper, we take an important step toward
+building a principled approach for sampling from language models with
+gradient-based methods. We use discrete distributions given by language models
+to define densities and develop an algorithm based on Hamiltonian Monte Carlo
+to sample from them. We name our gradient-based technique Structured Voronoi
+Sampling (SVS). In an experimental setup where the reference distribution is
+known, we show that the empirical distribution of SVS samples is closer to the
+reference distribution compared to alternative sampling schemes. Furthermore,
+in a controlled generation task, SVS is able to generate fluent and diverse
+samples while following the control targets significantly better than other
+methods.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted at NeurIPS 2023</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Which Side Are You On? A Multi-task <span class="highlight-title">Dataset</span> for End-to-End Argument
+  Summarisation and Evaluation <span class="chip">ACL 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.03151v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.03151v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Hao Li, Yuping Wu, Viktor Schlegel, Riza Batista-Navarro, Tharindu Madusanka, Iqra Zahid, Jiayan Zeng, Xiaochi Wang, Xinran He, Yizhi Li, Goran Nenadic
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  With the recent advances of large language models (LLMs), it is no longer
+infeasible to build an automated debate system that helps people to synthesise
+persuasive arguments. Previous work attempted this task by integrating multiple
+components. In our work, we introduce an argument mining dataset that captures
+the end-to-end process of preparing an argumentative essay for a debate, which
+covers the tasks of claim and evidence identification (Task 1 ED), evidence
+convincingness ranking (Task 2 ECR), argumentative essay summarisation and
+human preference ranking (Task 3 ASR) and metric learning for automated
+evaluation of resulting essays, based on human feedback along argument quality
+dimensions (Task 4 SQE). Our dataset contains 14k examples of claims that are
+fully annotated with the various properties supporting the aforementioned
+tasks. We evaluate multiple generative baselines for each of these tasks,
+including representative LLMs. We find, that while they show promising results
+on individual tasks in our benchmark, their end-to-end performance on all four
+tasks in succession deteriorates significantly, both in automated measures as
+well as in human-centred evaluation. This challenge presented by our proposed
+dataset motivates future research on end-to-end argument mining and
+summarisation. The repository of this project is available at
+https://github.com/HarrywillDr/ArgSum-Datatset
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Published on ACL 2024 Findings</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Improved Factorized Neural Transducer Model For text-only Domain
+  Adaptation 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2309.09524v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2309.09524v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Junzhe Liu, Jianwei Yu, Xie Chen
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Adapting End-to-End ASR models to out-of-domain datasets with text data is
+challenging. Factorized neural Transducer (FNT) aims to address this issue by
+introducing a separate vocabulary decoder to predict the vocabulary.
+Nonetheless, this approach has limitations in fusing acoustic and language
+information seamlessly. Moreover, a degradation in word error rate (WER) on the
+general test sets was also observed, leading to doubts about its overall
+performance. In response to this challenge, we present the improved factorized
+neural Transducer (IFNT) model structure designed to comprehensively integrate
+acoustic and language information while enabling effective text adaptation. We
+assess the performance of our proposed method on English and Mandarin datasets.
+The results indicate that IFNT not only surpasses the neural Transducer and FNT
+in baseline performance in both scenarios but also exhibits superior adaptation
+ability compared to FNT. On source domains, IFNT demonstrated statistically
+significant accuracy improvements, achieving a relative enhancement of 1.2% to
+2.8% in baseline accuracy compared to the neural Transducer. On out-of-domain
+datasets, IFNT shows relative WER(CER) improvements of up to 30.2% over the
+standard neural Transducer with shallow fusion, and relative WER(CER)
+reductions ranging from 1.1% to 2.8% on test sets compared to the FNT model.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Interspeech 2024 cameraready</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Evaluating Large Language Models for Health-related Queries with
+  Presuppositions <span class="chip">ACL 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2312.08800v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2312.08800v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Navreet Kaur, Monojit Choudhury, Danish Pruthi
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  As corporations rush to integrate large language models (LLMs) to their
+search offerings, it is critical that they provide factually accurate
+information that is robust to any presuppositions that a user may express. In
+this work, we introduce UPHILL, a dataset consisting of health-related queries
+with varying degrees of presuppositions. Using UPHILL, we evaluate the factual
+accuracy and consistency of InstructGPT, ChatGPT, and BingChat models. We find
+that while model responses rarely disagree with true health claims (posed as
+questions), they often fail to challenge false claims: responses from
+InstructGPT agree with 32% of the false claims, ChatGPT 26% and BingChat 23%.
+As we increase the extent of presupposition in input queries, the responses
+from InstructGPT and ChatGPT agree with the claim considerably more often,
+regardless of its veracity. Responses from BingChat, which rely on retrieved
+webpages, are not as susceptible. Given the moderate factual accuracy, and the
+inability of models to consistently correct false assumptions, our work calls
+for a careful assessment of current LLMs for use in high-stakes scenarios.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Findings of ACL 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ NaijaHate: Evaluating Hate Speech Detection on Nigerian Twitter Using
+  Representative Data <span class="chip">ACL 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2403.19260v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2403.19260v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Manuel Tonneau, Pedro Vitor Quinta de Castro, Karim Lasri, Ibrahim Farouq, Lakshminarayanan Subramanian, Victor Orozco-Olvera, Samuel P. Fraiberger
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  To address the global issue of online hate, hate speech detection (HSD)
+systems are typically developed on datasets from the United States, thereby
+failing to generalize to English dialects from the Majority World. Furthermore,
+HSD models are often evaluated on non-representative samples, raising concerns
+about overestimating model performance in real-world settings. In this work, we
+introduce NaijaHate, the first dataset annotated for HSD which contains a
+representative sample of Nigerian tweets. We demonstrate that HSD evaluated on
+biased datasets traditionally used in the literature consistently overestimates
+real-world performance by at least two-fold. We then propose NaijaXLM-T, a
+pretrained model tailored to the Nigerian Twitter context, and establish the
+key role played by domain-adaptive pretraining and finetuning in maximizing HSD
+performance. Finally, owing to the modest performance of HSD systems in
+real-world conditions, we find that content moderators would need to review
+about ten thousand Nigerian tweets flagged as hateful daily to moderate 60% of
+all hateful content, highlighting the challenges of moderating hate speech at
+scale as social media usage continues to grow globally. Taken together, these
+results pave the way towards robust HSD systems and a better protection of
+social media users from hateful content in low-resource settings.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>ACL 2024 main conference. Data and models available at
+  https://github.com/worldbank/NaijaHate</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ StatBot.Swiss: Bilingual Open Data Exploration in Natural Language <span class="chip">ACL</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.03170v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.03170v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Farhad Nooralahzadeh, Yi Zhang, Ellery Smith, Sabine Maennel, Cyril Matthey-Doret, Raphaël de Fondville, Kurt Stockinger
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  The potential for improvements brought by Large Language Models (LLMs) in
+Text-to-SQL systems is mostly assessed on monolingual English datasets.
+However, LLMs' performance for other languages remains vastly unexplored. In
+this work, we release the StatBot.Swiss dataset, the first bilingual benchmark
+for evaluating Text-to-SQL systems based on real-world applications. The
+StatBot.Swiss dataset contains 455 natural language/SQL-pairs over 35 big
+databases with varying level of complexity for both English and German.
+  We evaluate the performance of state-of-the-art LLMs such as GPT-3.5-Turbo
+and mixtral-8x7b-instruct for the Text-to-SQL translation task using an
+in-context learning approach. Our experimental analysis illustrates that
+current LLMs struggle to generalize well in generating SQL queries on our novel
+bilingual dataset.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>This work is accepted at ACL Findings 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ PARADISE: Evaluating Implicit Planning Skills of Language Models with
+  Procedural Warnings and Tips <span class="highlight-title">Dataset</span> <span class="chip">ACL 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2403.03167v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2403.03167v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Arda Uzunoglu, Abdalfatah Rashid Safa, Gözde Gül Şahin
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Recently, there has been growing interest within the community regarding
+whether large language models are capable of planning or executing plans.
+However, most prior studies use LLMs to generate high-level plans for
+simplified scenarios lacking linguistic complexity and domain diversity,
+limiting analysis of their planning abilities. These setups constrain
+evaluation methods (e.g., predefined action space), architectural choices
+(e.g., only generative models), and overlook the linguistic nuances essential
+for realistic analysis. To tackle this, we present PARADISE, an abductive
+reasoning task using Q\&A format on practical procedural text sourced from
+wikiHow. It involves warning and tip inference tasks directly associated with
+goals, excluding intermediary steps, with the aim of testing the ability of the
+models to infer implicit knowledge of the plan solely from the given goal. Our
+experiments, utilizing fine-tuned language models and zero-shot prompting,
+reveal the effectiveness of task-specific small models over large language
+models in most scenarios. Despite advancements, all models fall short of human
+performance. Notably, our analysis uncovers intriguing insights, such as
+variations in model behavior with dropped keywords, struggles of BERT-family
+and GPT-4 with physical and abstract goals, and the proposed tasks offering
+valuable prior knowledge for other unseen procedural tasks. The PARADISE
+dataset and associated resources are publicly available for further research
+exploration with https://github.com/GGLAB-KU/paradise.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>9 pages, ACL 2024 Findings</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ LogicBench: Towards Systematic Evaluation of Logical Reasoning Ability
+  of Large Language Models <span class="chip">ACL</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2404.15522v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2404.15522v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Mihir Parmar, Nisarg Patel, Neeraj Varshney, Mutsumi Nakamura, Man Luo, Santosh Mashetty, Arindam Mitra, Chitta Baral
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Recently developed large language models (LLMs) have been shown to perform
+remarkably well on a wide range of language understanding tasks. But, can they
+really "reason" over the natural language? This question has been receiving
+significant research attention and many reasoning skills such as commonsense,
+numerical, and qualitative have been studied. However, the crucial skill
+pertaining to 'logical reasoning' has remained underexplored. Existing work
+investigating this reasoning ability of LLMs has focused only on a couple of
+inference rules (such as modus ponens and modus tollens) of propositional and
+first-order logic. Addressing the above limitation, we comprehensively evaluate
+the logical reasoning ability of LLMs on 25 different reasoning patterns
+spanning over propositional, first-order, and non-monotonic logics. To enable
+systematic evaluation, we introduce LogicBench, a natural language
+question-answering dataset focusing on the use of a single inference rule. We
+conduct detailed analysis with a range of LLMs such as GPT-4, ChatGPT, Gemini,
+Llama-2, and Mistral using chain-of-thought prompting. Experimental results
+show that existing LLMs do not fare well on LogicBench; especially, they
+struggle with instances involving complex reasoning and negations. Furthermore,
+they sometimes overlook contextual information necessary for reasoning to
+arrive at the correct conclusion. We believe that our work and findings
+facilitate future research for evaluating and enhancing the logical reasoning
+ability of LLMs. Data and code are available at
+https://github.com/Mihir3009/LogicBench.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted at ACL(Main) 2024 | First version available @
+  https://openreview.net/forum?id=7NR2ZVzZxx</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Language-Specific Neurons: The Key to Multilingual Capabilities in Large
+  Language Models <span class="chip">ACL 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2402.16438v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2402.16438v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Tianyi Tang, Wenyang Luo, Haoyang Huang, Dongdong Zhang, Xiaolei Wang, Xin Zhao, Furu Wei, Ji-Rong Wen
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Large language models (LLMs) demonstrate remarkable multilingual capabilities
+without being pre-trained on specially curated multilingual parallel corpora.
+It remains a challenging problem to explain the underlying mechanisms by which
+LLMs process multilingual texts. In this paper, we delve into the composition
+of Transformer architectures in LLMs to pinpoint language-specific regions.
+Specially, we propose a novel detection method, language activation probability
+entropy (LAPE), to identify language-specific neurons within LLMs. Based on
+LAPE, we conduct comprehensive experiments on several representative LLMs, such
+as LLaMA-2, BLOOM, and Mistral. Our findings indicate that LLMs' proficiency in
+processing a particular language is predominantly due to a small subset of
+neurons, primarily situated in the models' top and bottom layers. Furthermore,
+we showcase the feasibility to "steer" the output language of LLMs by
+selectively activating or deactivating language-specific neurons. Our research
+provides important evidence to the understanding and exploration of the
+multilingual capabilities of LLMs.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted by ACL 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Lost in the Source Language: How Large Language Models Evaluate the
+  Quality of Machine Translation <span class="chip">ACL2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2401.06568v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2401.06568v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Xu Huang, Zhirui Zhang, Xiang Geng, Yichao Du, Jiajun Chen, Shujian Huang
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  This study investigates how Large Language Models (LLMs) leverage source and
+reference data in machine translation evaluation task, aiming to better
+understand the mechanisms behind their remarkable performance in this task. We
+design the controlled experiments across various input modes and model types,
+and employ both coarse-grained and fine-grained prompts to discern the utility
+of source versus reference information. We find that reference information
+significantly enhances the evaluation accuracy, while surprisingly, source
+information sometimes is counterproductive, indicating LLMs' inability to fully
+leverage the cross-lingual capability when evaluating translations. Further
+analysis of the fine-grained evaluation and fine-tuning experiments show
+similar results. These findings also suggest a potential research direction for
+LLMs that fully exploits the cross-lingual capability of LLMs to achieve better
+performance in machine translation evaluation tasks.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted by ACL2024 Findings</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Deep Learning Based Named Entity Recognition Models for Recipes <span class="chip">LREC</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2402.17447v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2402.17447v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Mansi Goel, Ayush Agarwal, Shubham Agrawal, Janak Kapuriya, Akhil Vamshi Konam, Rishabh Gupta, Shrey Rastogi,  Niharika, Ganesh Bagler
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Food touches our lives through various endeavors, including flavor,
+nourishment, health, and sustainability. Recipes are cultural capsules
+transmitted across generations via unstructured text. Automated protocols for
+recognizing named entities, the building blocks of recipe text, are of immense
+value for various applications ranging from information extraction to novel
+recipe generation. Named entity recognition is a technique for extracting
+information from unstructured or semi-structured data with known labels.
+Starting with manually-annotated data of 6,611 ingredient phrases, we created
+an augmented dataset of 26,445 phrases cumulatively. Simultaneously, we
+systematically cleaned and analyzed ingredient phrases from RecipeDB, the
+gold-standard recipe data repository, and annotated them using the Stanford
+NER. Based on the analysis, we sampled a subset of 88,526 phrases using a
+clustering-based approach while preserving the diversity to create the
+machine-annotated dataset. A thorough investigation of NER approaches on these
+three datasets involving statistical, fine-tuning of deep learning-based
+language models and few-shot prompting on large language models (LLMs) provides
+deep insights. We conclude that few-shot prompting on LLMs has abysmal
+performance, whereas the fine-tuned spaCy-transformer emerges as the best model
+with macro-F1 scores of 95.9%, 96.04%, and 95.71% for the manually-annotated,
+augmented, and machine-annotated datasets, respectively.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>13 pages, 6 main figures and 2 in appendices, and 3 main tables;
+  Accepted for publication in LREC-COLING 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Beyond Flesch-Kincaid: <span class="highlight-title">Prompt</span>-based Metrics Improve Difficulty
+  Classification of Educational Texts 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.09482v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.09482v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Donya Rooein, Paul Rottger, Anastassia Shaitarova, Dirk Hovy
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Using large language models (LLMs) for educational applications like
+dialogue-based teaching is a hot topic. Effective teaching, however, requires
+teachers to adapt the difficulty of content and explanations to the education
+level of their students. Even the best LLMs today struggle to do this well. If
+we want to improve LLMs on this adaptation task, we need to be able to measure
+adaptation success reliably. However, current Static metrics for text
+difficulty, like the Flesch-Kincaid Reading Ease score, are known to be crude
+and brittle. We, therefore, introduce and evaluate a new set of Prompt-based
+metrics for text difficulty. Based on a user study, we create Prompt-based
+metrics as inputs for LLMs. They leverage LLM's general language understanding
+capabilities to capture more abstract and complex features than Static metrics.
+Regression experiments show that adding our Prompt-based metrics significantly
+improves text difficulty classification over Static metrics alone. Our results
+demonstrate the promise of using LLMs to evaluate text adaptation to different
+education levels.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Enhanced Language Model Truthfulness with Learnable Intervention and
+  Uncertainty Expression 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.00301v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.00301v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Farima Fatahi Bayat, Xin Liu, H. V. Jagadish, Lu Wang
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Large language models (LLMs) can generate long-form and coherent text, yet
+they often hallucinate facts, which undermines their reliability. To mitigate
+this issue, inference-time methods steer LLM representations toward the
+"truthful directions" previously learned for truth elicitation. However,
+applying these truthful directions with the same intensity fails to generalize
+across different query contexts. We propose LITO, a Learnable Intervention
+method for Truthfulness Optimization that automatically identifies the optimal
+intervention intensity tailored to each specific context. LITO explores a
+sequence of model generations based on increasing levels of intervention
+intensities. It selects the most accurate response or refuses to answer when
+the predictions are highly uncertain. Experiments on multiple LLMs and
+question-answering datasets demonstrate that LITO improves truthfulness while
+preserving task accuracy. The adaptive nature of LITO counters the limitations
+of one-size-fits-all intervention methods, maximizing truthfulness by
+reflecting the model's internal knowledge only when it is confident. Our code
+is available at https://github.com/launchnlp/LITO.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>13 pages, 5 figures</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ STAR: Constraint LoRA with Dynamic Active Learning for Data-Efficient
+  Fine-Tuning of Large Language Models <span class="chip">ACL2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2403.01165v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2403.01165v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Linhai Zhang, Jialong Wu, Deyu Zhou, Guoqiang Xu
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Though Large Language Models (LLMs) have demonstrated the powerful
+capabilities of few-shot learning through prompting methods, supervised
+training is still necessary for complex reasoning tasks. Because of their
+extensive parameters and memory consumption, both Parameter-Efficient
+Fine-Tuning (PEFT) methods and Memory-Efficient Fine-Tuning methods have been
+proposed for LLMs. Nevertheless, the issue of large annotated data consumption,
+the aim of Data-Efficient Fine-Tuning, remains unexplored. One obvious way is
+to combine the PEFT method with active learning. However, the experimental
+results show that such a combination is not trivial and yields inferior
+results. Through probe experiments, such observation might be explained by two
+main reasons: uncertainty gap and poor model calibration. Therefore, in this
+paper, we propose a novel approach to effectively integrate uncertainty-based
+active learning and LoRA. Specifically, for the uncertainty gap, we introduce a
+dynamic uncertainty measurement that combines the uncertainty of the base model
+and the uncertainty of the full model during the iteration of active learning.
+For poor model calibration, we incorporate the regularization method during
+LoRA training to keep the model from being over-confident, and the Monte-Carlo
+dropout mechanism is employed to enhance the uncertainty estimation.
+Experimental results show that the proposed approach outperforms existing
+baseline models on three complex reasoning tasks.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted by ACL2024(Findings)</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Evaluating Quantized Large Language Models 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2402.18158v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2402.18158v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Shiyao Li, Xuefei Ning, Luning Wang, Tengxuan Liu, Xiangsheng Shi, Shengen Yan, Guohao Dai, Huazhong Yang, Yu Wang
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Post-training quantization (PTQ) has emerged as a promising technique to
+reduce the cost of large language models (LLMs). Specifically, PTQ can
+effectively mitigate memory consumption and reduce computational overhead in
+LLMs. To meet the requirements of both high efficiency and performance across
+diverse scenarios, a comprehensive evaluation of quantized LLMs is essential to
+guide the selection of quantization methods. This paper presents a thorough
+evaluation of these factors by evaluating the effect of PTQ on Weight,
+Activation, and KV Cache on 11 model families, including OPT, LLaMA2, Falcon,
+Bloomz, Mistral, ChatGLM, Vicuna, LongChat, StableLM, Gemma, and Mamba, with
+parameters ranging from 125M to 180B. The evaluation encompasses five types of
+tasks: basic NLP, emergent ability, trustworthiness, dialogue, and long-context
+tasks. Moreover, we also evaluate the state-of-the-art (SOTA) quantization
+methods to demonstrate their applicability. Based on the extensive experiments,
+we systematically summarize the effect of quantization, provide recommendations
+to apply quantization techniques, and point out future directions. The code can
+be found in https://github.com/thu-nics/qllm-eval.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ DINER: Debiasing Aspect-based Sentiment Analysis with Multi-variable
+  Causal Inference <span class="chip">ACL2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2403.01166v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2403.01166v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Jialong Wu, Linhai Zhang, Deyu Zhou, Guoqiang Xu
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Though notable progress has been made, neural-based aspect-based sentiment
+analysis (ABSA) models are prone to learn spurious correlations from annotation
+biases, resulting in poor robustness on adversarial data transformations. Among
+the debiasing solutions, causal inference-based methods have attracted much
+research attention, which can be mainly categorized into causal intervention
+methods and counterfactual reasoning methods. However, most of the present
+debiasing methods focus on single-variable causal inference, which is not
+suitable for ABSA with two input variables (the target aspect and the review).
+In this paper, we propose a novel framework based on multi-variable causal
+inference for debiasing ABSA. In this framework, different types of biases are
+tackled based on different causal intervention methods. For the review branch,
+the bias is modeled as indirect confounding from context, where backdoor
+adjustment intervention is employed for debiasing. For the aspect branch, the
+bias is described as a direct correlation with labels, where counterfactual
+reasoning is adopted for debiasing. Extensive experiments demonstrate the
+effectiveness of the proposed method compared to various baselines on the two
+widely used real-world aspect robustness test set datasets.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted by ACL2024(Findings)</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ A Small and Fast <span class="highlight-title">BERT</span> for Chinese Medical Punctuation Restoration <span class="chip">INTERSPEECH 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2308.12568v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2308.12568v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Tongtao Ling, Yutao Lai, Lei Chen, Shilei Huang, Yi Liu
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  In clinical dictation, utterances after automatic speech recognition (ASR)
+without explicit punctuation marks may lead to the misunderstanding of dictated
+reports. To give a precise and understandable clinical report with ASR,
+automatic punctuation restoration is required. Considering a practical
+scenario, we propose a fast and light pre-trained model for Chinese medical
+punctuation restoration based on 'pretraining and fine-tuning' paradigm. In
+this work, we distill pre-trained models by incorporating supervised
+contrastive learning and a novel auxiliary pre-training task (Punctuation Mark
+Prediction) to make it well-suited for punctuation restoration. Our experiments
+on various distilled models reveal that our model can achieve 95% performance
+while 10% model size relative to state-of-the-art Chinese RoBERTa.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>5 pages, 2 figures, Accepted by INTERSPEECH 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Using Synchronic Definitions and Semantic Relations to Classify Semantic
+  Change Types 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.03452v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.03452v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Pierluigi Cassotti, Stefano De Pascale, Nina Tahmasebi
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  There is abundant evidence of the fact that the way words change their
+meaning can be classified in different types of change, highlighting the
+relationship between the old and new meanings (among which generalization,
+specialization and co-hyponymy transfer). In this paper, we present a way of
+detecting these types of change by constructing a model that leverages
+information both from synchronic lexical relations and definitions of word
+meanings. Specifically, we use synset definitions and hierarchy information
+from WordNet and test it on a digitized version of Blank's (1997) dataset of
+semantic change types. Finally, we show how the sense relationships can improve
+models for both approximation of human judgments of semantic relatedness as
+well as binary Lexical Semantic Change Detection.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Exfiltration of personal information from Chat<span class="highlight-title">GPT</span> via <span class="highlight-title">prompt</span> injection 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.00199v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.00199v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Gregory Schwartzman
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  We report that ChatGPT 4 and 4o are susceptible to a prompt injection attack
+that allows an attacker to exfiltrate users' personal data. It is applicable
+without the use of any 3rd party tools and all users are currently affected.
+This vulnerability is exacerbated by the recent introduction of ChatGPT's
+memory feature, which allows an attacker to command ChatGPT to monitor the user
+for the desired personal data.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Strengthened Symbol Binding Makes Large Language Models Reliable
+  Multiple-Choice Selectors <span class="chip">ACL2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.01026v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.01026v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Mengge Xue, Zhenyu Hu, Liqun Liu, Kuo Liao, Shuang Li, Honglin Han, Meng Zhao, Chengguo Yin
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Multiple-Choice Questions (MCQs) constitute a critical area of research in
+the study of Large Language Models (LLMs). Previous works have investigated the
+selection bias problem in MCQs within few-shot scenarios, in which the LLM's
+performance may be influenced by the presentation of answer choices, leaving
+the selection bias during Supervised Fine-Tuning (SFT) unexplored. In this
+paper, we reveal that selection bias persists in the SFT phase , primarily due
+to the LLM's inadequate Multiple Choice Symbol Binding (MCSB) ability. This
+limitation implies that the model struggles to associate the answer options
+with their corresponding symbols (e.g., A/B/C/D) effectively. To enhance the
+model's MCSB capability, we first incorporate option contents into the loss
+function and subsequently adjust the weights of the option symbols and
+contents, guiding the model to understand the option content of the current
+symbol. Based on this, we introduce an efficient SFT algorithm for MCQs, termed
+Point-wise Intelligent Feedback (PIF). PIF constructs negative instances by
+randomly combining the incorrect option contents with all candidate symbols,
+and proposes a point-wise loss to provide feedback on these negative samples
+into LLMs. Our experimental results demonstrate that PIF significantly reduces
+the model's selection bias by improving its MCSB capability. Remarkably, PIF
+exhibits a substantial enhancement in the accuracy for MCQs.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accept at ACL2024 Main</span>
+                                        </div>
+                                </details>
+                            </article>
+                    </div>
+                </details>
+            </article>
+            <article>
+                <details>
+                    <Summary>
+                        Computer Vision and Pattern Recognition <span class="chip" style="font-size: 60%">150</span>
+                    </Summary>
+                    <div class="details-content">
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Stereo-Depth Fusion through Virtual Pattern Projection <span class="chip">ICCV 2023</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.04345v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.04345v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Luca Bartolomei, Matteo Poggi, Fabio Tosi, Andrea Conti, Stefano Mattoccia
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  This paper presents a novel general-purpose stereo and depth data fusion
+paradigm that mimics the active stereo principle by replacing the unreliable
+physical pattern projector with a depth sensor. It works by projecting virtual
+patterns consistent with the scene geometry onto the left and right images
+acquired by a conventional stereo camera, using the sparse hints obtained from
+a depth sensor, to facilitate the visual correspondence. Purposely, any depth
+sensing device can be seamlessly plugged into our framework, enabling the
+deployment of a virtual active stereo setup in any possible environment and
+overcoming the severe limitations of physical pattern projection, such as the
+limited working range and environmental conditions. Exhaustive experiments on
+indoor and outdoor datasets featuring both long and close range, including
+those providing raw, unfiltered depth hints from off-the-shelf depth sensors,
+highlight the effectiveness of our approach in notably boosting the robustness
+and accuracy of algorithms and deep stereo without any code modification and
+even without re-training. Additionally, we assess the performance of our
+strategy on active stereo evaluation datasets with conventional pattern
+projection. Indeed, in all these scenarios, our virtual pattern projection
+paradigm achieves state-of-the-art performance. The source code is available
+at: https://github.com/bartn8/vppstereo.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>extended version of ICCV 2023: "Active Stereo Without Pattern
+  Projector"</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Learning 1D Causal Visual Representation with De-focus Attention
+  Networks 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.04342v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.04342v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Chenxin Tao, Xizhou Zhu, Shiqian Su, Lewei Lu, Changyao Tian, Xuan Luo, Gao Huang, Hongsheng Li, Yu Qiao, Jie Zhou, Jifeng Dai
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Modality differences have led to the development of heterogeneous
+architectures for vision and language models. While images typically require 2D
+non-causal modeling, texts utilize 1D causal modeling. This distinction poses
+significant challenges in constructing unified multi-modal models. This paper
+explores the feasibility of representing images using 1D causal modeling. We
+identify an "over-focus" issue in existing 1D causal vision models, where
+attention overly concentrates on a small proportion of visual tokens. The issue
+of "over-focus" hinders the model's ability to extract diverse visual features
+and to receive effective gradients for optimization. To address this, we
+propose De-focus Attention Networks, which employ learnable bandpass filters to
+create varied attention patterns. During training, large and scheduled drop
+path rates, and an auxiliary loss on globally pooled features for global
+understanding tasks are introduced. These two strategies encourage the model to
+attend to a broader range of tokens and enhance network optimization. Extensive
+experiments validate the efficacy of our approach, demonstrating that 1D causal
+visual representation can perform comparably to 2D non-causal representation in
+tasks such as global perception, dense prediction, and multi-modal
+understanding. Code is released at
+https://github.com/OpenGVLab/De-focus-Attention-Networks.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Flash3D: Feed-Forward Generalisable 3D Scene Reconstruction from a
+  Single Image 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.04343v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.04343v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Stanislaw Szymanowicz, Eldar Insafutdinov, Chuanxia Zheng, Dylan Campbell, João F. Henriques, Christian Rupprecht, Andrea Vedaldi
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  In this paper, we propose Flash3D, a method for scene reconstruction and
+novel view synthesis from a single image which is both very generalisable and
+efficient. For generalisability, we start from a "foundation" model for
+monocular depth estimation and extend it to a full 3D shape and appearance
+reconstructor. For efficiency, we base this extension on feed-forward Gaussian
+Splatting. Specifically, we predict a first layer of 3D Gaussians at the
+predicted depth, and then add additional layers of Gaussians that are offset in
+space, allowing the model to complete the reconstruction behind occlusions and
+truncations. Flash3D is very efficient, trainable on a single GPU in a day, and
+thus accessible to most researchers. It achieves state-of-the-art results when
+trained and tested on RealEstate10k. When transferred to unseen datasets like
+NYU it outperforms competitors by a large margin. More impressively, when
+transferred to KITTI, Flash3D achieves better PSNR than methods trained
+specifically on that dataset. In some instances, it even outperforms recent
+methods that use multiple views as input. Code, models, demo, and more results
+are available at https://www.robots.ox.ac.uk/~vgg/research/flash3d/.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Project page: https://www.robots.ox.ac.uk/~vgg/research/flash3d/</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Verbalized Machine Learning: Revisiting Machine Learning with Language
+  Models 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.04344v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.04344v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Tim Z. Xiao, Robert Bamler, Bernhard Schölkopf, Weiyang Liu
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Motivated by the large progress made by large language models (LLMs), we
+introduce the framework of verbalized machine learning (VML). In contrast to
+conventional machine learning models that are typically optimized over a
+continuous parameter space, VML constrains the parameter space to be
+human-interpretable natural language. Such a constraint leads to a new
+perspective of function approximation, where an LLM with a text prompt can be
+viewed as a function parameterized by the text prompt. Guided by this
+perspective, we revisit classical machine learning problems, such as regression
+and classification, and find that these problems can be solved by an
+LLM-parameterized learner and optimizer. The major advantages of VML include
+(1) easy encoding of inductive bias: prior knowledge about the problem and
+hypothesis class can be encoded in natural language and fed into the
+LLM-parameterized learner; (2) automatic model class selection: the optimizer
+can automatically select a concrete model class based on data and verbalized
+prior knowledge, and it can update the model class during training; and (3)
+interpretable learner updates: the LLM-parameterized optimizer can provide
+explanations for why each learner update is performed. We conduct several
+studies to empirically evaluate the effectiveness of VML, and hope that VML can
+serve as a stepping stone to stronger interpretability and trustworthiness in
+ML.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Technical Report v1 (92 pages, 15 figures)</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Interpreting the Second-Order Effects of Neurons in CLIP 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.04341v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.04341v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Yossi Gandelsman, Alexei A. Efros, Jacob Steinhardt
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  We interpret the function of individual neurons in CLIP by automatically
+describing them using text. Analyzing the direct effects (i.e. the flow from a
+neuron through the residual stream to the output) or the indirect effects
+(overall contribution) fails to capture the neurons' function in CLIP.
+Therefore, we present the "second-order lens", analyzing the effect flowing
+from a neuron through the later attention heads, directly to the output. We
+find that these effects are highly selective: for each neuron, the effect is
+significant for <2% of the images. Moreover, each effect can be approximated by
+a single direction in the text-image space of CLIP. We describe neurons by
+decomposing these directions into sparse sets of text representations. The sets
+reveal polysemantic behavior - each neuron corresponds to multiple, often
+unrelated, concepts (e.g. ships and cars). Exploiting this neuron polysemy, we
+mass-produce "semantic" adversarial examples by generating images with concepts
+spuriously correlated to the incorrect class. Additionally, we use the
+second-order effects for zero-shot segmentation and attribute discovery in
+images. Our results indicate that a scalable understanding of neurons can be
+used for model deception and for introducing new model capabilities.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>project page:
+  https://yossigandelsman.github.io/clip_neurons/index.html</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ GLACE: Global Local Accelerated Coordinate Encoding <span class="chip">CVPR
+  2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.04340v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.04340v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Fangjinhua Wang, Xudong Jiang, Silvano Galliani, Christoph Vogel, Marc Pollefeys
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Scene coordinate regression (SCR) methods are a family of visual localization
+methods that directly regress 2D-3D matches for camera pose estimation. They
+are effective in small-scale scenes but face significant challenges in
+large-scale scenes that are further amplified in the absence of ground truth 3D
+point clouds for supervision. Here, the model can only rely on reprojection
+constraints and needs to implicitly triangulate the points. The challenges stem
+from a fundamental dilemma: The network has to be invariant to observations of
+the same landmark at different viewpoints and lighting conditions, etc., but at
+the same time discriminate unrelated but similar observations. The latter
+becomes more relevant and severe in larger scenes. In this work, we tackle this
+problem by introducing the concept of co-visibility to the network. We propose
+GLACE, which integrates pre-trained global and local encodings and enables SCR
+to scale to large scenes with only a single small-sized network. Specifically,
+we propose a novel feature diffusion technique that implicitly groups the
+reprojection constraints with co-visibility and avoids overfitting to trivial
+solutions. Additionally, our position decoder parameterizes the output
+positions for large-scale scenes more effectively. Without using 3D models or
+depth maps for supervision, our method achieves state-of-the-art results on
+large-scale scenes with a low-map-size model. On Cambridge landmarks, with a
+single model, we achieve 17% lower median position error than Poker, the
+ensemble variant of the state-of-the-art SCR method ACE. Code is available at:
+https://github.com/cvg/glace.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Large-scale visual localization with a single optimizable MLP. CVPR
+  2024. Code: https://github.com/cvg/glace. Project page:
+  https://xjiangan.github.io/glace</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Physics3D: Learning Physical Properties of 3D Gaussians via Video
+  Diffusion 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.04338v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.04338v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Fangfu Liu, Hanyang Wang, Shunyu Yao, Shengjun Zhang, Jie Zhou, Yueqi Duan
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  In recent years, there has been rapid development in 3D generation models,
+opening up new possibilities for applications such as simulating the dynamic
+movements of 3D objects and customizing their behaviors. However, current 3D
+generative models tend to focus only on surface features such as color and
+shape, neglecting the inherent physical properties that govern the behavior of
+objects in the real world. To accurately simulate physics-aligned dynamics, it
+is essential to predict the physical properties of materials and incorporate
+them into the behavior prediction process. Nonetheless, predicting the diverse
+materials of real-world objects is still challenging due to the complex nature
+of their physical attributes. In this paper, we propose \textbf{Physics3D}, a
+novel method for learning various physical properties of 3D objects through a
+video diffusion model. Our approach involves designing a highly generalizable
+physical simulation system based on a viscoelastic material model, which
+enables us to simulate a wide range of materials with high-fidelity
+capabilities. Moreover, we distill the physical priors from a video diffusion
+model that contains more understanding of realistic object materials. Extensive
+experiments demonstrate the effectiveness of our method with both elastic and
+plastic materials. Physics3D shows great potential for bridging the gap between
+the physical world and virtual neural space, providing a better integration and
+application of realistic physical principles in virtual environments. Project
+page: https://liuff19.github.io/Physics3D.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Project page: https://liuff19.github.io/Physics3D</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ RoboMamba: Multimodal State Space Model for Efficient Robot Reasoning
+  and Manipulation 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.04339v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.04339v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Jiaming Liu, Mengzhen Liu, Zhenyu Wang, Lily Lee, Kaichen Zhou, Pengju An, Senqiao Yang, Renrui Zhang, Yandong Guo, Shanghang Zhang
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  A fundamental objective in robot manipulation is to enable models to
+comprehend visual scenes and execute actions. Although existing robot
+Multimodal Large Language Models (MLLMs) can handle a range of basic tasks,
+they still face challenges in two areas: 1) inadequate reasoning ability to
+tackle complex tasks, and 2) high computational costs for MLLM fine-tuning and
+inference. The recently proposed state space model (SSM) known as Mamba
+demonstrates promising capabilities in non-trivial sequence modeling with
+linear inference complexity. Inspired by this, we introduce RoboMamba, an
+end-to-end robotic MLLM that leverages the Mamba model to deliver both robotic
+reasoning and action capabilities, while maintaining efficient fine-tuning and
+inference. Specifically, we first integrate the vision encoder with Mamba,
+aligning visual data with language embedding through co-training, empowering
+our model with visual common sense and robot-related reasoning. To further
+equip RoboMamba with action pose prediction abilities, we explore an efficient
+fine-tuning strategy with a simple policy head. We find that once RoboMamba
+possesses sufficient reasoning capability, it can acquire manipulation skills
+with minimal fine-tuning parameters (0.1\% of the model) and time (20 minutes).
+In experiments, RoboMamba demonstrates outstanding reasoning capabilities on
+general and robotic evaluation benchmarks. Meanwhile, our model showcases
+impressive pose prediction results in both simulation and real-world
+experiments, achieving inference speeds 7 times faster than existing robot
+MLLMs. Our project web page: https://sites.google.com/view/robomamba-web
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Coherent Zero-Shot Visual Instruction Generation 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.04337v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.04337v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Quynh Phung, Songwei Ge, Jia-Bin Huang
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Despite the advances in text-to-image synthesis, particularly with diffusion
+models, generating visual instructions that require consistent representation
+and smooth state transitions of objects across sequential steps remains a
+formidable challenge. This paper introduces a simple, training-free framework
+to tackle the issues, capitalizing on the advancements in diffusion models and
+large language models (LLMs). Our approach systematically integrates text
+comprehension and image generation to ensure visual instructions are visually
+appealing and maintain consistency and accuracy throughout the instruction
+sequence. We validate the effectiveness by testing multi-step instructions and
+comparing the text alignment and consistency with several baselines. Our
+experiments show that our approach can visualize coherent and visually pleasing
+instructions
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>https://instruct-vis-zero.github.io/</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ DeepStack: Deeply Stacking Visual Tokens is Surprisingly Simple and
+  Effective for LMMs 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.04334v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.04334v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Lingchen Meng, Jianwei Yang, Rui Tian, Xiyang Dai, Zuxuan Wu, Jianfeng Gao, Yu-Gang Jiang
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Most large multimodal models (LMMs) are implemented by feeding visual tokens
+as a sequence into the first layer of a large language model (LLM). The
+resulting architecture is simple but significantly increases computation and
+memory costs, as it has to handle a large number of additional tokens in its
+input layer. This paper presents a new architecture DeepStack for LMMs.
+Considering $N$ layers in the language and vision transformer of LMMs, we stack
+the visual tokens into $N$ groups and feed each group to its aligned
+transformer layer \textit{from bottom to top}. Surprisingly, this simple method
+greatly enhances the power of LMMs to model interactions among visual tokens
+across layers but with minimal additional cost. We apply DeepStack to both
+language and vision transformer in LMMs, and validate the effectiveness of
+DeepStack LMMs with extensive empirical results. Using the same context length,
+our DeepStack 7B and 13B parameters surpass their counterparts by \textbf{2.7}
+and \textbf{2.9} on average across \textbf{9} benchmarks, respectively. Using
+only one-fifth of the context length, DeepStack rivals closely to the
+counterparts that use the full context length. These gains are particularly
+pronounced on high-resolution tasks, e.g., \textbf{4.2}, \textbf{11.0}, and
+\textbf{4.0} improvements on TextVQA, DocVQA, and InfoVQA compared to
+LLaVA-1.5-7B, respectively. We further apply DeepStack to vision transformer
+layers, which brings us a similar amount of improvements, \textbf{3.8} on
+average compared with LLaVA-1.5-7B.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Project Page: https://deepstack-vl.github.io/</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Coarse-To-Fine Tensor Trains for Compact Visual Representations 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.04332v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.04332v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Sebastian Loeschcke, Dan Wang, Christian Leth-Espensen, Serge Belongie, Michael J. Kastoryano, Sagie Benaim
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  The ability to learn compact, high-quality, and easy-to-optimize
+representations for visual data is paramount to many applications such as novel
+view synthesis and 3D reconstruction. Recent work has shown substantial success
+in using tensor networks to design such compact and high-quality
+representations. However, the ability to optimize tensor-based representations,
+and in particular, the highly compact tensor train representation, is still
+lacking. This has prevented practitioners from deploying the full potential of
+tensor networks for visual data. To this end, we propose 'Prolongation
+Upsampling Tensor Train (PuTT)', a novel method for learning tensor train
+representations in a coarse-to-fine manner. Our method involves the prolonging
+or `upsampling' of a learned tensor train representation, creating a sequence
+of 'coarse-to-fine' tensor trains that are incrementally refined. We evaluate
+our representation along three axes: (1). compression, (2). denoising
+capability, and (3). image completion capability. To assess these axes, we
+consider the tasks of image fitting, 3D fitting, and novel view synthesis,
+where our method shows an improved performance compared to state-of-the-art
+tensor-based methods. For full results see our project webpage:
+https://sebulo.github.io/PuTT_website/
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Project webpage: https://sebulo.github.io/PuTT_website/</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ BitsFusion: 1.99 bits Weight Quantization of Diffusion Model 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.04333v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.04333v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Yang Sui, Yanyu Li, Anil Kag, Yerlan Idelbayev, Junli Cao, Ju Hu, Dhritiman Sagar, Bo Yuan, Sergey Tulyakov, Jian Ren
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Diffusion-based image generation models have achieved great success in recent
+years by showing the capability of synthesizing high-quality content. However,
+these models contain a huge number of parameters, resulting in a significantly
+large model size. Saving and transferring them is a major bottleneck for
+various applications, especially those running on resource-constrained devices.
+In this work, we develop a novel weight quantization method that quantizes the
+UNet from Stable Diffusion v1.5 to 1.99 bits, achieving a model with 7.9X
+smaller size while exhibiting even better generation quality than the original
+one. Our approach includes several novel techniques, such as assigning optimal
+bits to each layer, initializing the quantized model for better performance,
+and improving the training strategy to dramatically reduce quantization error.
+Furthermore, we extensively evaluate our quantized model across various
+benchmark datasets and through human evaluation to demonstrate its superior
+generation quality.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Project Page: https://snap-research.github.io/BitsFusion</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Parameter-Inverted Image Pyramid Networks 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.04330v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.04330v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Xizhou Zhu, Xue Yang, Zhaokai Wang, Hao Li, Wenhan Dou, Junqi Ge, Lewei Lu, Yu Qiao, Jifeng Dai
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Image pyramids are commonly used in modern computer vision tasks to obtain
+multi-scale features for precise understanding of images. However, image
+pyramids process multiple resolutions of images using the same large-scale
+model, which requires significant computational cost. To overcome this issue,
+we propose a novel network architecture known as the Parameter-Inverted Image
+Pyramid Networks (PIIP). Our core idea is to use models with different
+parameter sizes to process different resolution levels of the image pyramid,
+thereby balancing computational efficiency and performance. Specifically, the
+input to PIIP is a set of multi-scale images, where higher resolution images
+are processed by smaller networks. We further propose a feature interaction
+mechanism to allow features of different resolutions to complement each other
+and effectively integrate information from different spatial scales. Extensive
+experiments demonstrate that the PIIP achieves superior performance in tasks
+such as object detection, segmentation, and image classification, compared to
+traditional image pyramid methods and single-branch networks, while reducing
+computational cost. Notably, when applying our method on a large-scale vision
+foundation model InternViT-6B, we improve its performance by 1%-2% on detection
+and segmentation with only 40%-60% of the original computation. These results
+validate the effectiveness of the PIIP approach and provide a new technical
+direction for future vision computing tasks. Our code and models are available
+at https://github.com/OpenGVLab/PIIP.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Share<span class="highlight-title">GPT</span>4Video: Improving Video Understanding and Generation with Better
+  Captions 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.04325v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.04325v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Lin Chen, Xilin Wei, Jinsong Li, Xiaoyi Dong, Pan Zhang, Yuhang Zang, Zehui Chen, Haodong Duan, Bin Lin, Zhenyu Tang, Li Yuan, Yu Qiao, Dahua Lin, Feng Zhao, Jiaqi Wang
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  We present the ShareGPT4Video series, aiming to facilitate the video
+understanding of large video-language models (LVLMs) and the video generation
+of text-to-video models (T2VMs) via dense and precise captions. The series
+comprises: 1) ShareGPT4Video, 40K GPT4V annotated dense captions of videos with
+various lengths and sources, developed through carefully designed data
+filtering and annotating strategy. 2) ShareCaptioner-Video, an efficient and
+capable captioning model for arbitrary videos, with 4.8M high-quality aesthetic
+videos annotated by it. 3) ShareGPT4Video-8B, a simple yet superb LVLM that
+reached SOTA performance on three advancing video benchmarks. To achieve this,
+taking aside the non-scalable costly human annotators, we find using GPT4V to
+caption video with a naive multi-frame or frame-concatenation input strategy
+leads to less detailed and sometimes temporal-confused results. We argue the
+challenge of designing a high-quality video captioning strategy lies in three
+aspects: 1) Inter-frame precise temporal change understanding. 2) Intra-frame
+detailed content description. 3) Frame-number scalability for arbitrary-length
+videos. To this end, we meticulously designed a differential video captioning
+strategy, which is stable, scalable, and efficient for generating captions for
+videos with arbitrary resolution, aspect ratios, and length. Based on it, we
+construct ShareGPT4Video, which contains 40K high-quality videos spanning a
+wide range of categories, and the resulting captions encompass rich world
+knowledge, object attributes, camera movements, and crucially, detailed and
+precise temporal descriptions of events. Based on ShareGPT4Video, we further
+develop ShareCaptioner-Video, a superior captioner capable of efficiently
+generating high-quality captions for arbitrary videos...
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Project Page: https://sharegpt4video.github.io/</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ SF-V: Single Forward Video Generation Model 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.04324v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.04324v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Zhixing Zhang, Yanyu Li, Yushu Wu, Yanwu Xu, Anil Kag, Ivan Skorokhodov, Willi Menapace, Aliaksandr Siarohin, Junli Cao, Dimitris Metaxas, Sergey Tulyakov, Jian Ren
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Diffusion-based video generation models have demonstrated remarkable success
+in obtaining high-fidelity videos through the iterative denoising process.
+However, these models require multiple denoising steps during sampling,
+resulting in high computational costs. In this work, we propose a novel
+approach to obtain single-step video generation models by leveraging
+adversarial training to fine-tune pre-trained video diffusion models. We show
+that, through the adversarial training, the multi-steps video diffusion model,
+i.e., Stable Video Diffusion (SVD), can be trained to perform single forward
+pass to synthesize high-quality videos, capturing both temporal and spatial
+dependencies in the video data. Extensive experiments demonstrate that our
+method achieves competitive generation quality of synthesized videos with
+significantly reduced computational overhead for the denoising process (i.e.,
+around $23\times$ speedup compared with SVD and $6\times$ speedup compared with
+existing works, with even better generation quality), paving the way for
+real-time video synthesis and editing. More visualization results are made
+publicly available at https://snap-research.github.io/SF-V.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Project Page: https://snap-research.github.io/SF-V</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ DIRECT-3D: Learning Direct Text-to-3D Generation on Massive Noisy 3D
+  Data <span class="chip">CVPR 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.04322v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.04322v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Qihao Liu, Yi Zhang, Song Bai, Adam Kortylewski, Alan Yuille
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  We present DIRECT-3D, a diffusion-based 3D generative model for creating
+high-quality 3D assets (represented by Neural Radiance Fields) from text
+prompts. Unlike recent 3D generative models that rely on clean and well-aligned
+3D data, limiting them to single or few-class generation, our model is directly
+trained on extensive noisy and unaligned `in-the-wild' 3D assets, mitigating
+the key challenge (i.e., data scarcity) in large-scale 3D generation. In
+particular, DIRECT-3D is a tri-plane diffusion model that integrates two
+innovations: 1) A novel learning framework where noisy data are filtered and
+aligned automatically during the training process. Specifically, after an
+initial warm-up phase using a small set of clean data, an iterative
+optimization is introduced in the diffusion process to explicitly estimate the
+3D pose of objects and select beneficial data based on conditional density. 2)
+An efficient 3D representation that is achieved by disentangling object
+geometry and color features with two separate conditional diffusion models that
+are optimized hierarchically. Given a prompt input, our model generates
+high-quality, high-resolution, realistic, and complex 3D objects with accurate
+geometric details in seconds. We achieve state-of-the-art performance in both
+single-class generation and text-to-3D generation. We also demonstrate that
+DIRECT-3D can serve as a useful 3D geometric prior of objects, for example to
+alleviate the well-known Janus problem in 2D-lifting methods such as
+DreamFusion. The code and models are available for research purposes at:
+https://github.com/qihao067/direct3d.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted to CVPR 2024; code: https://github.com/qihao067/direct3d;
+  project page: https://direct-3d.github.io/</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ ATraDiff: Accelerating Online Reinforcement Learning with Imaginary
+  Trajectories <span class="chip">ICML 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.04323v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.04323v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Qianlan Yang, Yu-Xiong Wang
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Training autonomous agents with sparse rewards is a long-standing problem in
+online reinforcement learning (RL), due to low data efficiency. Prior work
+overcomes this challenge by extracting useful knowledge from offline data,
+often accomplished through the learning of action distribution from offline
+data and utilizing the learned distribution to facilitate online RL. However,
+since the offline data are given and fixed, the extracted knowledge is
+inherently limited, making it difficult to generalize to new tasks. We propose
+a novel approach that leverages offline data to learn a generative diffusion
+model, coined as Adaptive Trajectory Diffuser (ATraDiff). This model generates
+synthetic trajectories, serving as a form of data augmentation and consequently
+enhancing the performance of online RL methods. The key strength of our
+diffuser lies in its adaptability, allowing it to effectively handle varying
+trajectory lengths and mitigate distribution shifts between online and offline
+data. Because of its simplicity, ATraDiff seamlessly integrates with a wide
+spectrum of RL methods. Empirical evaluation shows that ATraDiff consistently
+achieves state-of-the-art performance across a variety of environments, with
+particularly pronounced improvements in complicated settings. Our code and demo
+video are available at https://atradiff.github.io .
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>ICML 2024 Accepted</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ VidMuse: A Simple Video-to-Music Generation Framework with
+  Long-Short-Term Modeling 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.04321v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.04321v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Zeyue Tian, Zhaoyang Liu, Ruibin Yuan, Jiahao Pan, Xiaoqiang Huang, Qifeng Liu, Xu Tan, Qifeng Chen, Wei Xue, Yike Guo
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  In this work, we systematically study music generation conditioned solely on
+the video. First, we present a large-scale dataset comprising 190K video-music
+pairs, including various genres such as movie trailers, advertisements, and
+documentaries. Furthermore, we propose VidMuse, a simple framework for
+generating music aligned with video inputs. VidMuse stands out by producing
+high-fidelity music that is both acoustically and semantically aligned with the
+video. By incorporating local and global visual cues, VidMuse enables the
+creation of musically coherent audio tracks that consistently match the video
+content through Long-Short-Term modeling. Through extensive experiments,
+VidMuse outperforms existing models in terms of audio quality, diversity, and
+audio-visual alignment. The code and datasets will be available at
+https://github.com/ZeyueT/VidMuse/.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>The code and datasets will be available at
+  https://github.com/ZeyueT/VidMuse/</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Adaptive Sampling of k-Space in Magnetic Resonance for Rapid Pathology
+  Prediction <span class="chip">ICML 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.04318v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.04318v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Chen-Yu Yen, Raghav Singhal, Umang Sharma, Rajesh Ranganath, Sumit Chopra, Lerrel Pinto
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Magnetic Resonance (MR) imaging, despite its proven diagnostic utility,
+remains an inaccessible imaging modality for disease surveillance at the
+population level. A major factor rendering MR inaccessible is lengthy scan
+times. An MR scanner collects measurements associated with the underlying
+anatomy in the Fourier space, also known as the k-space. Creating a
+high-fidelity image requires collecting large quantities of such measurements,
+increasing the scan time. Traditionally to accelerate an MR scan, image
+reconstruction from under-sampled k-space data is the method of choice.
+However, recent works show the feasibility of bypassing image reconstruction
+and directly learning to detect disease directly from a sparser learned subset
+of the k-space measurements. In this work, we propose Adaptive Sampling for MR
+(ASMR), a sampling method that learns an adaptive policy to sequentially select
+k-space samples to optimize for target disease detection. On 6 out of 8
+pathology classification tasks spanning the Knee, Brain, and Prostate MR scans,
+ASMR reaches within 2% of the performance of a fully sampled classifier while
+using only 8% of the k-space, as well as outperforming prior state-of-the-art
+work in k-space sampling such as EMRT, LOUPE, and DPS.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>ICML 2024. Project website at https://adaptive-sampling-mr.github.io</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Omni6DPose: A Benchmark and Model for Universal 6D Object Pose
+  Estimation and Tracking 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.04316v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.04316v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Jiyao Zhang, Weiyao Huang, Bo Peng, Mingdong Wu, Fei Hu, Zijian Chen, Bo Zhao, Hao Dong
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  6D Object Pose Estimation is a crucial yet challenging task in computer
+vision, suffering from a significant lack of large-scale datasets. This
+scarcity impedes comprehensive evaluation of model performance, limiting
+research advancements. Furthermore, the restricted number of available
+instances or categories curtails its applications. To address these issues,
+this paper introduces Omni6DPose, a substantial dataset characterized by its
+diversity in object categories, large scale, and variety in object materials.
+Omni6DPose is divided into three main components: ROPE (Real 6D Object Pose
+Estimation Dataset), which includes 332K images annotated with over 1.5M
+annotations across 581 instances in 149 categories; SOPE(Simulated 6D Object
+Pose Estimation Dataset), consisting of 475K images created in a mixed reality
+setting with depth simulation, annotated with over 5M annotations across 4162
+instances in the same 149 categories; and the manually aligned real scanned
+objects used in both ROPE and SOPE. Omni6DPose is inherently challenging due to
+the substantial variations and ambiguities. To address this challenge, we
+introduce GenPose++, an enhanced version of the SOTA category-level pose
+estimation framework, incorporating two pivotal improvements: Semantic-aware
+feature extraction and Clustering-based aggregation. Moreover, we provide a
+comprehensive benchmarking analysis to evaluate the performance of previous
+methods on this large-scale dataset in the realms of 6D object pose estimation
+and pose tracking.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Step-aware Preference Optimization: Aligning Preference with Denoising
+  Performance at Each Step 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.04314v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.04314v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Zhanhao Liang, Yuhui Yuan, Shuyang Gu, Bohan Chen, Tiankai Hang, Ji Li, Liang Zheng
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Recently, Direct Preference Optimization (DPO) has extended its success from
+aligning large language models (LLMs) to aligning text-to-image diffusion
+models with human preferences. Unlike most existing DPO methods that assume all
+diffusion steps share a consistent preference order with the final generated
+images, we argue that this assumption neglects step-specific denoising
+performance and that preference labels should be tailored to each step's
+contribution. To address this limitation, we propose Step-aware Preference
+Optimization (SPO), a novel post-training approach that independently evaluates
+and adjusts the denoising performance at each step, using a step-aware
+preference model and a step-wise resampler to ensure accurate step-aware
+supervision. Specifically, at each denoising step, we sample a pool of images,
+find a suitable win-lose pair, and, most importantly, randomly select a single
+image from the pool to initialize the next denoising step. This step-wise
+resampler process ensures the next win-lose image pair comes from the same
+image, making the win-lose comparison independent of the previous step. To
+assess the preferences at each step, we train a separate step-aware preference
+model that can be applied to both noisy and clean images. Our experiments with
+Stable Diffusion v1.5 and SDXL demonstrate that SPO significantly outperforms
+the latest Diffusion-DPO in aligning generated images with complex, detailed
+prompts and enhancing aesthetics, while also achieving more than 20x times
+faster in training efficiency. Code and model:
+https://rockeycoss.github.io/spo.github.io/
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Improving Alignment and Robustness with Short Circuiting 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.04313v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.04313v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Andy Zou, Long Phan, Justin Wang, Derek Duenas, Maxwell Lin, Maksym Andriushchenko, Rowan Wang, Zico Kolter, Matt Fredrikson, Dan Hendrycks
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  AI systems can take harmful actions and are highly vulnerable to adversarial
+attacks. We present an approach, inspired by recent advances in representation
+engineering, that "short-circuits" models as they respond with harmful outputs.
+Existing techniques aimed at improving alignment, such as refusal training, are
+often bypassed. Techniques such as adversarial training try to plug these holes
+by countering specific attacks. As an alternative to refusal training and
+adversarial training, short-circuiting directly controls the representations
+that are responsible for harmful outputs in the first place. Our technique can
+be applied to both text-only and multimodal language models to prevent the
+generation of harmful outputs without sacrificing utility -- even in the
+presence of powerful unseen attacks. Notably, while adversarial robustness in
+standalone image recognition remains an open challenge, short-circuiting allows
+the larger multimodal system to reliably withstand image "hijacks" that aim to
+produce harmful content. Finally, we extend our approach to AI agents,
+demonstrating considerable reductions in the rate of harmful actions when they
+are under attack. Our approach represents a significant step forward in the
+development of reliable safeguards to harmful behavior and adversarial attacks.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ ReNO: Enhancing One-step Text-to-Image Models through Reward-based Noise
+  Optimization 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.04312v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.04312v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Luca Eyring, Shyamgopal Karthik, Karsten Roth, Alexey Dosovitskiy, Zeynep Akata
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Text-to-Image (T2I) models have made significant advancements in recent
+years, but they still struggle to accurately capture intricate details
+specified in complex compositional prompts. While fine-tuning T2I models with
+reward objectives has shown promise, it suffers from "reward hacking" and may
+not generalize well to unseen prompt distributions. In this work, we propose
+Reward-based Noise Optimization (ReNO), a novel approach that enhances T2I
+models at inference by optimizing the initial noise based on the signal from
+one or multiple human preference reward models. Remarkably, solving this
+optimization problem with gradient ascent for 50 iterations yields impressive
+results on four different one-step models across two competitive benchmarks,
+T2I-CompBench and GenEval. Within a computational budget of 20-50 seconds,
+ReNO-enhanced one-step models consistently surpass the performance of all
+current open-source Text-to-Image models. Extensive user studies demonstrate
+that our model is preferred nearly twice as often compared to the popular SDXL
+model and is on par with the proprietary Stable Diffusion 3 with 8B parameters.
+Moreover, given the same computational resources, a ReNO-optimized one-step
+model outperforms widely-used open-source models such as SDXL and
+PixArt-$\alpha$, highlighting the efficiency and effectiveness of ReNO in
+enhancing T2I model performance at inference time. Code is available at
+https://github.com/ExplainableML/ReNO.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Preprint</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ ReFiNe: Recursive Field Networks for Cross-modal Multi-scene
+  Representation <span class="chip">SIGGRAPH 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.04309v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.04309v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Sergey Zakharov, Katherine Liu, Adrien Gaidon, Rares Ambrus
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  The common trade-offs of state-of-the-art methods for multi-shape
+representation (a single model "packing" multiple objects) involve trading
+modeling accuracy against memory and storage. We show how to encode multiple
+shapes represented as continuous neural fields with a higher degree of
+precision than previously possible and with low memory usage. Key to our
+approach is a recursive hierarchical formulation that exploits object
+self-similarity, leading to a highly compressed and efficient shape latent
+space. Thanks to the recursive formulation, our method supports spatial and
+global-to-local latent feature fusion without needing to initialize and
+maintain auxiliary data structures, while still allowing for continuous field
+queries to enable applications such as raytracing. In experiments on a set of
+diverse datasets, we provide compelling qualitative results and demonstrate
+state-of-the-art multi-scene reconstruction and compression results with a
+single network per dataset.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>SIGGRAPH 2024. Project Page:
+  https://zakharos.github.io/projects/refine/</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Vision-LSTM: xLSTM as Generic Vision Backbone 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.04303v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.04303v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Benedikt Alkin, Maximilian Beck, Korbinian Pöppel, Sepp Hochreiter, Johannes Brandstetter
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Transformers are widely used as generic backbones in computer vision, despite
+initially introduced for natural language processing. Recently, the Long
+Short-Term Memory (LSTM) has been extended to a scalable and performant
+architecture - the xLSTM - which overcomes long-standing LSTM limitations via
+exponential gating and parallelizable matrix memory structure. In this report,
+we introduce Vision-LSTM (ViL), an adaption of the xLSTM building blocks to
+computer vision. ViL comprises a stack of xLSTM blocks where odd blocks process
+the sequence of patch tokens from top to bottom while even blocks go from
+bottom to top. Experiments show that ViL holds promise to be further deployed
+as new generic backbone for computer vision architectures.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Neural Surface Reconstruction from Sparse Views Using Epipolar Geometry 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.04301v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.04301v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Kaichen Zhou
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  This paper addresses the challenge of reconstructing surfaces from sparse
+view inputs, where ambiguity and occlusions due to missing information pose
+significant hurdles. We present a novel approach, named EpiS, that incorporates
+Epipolar information into the reconstruction process. Existing methods in
+sparse-view neural surface learning have mainly focused on mean and variance
+considerations using cost volumes for feature extraction. In contrast, our
+method aggregates coarse information from the cost volume into Epipolar
+features extracted from multiple source views, enabling the generation of
+fine-grained Signal Distance Function (SDF)-aware features. Additionally, we
+employ an attention mechanism along the line dimension to facilitate feature
+fusion based on the SDF feature. Furthermore, to address the information gaps
+in sparse conditions, we integrate depth information from monocular depth
+estimation using global and local regularization techniques. The global
+regularization utilizes a triplet loss function, while the local regularization
+employs a derivative loss function. Extensive experiments demonstrate that our
+approach outperforms state-of-the-art methods, especially in cases with sparse
+and generalizable conditions.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Everything to the Synthetic: Diffusion-driven Test-time Adaptation via
+  Synthetic-Domain Alignment 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.04295v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.04295v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Jiayi Guo, Junhao Zhao, Chunjiang Ge, Chaoqun Du, Zanlin Ni, Shiji Song, Humphrey Shi, Gao Huang
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Test-time adaptation (TTA) aims to enhance the performance of source-domain
+pretrained models when tested on unknown shifted target domains. Traditional
+TTA methods primarily adapt model weights based on target data streams, making
+model performance sensitive to the amount and order of target data. Recently,
+diffusion-driven TTA methods have demonstrated strong performance by using an
+unconditional diffusion model, which is also trained on the source domain to
+transform target data into synthetic data as a source domain projection. This
+allows the source model to make predictions without weight adaptation. In this
+paper, we argue that the domains of the source model and the synthetic data in
+diffusion-driven TTA methods are not aligned. To adapt the source model to the
+synthetic domain of the unconditional diffusion model, we introduce a
+Synthetic-Domain Alignment (SDA) framework to fine-tune the source model with
+synthetic data. Specifically, we first employ a conditional diffusion model to
+generate labeled samples, creating a synthetic dataset. Subsequently, we use
+the aforementioned unconditional diffusion model to add noise to and denoise
+each sample before fine-tuning. This process mitigates the potential domain gap
+between the conditional and unconditional models. Extensive experiments across
+various models and benchmarks demonstrate that SDA achieves superior domain
+alignment and consistently outperforms existing diffusion-driven TTA methods.
+Our code is available at
+https://github.com/SHI-Labs/Diffusion-Driven-Test-Time-Adaptation-via-Synthetic-Domain-Alignment.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>GitHub:
+  https://github.com/SHI-Labs/Diffusion-Driven-Test-Time-Adaptation-via-Synthetic-Domain-Alignment</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ VISTA: Visualized Text Embedding For Universal Multi-Modal Retrieval <span class="chip">ACL 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.04292v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.04292v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Junjie Zhou, Zheng Liu, Shitao Xiao, Bo Zhao, Yongping Xiong
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Multi-modal retrieval becomes increasingly popular in practice. However, the
+existing retrievers are mostly text-oriented, which lack the capability to
+process visual information. Despite the presence of vision-language models like
+CLIP, the current methods are severely limited in representing the text-only
+and image-only data. In this work, we present a new embedding model VISTA for
+universal multi-modal retrieval. Our work brings forth threefold technical
+contributions. Firstly, we introduce a flexible architecture which extends a
+powerful text encoder with the image understanding capability by introducing
+visual token embeddings. Secondly, we develop two data generation strategies,
+which bring high-quality composed image-text to facilitate the training of the
+embedding model. Thirdly, we introduce a multi-stage training algorithm, which
+first aligns the visual token embedding with the text encoder using massive
+weakly labeled data, and then develops multi-modal representation capability
+using the generated composed image-text data. In our experiments, VISTA
+achieves superior performances across a variety of multi-modal retrieval tasks
+in both zero-shot and supervised settings. Our model, data, and source code are
+available at https://github.com/FlagOpen/FlagEmbedding.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted to ACL 2024 main conference</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ SpectralZoom: Efficient Segmentation with an Adaptive Hyperspectral
+  Camera 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.04287v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.04287v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Jackson Arnold, Sophia Rossi, Chloe Petrosino, Ethan Mitchell, Sanjeev J. Koppal
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Hyperspectral image segmentation is crucial for many fields such as
+agriculture, remote sensing, biomedical imaging, battlefield sensing and
+astronomy. However, the challenge of hyper and multi spectral imaging is its
+large data footprint. We propose both a novel camera design and a vision
+transformer-based (ViT) algorithm that alleviate both the captured data
+footprint and the computational load for hyperspectral segmentation. Our camera
+is able to adaptively sample image regions or patches at different resolutions,
+instead of capturing the entire hyperspectral cube at one high resolution. Our
+segmentation algorithm works in concert with the camera, applying ViT-based
+segmentation only to adaptively selected patches. We show results both in
+simulation and on a real hardware platform demonstrating both accurate
+segmentation results and reduced computational burden.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ xMIL: Insightful Explanations for Multiple Instance Learning in
+  Histopathology 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.04280v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.04280v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Julius Hense, Mina Jamshidi Idaji, Oliver Eberle, Thomas Schnake, Jonas Dippel, Laure Ciernik, Oliver Buchstab, Andreas Mock, Frederick Klauschen, Klaus-Robert Müller
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Multiple instance learning (MIL) is an effective and widely used approach for
+weakly supervised machine learning. In histopathology, MIL models have achieved
+remarkable success in tasks like tumor detection, biomarker prediction, and
+outcome prognostication. However, MIL explanation methods are still lagging
+behind, as they are limited to small bag sizes or disregard instance
+interactions. We revisit MIL through the lens of explainable AI (XAI) and
+introduce xMIL, a refined framework with more general assumptions. We
+demonstrate how to obtain improved MIL explanations using layer-wise relevance
+propagation (LRP) and conduct extensive evaluation experiments on three toy
+settings and four real-world histopathology datasets. Our approach consistently
+outperforms previous explanation attempts with particularly improved
+faithfulness scores on challenging biomarker prediction tasks. Finally, we
+showcase how xMIL explanations enable pathologists to extract insights from MIL
+models, representing a significant advance for knowledge discovery and model
+debugging in digital histopathology.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ VideoTetris: Towards Compositional Text-to-Video Generation 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.04277v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.04277v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Ye Tian, Ling Yang, Haotian Yang, Yuan Gao, Yufan Deng, Jingmin Chen, Xintao Wang, Zhaochen Yu, Xin Tao, Pengfei Wan, Di Zhang, Bin Cui
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Diffusion models have demonstrated great success in text-to-video (T2V)
+generation. However, existing methods may face challenges when handling complex
+(long) video generation scenarios that involve multiple objects or dynamic
+changes in object numbers. To address these limitations, we propose
+VideoTetris, a novel framework that enables compositional T2V generation.
+Specifically, we propose spatio-temporal compositional diffusion to precisely
+follow complex textual semantics by manipulating and composing the attention
+maps of denoising networks spatially and temporally. Moreover, we propose an
+enhanced video data preprocessing to enhance the training data regarding motion
+dynamics and prompt understanding, equipped with a new reference frame
+attention mechanism to improve the consistency of auto-regressive video
+generation. Extensive experiments demonstrate that our VideoTetris achieves
+impressive qualitative and quantitative results in compositional T2V
+generation. Code is available at: https://github.com/YangLing0818/VideoTetris
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Code: https://github.com/YangLing0818/VideoTetris</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ ELFS: Enhancing Label-Free Coreset Selection via Clustering-based
+  Pseudo-Labeling 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.04273v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.04273v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Haizhong Zheng, Elisa Tsai, Yifu Lu, Jiachen Sun, Brian R. Bartoldson, Bhavya Kailkhura, Atul Prakash
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  High-quality human-annotated data is crucial for modern deep learning
+pipelines, yet the human annotation process is both costly and time-consuming.
+Given a constrained human labeling budget, selecting an informative and
+representative data subset for labeling can significantly reduce human
+annotation effort. Well-performing state-of-the-art (SOTA) coreset selection
+methods require ground-truth labels over the whole dataset, failing to reduce
+the human labeling burden. Meanwhile, SOTA label-free coreset selection methods
+deliver inferior performance due to poor geometry-based scores. In this paper,
+we introduce ELFS, a novel label-free coreset selection method. ELFS employs
+deep clustering to estimate data difficulty scores without ground-truth labels.
+Furthermore, ELFS uses a simple but effective double-end pruning method to
+mitigate bias on calculated scores, which further improves the performance on
+selected coresets. We evaluate ELFS on five vision benchmarks and show that
+ELFS consistently outperforms SOTA label-free baselines. For instance, at a 90%
+pruning rate, ELFS surpasses the best-performing baseline by 5.3% on CIFAR10
+and 7.1% on CIFAR100. Moreover, ELFS even achieves comparable performance to
+supervised coreset selection at low pruning rates (e.g., 30% and 50%) on
+CIFAR10 and ImageNet-1K.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ MLVU: A Comprehensive Benchmark for Multi-Task Long Video Understanding 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.04264v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.04264v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Junjie Zhou, Yan Shu, Bo Zhao, Boya Wu, Shitao Xiao, Xi Yang, Yongping Xiong, Bo Zhang, Tiejun Huang, Zheng Liu
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  The evaluation of Long Video Understanding (LVU) performance poses an
+important but challenging research problem. Despite previous efforts, the
+existing video understanding benchmarks are severely constrained by several
+issues, especially the insufficient lengths of videos, a lack of diversity in
+video types and evaluation tasks, and the inappropriateness for evaluating LVU
+performances. To address the above problems, we propose a new benchmark, called
+MLVU (Multi-task Long Video Understanding Benchmark), for the comprehensive and
+in-depth evaluation of LVU. MLVU presents the following critical values: 1) The
+substantial and flexible extension of video lengths, which enables the
+benchmark to evaluate LVU performance across a wide range of durations. 2) The
+inclusion of various video genres, e.g., movies, surveillance footage,
+egocentric videos, cartoons, game videos, etc., which reflects the models' LVU
+performances in different scenarios. 3) The development of diversified
+evaluation tasks, which enables a comprehensive examination of MLLMs' key
+abilities in long-video understanding. The empirical study with 20 latest MLLMs
+reveals significant room for improvement in today's technique, as all existing
+methods struggle with most of the evaluation tasks and exhibit severe
+performance degradation when handling longer videos. Additionally, it suggests
+that factors such as context length, image-understanding quality, and the
+choice of LLM backbone can play critical roles in future advancements. We
+anticipate that MLVU will advance the research of long video understanding by
+providing a comprehensive and in-depth analysis of MLLMs.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ GeoGen: Geometry-Aware Generative Modeling via Signed Distance Functions 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.04254v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.04254v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Salvatore Esposito, Qingshan Xu, Kacper Kania, Charlie Hewitt, Octave Mariotti, Lohit Petikam, Julien Valentin, Arno Onken, Oisin Mac Aodha
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  We introduce a new generative approach for synthesizing 3D geometry and
+images from single-view collections. Most existing approaches predict
+volumetric density to render multi-view consistent images. By employing
+volumetric rendering using neural radiance fields, they inherit a key
+limitation: the generated geometry is noisy and unconstrained, limiting the
+quality and utility of the output meshes. To address this issue, we propose
+GeoGen, a new SDF-based 3D generative model trained in an end-to-end manner.
+Initially, we reinterpret the volumetric density as a Signed Distance Function
+(SDF). This allows us to introduce useful priors to generate valid meshes.
+However, those priors prevent the generative model from learning details,
+limiting the applicability of the method to real-world scenarios. To alleviate
+that problem, we make the transformation learnable and constrain the rendered
+depth map to be consistent with the zero-level set of the SDF. Through the lens
+of adversarial training, we encourage the network to produce higher fidelity
+details on the output meshes. For evaluation, we introduce a synthetic dataset
+of human avatars captured from 360-degree camera angles, to overcome the
+challenges presented by real-world datasets, which often lack 3D consistency
+and do not cover all camera angles. Our experiments on multiple datasets show
+that GeoGen produces visually and quantitatively better geometry than the
+previous generative models based on neural radiance fields.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Computer Vision and Pattern Recognition 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ A <span class="highlight-title">Survey</span> on 3D Human Avatar Modeling -- From Reconstruction to
+  Generation 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.04253v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.04253v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Ruihe Wang, Yukang Cao, Kai Han, Kwan-Yee K. Wong
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  3D modeling has long been an important area in computer vision and computer
+graphics. Recently, thanks to the breakthroughs in neural representations and
+generative models, we witnessed a rapid development of 3D modeling. 3D human
+modeling, lying at the core of many real-world applications, such as gaming and
+animation, has attracted significant attention. Over the past few years, a
+large body of work on creating 3D human avatars has been introduced, forming a
+new and abundant knowledge base for 3D human modeling. The scale of the
+literature makes it difficult for individuals to keep track of all the works.
+This survey aims to provide a comprehensive overview of these emerging
+techniques for 3D human avatar modeling, from both reconstruction and
+generation perspectives. Firstly, we review representative methods for 3D human
+reconstruction, including methods based on pixel-aligned implicit function,
+neural radiance field, and 3D Gaussian Splatting, etc. We then summarize
+representative methods for 3D human generation, especially those using large
+language models like CLIP, diffusion models, and various 3D representations,
+which demonstrate state-of-the-art performance. Finally, we discuss our
+reflection on existing methods and open challenges for 3D human avatar
+modeling, shedding light on future research.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>30 pages, 21 figures</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Localized Gaussian Point Management 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.04251v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.04251v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Haosen Yang, Chenhao Zhang, Wenqing Wang, Marco Volino, Adrian Hilton, Li Zhang, Xiatian Zhu
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Point management is a critical component in optimizing 3D Gaussian Splatting
+(3DGS) models, as the point initiation (e.g., via structure from motion) is
+distributionally inappropriate. Typically, the Adaptive Density Control (ADC)
+algorithm is applied, leveraging view-averaged gradient magnitude thresholding
+for point densification, opacity thresholding for pruning, and regular
+all-points opacity reset. However, we reveal that this strategy is limited in
+tackling intricate/special image regions (e.g., transparent) as it is unable to
+identify all the 3D zones that require point densification, and lacking an
+appropriate mechanism to handle the ill-conditioned points with negative
+impacts (occlusion due to false high opacity). To address these limitations, we
+propose a Localized Point Management (LPM) strategy, capable of identifying
+those error-contributing zones in the highest demand for both point addition
+and geometry calibration. Zone identification is achieved by leveraging the
+underlying multiview geometry constraints, with the guidance of image rendering
+errors. We apply point densification in the identified zone, whilst resetting
+the opacity of those points residing in front of these regions so that a new
+opportunity is created to correct ill-conditioned points. Serving as a
+versatile plugin, LPM can be seamlessly integrated into existing 3D Gaussian
+Splatting models. Experimental evaluation across both static 3D and dynamic 4D
+scenes validate the efficacy of our LPM strategy in boosting a variety of
+existing 3DGS models both quantitatively and qualitatively. Notably, LPM
+improves both vanilla 3DGS and SpaceTimeGS to achieve state-of-the-art
+rendering quality while retaining real-time speeds, outperforming on
+challenging datasets such as Tanks & Temples and the Neural 3D Video Dataset.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Conv-INR: Convolutional Implicit Neural Representation for Multimodal
+  Visual Signals 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.04249v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.04249v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Zhicheng Cai
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Implicit neural representation (INR) has recently emerged as a promising
+paradigm for signal representations. Typically, INR is parameterized by a
+multiplayer perceptron (MLP) which takes the coordinates as the inputs and
+generates corresponding attributes of a signal. However, MLP-based INRs face
+two critical issues: i) individually considering each coordinate while ignoring
+the connections; ii) suffering from the spectral bias thus failing to learn
+high-frequency components. While target visual signals usually exhibit strong
+local structures and neighborhood dependencies, and high-frequency components
+are significant in these signals, the issues harm the representational capacity
+of INRs. This paper proposes Conv-INR, the first INR model fully based on
+convolution. Due to the inherent attributes of convolution, Conv-INR can
+simultaneously consider adjacent coordinates and learn high-frequency
+components effectively. Compared to existing MLP-based INRs, Conv-INR has
+better representational capacity and trainability without requiring primary
+function expansion. We conduct extensive experiments on four tasks, including
+image fitting, CT/MRI reconstruction, and novel view synthesis, Conv-INR all
+significantly surpasses existing MLP-based INRs, validating the effectiveness.
+Finally, we raise three reparameterization methods that can further enhance the
+performance of the vanilla Conv-INR without introducing any extra inference
+cost.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Understanding Information Storage and Transfer in Multi-modal Large
+  Language Models 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.04236v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.04236v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Samyadeep Basu, Martin Grayson, Cecily Morrison, Besmira Nushi, Soheil Feizi, Daniela Massiceti
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Understanding the mechanisms of information storage and transfer in
+Transformer-based models is important for driving model understanding progress.
+Recent work has studied these mechanisms for Large Language Models (LLMs),
+revealing insights on how information is stored in a model's parameters and how
+information flows to and from these parameters in response to specific prompts.
+However, these studies have not yet been extended to Multi-modal Large Language
+Models (MLLMs). Given their expanding capabilities and real-world use, we start
+by studying one aspect of these models -- how MLLMs process information in a
+factual visual question answering task. We use a constraint-based formulation
+which views a visual question as having a set of visual or textual constraints
+that the model's generated answer must satisfy to be correct (e.g. What movie
+directed by the director in this photo has won a Golden Globe?). Under this
+setting, we contribute i) a method that extends causal information tracing from
+pure language to the multi-modal setting, and ii) VQA-Constraints, a test-bed
+of 9.7K visual questions annotated with constraints. We use these tools to
+study two open-source MLLMs, LLaVa and multi-modal Phi-2. Our key findings show
+that these MLLMs rely on MLP and self-attention blocks in much earlier layers
+for information storage, compared to LLMs whose mid-layer MLPs are more
+important. We also show that a consistent small subset of visual tokens output
+by the vision encoder are responsible for transferring information from the
+image to these causal blocks. We validate these mechanisms by introducing
+MultEdit, a model-editing algorithm that can correct errors and insert new
+long-tailed information into MLLMs by targeting these causal blocks.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>20 pages</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ M3LEO: A Multi-Modal, Multi-Label Earth Observation <span class="highlight-title">Dataset</span> Integrating
+  Interferometric SAR and RGB Data 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.04230v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.04230v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Matthew J Allen, Francisco Dorr, Joseph Alejandro Gallego Mejia, Laura Martínez-Ferrer, Anna Jungbluth, Freddie Kalaitzis, Raúl Ramos-Pollán
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Satellite-based remote sensing has revolutionised the way we address global
+challenges in a rapidly evolving world. Huge quantities of Earth Observation
+(EO) data are generated by satellite sensors daily, but processing these large
+datasets for use in ML pipelines is technically and computationally
+challenging. Specifically, different types of EO data are often hosted on a
+variety of platforms, with differing availability for Python preprocessing
+tools. In addition, spatial alignment across data sources and data tiling can
+present significant technical hurdles for novice users. While some preprocessed
+EO datasets exist, their content is often limited to optical or near-optical
+wavelength data, which is ineffective at night or in adverse weather
+conditions. Synthetic Aperture Radar (SAR), an active sensing technique based
+on microwave length radiation, offers a viable alternative. However, the
+application of machine learning to SAR has been limited due to a lack of
+ML-ready data and pipelines, particularly for the full diversity of SAR data,
+including polarimetry, coherence and interferometry. We introduce M3LEO, a
+multi-modal, multi-label EO dataset that includes polarimetric,
+interferometric, and coherence SAR data derived from Sentinel-1, alongside
+Sentinel-2 RGB imagery and a suite of labelled tasks for model evaluation.
+M3LEO spans 17.5TB and contains approximately 10M data chips across six
+geographic regions. The dataset is complemented by a flexible PyTorch Lightning
+framework, with configuration management using Hydra. We provide tools to
+process any dataset available on popular platforms such as Google Earth Engine
+for integration with our framework. Initial experiments validate the utility of
+our data and framework, showing that SAR imagery contains information
+additional to that extractable from RGB data. Data at huggingface.co/M3LEO, and
+code at github.com/spaceml-org/M3LEO.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>9 pages, 2 figures</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ R-CONV: An Analytical Approach for Efficient Data Reconstruction via
+  Convolutional Gradients 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.04227v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.04227v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Tamer Ahmed Eltaras, Qutaibah Malluhi, Alessandro Savino, Stefano Di Carlo, Adnan Qayyum, Junaid Qadir
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  In the effort to learn from extensive collections of distributed data,
+federated learning has emerged as a promising approach for preserving privacy
+by using a gradient-sharing mechanism instead of exchanging raw data. However,
+recent studies show that private training data can be leaked through many
+gradient attacks. While previous analytical-based attacks have successfully
+reconstructed input data from fully connected layers, their effectiveness
+diminishes when applied to convolutional layers. This paper introduces an
+advanced data leakage method to efficiently exploit convolutional layers'
+gradients. We present a surprising finding: even with non-fully invertible
+activation functions, such as ReLU, we can analytically reconstruct training
+samples from the gradients. To the best of our knowledge, this is the first
+analytical approach that successfully reconstructs convolutional layer inputs
+directly from the gradients, bypassing the need to reconstruct layers' outputs.
+Prior research has mainly concentrated on the weight constraints of convolution
+layers, overlooking the significance of gradient constraints. Our findings
+demonstrate that existing analytical methods used to estimate the risk of
+gradient attacks lack accuracy. In some layers, attacks can be launched with
+less than 5% of the reported constraints.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Matching Anything by Segmenting Anything <span class="chip">CVPR 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.04221v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.04221v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Siyuan Li, Lei Ke, Martin Danelljan, Luigi Piccinelli, Mattia Segu, Luc Van Gool, Fisher Yu
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  The robust association of the same objects across video frames in complex
+scenes is crucial for many applications, especially Multiple Object Tracking
+(MOT). Current methods predominantly rely on labeled domain-specific video
+datasets, which limits the cross-domain generalization of learned similarity
+embeddings. We propose MASA, a novel method for robust instance association
+learning, capable of matching any objects within videos across diverse domains
+without tracking labels. Leveraging the rich object segmentation from the
+Segment Anything Model (SAM), MASA learns instance-level correspondence through
+exhaustive data transformations. We treat the SAM outputs as dense object
+region proposals and learn to match those regions from a vast image collection.
+We further design a universal MASA adapter which can work in tandem with
+foundational segmentation or detection models and enable them to track any
+detected objects. Those combinations present strong zero-shot tracking ability
+in complex domains. Extensive tests on multiple challenging MOT and MOTS
+benchmarks indicate that the proposed method, using only unlabeled static
+images, achieves even better performance than state-of-the-art methods trained
+with fully annotated in-domain video sequences, in zero-shot association.
+Project Page: https://matchinganything.github.io/
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>CVPR 2024 Highlight. code at: https://github.com/siyuanliii/masa</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ CDMamba: Remote Sensing Image Change Detection with Mamba 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.04207v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.04207v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Haotian Zhang, Keyan Chen, Chenyang Liu, Hao Chen, Zhengxia Zou, Zhenwei Shi
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Recently, the Mamba architecture based on state space models has demonstrated
+remarkable performance in a series of natural language processing tasks and has
+been rapidly applied to remote sensing change detection (CD) tasks. However,
+most methods enhance the global receptive field by directly modifying the
+scanning mode of Mamba, neglecting the crucial role that local information
+plays in dense prediction tasks (e.g., CD). In this article, we propose a model
+called CDMamba, which effectively combines global and local features for
+handling CD tasks. Specifically, the Scaled Residual ConvMamba (SRCM) block is
+proposed to utilize the ability of Mamba to extract global features and
+convolution to enhance the local details, to alleviate the issue that current
+Mamba-based methods lack detailed clues and are difficult to achieve fine
+detection in dense prediction tasks. Furthermore, considering the
+characteristics of bi-temporal feature interaction required for CD, the
+Adaptive Global Local Guided Fusion (AGLGF) block is proposed to dynamically
+facilitate the bi-temporal interaction guided by other temporal global/local
+features. Our intuition is that more discriminative change features can be
+acquired with the guidance of other temporal features. Extensive experiments on
+three datasets demonstrate that our proposed CDMamba outperforms the current
+state-of-the-art methods. Our code will be open-sourced at
+https://github.com/zmoka-zht/CDMamba.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Diffusion-based image inpainting with internal learning 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.04206v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.04206v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Nicolas Cherel, Andrés Almansa, Yann Gousseau, Alasdair Newson
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Diffusion models are now the undisputed state-of-the-art for image generation
+and image restoration. However, they require large amounts of computational
+power for training and inference. In this paper, we propose lightweight
+diffusion models for image inpainting that can be trained on a single image, or
+a few images. We show that our approach competes with large state-of-the-art
+models in specific cases. We also show that training a model on a single image
+is particularly relevant for image acquisition modality that differ from the
+RGB images of standard learning databases. We show results in three different
+contexts: texture images, line drawing images, and materials BRDF, for which we
+achieve state-of-the-art results in terms of realism, with a computational load
+that is greatly reduced compared to concurrent methods.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>5 pages, 4 figures. EUSIPCO 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Encoding Semantic Priors into the Weights of Implicit Neural
+  Representation <span class="chip">ICME 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.04178v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.04178v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Zhicheng Cai, Qiu Shen
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Implicit neural representation (INR) has recently emerged as a promising
+paradigm for signal representations, which takes coordinates as inputs and
+generates corresponding signal values. Since these coordinates contain no
+semantic features, INR fails to take any semantic information into
+consideration. However, semantic information has been proven critical in many
+vision tasks, especially for visual signal representation. This paper proposes
+a reparameterization method termed as SPW, which encodes the semantic priors to
+the weights of INR, thus making INR contain semantic information implicitly and
+enhancing its representational capacity. Specifically, SPW uses the Semantic
+Neural Network (SNN) to extract both low- and high-level semantic information
+of the target visual signal and generates the semantic vector, which is input
+into the Weight Generation Network (WGN) to generate the weights of INR model.
+Finally, INR uses the generated weights with semantic priors to map the
+coordinates to the signal values. After training, we only retain the generated
+weights while abandoning both SNN and WGN, thus SPW introduces no extra costs
+in inference. Experimental results show that SPW can improve the performance of
+various INR models significantly on various tasks, including image fitting, CT
+reconstruction, MRI reconstruction, and novel view synthesis. Further
+experiments illustrate that model with SPW has lower weight redundancy and
+learns more novel representations, validating the effectiveness of SPW.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>ICME 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ A Voxel-based Approach for Simulating Microbial Decomposition in Soil:
+  Comparison with LBM and Improvement of Morphological Models 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.04177v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.04177v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Mouad Klai, Olivier Monga, Mohamed Soufiane Jouini, Valérie Pot
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  This study presents a new computational approach for simulating the microbial
+decomposition of organic matter, from 3D micro-computed tomography (micro-CT)
+images of soil. The method employs a valuated graph of connected voxels to
+simulate transformation and diffusion processes involved in microbial
+decomposition within the complex soil matrix. The resulting model can be
+adapted to simulate any diffusion-transformation processes in porous media. We
+implemented parallelization strategies and explored different numerical
+methods, including implicit, explicit, synchronous, and asynchronous schemes.
+To validate our method, we compared simulation outputs with those provided by
+LBioS and by Mosaic models. LBioS uses a lattice-Boltzmann method for diffusion
+and Mosaic takes benefit of Pore Network Geometrical Modelling (PNGM) by means
+of geometrical primitives such as spheres and ellipsoids. This approach
+achieved comparable results to traditional LBM-based simulations, but required
+only one-fourth of the computing time. Compared to Mosaic simulation, the
+proposed method is slower but more accurate and does not require any
+calibration. Furthermore, we present a theoretical framework and an application
+example to enhance PNGM-based simulations. This is accomplished by
+approximating the diffusional conductance coefficients using stochastic
+gradient descent and data generated by the current approach.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Preprint submitted to IEEE Access</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Sparse Multi-baseline SAR Cross-modal 3D Reconstruction of Vehicle
+  Targets 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.04158v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.04158v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Da Li, Guoqiang Zhao, Houjun Sun, Jiacheng Bao
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Multi-baseline SAR 3D imaging faces significant challenges due to data
+sparsity. In recent years, deep learning techniques have achieved notable
+success in enhancing the quality of sparse SAR 3D imaging. However, previous
+work typically rely on full-aperture high-resolution radar images to supervise
+the training of deep neural networks (DNNs), utilizing only single-modal
+information from radar data. Consequently, imaging performance is limited, and
+acquiring full-aperture data for multi-baseline SAR is costly and sometimes
+impractical in real-world applications. In this paper, we propose a Cross-Modal
+Reconstruction Network (CMR-Net), which integrates differentiable render and
+cross-modal supervision with optical images to reconstruct highly sparse
+multi-baseline SAR 3D images of vehicle targets into visually structured and
+high-resolution images. We meticulously designed the network architecture and
+training strategies to enhance network generalization capability. Remarkably,
+CMR-Net, trained solely on simulated data, demonstrates high-resolution
+reconstruction capabilities on both publicly available simulation datasets and
+real measured datasets, outperforming traditional sparse reconstruction
+algorithms based on compressed sensing and other learning-based methods.
+Additionally, using optical images as supervision provides a cost-effective way
+to build training datasets, reducing the difficulty of method dissemination.
+Our work showcases the broad prospects of deep learning in multi-baseline SAR
+3D imaging and offers a novel path for researching radar imaging based on
+cross-modal learning theory.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Improving Physics-Augmented Continuum Neural Radiance Field-Based
+  Geometry-Agnostic System Identification with Lagrangian Particle Optimization <span class="chip">CVPR 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.04155v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.04155v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Takuhiro Kaneko
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Geometry-agnostic system identification is a technique for identifying the
+geometry and physical properties of an object from video sequences without any
+geometric assumptions. Recently, physics-augmented continuum neural radiance
+fields (PAC-NeRF) has demonstrated promising results for this technique by
+utilizing a hybrid Eulerian-Lagrangian representation, in which the geometry is
+represented by the Eulerian grid representations of NeRF, the physics is
+described by a material point method (MPM), and they are connected via
+Lagrangian particles. However, a notable limitation of PAC-NeRF is that its
+performance is sensitive to the learning of the geometry from the first frames
+owing to its two-step optimization. First, the grid representations are
+optimized with the first frames of video sequences, and then the physical
+properties are optimized through video sequences utilizing the fixed
+first-frame grid representations. This limitation can be critical when learning
+of the geometric structure is difficult, for example, in a few-shot (sparse
+view) setting. To overcome this limitation, we propose Lagrangian particle
+optimization (LPO), in which the positions and features of particles are
+optimized through video sequences in Lagrangian space. This method allows for
+the optimization of the geometric structure across the entire video sequence
+within the physical constraints imposed by the MPM. The experimental results
+demonstrate that the LPO is useful for geometric correction and physical
+identification in sparse-view settings.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted to CVPR 2024. Project page:
+  https://www.kecl.ntt.co.jp/people/kaneko.takuhiro/projects/lpo/</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Redundancy-aware Action Spaces for Robot Learning 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.04144v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.04144v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Pietro Mazzaglia, Nicholas Backshall, Xiao Ma, Stephen James
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Joint space and task space control are the two dominant action modes for
+controlling robot arms within the robot learning literature. Actions in joint
+space provide precise control over the robot's pose, but tend to suffer from
+inefficient training; actions in task space boast data-efficient training but
+sacrifice the ability to perform tasks in confined spaces due to limited
+control over the full joint configuration. This work analyses the criteria for
+designing action spaces for robot manipulation and introduces ER (End-effector
+Redundancy), a novel action space formulation that, by addressing the
+redundancies present in the manipulator, aims to combine the advantages of both
+joint and task spaces, offering fine-grained comprehensive control with
+overactuated robot arms whilst achieving highly efficient robot learning. We
+present two implementations of ER, ERAngle (ERA) and ERJoint (ERJ), and we show
+that ERJ in particular demonstrates superior performance across multiple
+settings, especially when precise control over the robot configuration is
+required. We validate our results both in simulated and real robotic
+environments.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Published in the RA-L journal</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ The 3D-PC: a benchmark for visual perspective taking in humans and
+  machines 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.04138v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.04138v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Drew Linsley, Peisen Zhou, Alekh Karkada Ashok, Akash Nagaraj, Gaurav Gaonkar, Francis E Lewis, Zygmunt Pizlo, Thomas Serre
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Visual perspective taking (VPT) is the ability to perceive and reason about
+the perspectives of others. It is an essential feature of human intelligence,
+which develops over the first decade of life and requires an ability to process
+the 3D structure of visual scenes. A growing number of reports have indicated
+that deep neural networks (DNNs) become capable of analyzing 3D scenes after
+training on large image datasets. We investigated if this emergent ability for
+3D analysis in DNNs is sufficient for VPT with the 3D perception challenge
+(3D-PC): a novel benchmark for 3D perception in humans and DNNs. The 3D-PC is
+comprised of three 3D-analysis tasks posed within natural scene images: 1. a
+simple test of object depth order, 2. a basic VPT task (VPT-basic), and 3.
+another version of VPT (VPT-Strategy) designed to limit the effectiveness of
+"shortcut" visual strategies. We tested human participants (N=33) and linearly
+probed or text-prompted over 300 DNNs on the challenge and found that nearly
+all of the DNNs approached or exceeded human accuracy in analyzing object depth
+order. Surprisingly, DNN accuracy on this task correlated with their object
+recognition performance. In contrast, there was an extraordinary gap between
+DNNs and humans on VPT-basic. Humans were nearly perfect, whereas most DNNs
+were near chance. Fine-tuning DNNs on VPT-basic brought them close to human
+performance, but they, unlike humans, dropped back to chance when tested on
+VPT-perturb. Our challenge demonstrates that the training routines and
+architectures of today's DNNs are well-suited for learning basic 3D properties
+of scenes and objects but are ill-suited for reasoning about these properties
+like humans do. We release our 3D-PC datasets and code to help bridge this gap
+in 3D perception between humans and machines.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ LenslessFace: An End-to-End Optimized Lensless System for
+  Privacy-Preserving Face Verification 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.04129v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.04129v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Xin Cai, Hailong Zhang, Chenchen Wang, Wentao Liu, Jinwei Gu, Tianfan Xue
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Lensless cameras, innovatively replacing traditional lenses for ultra-thin,
+flat optics, encode light directly onto sensors, producing images that are not
+immediately recognizable. This compact, lightweight, and cost-effective imaging
+solution offers inherent privacy advantages, making it attractive for
+privacy-sensitive applications like face verification. Typical lensless face
+verification adopts a two-stage process of reconstruction followed by
+verification, incurring privacy risks from reconstructed faces and high
+computational costs. This paper presents an end-to-end optimization approach
+for privacy-preserving face verification directly on encoded lensless captures,
+ensuring that the entire software pipeline remains encoded with no visible
+faces as intermediate results. To achieve this, we propose several techniques
+to address unique challenges from the lensless setup which precludes
+traditional face detection and alignment. Specifically, we propose a face
+center alignment scheme, an augmentation curriculum to build robustness against
+variations, and a knowledge distillation method to smooth optimization and
+enhance performance. Evaluations under both simulation and real environment
+demonstrate our method outperforms two-stage lensless verification while
+enhancing privacy and efficiency. Project website:
+\url{lenslessface.github.io}.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>under review</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Global Parameterization-based Texture Space Optimization 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.04115v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.04115v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Wei Chen, Yuxue Ren, Na Lei, Zhongxuan Luo, Xianfeng Gu
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Texture mapping is a common technology in the area of computer graphics, it
+maps the 3D surface space onto the 2D texture space. However, the loose texture
+space will reduce the efficiency of data storage and GPU memory addressing in
+the rendering process. Many of the existing methods focus on repacking given
+textures, but they still suffer from high computational cost and hardly produce
+a wholly tight texture space. In this paper, we propose a method to optimize
+the texture space and produce a new texture mapping which is compact based on
+global parameterization. The proposed method is computationally robust and
+efficient. Experiments show the effectiveness of the proposed method and the
+potency in improving the storage and rendering efficiency.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Preprint submitted to Comput. Math. Math. Phys</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ UrbanSARFloods: Sentinel-1 SLC-Based Benchmark <span class="highlight-title">Dataset</span> for Urban and
+  Open-Area Flood Mapping <span class="chip">CVPR 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.04111v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.04111v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Jie Zhao, Zhitong Xiong, Xiao Xiang Zhu
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Due to its cloud-penetrating capability and independence from solar
+illumination, satellite Synthetic Aperture Radar (SAR) is the preferred data
+source for large-scale flood mapping, providing global coverage and including
+various land cover classes. However, most studies on large-scale SAR-derived
+flood mapping using deep learning algorithms have primarily focused on flooded
+open areas, utilizing available open-access datasets (e.g., Sen1Floods11) and
+with limited attention to urban floods. To address this gap, we introduce
+\textbf{UrbanSARFloods}, a floodwater dataset featuring pre-processed
+Sentinel-1 intensity data and interferometric coherence imagery acquired before
+and during flood events. It contains 8,879 $512\times 512$ chips covering
+807,500 $km^2$ across 20 land cover classes and 5 continents, spanning 18 flood
+events. We used UrbanSARFloods to benchmark existing state-of-the-art
+convolutional neural networks (CNNs) for segmenting open and urban flood areas.
+Our findings indicate that prevalent approaches, including the Weighted
+Cross-Entropy (WCE) loss and the application of transfer learning with
+pretrained models, fall short in overcoming the obstacles posed by imbalanced
+data and the constraints of a small training dataset. Urban flood detection
+remains challenging. Future research should explore strategies for addressing
+imbalanced data challenges and investigate transfer learning's potential for
+SAR-based large-scale flood mapping. Besides, expanding this dataset to include
+additional flood events holds promise for enhancing its utility and
+contributing to advancements in flood mapping techniques.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted by CVPR 2024 EarthVision Workshop</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Multistep Distillation of Diffusion Models via Moment Matching 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.04103v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.04103v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Tim Salimans, Thomas Mensink, Jonathan Heek, Emiel Hoogeboom
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  We present a new method for making diffusion models faster to sample. The
+method distills many-step diffusion models into few-step models by matching
+conditional expectations of the clean data given noisy data along the sampling
+trajectory. Our approach extends recently proposed one-step methods to the
+multi-step case, and provides a new perspective by interpreting these
+approaches in terms of moment matching. By using up to 8 sampling steps, we
+obtain distilled models that outperform not only their one-step versions but
+also their original many-step teacher models, obtaining new state-of-the-art
+results on the Imagenet dataset. We also show promising results on a large
+text-to-image model where we achieve fast generation of high resolution images
+directly in image space, without needing autoencoders or upsamplers.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ How Far Can We Compress Instant-NGP-Based NeRF? 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.04101v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.04101v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Yihang Chen, Qianyi Wu, Mehrtash Harandi, Jianfei Cai
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  In recent years, Neural Radiance Field (NeRF) has demonstrated remarkable
+capabilities in representing 3D scenes. To expedite the rendering process,
+learnable explicit representations have been introduced for combination with
+implicit NeRF representation, which however results in a large storage space
+requirement. In this paper, we introduce the Context-based NeRF Compression
+(CNC) framework, which leverages highly efficient context models to provide a
+storage-friendly NeRF representation. Specifically, we excavate both level-wise
+and dimension-wise context dependencies to enable probability prediction for
+information entropy reduction. Additionally, we exploit hash collision and
+occupancy grids as strong prior knowledge for better context modeling. To the
+best of our knowledge, we are the first to construct and exploit context models
+for NeRF compression. We achieve a size reduction of 100$\times$ and 70$\times$
+with improved fidelity against the baseline Instant-NGP on Synthesic-NeRF and
+Tanks and Temples datasets, respectively. Additionally, we attain 86.7\% and
+82.3\% storage size reduction against the SOTA NeRF compression method BiRF.
+Our code is available here: https://github.com/YihangChen-ee/CNC.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Project Page: https://yihangchen-ee.github.io/project_cnc/ Code:
+  https://github.com/yihangchen-ee/cnc/. We further propose a 3DGS compression
+  method HAC, which is based on CNC:
+  https://yihangchen-ee.github.io/project_hac/</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Class-Aware Cartilage Segmentation for Autonomous US-CT Registration in
+  Robotic Intercostal Ultrasound Imaging 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.04100v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.04100v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Zhongliang Jiang, Yunfeng Kang, Yuan Bi, Xuesong Li, Chenyang Li, Nassir Navab
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Ultrasound imaging has been widely used in clinical examinations owing to the
+advantages of being portable, real-time, and radiation-free. Considering the
+potential of extensive deployment of autonomous examination systems in
+hospitals, robotic US imaging has attracted increased attention. However, due
+to the inter-patient variations, it is still challenging to have an optimal
+path for each patient, particularly for thoracic applications with limited
+acoustic windows, e.g., intercostal liver imaging. To address this problem, a
+class-aware cartilage bone segmentation network with geometry-constraint
+post-processing is presented to capture patient-specific rib skeletons. Then, a
+dense skeleton graph-based non-rigid registration is presented to map the
+intercostal scanning path from a generic template to individual patients. By
+explicitly considering the high-acoustic impedance bone structures, the
+transferred scanning path can be precisely located in the intercostal space,
+enhancing the visibility of internal organs by reducing the acoustic shadow. To
+evaluate the proposed approach, the final path mapping performance is validated
+on five distinct CTs and two volunteer US data, resulting in ten pairs of CT-US
+combinations. Results demonstrate that the proposed graph-based registration
+method can robustly and precisely map the path from CT template to individual
+patients (Euclidean error: $2.21\pm1.11~mm$).
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Interpretable Lightweight <span class="highlight-title">Transformer</span> via Unrolling of Learned Graph
+  Smoothness Priors 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.04090v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.04090v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Tam Thuc Do, Parham Eftekhar, Seyed Alireza Hosseini, Gene Cheung, Philip Chou
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  We build interpretable and lightweight transformer-like neural networks by
+unrolling iterative optimization algorithms that minimize graph smoothness
+priors -- the quadratic graph Laplacian regularizer (GLR) and the $\ell_1$-norm
+graph total variation (GTV) -- subject to an interpolation constraint. The
+crucial insight is that a normalized signal-dependent graph learning module
+amounts to a variant of the basic self-attention mechanism in conventional
+transformers. Unlike "black-box" transformers that require learning of large
+key, query and value matrices to compute scaled dot products as affinities and
+subsequent output embeddings, resulting in huge parameter sets, our unrolled
+networks employ shallow CNNs to learn low-dimensional features per node to
+establish pairwise Mahalanobis distances and construct sparse similarity
+graphs. At each layer, given a learned graph, the target interpolated signal is
+simply a low-pass filtered output derived from the minimization of an assumed
+graph smoothness prior, leading to a dramatic reduction in parameter count.
+Experiments for two image interpolation applications verify the restoration
+performance, parameter efficiency and robustness to covariate shift of our
+graph-based unrolled networks compared to conventional transformers.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Semmeldetector: Application of Machine Learning in Commercial Bakeries 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.04050v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.04050v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Thomas H. Schmitt, Maximilian Bundscherer, Tobias Bocklet
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  The Semmeldetector, is a machine learning application that utilizes object
+detection models to detect, classify and count baked goods in images. Our
+application allows commercial bakers to track unsold baked goods, which allows
+them to optimize production and increase resource efficiency. We compiled a
+dataset comprising 1151 images that distinguishes between 18 different types of
+baked goods to train our detection models. To facilitate model training, we
+used a Copy-Paste augmentation pipeline to expand our dataset. We trained the
+state-of-the-art object detection model YOLOv8 on our detection task. We tested
+the impact of different training data, model scale, and online image
+augmentation pipelines on model performance. Our overall best performing model,
+achieved an AP@0.5 of 89.1% on our test set. Based on our results, we conclude
+that machine learning can be a valuable tool even for unforeseen industries
+like bakeries, even with very limited datasets.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Shaping History: Advanced Machine Learning Techniques for the Analysis
+  and Dating of Cuneiform Tablets over Three Millennia 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.04039v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.04039v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Danielle Kapon, Michael Fire, Shai Gordin
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Cuneiform tablets, emerging in ancient Mesopotamia around the late fourth
+millennium BCE, represent one of humanity's earliest writing systems.
+Characterized by wedge-shaped marks on clay tablets, these artifacts provided
+insight into Mesopotamian civilization across various domains. Traditionally,
+the analysis and dating of these tablets rely on subjective assessment of shape
+and writing style, leading to uncertainties in pinpointing their exact temporal
+origins. Recent advances in digitization have revolutionized the study of
+cuneiform by enhancing accessibility and analytical capabilities. Our research
+uniquely focuses on the silhouette of tablets as significant indicators of
+their historical periods, diverging from most studies that concentrate on
+textual content. Utilizing an unprecedented dataset of over 94,000 images from
+the Cuneiform Digital Library Initiative collection, we apply deep learning
+methods to classify cuneiform tablets, covering over 3,000 years of history. By
+leveraging statistical, computational techniques, and generative modeling
+through Variational Auto-Encoders (VAEs), we achieve substantial advancements
+in the automatic classification of these ancient documents, focusing on the
+tablets' silhouettes as key predictors. Our classification approach begins with
+a Decision Tree using height-to-width ratios and culminates with a ResNet50
+model, achieving a 61% macro F1-score for tablet silhouettes. Moreover, we
+introduce novel VAE-powered tools to enhance explainability and enable
+researchers to explore changes in tablet shapes across different eras and
+genres. This research contributes to document analysis and diplomatics by
+demonstrating the value of large-scale data analysis combined with statistical
+methods. These insights offer valuable tools for historians and epigraphists,
+enriching our understanding of cuneiform tablets and the cultures that produced
+them.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>24 pages, 18 figures</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Zero-Painter: Training-Free Layout Control for Text-to-Image Synthesis 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.04032v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.04032v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Marianna Ohanyan, Hayk Manukyan, Zhangyang Wang, Shant Navasardyan, Humphrey Shi
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  We present Zero-Painter, a novel training-free framework for
+layout-conditional text-to-image synthesis that facilitates the creation of
+detailed and controlled imagery from textual prompts. Our method utilizes
+object masks and individual descriptions, coupled with a global text prompt, to
+generate images with high fidelity. Zero-Painter employs a two-stage process
+involving our novel Prompt-Adjusted Cross-Attention (PACA) and Region-Grouped
+Cross-Attention (ReGCA) blocks, ensuring precise alignment of generated objects
+with textual prompts and mask shapes. Our extensive experiments demonstrate
+that Zero-Painter surpasses current state-of-the-art methods in preserving
+textual details and adhering to mask shapes.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Jailbreak Vision Language Models via Bi-Modal Adversarial <span class="highlight-title">Prompt</span> 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.04031v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.04031v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Zonghao Ying, Aishan Liu, Tianyuan Zhang, Zhengmin Yu, Siyuan Liang, Xianglong Liu, Dacheng Tao
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  In the realm of large vision language models (LVLMs), jailbreak attacks serve
+as a red-teaming approach to bypass guardrails and uncover safety implications.
+Existing jailbreaks predominantly focus on the visual modality, perturbing
+solely visual inputs in the prompt for attacks. However, they fall short when
+confronted with aligned models that fuse visual and textual features
+simultaneously for generation. To address this limitation, this paper
+introduces the Bi-Modal Adversarial Prompt Attack (BAP), which executes
+jailbreaks by optimizing textual and visual prompts cohesively. Initially, we
+adversarially embed universally harmful perturbations in an image, guided by a
+few-shot query-agnostic corpus (e.g., affirmative prefixes and negative
+inhibitions). This process ensures that image prompt LVLMs to respond
+positively to any harmful queries. Subsequently, leveraging the adversarial
+image, we optimize textual prompts with specific harmful intent. In particular,
+we utilize a large language model to analyze jailbreak failures and employ
+chain-of-thought reasoning to refine textual prompts through a
+feedback-iteration manner. To validate the efficacy of our approach, we
+conducted extensive evaluations on various datasets and LVLMs, demonstrating
+that our method significantly outperforms other methods by large margins
+(+29.03% in attack success rate on average). Additionally, we showcase the
+potential of our attacks on black-box commercial LVLMs, such as Gemini and
+ChatGLM.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ 3rd Place Solution for PVUW Challenge 2024: Video Panoptic Segmentation 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.04002v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.04002v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Ruipu Wu, Jifei Che, Han Li, Chengjing Wu, Ting Liu, Luoqi Liu
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Video panoptic segmentation is an advanced task that extends panoptic
+segmentation by applying its concept to video sequences. In the hope of
+addressing the challenge of video panoptic segmentation in diverse conditions,
+We utilize DVIS++ as our baseline model and enhance it by introducing a
+comprehensive approach centered on the query-wise ensemble, supplemented by
+additional techniques. Our proposed approach achieved a VPQ score of 57.01 on
+the VIPSeg test set, and ranked 3rd in the VPS track of the 3rd Pixel-level
+Video Understanding in the Wild Challenge.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Unveiling the Dynamics of Information Interplay in Supervised Learning <span class="chip">ICML 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.03999v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.03999v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Kun Song, Zhiquan Tan, Bochao Zou, Huimin Ma, Weiran Huang
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  In this paper, we use matrix information theory as an analytical tool to
+analyze the dynamics of the information interplay between data representations
+and classification head vectors in the supervised learning process.
+Specifically, inspired by the theory of Neural Collapse, we introduce matrix
+mutual information ratio (MIR) and matrix entropy difference ratio (HDR) to
+assess the interactions of data representation and class classification heads
+in supervised learning, and we determine the theoretical optimal values for MIR
+and HDR when Neural Collapse happens. Our experiments show that MIR and HDR can
+effectively explain many phenomena occurring in neural networks, for example,
+the standard supervised training dynamics, linear mode connectivity, and the
+performance of label smoothing and pruning. Additionally, we use MIR and HDR to
+gain insights into the dynamics of grokking, which is an intriguing phenomenon
+observed in supervised training, where the model demonstrates generalization
+capabilities long after it has learned to fit the training data. Furthermore,
+we introduce MIR and HDR as loss terms in supervised and semi-supervised
+learning to optimize the information interactions among samples and
+classification heads. The empirical results provide evidence of the method's
+effectiveness, demonstrating that the utilization of MIR and HDR not only aids
+in comprehending the dynamics throughout the training process but can also
+enhances the training procedure itself.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted by ICML 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ LNQ Challenge 2023: Learning Mediastinal Lymph Node Segmentation with a
+  Probabilistic Lymph Node Atlas 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.03984v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.03984v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Sofija Engelson, Jan Ehrhardt, Timo Kepp, Joshua Niemeijer, Heinz Handels
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  The evaluation of lymph node metastases plays a crucial role in achieving
+precise cancer staging, influencing subsequent decisions regarding treatment
+options. Lymph node detection poses challenges due to the presence of unclear
+boundaries and the diverse range of sizes and morphological characteristics,
+making it a resource-intensive process. As part of the LNQ 2023 MICCAI
+challenge, we propose the use of anatomical priors as a tool to address the
+challenges that persist in mediastinal lymph node segmentation in combination
+with the partial annotation of the challenge training data. The model ensemble
+using all suggested modifications yields a Dice score of 0.6033 and segments
+57% of the ground truth lymph nodes, compared to 27% when training on CT only.
+Segmentation accuracy is improved significantly by incorporating a
+probabilistic lymph node atlas in loss weighting and post-processing. The
+largest performance gains are achieved by oversampling fully annotated data to
+account for the partial annotation of the challenge training data, as well as
+adding additional data augmentation to address the high heterogeneity of the CT
+images and lymph node appearance. Our code is available at
+https://github.com/MICAI-IMI-UzL/LNQ2023.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted for publication at the Journal of Machine Learning for
+  Biomedical Imaging (MELBA) https://melba-journal.org/2024:009</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ LDM-RSIC: Exploring Distortion Prior with Latent Diffusion Models for
+  Remote Sensing Image Compression 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.03961v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.03961v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Junhui Li, Jutao Li, Xingsong Hou, Huake Wang, Yutao Zhang, Yujie Dun, Wenke Sun
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Deep learning-based image compression algorithms typically focus on designing
+encoding and decoding networks and improving the accuracy of entropy model
+estimation to enhance the rate-distortion (RD) performance. However, few
+algorithms leverage the compression distortion prior from existing compression
+algorithms to improve RD performance. In this paper, we propose a latent
+diffusion model-based remote sensing image compression (LDM-RSIC) method, which
+aims to enhance the final decoding quality of RS images by utilizing the
+generated distortion prior from a LDM. Our approach consists of two stages. In
+the first stage, a self-encoder learns prior from the high-quality input image.
+In the second stage, the prior is generated through an LDM, conditioned on the
+decoded image of an existing learning-based image compression algorithm, to be
+used as auxiliary information for generating the texture-rich enhanced image.
+To better utilize the prior, a channel attention and gate-based dynamic feature
+attention module (DFAM) is embedded into a Transformer-based multi-scale
+enhancement network (MEN) for image enhancement. Extensive experiments
+demonstrate the proposed LDM-RSIC significantly outperforms existing
+state-of-the-art traditional and learning-based image compression algorithms in
+terms of both subjective perception and objective metrics. Additionally, we use
+the LDM-based scheme to improve the traditional image compression algorithm
+JPEG2000 and obtain 32.00% bit savings on the DOTA testing set. The code will
+be available at https://github.com/mlkk518/LDM-RSIC.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Vectorized Conditional Neural Fields: A Framework for Solving
+  Time-dependent Parametric Partial Differential Equations <span class="chip">ICML</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.03919v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.03919v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Jan Hagnberger, Marimuthu Kalimuthu, Daniel Musekamp, Mathias Niepert
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Transformer models are increasingly used for solving Partial Differential
+Equations (PDEs). Several adaptations have been proposed, all of which suffer
+from the typical problems of Transformers, such as quadratic memory and time
+complexity. Furthermore, all prevalent architectures for PDE solving lack at
+least one of several desirable properties of an ideal surrogate model, such as
+(i) generalization to PDE parameters not seen during training, (ii) spatial and
+temporal zero-shot super-resolution, (iii) continuous temporal extrapolation,
+(iv) support for 1D, 2D, and 3D PDEs, and (v) efficient inference for longer
+temporal rollouts. To address these limitations, we propose Vectorized
+Conditional Neural Fields (VCNeFs), which represent the solution of
+time-dependent PDEs as neural fields. Contrary to prior methods, however,
+VCNeFs compute, for a set of multiple spatio-temporal query points, their
+solutions in parallel and model their dependencies through attention
+mechanisms. Moreover, VCNeF can condition the neural field on both the initial
+conditions and the parameters of the PDEs. An extensive set of experiments
+demonstrates that VCNeFs are competitive with and often outperform existing
+ML-based surrogate models.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted for publication at the 41st International Conference on
+  Machine Learning (ICML) 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Frequency-based Matcher for Long-tailed Semantic Segmentation 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.03917v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.03917v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Shan Li, Lu Yang, Pu Cao, Liulei Li, Huadong Ma
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  The successful application of semantic segmentation technology in the real
+world has been among the most exciting achievements in the computer vision
+community over the past decade. Although the long-tailed phenomenon has been
+investigated in many fields, e.g., classification and object detection, it has
+not received enough attention in semantic segmentation and has become a
+non-negligible obstacle to applying semantic segmentation technology in
+autonomous driving and virtual reality. Therefore, in this work, we focus on a
+relatively under-explored task setting, long-tailed semantic segmentation
+(LTSS). We first establish three representative datasets from different
+aspects, i.e., scene, object, and human. We further propose a dual-metric
+evaluation system and construct the LTSS benchmark to demonstrate the
+performance of semantic segmentation methods and long-tailed solutions. We also
+propose a transformer-based algorithm to improve LTSS, frequency-based matcher,
+which solves the oversuppression problem by one-to-many matching and
+automatically determines the number of matching queries for each class. Given
+the comprehensiveness of this work and the importance of the issues revealed,
+this work aims to promote the empirical study of semantic segmentation tasks.
+Our datasets, codes, and models will be publicly available.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted for publication as a Regular paper in the IEEE Transactions
+  on Multimedia</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ ArMeme: Propagandistic Content in Arabic Memes 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.03916v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.03916v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Firoj Alam, Abul Hasnat, Fatema Ahmed, Md Arid Hasan, Maram Hasanain
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  With the rise of digital communication, memes have become a significant
+medium for cultural and political expression that is often used to mislead
+audiences. Identification of such misleading and persuasive multimodal content
+has become more important among various stakeholders, including social media
+platforms, policymakers, and the broader society as they often cause harm to
+individuals, organizations, and/or society. While there has been effort to
+develop AI-based automatic systems for resource-rich languages (e.g., English),
+it is relatively little to none for medium to low resource languages. In this
+study, we focused on developing an Arabic memes dataset with manual annotations
+of propagandistic content. We annotated ~6K Arabic memes collected from various
+social media platforms, which is a first resource for Arabic multimodal
+research. We provide a comprehensive analysis aiming to develop computational
+tools for their detection. We will make them publicly available for the
+community.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>disinformation, misinformation, factuality, harmfulness, fake news,
+  propaganda, multimodality, text, images</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Exploring the Zero-Shot Capabilities of Vision-Language Models for
+  Improving Gaze Following <span class="chip">CVPR 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.03907v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.03907v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Anshul Gupta, Pierre Vuillecard, Arya Farkhondeh, Jean-Marc Odobez
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Contextual cues related to a person's pose and interactions with objects and
+other people in the scene can provide valuable information for gaze following.
+While existing methods have focused on dedicated cue extraction methods, in
+this work we investigate the zero-shot capabilities of Vision-Language Models
+(VLMs) for extracting a wide array of contextual cues to improve gaze following
+performance. We first evaluate various VLMs, prompting strategies, and
+in-context learning (ICL) techniques for zero-shot cue recognition performance.
+We then use these insights to extract contextual cues for gaze following, and
+investigate their impact when incorporated into a state of the art model for
+the task. Our analysis indicates that BLIP-2 is the overall top performing VLM
+and that ICL can improve performance. We also observe that VLMs are sensitive
+to the choice of the text prompt although ensembling over multiple text prompts
+can provide more robust performance. Additionally, we discover that using the
+entire image along with an ellipse drawn around the target person is the most
+effective strategy for visual prompting. For gaze following, incorporating the
+extracted cues results in better generalization performance, especially when
+considering a larger set of cues, highlighting the potential of this approach.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted at the GAZE Workshop at CVPR 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Data-Centric Label Smoothing for Explainable Glaucoma Screening from Eye
+  Fundus Images 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.03903v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.03903v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Adrian Galdran, Miguel A. González Ballester
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  As current computing capabilities increase, modern machine learning and
+computer vision system tend to increase in complexity, mostly by means of
+larger models and advanced optimization strategies. Although often neglected,
+in many problems there is also much to be gained by considering potential
+improvements in understanding and better leveraging already-available training
+data, including annotations. This so-called data-centric approach can lead to
+substantial performance increases, sometimes beyond what can be achieved by
+larger models. In this paper we adopt such an approach for the task of
+justifiable glaucoma screening from retinal images. In particular, we focus on
+how to combine information from multiple annotators of different skills into a
+tailored label smoothing scheme that allows us to better employ a large
+collection of fundus images, instead of discarding samples suffering from
+inter-rater variability. Internal validation results indicate that our bespoke
+label smoothing approach surpasses the performance of a standard resnet50 model
+and also the same model trained with conventional label smoothing techniques,
+in particular for the multi-label scenario of predicting clinical reasons of
+glaucoma likelihood in a highly imbalanced screening context. Our code is made
+available at github.com/agaldran/justraigs .
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted to ISBI 2024 (Challenges), 2nd position in the JustRAIGS
+  challenge (https://justraigs.grand-challenge.org/)</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ C^2RV: Cross-Regional and Cross-View Learning for Sparse-View CBCT
+  Reconstruction <span class="chip">CVPR 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.03902v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.03902v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Yiqun Lin, Jiewen Yang, Hualiang Wang, Xinpeng Ding, Wei Zhao, Xiaomeng Li
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Cone beam computed tomography (CBCT) is an important imaging technology
+widely used in medical scenarios, such as diagnosis and preoperative planning.
+Using fewer projection views to reconstruct CT, also known as sparse-view
+reconstruction, can reduce ionizing radiation and further benefit
+interventional radiology. Compared with sparse-view reconstruction for
+traditional parallel/fan-beam CT, CBCT reconstruction is more challenging due
+to the increased dimensionality caused by the measurement process based on
+cone-shaped X-ray beams. As a 2D-to-3D reconstruction problem, although
+implicit neural representations have been introduced to enable efficient
+training, only local features are considered and different views are processed
+equally in previous works, resulting in spatial inconsistency and poor
+performance on complicated anatomies. To this end, we propose C^2RV by
+leveraging explicit multi-scale volumetric representations to enable
+cross-regional learning in the 3D space. Additionally, the scale-view
+cross-attention module is introduced to adaptively aggregate multi-scale and
+multi-view features. Extensive experiments demonstrate that our C^2RV achieves
+consistent and significant improvement over previous state-of-the-art methods
+on datasets with diverse anatomy.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted to CVPR 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Polyp and Surgical Instrument Segmentation with Double Encoder-Decoder
+  Networks 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.03901v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.03901v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Adrian Galdran
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  This paper describes a solution for the MedAI competition, in which
+participants were required to segment both polyps and surgical instruments from
+endoscopic images. Our approach relies on a double encoder-decoder neural
+network which we have previously applied for polyp segmentation, but with a
+series of enhancements: a more powerful encoder architecture, an improved
+optimization procedure, and the post-processing of segmentations based on
+tempered model ensembling. Experimental results show that our method produces
+segmentations that show a good agreement with manual delineations provided by
+medical experts.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Decay Pruning Method: Smooth Pruning With a Self-Rectifying Procedure 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.03879v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.03879v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Minghao Yang, Linlin Gao, Pengyuan Li, Wenbo Li, Yihong Dong, Zhiying Cui
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Current structured pruning methods often result in considerable accuracy
+drops due to abrupt network changes and loss of information from pruned
+structures. To address these issues, we introduce the Decay Pruning Method
+(DPM), a novel smooth pruning approach with a self-rectifying mechanism. DPM
+consists of two key components: (i) Smooth Pruning: It converts conventional
+single-step pruning into multi-step smooth pruning, gradually reducing
+redundant structures to zero over N steps with ongoing optimization. (ii)
+Self-Rectifying: This procedure further enhances the aforementioned process by
+rectifying sub-optimal pruning based on gradient information. Our approach
+demonstrates strong generalizability and can be easily integrated with various
+existing pruning methods. We validate the effectiveness of DPM by integrating
+it with three popular pruning methods: OTOv2, Depgraph, and Gate Decorator.
+Experimental results show consistent improvements in performance compared to
+the original pruning methods, along with further reductions of FLOPs in most
+scenarios.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Bench2Drive: Towards Multi-Ability Benchmarking of Closed-Loop
+  End-To-End Autonomous Driving 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.03877v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.03877v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Xiaosong Jia, Zhenjie Yang, Qifeng Li, Zhiyuan Zhang, Junchi Yan
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  In an era marked by the rapid scaling of foundation models, autonomous
+driving technologies are approaching a transformative threshold where
+end-to-end autonomous driving (E2E-AD) emerges due to its potential of scaling
+up in the data-driven manner. However, existing E2E-AD methods are mostly
+evaluated under the open-loop log-replay manner with L2 errors and collision
+rate as metrics (e.g., in nuScenes), which could not fully reflect the driving
+performance of algorithms as recently acknowledged in the community. For those
+E2E-AD methods evaluated under the closed-loop protocol, they are tested in
+fixed routes (e.g., Town05Long and Longest6 in CARLA) with the driving score as
+metrics, which is known for high variance due to the unsmoothed metric function
+and large randomness in the long route. Besides, these methods usually collect
+their own data for training, which makes algorithm-level fair comparison
+infeasible.
+  To fulfill the paramount need of comprehensive, realistic, and fair testing
+environments for Full Self-Driving (FSD), we present Bench2Drive, the first
+benchmark for evaluating E2E-AD systems' multiple abilities in a closed-loop
+manner. Bench2Drive's official training data consists of 2 million fully
+annotated frames, collected from 10000 short clips uniformly distributed under
+44 interactive scenarios (cut-in, overtaking, detour, etc), 23 weathers (sunny,
+foggy, rainy, etc), and 12 towns (urban, village, university, etc) in CARLA v2.
+Its evaluation protocol requires E2E-AD models to pass 44 interactive scenarios
+under different locations and weathers which sums up to 220 routes and thus
+provides a comprehensive and disentangled assessment about their driving
+capability under different situations. We implement state-of-the-art E2E-AD
+models and evaluate them in Bench2Drive, providing insights regarding current
+status and future directions.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Quantum Implicit Neural Representations 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.03873v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.03873v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Jiaming Zhao, Wenbo Qiao, Peng Zhang, Hui Gao
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Implicit neural representations have emerged as a powerful paradigm to
+represent signals such as images and sounds. This approach aims to utilize
+neural networks to parameterize the implicit function of the signal. However,
+when representing implicit functions, traditional neural networks such as
+ReLU-based multilayer perceptrons face challenges in accurately modeling
+high-frequency components of signals. Recent research has begun to explore the
+use of Fourier Neural Networks (FNNs) to overcome this limitation. In this
+paper, we propose Quantum Implicit Representation Network (QIREN), a novel
+quantum generalization of FNNs. Furthermore, through theoretical analysis, we
+demonstrate that QIREN possesses a quantum advantage over classical FNNs.
+Lastly, we conducted experiments in signal representation, image
+superresolution, and image generation tasks to show the superior performance of
+QIREN compared to state-of-the-art (SOTA) models. Our work not only
+incorporates quantum advantages into implicit neural representations but also
+uncovers a promising application direction for Quantum Neural Networks.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>This paper was accepted by icml 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ LLplace: The 3D Indoor Scene Layout Generation and Editing via Large
+  Language Model 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.03866v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.03866v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Yixuan Yang, Junru Lu, Zixiang Zhao, Zhen Luo, James J. Q. Yu, Victor Sanchez, Feng Zheng
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Designing 3D indoor layouts is a crucial task with significant applications
+in virtual reality, interior design, and automated space planning. Existing
+methods for 3D layout design either rely on diffusion models, which utilize
+spatial relationship priors, or heavily leverage the inferential capabilities
+of proprietary Large Language Models (LLMs), which require extensive prompt
+engineering and in-context exemplars via black-box trials. These methods often
+face limitations in generalization and dynamic scene editing. In this paper, we
+introduce LLplace, a novel 3D indoor scene layout designer based on lightweight
+fine-tuned open-source LLM Llama3. LLplace circumvents the need for spatial
+relationship priors and in-context exemplars, enabling efficient and credible
+room layout generation based solely on user inputs specifying the room type and
+desired objects. We curated a new dialogue dataset based on the 3D-Front
+dataset, expanding the original data volume and incorporating dialogue data for
+adding and removing objects. This dataset can enhance the LLM's spatial
+understanding. Furthermore, through dialogue, LLplace activates the LLM's
+capability to understand 3D layouts and perform dynamic scene editing, enabling
+the addition and removal of objects. Our approach demonstrates that LLplace can
+effectively generate and edit 3D indoor layouts interactively and outperform
+existing methods in delivering high-quality 3D design solutions. Code and
+dataset will be released.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Semantic Similarity Score for Measuring Visual Similarity at Semantic
+  Level 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.03865v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.03865v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Senran Fan, Zhicheng Bao, Chen Dong, Haotai Liang, Xiaodong Xu, Ping Zhang
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Semantic communication, as a revolutionary communication architecture, is
+considered a promising novel communication paradigm. Unlike traditional
+symbol-based error-free communication systems, semantic-based visual
+communication systems extract, compress, transmit, and reconstruct images at
+the semantic level. However, widely used image similarity evaluation metrics,
+whether pixel-based MSE or PSNR or structure-based MS-SSIM, struggle to
+accurately measure the loss of semantic-level information of the source during
+system transmission. This presents challenges in evaluating the performance of
+visual semantic communication systems, especially when comparing them with
+traditional communication systems. To address this, we propose a semantic
+evaluation metric -- SeSS (Semantic Similarity Score), based on Scene Graph
+Generation and graph matching, which shifts the similarity scores between
+images into semantic-level graph matching scores. Meanwhile, semantic
+similarity scores for tens of thousands of image pairs are manually annotated
+to fine-tune the hyperparameters in the graph matching algorithm, aligning the
+metric more closely with human semantic perception. The performance of the SeSS
+is tested on different datasets, including (1)images transmitted by traditional
+and semantic communication systems at different compression rates, (2)images
+transmitted by traditional and semantic communication systems at different
+signal-to-noise ratios, (3)images generated by large-scale model with different
+noise levels introduced, and (4)cases of images subjected to certain special
+transformations. The experiments demonstrate the effectiveness of SeSS,
+indicating that the metric can measure the semantic-level differences in
+semantic-level information of images and can be used for evaluation in visual
+semantic communication systems.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ From operculum and body tail movements to different coupling of physical
+  activity and respiratory frequency in farmed gilthead sea bream and European
+  sea bass. Insights on aquaculture biosensing 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.03859v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.03859v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Miguel A. Ferrer, Josep A. Calduch-Giner, Moises Díaz, Javier Sosa, Enrique Rosell-Moll, Judith Santana Abril, Graciela Santana Sosa, Tomás Bautista Delgado, Cristina Carmona, Juan Antonio Martos-Sitcha, Enric Cabruja, Juan Manuel Afonso, Aurelio Vega, Manuel Lozano, Juan Antonio Montiel-Nelson, Jaume Pérez-Sánchez
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  The AEFishBIT tri-axial accelerometer was externally attached to the
+operculum to assess the divergent activity and respiratory patterns of two
+marine farmed fish, the gilthead sea bream (Sparus aurata) and European sea
+bass (Dicentrarchus labrax). Analysis of raw data from exercised fish
+highlighted the large amplitude of operculum aperture and body tail movements
+in European sea bass, which were overall more stable at low-medium exercise
+intensity levels. Cosinor analysis in free-swimming fish (on-board data
+processing) highlighted a pronounced daily rhythmicity of locomotor activity
+and respiratory frequency in both gilthead sea bream and European sea bass.
+Acrophases of activity and respiration were coupled in gilthead sea bream,
+acting feeding time (once daily at 11:00 h) as a main synchronizing factor. By
+contrast, locomotor activity and respiratory frequency were out of phase in
+European sea bass with activity acrophase on early morning and respiration
+acrophase on the afternoon. The daily range of activity and respiration
+variation was also higher in European sea bass, probably as part of the
+adaptation of this fish species to act as a fast swimming predator. In any
+case, lower locomotor activity and enhanced respiration were associated with
+larger body weight in both fish species. This agrees with the notion that
+selection for fast growth in farming conditions is accompanied by a lower
+activity profile, which may favor an efficient feed conversion for growth
+purposes. Therefore, the use of behavioral monitoring is becoming a reliable
+and large-scale promising tool for selecting more efficient farmed fish,
+allowing researchers and farmers to establish stricter criteria of welfare for
+more sustainable and ethical fish production.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ MuJo: Multimodal Joint Feature Space Learning for Human Activity
+  Recognition 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.03857v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.03857v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Stefan Gerd Fritsch, Cennet Oguz, Vitor Fortes Rey, Lala Ray, Maximilian Kiefer-Emmanouilidis, Paul Lukowicz
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Human Activity Recognition is a longstanding problem in AI with applications
+in a broad range of areas: from healthcare, sports and fitness, security, and
+human computer interaction to robotics. The performance of HAR in real-world
+settings is strongly dependent on the type and quality of the input signal that
+can be acquired. Given an unobstructed, high-quality camera view of a scene,
+computer vision systems, in particular in conjunction with foundational models
+(e.g., CLIP), can today fairly reliably distinguish complex activities. On the
+other hand, recognition using modalities such as wearable sensors (which are
+often more broadly available, e.g, in mobile phones and smartwatches) is a more
+difficult problem, as the signals often contain less information and labeled
+training data is more difficult to acquire. In this work, we show how we can
+improve HAR performance across different modalities using multimodal
+contrastive pretraining. Our approach MuJo (Multimodal Joint Feature Space
+Learning), learns a multimodal joint feature space with video, language, pose,
+and IMU sensor data. The proposed approach combines contrastive and multitask
+learning methods and analyzes different multitasking strategies for learning a
+compact shared representation. A large dataset with parallel video, language,
+pose, and sensor data points is also introduced to support the research, along
+with an analysis of the robustness of the multimodal joint space for
+modal-incomplete and low-resource data. On the MM-Fit dataset, our model
+achieves an impressive Macro F1-Score of up to 0.992 with only 2% of the train
+data and 0.999 when using all available training data for classification tasks.
+Moreover, in the scenario where the MM-Fit dataset is unseen, we demonstrate a
+generalization performance of up to 0.638.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Monocular Localization with Semantics Map for Autonomous Vehicles 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.03835v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.03835v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Jixiang Wan, Xudong Zhang, Shuzhou Dong, Yuwei Zhang, Yuchen Yang, Ruoxi Wu, Ye Jiang, Jijunnan Li, Jinquan Lin, Ming Yang
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Accurate and robust localization remains a significant challenge for
+autonomous vehicles. The cost of sensors and limitations in local computational
+efficiency make it difficult to scale to large commercial applications.
+Traditional vision-based approaches focus on texture features that are
+susceptible to changes in lighting, season, perspective, and appearance.
+Additionally, the large storage size of maps with descriptors and complex
+optimization processes hinder system performance. To balance efficiency and
+accuracy, we propose a novel lightweight visual semantic localization algorithm
+that employs stable semantic features instead of low-level texture features.
+First, semantic maps are constructed offline by detecting semantic objects,
+such as ground markers, lane lines, and poles, using cameras or LiDAR sensors.
+Then, online visual localization is performed through data association of
+semantic features and map objects. We evaluated our proposed localization
+framework in the publicly available KAIST Urban dataset and in scenarios
+recorded by ourselves. The experimental results demonstrate that our method is
+a reliable and practical localization solution in various autonomous driving
+localization tasks.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Amortized Equation Discovery in Hybrid Dynamical Systems <span class="chip">ICML</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.03818v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.03818v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Yongtuo Liu, Sara Magliacane, Miltiadis Kofinas, Efstratios Gavves
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Hybrid dynamical systems are prevalent in science and engineering to express
+complex systems with continuous and discrete states. To learn the laws of
+systems, all previous methods for equation discovery in hybrid systems follow a
+two-stage paradigm, i.e. they first group time series into small cluster
+fragments and then discover equations in each fragment separately through
+methods in non-hybrid systems. Although effective, these methods do not fully
+take advantage of the commonalities in the shared dynamics of multiple
+fragments that are driven by the same equations. Besides, the two-stage
+paradigm breaks the interdependence between categorizing and representing
+dynamics that jointly form hybrid systems. In this paper, we reformulate the
+problem and propose an end-to-end learning framework, i.e. Amortized Equation
+Discovery (AMORE), to jointly categorize modes and discover equations
+characterizing the dynamics of each mode by all segments of the mode.
+Experiments on four hybrid and six non-hybrid systems show that our method
+outperforms previous methods on equation discovery, segmentation, and
+forecasting.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>24 pages, 5 figures, accepted by International Conference on Machine
+  Learning (ICML) 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Enhanced Semantic Segmentation Pipeline for WeatherProof <span class="highlight-title">Dataset</span>
+  Challenge 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.03799v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.03799v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Nan Zhang, Xidan Zhang, Jianing Wei, Fangjun Wang, Zhiming Tan
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  This report describes the winning solution to the WeatherProof Dataset
+Challenge (CVPR 2024 UG2+ Track 3). Details regarding the challenge are
+available at https://cvpr2024ug2challenge.github.io/track3.html. We propose an
+enhanced semantic segmentation pipeline for this challenge. Firstly, we improve
+semantic segmentation models, using backbone pretrained with Depth Anything to
+improve UperNet model and SETRMLA model, and adding language guidance based on
+both weather and category information to InternImage model. Secondly, we
+introduce a new dataset WeatherProofExtra with wider viewing angle and employ
+data augmentation methods, including adverse weather and super-resolution.
+Finally, effective training strategies and ensemble method are applied to
+improve final performance further. Our solution is ranked 1st on the final
+leaderboard. Code will be available at
+https://github.com/KaneiGi/WeatherProofChallenge.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Low-Rank Similarity Mining for Multimodal <span class="highlight-title">Dataset</span> Distillation <span class="chip">ICML 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.03793v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.03793v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Yue Xu, Zhilin Lin, Yusong Qiu, Cewu Lu, Yong-Lu Li
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Though dataset distillation has witnessed rapid development in recent years,
+the distillation of multimodal data, e.g., image-text pairs, poses unique and
+under-explored challenges. Unlike unimodal data, image-text contrastive
+learning (ITC) data lack inherent categorization and should instead place
+greater emphasis on modality correspondence. In this work, we propose Low-Rank
+Similarity Mining (LoRS) for multimodal dataset distillation, that concurrently
+distills a ground truth similarity matrix with image-text pairs, and leverages
+low-rank factorization for efficiency and scalability. The proposed approach
+brings significant improvement to the existing algorithms, marking a
+significant contribution to the field of visual-language dataset distillation.
+We advocate adopting LoRS as a foundational synthetic data setup for image-text
+dataset distillation. Our code is available at
+https://github.com/silicx/LoRS_Distill.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted at ICML 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ XL-HeadTags: Leveraging Multimodal Retrieval Augmentation for the
+  Multilingual Generation of News Headlines and Tags <span class="chip">ACL 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.03776v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.03776v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Faisal Tareque Shohan, Mir Tafseer Nayeem, Samsul Islam, Abu Ubaida Akash, Shafiq Joty
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Millions of news articles published online daily can overwhelm readers.
+Headlines and entity (topic) tags are essential for guiding readers to decide
+if the content is worth their time. While headline generation has been
+extensively studied, tag generation remains largely unexplored, yet it offers
+readers better access to topics of interest. The need for conciseness in
+capturing readers' attention necessitates improved content selection strategies
+for identifying salient and relevant segments within lengthy articles, thereby
+guiding language models effectively. To address this, we propose to leverage
+auxiliary information such as images and captions embedded in the articles to
+retrieve relevant sentences and utilize instruction tuning with variations to
+generate both headlines and tags for news articles in a multilingual context.
+To make use of the auxiliary information, we have compiled a dataset named
+XL-HeadTags, which includes 20 languages across 6 diverse language families.
+Through extensive evaluation, we demonstrate the effectiveness of our
+plug-and-play multimodal-multilingual retrievers for both tasks. Additionally,
+we have developed a suite of tools for processing and evaluating multilingual
+texts, significantly contributing to the research community by enabling more
+accurate and efficient analysis across languages.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>ACL 2024 camera ready</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Instance Segmentation and Teeth Classification in Panoramic X-rays 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.03747v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.03747v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Devichand Budagam, Ayush Kumar, Sayan Ghosh, Anuj Shrivastav, Azamat Zhanatuly Imanbayev, Iskander Rafailovich Akhmetov, Dmitrii Kaplun, Sergey Antonov, Artem Rychenkov, Gleb Cyganov, Aleksandr Sinitca
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Teeth segmentation and recognition are critical in various dental
+applications and dental diagnosis. Automatic and accurate segmentation
+approaches have been made possible by integrating deep learning models.
+Although teeth segmentation has been studied in the past, only some techniques
+were able to effectively classify and segment teeth simultaneously. This
+article offers a pipeline of two deep learning models, U-Net and YOLOv8, which
+results in BB-UNet, a new architecture for the classification and segmentation
+of teeth on panoramic X-rays that is efficient and reliable. We have improved
+the quality and reliability of teeth segmentation by utilising the YOLOv8 and
+U-Net capabilities. The proposed networks have been evaluated using the mean
+average precision (mAP) and dice coefficient for YOLOv8 and BB-UNet,
+respectively. We have achieved a 3\% increase in mAP score for teeth
+classification compared to existing methods, and a 10-15\% increase in dice
+coefficient for teeth segmentation compared to U-Net across different
+categories of teeth. A new Dental dataset was created based on UFBA-UESC
+dataset with Bounding-Box and Polygon annotations of 425 dental panoramic
+X-rays. The findings of this research pave the way for a wider adoption of
+object detection models in the field of dental diagnosis.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>submtted to Expert Systems with Applications Journal</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ ReDistill: Residual Encoded Distillation for Peak Memory Reduction 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.03744v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.03744v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Fang Chen, Gourav Datta, Mujahid Al Rafi, Hyeran Jeon, Meng Tang
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  The expansion of neural network sizes and the enhancement of image resolution
+through modern camera sensors result in heightened memory and power demands for
+neural networks. Reducing peak memory, which is the maximum memory consumed
+during the execution of a neural network, is critical to deploy neural networks
+on edge devices with limited memory budget. A naive approach to reducing peak
+memory is aggressive down-sampling of feature maps via pooling with large
+stride, which often results in unacceptable degradation in network performance.
+To mitigate this problem, we propose residual encoded distillation (ReDistill)
+for peak memory reduction in a teacher-student framework, in which a student
+network with less memory is derived from the teacher network using aggressive
+pooling. We apply our distillation method to multiple problems in computer
+vision including image classification and diffusion based image generation. For
+image classification, our method yields 2x-3.2x measured peak memory on an edge
+GPU with negligible degradation in accuracy for most CNN based architectures.
+Additionally, our method yields improved test accuracy for tiny vision
+transformer (ViT) based models distilled from large CNN based teacher
+architectures. For diffusion-based image generation, our proposed distillation
+method yields a denoising network with 4x lower theoretical peak memory while
+maintaining decent diversity and fidelity for image generation. Experiments
+demonstrate our method's superior performance compared to other feature-based
+and response-based distillation methods.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Evaluating Durability: Benchmark Insights into Multimodal Watermarking 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.03728v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.03728v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Jielin Qiu, William Han, Xuandong Zhao, Shangbang Long, Christos Faloutsos, Lei Li
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  With the development of large models, watermarks are increasingly employed to
+assert copyright, verify authenticity, or monitor content distribution. As
+applications become more multimodal, the utility of watermarking techniques
+becomes even more critical. The effectiveness and reliability of these
+watermarks largely depend on their robustness to various disturbances. However,
+the robustness of these watermarks in real-world scenarios, particularly under
+perturbations and corruption, is not well understood. To highlight the
+significance of robustness in watermarking techniques, our study evaluated the
+robustness of watermarked content generated by image and text generation models
+against common real-world image corruptions and text perturbations. Our results
+could pave the way for the development of more robust watermarking techniques
+in the future. Our project website can be found at
+\url{https://mmwatermark-robustness.github.io/}.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Gear-NeRF: Free-Viewpoint Rendering and Tracking with Motion-aware
+  Spatio-Temporal Sampling <span class="chip">CVPR 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.03723v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.03723v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Xinhang Liu, Yu-Wing Tai, Chi-Keung Tang, Pedro Miraldo, Suhas Lohit, Moitreya Chatterjee
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Extensions of Neural Radiance Fields (NeRFs) to model dynamic scenes have
+enabled their near photo-realistic, free-viewpoint rendering. Although these
+methods have shown some potential in creating immersive experiences, two
+drawbacks limit their ubiquity: (i) a significant reduction in reconstruction
+quality when the computing budget is limited, and (ii) a lack of semantic
+understanding of the underlying scenes. To address these issues, we introduce
+Gear-NeRF, which leverages semantic information from powerful image
+segmentation models. Our approach presents a principled way for learning a
+spatio-temporal (4D) semantic embedding, based on which we introduce the
+concept of gears to allow for stratified modeling of dynamic regions of the
+scene based on the extent of their motion. Such differentiation allows us to
+adjust the spatio-temporal sampling resolution for each region in proportion to
+its motion scale, achieving more photo-realistic dynamic novel view synthesis.
+At the same time, almost for free, our approach enables free-viewpoint tracking
+of objects of interest - a functionality not yet achieved by existing
+NeRF-based methods. Empirical studies validate the effectiveness of our method,
+where we achieve state-of-the-art rendering and tracking performance on
+multiple challenging datasets.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Paper accepted to IEEE/CVF CVPR 2024 (Spotlight). Work done when XL
+  was an intern at MERL. Project Page Link:
+  https://merl.com/research/highlights/gear-nerf</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Attribute-Aware Implicit Modality Alignment for Text Attribute Person
+  Search 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.03721v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.03721v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Xin Wang, Fangfang Liu, Zheng Li, Caili Guo
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Text attribute person search aims to find specific pedestrians through given
+textual attributes, which is very meaningful in the scene of searching for
+designated pedestrians through witness descriptions. The key challenge is the
+significant modality gap between textual attributes and images. Previous
+methods focused on achieving explicit representation and alignment through
+unimodal pre-trained models. Nevertheless, the absence of inter-modality
+correspondence in these models may lead to distortions in the local information
+of intra-modality. Moreover, these methods only considered the alignment of
+inter-modality and ignored the differences between different attribute
+categories. To mitigate the above problems, we propose an Attribute-Aware
+Implicit Modality Alignment (AIMA) framework to learn the correspondence of
+local representations between textual attributes and images and combine global
+representation matching to narrow the modality gap. Firstly, we introduce the
+CLIP model as the backbone and design prompt templates to transform attribute
+combinations into structured sentences. This facilitates the model's ability to
+better understand and match image details. Next, we design a Masked Attribute
+Prediction (MAP) module that predicts the masked attributes after the
+interaction of image and masked textual attribute features through multi-modal
+interaction, thereby achieving implicit local relationship alignment. Finally,
+we propose an Attribute-IoU Guided Intra-Modal Contrastive (A-IoU IMC) loss,
+aligning the distribution of different textual attributes in the embedding
+space with their IoU distribution, achieving better semantic arrangement.
+Extensive experiments on the Market-1501 Attribute, PETA, and PA100K datasets
+show that the performance of our proposed method significantly surpasses the
+current state-of-the-art methods.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ JIGMARK: A Black-Box Approach for Enhancing Image Watermarks against
+  Diffusion Model Edits 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.03720v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.03720v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Minzhou Pan, Yi Zeng, Xue Lin, Ning Yu, Cho-Jui Hsieh, Peter Henderson, Ruoxi Jia
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  In this study, we investigate the vulnerability of image watermarks to
+diffusion-model-based image editing, a challenge exacerbated by the
+computational cost of accessing gradient information and the closed-source
+nature of many diffusion models. To address this issue, we introduce JIGMARK.
+This first-of-its-kind watermarking technique enhances robustness through
+contrastive learning with pairs of images, processed and unprocessed by
+diffusion models, without needing a direct backpropagation of the diffusion
+process. Our evaluation reveals that JIGMARK significantly surpasses existing
+watermarking solutions in resilience to diffusion-model edits, demonstrating a
+True Positive Rate more than triple that of leading baselines at a 1% False
+Positive Rate while preserving image quality. At the same time, it consistently
+improves the robustness against other conventional perturbations (like JPEG,
+blurring, etc.) and malicious watermark attacks over the state-of-the-art,
+often by a large margin. Furthermore, we propose the Human Aligned Variation
+(HAV) score, a new metric that surpasses traditional similarity measures in
+quantifying the number of image derivatives from image editing.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ DSNet: A Novel Way to Use Atrous Convolutions in Semantic Segmentation 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.03702v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.03702v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Zilu Guo, Liuyang Bian, Xuan Huang, Hu Wei, Jingyu Li, Huasheng Ni
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Atrous convolutions are employed as a method to increase the receptive field
+in semantic segmentation tasks. However, in previous works of semantic
+segmentation, it was rarely employed in the shallow layers of the model. We
+revisit the design of atrous convolutions in modern convolutional neural
+networks (CNNs), and demonstrate that the concept of using large kernels to
+apply atrous convolutions could be a more powerful paradigm. We propose three
+guidelines to apply atrous convolutions more efficiently. Following these
+guidelines, we propose DSNet, a Dual-Branch CNN architecture, which
+incorporates atrous convolutions in the shallow layers of the model
+architecture, as well as pretraining the nearly entire encoder on ImageNet to
+achieve better performance. To demonstrate the effectiveness of our approach,
+our models achieve a new state-of-the-art trade-off between accuracy and speed
+on ADE20K, Cityscapes and BDD datasets. Specifically, DSNet achieves 40.0% mIOU
+with inference speed of 179.2 FPS on ADE20K, and 80.4% mIOU with speed of 81.9
+FPS on Cityscapes. Source code and models are available at Github:
+https://github.com/takaniwa/DSNet.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Superpoint Gaussian Splatting for Real-Time High-Fidelity Dynamic Scene
+  Reconstruction <span class="chip">ICML 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.03697v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.03697v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Diwen Wan, Ruijie Lu, Gang Zeng
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Rendering novel view images in dynamic scenes is a crucial yet challenging
+task. Current methods mainly utilize NeRF-based methods to represent the static
+scene and an additional time-variant MLP to model scene deformations, resulting
+in relatively low rendering quality as well as slow inference speed. To tackle
+these challenges, we propose a novel framework named Superpoint Gaussian
+Splatting (SP-GS). Specifically, our framework first employs explicit 3D
+Gaussians to reconstruct the scene and then clusters Gaussians with similar
+properties (e.g., rotation, translation, and location) into superpoints.
+Empowered by these superpoints, our method manages to extend 3D Gaussian
+splatting to dynamic scenes with only a slight increase in computational
+expense. Apart from achieving state-of-the-art visual quality and real-time
+rendering under high resolutions, the superpoint representation provides a
+stronger manipulation capability. Extensive experiments demonstrate the
+practicality and effectiveness of our approach on both synthetic and real-world
+datasets. Please see our project page at
+https://dnvtmf.github.io/SP_GS.github.io.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted by ICML 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Untrained Neural Nets for Snapshot Compressive Imaging: Theory and
+  Algorithms 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.03694v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.03694v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Mengyu Zhao, Xi Chen, Xin Yuan, Shirin Jalali
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Snapshot compressive imaging (SCI) recovers high-dimensional (3D) data cubes
+from a single 2D measurement, enabling diverse applications like video and
+hyperspectral imaging to go beyond standard techniques in terms of acquisition
+speed and efficiency. In this paper, we focus on SCI recovery algorithms that
+employ untrained neural networks (UNNs), such as deep image prior (DIP), to
+model source structure. Such UNN-based methods are appealing as they have the
+potential of avoiding the computationally intensive retraining required for
+different source models and different measurement scenarios. We first develop a
+theoretical framework for characterizing the performance of such UNN-based
+methods. The theoretical framework, on the one hand, enables us to optimize the
+parameters of data-modulating masks, and on the other hand, provides a
+fundamental connection between the number of data frames that can be recovered
+from a single measurement to the parameters of the untrained NN. We also employ
+the recently proposed bagged-deep-image-prior (bagged-DIP) idea to develop SCI
+Bagged Deep Video Prior (SCI-BDVP) algorithms that address the common
+challenges faced by standard UNN solutions. Our experimental results show that
+in video SCI our proposed solution achieves state-of-the-art among UNN methods,
+and in the case of noisy measurements, it even outperforms supervised
+solutions.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Shadow and Light: Digitally Reconstructed Radiographs for Disease
+  Classification 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.03688v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.03688v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Benjamin Hou, Qingqing Zhu, Tejas Sudarshan Mathai, Qiao Jin, Zhiyong Lu, Ronald M. Summers
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  In this paper, we introduce DRR-RATE, a large-scale synthetic chest X-ray
+dataset derived from the recently released CT-RATE dataset. DRR-RATE comprises
+of 50,188 frontal Digitally Reconstructed Radiographs (DRRs) from 21,304 unique
+patients. Each image is paired with a corresponding radiology text report and
+binary labels for 18 pathology classes. Given the controllable nature of DRR
+generation, it facilitates the inclusion of lateral view images and images from
+any desired viewing position. This opens up avenues for research into new and
+novel multimodal applications involving paired CT, X-ray images from various
+views, text, and binary labels. We demonstrate the applicability of DRR-RATE
+alongside existing large-scale chest X-ray resources, notably the CheXpert
+dataset and CheXnet model. Experiments demonstrate that CheXnet, when trained
+and tested on the DRR-RATE dataset, achieves sufficient to high AUC scores for
+the six common pathologies cited in common literature: Atelectasis,
+Cardiomegaly, Consolidation, Lung Lesion, Lung Opacity, and Pleural Effusion.
+Additionally, CheXnet trained on the CheXpert dataset can accurately identify
+several pathologies, even when operating out of distribution. This confirms
+that the generated DRR images effectively capture the essential pathology
+features from CT images. The dataset and labels are publicly accessible at
+https://huggingface.co/datasets/farrell236/DRR-RATE.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Eureka-Moments in <span class="highlight-title">Transformer</span>s: Multi-Step Tasks Reveal Softmax Induced
+  Optimization Problems <span class="chip">ICML 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2310.12956v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2310.12956v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        David T. Hoffmann, Simon Schrodi, Jelena Bratulić, Nadine Behrmann, Volker Fischer, Thomas Brox
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  In this work, we study rapid improvements of the training loss in
+transformers when being confronted with multi-step decision tasks. We found
+that transformers struggle to learn the intermediate task and both training and
+validation loss saturate for hundreds of epochs. When transformers finally
+learn the intermediate task, they do this rapidly and unexpectedly. We call
+these abrupt improvements Eureka-moments, since the transformer appears to
+suddenly learn a previously incomprehensible concept. We designed synthetic
+tasks to study the problem in detail, but the leaps in performance can be
+observed also for language modeling and in-context learning (ICL). We suspect
+that these abrupt transitions are caused by the multi-step nature of these
+tasks. Indeed, we find connections and show that ways to improve on the
+synthetic multi-step tasks can be used to improve the training of language
+modeling and ICL. Using the synthetic data we trace the problem back to the
+Softmax function in the self-attention block of transformers and show ways to
+alleviate the problem. These fixes reduce the required number of training
+steps, lead to higher likelihood to learn the intermediate task, to higher
+final accuracy and training becomes more robust to hyper-parameters.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted at ICML 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Vista: A Generalizable Driving World Model with High Fidelity and
+  Versatile Controllability 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.17398v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.17398v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Shenyuan Gao, Jiazhi Yang, Li Chen, Kashyap Chitta, Yihang Qiu, Andreas Geiger, Jun Zhang, Hongyang Li
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  World models can foresee the outcomes of different actions, which is of
+paramount importance for autonomous driving. Nevertheless, existing driving
+world models still have limitations in generalization to unseen environments,
+prediction fidelity of critical details, and action controllability for
+flexible application. In this paper, we present Vista, a generalizable driving
+world model with high fidelity and versatile controllability. Based on a
+systematic diagnosis of existing methods, we introduce several key ingredients
+to address these limitations. To accurately predict real-world dynamics at high
+resolution, we propose two novel losses to promote the learning of moving
+instances and structural information. We also devise an effective latent
+replacement approach to inject historical frames as priors for coherent
+long-horizon rollouts. For action controllability, we incorporate a versatile
+set of controls from high-level intentions (command, goal point) to low-level
+maneuvers (trajectory, angle, and speed) through an efficient learning
+strategy. After large-scale training, the capabilities of Vista can seamlessly
+generalize to different scenarios. Extensive experiments on multiple datasets
+show that Vista outperforms the most advanced general-purpose video generator
+in over 70% of comparisons and surpasses the best-performing driving world
+model by 55% in FID and 27% in FVD. Moreover, for the first time, we utilize
+the capacity of Vista itself to establish a generalizable reward for real-world
+action evaluation without accessing the ground truth actions.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Code and model: https://github.com/OpenDriveLab/Vista, video demos:
+  https://vista-demo.github.io</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Unleashing Generalization of End-to-End Autonomous Driving with
+  Controllable Long Video Generation 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.01349v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.01349v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Enhui Ma, Lijun Zhou, Tao Tang, Zhan Zhang, Dong Han, Junpeng Jiang, Kun Zhan, Peng Jia, Xianpeng Lang, Haiyang Sun, Di Lin, Kaicheng Yu
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Using generative models to synthesize new data has become a de-facto standard
+in autonomous driving to address the data scarcity issue. Though existing
+approaches are able to boost perception models, we discover that these
+approaches fail to improve the performance of planning of end-to-end autonomous
+driving models as the generated videos are usually less than 8 frames and the
+spatial and temporal inconsistencies are not negligible. To this end, we
+propose Delphi, a novel diffusion-based long video generation method with a
+shared noise modeling mechanism across the multi-views to increase spatial
+consistency, and a feature-aligned module to achieves both precise
+controllability and temporal consistency. Our method can generate up to 40
+frames of video without loss of consistency which is about 5 times longer
+compared with state-of-the-art methods. Instead of randomly generating new
+data, we further design a sampling policy to let Delphi generate new data that
+are similar to those failure cases to improve the sample efficiency. This is
+achieved by building a failure-case driven framework with the help of
+pre-trained visual language models. Our extensive experiment demonstrates that
+our Delphi generates a higher quality of long videos surpassing previous
+state-of-the-art methods. Consequentially, with only generating 4% of the
+training dataset size, our framework is able to go beyond perception and
+prediction tasks, for the first time to the best of our knowledge, boost the
+planning performance of the end-to-end autonomous driving model by a margin of
+25%.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Project Page: https://westlake-autolab.github.io/delphi.github.io/, 8
+  figures</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Wake Vision: A Large-scale, Diverse <span class="highlight-title">Dataset</span> and Benchmark Suite for
+  TinyML Person Detection 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.00892v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.00892v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Colby Banbury, Emil Njor, Matthew Stewart, Pete Warden, Manjunath Kudlur, Nat Jeffries, Xenofon Fafoutis, Vijay Janapa Reddi
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Tiny machine learning (TinyML), which enables machine learning applications
+on extremely low-power devices, suffers from limited size and quality of
+relevant datasets. To address this issue, we introduce Wake Vision, a
+large-scale, diverse dataset tailored for person detection, the canonical task
+for TinyML visual sensing. Wake Vision comprises over 6 million images,
+representing a hundredfold increase compared to the previous standard, and has
+undergone thorough quality filtering. We provide two Wake Vision training sets:
+Wake Vision (Large) and Wake Vision (Quality), a smaller set with
+higher-quality labels. Our results demonstrate that using the Wake Vision
+(Quality) training set produces more accurate models than the Wake Vision
+(Large) training set, strongly suggesting that label quality is more important
+than quantity in our setting. We find use for the large training set for
+pre-training and knowledge distillation. To minimize label errors that can
+obscure true model performance, we manually label the validation and test sets,
+improving the test set error rate from 7.8% in the prior standard to only 2.2%.
+In addition to the dataset, we provide a collection of five detailed benchmark
+sets to facilitate the evaluation of model quality in challenging real world
+scenarios that are often ignored when focusing solely on overall accuracy.
+These novel fine-grained benchmarks assess model performance on specific
+segments of the test data, such as varying lighting conditions, distances from
+the camera, and demographic characteristics of subjects. Our results
+demonstrate that using Wake Vision for training results in a 2.49% increase in
+accuracy compared to the established dataset. We also show the importance of
+dataset quality for low-capacity models and the value of dataset size for
+high-capacity models. wakevision.ai
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ The Revolution of Multimodal Large Language Models: A <span class="highlight-title">Survey</span> <span class="chip">ACL 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2402.12451v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2402.12451v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Davide Caffagni, Federico Cocchi, Luca Barsellotti, Nicholas Moratelli, Sara Sarto, Lorenzo Baraldi, Lorenzo Baraldi, Marcella Cornia, Rita Cucchiara
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Connecting text and visual modalities plays an essential role in generative
+intelligence. For this reason, inspired by the success of large language
+models, significant research efforts are being devoted to the development of
+Multimodal Large Language Models (MLLMs). These models can seamlessly integrate
+visual and textual modalities, while providing a dialogue-based interface and
+instruction-following capabilities. In this paper, we provide a comprehensive
+review of recent visual-based MLLMs, analyzing their architectural choices,
+multimodal alignment strategies, and training techniques. We also conduct a
+detailed analysis of these models across a wide range of tasks, including
+visual grounding, image generation and editing, visual understanding, and
+domain-specific applications. Additionally, we compile and describe training
+datasets and evaluation benchmarks, conducting comparisons among existing
+models in terms of performance and computational requirements. Overall, this
+survey offers a comprehensive overview of the current state of the art, laying
+the groundwork for future MLLMs.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>ACL 2024 (Findings)</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Whole Heart 3D+T Representation Learning Through Sparse 2D Cardiac MR
+  Images 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.00329v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.00329v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Yundi Zhang, Chen Chen, Suprosanna Shit, Sophie Starck, Daniel Rueckert, Jiazhen Pan
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Cardiac Magnetic Resonance (CMR) imaging serves as the gold-standard for
+evaluating cardiac morphology and function. Typically, a multi-view CMR stack,
+covering short-axis (SA) and 2/3/4-chamber long-axis (LA) views, is acquired
+for a thorough cardiac assessment. However, efficiently streamlining the
+complex, high-dimensional 3D+T CMR data and distilling compact, coherent
+representation remains a challenge. In this work, we introduce a whole-heart
+self-supervised learning framework that utilizes masked imaging modeling to
+automatically uncover the correlations between spatial and temporal patches
+throughout the cardiac stacks. This process facilitates the generation of
+meaningful and well-clustered heart representations without relying on the
+traditionally required, and often costly, labeled data. The learned heart
+representation can be directly used for various downstream tasks. Furthermore,
+our method demonstrates remarkable robustness, ensuring consistent
+representations even when certain CMR planes are missing/flawed. We train our
+model on 14,000 unlabeled CMR data from UK BioBank and evaluate it on 1,000
+annotated data. The proposed method demonstrates superior performance to
+baselines in tasks that demand comprehensive 3D+T cardiac information, e.g.
+cardiac phenotype (ejection fraction and ventricle volume) prediction and
+multi-plane/multi-frame CMR segmentation, highlighting its effectiveness in
+extracting comprehensive cardiac features that are both anatomically and
+pathologically relevant.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Collapse-Aware Triplet Decoupling for Adversarially Robust Image
+  Retrieval <span class="chip">ICML2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2312.07364v4">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2312.07364v4.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Qiwei Tian, Chenhao Lin, Zhengyu Zhao, Qian Li, Chao Shen
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Adversarial training has achieved substantial performance in defending image
+retrieval against adversarial examples. However, existing studies in deep
+metric learning (DML) still suffer from two major limitations: weak adversary
+and model collapse. In this paper, we address these two limitations by
+proposing Collapse-Aware TRIplet DEcoupling (CA-TRIDE). Specifically, TRIDE
+yields a stronger adversary by spatially decoupling the perturbation targets
+into the anchor and the other candidates. Furthermore, CA prevents the
+consequential model collapse, based on a novel metric, collapseness, which is
+incorporated into the optimization of perturbation. We also identify two
+drawbacks of the existing robustness metric in image retrieval and propose a
+new metric for a more reasonable robustness evaluation. Extensive experiments
+on three datasets demonstrate that CA-TRIDE outperforms existing defense
+methods in both conventional and new metrics. Codes are available at
+https://github.com/michaeltian108/CA-TRIDE.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted by ICML2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Learned feature representations are biased by complexity, learning
+  order, position, and more 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.05847v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.05847v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Andrew Kyle Lampinen, Stephanie C. Y. Chan, Katherine Hermann
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Representation learning, and interpreting learned representations, are key
+areas of focus in machine learning and neuroscience. Both fields generally use
+representations as a means to understand or improve a system's computations. In
+this work, however, we explore surprising dissociations between representation
+and computation that may pose challenges for such efforts. We create datasets
+in which we attempt to match the computational role that different features
+play, while manipulating other properties of the features or the data. We train
+various deep learning architectures to compute these multiple abstract features
+about their inputs. We find that their learned feature representations are
+systematically biased towards representing some features more strongly than
+others, depending upon extraneous properties such as feature complexity, the
+order in which features are learned, and the distribution of features over the
+inputs. For example, features that are simpler to compute or learned first tend
+to be represented more strongly and densely than features that are more complex
+or learned later, even if all features are learned equally well. We also
+explore how these biases are affected by architectures, optimizers, and
+training regimes (e.g., in transformers, features decoded earlier in the output
+sequence also tend to be represented more strongly). Our results help to
+characterize the inductive biases of gradient-based representation learning.
+These results also highlight a key challenge for interpretability $-$ or for
+comparing the representations of models and brains $-$ disentangling extraneous
+biases from the computationally important aspects of a system's internal
+representations.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ MVTN: Learning Multi-View Transformations for 3D Understanding <span class="chip">ICCV 2021</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2212.13462v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2212.13462v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Abdullah Hamdi, Faisal AlZahrani, Silvio Giancola, Bernard Ghanem
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Multi-view projection techniques have shown themselves to be highly effective
+in achieving top-performing results in the recognition of 3D shapes. These
+methods involve learning how to combine information from multiple view-points.
+However, the camera view-points from which these views are obtained are often
+fixed for all shapes. To overcome the static nature of current multi-view
+techniques, we propose learning these view-points. Specifically, we introduce
+the Multi-View Transformation Network (MVTN), which uses differentiable
+rendering to determine optimal view-points for 3D shape recognition. As a
+result, MVTN can be trained end-to-end with any multi-view network for 3D shape
+classification. We integrate MVTN into a novel adaptive multi-view pipeline
+that is capable of rendering both 3D meshes and point clouds. Our approach
+demonstrates state-of-the-art performance in 3D classification and shape
+retrieval on several benchmarks (ModelNet40, ScanObjectNN, ShapeNet Core55).
+Further analysis indicates that our approach exhibits improved robustness to
+occlusion compared to other methods. We also investigate additional aspects of
+MVTN, such as 2D pretraining and its use for segmentation. To support further
+research in this area, we have released MVTorch, a PyTorch library for 3D
+understanding and generation using multi-view projections.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>under review journal extension for the ICCV 2021 paper
+  arXiv:2011.13244</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Breaking through the learning plateaus of in-context learning in
+  <span class="highlight-title">Transformer</span> 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2309.06054v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2309.06054v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Jingwen Fu, Tao Yang, Yuwang Wang, Yan Lu, Nanning Zheng
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  In-context learning, i.e., learning from context examples, is an impressive
+ability of Transformer. Training Transformers to possess this in-context
+learning skill is computationally intensive due to the occurrence of learning
+plateaus, which are periods within the training process where there is minimal
+or no enhancement in the model's in-context learning capability. To study the
+mechanism behind the learning plateaus, we conceptually seperate a component
+within the model's internal representation that is exclusively affected by the
+model's weights. We call this the "weights component", and the remainder is
+identified as the "context component". By conducting meticulous and controlled
+experiments on synthetic tasks, we note that the persistence of learning
+plateaus correlates with compromised functionality of the weights component.
+Recognizing the impaired performance of the weights component as a fundamental
+behavior drives learning plateaus, we have developed three strategies to
+expedite the learning of Transformers. The effectiveness of these strategies is
+further confirmed in natural language processing tasks. In conclusion, our
+research demonstrates the feasibility of cultivating a powerful in-context
+learning ability within AI systems in an eco-friendly manner.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Generalised Diffusion Probabilistic Scale-Spaces 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2309.08511v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2309.08511v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Pascal Peter
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Diffusion probabilistic models excel at sampling new images from learned
+distributions. Originally motivated by drift-diffusion concepts from physics,
+they apply image perturbations such as noise and blur in a forward process that
+results in a tractable probability distribution. A corresponding learned
+reverse process generates images and can be conditioned on side information,
+which leads to a wide variety of practical applications. Most of the research
+focus currently lies on practice-oriented extensions. In contrast, the
+theoretical background remains largely unexplored, in particular the relations
+to drift-diffusion. In order to shed light on these connections to classical
+image filtering, we propose a generalised scale-space theory for diffusion
+probabilistic models. Moreover, we show conceptual and empirical connections to
+diffusion and osmosis filters.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ DiffCAD: Weakly-Supervised Probabilistic CAD Model Retrieval and
+  Alignment from an RGB Image <span class="chip">SIGGRAPH 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2311.18610v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2311.18610v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Daoyi Gao, Dávid Rozenberszki, Stefan Leutenegger, Angela Dai
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Perceiving 3D structures from RGB images based on CAD model primitives can
+enable an effective, efficient 3D object-based representation of scenes.
+However, current approaches rely on supervision from expensive annotations of
+CAD models associated with real images, and encounter challenges due to the
+inherent ambiguities in the task -- both in depth-scale ambiguity in monocular
+perception, as well as inexact matches of CAD database models to real
+observations. We thus propose DiffCAD, the first weakly-supervised
+probabilistic approach to CAD retrieval and alignment from an RGB image. We
+formulate this as a conditional generative task, leveraging diffusion to learn
+implicit probabilistic models capturing the shape, pose, and scale of CAD
+objects in an image. This enables multi-hypothesis generation of different
+plausible CAD reconstructions, requiring only a few hypotheses to characterize
+ambiguities in depth/scale and inexact shape matches. Our approach is trained
+only on synthetic data, leveraging monocular depth and mask estimates to enable
+robust zero-shot adaptation to various real target domains. Despite being
+trained solely on synthetic data, our multi-hypothesis approach can even
+surpass the supervised state-of-the-art on the Scan2CAD dataset by 5.9% with 8
+hypotheses.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>SIGGRAPH 2024, Project page: https://daoyig.github.io/DiffCAD/</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Understanding Retrieval Robustness for Retrieval-Augmented Image
+  Captioning <span class="chip">ACL 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.02265v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.02265v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Wenyan Li, Jiaang Li, Rita Ramos, Raphael Tang, Desmond Elliott
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Recent advances in retrieval-augmented models for image captioning highlight
+the benefit of retrieving related captions for efficient, lightweight models
+with strong domain-transfer capabilities. While these models demonstrate the
+success of retrieval augmentation, retrieval models are still far from perfect
+in practice: the retrieved information can sometimes mislead the model,
+resulting in incorrect generation and worse performance. In this paper, we
+analyze the robustness of a retrieval-augmented captioning model SmallCap. Our
+analysis shows that the model is sensitive to tokens that appear in the
+majority of the retrieved captions, and the input attribution shows that those
+tokens are likely copied into the generated output. Given these findings, we
+propose to train the model by sampling retrieved captions from more diverse
+sets. This decreases the chance that the model learns to copy majority tokens,
+and improves both in-domain and cross-domain performance.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>9 pages, long paper at ACL 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Lever LM: Configuring In-Context Sequence to Lever Large Vision Language
+  Models 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2312.10104v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2312.10104v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Xu Yang, Yingzhe Peng, Haoxuan Ma, Shuo Xu, Chi Zhang, Yucheng Han, Hanwang Zhang
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  As Archimedes famously said, ``Give me a lever long enough and a fulcrum on
+which to place it, and I shall move the world'', in this study, we propose to
+use a tiny Language Model (LM), \eg, a Transformer with 67M parameters, to
+lever much larger Vision-Language Models (LVLMs) with 9B parameters.
+Specifically, we use this tiny \textbf{Lever-LM} to configure effective
+in-context demonstration (ICD) sequences to improve the In-Context Learinng
+(ICL) performance of LVLMs. Previous studies show that diverse ICD
+configurations like the selection and ordering of the demonstrations heavily
+affect the ICL performance, highlighting the significance of configuring
+effective ICD sequences. Motivated by this and by re-considering the the
+process of configuring ICD sequence, we find this is a mirror process of human
+sentence composition and further assume that effective ICD configurations may
+contain internal statistical patterns that can be captured by Lever-LM. Then a
+dataset with effective ICD sequences is constructed to train Lever-LM. After
+training, given novel queries, new ICD sequences are configured by the trained
+Lever-LM to solve vision-language tasks through ICL. Experiments show that
+these ICD sequences can improve the ICL performance of two LVLMs compared with
+some strong baselines in Visual Question Answering and Image Captioning,
+validating that Lever-LM can really capture the statistical patterns for
+levering LVLMs.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>17 pages, 6 figures</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Conformal Prediction for Deep Classifier via Label Ranking <span class="chip">ICML 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2310.06430v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2310.06430v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Jianguo Huang, Huajun Xi, Linjun Zhang, Huaxiu Yao, Yue Qiu, Hongxin Wei
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Conformal prediction is a statistical framework that generates prediction
+sets containing ground-truth labels with a desired coverage guarantee. The
+predicted probabilities produced by machine learning models are generally
+miscalibrated, leading to large prediction sets in conformal prediction. To
+address this issue, we propose a novel algorithm named $\textit{Sorted Adaptive
+Prediction Sets}$ (SAPS), which discards all the probability values except for
+the maximum softmax probability. The key idea behind SAPS is to minimize the
+dependence of the non-conformity score on the probability values while
+retaining the uncertainty information. In this manner, SAPS can produce compact
+prediction sets and communicate instance-wise uncertainty. Extensive
+experiments validate that SAPS not only lessens the prediction sets but also
+broadly enhances the conditional coverage rate of prediction sets.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted by ICML 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Cascade-CLIP: Cascaded Vision-Language Embeddings Alignment for
+  Zero-Shot Semantic Segmentation <span class="chip">ICML 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.00670v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.00670v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Yunheng Li, ZhongYu Li, Quansheng Zeng, Qibin Hou, Ming-Ming Cheng
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Pre-trained vision-language models, e.g., CLIP, have been successfully
+applied to zero-shot semantic segmentation. Existing CLIP-based approaches
+primarily utilize visual features from the last layer to align with text
+embeddings, while they neglect the crucial information in intermediate layers
+that contain rich object details. However, we find that directly aggregating
+the multi-level visual features weakens the zero-shot ability for novel
+classes. The large differences between the visual features from different
+layers make these features hard to align well with the text embeddings. We
+resolve this problem by introducing a series of independent decoders to align
+the multi-level visual features with the text embeddings in a cascaded way,
+forming a novel but simple framework named Cascade-CLIP. Our Cascade-CLIP is
+flexible and can be easily applied to existing zero-shot semantic segmentation
+methods. Experimental results show that our simple Cascade-CLIP achieves
+superior zero-shot performance on segmentation benchmarks, like COCO-Stuff,
+Pascal-VOC, and Pascal-Context. Our code is available at:
+https://github.com/HVision-NKU/Cascade-CLIP
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted by ICML 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ FastDrag: Manipulate Anything in One Step 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.15769v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.15769v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Xuanjia Zhao, Jian Guan, Congyi Fan, Dongli Xu, Youtian Lin, Haiwei Pan, Pengming Feng
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Drag-based image editing using generative models provides precise control
+over image contents, enabling users to manipulate anything in an image with a
+few clicks. However, prevailing methods typically adopt $n$-step iterations for
+latent semantic optimization to achieve drag-based image editing, which is
+time-consuming and limits practical applications. In this paper, we introduce a
+novel one-step drag-based image editing method, i.e., FastDrag, to accelerate
+the editing process. Central to our approach is a latent warpage function
+(LWF), which simulates the behavior of a stretched material to adjust the
+location of individual pixels within the latent space. This innovation achieves
+one-step latent semantic optimization and hence significantly promotes editing
+speeds. Meanwhile, null regions emerging after applying LWF are addressed by
+our proposed bilateral nearest neighbor interpolation (BNNI) strategy. This
+strategy interpolates these regions using similar features from neighboring
+areas, thus enhancing semantic integrity. Additionally, a
+consistency-preserving strategy is introduced to maintain the consistency
+between the edited and original images by adopting semantic information from
+the original image, saved as key and value pairs in self-attention module
+during diffusion inversion, to guide the diffusion sampling. Our FastDrag is
+validated on the DragBench dataset, demonstrating substantial improvements in
+processing time over existing methods, while achieving enhanced editing
+performance. Project page: https://fastdrag-site.github.io/ .
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>13 pages, 13 figures, Project page: https://fastdrag-site.github.io/</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ MLLM-as-a-Judge: Assessing Multimodal LLM-as-a-Judge with
+  Vision-Language Benchmark <span class="chip">ICML 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2402.04788v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2402.04788v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Dongping Chen, Ruoxi Chen, Shilin Zhang, Yinuo Liu, Yaochen Wang, Huichi Zhou, Qihui Zhang, Pan Zhou, Yao Wan, Lichao Sun
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Multimodal Large Language Models (MLLMs) have gained significant attention
+recently, showing remarkable potential in artificial general intelligence.
+However, assessing the utility of MLLMs presents considerable challenges,
+primarily due to the absence of multimodal benchmarks that align with human
+preferences. Drawing inspiration from the concept of LLM-as-a-Judge within
+LLMs, this paper introduces a novel benchmark, termed MLLM-as-a-Judge, to
+assess the ability of MLLMs in assisting judges across diverse modalities,
+encompassing three distinct tasks: Scoring Evaluation, Pair Comparison, and
+Batch Ranking. Our study reveals that, while MLLMs demonstrate remarkable
+human-like discernment in Pair Comparison, there is a significant divergence
+from human preferences in Scoring Evaluation and Batch Ranking. Furthermore, a
+closer examination reveals persistent challenges in the judgment capacities of
+LLMs, including diverse biases, hallucinatory responses, and inconsistencies in
+judgment, even in advanced models such as GPT-4V. These findings emphasize the
+pressing need for enhancements and further research efforts to be undertaken
+before regarding MLLMs as fully reliable evaluators. In light of this, we
+advocate for additional efforts dedicated to supporting the continuous
+development within the domain of MLLM functioning as judges. The code and
+dataset are publicly available at our project homepage:
+\url{https://mllm-judge.github.io/}.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>ICML 2024 (Oral)</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Unleashing HyDRa: Hybrid Fusion, Depth Consistency and Radar for Unified
+  3D Perception 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2403.07746v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2403.07746v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Philipp Wolters, Johannes Gilg, Torben Teepe, Fabian Herzog, Anouar Laouichi, Martin Hofmann, Gerhard Rigoll
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Low-cost, vision-centric 3D perception systems for autonomous driving have
+made significant progress in recent years, narrowing the gap to expensive
+LiDAR-based methods. The primary challenge in becoming a fully reliable
+alternative lies in robust depth prediction capabilities, as camera-based
+systems struggle with long detection ranges and adverse lighting and weather
+conditions. In this work, we introduce HyDRa, a novel camera-radar fusion
+architecture for diverse 3D perception tasks. Building upon the principles of
+dense BEV (Bird's Eye View)-based architectures, HyDRa introduces a hybrid
+fusion approach to combine the strengths of complementary camera and radar
+features in two distinct representation spaces. Our Height Association
+Transformer module leverages radar features already in the perspective view to
+produce more robust and accurate depth predictions. In the BEV, we refine the
+initial sparse representation by a Radar-weighted Depth Consistency. HyDRa
+achieves a new state-of-the-art for camera-radar fusion of 64.2 NDS (+1.8) and
+58.4 AMOTA (+1.5) on the public nuScenes dataset. Moreover, our new
+semantically rich and spatially accurate BEV features can be directly converted
+into a powerful occupancy representation, beating all previous camera-based
+methods on the Occ3D benchmark by an impressive 3.7 mIoU. Code and models are
+available at https://github.com/phi-wol/hydra.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>10 pages, 4 figures Added eval on VoD</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ See More Details: Efficient Image Super-Resolution by Experts Mining <span class="chip">ICML 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2402.03412v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2402.03412v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Eduard Zamfir, Zongwei Wu, Nancy Mehta, Yulun Zhang, Radu Timofte
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Reconstructing high-resolution (HR) images from low-resolution (LR) inputs
+poses a significant challenge in image super-resolution (SR). While recent
+approaches have demonstrated the efficacy of intricate operations customized
+for various objectives, the straightforward stacking of these disparate
+operations can result in a substantial computational burden, hampering their
+practical utility. In response, we introduce SeemoRe, an efficient SR model
+employing expert mining. Our approach strategically incorporates experts at
+different levels, adopting a collaborative methodology. At the macro scale, our
+experts address rank-wise and spatial-wise informative features, providing a
+holistic understanding. Subsequently, the model delves into the subtleties of
+rank choice by leveraging a mixture of low-rank experts. By tapping into
+experts specialized in distinct key factors crucial for accurate SR, our model
+excels in uncovering intricate intra-feature details. This collaborative
+approach is reminiscent of the concept of "see more", allowing our model to
+achieve an optimal performance with minimal computational costs in efficient
+settings. The source will be publicly made available at
+https://github.com/eduardzamfir/seemoredetails
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted at ICML 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Sync4D: Video Guided Controllable Dynamics for Physics-Based 4D
+  Generation 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.16849v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.16849v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Zhoujie Fu, Jiacheng Wei, Wenhao Shen, Chaoyue Song, Xiaofeng Yang, Fayao Liu, Xulei Yang, Guosheng Lin
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  In this work, we introduce a novel approach for creating controllable
+dynamics in 3D-generated Gaussians using casually captured reference videos.
+Our method transfers the motion of objects from reference videos to a variety
+of generated 3D Gaussians across different categories, ensuring precise and
+customizable motion transfer. We achieve this by employing blend skinning-based
+non-parametric shape reconstruction to extract the shape and motion of
+reference objects. This process involves segmenting the reference objects into
+motion-related parts based on skinning weights and establishing shape
+correspondences with generated target shapes. To address shape and temporal
+inconsistencies prevalent in existing methods, we integrate physical
+simulation, driving the target shapes with matched motion. This integration is
+optimized through a displacement loss to ensure reliable and genuine dynamics.
+Our approach supports diverse reference inputs, including humans, quadrupeds,
+and articulated objects, and can generate dynamics of arbitrary length,
+providing enhanced fidelity and applicability. Unlike methods heavily reliant
+on diffusion video generation models, our technique offers specific and
+high-quality motion transfer, maintaining both shape integrity and temporal
+consistency.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Our project page: https://sync4dphys.github.io/</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ UNIMO-G: Unified Image Generation through Multimodal Conditional
+  Diffusion <span class="chip">ACL 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2401.13388v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2401.13388v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Wei Li, Xue Xu, Jiachen Liu, Xinyan Xiao
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Existing text-to-image diffusion models primarily generate images from text
+prompts. However, the inherent conciseness of textual descriptions poses
+challenges in faithfully synthesizing images with intricate details, such as
+specific entities or scenes. This paper presents UNIMO-G, a simple multimodal
+conditional diffusion framework that operates on multimodal prompts with
+interleaved textual and visual inputs, which demonstrates a unified ability for
+both text-driven and subject-driven image generation. UNIMO-G comprises two
+core components: a Multimodal Large Language Model (MLLM) for encoding
+multimodal prompts, and a conditional denoising diffusion network for
+generating images based on the encoded multimodal input. We leverage a
+two-stage training strategy to effectively train the framework: firstly
+pre-training on large-scale text-image pairs to develop conditional image
+generation capabilities, and then instruction tuning with multimodal prompts to
+achieve unified image generation proficiency. A well-designed data processing
+pipeline involving language grounding and image segmentation is employed to
+construct multi-modal prompts. UNIMO-G excels in both text-to-image generation
+and zero-shot subject-driven synthesis, and is notably effective in generating
+high-fidelity images from complex multimodal prompts involving multiple image
+entities.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted by ACL 2024, Main Conference, Long Paper</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Motion-aware Dynamic Graph Neural Network for Video Compressive Sensing 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2203.00387v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2203.00387v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Ruiying Lu, Ziheng Cheng, Bo Chen, Xin Yuan
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Video snapshot compressive imaging (SCI) utilizes a 2D detector to capture
+sequential video frames and compress them into a single measurement. Various
+reconstruction methods have been developed to recover the high-speed video
+frames from the snapshot measurement. However, most existing reconstruction
+methods are incapable of efficiently capturing long-range spatial and temporal
+dependencies, which are critical for video processing. In this paper, we
+propose a flexible and robust approach based on the graph neural network (GNN)
+to efficiently model non-local interactions between pixels in space and time
+regardless of the distance. Specifically, we develop a motion-aware dynamic GNN
+for better video representation, i.e., represent each node as the aggregation
+of relative neighbors under the guidance of frame-by-frame motions, which
+consists of motion-aware dynamic sampling, cross-scale node sampling, global
+knowledge integration, and graph aggregation. Extensive results on both
+simulation and real data demonstrate both the effectiveness and efficiency of
+the proposed approach, and the visualization illustrates the intrinsic dynamic
+sampling operations of our proposed model for boosting the video SCI
+reconstruction results. The code and model will be released.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ ThermoHands: A Benchmark for 3D Hand Pose Estimation from Egocentric
+  Thermal Images 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2403.09871v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2403.09871v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Fangqiang Ding, Yunzhou Zhu, Xiangyu Wen, Gaowen Liu, Chris Xiaoxuan Lu
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  In this work, we present ThermoHands, a new benchmark for thermal image-based
+egocentric 3D hand pose estimation, aimed at overcoming challenges like varying
+lighting conditions and obstructions (e.g., handwear). The benchmark includes a
+multi-view and multi-spectral dataset collected from 28 subjects performing
+hand-object and hand-virtual interactions under diverse scenarios, accurately
+annotated with 3D hand poses through an automated process. We introduce a new
+baseline method, TherFormer, utilizing dual transformer modules for effective
+egocentric 3D hand pose estimation in thermal imagery. Our experimental results
+highlight TherFormer's leading performance and affirm thermal imaging's
+effectiveness in enabling robust 3D hand pose estimation in adverse conditions.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>15 pages, 6 figures, 4 tables</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ The Rate-Distortion-Perception-Classification Tradeoff: Joint Source
+  Coding and Modulation via Inverse-Domain GANs 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2312.14792v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2312.14792v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Junli Fang, João F. C. Mota, Baoshan Lu, Weicheng Zhang, Xuemin Hong
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  The joint source-channel coding (JSCC) framework leverages deep learning to
+learn from data the best codes for source and channel coding. When the output
+signal, rather than being binary, is directly mapped onto the IQ domain
+(complex-valued), we call the resulting framework joint source coding and
+modulation (JSCM). We consider a JSCM scenario and show the existence of a
+strict tradeoff between channel rate, distortion, perception, and
+classification accuracy, a tradeoff that we name RDPC. We then propose two
+image compression methods to navigate that tradeoff: the RDPCO algorithm which,
+under simple assumptions, directly solves the optimization problem
+characterizing the tradeoff, and an algorithm based on an inverse-domain
+generative adversarial network (ID-GAN), which is more general and achieves
+extreme compression. Simulation results corroborate the theoretical findings,
+showing that both algorithms exhibit the RDPC tradeoff. They also demonstrate
+that the proposed ID-GAN algorithm effectively balances image distortion,
+perception, and classification accuracy, and significantly outperforms
+traditional separation-based methods and recent deep JSCM architectures in
+terms of one or more of these metrics.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Paper accepted in IEEE Transactions on Signal Processing</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Point Cloud Matters: Rethinking the Impact of Different Observation
+  Spaces on Robot Learning 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2402.02500v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2402.02500v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Haoyi Zhu, Yating Wang, Di Huang, Weicai Ye, Wanli Ouyang, Tong He
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  In robot learning, the observation space is crucial due to the distinct
+characteristics of different modalities, which can potentially become a
+bottleneck alongside policy design. In this study, we explore the influence of
+various observation spaces on robot learning, focusing on three predominant
+modalities: RGB, RGB-D, and point cloud. We introduce OBSBench, a benchmark
+comprising two simulators and 125 tasks, along with standardized pipelines for
+various encoders and policy baselines. Extensive experiments on diverse
+contact-rich manipulation tasks reveal a notable trend: point cloud-based
+methods, even those with the simplest designs, frequently outperform their RGB
+and RGB-D counterparts. This trend persists in both scenarios: training from
+scratch and utilizing pre-training. Furthermore, our findings demonstrate that
+point cloud observations often yield better policy performance and
+significantly stronger generalization capabilities across various geometric and
+visual conditions. These outcomes suggest that the 3D point cloud is a valuable
+observation modality for intricate robotic tasks. We also suggest that
+incorporating both appearance and coordinate information can enhance the
+performance of point cloud methods. We hope our work provides valuable insights
+and guidance for designing more generalizable and robust robotic models. Codes
+are available at https://github.com/HaoyiZhu/PointCloudMatters.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Diffusion Tuning: Transferring Diffusion Models via Chain of Forgetting 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.00773v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.00773v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Jincheng Zhong, Xingzhuo Guo, Jiaxiang Dong, Mingsheng Long
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Diffusion models have significantly advanced the field of generative
+modeling. However, training a diffusion model is computationally expensive,
+creating a pressing need to adapt off-the-shelf diffusion models for downstream
+generation tasks. Current fine-tuning methods focus on parameter-efficient
+transfer learning but overlook the fundamental transfer characteristics of
+diffusion models. In this paper, we investigate the transferability of
+diffusion models and observe a monotonous chain of forgetting trend of
+transferability along the reverse process. Based on this observation and novel
+theoretical insights, we present Diff-Tuning, a frustratingly simple transfer
+approach that leverages the chain of forgetting tendency. Diff-Tuning
+encourages the fine-tuned model to retain the pre-trained knowledge at the end
+of the denoising chain close to the generated data while discarding the other
+noise side. We conduct comprehensive experiments to evaluate Diff-Tuning,
+including the transfer of pre-trained Diffusion Transformer models to eight
+downstream generations and the adaptation of Stable Diffusion to five control
+conditions with ControlNet. Diff-Tuning achieves a 26% improvement over
+standard fine-tuning and enhances the convergence speed of ControlNet by 24%.
+Notably, parameter-efficient transfer learning techniques for diffusion models
+can also benefit from Diff-Tuning.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ SchurVINS: Schur Complement-Based Lightweight Visual Inertial Navigation
+  System <span class="chip">CVPR2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2312.01616v5">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2312.01616v5.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Yunfei Fan, Tianyu Zhao, Guidong Wang
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Accuracy and computational efficiency are the most important metrics to
+Visual Inertial Navigation System (VINS). The existing VINS algorithms with
+either high accuracy or low computational complexity, are difficult to provide
+the high precision localization in resource-constrained devices. To this end,
+we propose a novel filter-based VINS framework named SchurVINS, which could
+guarantee both high accuracy by building a complete residual model and low
+computational complexity with Schur complement. Technically, we first formulate
+the full residual model where Gradient, Hessian and observation covariance are
+explicitly modeled. Then Schur complement is employed to decompose the full
+model into ego-motion residual model and landmark residual model. Finally,
+Extended Kalman Filter (EKF) update is implemented in these two models with
+high efficiency. Experiments on EuRoC and TUM-VI datasets show that our method
+notably outperforms state-of-the-art (SOTA) methods in both accuracy and
+computational complexity. The experimental code of SchurVINS is available at
+https://github.com/bytedance/SchurVINS.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted by CVPR2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ CoopHash: Cooperative Learning of Multipurpose Descriptor and
+  Contrastive Pair Generator via Variational MCMC Teaching for Supervised Image
+  Hashing 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2210.04288v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2210.04288v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Khoa D. Doan, Jianwen Xie, Yaxuan Zhu, Yang Zhao, Ping Li
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Leveraging supervised information can lead to superior retrieval performance
+in the image hashing domain but the performance degrades significantly without
+enough labeled data. One effective solution to boost performance is to employ
+generative models, such as Generative Adversarial Networks (GANs), to generate
+synthetic data in an image hashing model. However, GAN-based methods are
+difficult to train, which prevents the hashing approaches from jointly training
+the generative models and the hash functions. This limitation results in
+sub-optimal retrieval performance. To overcome this limitation, we propose a
+novel framework, the generative cooperative hashing network, which is based on
+energy-based cooperative learning. This framework jointly learns a powerful
+generative representation of the data and a robust hash function via two
+components: a top-down contrastive pair generator that synthesizes contrastive
+images and a bottom-up multipurpose descriptor that simultaneously represents
+the images from multiple perspectives, including probability density, hash
+code, latent code, and category. The two components are jointly learned via a
+novel likelihood-based cooperative learning scheme. We conduct experiments on
+several real-world datasets and show that the proposed method outperforms the
+competing hashing supervised methods, achieving up to 10\% relative improvement
+over the current state-of-the-art supervised hashing methods, and exhibits a
+significantly better performance in out-of-distribution retrieval.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Bidirectional Autoregressive Diffusion Model for Dance Generation 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2402.04356v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2402.04356v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Canyu Zhang, Youbao Tang, Ning Zhang, Ruei-Sung Lin, Mei Han, Jing Xiao, Song Wang
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Dance serves as a powerful medium for expressing human emotions, but the
+lifelike generation of dance is still a considerable challenge. Recently,
+diffusion models have showcased remarkable generative abilities across various
+domains. They hold promise for human motion generation due to their adaptable
+many-to-many nature. Nonetheless, current diffusion-based motion generation
+models often create entire motion sequences directly and unidirectionally,
+lacking focus on the motion with local and bidirectional enhancement. When
+choreographing high-quality dance movements, people need to take into account
+not only the musical context but also the nearby music-aligned dance motions.
+To authentically capture human behavior, we propose a Bidirectional
+Autoregressive Diffusion Model (BADM) for music-to-dance generation, where a
+bidirectional encoder is built to enforce that the generated dance is
+harmonious in both the forward and backward directions. To make the generated
+dance motion smoother, a local information decoder is built for local motion
+enhancement. The proposed framework is able to generate new motions based on
+the input conditions and nearby motions, which foresees individual motion
+slices iteratively and consolidates all predictions. To further refine the
+synchronicity between the generated dance and the beat, the beat information is
+incorporated as an input to generate better music-aligned dance movements.
+Experimental results demonstrate that the proposed model achieves
+state-of-the-art performance compared to existing unidirectional approaches on
+the prominent benchmark for music-to-dance generation.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Follow-Your-Emoji: Fine-Controllable and Expressive Freestyle Portrait
+  Animation 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.01900v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.01900v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Yue Ma, Hongyu Liu, Hongfa Wang, Heng Pan, Yingqing He, Junkun Yuan, Ailing Zeng, Chengfei Cai, Heung-Yeung Shum, Wei Liu, Qifeng Chen
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  We present Follow-Your-Emoji, a diffusion-based framework for portrait
+animation, which animates a reference portrait with target landmark sequences.
+The main challenge of portrait animation is to preserve the identity of the
+reference portrait and transfer the target expression to this portrait while
+maintaining temporal consistency and fidelity. To address these challenges,
+Follow-Your-Emoji equipped the powerful Stable Diffusion model with two
+well-designed technologies. Specifically, we first adopt a new explicit motion
+signal, namely expression-aware landmark, to guide the animation process. We
+discover this landmark can not only ensure the accurate motion alignment
+between the reference portrait and target motion during inference but also
+increase the ability to portray exaggerated expressions (i.e., large pupil
+movements) and avoid identity leakage. Then, we propose a facial fine-grained
+loss to improve the model's ability of subtle expression perception and
+reference portrait appearance reconstruction by using both expression and
+facial masks. Accordingly, our method demonstrates significant performance in
+controlling the expression of freestyle portraits, including real humans,
+cartoons, sculptures, and even animals. By leveraging a simple and effective
+progressive generation strategy, we extend our model to stable long-term
+animation, thus increasing its potential application value. To address the lack
+of a benchmark for this field, we introduce EmojiBench, a comprehensive
+benchmark comprising diverse portrait images, driving videos, and landmarks. We
+show extensive evaluations on EmojiBench to verify the superiority of
+Follow-Your-Emoji.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Project Page: https://follow-your-emoji.github.io/</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ One Point, One Object: Simultaneous 3D Object Segmentation and 6-DOF
+  Pose Estimation 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/1912.12095v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/1912.12095v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Hongsen Liu
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  We propose a single-shot method for simultaneous 3D object segmentation and
+6-DOF pose estimation in pure 3D point clouds scenes based on a consensus that
+\emph{one point only belongs to one object}, i.e., each point has the potential
+power to predict the 6-DOF pose of its corresponding object. Unlike the
+recently proposed methods of the similar task, which rely on 2D detectors to
+predict the projection of 3D corners of the 3D bounding boxes and the 6-DOF
+pose must be estimated by a PnP like spatial transformation method, ours is
+concise enough not to require additional spatial transformation between
+different dimensions. Due to the lack of training data for many objects, the
+recently proposed 2D detection methods try to generate training data by using
+rendering engine and achieve good results. However, rendering in 3D space along
+with 6-DOF is relatively difficult. Therefore, we propose an augmented reality
+technology to generate the training data in semi-virtual reality 3D space. The
+key component of our method is a multi-task CNN architecture that can
+simultaneously predicts the 3D object segmentation and 6-DOF pose estimation in
+pure 3D point clouds.
+  For experimental evaluation, we generate expanded training data for two
+state-of-the-arts 3D object datasets \cite{PLCHF}\cite{TLINEMOD} by using
+Augmented Reality technology (AR). We evaluate our proposed method on the two
+datasets. The results show that our method can be well generalized into
+multiple scenarios and provide performance comparable to or better than the
+state-of-the-arts.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Spiking CenterNet: A Distillation-boosted Spiking Neural Network for
+  Object Detection <span class="chip">IJCNN 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2402.01287v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2402.01287v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Lennard Bodden, Franziska Schwaiger, Duc Bach Ha, Lars Kreuzberg, Sven Behnke
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  In the era of AI at the edge, self-driving cars, and climate change, the need
+for energy-efficient, small, embedded AI is growing. Spiking Neural Networks
+(SNNs) are a promising approach to address this challenge, with their
+event-driven information flow and sparse activations. We propose Spiking
+CenterNet for object detection on event data. It combines an SNN CenterNet
+adaptation with an efficient M2U-Net-based decoder. Our model significantly
+outperforms comparable previous work on Prophesee's challenging GEN1 Automotive
+Detection Dataset while using less than half the energy. Distilling the
+knowledge of a non-spiking teacher into our SNN further increases performance.
+To the best of our knowledge, our work is the first approach that takes
+advantage of knowledge distillation in the field of spiking object detection.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>8 pages, 5 figures. Accepted at IJCNN 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Harmonious Group Choreography with Trajectory-Controllable Diffusion 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2403.06189v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2403.06189v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Yuqin Dai, Wanlu Zhu, Ronghui Li, Zeping Ren, Xiangzheng Zhou, Xiu Li, Jun Li, Jian Yang
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Creating group choreography from music has gained attention in cultural
+entertainment and virtual reality, aiming to coordinate visually cohesive and
+diverse group movements. Despite increasing interest, recent works face
+challenges in achieving aesthetically appealing choreography, primarily for two
+key issues: multi-dancer collision and single-dancer foot slide. To address
+these issues, we propose a Trajectory-Controllable Diffusion (TCDiff), a novel
+approach that harnesses non-overlapping trajectories to facilitate coherent
+dance movements. Specifically, to tackle dancer collisions, we introduce a
+Dance-Beat Navigator capable of generating trajectories for multiple dancers
+based on the music, complemented by a Distance-Consistency loss to maintain
+appropriate spacing among trajectories within a reasonable threshold. To
+mitigate foot sliding, we present a Footwork Adaptor that utilizes trajectory
+displacement from adjacent frames to enable flexible footwork, coupled with a
+Relative Forward-Kinematic loss to adjust the positioning of individual
+dancers' root nodes and joints. Extensive experiments demonstrate that our
+method achieves state-of-the-art results.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ COURIER: Contrastive User Intention Reconstruction for Large-Scale
+  Visual Recommendation 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2306.05001v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2306.05001v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Jia-Qi Yang, Chenglei Dai, Dan OU, Dongshuai Li, Ju Huang, De-Chuan Zhan, Xiaoyi Zeng, Yang Yang
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  With the advancement of multimedia internet, the impact of visual
+characteristics on the decision of users to click or not within the online
+retail industry is increasingly significant. Thus, incorporating visual
+features is a promising direction for further performance improvements in
+click-through rate (CTR). However, experiments on our production system
+revealed that simply injecting the image embeddings trained with established
+pre-training methods only has marginal improvements. We believe that the main
+advantage of existing image feature pre-training methods lies in their
+effectiveness for cross-modal predictions. However, this differs significantly
+from the task of CTR prediction in recommendation systems. In recommendation
+systems, other modalities of information (such as text) can be directly used as
+features in downstream models. Even if the performance of cross-modal
+prediction tasks is excellent, it is challenging to provide significant
+information gain for the downstream models. We argue that a visual feature
+pre-training method tailored for recommendation is necessary for further
+improvements beyond existing modality features. To this end, we propose an
+effective user intention reconstruction module to mine visual features related
+to user interests from behavior histories, which constructs a many-to-one
+correspondence. We further propose a contrastive training method to learn the
+user intentions and prevent the collapse of embedding vectors. We conduct
+extensive experimental evaluations on public datasets and our production system
+to verify that our method can learn users' visual interests. Our method
+achieves $0.46\%$ improvement in offline AUC and $0.88\%$ improvement in Taobao
+GMV (Cross Merchandise Volume) with p-value$<$0.01.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Textual Inversion and <span class="highlight-title">Self-supervised</span> Refinement for Radiology Report
+  Generation <span class="chip">MICCAI 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20607v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20607v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Yuanjiang Luo, Hongxiang Li, Xuan Wu, Meng Cao, Xiaoshuang Huang, Zhihong Zhu, Peixi Liao, Hu Chen, Yi Zhang
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Existing mainstream approaches follow the encoder-decoder paradigm for
+generating radiology reports. They focus on improving the network structure of
+encoders and decoders, which leads to two shortcomings: overlooking the
+modality gap and ignoring report content constraints. In this paper, we
+proposed Textual Inversion and Self-supervised Refinement (TISR) to address the
+above two issues. Specifically, textual inversion can project text and image
+into the same space by representing images as pseudo words to eliminate the
+cross-modeling gap. Subsequently, self-supervised refinement refines these
+pseudo words through contrastive loss computation between images and texts,
+enhancing the fidelity of generated reports to images. Notably, TISR is
+orthogonal to most existing methods, plug-and-play. We conduct experiments on
+two widely-used public datasets and achieve significant improvements on various
+baselines, which demonstrates the effectiveness and generalization of TISR. The
+code will be available soon.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>This paper has been early accepted by MICCAI 2024!</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ TAAT: Think and Act from Arbitrary Texts in Text2Motion 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2404.14745v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2404.14745v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Runqi Wang, Caoyuan Ma, Guopeng Li, Zheng Wang
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Text2Motion aims to generate human motions from texts. Existing datasets rely
+on the assumption that texts include action labels (such as "walk, bend, and
+pick up"), which is not flexible for practical scenarios. This paper redefines
+this problem with a more realistic assumption that the texts are arbitrary.
+Specifically, arbitrary texts include existing action texts composed of action
+labels (e.g., A person walks and bends to pick up something), and introduce
+scene texts without explicit action labels (e.g., A person notices his wallet
+on the ground ahead).
+  To bridge the gaps between this realistic setting and existing datasets, we
+expand the action texts on the HumanML3D dataset to more scene texts, thereby
+creating a new HumanML3D++ dataset including arbitrary texts. In this
+challenging dataset, we benchmark existing state-of-the-art methods and propose
+a novel two-stage framework to extract action labels from arbitrary texts by
+the Large Language Model (LLM) and then generate motions from action labels.
+Extensive experiments are conducted under different application scenarios to
+validate the effectiveness of the proposed framework on existing and proposed
+datasets. The results indicate that Text2Motion in this realistic setting is
+very challenging, fostering new research in this practical direction. Our
+dataset and code will be released.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Updated errors in author information</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ ADer: A Comprehensive Benchmark for Multi-class Visual Anomaly Detection 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.03262v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.03262v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Jiangning Zhang, Haoyang He, Zhenye Gan, Qingdong He, Yuxuan Cai, Zhucun Xue, Yabiao Wang, Chengjie Wang, Lei Xie, Yong Liu
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Visual anomaly detection aims to identify anomalous regions in images through
+unsupervised learning paradigms, with increasing application demand and value
+in fields such as industrial inspection and medical lesion detection. Despite
+significant progress in recent years, there is a lack of comprehensive
+benchmarks to adequately evaluate the performance of various mainstream methods
+across different datasets under the practical multi-class setting. The absence
+of standardized experimental setups can lead to potential biases in training
+epochs, resolution, and metric results, resulting in erroneous conclusions.
+This paper addresses this issue by proposing a comprehensive visual anomaly
+detection benchmark, \textbf{\textit{ADer}}, which is a modular framework that
+is highly extensible for new methods. The benchmark includes multiple datasets
+from industrial and medical domains, implementing fifteen state-of-the-art
+methods and nine comprehensive metrics. Additionally, we have open-sourced the
+GPU-assisted \href{https://pypi.org/project/ADEval}{ADEval} package to address
+the slow evaluation problem of metrics like time-consuming mAU-PRO on
+large-scale data, significantly reducing evaluation time by more than
+\textit{1000-fold}. Through extensive experimental results, we objectively
+reveal the strengths and weaknesses of different methods and provide insights
+into the challenges and future directions of multi-class visual anomaly
+detection. We hope that \textbf{\textit{ADer}} will become a valuable resource
+for researchers and practitioners in the field, promoting the development of
+more robust and generalizable anomaly detection systems. Full codes have been
+attached in Appendix and open-sourced at
+\url{https://github.com/zhangzjn/ader}.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Inv-Adapter: ID Customization Generation via Image Inversion and
+  Lightweight Adapter 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.02881v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.02881v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Peng Xing, Ning Wang, Jianbo Ouyang, Zechao Li
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  The remarkable advancement in text-to-image generation models significantly
+boosts the research in ID customization generation. However, existing
+personalization methods cannot simultaneously satisfy high fidelity and
+high-efficiency requirements. Their main bottleneck lies in the prompt image
+encoder, which produces weak alignment signals with the text-to-image model and
+significantly increased model size. Towards this end, we propose a lightweight
+Inv-Adapter, which first extracts diffusion-domain representations of ID images
+utilizing a pre-trained text-to-image model via DDIM image inversion, without
+additional image encoder. Benefiting from the high alignment of the extracted
+ID prompt features and the intermediate features of the text-to-image model, we
+then embed them efficiently into the base text-to-image model by carefully
+designing a lightweight attention adapter. We conduct extensive experiments to
+assess ID fidelity, generation loyalty, speed, and training parameters, all of
+which show that the proposed Inv-Adapter is highly competitive in ID
+customization generation and model scale.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>technical report</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ SCoRe: Submodular Combinatorial Representation Learning <span class="chip">ICML 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2310.00165v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2310.00165v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Anay Majee, Suraj Kothawade, Krishnateja Killamsetty, Rishabh Iyer
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  In this paper we introduce the SCoRe (Submodular Combinatorial Representation
+Learning) framework, a novel approach in representation learning that addresses
+inter-class bias and intra-class variance. SCoRe provides a new combinatorial
+viewpoint to representation learning, by introducing a family of loss functions
+based on set-based submodular information measures. We develop two novel
+combinatorial formulations for loss functions, using the Total Information and
+Total Correlation, that naturally minimize intra-class variance and inter-class
+bias. Several commonly used metric/contrastive learning loss functions like
+supervised contrastive loss, orthogonal projection loss, and N-pairs loss, are
+all instances of SCoRe, thereby underlining the versatility and applicability
+of SCoRe in a broad spectrum of learning scenarios. Novel objectives in SCoRe
+naturally model class-imbalance with up to 7.6\% improvement in classification
+on CIFAR-10-LT, CIFAR-100-LT, MedMNIST, 2.1% on ImageNet-LT, and 19.4% in
+object detection on IDD and LVIS (v1.0), demonstrating its effectiveness over
+existing approaches.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted to ICML 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Partial train and isolate, mitigate backdoor attack 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.16488v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.16488v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Yong Li, Han Gao
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Neural networks are widely known to be vulnerable to backdoor attacks, a
+method that poisons a portion of the training data to make the target model
+perform well on normal data sets, while outputting attacker-specified or random
+categories on the poisoned samples. Backdoor attacks are full of threats.
+Poisoned samples are becoming more and more similar to corresponding normal
+samples, and even the human eye cannot easily distinguish them. On the other
+hand, the accuracy of models carrying backdoors on normal samples is no
+different from that of clean models.In this article, by observing the
+characteristics of backdoor attacks, We provide a new model training method
+(PT) that freezes part of the model to train a model that can isolate
+suspicious samples. Then, on this basis, a clean model is fine-tuned to resist
+backdoor attacks.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>9 pages, 2 figures</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ HENASY: Learning to Assemble Scene-Entities for Egocentric
+  Video-Language Model 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.00307v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.00307v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Khoa Vo, Thinh Phan, Kashu Yamazaki, Minh Tran, Ngan Le
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Current video-language models (VLMs) rely extensively on instance-level
+alignment between video and language modalities, which presents two major
+limitations: (1) visual reasoning disobeys the natural perception that humans
+do in first-person perspective, leading to a lack of reasoning interpretation;
+and (2) learning is limited in capturing inherent fine-grained relationships
+between two modalities.
+  In this paper, we take an inspiration from human perception and explore a
+compositional approach for egocentric video representation. We introduce HENASY
+(Hierarchical ENtities ASsemblY), which includes a spatiotemporal token
+grouping mechanism to explicitly assemble dynamically evolving scene entities
+through time and model their relationship for video representation. By
+leveraging compositional structure understanding, HENASY possesses strong
+interpretability via visual grounding with free-form text queries. We further
+explore a suite of multi-grained contrastive losses to facilitate
+entity-centric understandings. This comprises three alignment types:
+video-narration, noun-entity, verb-entities alignments.
+  Our method demonstrates strong interpretability in both quantitative and
+qualitative experiments; while maintaining competitive performances on five
+downstream tasks via zero-shot transfer or as video/text representation,
+including video/text retrieval, action recognition, multi-choice query, natural
+language query, and moments query.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>under submission</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ UVOSAM: A Mask-free Paradigm for Unsupervised Video Object Segmentation
+  via Segment Anything Model 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2305.12659v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2305.12659v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Zhenghao Zhang, Shengfan Zhang, Zhichao Wei, Zuozhuo Dai, Siyu Zhu
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  The current state-of-the-art methods for unsupervised video object
+segmentation (UVOS) require extensive training on video datasets with mask
+annotations, limiting their effectiveness in handling challenging scenarios.
+However, the Segment Anything Model (SAM) introduces a new prompt-driven
+paradigm for image segmentation, offering new possibilities. In this study, we
+investigate SAM's potential for UVOS through different prompt strategies. We
+then propose UVOSAM, a mask-free paradigm for UVOS that utilizes the STD-Net
+tracker. STD-Net incorporates a spatial-temporal decoupled deformable attention
+mechanism to establish an effective correlation between intra- and inter-frame
+features, remarkably enhancing the quality of box prompts in complex video
+scenes. Extensive experiments on the DAVIS2017-unsupervised and
+YoutubeVIS19\&21 datasets demonstrate the superior performance of UVOSAM
+without mask supervision compared to existing mask-supervised methods, as well
+as its ability to generalize to weakly-annotated video datasets. Code can be
+found at https://github.com/alibaba/UVOSAM.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ EgoSurgery-Tool: A <span class="highlight-title">Dataset</span> of Surgical Tool and Hand Detection from
+  Egocentric Open Surgery Videos 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.03095v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.03095v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Ryo Fujii, Hideo Saito, Hiroki Kajita
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Surgical tool detection is a fundamental task for understanding egocentric
+open surgery videos. However, detecting surgical tools presents significant
+challenges due to their highly imbalanced class distribution, similar shapes
+and similar textures, and heavy occlusion. The lack of a comprehensive
+large-scale dataset compounds these challenges. In this paper, we introduce
+EgoSurgery-Tool, an extension of the existing EgoSurgery-Phase dataset, which
+contains real open surgery videos captured using an egocentric camera attached
+to the surgeon's head, along with phase annotations. EgoSurgery-Tool has been
+densely annotated with surgical tools and comprises over 49K surgical tool
+bounding boxes across 15 categories, constituting a large-scale surgical tool
+detection dataset. EgoSurgery-Tool also provides annotations for hand detection
+with over 46K hand-bounding boxes, capturing hand-object interactions that are
+crucial for understanding activities in egocentric open surgery.
+EgoSurgery-Tool is superior to existing datasets due to its larger scale,
+greater variety of surgical tools, more annotations, and denser scenes. We
+conduct a comprehensive analysis of EgoSurgery-Tool using nine popular object
+detectors to assess their effectiveness in both surgical tool and hand
+detection. The dataset will be released at
+https://github.com/Fujiry0/EgoSurgery.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ UFineBench: Towards Text-based Person Retrieval with Ultra-fine
+  Granularity 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2312.03441v6">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2312.03441v6.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Jialong Zuo, Hanyu Zhou, Ying Nie, Feng Zhang, Tianyu Guo, Nong Sang, Yunhe Wang, Changxin Gao
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Existing text-based person retrieval datasets often have relatively
+coarse-grained text annotations. This hinders the model to comprehend the
+fine-grained semantics of query texts in real scenarios. To address this
+problem, we contribute a new benchmark named \textbf{UFineBench} for text-based
+person retrieval with ultra-fine granularity.
+  Firstly, we construct a new \textbf{dataset} named UFine6926. We collect a
+large number of person images and manually annotate each image with two
+detailed textual descriptions, averaging 80.8 words each. The average word
+count is three to four times that of the previous datasets. In addition of
+standard in-domain evaluation, we also propose a special \textbf{evaluation
+paradigm} more representative of real scenarios. It contains a new evaluation
+set with cross domains, cross textual granularity and cross textual styles,
+named UFine3C, and a new evaluation metric for accurately measuring retrieval
+ability, named mean Similarity Distribution (mSD). Moreover, we propose CFAM, a
+more efficient \textbf{algorithm} especially designed for text-based person
+retrieval with ultra fine-grained texts. It achieves fine granularity mining by
+adopting a shared cross-modal granularity decoder and hard negative match
+mechanism.
+  With standard in-domain evaluation, CFAM establishes competitive performance
+across various datasets, especially on our ultra fine-grained UFine6926.
+Furthermore, by evaluating on UFine3C, we demonstrate that training on our
+UFine6926 significantly improves generalization to real scenarios compared with
+other coarse-grained datasets. The dataset and code will be made publicly
+available at \url{https://github.com/Zplusdragon/UFineBench}.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ U-KAN Makes Strong Backbone for Medical Image Segmentation and
+  Generation 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.02918v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.02918v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Chenxin Li, Xinyu Liu, Wuyang Li, Cheng Wang, Hengyu Liu, Yixuan Yuan
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  U-Net has become a cornerstone in various visual applications such as image
+segmentation and diffusion probability models. While numerous innovative
+designs and improvements have been introduced by incorporating transformers or
+MLPs, the networks are still limited to linearly modeling patterns as well as
+the deficient interpretability. To address these challenges, our intuition is
+inspired by the impressive results of the Kolmogorov-Arnold Networks (KANs) in
+terms of accuracy and interpretability, which reshape the neural network
+learning via the stack of non-linear learnable activation functions derived
+from the Kolmogorov-Anold representation theorem. Specifically, in this paper,
+we explore the untapped potential of KANs in improving backbones for vision
+tasks. We investigate, modify and re-design the established U-Net pipeline by
+integrating the dedicated KAN layers on the tokenized intermediate
+representation, termed U-KAN. Rigorous medical image segmentation benchmarks
+verify the superiority of U-KAN by higher accuracy even with less computation
+cost. We further delved into the potential of U-KAN as an alternative U-Net
+noise predictor in diffusion models, demonstrating its applicability in
+generating task-oriented model architectures. These endeavours unveil valuable
+insights and sheds light on the prospect that with U-KAN, you can make strong
+backbone for medical image segmentation and generation. Project page:
+https://yes-ukan.github.io/
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Two Optimizers Are Better Than One: LLM Catalyst Empowers Gradient-Based
+  Optimization for <span class="highlight-title">Prompt</span> Tuning 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.19732v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.19732v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Zixian Guo, Ming Liu, Zhilong Ji, Jinfeng Bai, Yiwen Guo, Wangmeng Zuo
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Learning a skill generally relies on both practical experience by doer and
+insightful high-level guidance by instructor. Will this strategy also work well
+for solving complex non-convex optimization problems? Here, a common
+gradient-based optimizer acts like a disciplined doer, making locally optimal
+update at each step. Recent methods utilize large language models (LLMs) to
+optimize solutions for concrete problems by inferring from natural language
+instructions, akin to a high-level instructor. In this paper, we show that
+these two optimizers are complementary to each other, suggesting a
+collaborative optimization approach. The gradient-based optimizer and LLM-based
+optimizer are combined in an interleaved manner. We instruct LLMs using task
+descriptions and timely optimization trajectories recorded during
+gradient-based optimization. Inferred results from LLMs are used as restarting
+points for the next stage of gradient optimization. By leveraging both the
+locally rigorous gradient-based optimizer and the high-level deductive
+LLM-based optimizer, our combined optimization method consistently yields
+improvements over competitive baseline prompt tuning methods. Our results
+demonstrate the synergistic effect of conventional gradient-based optimization
+and the inference ability of LLMs. The code is released at
+https://github.com/guozix/LLM-catalyst.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ FAIntbench: A Holistic and Precise Benchmark for Bias Evaluation in
+  Text-to-Image Models 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.17814v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.17814v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Hanjun Luo, Ziye Deng, Ruizhe Chen, Zuozhu Liu
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  The rapid development and reduced barriers to entry for Text-to-Image (T2I)
+models have raised concerns about the biases in their outputs, but existing
+research lacks a holistic definition and evaluation framework of biases,
+limiting the enhancement of debiasing techniques. To address this issue, we
+introduce FAIntbench, a holistic and precise benchmark for biases in T2I
+models. In contrast to existing benchmarks that evaluate bias in limited
+aspects, FAIntbench evaluate biases from four dimensions: manifestation of
+bias, visibility of bias, acquired attributes, and protected attributes. We
+applied FAIntbench to evaluate seven recent large-scale T2I models and
+conducted human evaluation, whose results demonstrated the effectiveness of
+FAIntbench in identifying various biases. Our study also revealed new research
+questions about biases, including the side-effect of distillation. The findings
+presented here are preliminary, highlighting the potential of FAIntbench to
+advance future research aimed at mitigating the biases in T2I models. Our
+benchmark is publicly available to ensure the reproducibility.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Cluster-Aware Similarity Diffusion for Instance Retrieval <span class="chip">ICML2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.02343v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.02343v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Jifei Luo, Hantao Yao, Changsheng Xu
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Diffusion-based re-ranking is a common method used for retrieving instances
+by performing similarity propagation in a nearest neighbor graph. However,
+existing techniques that construct the affinity graph based on pairwise
+instances can lead to the propagation of misinformation from outliers and other
+manifolds, resulting in inaccurate results. To overcome this issue, we propose
+a novel Cluster-Aware Similarity (CAS) diffusion for instance retrieval. The
+primary concept of CAS is to conduct similarity diffusion within local
+clusters, which can reduce the influence from other manifolds explicitly. To
+obtain a symmetrical and smooth similarity matrix, our Bidirectional Similarity
+Diffusion strategy introduces an inverse constraint term to the optimization
+objective of local cluster diffusion. Additionally, we have optimized a
+Neighbor-guided Similarity Smoothing approach to ensure similarity consistency
+among the local neighbors of each instance. Evaluations in instance retrieval
+and object re-identification validate the effectiveness of the proposed CAS,
+our code is publicly available.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>This paper has been accepted by ICML2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Automated Dominative Subspace Mining for Efficient Neural Architecture
+  Search 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2210.17180v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2210.17180v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Yaofo Chen, Yong Guo, Daihai Liao, Fanbing Lv, Hengjie Song, James Tin-Yau Kwok, Mingkui Tan
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Neural Architecture Search (NAS) aims to automatically find effective
+architectures within a predefined search space. However, the search space is
+often extremely large. As a result, directly searching in such a large search
+space is non-trivial and also very time-consuming. To address the above issues,
+in each search step, we seek to limit the search space to a small but effective
+subspace to boost both the search performance and search efficiency. To this
+end, we propose a novel Neural Architecture Search method via Dominative
+Subspace Mining (DSM-NAS) that finds promising architectures in automatically
+mined subspaces. Specifically, we first perform a global search, i.e .,
+dominative subspace mining, to find a good subspace from a set of candidates.
+Then, we perform a local search within the mined subspace to find effective
+architectures. More critically, we further boost search performance by taking
+well-designed/ searched architectures to initialize candidate subspaces.
+Experimental results demonstrate that DSM-NAS not only reduces the search cost
+but also discovers better architectures than state-of-the-art methods in
+various benchmark search spaces.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Published in IEEE TCSVT</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Towards Robust and Efficient Cloud-Edge Elastic Model Adaptation via
+  Selective Entropy Distillation <span class="chip">ICLR 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2402.17316v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2402.17316v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Yaofo Chen, Shuaicheng Niu, Yaowei Wang, Shoukai Xu, Hengjie Song, Mingkui Tan
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  The conventional deep learning paradigm often involves training a deep model
+on a server and then deploying the model or its distilled ones to
+resource-limited edge devices. Usually, the models shall remain fixed once
+deployed (at least for some period) due to the potential high cost of model
+adaptation for both the server and edge sides. However, in many real-world
+scenarios, the test environments may change dynamically (known as distribution
+shifts), which often results in degraded performance. Thus, one has to adapt
+the edge models promptly to attain promising performance. Moreover, with the
+increasing data collected at the edge, this paradigm also fails to further
+adapt the cloud model for better performance. To address these, we encounter
+two primary challenges: 1) the edge model has limited computation power and may
+only support forward propagation; 2) the data transmission budget between cloud
+and edge devices is limited in latency-sensitive scenarios. In this paper, we
+establish a Cloud-Edge Elastic Model Adaptation (CEMA) paradigm in which the
+edge models only need to perform forward propagation and the edge models can be
+adapted online. In our CEMA, to reduce the communication burden, we devise two
+criteria to exclude unnecessary samples from uploading to the cloud, i.e.,
+dynamic unreliable and low-informative sample exclusion. Based on the uploaded
+samples, we update and distribute the affine parameters of normalization layers
+by distilling from the stronger foundation model to the edge model with a
+sample replay strategy. Extensive experimental results on ImageNet-C and
+ImageNet-R verify the effectiveness of our CEMA.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Published in ICLR 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Unveiling the Tapestry of Consistency in Large Vision-Language Models 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.14156v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.14156v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Yuan Zhang, Fei Xiao, Tao Huang, Chun-Kai Fan, Hongyuan Dong, Jiawen Li, Jiacong Wang, Kuan Cheng, Shanghang Zhang, Haoyuan Guo
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Large vision-language models (LVLMs) have recently achieved rapid progress,
+exhibiting great perception and reasoning abilities concerning visual
+information. However, when faced with prompts in different sizes of solution
+spaces, LVLMs fail to always give consistent answers regarding the same
+knowledge point. This inconsistency of answers between different solution
+spaces is prevalent in LVLMs and erodes trust. To this end, we provide a
+multi-modal benchmark ConBench, to intuitively analyze how LVLMs perform when
+the solution space of a prompt revolves around a knowledge point. Based on the
+ConBench tool, we are the first to reveal the tapestry and get the following
+findings: (1) In the discriminate realm, the larger the solution space of the
+prompt, the lower the accuracy of the answers. (2) Establish the relationship
+between the discriminative and generative realms: the accuracy of the
+discriminative question type exhibits a strong positive correlation with its
+Consistency with the caption. (3) Compared to open-source models, closed-source
+models exhibit a pronounced bias advantage in terms of Consistency. Eventually,
+we ameliorate the consistency of LVLMs by trigger-based diagnostic refinement,
+indirectly improving the performance of their caption. We hope this paper will
+accelerate the research community in better evaluating their models and
+encourage future advancements in the consistency domain.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>This project is available at
+  https://github.com/foundation-multimodal-models/ConBench</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Tables as Texts or Images: Evaluating the Table Reasoning Ability of
+  LLMs and MLLMs <span class="chip">ACL 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2402.12424v4">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2402.12424v4.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Naihao Deng, Zhenjie Sun, Ruiqi He, Aman Sikka, Yulong Chen, Lin Ma, Yue Zhang, Rada Mihalcea
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  In this paper, we investigate the effectiveness of various LLMs in
+interpreting tabular data through different prompting strategies and data
+formats. Our analyses extend across six benchmarks for table-related tasks such
+as question-answering and fact-checking. We introduce for the first time the
+assessment of LLMs' performance on image-based table representations.
+Specifically, we compare five text-based and three image-based table
+representations, demonstrating the role of representation and prompting on LLM
+performance. Our study provides insights into the effective use of LLMs on
+table-related tasks.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted to ACL 2024 Findings</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ VALOR-EVAL: Holistic Coverage and Faithfulness Evaluation of Large
+  Vision-Language Models <span class="chip">ACL 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2404.13874v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2404.13874v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Haoyi Qiu, Wenbo Hu, Zi-Yi Dou, Nanyun Peng
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Large Vision-Language Models (LVLMs) suffer from hallucination issues,
+wherein the models generate plausible-sounding but factually incorrect outputs,
+undermining their reliability. A comprehensive quantitative evaluation is
+necessary to identify and understand the extent of hallucinations in these
+models. However, existing benchmarks are often limited in scope, focusing
+mainly on object hallucinations. Furthermore, current evaluation methods
+struggle to effectively address the subtle semantic distinctions between model
+outputs and reference data, as well as the balance between hallucination and
+informativeness. To address these issues, we introduce a multi-dimensional
+benchmark covering objects, attributes, and relations, with challenging images
+selected based on associative biases. Moreover, we propose a large language
+model (LLM)-based two-stage evaluation framework that generalizes the popular
+CHAIR metric and incorporates both faithfulness and coverage into the
+evaluation. Experiments on 10 established LVLMs demonstrate that our evaluation
+metric is more comprehensive and better correlated with humans than existing
+work when evaluating on our challenging human-annotated benchmark dataset. Our
+work also highlights the critical balance between faithfulness and coverage of
+model outputs, and encourages future works to address hallucinations in LVLMs
+while keeping their outputs informative.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>ACL 2024 Findings</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Language Agent Tree Search Unifies Reasoning Acting and Planning in
+  Language Models 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2310.04406v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2310.04406v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Andy Zhou, Kai Yan, Michal Shlapentokh-Rothman, Haohan Wang, Yu-Xiong Wang
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  While language models (LMs) have shown potential across a range of
+decision-making tasks, their reliance on simple acting processes limits their
+broad deployment as autonomous agents. In this paper, we introduce Language
+Agent Tree Search (LATS) -- the first general framework that synergizes the
+capabilities of LMs in reasoning, acting, and planning. By leveraging the
+in-context learning ability of LMs, we integrate Monte Carlo Tree Search into
+LATS to enable LMs as agents, along with LM-powered value functions and
+self-reflections for proficient exploration and enhanced decision-making. A key
+feature of our approach is the incorporation of an environment for external
+feedback, which offers a more deliberate and adaptive problem-solving mechanism
+that surpasses the constraints of existing techniques. Our experimental
+evaluation across diverse domains, including programming, interactive
+question-answering (QA), web navigation, and math, validates the effectiveness
+and generality of LATS in decision-making while maintaining competitive or
+improved reasoning performance. Notably, LATS achieves state-of-the-art pass@1
+accuracy (92.7%) for programming on HumanEval with GPT-4 and demonstrates
+gradient-free performance (average score of 75.9) comparable to gradient-based
+fine-tuning for web navigation on WebShop with GPT-3.5. Code can be found at
+https://github.com/lapisrocks/LanguageAgentTreeSearch
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Code at https://github.com/lapisrocks/LanguageAgentTreeSearch</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Backdoor Attack with Sparse and Invisible Trigger 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2306.06209v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2306.06209v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Yinghua Gao, Yiming Li, Xueluan Gong, Zhifeng Li, Shu-Tao Xia, Qian Wang
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Deep neural networks (DNNs) are vulnerable to backdoor attacks, where the
+adversary manipulates a small portion of training data such that the victim
+model predicts normally on the benign samples but classifies the triggered
+samples as the target class. The backdoor attack is an emerging yet threatening
+training-phase threat, leading to serious risks in DNN-based applications. In
+this paper, we revisit the trigger patterns of existing backdoor attacks. We
+reveal that they are either visible or not sparse and therefore are not
+stealthy enough. More importantly, it is not feasible to simply combine
+existing methods to design an effective sparse and invisible backdoor attack.
+To address this problem, we formulate the trigger generation as a bi-level
+optimization problem with sparsity and invisibility constraints and propose an
+effective method to solve it. The proposed method is dubbed sparse and
+invisible backdoor attack (SIBA). We conduct extensive experiments on benchmark
+datasets under different settings, which verify the effectiveness of our attack
+and its resistance to existing backdoor defenses. The codes for reproducing
+main experiments are available at \url{https://github.com/YinghuaGao/SIBA}.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>This paper was accepted by IEEE Transactions on Information Forensics
+  and Security (TIFS). The first two authors contributed equally to this work.
+  14 pages</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Adapter-X: A Novel General Parameter-Efficient Fine-Tuning Framework for
+  Vision 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.03051v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.03051v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Minglei Li, Peng Ye, Yongqi Huang, Lin Zhang, Tao Chen, Tong He, Jiayuan Fan, Wanli Ouyang
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Parameter-efficient fine-tuning (PEFT) has become increasingly important as
+foundation models continue to grow in both popularity and size. Adapter has
+been particularly well-received due to their potential for parameter reduction
+and adaptability across diverse tasks. However, striking a balance between high
+efficiency and robust generalization across tasks remains a challenge for
+adapter-based methods. We analyze existing methods and find that: 1) parameter
+sharing is the key to reducing redundancy; 2) more tunable parameters, dynamic
+allocation, and block-specific design are keys to improving performance.
+Unfortunately, no previous work considers all these factors. Inspired by this
+insight, we introduce a novel framework named Adapter-X. First, a Sharing
+Mixture of Adapters (SMoA) module is proposed to fulfill token-level dynamic
+allocation, increased tunable parameters, and inter-block sharing at the same
+time. Second, some block-specific designs like Prompt Generator (PG) are
+introduced to further enhance the ability of adaptation. Extensive experiments
+across 2D image and 3D point cloud modalities demonstrate that Adapter-X
+represents a significant milestone as it is the first to outperform full
+fine-tuning in both 2D image and 3D point cloud modalities with significantly
+fewer parameters, i.e., only 0.20% and 1.88% of original trainable parameters
+for 2D and 3D classification tasks. Our code will be publicly available.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                    </div>
+                </details>
+            </article>
+            <article>
+                <details>
+                    <Summary>
+                        Information Retrieval <span class="chip" style="font-size: 60%">20</span>
+                    </Summary>
+                    <div class="details-content">
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ PaCE: Parsimonious Concept Engineering for Large Language Models 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.04331v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.04331v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Jinqi Luo, Tianjiao Ding, Kwan Ho Ryan Chan, Darshan Thaker, Aditya Chattopadhyay, Chris Callison-Burch, René Vidal
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Large Language Models (LLMs) are being used for a wide variety of tasks.
+While they are capable of generating human-like responses, they can also
+produce undesirable output including potentially harmful information, racist or
+sexist language, and hallucinations. Alignment methods are designed to reduce
+such undesirable output, via techniques such as fine-tuning, prompt
+engineering, and representation engineering. However, existing methods face
+several challenges: some require costly fine-tuning for every alignment task;
+some do not adequately remove undesirable concepts, failing alignment; some
+remove benign concepts, lowering the linguistic capabilities of LLMs. To
+address these issues, we propose Parsimonious Concept Engineering (PaCE), a
+novel activation engineering framework for alignment. First, to sufficiently
+model the concepts, we construct a large-scale concept dictionary in the
+activation space, in which each atom corresponds to a semantic concept. Then,
+given any alignment task, we instruct a concept partitioner to efficiently
+annotate the concepts as benign or undesirable. Finally, at inference time, we
+decompose the LLM activations along the concept dictionary via sparse coding,
+to accurately represent the activation as a linear combination of the benign
+and undesirable components. By removing the latter ones from the activation, we
+reorient the behavior of LLMs towards alignment goals. We conduct experiments
+on tasks such as response detoxification, faithfulness enhancement, and
+sentiment revising, and show that PaCE achieves state-of-the-art alignment
+performance while maintaining linguistic capabilities.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>26 pages, 17 figures, 5 tables, dataset and code at
+  https://github.com/peterljq/Parsimonious-Concept-Engineering</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Measuring and Addressing Indexical Bias in Information Retrieval <span class="chip">ACL 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.04298v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.04298v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Caleb Ziems, William Held, Jane Dwivedi-Yu, Diyi Yang
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Information Retrieval (IR) systems are designed to deliver relevant content,
+but traditional systems may not optimize rankings for fairness, neutrality, or
+the balance of ideas. Consequently, IR can often introduce indexical biases, or
+biases in the positional order of documents. Although indexical bias can
+demonstrably affect people's opinion, voting patterns, and other behaviors,
+these issues remain understudied as the field lacks reliable metrics and
+procedures for automatically measuring indexical bias. Towards this end, we
+introduce the PAIR framework, which supports automatic bias audits for ranked
+documents or entire IR systems. After introducing DUO, the first
+general-purpose automatic bias metric, we run an extensive evaluation of 8 IR
+systems on a new corpus of 32k synthetic and 4.7k natural documents, with 4k
+queries spanning 1.4k controversial issue topics. A human behavioral study
+validates our approach, showing that our bias metric can help predict when and
+how indexical bias will shift a reader's opinion.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>ACL 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ VISTA: Visualized Text Embedding For Universal Multi-Modal Retrieval <span class="chip">ACL 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.04292v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.04292v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Junjie Zhou, Zheng Liu, Shitao Xiao, Bo Zhao, Yongping Xiong
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Multi-modal retrieval becomes increasingly popular in practice. However, the
+existing retrievers are mostly text-oriented, which lack the capability to
+process visual information. Despite the presence of vision-language models like
+CLIP, the current methods are severely limited in representing the text-only
+and image-only data. In this work, we present a new embedding model VISTA for
+universal multi-modal retrieval. Our work brings forth threefold technical
+contributions. Firstly, we introduce a flexible architecture which extends a
+powerful text encoder with the image understanding capability by introducing
+visual token embeddings. Secondly, we develop two data generation strategies,
+which bring high-quality composed image-text to facilitate the training of the
+embedding model. Thirdly, we introduce a multi-stage training algorithm, which
+first aligns the visual token embedding with the text encoder using massive
+weakly labeled data, and then develops multi-modal representation capability
+using the generated composed image-text data. In our experiments, VISTA
+achieves superior performances across a variety of multi-modal retrieval tasks
+in both zero-shot and supervised settings. Our model, data, and source code are
+available at https://github.com/FlagOpen/FlagEmbedding.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted to ACL 2024 main conference</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Data Measurements for Decentralized Data Markets 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.04257v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.04257v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Charles Lu, Mohammad Mohammadi Amiri, Ramesh Raskar
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Decentralized data markets can provide more equitable forms of data
+acquisition for machine learning. However, to realize practical marketplaces,
+efficient techniques for seller selection need to be developed. We propose and
+benchmark federated data measurements to allow a data buyer to find sellers
+with relevant and diverse datasets. Diversity and relevance measures enable a
+buyer to make relative comparisons between sellers without requiring
+intermediate brokers and training task-dependent models.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>20 pages, 11 figures</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ On The Persona-based Summarization of Domain-Specific Documents 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.03986v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.03986v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Ankan Mullick, Sombit Bose, Rounak Saha, Ayan Kumar Bhowmick, Pawan Goyal, Niloy Ganguly, Prasenjit Dey, Ravi Kokku
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  In an ever-expanding world of domain-specific knowledge, the increasing
+complexity of consuming, and storing information necessitates the generation of
+summaries from large information repositories. However, every persona of a
+domain has different requirements of information and hence their summarization.
+For example, in the healthcare domain, a persona-based (such as Doctor, Nurse,
+Patient etc.) approach is imperative to deliver targeted medical information
+efficiently. Persona-based summarization of domain-specific information by
+humans is a high cognitive load task and is generally not preferred. The
+summaries generated by two different humans have high variability and do not
+scale in cost and subject matter expertise as domains and personas grow.
+Further, AI-generated summaries using generic Large Language Models (LLMs) may
+not necessarily offer satisfactory accuracy for different domains unless they
+have been specifically trained on domain-specific data and can also be very
+expensive to use in day-to-day operations. Our contribution in this paper is
+two-fold: 1) We present an approach to efficiently fine-tune a domain-specific
+small foundation LLM using a healthcare corpus and also show that we can
+effectively evaluate the summarization quality using AI-based critiquing. 2) We
+further show that AI-based critiquing has good concordance with Human-based
+critiquing of the summaries. Hence, such AI-based pipelines to generate
+domain-specific persona-based summaries can be easily scaled to other domains
+such as legal, enterprise documents, education etc. in a very efficient and
+cost-effective manner.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Beyond Similarity: Personalized Federated Recommendation with Composite
+  Aggregation 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.03933v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.03933v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Honglei Zhang, Haoxuan Li, Jundong Chen, Sen Cui, Kunda Yan, Abudukelimu Wuerkaixi, Xin Zhou, Zhiqi Shen, Yidong Li
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Federated recommendation aims to collect global knowledge by aggregating
+local models from massive devices, to provide recommendations while ensuring
+privacy. Current methods mainly leverage aggregation functions invented by
+federated vision community to aggregate parameters from similar clients, e.g.,
+clustering aggregation. Despite considerable performance, we argue that it is
+suboptimal to apply them to federated recommendation directly. This is mainly
+reflected in the disparate model architectures. Different from structured
+parameters like convolutional neural networks in federated vision, federated
+recommender models usually distinguish itself by employing one-to-one item
+embedding table. Such a discrepancy induces the challenging embedding skew
+issue, which continually updates the trained embeddings but ignores the
+non-trained ones during aggregation, thus failing to predict future items
+accurately. To this end, we propose a personalized Federated recommendation
+model with Composite Aggregation (FedCA), which not only aggregates similar
+clients to enhance trained embeddings, but also aggregates complementary
+clients to update non-trained embeddings. Besides, we formulate the overall
+learning process into a unified optimization algorithm to jointly learn the
+similarity and complementarity. Extensive experiments on several real-world
+datasets substantiate the effectiveness of our proposed model. The source codes
+are available at https://github.com/hongleizhang/FedCA.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Polyhedral Conic Classifier for CTR Prediction 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.03892v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.03892v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Beyza Turkmen, Ramazan Tarik Turksoy, Hasan Saribas, Hakan Cevikalp
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  This paper introduces a novel approach for click-through rate (CTR)
+prediction within industrial recommender systems, addressing the inherent
+challenges of numerical imbalance and geometric asymmetry. These challenges
+stem from imbalanced datasets, where positive (click) instances occur less
+frequently than negatives (non-clicks), and geometrically asymmetric
+distributions, where positive samples exhibit visually coherent patterns while
+negatives demonstrate greater diversity. To address these challenges, we have
+used a deep neural network classifier that uses the polyhedral conic functions.
+This classifier is similar to the one-class classifiers in spirit and it
+returns compact polyhedral acceptance regions to separate the positive class
+samples from the negative samples that have diverse distributions. Extensive
+experiments have been conducted to test the proposed approach using
+state-of-the-art (SOTA) CTR prediction models on four public datasets, namely
+Criteo, Avazu, MovieLens and Frappe. The experimental evaluations highlight the
+superiority of our proposed approach over Binary Cross Entropy (BCE) Loss,
+which is widely used in CTR prediction tasks.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Reducing the climate impact of data portals: a case study 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.03858v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.03858v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Noah Gießing, Madhurima Deb, Ankit Satpute, Moritz Schubotz, Olaf Teschke
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  The carbon footprint share of the information and communication technology
+(ICT) sector has steadily increased in the past decade and is predicted to make
+up as much as 23 \% of global emissions in 2030. This shows a pressing need for
+developers, including the information retrieval community, to make their code
+more energy-efficient. In this project proposal, we discuss techniques to
+reduce the energy footprint of the MaRDI (Mathematical Research Data
+Initiative) Portal, a MediaWiki-based knowledge base. In future work, we plan
+to implement these changes and provide concrete measurements on the gain in
+energy efficiency. Researchers developing similar knowledge bases can adapt our
+measures to reduce their environmental footprint. In this way, we are working
+on mitigating the climate impact of Information Retrieval research.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>4 pages</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ XL-HeadTags: Leveraging Multimodal Retrieval Augmentation for the
+  Multilingual Generation of News Headlines and Tags <span class="chip">ACL 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.03776v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.03776v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Faisal Tareque Shohan, Mir Tafseer Nayeem, Samsul Islam, Abu Ubaida Akash, Shafiq Joty
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Millions of news articles published online daily can overwhelm readers.
+Headlines and entity (topic) tags are essential for guiding readers to decide
+if the content is worth their time. While headline generation has been
+extensively studied, tag generation remains largely unexplored, yet it offers
+readers better access to topics of interest. The need for conciseness in
+capturing readers' attention necessitates improved content selection strategies
+for identifying salient and relevant segments within lengthy articles, thereby
+guiding language models effectively. To address this, we propose to leverage
+auxiliary information such as images and captions embedded in the articles to
+retrieve relevant sentences and utilize instruction tuning with variations to
+generate both headlines and tags for news articles in a multilingual context.
+To make use of the auxiliary information, we have compiled a dataset named
+XL-HeadTags, which includes 20 languages across 6 diverse language families.
+Through extensive evaluation, we demonstrate the effectiveness of our
+plug-and-play multimodal-multilingual retrievers for both tasks. Additionally,
+we have developed a suite of tools for processing and evaluating multilingual
+texts, significantly contributing to the research community by enabling more
+accurate and efficient analysis across languages.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>ACL 2024 camera ready</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Attribute-Aware Implicit Modality Alignment for Text Attribute Person
+  Search 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.03721v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.03721v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Xin Wang, Fangfang Liu, Zheng Li, Caili Guo
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Text attribute person search aims to find specific pedestrians through given
+textual attributes, which is very meaningful in the scene of searching for
+designated pedestrians through witness descriptions. The key challenge is the
+significant modality gap between textual attributes and images. Previous
+methods focused on achieving explicit representation and alignment through
+unimodal pre-trained models. Nevertheless, the absence of inter-modality
+correspondence in these models may lead to distortions in the local information
+of intra-modality. Moreover, these methods only considered the alignment of
+inter-modality and ignored the differences between different attribute
+categories. To mitigate the above problems, we propose an Attribute-Aware
+Implicit Modality Alignment (AIMA) framework to learn the correspondence of
+local representations between textual attributes and images and combine global
+representation matching to narrow the modality gap. Firstly, we introduce the
+CLIP model as the backbone and design prompt templates to transform attribute
+combinations into structured sentences. This facilitates the model's ability to
+better understand and match image details. Next, we design a Masked Attribute
+Prediction (MAP) module that predicts the masked attributes after the
+interaction of image and masked textual attribute features through multi-modal
+interaction, thereby achieving implicit local relationship alignment. Finally,
+we propose an Attribute-IoU Guided Intra-Modal Contrastive (A-IoU IMC) loss,
+aligning the distribution of different textual attributes in the embedding
+space with their IoU distribution, achieving better semantic arrangement.
+Extensive experiments on the Market-1501 Attribute, PETA, and PA100K datasets
+show that the performance of our proposed method significantly surpasses the
+current state-of-the-art methods.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ ListT5: Listwise Reranking with Fusion-in-Decoder Improves Zero-shot
+  Retrieval <span class="chip">ACL 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2402.15838v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2402.15838v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Soyoung Yoon, Eunbi Choi, Jiyeon Kim, Hyeongu Yun, Yireun Kim, Seung-won Hwang
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  We propose ListT5, a novel reranking approach based on Fusion-in-Decoder
+(FiD) that handles multiple candidate passages at both train and inference
+time. We also introduce an efficient inference framework for listwise ranking
+based on m-ary tournament sort with output caching. We evaluate and compare our
+model on the BEIR benchmark for zero-shot retrieval task, demonstrating that
+ListT5 (1) outperforms the state-of-the-art RankT5 baseline with a notable +1.3
+gain in the average NDCG@10 score, (2) has an efficiency comparable to
+pointwise ranking models and surpasses the efficiency of previous listwise
+ranking models, and (3) overcomes the lost-in-the-middle problem of previous
+listwise rerankers. Our code, model checkpoints, and the evaluation framework
+are fully open-sourced at \url{https://github.com/soyoung97/ListT5}.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted to ACL 2024 main (long)</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Leveraging Codebook Knowledge with NLI and Chat<span class="highlight-title">GPT</span> for Zero-Shot
+  Political Relation Classification <span class="chip">ACL 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2308.07876v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2308.07876v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Yibo Hu, Erick Skorupa Parolin, Latifur Khan, Patrick T. Brandt, Javier Osorio, Vito J. D'Orazio
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Is it possible accurately classify political relations within evolving event
+ontologies without extensive annotations? This study investigates zero-shot
+learning methods that use expert knowledge from existing annotation codebook,
+and evaluates the performance of advanced ChatGPT (GPT-3.5/4) and a natural
+language inference (NLI)-based model called ZSP. ChatGPT uses codebook's
+labeled summaries as prompts, whereas ZSP breaks down the classification task
+into context, event mode, and class disambiguation to refine task-specific
+hypotheses. This decomposition enhances interpretability, efficiency, and
+adaptability to schema changes. The experiments reveal ChatGPT's strengths and
+limitations, and crucially show ZSP's outperformance of dictionary-based
+methods and its competitive edge over some supervised models. These findings
+affirm the value of ZSP for validating event records and advancing ontology
+development. Our study underscores the efficacy of leveraging transfer learning
+and existing domain expertise to enhance research efficiency and scalability.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>ACL 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ BadRAG: Identifying Vulnerabilities in Retrieval Augmented Generation of
+  Large Language Models 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.00083v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.00083v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Jiaqi Xue, Mengxin Zheng, Yebowen Hu, Fei Liu, Xun Chen, Qian Lou
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Large Language Models (LLMs) are constrained by outdated information and a
+tendency to generate incorrect data, commonly referred to as "hallucinations."
+Retrieval-Augmented Generation (RAG) addresses these limitations by combining
+the strengths of retrieval-based methods and generative models. This approach
+involves retrieving relevant information from a large, up-to-date dataset and
+using it to enhance the generation process, leading to more accurate and
+contextually appropriate responses. Despite its benefits, RAG introduces a new
+attack surface for LLMs, particularly because RAG databases are often sourced
+from public data, such as the web. In this paper, we propose \TrojRAG{} to
+identify the vulnerabilities and attacks on retrieval parts (RAG database) and
+their indirect attacks on generative parts (LLMs). Specifically, we identify
+that poisoning several customized content passages could achieve a retrieval
+backdoor, where the retrieval works well for clean queries but always returns
+customized poisoned adversarial queries. Triggers and poisoned passages can be
+highly customized to implement various attacks. For example, a trigger could be
+a semantic group like "The Republican Party, Donald Trump, etc." Adversarial
+passages can be tailored to different contents, not only linked to the triggers
+but also used to indirectly attack generative LLMs without modifying them.
+These attacks can include denial-of-service attacks on RAG and semantic
+steering attacks on LLM generations conditioned by the triggers. Our
+experiments demonstrate that by just poisoning 10 adversarial passages can
+induce 98.2\% success rate to retrieve the adversarial passages. Then, these
+passages can increase the reject ratio of RAG-based GPT-4 from 0.01\% to 74.6\%
+or increase the rate of negative responses from 0.22\% to 72\% for targeted
+queries.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ CoopHash: Cooperative Learning of Multipurpose Descriptor and
+  Contrastive Pair Generator via Variational MCMC Teaching for Supervised Image
+  Hashing 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2210.04288v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2210.04288v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Khoa D. Doan, Jianwen Xie, Yaxuan Zhu, Yang Zhao, Ping Li
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Leveraging supervised information can lead to superior retrieval performance
+in the image hashing domain but the performance degrades significantly without
+enough labeled data. One effective solution to boost performance is to employ
+generative models, such as Generative Adversarial Networks (GANs), to generate
+synthetic data in an image hashing model. However, GAN-based methods are
+difficult to train, which prevents the hashing approaches from jointly training
+the generative models and the hash functions. This limitation results in
+sub-optimal retrieval performance. To overcome this limitation, we propose a
+novel framework, the generative cooperative hashing network, which is based on
+energy-based cooperative learning. This framework jointly learns a powerful
+generative representation of the data and a robust hash function via two
+components: a top-down contrastive pair generator that synthesizes contrastive
+images and a bottom-up multipurpose descriptor that simultaneously represents
+the images from multiple perspectives, including probability density, hash
+code, latent code, and category. The two components are jointly learned via a
+novel likelihood-based cooperative learning scheme. We conduct experiments on
+several real-world datasets and show that the proposed method outperforms the
+competing hashing supervised methods, achieving up to 10\% relative improvement
+over the current state-of-the-art supervised hashing methods, and exhibits a
+significantly better performance in out-of-distribution retrieval.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ On the Embedding Collapse when Scaling up Recommendation Models <span class="chip">ICML 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2310.04400v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2310.04400v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Xingzhuo Guo, Junwei Pan, Ximei Wang, Baixu Chen, Jie Jiang, Mingsheng Long
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Recent advances in foundation models have led to a promising trend of
+developing large recommendation models to leverage vast amounts of available
+data. Still, mainstream models remain embarrassingly small in size and na\"ive
+enlarging does not lead to sufficient performance gain, suggesting a deficiency
+in the model scalability. In this paper, we identify the embedding collapse
+phenomenon as the inhibition of scalability, wherein the embedding matrix tends
+to occupy a low-dimensional subspace. Through empirical and theoretical
+analysis, we demonstrate a \emph{two-sided effect} of feature interaction
+specific to recommendation models. On the one hand, interacting with collapsed
+embeddings restricts embedding learning and exacerbates the collapse issue. On
+the other hand, interaction is crucial in mitigating the fitting of spurious
+features as a scalability guarantee. Based on our analysis, we propose a simple
+yet effective multi-embedding design incorporating embedding-set-specific
+interaction modules to learn embedding sets with large diversity and thus
+reduce collapse. Extensive experiments demonstrate that this proposed design
+provides consistent scalability and effective collapse mitigation for various
+recommendation models. Code is available at this repository:
+https://github.com/thuml/Multi-Embedding.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>ICML 2024 Accepted</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Deep Learning Based Named Entity Recognition Models for Recipes <span class="chip">LREC</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2402.17447v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2402.17447v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Mansi Goel, Ayush Agarwal, Shubham Agrawal, Janak Kapuriya, Akhil Vamshi Konam, Rishabh Gupta, Shrey Rastogi,  Niharika, Ganesh Bagler
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Food touches our lives through various endeavors, including flavor,
+nourishment, health, and sustainability. Recipes are cultural capsules
+transmitted across generations via unstructured text. Automated protocols for
+recognizing named entities, the building blocks of recipe text, are of immense
+value for various applications ranging from information extraction to novel
+recipe generation. Named entity recognition is a technique for extracting
+information from unstructured or semi-structured data with known labels.
+Starting with manually-annotated data of 6,611 ingredient phrases, we created
+an augmented dataset of 26,445 phrases cumulatively. Simultaneously, we
+systematically cleaned and analyzed ingredient phrases from RecipeDB, the
+gold-standard recipe data repository, and annotated them using the Stanford
+NER. Based on the analysis, we sampled a subset of 88,526 phrases using a
+clustering-based approach while preserving the diversity to create the
+machine-annotated dataset. A thorough investigation of NER approaches on these
+three datasets involving statistical, fine-tuning of deep learning-based
+language models and few-shot prompting on large language models (LLMs) provides
+deep insights. We conclude that few-shot prompting on LLMs has abysmal
+performance, whereas the fine-tuned spaCy-transformer emerges as the best model
+with macro-F1 scores of 95.9%, 96.04%, and 95.71% for the manually-annotated,
+augmented, and machine-annotated datasets, respectively.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>13 pages, 6 main figures and 2 in appendices, and 3 main tables;
+  Accepted for publication in LREC-COLING 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Med<span class="highlight-title">Prompt</span>Extract (Medical Data Extraction Tool): Anonymization and
+  Hi-fidelity Automated data extraction using NLP and <span class="highlight-title">prompt</span> engineering <span class="chip">CIKM 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.02664v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.02664v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Roomani Srivastava, Suraj Prasad, Lipika Bhat, Sarvesh Deshpande, Barnali Das, Kshitij Jadhav
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  A major roadblock in the seamless digitization of medical records remains the
+lack of interoperability of existing records. Extracting relevant medical
+information required for further treatment planning or even research is a time
+consuming labour intensive task involving expenditure of valuable time of
+doctors. In this demo paper we present, MedPromptExtract an automated tool
+using a combination of semi supervised learning, large language models, natural
+language processing and prompt engineering to convert unstructured medical
+records to structured data which is amenable for further analysis.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>4 pages, 3 figures, pre-print sumitted to CIKM 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Spiral of Silences: How is Large Language Model Killing Information
+  Retrieval? -- A Case Study on Open Domain Question Answering <span class="chip">ACL2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2404.10496v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2404.10496v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Xiaoyang Chen, Ben He, Hongyu Lin, Xianpei Han, Tianshu Wang, Boxi Cao, Le Sun, Yingfei Sun
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  The practice of Retrieval-Augmented Generation (RAG), which integrates Large
+Language Models (LLMs) with retrieval systems, has become increasingly
+prevalent. However, the repercussions of LLM-derived content infiltrating the
+web and influencing the retrieval-generation feedback loop are largely
+uncharted territories. In this study, we construct and iteratively run a
+simulation pipeline to deeply investigate the short-term and long-term effects
+of LLM text on RAG systems. Taking the trending Open Domain Question Answering
+(ODQA) task as a point of entry, our findings reveal a potential digital
+"Spiral of Silence" effect, with LLM-generated text consistently outperforming
+human-authored content in search rankings, thereby diminishing the presence and
+impact of human contributions online. This trend risks creating an imbalanced
+information ecosystem, where the unchecked proliferation of erroneous
+LLM-generated content may result in the marginalization of accurate
+information. We urge the academic community to take heed of this potential
+issue, ensuring a diverse and authentic digital information landscape.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted to ACL2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Large Language Models as Evaluators for Recommendation Explanations 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.03248v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.03248v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Xiaoyu Zhang, Yishan Li, Jiayin Wang, Bowen Sun, Weizhi Ma, Peijie Sun, Min Zhang
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  The explainability of recommender systems has attracted significant attention
+in academia and industry. Many efforts have been made for explainable
+recommendations, yet evaluating the quality of the explanations remains a
+challenging and unresolved issue. In recent years, leveraging LLMs as
+evaluators presents a promising avenue in Natural Language Processing tasks
+(e.g., sentiment classification, information extraction), as they perform
+strong capabilities in instruction following and common-sense reasoning.
+However, evaluating recommendation explanatory texts is different from these
+NLG tasks, as its criteria are related to human perceptions and are usually
+subjective. In this paper, we investigate whether LLMs can serve as evaluators
+of recommendation explanations. To answer the question, we utilize real user
+feedback on explanations given from previous work and additionally collect
+third-party annotations and LLM evaluations. We design and apply a 3-level meta
+evaluation strategy to measure the correlation between evaluator labels and the
+ground truth provided by users. Our experiments reveal that LLMs, such as GPT4,
+can provide comparable evaluations with appropriate prompts and settings. We
+also provide further insights into combining human labels with the LLM
+evaluation process and utilizing ensembles of multiple heterogeneous LLM
+evaluators to enhance the accuracy and stability of evaluations. Our study
+verifies that utilizing LLMs as evaluators can be an accurate, reproducible and
+cost-effective solution for evaluating recommendation explanation texts. Our
+code is available at https://github.com/Xiaoyu-SZ/LLMasEvaluator.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Is Table Retrieval a Solved Problem? Exploring Join-Aware Multi-Table
+  Retrieval <span class="chip">ACL 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2404.09889v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2404.09889v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Peter Baile Chen, Yi Zhang, Dan Roth
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Retrieving relevant tables containing the necessary information to accurately
+answer a given question over tables is critical to open-domain
+question-answering (QA) systems. Previous methods assume the answer to such a
+question can be found either in a single table or multiple tables identified
+through question decomposition or rewriting. However, neither of these
+approaches is sufficient, as many questions require retrieving multiple tables
+and joining them through a join plan that cannot be discerned from the user
+query itself. If the join plan is not considered in the retrieval stage, the
+subsequent steps of reasoning and answering based on those retrieved tables are
+likely to be incorrect. To address this problem, we introduce a method that
+uncovers useful join relations for any query and database during table
+retrieval. We use a novel re-ranking method formulated as a mixed-integer
+program that considers not only table-query relevance but also table-table
+relevance that requires inferring join relationships. Our method outperforms
+the state-of-the-art approaches for table retrieval by up to 9.3% in F1 score
+and for end-to-end QA by up to 5.4% in accuracy.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>ACL 2024 camera ready</span>
+                                        </div>
+                                </details>
+                            </article>
+                    </div>
+                </details>
+            </article>
+            <article>
+                <details>
+                    <Summary>
+                        Machine Learning <span class="chip" style="font-size: 60%">150</span>
+                    </Summary>
+                    <div class="details-content">
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Verbalized Machine Learning: Revisiting Machine Learning with Language
+  Models 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.04344v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.04344v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Tim Z. Xiao, Robert Bamler, Bernhard Schölkopf, Weiyang Liu
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Motivated by the large progress made by large language models (LLMs), we
+introduce the framework of verbalized machine learning (VML). In contrast to
+conventional machine learning models that are typically optimized over a
+continuous parameter space, VML constrains the parameter space to be
+human-interpretable natural language. Such a constraint leads to a new
+perspective of function approximation, where an LLM with a text prompt can be
+viewed as a function parameterized by the text prompt. Guided by this
+perspective, we revisit classical machine learning problems, such as regression
+and classification, and find that these problems can be solved by an
+LLM-parameterized learner and optimizer. The major advantages of VML include
+(1) easy encoding of inductive bias: prior knowledge about the problem and
+hypothesis class can be encoded in natural language and fed into the
+LLM-parameterized learner; (2) automatic model class selection: the optimizer
+can automatically select a concrete model class based on data and verbalized
+prior knowledge, and it can update the model class during training; and (3)
+interpretable learner updates: the LLM-parameterized optimizer can provide
+explanations for why each learner update is performed. We conduct several
+studies to empirically evaluate the effectiveness of VML, and hope that VML can
+serve as a stepping stone to stronger interpretability and trustworthiness in
+ML.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Technical Report v1 (92 pages, 15 figures)</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ On the Expressive Power of Spectral Invariant Graph Neural Networks <span class="chip">ICML 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.04336v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.04336v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Bohang Zhang, Lingxiao Zhao, Haggai Maron
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Incorporating spectral information to enhance Graph Neural Networks (GNNs)
+has shown promising results but raises a fundamental challenge due to the
+inherent ambiguity of eigenvectors. Various architectures have been proposed to
+address this ambiguity, referred to as spectral invariant architectures.
+Notable examples include GNNs and Graph Transformers that use spectral
+distances, spectral projection matrices, or other invariant spectral features.
+However, the potential expressive power of these spectral invariant
+architectures remains largely unclear. The goal of this work is to gain a deep
+theoretical understanding of the expressive power obtainable when using
+spectral features. We first introduce a unified message-passing framework for
+designing spectral invariant GNNs, called Eigenspace Projection GNN (EPNN). A
+comprehensive analysis shows that EPNN essentially unifies all prior spectral
+invariant architectures, in that they are either strictly less expressive or
+equivalent to EPNN. A fine-grained expressiveness hierarchy among different
+architectures is also established. On the other hand, we prove that EPNN itself
+is bounded by a recently proposed class of Subgraph GNNs, implying that all
+these spectral invariant architectures are strictly less expressive than 3-WL.
+Finally, we discuss whether using spectral features can gain additional
+expressiveness when combined with more expressive GNNs.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>31 pages; 3 figures; to appear in ICML 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Coarse-To-Fine Tensor Trains for Compact Visual Representations 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.04332v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.04332v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Sebastian Loeschcke, Dan Wang, Christian Leth-Espensen, Serge Belongie, Michael J. Kastoryano, Sagie Benaim
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  The ability to learn compact, high-quality, and easy-to-optimize
+representations for visual data is paramount to many applications such as novel
+view synthesis and 3D reconstruction. Recent work has shown substantial success
+in using tensor networks to design such compact and high-quality
+representations. However, the ability to optimize tensor-based representations,
+and in particular, the highly compact tensor train representation, is still
+lacking. This has prevented practitioners from deploying the full potential of
+tensor networks for visual data. To this end, we propose 'Prolongation
+Upsampling Tensor Train (PuTT)', a novel method for learning tensor train
+representations in a coarse-to-fine manner. Our method involves the prolonging
+or `upsampling' of a learned tensor train representation, creating a sequence
+of 'coarse-to-fine' tensor trains that are incrementally refined. We evaluate
+our representation along three axes: (1). compression, (2). denoising
+capability, and (3). image completion capability. To assess these axes, we
+consider the tasks of image fitting, 3D fitting, and novel view synthesis,
+where our method shows an improved performance compared to state-of-the-art
+tensor-based methods. For full results see our project webpage:
+https://sebulo.github.io/PuTT_website/
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Project webpage: https://sebulo.github.io/PuTT_website/</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Simplified and Generalized Masked Diffusion for Discrete Data 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.04329v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.04329v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Jiaxin Shi, Kehang Han, Zhe Wang, Arnaud Doucet, Michalis K. Titsias
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Masked (or absorbing) diffusion is actively explored as an alternative to
+autoregressive models for generative modeling of discrete data. However,
+existing work in this area has been hindered by unnecessarily complex model
+formulations and unclear relationships between different perspectives, leading
+to suboptimal parameterization, training objectives, and ad hoc adjustments to
+counteract these issues. In this work, we aim to provide a simple and general
+framework that unlocks the full potential of masked diffusion models. We show
+that the continuous-time variational objective of masked diffusion models is a
+simple weighted integral of cross-entropy losses. Our framework also enables
+training generalized masked diffusion models with state-dependent masking
+schedules. When evaluated by perplexity, our models trained on OpenWebText
+surpass prior diffusion language models at GPT-2 scale and demonstrate superior
+performance on 4 out of 5 zero-shot language modeling tasks. Furthermore, our
+models vastly outperform previous discrete diffusion models on pixel-level
+image modeling, achieving 2.78~(CIFAR-10) and 3.42 (ImageNet 64$\times$64) bits
+per dimension that are comparable or better than autoregressive models of
+similar sizes.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ PaCE: Parsimonious Concept Engineering for Large Language Models 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.04331v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.04331v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Jinqi Luo, Tianjiao Ding, Kwan Ho Ryan Chan, Darshan Thaker, Aditya Chattopadhyay, Chris Callison-Burch, René Vidal
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Large Language Models (LLMs) are being used for a wide variety of tasks.
+While they are capable of generating human-like responses, they can also
+produce undesirable output including potentially harmful information, racist or
+sexist language, and hallucinations. Alignment methods are designed to reduce
+such undesirable output, via techniques such as fine-tuning, prompt
+engineering, and representation engineering. However, existing methods face
+several challenges: some require costly fine-tuning for every alignment task;
+some do not adequately remove undesirable concepts, failing alignment; some
+remove benign concepts, lowering the linguistic capabilities of LLMs. To
+address these issues, we propose Parsimonious Concept Engineering (PaCE), a
+novel activation engineering framework for alignment. First, to sufficiently
+model the concepts, we construct a large-scale concept dictionary in the
+activation space, in which each atom corresponds to a semantic concept. Then,
+given any alignment task, we instruct a concept partitioner to efficiently
+annotate the concepts as benign or undesirable. Finally, at inference time, we
+decompose the LLM activations along the concept dictionary via sparse coding,
+to accurately represent the activation as a linear combination of the benign
+and undesirable components. By removing the latter ones from the activation, we
+reorient the behavior of LLMs towards alignment goals. We conduct experiments
+on tasks such as response detoxification, faithfulness enhancement, and
+sentiment revising, and show that PaCE achieves state-of-the-art alignment
+performance while maintaining linguistic capabilities.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>26 pages, 17 figures, 5 tables, dataset and code at
+  https://github.com/peterljq/Parsimonious-Concept-Engineering</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Causal Estimation of Memorisation Profiles <span class="chip">ACL 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.04327v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.04327v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Pietro Lesci, Clara Meister, Thomas Hofmann, Andreas Vlachos, Tiago Pimentel
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Understanding memorisation in language models has practical and societal
+implications, e.g., studying models' training dynamics or preventing copyright
+infringements. Prior work defines memorisation as the causal effect of training
+with an instance on the model's ability to predict that instance. This
+definition relies on a counterfactual: the ability to observe what would have
+happened had the model not seen that instance. Existing methods struggle to
+provide computationally efficient and accurate estimates of this
+counterfactual. Further, they often estimate memorisation for a model
+architecture rather than for a specific model instance. This paper fills an
+important gap in the literature, proposing a new, principled, and efficient
+method to estimate memorisation based on the difference-in-differences design
+from econometrics. Using this method, we characterise a model's memorisation
+profile--its memorisation trends across training--by only observing its
+behaviour on a small set of instances throughout training. In experiments with
+the Pythia model suite, we find that memorisation (i) is stronger and more
+persistent in larger models, (ii) is determined by data order and learning
+rate, and (iii) has stable trends across model sizes, thus making memorisation
+in larger models predictable from smaller ones.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Published at the ACL 2024 Conference (main)</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ The Brain's Bitter Lesson: Scaling Speech Decoding With <span class="highlight-title">Self-Supervised</span>
+  Learning 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.04328v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.04328v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Dulhan Jayalath, Gilad Landau, Brendan Shillingford, Mark Woolrich, Oiwi Parker Jones
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  The past few years have produced a series of spectacular advances in the
+decoding of speech from brain activity. The engine of these advances has been
+the acquisition of labelled data, with increasingly large datasets acquired
+from single subjects. However, participants exhibit anatomical and other
+individual differences, and datasets use varied scanners and task designs. As a
+result, prior work has struggled to leverage data from multiple subjects,
+multiple datasets, multiple tasks, and unlabelled datasets. In turn, the field
+has not benefited from the rapidly growing number of open neural data
+repositories to exploit large-scale data and deep learning. To address this, we
+develop an initial set of neuroscience-inspired self-supervised objectives,
+together with a neural architecture, for representation learning from
+heterogeneous and unlabelled neural recordings. Experimental results show that
+representations learned with these objectives generalise across subjects,
+datasets, and tasks, and are also learned faster than using only labelled data.
+In addition, we set new benchmarks for two foundational speech decoding tasks.
+Taken together, these methods now unlock the potential for training speech
+decoding models with orders of magnitude more existing data.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>10 pages, 4 figures, under review</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ ATraDiff: Accelerating Online Reinforcement Learning with Imaginary
+  Trajectories <span class="chip">ICML 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.04323v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.04323v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Qianlan Yang, Yu-Xiong Wang
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Training autonomous agents with sparse rewards is a long-standing problem in
+online reinforcement learning (RL), due to low data efficiency. Prior work
+overcomes this challenge by extracting useful knowledge from offline data,
+often accomplished through the learning of action distribution from offline
+data and utilizing the learned distribution to facilitate online RL. However,
+since the offline data are given and fixed, the extracted knowledge is
+inherently limited, making it difficult to generalize to new tasks. We propose
+a novel approach that leverages offline data to learn a generative diffusion
+model, coined as Adaptive Trajectory Diffuser (ATraDiff). This model generates
+synthetic trajectories, serving as a form of data augmentation and consequently
+enhancing the performance of online RL methods. The key strength of our
+diffuser lies in its adaptability, allowing it to effectively handle varying
+trajectory lengths and mitigate distribution shifts between online and offline
+data. Because of its simplicity, ATraDiff seamlessly integrates with a wide
+spectrum of RL methods. Empirical evaluation shows that ATraDiff consistently
+achieves state-of-the-art performance across a variety of environments, with
+particularly pronounced improvements in complicated settings. Our code and demo
+video are available at https://atradiff.github.io .
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>ICML 2024 Accepted</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ VidMuse: A Simple Video-to-Music Generation Framework with
+  Long-Short-Term Modeling 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.04321v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.04321v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Zeyue Tian, Zhaoyang Liu, Ruibin Yuan, Jiahao Pan, Xiaoqiang Huang, Qifeng Liu, Xu Tan, Qifeng Chen, Wei Xue, Yike Guo
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  In this work, we systematically study music generation conditioned solely on
+the video. First, we present a large-scale dataset comprising 190K video-music
+pairs, including various genres such as movie trailers, advertisements, and
+documentaries. Furthermore, we propose VidMuse, a simple framework for
+generating music aligned with video inputs. VidMuse stands out by producing
+high-fidelity music that is both acoustically and semantically aligned with the
+video. By incorporating local and global visual cues, VidMuse enables the
+creation of musically coherent audio tracks that consistently match the video
+content through Long-Short-Term modeling. Through extensive experiments,
+VidMuse outperforms existing models in terms of audio quality, diversity, and
+audio-visual alignment. The code and datasets will be available at
+https://github.com/ZeyueT/VidMuse/.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>The code and datasets will be available at
+  https://github.com/ZeyueT/VidMuse/</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Chimera: Effectively Modeling Multivariate Time Series with
+  2-Dimensional State Space Models 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.04320v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.04320v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Ali Behrouz, Michele Santacatterina, Ramin Zabih
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Modeling multivariate time series is a well-established problem with a wide
+range of applications from healthcare to financial markets. Traditional State
+Space Models (SSMs) are classical approaches for univariate time series
+modeling due to their simplicity and expressive power to represent linear
+dependencies. They, however, have fundamentally limited expressive power to
+capture non-linear dependencies, are slow in practice, and fail to model the
+inter-variate information flow. Despite recent attempts to improve the
+expressive power of SSMs by using deep structured SSMs, the existing methods
+are either limited to univariate time series, fail to model complex patterns
+(e.g., seasonal patterns), fail to dynamically model the dependencies of
+variate and time dimensions, and/or are input-independent. We present Chimera
+that uses two input-dependent 2-D SSM heads with different discretization
+processes to learn long-term progression and seasonal patterns. To improve the
+efficiency of complex 2D recurrence, we present a fast training using a new
+2-dimensional parallel selective scan. We further present and discuss
+2-dimensional Mamba and Mamba-2 as the spacial cases of our 2D SSM. Our
+experimental evaluation shows the superior performance of Chimera on extensive
+and diverse benchmarks, including ECG and speech time series classification,
+long-term and short-term time series forecasting, and time series anomaly
+detection.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Adaptive Sampling of k-Space in Magnetic Resonance for Rapid Pathology
+  Prediction <span class="chip">ICML 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.04318v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.04318v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Chen-Yu Yen, Raghav Singhal, Umang Sharma, Rajesh Ranganath, Sumit Chopra, Lerrel Pinto
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Magnetic Resonance (MR) imaging, despite its proven diagnostic utility,
+remains an inaccessible imaging modality for disease surveillance at the
+population level. A major factor rendering MR inaccessible is lengthy scan
+times. An MR scanner collects measurements associated with the underlying
+anatomy in the Fourier space, also known as the k-space. Creating a
+high-fidelity image requires collecting large quantities of such measurements,
+increasing the scan time. Traditionally to accelerate an MR scan, image
+reconstruction from under-sampled k-space data is the method of choice.
+However, recent works show the feasibility of bypassing image reconstruction
+and directly learning to detect disease directly from a sparser learned subset
+of the k-space measurements. In this work, we propose Adaptive Sampling for MR
+(ASMR), a sampling method that learns an adaptive policy to sequentially select
+k-space samples to optimize for target disease detection. On 6 out of 8
+pathology classification tasks spanning the Knee, Brain, and Prostate MR scans,
+ASMR reaches within 2% of the performance of a fully sampled classifier while
+using only 8% of the k-space, as well as outperforming prior state-of-the-art
+work in k-space sampling such as EMRT, LOUPE, and DPS.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>ICML 2024. Project website at https://adaptive-sampling-mr.github.io</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Regularized KL-Divergence for Well-Defined Function-Space Variational
+  Inference in Bayesian neural networks 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.04317v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.04317v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Tristan Cinquin, Robert Bamler
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Bayesian neural networks (BNN) promise to combine the predictive performance
+of neural networks with principled uncertainty modeling important for
+safety-critical systems and decision making. However, posterior uncertainty
+estimates depend on the choice of prior, and finding informative priors in
+weight-space has proven difficult. This has motivated variational inference
+(VI) methods that pose priors directly on the function generated by the BNN
+rather than on weights. In this paper, we address a fundamental issue with such
+function-space VI approaches pointed out by Burt et al. (2020), who showed that
+the objective function (ELBO) is negative infinite for most priors of interest.
+Our solution builds on generalized VI (Knoblauch et al., 2019) with the
+regularized KL divergence (Quang, 2019) and is, to the best of our knowledge,
+the first well-defined variational objective for function-space inference in
+BNNs with Gaussian process (GP) priors. Experiments show that our method
+incorporates the properties specified by the GP prior on synthetic and small
+real-world data sets, and provides competitive uncertainty estimates for
+regression, classification and out-of-distribution detection compared to BNN
+baselines with both function and weight-space priors.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Improving Alignment and Robustness with Short Circuiting 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.04313v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.04313v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Andy Zou, Long Phan, Justin Wang, Derek Duenas, Maxwell Lin, Maksym Andriushchenko, Rowan Wang, Zico Kolter, Matt Fredrikson, Dan Hendrycks
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  AI systems can take harmful actions and are highly vulnerable to adversarial
+attacks. We present an approach, inspired by recent advances in representation
+engineering, that "short-circuits" models as they respond with harmful outputs.
+Existing techniques aimed at improving alignment, such as refusal training, are
+often bypassed. Techniques such as adversarial training try to plug these holes
+by countering specific attacks. As an alternative to refusal training and
+adversarial training, short-circuiting directly controls the representations
+that are responsible for harmful outputs in the first place. Our technique can
+be applied to both text-only and multimodal language models to prevent the
+generation of harmful outputs without sacrificing utility -- even in the
+presence of powerful unseen attacks. Notably, while adversarial robustness in
+standalone image recognition remains an open challenge, short-circuiting allows
+the larger multimodal system to reliably withstand image "hijacks" that aim to
+produce harmful content. Finally, we extend our approach to AI agents,
+demonstrating considerable reductions in the rate of harmful actions when they
+are under attack. Our approach represents a significant step forward in the
+development of reliable safeguards to harmful behavior and adversarial attacks.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ ReFiNe: Recursive Field Networks for Cross-modal Multi-scene
+  Representation <span class="chip">SIGGRAPH 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.04309v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.04309v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Sergey Zakharov, Katherine Liu, Adrien Gaidon, Rares Ambrus
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  The common trade-offs of state-of-the-art methods for multi-shape
+representation (a single model "packing" multiple objects) involve trading
+modeling accuracy against memory and storage. We show how to encode multiple
+shapes represented as continuous neural fields with a higher degree of
+precision than previously possible and with low memory usage. Key to our
+approach is a recursive hierarchical formulation that exploits object
+self-similarity, leading to a highly compressed and efficient shape latent
+space. Thanks to the recursive formulation, our method supports spatial and
+global-to-local latent feature fusion without needing to initialize and
+maintain auxiliary data structures, while still allowing for continuous field
+queries to enable applications such as raytracing. In experiments on a set of
+diverse datasets, we provide compelling qualitative results and demonstrate
+state-of-the-art multi-scene reconstruction and compression results with a
+single network per dataset.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>SIGGRAPH 2024. Project Page:
+  https://zakharos.github.io/projects/refine/</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Approximation-Aware Bayesian Optimization 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.04308v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.04308v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Natalie Maus, Kyurae Kim, Geoff Pleiss, David Eriksson, John P. Cunningham, Jacob R. Gardner
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  High-dimensional Bayesian optimization (BO) tasks such as molecular design
+often require 10,000 function evaluations before obtaining meaningful results.
+While methods like sparse variational Gaussian processes (SVGPs) reduce
+computational requirements in these settings, the underlying approximations
+result in suboptimal data acquisitions that slow the progress of optimization.
+In this paper we modify SVGPs to better align with the goals of BO: targeting
+informed data acquisition rather than global posterior fidelity. Using the
+framework of utility-calibrated variational inference, we unify GP
+approximation and data acquisition into a joint optimization problem, thereby
+ensuring optimal decisions under a limited computational budget. Our approach
+can be used with any decision-theoretic acquisition function and is compatible
+with trust region methods like TuRBO. We derive efficient joint objectives for
+the expected improvement and knowledge gradient acquisition functions in both
+the standard and batch BO settings. Our approach outperforms standard SVGPs on
+high-dimensional benchmark tasks in control and molecular design.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Semantically Diverse Language Generation for Uncertainty Estimation in
+  Language Models 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.04306v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.04306v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Lukas Aichberger, Kajetan Schweighofer, Mykyta Ielanskyi, Sepp Hochreiter
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Large language models (LLMs) can suffer from hallucinations when generating
+text. These hallucinations impede various applications in society and industry
+by making LLMs untrustworthy. Current LLMs generate text in an autoregressive
+fashion by predicting and appending text tokens. When an LLM is uncertain about
+the semantic meaning of the next tokens to generate, it is likely to start
+hallucinating. Thus, it has been suggested that hallucinations stem from
+predictive uncertainty. We introduce Semantically Diverse Language Generation
+(SDLG) to quantify predictive uncertainty in LLMs. SDLG steers the LLM to
+generate semantically diverse yet likely alternatives for an initially
+generated text. This approach provides a precise measure of aleatoric semantic
+uncertainty, detecting whether the initial text is likely to be hallucinated.
+Experiments on question-answering tasks demonstrate that SDLG consistently
+outperforms existing methods while being the most computationally efficient,
+setting a new standard for uncertainty estimation in LLMs.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Vision-LSTM: xLSTM as Generic Vision Backbone 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.04303v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.04303v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Benedikt Alkin, Maximilian Beck, Korbinian Pöppel, Sepp Hochreiter, Johannes Brandstetter
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Transformers are widely used as generic backbones in computer vision, despite
+initially introduced for natural language processing. Recently, the Long
+Short-Term Memory (LSTM) has been extended to a scalable and performant
+architecture - the xLSTM - which overcomes long-standing LSTM limitations via
+exponential gating and parallelizable matrix memory structure. In this report,
+we introduce Vision-LSTM (ViL), an adaption of the xLSTM building blocks to
+computer vision. ViL comprises a stack of xLSTM blocks where odd blocks process
+the sequence of patch tokens from top to bottom while even blocks go from
+bottom to top. Experiments show that ViL holds promise to be further deployed
+as new generic backbone for computer vision architectures.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Representational Alignment Supports Effective Machine Teaching 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.04302v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.04302v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Ilia Sucholutsky, Katherine M. Collins, Maya Malaviya, Nori Jacoby, Weiyang Liu, Theodore R. Sumers, Michalis Korakakis, Umang Bhatt, Mark Ho, Joshua B. Tenenbaum, Brad Love, Zachary A. Pardos, Adrian Weller, Thomas L. Griffiths
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  A good teacher should not only be knowledgeable; but should be able to
+communicate in a way that the student understands -- to share the student's
+representation of the world. In this work, we integrate insights from machine
+teaching and pragmatic communication with the burgeoning literature on
+representational alignment to characterize a utility curve defining a
+relationship between representational alignment and teacher capability for
+promoting student learning. To explore the characteristics of this utility
+curve, we design a supervised learning environment that disentangles
+representational alignment from teacher accuracy. We conduct extensive
+computational experiments with machines teaching machines, complemented by a
+series of experiments in which machines teach humans. Drawing on our findings
+that improved representational alignment with a student improves student
+learning outcomes (i.e., task accuracy), we design a classroom matching
+procedure that assigns students to teachers based on the utility curve. If we
+are to design effective machine teachers, it is not enough to build teachers
+that are accurate -- we want teachers that can align, representationally, to
+their students too.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Preprint</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ NoisyGL: A Comprehensive Benchmark for Graph Neural Networks under Label
+  Noise <span class="chip">NeurIPS 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.04299v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.04299v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Zhonghao Wang, Danyu Sun, Sheng Zhou, Haobo Wang, Jiapei Fan, Longtao Huang, Jiajun Bu
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Graph Neural Networks (GNNs) exhibit strong potential in node classification
+task through a message-passing mechanism. However, their performance often
+hinges on high-quality node labels, which are challenging to obtain in
+real-world scenarios due to unreliable sources or adversarial attacks.
+Consequently, label noise is common in real-world graph data, negatively
+impacting GNNs by propagating incorrect information during training. To address
+this issue, the study of Graph Neural Networks under Label Noise (GLN) has
+recently gained traction. However, due to variations in dataset selection, data
+splitting, and preprocessing techniques, the community currently lacks a
+comprehensive benchmark, which impedes deeper understanding and further
+development of GLN. To fill this gap, we introduce NoisyGL in this paper, the
+first comprehensive benchmark for graph neural networks under label noise.
+NoisyGL enables fair comparisons and detailed analyses of GLN methods on noisy
+labeled graph data across various datasets, with unified experimental settings
+and interface. Our benchmark has uncovered several important insights that were
+missed in previous research, and we believe these findings will be highly
+beneficial for future studies. We hope our open-source benchmark library will
+foster further advancements in this field. The code of the benchmark can be
+found in https://github.com/eaglelab-zju/NoisyGL.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Submitted to the 38th Conference on Neural Information Processing
+  Systems (NeurIPS 2024) Track on Datasets and Benchmarks</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Stratified Prediction-Powered Inference for Hybrid Language Model
+  Evaluation 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.04291v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.04291v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Adam Fisch, Joshua Maynez, R. Alex Hofer, Bhuwan Dhingra, Amir Globerson, William W. Cohen
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Prediction-powered inference (PPI) is a method that improves statistical
+estimates based on limited human-labeled data. PPI achieves this by combining
+small amounts of human-labeled data with larger amounts of data labeled by a
+reasonably accurate -- but potentially biased -- automatic system, in a way
+that results in tighter confidence intervals for certain parameters of interest
+(e.g., the mean performance of a language model). In this paper, we propose a
+method called Stratified Prediction-Powered Inference (StratPPI), in which we
+show that the basic PPI estimates can be considerably improved by employing
+simple data stratification strategies. Without making any assumptions on the
+underlying automatic labeling system or data distribution, we derive an
+algorithm for computing provably valid confidence intervals for population
+parameters (such as averages) that is based on stratified sampling. In
+particular, we show both theoretically and empirically that, with appropriate
+choices of stratification and sample allocation, our approach can provide
+substantially tighter confidence intervals than unstratified approaches.
+Specifically, StratPPI is expected to improve in cases where the performance of
+the autorater varies across different conditional distributions of the target
+data.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ What is <span class="highlight-title">Dataset</span> Distillation Learning? <span class="chip">ICML 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.04284v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.04284v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        William Yang, Ye Zhu, Zhiwei Deng, Olga Russakovsky
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Dataset distillation has emerged as a strategy to overcome the hurdles
+associated with large datasets by learning a compact set of synthetic data that
+retains essential information from the original dataset. While distilled data
+can be used to train high performing models, little is understood about how the
+information is stored. In this study, we posit and answer three questions about
+the behavior, representativeness, and point-wise information content of
+distilled data. We reveal distilled data cannot serve as a substitute for real
+data during training outside the standard evaluation setting for dataset
+distillation. Additionally, the distillation process retains high task
+performance by compressing information related to the early training dynamics
+of real models. Finally, we provide an framework for interpreting distilled
+data and reveal that individual distilled data points contain meaningful
+semantic information. This investigation sheds light on the intricate nature of
+distilled data, providing a better understanding on how they can be effectively
+utilized.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>ICML 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ xMIL: Insightful Explanations for Multiple Instance Learning in
+  Histopathology 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.04280v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.04280v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Julius Hense, Mina Jamshidi Idaji, Oliver Eberle, Thomas Schnake, Jonas Dippel, Laure Ciernik, Oliver Buchstab, Andreas Mock, Frederick Klauschen, Klaus-Robert Müller
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Multiple instance learning (MIL) is an effective and widely used approach for
+weakly supervised machine learning. In histopathology, MIL models have achieved
+remarkable success in tasks like tumor detection, biomarker prediction, and
+outcome prognostication. However, MIL explanation methods are still lagging
+behind, as they are limited to small bag sizes or disregard instance
+interactions. We revisit MIL through the lens of explainable AI (XAI) and
+introduce xMIL, a refined framework with more general assumptions. We
+demonstrate how to obtain improved MIL explanations using layer-wise relevance
+propagation (LRP) and conduct extensive evaluation experiments on three toy
+settings and four real-world histopathology datasets. Our approach consistently
+outperforms previous explanation attempts with particularly improved
+faithfulness scores on challenging biomarker prediction tasks. Finally, we
+showcase how xMIL explanations enable pathologists to extract insights from MIL
+models, representing a significant advance for knowledge discovery and model
+debugging in digital histopathology.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Generative AI-in-the-loop: Integrating LLMs and <span class="highlight-title">GPT</span>s into the Next
+  Generation Networks 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.04276v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.04276v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Han Zhang, Akram Bin Sediq, Ali Afana, Melike Erol-Kantarci
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  In recent years, machine learning (ML) techniques have created numerous
+opportunities for intelligent mobile networks and have accelerated the
+automation of network operations. However, complex network tasks may involve
+variables and considerations even beyond the capacity of traditional ML
+algorithms. On the other hand, large language models (LLMs) have recently
+emerged, demonstrating near-human-level performance in cognitive tasks across
+various fields. However, they remain prone to hallucinations and often lack
+common sense in basic tasks. Therefore, they are regarded as assistive tools
+for humans. In this work, we propose the concept of "generative AI-in-the-loop"
+and utilize the semantic understanding, context awareness, and reasoning
+abilities of LLMs to assist humans in handling complex or unforeseen situations
+in mobile communication networks. We believe that combining LLMs and ML models
+allows both to leverage their respective capabilities and achieve better
+results than either model alone. To support this idea, we begin by analyzing
+the capabilities of LLMs and compare them with traditional ML algorithms. We
+then explore potential LLM-based applications in line with the requirements of
+next-generation networks. We further examine the integration of ML and LLMs,
+discussing how they can be used together in mobile networks. Unlike existing
+studies, our research emphasizes the fusion of LLMs with traditional ML-driven
+next-generation networks and serves as a comprehensive refinement of existing
+surveys. Finally, we provide a case study to enhance ML-based network intrusion
+detection with synthesized data generated by LLMs. Our case study further
+demonstrates the advantages of our proposed idea.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Self-Play with Adversarial Critic: Provable and Scalable Offline
+  Alignment for Language Models 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.04274v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.04274v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Xiang Ji, Sanjeev Kulkarni, Mengdi Wang, Tengyang Xie
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  This work studies the challenge of aligning large language models (LLMs) with
+offline preference data. We focus on alignment by Reinforcement Learning from
+Human Feedback (RLHF) in particular. While popular preference optimization
+methods exhibit good empirical performance in practice, they are not
+theoretically guaranteed to converge to the optimal policy and can provably
+fail when the data coverage is sparse by classical offline reinforcement
+learning (RL) results. On the other hand, a recent line of work has focused on
+theoretically motivated preference optimization methods with provable
+guarantees, but these are not computationally efficient for large-scale
+applications like LLM alignment. To bridge this gap, we propose SPAC, a new
+offline preference optimization method with self-play, inspired by the
+on-average pessimism technique from the offline RL literature, to be the first
+provable and scalable approach to LLM alignment. We both provide theoretical
+analysis for its convergence under single-policy concentrability for the
+general function approximation setting and demonstrate its competitive
+empirical performance for LLM alignment on a 7B Mistral model with Open LLM
+Leaderboard evaluations.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Open-Endedness is Essential for Artificial Superhuman Intelligence 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.04268v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.04268v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Edward Hughes, Michael Dennis, Jack Parker-Holder, Feryal Behbahani, Aditi Mavalankar, Yuge Shi, Tom Schaul, Tim Rocktaschel
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  In recent years there has been a tremendous surge in the general capabilities
+of AI systems, mainly fuelled by training foundation models on internetscale
+data. Nevertheless, the creation of openended, ever self-improving AI remains
+elusive. In this position paper, we argue that the ingredients are now in place
+to achieve openendedness in AI systems with respect to a human observer.
+Furthermore, we claim that such open-endedness is an essential property of any
+artificial superhuman intelligence (ASI). We begin by providing a concrete
+formal definition of open-endedness through the lens of novelty and
+learnability. We then illustrate a path towards ASI via open-ended systems
+built on top of foundation models, capable of making novel, humanrelevant
+discoveries. We conclude by examining the safety implications of
+generally-capable openended AI. We expect that open-ended foundation models
+will prove to be an increasingly fertile and safety-critical area of research
+in the near future.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ <span class="highlight-title">Transformer</span>s need glasses! Information over-squashing in language tasks 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.04267v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.04267v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Federico Barbero, Andrea Banino, Steven Kapturowski, Dharshan Kumaran, João G. M. Araújo, Alex Vitvitskyi, Razvan Pascanu, Petar Veličković
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  We study how information propagates in decoder-only Transformers, which are
+the architectural backbone of most existing frontier large language models
+(LLMs). We rely on a theoretical signal propagation analysis -- specifically,
+we analyse the representations of the last token in the final layer of the
+Transformer, as this is the representation used for next-token prediction. Our
+analysis reveals a representational collapse phenomenon: we prove that certain
+distinct sequences of inputs to the Transformer can yield arbitrarily close
+representations in the final token. This effect is exacerbated by the
+low-precision floating-point formats frequently used in modern LLMs. As a
+result, the model is provably unable to respond to these sequences in different
+ways -- leading to errors in, e.g., tasks involving counting or copying.
+Further, we show that decoder-only Transformer language models can lose
+sensitivity to specific tokens in the input, which relates to the well-known
+phenomenon of over-squashing in graph neural networks. We provide empirical
+evidence supporting our claims on contemporary LLMs. Our theory also points to
+simple solutions towards ameliorating these issues.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Simulating, Fast and Slow: Learning Policies for Black-Box Optimization 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.04261v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.04261v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Fabio Valerio Massoli, Tim Bakker, Thomas Hehn, Tribhuvanesh Orekondy, Arash Behboodi
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  In recent years, solving optimization problems involving black-box simulators
+has become a point of focus for the machine learning community due to their
+ubiquity in science and engineering. The simulators describe a forward process
+$f_{\mathrm{sim}}: (\psi, x) \rightarrow y$ from simulation parameters $\psi$
+and input data $x$ to observations $y$, and the goal of the optimization
+problem is to find parameters $\psi$ that minimize a desired loss function.
+Sophisticated optimization algorithms typically require gradient information
+regarding the forward process, $f_{\mathrm{sim}}$, with respect to the
+parameters $\psi$. However, obtaining gradients from black-box simulators can
+often be prohibitively expensive or, in some cases, impossible. Furthermore, in
+many applications, practitioners aim to solve a set of related problems. Thus,
+starting the optimization ``ab initio", i.e. from scratch, each time might be
+inefficient if the forward model is expensive to evaluate. To address those
+challenges, this paper introduces a novel method for solving classes of similar
+black-box optimization problems by learning an active learning policy that
+guides a differentiable surrogate's training and uses the surrogate's gradients
+to optimize the simulation parameters with gradient descent. After training the
+policy, downstream optimization of problems involving black-box simulators
+requires up to $\sim$90\% fewer expensive simulator calls compared to baselines
+such as local surrogate-based approaches, numerical optimization, and Bayesian
+methods.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Data Measurements for Decentralized Data Markets 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.04257v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.04257v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Charles Lu, Mohammad Mohammadi Amiri, Ramesh Raskar
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Decentralized data markets can provide more equitable forms of data
+acquisition for machine learning. However, to realize practical marketplaces,
+efficient techniques for seller selection need to be developed. We propose and
+benchmark federated data measurements to allow a data buyer to find sellers
+with relevant and diverse datasets. Diversity and relevance measures enable a
+buyer to make relative comparisons between sellers without requiring
+intermediate brokers and training task-dependent models.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>20 pages, 11 figures</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Online learning of quantum processes 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.04250v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.04250v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Asad Raza, Matthias C. Caro, Jens Eisert, Sumeet Khatri
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Among recent insights into learning quantum states, online learning and
+shadow tomography procedures are notable for their ability to accurately
+predict expectation values even of adaptively chosen observables. In contrast
+to the state case, quantum process learning tasks with a similarly adaptive
+nature have received little attention. In this work, we investigate online
+learning tasks for quantum processes. Whereas online learning is infeasible for
+general quantum channels, we show that channels of bounded gate complexity as
+well as Pauli channels can be online learned in the regret and mistake-bounded
+models of online learning. In fact, we can online learn probabilistic mixtures
+of any exponentially large set of known channels. We also provide a provably
+sample-efficient shadow tomography procedure for Pauli channels. Our results
+extend beyond quantum channels to non-Markovian multi-time processes, with
+favorable regret and mistake bounds, as well as a shadow tomography procedure.
+We complement our online learning upper bounds with mistake as well as
+computational lower bounds. On the technical side, we make use of the
+multiplicative weights update algorithm, classical adaptive data analysis, and
+Bell sampling, as well as tools from the theory of quantum combs for multi-time
+quantum processes. Our work initiates a study of online learning for classes of
+quantum channels and, more generally, non-Markovian quantum processes. Given
+the importance of online learning for state shadow tomography, this may serve
+as a step towards quantum channel variants of adaptive shadow tomography.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>14 + 72 pages, 6 figures</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Online learning of a panoply of quantum objects 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.04245v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.04245v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Akshay Bansal, Ian George, Soumik Ghosh, Jamie Sikora, Alice Zheng
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  In many quantum tasks, there is an unknown quantum object that one wishes to
+learn. An online strategy for this task involves adaptively refining a
+hypothesis to reproduce such an object or its measurement statistics. A common
+evaluation metric for such a strategy is its regret, or roughly the accumulated
+errors in hypothesis statistics. We prove a sublinear regret bound for learning
+over general subsets of positive semidefinite matrices via the
+regularized-follow-the-leader algorithm and apply it to various settings where
+one wishes to learn quantum objects. For concrete applications, we present a
+sublinear regret bound for learning quantum states, effects, channels,
+interactive measurements, strategies, co-strategies, and the collection of
+inner products of pure states. Our bound applies to many other quantum objects
+with compact, convex representations. In proving our regret bound, we establish
+various matrix analysis results useful in quantum information theory. This
+includes a generalization of Pinsker's inequality for arbitrary positive
+semidefinite operators with possibly different traces, which may be of
+independent interest and applicable to more general classes of divergences.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>34 pages. Comments welcome</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Hypernetworks for Personalizing ASR to Atypical Speech 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.04240v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.04240v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Max Mueller-Eberstein, Dianna Yee, Karren Yang, Gautam Varma Mantena, Colin Lea
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Parameter-efficient fine-tuning (PEFT) for personalizing automatic speech
+recognition (ASR) has recently shown promise for adapting general population
+models to atypical speech. However, these approaches assume a priori knowledge
+of the atypical speech disorder being adapted for -- the diagnosis of which
+requires expert knowledge that is not always available. Even given this
+knowledge, data scarcity and high inter/intra-speaker variability further limit
+the effectiveness of traditional fine-tuning. To circumvent these challenges,
+we first identify the minimal set of model parameters required for ASR
+adaptation. Our analysis of each individual parameter's effect on adaptation
+performance allows us to reduce Word Error Rate (WER) by half while adapting
+0.03\% of all weights. Alleviating the need for cohort-specific models, we next
+propose the novel use of a meta-learned hypernetwork to generate highly
+individualized, utterance-level adaptations on-the-fly for a diverse set of
+atypical speech characteristics. Evaluating adaptation at the global, cohort
+and individual-level, we show that hypernetworks generalize better to
+out-of-distribution speakers, while maintaining an overall relative WER
+reduction of 75.2% using 0.1% of the full parameter budget.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Solving Inverse Problems in Protein Space Using Diffusion-Based Priors 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.04239v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.04239v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Axel Levy, Eric R. Chan, Sara Fridovich-Keil, Frédéric Poitevin, Ellen D. Zhong, Gordon Wetzstein
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  The interaction of a protein with its environment can be understood and
+controlled via its 3D structure. Experimental methods for protein structure
+determination, such as X-ray crystallography or cryogenic electron microscopy,
+shed light on biological processes but introduce challenging inverse problems.
+Learning-based approaches have emerged as accurate and efficient methods to
+solve these inverse problems for 3D structure determination, but are
+specialized for a predefined type of measurement. Here, we introduce a
+versatile framework to turn raw biophysical measurements of varying types into
+3D atomic models. Our method combines a physics-based forward model of the
+measurement process with a pretrained generative model providing a
+task-agnostic, data-driven prior. Our method outperforms posterior sampling
+baselines on both linear and non-linear inverse problems. In particular, it is
+the first diffusion-based method for refining atomic models from cryo-EM
+density maps.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ The CLRS-Text Algorithmic Reasoning Language Benchmark 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.04229v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.04229v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Larisa Markeeva, Sean McLeish, Borja Ibarz, Wilfried Bounsi, Olga Kozlova, Alex Vitvitskyi, Charles Blundell, Tom Goldstein, Avi Schwarzschild, Petar Veličković
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Eliciting reasoning capabilities from language models (LMs) is a critical
+direction on the path towards building intelligent systems. Most recent studies
+dedicated to reasoning focus on out-of-distribution performance on
+procedurally-generated synthetic benchmarks, bespoke-built to evaluate specific
+skills only. This trend makes results hard to transfer across publications,
+slowing down progress. Three years ago, a similar issue was identified and
+rectified in the field of neural algorithmic reasoning, with the advent of the
+CLRS benchmark. CLRS is a dataset generator comprising graph execution traces
+of classical algorithms from the Introduction to Algorithms textbook. Inspired
+by this, we propose CLRS-Text -- a textual version of these algorithmic traces.
+Out of the box, CLRS-Text is capable of procedurally generating trace data for
+thirty diverse, challenging algorithmic tasks across any desirable input
+distribution, while offering a standard pipeline in which any additional
+algorithmic tasks may be created in the benchmark. We fine-tune and evaluate
+various LMs as generalist executors on this benchmark, validating prior work
+and revealing a novel, interesting challenge for the LM reasoning community.
+Our code is available at
+https://github.com/google-deepmind/clrs/tree/master/clrs/_src/clrs_text.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Preprint, under review. Comments welcome</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ R-CONV: An Analytical Approach for Efficient Data Reconstruction via
+  Convolutional Gradients 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.04227v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.04227v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Tamer Ahmed Eltaras, Qutaibah Malluhi, Alessandro Savino, Stefano Di Carlo, Adnan Qayyum, Junaid Qadir
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  In the effort to learn from extensive collections of distributed data,
+federated learning has emerged as a promising approach for preserving privacy
+by using a gradient-sharing mechanism instead of exchanging raw data. However,
+recent studies show that private training data can be leaked through many
+gradient attacks. While previous analytical-based attacks have successfully
+reconstructed input data from fully connected layers, their effectiveness
+diminishes when applied to convolutional layers. This paper introduces an
+advanced data leakage method to efficiently exploit convolutional layers'
+gradients. We present a surprising finding: even with non-fully invertible
+activation functions, such as ReLU, we can analytically reconstruct training
+samples from the gradients. To the best of our knowledge, this is the first
+analytical approach that successfully reconstructs convolutional layer inputs
+directly from the gradients, bypassing the need to reconstruct layers' outputs.
+Prior research has mainly concentrated on the weight constraints of convolution
+layers, overlooking the significance of gradient constraints. Our findings
+demonstrate that existing analytical methods used to estimate the risk of
+gradient attacks lack accuracy. In some layers, attacks can be launched with
+less than 5% of the reported constraints.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Multi-Agent Imitation Learning: Value is Easy, Regret is Hard 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.04219v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.04219v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Jingwu Tang, Gokul Swamy, Fei Fang, Zhiwei Steven Wu
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  We study a multi-agent imitation learning (MAIL) problem where we take the
+perspective of a learner attempting to coordinate a group of agents based on
+demonstrations of an expert doing so. Most prior work in MAIL essentially
+reduces the problem to matching the behavior of the expert within the support
+of the demonstrations. While doing so is sufficient to drive the value gap
+between the learner and the expert to zero under the assumption that agents are
+non-strategic, it does not guarantee robustness to deviations by strategic
+agents. Intuitively, this is because strategic deviations can depend on a
+counterfactual quantity: the coordinator's recommendations outside of the state
+distribution their recommendations induce. In response, we initiate the study
+of an alternative objective for MAIL in Markov Games we term the regret gap
+that explicitly accounts for potential deviations by agents in the group. We
+first perform an in-depth exploration of the relationship between the value and
+regret gaps. First, we show that while the value gap can be efficiently
+minimized via a direct extension of single-agent IL algorithms, even value
+equivalence can lead to an arbitrarily large regret gap. This implies that
+achieving regret equivalence is harder than achieving value equivalence in
+MAIL. We then provide a pair of efficient reductions to no-regret online convex
+optimization that are capable of minimizing the regret gap (a) under a coverage
+assumption on the expert (MALICE) or (b) with access to a queryable expert
+(BLADES).
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ What Do Language Models Learn in Context? The Structured Task Hypothesis <span class="chip">ACL 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.04216v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.04216v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Jiaoda Li, Yifan Hou, Mrinmaya Sachan, Ryan Cotterell
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Large language models (LLMs) exhibit an intriguing ability to learn a novel
+task from in-context examples presented in a demonstration, termed in-context
+learning (ICL). Understandably, a swath of research has been dedicated to
+uncovering the theories underpinning ICL. One popular hypothesis explains ICL
+by task selection. LLMs identify the task based on the demonstration and
+generalize it to the prompt. Another popular hypothesis is that ICL is a form
+of meta-learning, i.e., the models learn a learning algorithm at pre-training
+time and apply it to the demonstration. Finally, a third hypothesis argues that
+LLMs use the demonstration to select a composition of tasks learned during
+pre-training to perform ICL. In this paper, we empirically explore these three
+hypotheses that explain LLMs' ability to learn in context with a suite of
+experiments derived from common text classification tasks. We invalidate the
+first two hypotheses with counterexamples and provide evidence in support of
+the last hypothesis. Our results suggest an LLM could learn a novel task in
+context via composing tasks learned during pre-training.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>This work is published in ACL 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ mCSQA: Multilingual Commonsense Reasoning <span class="highlight-title">Dataset</span> with Unified Creation
+  Strategy by Language Models and Humans <span class="chip">ACL 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.04215v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.04215v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Yusuke Sakai, Hidetaka Kamigaito, Taro Watanabe
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  It is very challenging to curate a dataset for language-specific knowledge
+and common sense in order to evaluate natural language understanding
+capabilities of language models. Due to the limitation in the availability of
+annotators, most current multilingual datasets are created through translation,
+which cannot evaluate such language-specific aspects. Therefore, we propose
+Multilingual CommonsenseQA (mCSQA) based on the construction process of CSQA
+but leveraging language models for a more efficient construction, e.g., by
+asking LM to generate questions/answers, refine answers and verify QAs followed
+by reduced human efforts for verification. Constructed dataset is a benchmark
+for cross-lingual language-transfer capabilities of multilingual LMs, and
+experimental results showed high language-transfer capabilities for questions
+that LMs could easily solve, but lower transfer capabilities for questions
+requiring deep knowledge or commonsense. This highlights the necessity of
+language-specific datasets for evaluation and training. Finally, our method
+demonstrated that multilingual LMs could create QA including language-specific
+knowledge, significantly reducing the dataset creation cost compared to manual
+creation. The datasets are available at
+https://huggingface.co/datasets/yusuke1997/mCSQA.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted at Findings of ACL 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Aligning Agents like Large Language Models 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.04208v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.04208v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Adam Jelley, Yuhan Cao, Dave Bignell, Sam Devlin, Tabish Rashid
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Training agents to behave as desired in complex 3D environments from
+high-dimensional sensory information is challenging. Imitation learning from
+diverse human behavior provides a scalable approach for training an agent with
+a sensible behavioral prior, but such an agent may not perform the specific
+behaviors of interest when deployed. To address this issue, we draw an analogy
+between the undesirable behaviors of imitation learning agents and the
+unhelpful responses of unaligned large language models (LLMs). We then
+investigate how the procedure for aligning LLMs can be applied to aligning
+agents in a 3D environment from pixels. For our analysis, we utilize an
+academically illustrative part of a modern console game in which the human
+behavior distribution is multi-modal, but we want our agent to imitate a single
+mode of this behavior. We demonstrate that we can align our agent to
+consistently perform the desired mode, while providing insights and advice for
+successfully applying this approach to training agents. Project webpage at
+https://adamjelley.github.io/aligning-agents-like-llms .
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Towards Principled Superhuman AI for Multiplayer Symmetric Games 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.04201v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.04201v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Jiawei Ge, Yuanhao Wang, Wenzhe Li, Chi Jin
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Multiplayer games, when the number of players exceeds two, present unique
+challenges that fundamentally distinguish them from the extensively studied
+two-player zero-sum games. These challenges arise from the non-uniqueness of
+equilibria and the risk of agents performing highly suboptimally when adopting
+equilibrium strategies. While a line of recent works developed learning systems
+successfully achieving human-level or even superhuman performance in popular
+multiplayer games such as Mahjong, Poker, and Diplomacy, two critical questions
+remain unaddressed: (1) What is the correct solution concept that AI agents
+should find? and (2) What is the general algorithmic framework that provably
+solves all games within this class? This paper takes the first step towards
+solving these unique challenges of multiplayer games by provably addressing
+both questions in multiplayer symmetric normal-form games. We also demonstrate
+that many meta-algorithms developed in prior practical systems for multiplayer
+games can fail to achieve even the basic goal of obtaining agent's equal share
+of the total reward.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Shield Synthesis for LTL Modulo Theories 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.04184v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.04184v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Andoni Rodriguez, Guy Amir, Davide Corsi, Cesar Sanchez, Guy Katz
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  In recent years, Machine Learning (ML) models have achieved remarkable
+success in various domains. However, these models also tend to demonstrate
+unsafe behaviors, precluding their deployment in safety-critical systems. To
+cope with this issue, ample research focuses on developing methods that
+guarantee the safe behaviour of a given ML model. A prominent example is
+shielding which incorporates an external component (a "shield") that blocks
+unwanted behavior. Despite significant progress, shielding suffers from a main
+setback: it is currently geared towards properties encoded solely in
+propositional logics (e.g., LTL) and is unsuitable for richer logics. This, in
+turn, limits the widespread applicability of shielding in many real-world
+systems. In this work, we address this gap, and extend shielding to LTL modulo
+theories, by building upon recent advances in reactive synthesis modulo
+theories. This allowed us to develop a novel approach for generating shields
+conforming to complex safety specifications in these more expressive, logics.
+We evaluated our shields and demonstrate their ability to handle rich data with
+temporal dynamics. To the best of our knowledge, this is the first approach for
+synthesizing shields for such expressivity.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Element-wise Multiplication Based Physics-informed Neural Networks 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.04170v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.04170v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Feilong Jiang, Xiaonan Hou, Min Xia
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  As a promising framework for resolving partial differential equations (PDEs),
+physics-informed neural networks (PINNs) have received widespread attention
+from industrial and scientific fields. However, lack of expressive ability and
+initialization pathology issues are found to prevent the application of PINNs
+in complex PDEs. In this work, we propose Element-wise Multiplication Based
+Physics-informed Neural Networks (EM-PINNs) to resolve these issues. The
+element-wise multiplication operation is adopted to transform features into
+high-dimensional, non-linear spaces, which effectively enhance the expressive
+capability of PINNs. Benefiting from element-wise multiplication operation,
+EM-PINNs can eliminate the initialization pathologies of PINNs. The proposed
+structure is verified on various benchmarks. The results show that EM-PINNs
+have strong expressive ability.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Repurposing Language Models into Embedding Models: Finding the
+  Compute-Optimal Recipe 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.04165v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.04165v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Alicja Ziarko, Albert Q. Jiang, Bartosz Piotrowski, Wenda Li, Mateja Jamnik, Piotr Miłoś
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Text embeddings are essential for many tasks, such as document retrieval,
+clustering, and semantic similarity assessment. In this paper, we study how to
+contrastively train text embedding models in a compute-optimal fashion, given a
+suite of pre-trained decoder-only language models. Our innovation is an
+algorithm that produces optimal configurations of model sizes, data quantities,
+and fine-tuning methods for text-embedding models at different computational
+budget levels. The resulting recipe, which we obtain through extensive
+experiments, can be used by practitioners to make informed design choices for
+their embedding models. Specifically, our findings suggest that full
+fine-tuning and low-rank adaptation fine-tuning produce optimal models at lower
+and higher computational budgets respectively.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Essentially Sharp Estimates on the Entropy Regularization Error in
+  Discrete Discounted Markov Decision Processes 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.04163v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.04163v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Johannes Müller, Semih Cayci
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  We study the error introduced by entropy regularization of infinite-horizon
+discrete discounted Markov decision processes. We show that this error
+decreases exponentially in the inverse regularization strength both in a
+weighted KL-divergence and in value with a problem-specific exponent. We
+provide a lower bound matching our upper bound up to a polynomial factor. Our
+proof relies on the correspondence of the solutions of entropy-regularized
+Markov decision processes with gradient flows of the unregularized reward with
+respect to a Riemannian metric common in natural policy gradient methods.
+Further, this correspondence allows us to identify the limit of the gradient
+flow as the generalized maximum entropy optimal policy, thereby characterizing
+the implicit bias of the Kakade gradient flow which corresponds to a
+time-continuous version of the natural policy gradient method. We use this to
+show that for entropy-regularized natural policy gradient methods the overall
+error decays exponentially in the square root of the number of iterations
+improving existing sublinear guarantees.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>25 pages, 1 figure</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Pointer-Guided <span class="highlight-title">Pre-Train</span>ing: Infusing Large Language Models with
+  Paragraph-Level Contextual Awareness <span class="chip">ECML-PKDD 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.04156v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.04156v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Lars Hillebrand, Prabhupad Pradhan, Christian Bauckhage, Rafet Sifa
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  We introduce "pointer-guided segment ordering" (SO), a novel pre-training
+technique aimed at enhancing the contextual understanding of paragraph-level
+text representations in large language models. Our methodology leverages a
+self-attention-driven pointer network to restore the original sequence of
+shuffled text segments, addressing the challenge of capturing the structural
+coherence and contextual dependencies within documents. This pre-training
+approach is complemented by a fine-tuning methodology that incorporates dynamic
+sampling, augmenting the diversity of training instances and improving sample
+efficiency for various downstream applications. We evaluate our method on a
+diverse set of datasets, demonstrating its efficacy in tasks requiring
+sequential text classification across scientific literature and financial
+reporting domains. Our experiments show that pointer-guided pre-training
+significantly enhances the model's ability to understand complex document
+structures, leading to state-of-the-art performance in downstream
+classification tasks.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>17 pages, 3 figures, 5 tables, accepted at ECML-PKDD 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Improving Physics-Augmented Continuum Neural Radiance Field-Based
+  Geometry-Agnostic System Identification with Lagrangian Particle Optimization <span class="chip">CVPR 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.04155v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.04155v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Takuhiro Kaneko
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Geometry-agnostic system identification is a technique for identifying the
+geometry and physical properties of an object from video sequences without any
+geometric assumptions. Recently, physics-augmented continuum neural radiance
+fields (PAC-NeRF) has demonstrated promising results for this technique by
+utilizing a hybrid Eulerian-Lagrangian representation, in which the geometry is
+represented by the Eulerian grid representations of NeRF, the physics is
+described by a material point method (MPM), and they are connected via
+Lagrangian particles. However, a notable limitation of PAC-NeRF is that its
+performance is sensitive to the learning of the geometry from the first frames
+owing to its two-step optimization. First, the grid representations are
+optimized with the first frames of video sequences, and then the physical
+properties are optimized through video sequences utilizing the fixed
+first-frame grid representations. This limitation can be critical when learning
+of the geometric structure is difficult, for example, in a few-shot (sparse
+view) setting. To overcome this limitation, we propose Lagrangian particle
+optimization (LPO), in which the positions and features of particles are
+optimized through video sequences in Lagrangian space. This method allows for
+the optimization of the geometric structure across the entire video sequence
+within the physical constraints imposed by the MPM. The experimental results
+demonstrate that the LPO is useful for geometric correction and physical
+identification in sparse-view settings.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted to CVPR 2024. Project page:
+  https://www.kecl.ntt.co.jp/people/kaneko.takuhiro/projects/lpo/</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Learned Feature Importance Scores for Automated Feature Engineering 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.04153v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.04153v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Yihe Dong, Sercan Arik, Nathanael Yoder, Tomas Pfister
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Feature engineering has demonstrated substantial utility for many machine
+learning workflows, such as in the small data regime or when distribution
+shifts are severe. Thus automating this capability can relieve much manual
+effort and improve model performance. Towards this, we propose AutoMAN, or
+Automated Mask-based Feature Engineering, an automated feature engineering
+framework that achieves high accuracy, low latency, and can be extended to
+heterogeneous and time-varying data. AutoMAN is based on effectively exploring
+the candidate transforms space, without explicitly manifesting transformed
+features. This is achieved by learning feature importance masks, which can be
+extended to support other modalities such as time series. AutoMAN learns
+feature transform importance end-to-end, incorporating a dataset's task target
+directly into feature engineering, resulting in state-of-the-art performance
+with significantly lower latency compared to alternatives.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Fast Redescription Mining Using Locality-Sensitive Hashing <span class="chip">ECML-PKDD 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.04148v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.04148v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Maiju Karjalainen, Esther Galbrun, Pauli Miettinen
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Redescription mining is a data analysis technique that has found applications
+in diverse fields. The most used redescription mining approaches involve two
+phases: finding matching pairs among data attributes and extending the pairs.
+This process is relatively efficient when the number of attributes remains
+limited and when the attributes are Boolean, but becomes almost intractable
+when the data consist of many numerical attributes. In this paper, we present
+new algorithms that perform the matching and extension orders of magnitude
+faster than the existing approaches. Our algorithms are based on
+locality-sensitive hashing with a tailored approach to handle the
+discretisation of numerical attributes as used in redescription mining.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>20 pages, 4 figures, to appear at ECML-PKDD 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Redundancy-aware Action Spaces for Robot Learning 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.04144v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.04144v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Pietro Mazzaglia, Nicholas Backshall, Xiao Ma, Stephen James
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Joint space and task space control are the two dominant action modes for
+controlling robot arms within the robot learning literature. Actions in joint
+space provide precise control over the robot's pose, but tend to suffer from
+inefficient training; actions in task space boast data-efficient training but
+sacrifice the ability to perform tasks in confined spaces due to limited
+control over the full joint configuration. This work analyses the criteria for
+designing action spaces for robot manipulation and introduces ER (End-effector
+Redundancy), a novel action space formulation that, by addressing the
+redundancies present in the manipulator, aims to combine the advantages of both
+joint and task spaces, offering fine-grained comprehensive control with
+overactuated robot arms whilst achieving highly efficient robot learning. We
+present two implementations of ER, ERAngle (ERA) and ERJoint (ERJ), and we show
+that ERJ in particular demonstrates superior performance across multiple
+settings, especially when precise control over the robot configuration is
+required. We validate our results both in simulated and real robotic
+environments.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Published in the RA-L journal</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Do Language Models Understand Morality? Towards a Robust Detection of
+  Moral Content 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.04143v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.04143v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Luana Bulla, Aldo Gangemi, Misael Mongiovì
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  The task of detecting moral values in text has significant implications in
+various fields, including natural language processing, social sciences, and
+ethical decision-making. Previously proposed supervised models often suffer
+from overfitting, leading to hyper-specialized moral classifiers that struggle
+to perform well on data from different domains. To address this issue, we
+introduce novel systems that leverage abstract concepts and common-sense
+knowledge acquired from Large Language Models and Natural Language Inference
+models during previous stages of training on multiple data sources. By doing
+so, we aim to develop versatile and robust methods for detecting moral values
+in real-world scenarios. Our approach uses the GPT 3.5 model as a zero-shot
+ready-made unsupervised multi-label classifier for moral values detection,
+eliminating the need for explicit training on labeled data. We compare it with
+a smaller NLI-based zero-shot model. The results show that the NLI approach
+achieves competitive results compared to the Davinci model. Furthermore, we
+conduct an in-depth investigation of the performance of supervised systems in
+the context of cross-domain multi-label moral value detection. This involves
+training supervised models on different domains to explore their effectiveness
+in handling data from different sources and comparing their performance with
+the unsupervised methods. Our contributions encompass a thorough analysis of
+both supervised and unsupervised methodologies for cross-domain value
+detection. We introduce the Davinci model as a state-of-the-art zero-shot
+unsupervised moral values classifier, pushing the boundaries of moral value
+detection without the need for explicit training on labeled data. Additionally,
+we perform a comparative evaluation of our approach with the supervised models,
+shedding light on their respective strengths and weaknesses.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Stochastic Polyak Step-sizes and Momentum: Convergence Guarantees and
+  Practical Performance 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.04142v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.04142v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Dimitris Oikonomou, Nicolas Loizou
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Stochastic gradient descent with momentum, also known as Stochastic Heavy
+Ball method (SHB), is one of the most popular algorithms for solving
+large-scale stochastic optimization problems in various machine learning tasks.
+In practical scenarios, tuning the step-size and momentum parameters of the
+method is a prohibitively expensive and time-consuming process. In this work,
+inspired by the recent advantages of stochastic Polyak step-size in the
+performance of stochastic gradient descent (SGD), we propose and explore new
+Polyak-type variants suitable for the update rule of the SHB method. In
+particular, using the Iterate Moving Average (IMA) viewpoint of SHB, we propose
+and analyze three novel step-size selections: MomSPS$_{\max}$, MomDecSPS, and
+MomAdaSPS. For MomSPS$_{\max}$, we provide convergence guarantees for SHB to a
+neighborhood of the solution for convex and smooth problems (without assuming
+interpolation). If interpolation is also satisfied, then using MomSPS$_{\max}$,
+SHB converges to the true solution at a fast rate matching the deterministic
+HB. The other two variants, MomDecSPS and MomAdaSPS, are the first adaptive
+step-sizes for SHB that guarantee convergence to the exact minimizer without
+prior knowledge of the problem parameters and without assuming interpolation.
+The convergence analysis of SHB is tight and obtains the convergence guarantees
+of SGD with stochastic Polyak step-sizes as a special case. We supplement our
+analysis with experiments that validate the theory and demonstrate the
+effectiveness and robustness of the new algorithms.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>39 pages, 20 Figures</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Optimal Batched Linear Bandits <span class="chip">ICML 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.04137v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.04137v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Xuanfei Ren, Tianyuan Jin, Pan Xu
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  We introduce the E$^4$ algorithm for the batched linear bandit problem,
+incorporating an Explore-Estimate-Eliminate-Exploit framework. With a proper
+choice of exploration rate, we prove E$^4$ achieves the finite-time minimax
+optimal regret with only $O(\log\log T)$ batches, and the asymptotically
+optimal regret with only $3$ batches as $T\rightarrow\infty$, where $T$ is the
+time horizon. We further prove a lower bound on the batch complexity of linear
+contextual bandits showing that any asymptotically optimal algorithm must
+require at least $3$ batches in expectation as $T\rightarrow\infty$, which
+indicates E$^4$ achieves the asymptotic optimality in regret and batch
+complexity simultaneously. To the best of our knowledge, E$^4$ is the first
+algorithm for linear bandits that simultaneously achieves the minimax and
+asymptotic optimality in regret with the corresponding optimal batch
+complexities. In addition, we show that with another choice of exploration rate
+E$^4$ achieves an instance-dependent regret bound requiring at most $O(\log T)$
+batches, and maintains the minimax optimality and asymptotic optimality. We
+conduct thorough experiments to evaluate our algorithm on randomly generated
+instances and the challenging \textit{End of Optimism} instances
+\citep{lattimore2017end} which were shown to be hard to learn for optimism
+based algorithms. Empirical results show that E$^4$ consistently outperforms
+baseline algorithms with respect to regret minimization, batch complexity, and
+computational efficiency.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>26 pages, 6 figures, 4 tables. To appear in the proceedings of the
+  41st International Conference on Machine Learning (ICML 2024)</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Legal Judgment Reimagined: PredEx and the Rise of Intelligent AI
+  Interpretation in Indian Courts 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.04136v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.04136v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Shubham Kumar Nigam, Anurag Sharma, Danush Khanna, Noel Shallum, Kripabandhu Ghosh, Arnab Bhattacharya
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  In the era of Large Language Models (LLMs), predicting judicial outcomes
+poses significant challenges due to the complexity of legal proceedings and the
+scarcity of expert-annotated datasets. Addressing this, we introduce
+\textbf{Pred}iction with \textbf{Ex}planation (\texttt{PredEx}), the largest
+expert-annotated dataset for legal judgment prediction and explanation in the
+Indian context, featuring over 15,000 annotations. This groundbreaking corpus
+significantly enhances the training and evaluation of AI models in legal
+analysis, with innovations including the application of instruction tuning to
+LLMs. This method has markedly improved the predictive accuracy and explanatory
+depth of these models for legal judgments. We employed various
+transformer-based models, tailored for both general and Indian legal contexts.
+Through rigorous lexical, semantic, and expert assessments, our models
+effectively leverage \texttt{PredEx} to provide precise predictions and
+meaningful explanations, establishing it as a valuable benchmark for both the
+legal profession and the NLP community.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Compressible Dynamics in Deep Overparameterized Low-Rank Learning &
+  Adaptation <span class="chip">ICML'24</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.04112v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.04112v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Can Yaras, Peng Wang, Laura Balzano, Qing Qu
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  While overparameterization in machine learning models offers great benefits
+in terms of optimization and generalization, it also leads to increased
+computational requirements as model sizes grow. In this work, we show that by
+leveraging the inherent low-dimensional structures of data and compressible
+dynamics within the model parameters, we can reap the benefits of
+overparameterization without the computational burdens. In practice, we
+demonstrate the effectiveness of this approach for deep low-rank matrix
+completion as well as fine-tuning language models. Our approach is grounded in
+theoretical findings for deep overparameterized low-rank matrix recovery, where
+we show that the learning dynamics of each weight matrix are confined to an
+invariant low-dimensional subspace. Consequently, we can construct and train
+compact, highly compressed factorizations possessing the same benefits as their
+overparameterized counterparts. In the context of deep matrix completion, our
+technique substantially improves training efficiency while retaining the
+advantages of overparameterization. For language model fine-tuning, we propose
+a method called "Deep LoRA", which improves the existing low-rank adaptation
+(LoRA) technique, leading to reduced overfitting and a simplified
+hyperparameter setup, while maintaining comparable efficiency. We validate the
+effectiveness of Deep LoRA on natural language tasks, particularly when
+fine-tuning with limited data.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted at ICML'24 (Oral)</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ From Tissue Plane to Organ World: A Benchmark <span class="highlight-title">Dataset</span> for Multimodal
+  Biomedical Image Registration using Deep Co-Attention Networks 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.04105v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.04105v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Yifeng Wang, Weipeng Li, Thomas Pearce, Haohan Wang
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Correlating neuropathology with neuroimaging findings provides a multiscale
+view of pathologic changes in the human organ spanning the meso- to
+micro-scales, and is an emerging methodology expected to shed light on numerous
+disease states. To gain the most information from this multimodal, multiscale
+approach, it is desirable to identify precisely where a histologic tissue
+section was taken from within the organ in order to correlate with the tissue
+features in exactly the same organ region. Histology-to-organ registration
+poses an extra challenge, as any given histologic section can capture only a
+small portion of a human organ. Making use of the capabilities of
+state-of-the-art deep learning models, we unlock the potential to address and
+solve such intricate challenges. Therefore, we create the ATOM benchmark
+dataset, sourced from diverse institutions, with the primary objective of
+transforming this challenge into a machine learning problem and delivering
+outstanding outcomes that enlighten the biomedical community. The performance
+of our RegisMCAN model demonstrates the potential of deep learning to
+accurately predict where a subregion extracted from an organ image was obtained
+from within the overall 3D volume. The code and dataset can be found at:
+https://github.com/haizailache999/Image-Registration/tree/main
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Multistep Distillation of Diffusion Models via Moment Matching 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.04103v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.04103v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Tim Salimans, Thomas Mensink, Jonathan Heek, Emiel Hoogeboom
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  We present a new method for making diffusion models faster to sample. The
+method distills many-step diffusion models into few-step models by matching
+conditional expectations of the clean data given noisy data along the sampling
+trajectory. Our approach extends recently proposed one-step methods to the
+multi-step case, and provides a new perspective by interpreting these
+approaches in terms of moment matching. By using up to 8 sampling steps, we
+obtain distilled models that outperform not only their one-step versions but
+also their original many-step teacher models, obtaining new state-of-the-art
+results on the Imagenet dataset. We also show promising results on a large
+text-to-image model where we achieve fast generation of high resolution images
+directly in image space, without needing autoencoders or upsamplers.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Enhancing Weather Predictions: Super-Resolution via Deep Diffusion
+  Models 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.04099v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.04099v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Jan Martinů, Petr Šimánek
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  This study investigates the application of deep-learning diffusion models for
+the super-resolution of weather data, a novel approach aimed at enhancing the
+spatial resolution and detail of meteorological variables. Leveraging the
+capabilities of diffusion models, specifically the SR3 and ResDiff
+architectures, we present a methodology for transforming low-resolution weather
+data into high-resolution outputs. Our experiments, conducted using the
+WeatherBench dataset, focus on the super-resolution of the two-meter
+temperature variable, demonstrating the models' ability to generate detailed
+and accurate weather maps. The results indicate that the ResDiff model, further
+improved by incorporating physics-based modifications, significantly
+outperforms traditional SR3 methods in terms of Mean Squared Error (MSE),
+Structural Similarity Index (SSIM), and Peak Signal-to-Noise Ratio (PSNR). This
+research highlights the potential of diffusion models in meteorological
+applications, offering insights into their effectiveness, challenges, and
+prospects for future advancements in weather prediction and climate analysis.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ A Large-Scale Neutral Comparison Study of Survival Models on
+  Low-Dimensional Data 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.04098v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.04098v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Lukas Burk, John Zobolas, Bernd Bischl, Andreas Bender, Marvin N. Wright, Raphael Sonabend
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  This work presents the first large-scale neutral benchmark experiment focused
+on single-event, right-censored, low-dimensional survival data. Benchmark
+experiments are essential in methodological research to scientifically compare
+new and existing model classes through proper empirical evaluation. Existing
+benchmarks in the survival literature are often narrow in scope, focusing, for
+example, on high-dimensional data. Additionally, they may lack appropriate
+tuning or evaluation procedures, or are qualitative reviews, rather than
+quantitative comparisons. This comprehensive study aims to fill the gap by
+neutrally evaluating a broad range of methods and providing generalizable
+conclusions. We benchmark 18 models, ranging from classical statistical
+approaches to many common machine learning methods, on 32 publicly available
+datasets. The benchmark tunes for both a discrimination measure and a proper
+scoring rule to assess performance in different settings. Evaluating on 8
+survival metrics, we assess discrimination, calibration, and overall predictive
+performance of the tested models. Using discrimination measures, we find that
+no method significantly outperforms the Cox model. However, (tuned) Accelerated
+Failure Time models were able to achieve significantly better results with
+respect to overall predictive performance as measured by the right-censored
+log-likelihood. Machine learning methods that performed comparably well include
+Oblique Random Survival Forests under discrimination, and Cox-based
+likelihood-boosting under overall predictive performance. We conclude that for
+predictive purposes in the standard survival analysis setting of
+low-dimensional, right-censored data, the Cox Proportional Hazards model
+remains a simple and robust method, sufficient for practitioners.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>42 pages, 28 figures</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Scaling and evaluating sparse autoencoders 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.04093v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.04093v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Leo Gao, Tom Dupré la Tour, Henk Tillman, Gabriel Goh, Rajan Troll, Alec Radford, Ilya Sutskever, Jan Leike, Jeffrey Wu
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Sparse autoencoders provide a promising unsupervised approach for extracting
+interpretable features from a language model by reconstructing activations from
+a sparse bottleneck layer. Since language models learn many concepts,
+autoencoders need to be very large to recover all relevant features. However,
+studying the properties of autoencoder scaling is difficult due to the need to
+balance reconstruction and sparsity objectives and the presence of dead
+latents. We propose using k-sparse autoencoders [Makhzani and Frey, 2013] to
+directly control sparsity, simplifying tuning and improving the
+reconstruction-sparsity frontier. Additionally, we find modifications that
+result in few dead latents, even at the largest scales we tried. Using these
+techniques, we find clean scaling laws with respect to autoencoder size and
+sparsity. We also introduce several new metrics for evaluating feature quality
+based on the recovery of hypothesized features, the explainability of
+activation patterns, and the sparsity of downstream effects. These metrics all
+generally improve with autoencoder size. To demonstrate the scalability of our
+approach, we train a 16 million latent autoencoder on GPT-4 activations for 40
+billion tokens. We release training code and autoencoders for open-source
+models, as well as a visualizer.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Interpretable Lightweight <span class="highlight-title">Transformer</span> via Unrolling of Learned Graph
+  Smoothness Priors 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.04090v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.04090v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Tam Thuc Do, Parham Eftekhar, Seyed Alireza Hosseini, Gene Cheung, Philip Chou
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  We build interpretable and lightweight transformer-like neural networks by
+unrolling iterative optimization algorithms that minimize graph smoothness
+priors -- the quadratic graph Laplacian regularizer (GLR) and the $\ell_1$-norm
+graph total variation (GTV) -- subject to an interpolation constraint. The
+crucial insight is that a normalized signal-dependent graph learning module
+amounts to a variant of the basic self-attention mechanism in conventional
+transformers. Unlike "black-box" transformers that require learning of large
+key, query and value matrices to compute scaled dot products as affinities and
+subsequent output embeddings, resulting in huge parameter sets, our unrolled
+networks employ shallow CNNs to learn low-dimensional features per node to
+establish pairwise Mahalanobis distances and construct sparse similarity
+graphs. At each layer, given a learned graph, the target interpolated signal is
+simply a low-pass filtered output derived from the minimization of an assumed
+graph smoothness prior, leading to a dramatic reduction in parameter count.
+Experiments for two image interpolation applications verify the restoration
+performance, parameter efficiency and robustness to covariate shift of our
+graph-based unrolled networks compared to conventional transformers.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ On Limitation of <span class="highlight-title">Transformer</span> for Learning HMMs 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.04089v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.04089v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Jiachen Hu, Qinghua Liu, Chi Jin
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Despite the remarkable success of Transformer-based architectures in various
+sequential modeling tasks, such as natural language processing, computer
+vision, and robotics, their ability to learn basic sequential models, like
+Hidden Markov Models (HMMs), is still unclear. This paper investigates the
+performance of Transformers in learning HMMs and their variants through
+extensive experimentation and compares them to Recurrent Neural Networks
+(RNNs). We show that Transformers consistently underperform RNNs in both
+training speed and testing accuracy across all tested HMM models. There are
+even challenging HMM instances where Transformers struggle to learn, while RNNs
+can successfully do so. Our experiments further reveal the relation between the
+depth of Transformers and the longest sequence length it can effectively learn,
+based on the types and the complexity of HMMs. To address the limitation of
+transformers in modeling HMMs, we demonstrate that a variant of the
+Chain-of-Thought (CoT), called $\textit{block CoT}$ in the training phase, can
+help transformers to reduce the evaluation error and to learn longer sequences
+at a cost of increasing the training time. Finally, we complement our empirical
+findings by theoretical results proving the expressiveness of transformers in
+approximating HMMs with logarithmic depth.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Deterministic Uncertainty Propagation for Improved Model-Based Offline
+  Reinforcement Learning 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.04088v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.04088v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Abdullah Akgül, Manuel Haußmann, Melih Kandemir
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Current approaches to model-based offline Reinforcement Learning (RL) often
+incorporate uncertainty-based reward penalization to address the distributional
+shift problem. While these approaches have achieved some success, we argue that
+this penalization introduces excessive conservatism, potentially resulting in
+suboptimal policies through underestimation. We identify as an important cause
+of over-penalization the lack of a reliable uncertainty estimator capable of
+propagating uncertainties in the Bellman operator. The common approach to
+calculating the penalty term relies on sampling-based uncertainty estimation,
+resulting in high variance. To address this challenge, we propose a novel
+method termed Moment Matching Offline Model-Based Policy Optimization (MOMBO).
+MOMBO learns a Q-function using moment matching, which allows us to
+deterministically propagate uncertainties through the Q-function. We evaluate
+MOMBO's performance across various environments and demonstrate empirically
+that MOMBO is a more stable and sample-efficient approach.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Bootstrapping Expectiles in Reinforcement Learning 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.04081v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.04081v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Pierre Clavier, Emmanuel Rachelson, Erwan Le Pennec, Matthieu Geist
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Many classic Reinforcement Learning (RL) algorithms rely on a Bellman
+operator, which involves an expectation over the next states, leading to the
+concept of bootstrapping. To introduce a form of pessimism, we propose to
+replace this expectation with an expectile. In practice, this can be very
+simply done by replacing the $L_2$ loss with a more general expectile loss for
+the critic. Introducing pessimism in RL is desirable for various reasons, such
+as tackling the overestimation problem (for which classic solutions are double
+Q-learning or the twin-critic approach of TD3) or robust RL (where transitions
+are adversarial). We study empirically these two cases. For the overestimation
+problem, we show that the proposed approach, ExpectRL, provides better results
+than a classic twin-critic. On robust RL benchmarks, involving changes of the
+environment, we show that our approach is more robust than classic RL
+algorithms. We also introduce a variation of ExpectRL combined with domain
+randomization which is competitive with state-of-the-art robust RL agents.
+Eventually, we also extend \ExpectRL with a mechanism for choosing
+automatically the expectile value, that is the degree of pessimism
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Dynamic angular synchronization under smoothness constraints 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.04071v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.04071v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Ernesto Araya, Mihai Cucuringu, Hemant Tyagi
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Given an undirected measurement graph $\mathcal{H} = ([n], \mathcal{E})$, the
+classical angular synchronization problem consists of recovering unknown angles
+$\theta_1^*,\dots,\theta_n^*$ from a collection of noisy pairwise measurements
+of the form $(\theta_i^* - \theta_j^*) \mod 2\pi$, for all $\{i,j\} \in
+\mathcal{E}$. This problem arises in a variety of applications, including
+computer vision, time synchronization of distributed networks, and ranking from
+pairwise comparisons. In this paper, we consider a dynamic version of this
+problem where the angles, and also the measurement graphs evolve over $T$ time
+points. Assuming a smoothness condition on the evolution of the latent angles,
+we derive three algorithms for joint estimation of the angles over all time
+points. Moreover, for one of the algorithms, we establish non-asymptotic
+recovery guarantees for the mean-squared error (MSE) under different
+statistical models. In particular, we show that the MSE converges to zero as
+$T$ increases under milder conditions than in the static setting. This includes
+the setting where the measurement graphs are highly sparse and disconnected,
+and also when the measurement noise is large and can potentially increase with
+$T$. We complement our theoretical results with experiments on synthetic data.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>40 pages, 9 figures</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Batch-in-Batch: a new adversarial training framework for initial
+  perturbation and sample selection 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.04070v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.04070v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Yinting Wu, Pai Peng, Bo Cai, Le Li,  .
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Adversarial training methods commonly generate independent initial
+perturbation for adversarial samples from a simple uniform distribution, and
+obtain the training batch for the classifier without selection. In this work,
+we propose a simple yet effective training framework called Batch-in-Batch (BB)
+to enhance models robustness. It involves specifically a joint construction of
+initial values that could simultaneously generates $m$ sets of perturbations
+from the original batch set to provide more diversity for adversarial samples;
+and also includes various sample selection strategies that enable the trained
+models to have smoother losses and avoid overconfident outputs. Through
+extensive experiments on three benchmark datasets (CIFAR-10, SVHN, CIFAR-100)
+with two networks (PreActResNet18 and WideResNet28-10) that are used in both
+the single-step (Noise-Fast Gradient Sign Method, N-FGSM) and multi-step
+(Projected Gradient Descent, PGD-10) adversarial training, we show that models
+trained within the BB framework consistently have higher adversarial accuracy
+across various adversarial settings, notably achieving over a 13% improvement
+on the SVHN dataset with an attack radius of 8/255 compared to the N-FGSM
+baseline model. Furthermore, experimental analysis of the efficiency of both
+the proposed initial perturbation method and sample selection strategies
+validates our insights. Finally, we show that our framework is cost-effective
+in terms of computational resources, even with a relatively large value of $m$.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>29 pages, 11 figures</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Reassessing How to Compare and Improve the Calibration of Machine
+  Learning Models 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.04068v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.04068v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Muthu Chidambaram, Rong Ge
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  A machine learning model is calibrated if its predicted probability for an
+outcome matches the observed frequency for that outcome conditional on the
+model prediction. This property has become increasingly important as the impact
+of machine learning models has continued to spread to various domains. As a
+result, there are now a dizzying number of recent papers on measuring and
+improving the calibration of (specifically deep learning) models. In this work,
+we reassess the reporting of calibration metrics in the recent literature. We
+show that there exist trivial recalibration approaches that can appear
+seemingly state-of-the-art unless calibration and prediction metrics (i.e. test
+accuracy) are accompanied by additional generalization metrics such as negative
+log-likelihood. We then derive a calibration-based decomposition of Bregman
+divergences that can be used to both motivate a choice of calibration metric
+based on a generalization metric, and to detect trivial calibration. Finally,
+we apply these ideas to develop a new extension to reliability diagrams that
+can be used to jointly visualize calibration as well as the estimated
+generalization error of a model.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>20 pages, 7 figures</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Bisimulation Metrics are Optimal Transport Distances, and Can be
+  Computed Efficiently 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.04056v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.04056v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Sergio Calo, Anders Jonsson, Gergely Neu, Ludovic Schwartz, Javier Segovia
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  We propose a new framework for formulating optimal transport distances
+between Markov chains. Previously known formulations studied couplings between
+the entire joint distribution induced by the chains, and derived solutions via
+a reduction to dynamic programming (DP) in an appropriately defined Markov
+decision process. This formulation has, however, not led to particularly
+efficient algorithms so far, since computing the associated DP operators
+requires fully solving a static optimal transport problem, and these operators
+need to be applied numerous times during the overall optimization process. In
+this work, we develop an alternative perspective by considering couplings
+between a flattened version of the joint distributions that we call discounted
+occupancy couplings, and show that calculating optimal transport distances in
+the full space of joint distributions can be equivalently formulated as solving
+a linear program (LP) in this reduced space. This LP formulation allows us to
+port several algorithmic ideas from other areas of optimal transport theory. In
+particular, our formulation makes it possible to introduce an appropriate
+notion of entropy regularization into the optimization problem, which in turn
+enables us to directly calculate optimal transport distances via a
+Sinkhorn-like method we call Sinkhorn Value Iteration (SVI). We show both
+theoretically and empirically that this method converges quickly to an optimal
+coupling, essentially at the same computational cost of running vanilla
+Sinkhorn in each pair of states. Along the way, we point out that our optimal
+transport distance exactly matches the common notion of bisimulation metrics
+between Markov chains, and thus our results also apply to computing such
+metrics, and in fact our algorithm turns out to be significantly more efficient
+than the best known methods developed so far for this purpose.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Leveraging SPD Matrices on Riemannian Manifolds in Quantum Classical
+  Hybrid Models for Structural Health Monitoring 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.04055v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.04055v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Azadeh Alavi, Sanduni Jayasinghe
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Realtime finite element modeling of bridges assists modern structural health
+monitoring systems by providing comprehensive insights into structural
+integrity. This capability is essential for ensuring the safe operation of
+bridges and preventing sudden catastrophic failures. However, FEM computational
+cost and the need for realtime analysis pose significant challenges.
+Additionally, the input data is a 7 dimensional vector, while the output is a
+1017 dimensional vector, making accurate and efficient analysis particularly
+difficult. In this study, we propose a novel hybrid quantum classical
+Multilayer Perceptron pipeline leveraging Symmetric Positive Definite matrices
+and Riemannian manifolds for effective data representation. To maintain the
+integrity of the qubit structure, we utilize SPD matrices, ensuring data
+representation is well aligned with the quantum computational framework.
+Additionally, the method leverages polynomial feature expansion to capture
+nonlinear relationships within the data. The proposed pipeline combines
+classical fully connected neural network layers with quantum circuit layers to
+enhance model performance and efficiency. Our experiments focused on various
+configurations of such hybrid models to identify the optimal structure for
+accurate and efficient realtime analysis. The best performing model achieved a
+Mean Squared Error of 0.00031, significantly outperforming traditional methods.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>3 pages, 1 figure</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Multivector Neurons: Better and Faster O(n)-Equivariant Clifford Graph
+  Neural Networks 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.04052v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.04052v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Cong Liu, David Ruhe, Patrick Forré
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Most current deep learning models equivariant to $O(n)$ or $SO(n)$ either
+consider mostly scalar information such as distances and angles or have a very
+high computational complexity. In this work, we test a few novel message
+passing graph neural networks (GNNs) based on Clifford multivectors, structured
+similarly to other prevalent equivariant models in geometric deep learning. Our
+approach leverages efficient invariant scalar features while simultaneously
+performing expressive learning on multivector representations, particularly
+through the use of the equivariant geometric product operator. By integrating
+these elements, our methods outperform established efficient baseline models on
+an N-Body simulation task and protein denoising task while maintaining a high
+efficiency. In particular, we push the state-of-the-art error on the N-body
+dataset to 0.0035 (averaged over 3 runs); an 8% improvement over recent
+methods. Our implementation is available on Github.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Slicing Mutual Information Generalization Bounds for Neural Networks <span class="chip">ICML 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.04047v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.04047v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Kimia Nadjahi, Kristjan Greenewald, Rickard Brüel Gabrielsson, Justin Solomon
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  The ability of machine learning (ML) algorithms to generalize well to unseen
+data has been studied through the lens of information theory, by bounding the
+generalization error with the input-output mutual information (MI), i.e., the
+MI between the training data and the learned hypothesis. Yet, these bounds have
+limited practicality for modern ML applications (e.g., deep learning), due to
+the difficulty of evaluating MI in high dimensions. Motivated by recent
+findings on the compressibility of neural networks, we consider algorithms that
+operate by slicing the parameter space, i.e., trained on random
+lower-dimensional subspaces. We introduce new, tighter information-theoretic
+generalization bounds tailored for such algorithms, demonstrating that slicing
+improves generalization. Our bounds offer significant computational and
+statistical advantages over standard MI bounds, as they rely on scalable
+alternative measures of dependence, i.e., disintegrated mutual information and
+$k$-sliced mutual information. Then, we extend our analysis to algorithms whose
+parameters do not need to exactly lie on random subspaces, by leveraging
+rate-distortion theory. This strategy yields generalization bounds that
+incorporate a distortion term measuring model compressibility under slicing,
+thereby tightening existing bounds without compromising performance or
+requiring model compression. Building on this, we propose a regularization
+scheme enabling practitioners to control generalization through
+compressibility. Finally, we empirically validate our results and achieve the
+computation of non-vacuous information-theoretic generalization bounds for
+neural networks, a task that was previously out of reach.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted at ICML 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Energy-based Epistemic Uncertainty for Graph Neural Networks 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.04043v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.04043v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Dominik Fuchsgruber, Tom Wollschläger, Stephan Günnemann
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  In domains with interdependent data, such as graphs, quantifying the
+epistemic uncertainty of a Graph Neural Network (GNN) is challenging as
+uncertainty can arise at different structural scales. Existing techniques
+neglect this issue or only distinguish between structure-aware and
+structure-agnostic uncertainty without combining them into a single measure. We
+propose GEBM, an energy-based model (EBM) that provides high-quality
+uncertainty estimates by aggregating energy at different structural levels that
+naturally arise from graph diffusion. In contrast to logit-based EBMs, we
+provably induce an integrable density in the data space by regularizing the
+energy function. We introduce an evidential interpretation of our EBM that
+significantly improves the predictive robustness of the GNN. Our framework is a
+simple and effective post hoc method applicable to any pre-trained GNN that is
+sensitive to various distribution shifts. It consistently achieves the best
+separation of in-distribution and out-of-distribution data on 6 out of 7
+anomaly types while having the best average rank over shifts on \emph{all}
+datasets.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Linear Opinion Pooling for Uncertainty Quantification on Graphs <span class="chip">UAI 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.04041v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.04041v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Clemens Damke, Eyke Hüllermeier
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  We address the problem of uncertainty quantification for graph-structured
+data, or, more specifically, the problem to quantify the predictive uncertainty
+in (semi-supervised) node classification. Key questions in this regard concern
+the distinction between two different types of uncertainty, aleatoric and
+epistemic, and how to support uncertainty quantification by leveraging the
+structural information provided by the graph topology. Challenging assumptions
+and postulates of state-of-the-art methods, we propose a novel approach that
+represents (epistemic) uncertainty in terms of mixtures of Dirichlet
+distributions and refers to the established principle of linear opinion pooling
+for propagating information between neighbored nodes in the graph. The
+effectiveness of this approach is demonstrated in a series of experiments on a
+variety of graph-structured datasets.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted for the 40th Conference on Uncertainty in Artificial
+  Intelligence (UAI 2024). Implementation available at
+  https://github.com/Cortys/gpn-extensions</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Shaping History: Advanced Machine Learning Techniques for the Analysis
+  and Dating of Cuneiform Tablets over Three Millennia 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.04039v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.04039v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Danielle Kapon, Michael Fire, Shai Gordin
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Cuneiform tablets, emerging in ancient Mesopotamia around the late fourth
+millennium BCE, represent one of humanity's earliest writing systems.
+Characterized by wedge-shaped marks on clay tablets, these artifacts provided
+insight into Mesopotamian civilization across various domains. Traditionally,
+the analysis and dating of these tablets rely on subjective assessment of shape
+and writing style, leading to uncertainties in pinpointing their exact temporal
+origins. Recent advances in digitization have revolutionized the study of
+cuneiform by enhancing accessibility and analytical capabilities. Our research
+uniquely focuses on the silhouette of tablets as significant indicators of
+their historical periods, diverging from most studies that concentrate on
+textual content. Utilizing an unprecedented dataset of over 94,000 images from
+the Cuneiform Digital Library Initiative collection, we apply deep learning
+methods to classify cuneiform tablets, covering over 3,000 years of history. By
+leveraging statistical, computational techniques, and generative modeling
+through Variational Auto-Encoders (VAEs), we achieve substantial advancements
+in the automatic classification of these ancient documents, focusing on the
+tablets' silhouettes as key predictors. Our classification approach begins with
+a Decision Tree using height-to-width ratios and culminates with a ResNet50
+model, achieving a 61% macro F1-score for tablet silhouettes. Moreover, we
+introduce novel VAE-powered tools to enhance explainability and enable
+researchers to explore changes in tablet shapes across different eras and
+genres. This research contributes to document analysis and diplomatics by
+demonstrating the value of large-scale data analysis combined with statistical
+methods. These insights offer valuable tools for historians and epigraphists,
+enriching our understanding of cuneiform tablets and the cultures that produced
+them.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>24 pages, 18 figures</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Road Network Representation Learning with the Third Law of Geography 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.04038v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.04038v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Haicang Zhou, Weiming Huang, Yile Chen, Tiantian He, Gao Cong, Yew-Soon Ong
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Road network representation learning aims to learn compressed and effective
+vectorized representations for road segments that are applicable to numerous
+tasks. In this paper, we identify the limitations of existing methods,
+particularly their overemphasis on the distance effect as outlined in the First
+Law of Geography. In response, we propose to endow road network representation
+with the principles of the recent Third Law of Geography. To this end, we
+propose a novel graph contrastive learning framework that employs geographic
+configuration-aware graph augmentation and spectral negative sampling, ensuring
+that road segments with similar geographic configurations yield similar
+representations, and vice versa, aligning with the principles stated in the
+Third Law. The framework further fuses the Third Law with the First Law through
+a dual contrastive learning objective to effectively balance the implications
+of both laws. We evaluate our framework on two real-world datasets across three
+downstream tasks. The results show that the integration of the Third Law
+significantly improves the performance of road segment representations in
+downstream tasks.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Spatio-temporal Early Prediction based on Multi-objective Reinforcement
+  Learning 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.04035v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.04035v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Wei Shao, Yufan Kang, Ziyan Peng, Xiao Xiao, Lei Wang, Yuhui Yang, Flora D Salim
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Accuracy and timeliness are indeed often conflicting goals in prediction
+tasks. Premature predictions may yield a higher rate of false alarms, whereas
+delaying predictions to gather more information can render them too late to be
+useful. In applications such as wildfires, crimes, and traffic jams, timely
+predictions are vital for safeguarding human life and property. Consequently,
+finding a balance between accuracy and timeliness is crucial. In this paper, we
+propose a spatio-temporal early prediction model based on Multi-Objective
+reinforcement learning that can either implement an optimal policy given a
+preference or infer the preference based on a small number of samples. The
+model addresses two primary challenges: 1) enhancing the accuracy of early
+predictions and 2) providing the optimal policy for determining the most
+suitable prediction time for each area. Our method demonstrates superior
+performance on three large-scale real-world datasets, surpassing existing
+methods in early spatio-temporal prediction tasks.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Conference</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ <span class="highlight-title">Pre-train</span>ed <span class="highlight-title">Transformer</span> Uncovers Meaningful Patterns in Human Mobility
+  Data 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.04029v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.04029v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Alameen Najjar
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  We empirically demonstrate that a transformer pre-trained on country-scale
+unlabeled human mobility data learns embeddings capable, through fine-tuning,
+of developing a deep understanding of the target geography and its
+corresponding mobility patterns. Utilizing an adaptation framework, we evaluate
+the performance of our pre-trained embeddings in encapsulating a broad spectrum
+of concepts directly and indirectly related to human mobility. This includes
+basic notions, such as geographic location and distance, and extends to more
+complex constructs, such as administrative divisions and land cover. Our
+extensive empirical analysis reveals a substantial performance boost gained
+from pre-training, reaching up to 38% in tasks such as tree-cover regression.
+We attribute this result to the ability of the pre-training to uncover
+meaningful patterns hidden in the raw data, beneficial for modeling relevant
+high-level concepts. The pre-trained embeddings emerge as robust
+representations of regions and trajectories, potentially valuable for a wide
+range of downstream applications.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>10 pages, 12 figures, 14 tables</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Variational inference, Mixture of Gaussians, Bayesian Machine Learning 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.04012v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.04012v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Tom Huix, Anna Korba, Alain Durmus, Eric Moulines
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Variational inference (VI) is a popular approach in Bayesian inference, that
+looks for the best approximation of the posterior distribution within a
+parametric family, minimizing a loss that is typically the (reverse)
+Kullback-Leibler (KL) divergence. Despite its empirical success, the
+theoretical properties of VI have only received attention recently, and mostly
+when the parametric family is the one of Gaussians. This work aims to
+contribute to the theoretical study of VI in the non-Gaussian case by
+investigating the setting of Mixture of Gaussians with fixed covariance and
+constant weights. In this view, VI over this specific family can be casted as
+the minimization of a Mollified relative entropy, i.e. the KL between the
+convolution (with respect to a Gaussian kernel) of an atomic measure supported
+on Diracs, and the target distribution. The support of the atomic measure
+corresponds to the localization of the Gaussian components. Hence, solving
+variational inference becomes equivalent to optimizing the positions of the
+Diracs (the particles), which can be done through gradient descent and takes
+the form of an interacting particle system. We study two sources of error of
+variational inference in this context when optimizing the mollified relative
+entropy. The first one is an optimization result, that is a descent lemma
+establishing that the algorithm decreases the objective at each iteration. The
+second one is an approximation error, that upper bounds the objective between
+an optimal finite mixture and the target distribution.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Unveiling the Dynamics of Information Interplay in Supervised Learning <span class="chip">ICML 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.03999v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.03999v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Kun Song, Zhiquan Tan, Bochao Zou, Huimin Ma, Weiran Huang
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  In this paper, we use matrix information theory as an analytical tool to
+analyze the dynamics of the information interplay between data representations
+and classification head vectors in the supervised learning process.
+Specifically, inspired by the theory of Neural Collapse, we introduce matrix
+mutual information ratio (MIR) and matrix entropy difference ratio (HDR) to
+assess the interactions of data representation and class classification heads
+in supervised learning, and we determine the theoretical optimal values for MIR
+and HDR when Neural Collapse happens. Our experiments show that MIR and HDR can
+effectively explain many phenomena occurring in neural networks, for example,
+the standard supervised training dynamics, linear mode connectivity, and the
+performance of label smoothing and pruning. Additionally, we use MIR and HDR to
+gain insights into the dynamics of grokking, which is an intriguing phenomenon
+observed in supervised training, where the model demonstrates generalization
+capabilities long after it has learned to fit the training data. Furthermore,
+we introduce MIR and HDR as loss terms in supervised and semi-supervised
+learning to optimize the information interactions among samples and
+classification heads. The empirical results provide evidence of the method's
+effectiveness, demonstrating that the utilization of MIR and HDR not only aids
+in comprehending the dynamics throughout the training process but can also
+enhances the training procedure itself.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted by ICML 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ HackAtari: Atari Learning Environments for Robust and Continual
+  Reinforcement Learning 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.03997v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.03997v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Quentin Delfosse, Jannis Blüml, Bjarne Gregori, Kristian Kersting
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Artificial agents' adaptability to novelty and alignment with intended
+behavior is crucial for their effective deployment. Reinforcement learning (RL)
+leverages novelty as a means of exploration, yet agents often struggle to
+handle novel situations, hindering generalization. To address these issues, we
+propose HackAtari, a framework introducing controlled novelty to the most
+common RL benchmark, the Atari Learning Environment. HackAtari allows us to
+create novel game scenarios (including simplification for curriculum learning),
+to swap the game elements' colors, as well as to introduce different reward
+signals for the agent. We demonstrate that current agents trained on the
+original environments include robustness failures, and evaluate HackAtari's
+efficacy in enhancing RL agents' robustness and aligning behavior through
+experiments using C51 and PPO. Overall, HackAtari can be used to improve the
+robustness of current and future RL algorithms, allowing Neuro-Symbolic RL,
+curriculum RL, causal RL, as well as LLM-driven RL. Our work underscores the
+significance of developing interpretable in RL agents.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>9 main pages, 4 pages references, 19 pages of appendix</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Position: Embracing Negative Results in Machine Learning 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.03980v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.03980v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Florian Karl, Lukas Malte Kemeter, Gabriel Dax, Paulina Sierak
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Publications proposing novel machine learning methods are often primarily
+rated by exhibited predictive performance on selected problems. In this
+position paper we argue that predictive performance alone is not a good
+indicator for the worth of a publication. Using it as such even fosters
+problems like inefficiencies of the machine learning research community as a
+whole and setting wrong incentives for researchers. We therefore put out a call
+for the publication of "negative" results, which can help alleviate some of
+these problems and improve the scientific output of the machine learning
+research community. To substantiate our position, we present the advantages of
+publishing negative results and provide concrete measures for the community to
+move towards a paradigm where their publication is normalized.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Mini Honor of Kings: A Lightweight Environment for Multi-Agent
+  Reinforcement Learning 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.03978v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.03978v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Lin Liu, Jian Zhao, Cheng Hu, Zhengtao Cao, Youpeng Zhao, Zhenbin Ye, Meng Meng, Wenjun Wang, Zhaofeng He, Houqiang Li, Xia Lin, Lanxiao Huang
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Games are widely used as research environments for multi-agent reinforcement
+learning (MARL), but they pose three significant challenges: limited
+customization, high computational demands, and oversimplification. To address
+these issues, we introduce the first publicly available map editor for the
+popular mobile game Honor of Kings and design a lightweight environment, Mini
+Honor of Kings (Mini HoK), for researchers to conduct experiments. Mini HoK is
+highly efficient, allowing experiments to be run on personal PCs or laptops
+while still presenting sufficient challenges for existing MARL algorithms. We
+have tested our environment on common MARL algorithms and demonstrated that
+these algorithms have yet to find optimal solutions within this environment.
+This facilitates the dissemination and advancement of MARL methods within the
+research community. Additionally, we hope that more researchers will leverage
+the Honor of Kings map editor to develop innovative and scientifically valuable
+new maps. Our code and user manual are available at:
+https://github.com/tencent-ailab/mini-hok.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Subhomogeneous Deep Equilibrium Models 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2403.00720v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2403.00720v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Pietro Sittoni, Francesco Tudisco
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Implicit-depth neural networks have grown as powerful alternatives to
+traditional networks in various applications in recent years. However, these
+models often lack guarantees of existence and uniqueness, raising stability,
+performance, and reproducibility issues. In this paper, we present a new
+analysis of the existence and uniqueness of fixed points for implicit-depth
+neural networks based on the concept of subhomogeneous operators and the
+nonlinear Perron-Frobenius theory. Compared to previous similar analyses, our
+theory allows for weaker assumptions on the parameter matrices, thus yielding a
+more flexible framework for well-defined implicit networks. We illustrate the
+performance of the resulting subhomogeneous networks on feedforward,
+convolutional, and graph neural network examples.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Eureka-Moments in <span class="highlight-title">Transformer</span>s: Multi-Step Tasks Reveal Softmax Induced
+  Optimization Problems <span class="chip">ICML 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2310.12956v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2310.12956v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        David T. Hoffmann, Simon Schrodi, Jelena Bratulić, Nadine Behrmann, Volker Fischer, Thomas Brox
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  In this work, we study rapid improvements of the training loss in
+transformers when being confronted with multi-step decision tasks. We found
+that transformers struggle to learn the intermediate task and both training and
+validation loss saturate for hundreds of epochs. When transformers finally
+learn the intermediate task, they do this rapidly and unexpectedly. We call
+these abrupt improvements Eureka-moments, since the transformer appears to
+suddenly learn a previously incomprehensible concept. We designed synthetic
+tasks to study the problem in detail, but the leaps in performance can be
+observed also for language modeling and in-context learning (ICL). We suspect
+that these abrupt transitions are caused by the multi-step nature of these
+tasks. Indeed, we find connections and show that ways to improve on the
+synthetic multi-step tasks can be used to improve the training of language
+modeling and ICL. Using the synthetic data we trace the problem back to the
+Softmax function in the self-attention block of transformers and show ways to
+alleviate the problem. These fixes reduce the required number of training
+steps, lead to higher likelihood to learn the intermediate task, to higher
+final accuracy and training becomes more robust to hyper-parameters.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted at ICML 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Event-Triggered Time-Varying Bayesian Optimization 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2208.10790v5">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2208.10790v5.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Paul Brunzema, Alexander von Rohr, Friedrich Solowjow, Sebastian Trimpe
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  We consider the problem of sequentially optimizing a time-varying objective
+function using time-varying Bayesian optimization (TVBO). To cope with stale
+data arising from time variations, current approaches to TVBO require prior
+knowledge of a constant rate of change. However, in practice, the rate of
+change is usually unknown. We propose an event-triggered algorithm, ET-GP-UCB,
+that treats the optimization problem as static until it detects changes in the
+objective function and then resets the dataset. This allows the algorithm to
+adapt online to realized temporal changes without the need for exact prior
+knowledge. The event trigger is based on probabilistic uniform error bounds
+used in Gaussian process regression. We derive regret bounds of adaptive resets
+without exact prior knowledge on the temporal changes, and show in numerical
+experiments that ET-GP-UCB outperforms state-of-the-art algorithms on both
+synthetic and real-world data. The results demonstrate that ET-GP-UCB is
+readily applicable to various settings without extensive hyperparameter tuning.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ TRAP: Targeted Random Adversarial <span class="highlight-title">Prompt</span> Honeypot for Black-Box
+  Identification <span class="chip">ACL 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2402.12991v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2402.12991v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Martin Gubri, Dennis Ulmer, Hwaran Lee, Sangdoo Yun, Seong Joon Oh
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Large Language Model (LLM) services and models often come with legal rules on
+who can use them and how they must use them. Assessing the compliance of the
+released LLMs is crucial, as these rules protect the interests of the LLM
+contributor and prevent misuse. In this context, we describe the novel
+fingerprinting problem of Black-box Identity Verification (BBIV). The goal is
+to determine whether a third-party application uses a certain LLM through its
+chat function. We propose a method called Targeted Random Adversarial Prompt
+(TRAP) that identifies the specific LLM in use. We repurpose adversarial
+suffixes, originally proposed for jailbreaking, to get a pre-defined answer
+from the target LLM, while other models give random answers. TRAP detects the
+target LLMs with over 95% true positive rate at under 0.2% false positive rate
+even after a single interaction. TRAP remains effective even if the LLM has
+minor changes that do not significantly alter the original function.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted at ACL 2024 (findings)</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Don't Rank, Combine! Combining Machine Translation Hypotheses Using
+  Quality Estimation <span class="chip">ACL 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2401.06688v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2401.06688v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Giorgos Vernikos, Andrei Popescu-Belis
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Neural machine translation systems estimate probabilities of target sentences
+given source sentences, yet these estimates may not align with human
+preferences. This work introduces QE-fusion, a method that synthesizes
+translations using a quality estimation metric (QE), which correlates better
+with human judgments. QE-fusion leverages a pool of candidates sampled from a
+model, combining spans from different candidates using a QE metric such as
+CometKiwi. We compare QE-fusion against beam search and recent reranking
+techniques, such as Minimum Bayes Risk decoding or QE-reranking. Our method
+consistently improves translation quality in terms of COMET and BLEURT scores
+when applied to large language models (LLMs) used for translation (PolyLM,
+XGLM, Llama2, Mistral, ALMA, and Tower) and to multilingual translation models
+(NLLB), over five language pairs. Notably, QE-fusion exhibits larger
+improvements for LLMs due to their ability to generate diverse outputs. We
+demonstrate that our approach generates novel translations in over half of the
+cases and consistently outperforms other methods across varying numbers of
+candidates (5-200). Furthermore, we empirically establish that QE-fusion scales
+linearly with the number of candidates in the pool.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted at ACL 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ LiveCodeBench: Holistic and Contamination Free Evaluation of Large
+  Language Models for Code 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2403.07974v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2403.07974v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Naman Jain, King Han, Alex Gu, Wen-Ding Li, Fanjia Yan, Tianjun Zhang, Sida Wang, Armando Solar-Lezama, Koushik Sen, Ion Stoica
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Large Language Models (LLMs) applied to code-related applications have
+emerged as a prominent field, attracting significant interest from both
+academia and industry. However, as new and improved LLMs are developed,
+existing evaluation benchmarks (e.g., HumanEval, MBPP) are no longer sufficient
+for assessing their capabilities. In this work, we propose LiveCodeBench, a
+comprehensive and contamination-free evaluation of LLMs for code, which
+continuously collects new problems over time from contests across three
+competition platforms, namely LeetCode, AtCoder, and CodeForces. Notably, our
+benchmark also focuses on a broader range of code related capabilities, such as
+self-repair, code execution, and test output prediction, beyond just code
+generation. Currently, LiveCodeBench hosts four hundred high-quality coding
+problems that were published between May 2023 and May 2024. We have evaluated
+18 base LLMs and 34 instruction-tuned LLMs on LiveCodeBench. We present
+empirical findings on contamination, holistic performance comparisons,
+potential overfitting in existing benchmarks as well as individual model
+comparisons. We will release all prompts and model completions for further
+community analysis, along with a general toolkit for adding new scenarios and
+model
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Website - https://livecodebench.github.io/</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Rolling Diffusion Models 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2402.09470v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2402.09470v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        David Ruhe, Jonathan Heek, Tim Salimans, Emiel Hoogeboom
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Diffusion models have recently been increasingly applied to temporal data
+such as video, fluid mechanics simulations, or climate data. These methods
+generally treat subsequent frames equally regarding the amount of noise in the
+diffusion process. This paper explores Rolling Diffusion: a new approach that
+uses a sliding window denoising process. It ensures that the diffusion process
+progressively corrupts through time by assigning more noise to frames that
+appear later in a sequence, reflecting greater uncertainty about the future as
+the generation process unfolds. Empirically, we show that when the temporal
+dynamics are complex, Rolling Diffusion is superior to standard diffusion. In
+particular, this result is demonstrated in a video prediction task using the
+Kinetics-600 video dataset and in a chaotic fluid dynamics forecasting
+experiment.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ ReGAL: Refactoring Programs to Discover Generalizable Abstractions <span class="chip">ICML 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2401.16467v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2401.16467v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Elias Stengel-Eskin, Archiki Prasad, Mohit Bansal
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  While large language models (LLMs) are increasingly being used for program
+synthesis, they lack the global view needed to develop useful abstractions;
+they generally predict programs one at a time, often repeating the same
+functionality. Generating redundant code from scratch is both inefficient and
+error-prone. To address this, we propose Refactoring for Generalizable
+Abstraction Learning (ReGAL), a gradient-free method for learning a library of
+reusable functions via code refactorization, i.e., restructuring code without
+changing its execution output. ReGAL learns from a small set of existing
+programs, iteratively verifying and refining its abstractions via execution. We
+find that the shared function libraries discovered by ReGAL make programs
+easier to predict across diverse domains. On five datasets -- LOGO graphics
+generation, Date reasoning, TextCraft (a Minecraft-based text-game) MATH, and
+TabMWP -- both open-source and proprietary LLMs improve in accuracy when
+predicting programs with ReGAL functions. For CodeLlama-13B, ReGAL results in
+absolute accuracy increases of 11.5% on LOGO, 26.1% on date understanding, and
+8.1% on TextCraft, outperforming GPT-3.5 in two of three domains. Our analysis
+reveals ReGAL's abstractions encapsulate frequently-used subroutines as well as
+environment dynamics.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>ICML 2024 Camera-Ready; First two authors contributed equally; Code:
+  https://github.com/esteng/regal_program_learning</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ An operator learning perspective on parameter-to-observable maps 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2402.06031v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2402.06031v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Daniel Zhengyu Huang, Nicholas H. Nelsen, Margaret Trautner
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Computationally efficient surrogates for parametrized physical models play a
+crucial role in science and engineering. Operator learning provides data-driven
+surrogates that map between function spaces. However, instead of full-field
+measurements, often the available data are only finite-dimensional
+parametrizations of model inputs or finite observables of model outputs.
+Building on Fourier Neural Operators, this paper introduces the Fourier Neural
+Mappings (FNMs) framework that is able to accommodate such finite-dimensional
+vector inputs or outputs. The paper develops universal approximation theorems
+for the method. Moreover, in many applications the underlying
+parameter-to-observable (PtO) map is defined implicitly through an
+infinite-dimensional operator, such as the solution operator of a partial
+differential equation. A natural question is whether it is more data-efficient
+to learn the PtO map end-to-end or first learn the solution operator and
+subsequently compute the observable from the full-field solution. A theoretical
+analysis of Bayesian nonparametric regression of linear functionals, which is
+of independent interest, suggests that the end-to-end approach can actually
+have worse sample complexity. Extending beyond the theory, numerical results
+for the FNM approximation of three nonlinear PtO maps demonstrate the benefits
+of the operator learning perspective that this paper adopts.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>63 pages, 10 figures, 1 table</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Multi-group Learning for Hierarchical Groups <span class="chip">ICML
+  2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2402.00258v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2402.00258v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Samuel Deng, Daniel Hsu
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  The multi-group learning model formalizes the learning scenario in which a
+single predictor must generalize well on multiple, possibly overlapping
+subgroups of interest. We extend the study of multi-group learning to the
+natural case where the groups are hierarchically structured. We design an
+algorithm for this setting that outputs an interpretable and deterministic
+decision tree predictor with near-optimal sample complexity. We then conduct an
+empirical evaluation of our algorithm and find that it achieves attractive
+generalization properties on real datasets with hierarchical group structure.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted in International Conference on Machine Learning 2024 (ICML
+  2024)</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Reflect-RL: Two-Player Online RL Fine-Tuning for LMs <span class="chip">ACL 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2402.12621v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2402.12621v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Runlong Zhou, Simon S. Du, Beibin Li
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  As language models (LMs) demonstrate their capabilities in various fields,
+their application to tasks requiring multi-round interactions has become
+increasingly popular. These tasks usually have complex dynamics, so supervised
+fine-tuning (SFT) on a limited offline dataset does not yield good performance.
+However, only a few works attempted to directly train the LMs within
+interactive decision-making environments. We aim to create an effective
+approach to fine-tune LMs with online reinforcement learning (RL) in these
+environments. We propose Reflect-RL, a two-player system to fine-tune an LM
+using SFT and online RL, where a frozen reflection model (player) assists the
+policy model (player). To generate data for the warm-up SFT stage, we use
+negative example generation to enhance the error-correction ability of the
+reflection model. Furthermore, we designed single-prompt action enumeration and
+applied curriculum learning to allow the policy model to learn more
+efficiently. Empirically, we verify that Reflect-RL outperforms SFT and online
+RL without reflection. Testing results indicate GPT-2 XL 1.56B fine-tuned with
+Reflect-RL outperforms larger open-source LMs, such as Mistral 7B. The
+benchmarks, dataset, and code involved in this work are publicly available:
+https://github.com/zhourunlong/Reflect-RL.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>ACL 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Learning from higher-order statistics, efficiently: hypothesis tests,
+  random features, and neural networks 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2312.14922v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2312.14922v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Eszter Székely, Lorenzo Bardone, Federica Gerace, Sebastian Goldt
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Neural networks excel at discovering statistical patterns in high-dimensional
+data sets. In practice, higher-order cumulants, which quantify the non-Gaussian
+correlations between three or more variables, are particularly important for
+the performance of neural networks. But how efficient are neural networks at
+extracting features from higher-order cumulants? We study this question in the
+spiked cumulant model, where the statistician needs to recover a privileged
+direction or "spike" from the order-$p\ge 4$ cumulants of $d$-dimensional
+inputs. Existing literature established the presence of a wide
+statistical-to-computational gap in this problem. We deepen this line of work
+by finding an exact formula for the likelihood ratio norm which proves that
+statistical distinguishability requires $n\gtrsim d$ samples, while
+distinguishing the two distributions in polynomial time requires $n \gtrsim
+d^2$ samples for a wide class of algorithms, i.e. those covered by the
+low-degree conjecture. Numerical experiments show that neural networks do
+indeed learn to distinguish the two distributions with quadratic sample
+complexity, while "lazy" methods like random features are not better than
+random guessing in this regime. Our results show that neural networks extract
+information from higher-ordercorrelations in the spiked cumulant model
+efficiently, and reveal a large gap in the amount of data required by neural
+networks and random features to learn from higher-order cumulants.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ How Private are DP-SGD Implementations? <span class="chip">ICML 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2403.17673v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2403.17673v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Lynn Chua, Badih Ghazi, Pritish Kamath, Ravi Kumar, Pasin Manurangsi, Amer Sinha, Chiyuan Zhang
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  We demonstrate a substantial gap between the privacy guarantees of the
+Adaptive Batch Linear Queries (ABLQ) mechanism under different types of batch
+sampling: (i) Shuffling, and (ii) Poisson subsampling; the typical analysis of
+Differentially Private Stochastic Gradient Descent (DP-SGD) follows by
+interpreting it as a post-processing of ABLQ. While shuffling-based DP-SGD is
+more commonly used in practical implementations, it has not been amenable to
+easy privacy analysis, either analytically or even numerically. On the other
+hand, Poisson subsampling-based DP-SGD is challenging to scalably implement,
+but has a well-understood privacy analysis, with multiple open-source
+numerically tight privacy accountants available. This has led to a common
+practice of using shuffling-based DP-SGD in practice, but using the privacy
+analysis for the corresponding Poisson subsampling version. Our result shows
+that there can be a substantial gap between the privacy analysis when using the
+two types of batch sampling, and thus advises caution in reporting privacy
+parameters for DP-SGD.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Proceedings of ICML 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Exploiting Code Symmetries for Learning Program Semantics 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2308.03312v8">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2308.03312v8.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Kexin Pei, Weichen Li, Qirui Jin, Shuyang Liu, Scott Geng, Lorenzo Cavallaro, Junfeng Yang, Suman Jana
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  This paper tackles the challenge of teaching code semantics to Large Language
+Models (LLMs) for program analysis by incorporating code symmetries into the
+model architecture. We introduce a group-theoretic framework that defines code
+symmetries as semantics-preserving transformations, where forming a code
+symmetry group enables precise and efficient reasoning of code semantics. Our
+solution, SymC, develops a novel variant of self-attention that is provably
+equivariant to code symmetries from the permutation group defined over the
+program dependence graph. SymC obtains superior performance on five program
+analysis tasks, outperforming state-of-the-art code models without any
+pre-training. Our results suggest that code LLMs that encode the code
+structural prior via the code symmetry group generalize better and faster.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Machine Learning-Assisted Discovery of Flow Reactor Designs 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2308.08841v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2308.08841v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Tom Savage, Nausheen Basha, Jonathan McDonough, James Krassowski, Omar K Matar, Ehecatl Antonio del Rio Chanona
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Additive manufacturing has enabled the fabrication of advanced reactor
+geometries, permitting larger, more complex design spaces. Identifying
+promising configurations within such spaces presents a significant challenge
+for current approaches. Furthermore, existing parameterisations of reactor
+geometries are low-dimensional with expensive optimisation limiting more
+complex solutions. To address this challenge, we establish a machine
+learning-assisted approach for the design of the next-generation of chemical
+reactors, combining the application of high-dimensional parameterisations,
+computational fluid dynamics, and multi-fidelity Bayesian optimisation. We
+associate the development of mixing-enhancing vortical flow structures in novel
+coiled reactors with performance, and use our approach to identify key
+characteristics of optimal designs. By appealing to the principles of flow
+dynamics, we rationalise the selection of novel design features that lead to
+experimental plug flow performance improvements of 60% over conventional
+designs. Our results demonstrate that coupling advanced manufacturing
+techniques with `augmented-intelligence' approaches can lead to superior design
+performance and, consequently, emissions-reduction and sustainability.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>11 pages, 9 figures, as accepted Nature Chemical Engineering</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Spatial-Temporal Graph Representation Learning for Tactical Networks
+  Future State Prediction 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2403.13872v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2403.13872v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Liu Junhua, Albrethsen Justin, Goh Lincoln, Yau David, Lim Kwan Hui
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Resource allocation in tactical ad-hoc networks presents unique challenges
+due to their dynamic and multi-hop nature. Accurate prediction of future
+network connectivity is essential for effective resource allocation in such
+environments. In this paper, we introduce the Spatial-Temporal Graph
+Encoder-Decoder (STGED) framework for Tactical Communication Networks that
+leverages both spatial and temporal features of network states to learn latent
+tactical behaviors effectively. STGED hierarchically utilizes graph-based
+attention mechanism to spatially encode a series of communication network
+states, leverages a recurrent neural network to temporally encode the evolution
+of states, and a fully-connected feed-forward network to decode the
+connectivity in the future state. Through extensive experiments, we demonstrate
+that STGED consistently outperforms baseline models by large margins across
+different time-steps input, achieving an accuracy of up to 99.2\% for the
+future state prediction task of tactical communication networks.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Transfer Learning for Latent Variable Network Models 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.03437v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.03437v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Akhil Jalan, Arya Mazumdar, Soumendu Sundar Mukherjee, Purnamrita Sarkar
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  We study transfer learning for estimation in latent variable network models.
+In our setting, the conditional edge probability matrices given the latent
+variables are represented by $P$ for the source and $Q$ for the target. We wish
+to estimate $Q$ given two kinds of data: (1) edge data from a subgraph induced
+by an $o(1)$ fraction of the nodes of $Q$, and (2) edge data from all of $P$.
+If the source $P$ has no relation to the target $Q$, the estimation error must
+be $\Omega(1)$. However, we show that if the latent variables are shared, then
+vanishing error is possible. We give an efficient algorithm that utilizes the
+ordering of a suitably defined graph distance. Our algorithm achieves $o(1)$
+error and does not assume a parametric form on the source or target networks.
+Next, for the specific case of Stochastic Block Models we prove a minimax lower
+bound and show that a simple algorithm achieves this rate. Finally, we
+empirically demonstrate our algorithm's use on real-world and simulated graph
+transfer problems.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ A Study of Optimizations for Fine-tuning Large Language Models 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.02290v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.02290v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Arjun Singh, Nikhil Pandey, Anup Shirgaonkar, Pavan Manoj, Vijay Aski
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Fine-tuning large language models is a popular choice among users trying to
+adapt them for specific applications. However, fine-tuning these models is a
+demanding task because the user has to examine several factors, such as
+resource budget, runtime, model size and context length among others. A
+specific challenge is that fine-tuning is memory intensive, imposing
+constraints on the required hardware memory and context length of training data
+that can be handled. In this work, we share a detailed study on a variety of
+fine-tuning optimizations across different fine-tuning scenarios. In
+particular, we assess Gradient Checkpointing, Low-Rank Adaptation, DeepSpeed's
+Zero Redundancy Optimizer and FlashAttention. With a focus on memory and
+runtime, we examine the impact of different optimization combinations on GPU
+memory usage and execution runtime during fine-tuning phase. We provide our
+recommendation on the best default optimization for balancing memory and
+runtime across diverse model sizes. We share effective strategies for
+fine-tuning very large models with tens or hundreds of billions of parameters
+and enabling large context lengths during fine-tuning. Furthermore, we propose
+the appropriate optimization mixtures for fine-tuning under GPU resource
+limitations.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>10 pages, 4 figures. Revised text for clarity, updated references</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Does <span class="highlight-title">Pre-train</span>ed Language Model Actually Infer Unseen Links in Knowledge
+  Graph Completion? <span class="chip">NAACL 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2311.09109v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2311.09109v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Yusuke Sakai, Hidetaka Kamigaito, Katsuhiko Hayashi, Taro Watanabe
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Knowledge graphs (KGs) consist of links that describe relationships between
+entities. Due to the difficulty of manually enumerating all relationships
+between entities, automatically completing them is essential for KGs. Knowledge
+Graph Completion (KGC) is a task that infers unseen relationships between
+entities in a KG. Traditional embedding-based KGC methods, such as RESCAL,
+TransE, DistMult, ComplEx, RotatE, HAKE, HousE, etc., infer missing links using
+only the knowledge from training data. In contrast, the recent Pre-trained
+Language Model (PLM)-based KGC utilizes knowledge obtained during pre-training.
+Therefore, PLM-based KGC can estimate missing links between entities by reusing
+memorized knowledge from pre-training without inference. This approach is
+problematic because building KGC models aims to infer unseen links between
+entities. However, conventional evaluations in KGC do not consider inference
+and memorization abilities separately. Thus, a PLM-based KGC method, which
+achieves high performance in current KGC evaluations, may be ineffective in
+practical applications. To address this issue, we analyze whether PLM-based KGC
+methods make inferences or merely access memorized knowledge. For this purpose,
+we propose a method for constructing synthetic datasets specified in this
+analysis and conclude that PLMs acquire the inference abilities required for
+KGC through pre-training, even though the performance improvements mostly come
+from textual information of entities and relations.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted at NAACL 2024 main oral, 15 pages, 10 figures</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Predictive Uncertainty Quantification via Risk Decompositions for
+  Strictly Proper Scoring Rules 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2402.10727v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2402.10727v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Nikita Kotelevskii, Maxim Panov
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Uncertainty quantification in predictive modeling often relies on ad hoc
+methods as there is no universally accepted formal framework for that. This
+paper introduces a theoretical approach to understanding uncertainty through
+statistical risks, distinguishing between aleatoric (data-related) and
+epistemic (model-related) uncertainties. We explain how to split pointwise risk
+into Bayes risk and excess risk. In particular, we show that excess risk,
+related to epistemic uncertainty, aligns with Bregman divergences. To turn
+considered risk measures into actual uncertainty estimates, we suggest using
+the Bayesian approach by approximating the risks with the help of posterior
+distributions. We tested our method on image datasets, evaluating its
+performance in detecting out-of-distribution and misclassified data using the
+AUROC metric. Our results confirm the effectiveness of the considered approach
+and offer practical guidance for estimating uncertainty in real-world
+applications.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Parameter-Adaptive Approximate MPC: Tuning Neural-Network Controllers
+  without Retraining 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2404.05835v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2404.05835v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Henrik Hose, Alexander Gräfe, Sebastian Trimpe
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Model Predictive Control (MPC) is a method to control nonlinear systems with
+guaranteed stability and constraint satisfaction but suffers from high
+computation times. Approximate MPC (AMPC) with neural networks (NNs) has
+emerged to address this limitation, enabling deployment on resource-constrained
+embedded systems. However, when tuning AMPCs for real-world systems, large
+datasets need to be regenerated and the NN needs to be retrained at every
+tuning step. This work introduces a novel, parameter-adaptive AMPC architecture
+capable of online tuning without recomputing large datasets and retraining. By
+incorporating local sensitivities of nonlinear programs, the proposed method
+not only mimics optimal MPC inputs but also adjusts to known changes in
+physical parameters of the model using linear predictions while still
+guaranteeing stability. We showcase the effectiveness of parameter-adaptive
+AMPC by controlling the swing-ups of two different real cartpole systems with a
+severely resource-constrained microcontroller (MCU). We use the same NN across
+both system instances that have different parameters. This work not only
+represents the first experimental demonstration of AMPC for fast-moving systems
+on low-cost MCUs to the best of our knowledge, but also showcases
+generalization across system instances and variations through our
+parameter-adaptation method. Taken together, these contributions represent a
+marked step toward the practical application of AMPC in real-world systems.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted to L4DC 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Imbalanced Data Clustering using Equilibrium K-Means 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2402.14490v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2402.14490v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Yudong He
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Centroid-based clustering algorithms, such as hard K-means (HKM) and fuzzy
+K-means (FKM), have suffered from learning bias towards large clusters. Their
+centroids tend to be crowded in large clusters, compromising performance when
+the true underlying data groups vary in size (i.e., imbalanced data). To
+address this, we propose a new clustering objective function based on the
+Boltzmann operator, which introduces a novel centroid repulsion mechanism,
+where data points surrounding the centroids repel other centroids. Larger
+clusters repel more, effectively mitigating the issue of large cluster learning
+bias. The proposed new algorithm, called equilibrium K-means (EKM), is simple,
+alternating between two steps; resource-saving, with the same time and space
+complexity as FKM; and scalable to large datasets via batch learning. We
+substantially evaluate the performance of EKM on synthetic and real-world
+datasets. The results show that EKM performs competitively on balanced data and
+significantly outperforms benchmark algorithms on imbalanced data. Deep
+clustering experiments demonstrate that EKM is a better alternative to HKM and
+FKM on imbalanced data as more discriminative representation can be obtained.
+Additionally, we reformulate HKM, FKM, and EKM in a general form of gradient
+descent and demonstrate how this general form facilitates a uniform study of
+K-means algorithms.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Sequential memory improves sample and memory efficiency in Episodic
+  Control 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2112.14734v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2112.14734v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Ismael T. Freire, Adrián F. Amil, Paul F. M. J. Verschure
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  State of the art deep reinforcement learning algorithms are sample
+inefficient due to the large number of episodes they require to achieve
+asymptotic performance. Episodic Reinforcement Learning (ERL) algorithms,
+inspired by the mammalian hippocampus, typically use extended memory systems to
+bootstrap learning from past events to overcome this sample-inefficiency
+problem. However, such memory augmentations are often used as mere buffers,
+from which isolated past experiences are drawn to learn from in an offline
+fashion (e.g., replay). Here, we demonstrate that including a bias in the
+acquired memory content derived from the order of episodic sampling improves
+both the sample and memory efficiency of an episodic control algorithm. We test
+our Sequential Episodic Control (SEC) model in a foraging task to show that
+storing and using integrated episodes as event sequences leads to faster
+learning with fewer memory requirements as opposed to a standard ERL benchmark,
+Model-Free Episodic Control, that buffers isolated events only. We also study
+the effect of memory constraints and forgetting on the sequential and
+non-sequential version of the SEC algorithm. Furthermore, we discuss how a
+hippocampal-like fast memory system could bootstrap slow cortical and
+subcortical learning subserving habit formation in the mammalian brain.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>21 pages, 8 figures</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Learning Regularities from Data using Spiking Functions: A Theory 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.11684v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.11684v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Canlin Zhang, Xiuwen Liu
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Deep neural networks trained in an end-to-end manner are proven to be
+efficient in a wide range of machine learning tasks. However, there is one
+drawback of end-to-end learning: The learned features and information are
+implicitly represented in neural network parameters, which cannot be used as
+regularities, concepts or knowledge to explicitly represent the data
+probability distribution. To resolve this issue, we propose in this paper a new
+machine learning theory, which defines in mathematics what are regularities.
+Briefly, regularities are concise representations of the non-random features,
+or 'non-randomness' in the data probability distribution. Combining this with
+information theory, we claim that regularities can also be regarded as a small
+amount of information encoding a large amount of information. Our theory is
+based on spiking functions. That is, if a function can react to, or spike on
+specific data samples more frequently than random noise inputs, we say that
+such a function discovers non-randomness from the data distribution. Also, we
+say that the discovered non-randomness is encoded into regularities if the
+function is simple enough. Our theory also discusses applying multiple spiking
+functions to the same data distribution. In this process, we claim that the
+'best' regularities, or the optimal spiking functions, are those who can
+capture the largest amount of information from the data distribution, and then
+encode the captured information in the most concise way. Theorems and
+hypotheses are provided to describe in mathematics what are 'best' regularities
+and optimal spiking functions. Finally, we propose a machine learning approach,
+which can potentially obtain the optimal spiking functions regarding the given
+dataset in practice.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Entropy annealing for policy mirror descent in continuous time and space 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20250v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20250v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Deven Sethi, David Šiška, Yufei Zhang
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Entropy regularization has been extensively used in policy optimization
+algorithms to regularize the optimization landscape and accelerate convergence;
+however, it comes at the cost of introducing an additional regularization bias.
+This work quantifies the impact of entropy regularization on the convergence of
+policy gradient methods for stochastic exit time control problems. We analyze a
+continuous-time policy mirror descent dynamics, which updates the policy based
+on the gradient of an entropy-regularized value function and adjusts the
+strength of entropy regularization as the algorithm progresses. We prove that
+with a fixed entropy level, the dynamics converges exponentially to the optimal
+solution of the regularized problem. We further show that when the entropy
+level decays at suitable polynomial rates, the annealed flow converges to the
+solution of the unregularized problem at a rate of $\mathcal O(1/S)$ for
+discrete action spaces and, under suitable conditions, at a rate of $\mathcal
+O(1/\sqrt{S})$ for general action spaces, with $S$ being the gradient flow
+time. This paper explains how entropy regularization improves policy
+optimization, even with the true gradient, from the perspective of convergence
+rate.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Whole Heart 3D+T Representation Learning Through Sparse 2D Cardiac MR
+  Images 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.00329v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.00329v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Yundi Zhang, Chen Chen, Suprosanna Shit, Sophie Starck, Daniel Rueckert, Jiazhen Pan
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Cardiac Magnetic Resonance (CMR) imaging serves as the gold-standard for
+evaluating cardiac morphology and function. Typically, a multi-view CMR stack,
+covering short-axis (SA) and 2/3/4-chamber long-axis (LA) views, is acquired
+for a thorough cardiac assessment. However, efficiently streamlining the
+complex, high-dimensional 3D+T CMR data and distilling compact, coherent
+representation remains a challenge. In this work, we introduce a whole-heart
+self-supervised learning framework that utilizes masked imaging modeling to
+automatically uncover the correlations between spatial and temporal patches
+throughout the cardiac stacks. This process facilitates the generation of
+meaningful and well-clustered heart representations without relying on the
+traditionally required, and often costly, labeled data. The learned heart
+representation can be directly used for various downstream tasks. Furthermore,
+our method demonstrates remarkable robustness, ensuring consistent
+representations even when certain CMR planes are missing/flawed. We train our
+model on 14,000 unlabeled CMR data from UK BioBank and evaluate it on 1,000
+annotated data. The proposed method demonstrates superior performance to
+baselines in tasks that demand comprehensive 3D+T cardiac information, e.g.
+cardiac phenotype (ejection fraction and ventricle volume) prediction and
+multi-plane/multi-frame CMR segmentation, highlighting its effectiveness in
+extracting comprehensive cardiac features that are both anatomically and
+pathologically relevant.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Integrating <span class="highlight-title">Pre-Train</span>ed Speech and Language Models for End-to-End Speech
+  Recognition <span class="chip">ACL 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2312.03668v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2312.03668v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Yukiya Hono, Koh Mitsuda, Tianyu Zhao, Kentaro Mitsui, Toshiaki Wakatsuki, Kei Sawada
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Advances in machine learning have made it possible to perform various text
+and speech processing tasks, such as automatic speech recognition (ASR), in an
+end-to-end (E2E) manner. E2E approaches utilizing pre-trained models are
+gaining attention for conserving training data and resources. However, most of
+their applications in ASR involve only one of either a pre-trained speech or a
+language model. This paper proposes integrating a pre-trained speech
+representation model and a large language model (LLM) for E2E ASR. The proposed
+model enables the optimization of the entire ASR process, including acoustic
+feature extraction and acoustic and language modeling, by combining pre-trained
+models with a bridge network and also enables the application of remarkable
+developments in LLM utilization, such as parameter-efficient domain adaptation
+and inference optimization. Experimental results demonstrate that the proposed
+model achieves a performance comparable to that of modern E2E ASR models by
+utilizing powerful pre-training models with the proposed integrated approach.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>17 pages, 4 figures, 9 tables, accepted for Findings of ACL 2024. The
+  model is available at https://huggingface.co/rinna/nue-asr</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Learned feature representations are biased by complexity, learning
+  order, position, and more 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.05847v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.05847v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Andrew Kyle Lampinen, Stephanie C. Y. Chan, Katherine Hermann
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Representation learning, and interpreting learned representations, are key
+areas of focus in machine learning and neuroscience. Both fields generally use
+representations as a means to understand or improve a system's computations. In
+this work, however, we explore surprising dissociations between representation
+and computation that may pose challenges for such efforts. We create datasets
+in which we attempt to match the computational role that different features
+play, while manipulating other properties of the features or the data. We train
+various deep learning architectures to compute these multiple abstract features
+about their inputs. We find that their learned feature representations are
+systematically biased towards representing some features more strongly than
+others, depending upon extraneous properties such as feature complexity, the
+order in which features are learned, and the distribution of features over the
+inputs. For example, features that are simpler to compute or learned first tend
+to be represented more strongly and densely than features that are more complex
+or learned later, even if all features are learned equally well. We also
+explore how these biases are affected by architectures, optimizers, and
+training regimes (e.g., in transformers, features decoded earlier in the output
+sequence also tend to be represented more strongly). Our results help to
+characterize the inductive biases of gradient-based representation learning.
+These results also highlight a key challenge for interpretability $-$ or for
+comparing the representations of models and brains $-$ disentangling extraneous
+biases from the computationally important aspects of a system's internal
+representations.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ E(n) Equivariant Message Passing Cellular Networks 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.03145v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.03145v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Veljko Kovač, Erik J. Bekkers, Pietro Liò, Floor Eijkelboom
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  This paper introduces E(n) Equivariant Message Passing Cellular Networks
+(EMPCNs), an extension of E(n) Equivariant Graph Neural Networks to
+CW-complexes. Our approach addresses two aspects of geometric message passing
+networks: 1) enhancing their expressiveness by incorporating arbitrary cells,
+and 2) achieving this in a computationally efficient way with a decoupled
+EMPCNs technique. We demonstrate that EMPCNs achieve close to state-of-the-art
+performance on multiple tasks without the need for steerability, including
+many-body predictions and motion capture. Moreover, ablation studies confirm
+that decoupled EMPCNs exhibit stronger generalization capabilities than their
+non-topologically informed counterparts. These findings show that EMPCNs can be
+used as a scalable and expressive framework for higher-order message passing in
+geometric and topological graphs
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ PEMT: Multi-Task Correlation Guided Mixture-of-Experts Enables
+  Parameter-Efficient Transfer Learning <span class="chip">ACL 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2402.15082v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2402.15082v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Zhisheng Lin, Han Fu, Chenghao Liu, Zhuo Li, Jianling Sun
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Parameter-efficient fine-tuning (PEFT) has emerged as an effective method for
+adapting pre-trained language models to various tasks efficiently. Recently,
+there has been a growing interest in transferring knowledge from one or
+multiple tasks to the downstream target task to achieve performance
+improvements. However, current approaches typically either train adapters on
+individual tasks or distill shared knowledge from source tasks, failing to
+fully exploit task-specific knowledge and the correlation between source and
+target tasks. To overcome these limitations, we propose PEMT, a novel
+parameter-efficient fine-tuning framework based on multi-task transfer
+learning. PEMT extends the mixture-of-experts (MoE) framework to capture the
+transferable knowledge as a weighted combination of adapters trained on source
+tasks. These weights are determined by a gated unit, measuring the correlation
+between the target and each source task using task description prompt vectors.
+To fully exploit the task-specific knowledge, we also propose the Task Sparsity
+Loss to improve the sparsity of the gated unit. We conduct experiments on a
+broad range of tasks over 17 datasets. The experimental results demonstrate our
+PEMT yields stable improvements over full fine-tuning, and state-of-the-art
+PEFT and knowledge transferring methods on various tasks. The results highlight
+the effectiveness of our method which is capable of sufficiently exploiting the
+knowledge and correlation features across multiple tasks.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted to Findings of the ACL 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Breaking through the learning plateaus of in-context learning in
+  <span class="highlight-title">Transformer</span> 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2309.06054v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2309.06054v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Jingwen Fu, Tao Yang, Yuwang Wang, Yan Lu, Nanning Zheng
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  In-context learning, i.e., learning from context examples, is an impressive
+ability of Transformer. Training Transformers to possess this in-context
+learning skill is computationally intensive due to the occurrence of learning
+plateaus, which are periods within the training process where there is minimal
+or no enhancement in the model's in-context learning capability. To study the
+mechanism behind the learning plateaus, we conceptually seperate a component
+within the model's internal representation that is exclusively affected by the
+model's weights. We call this the "weights component", and the remainder is
+identified as the "context component". By conducting meticulous and controlled
+experiments on synthetic tasks, we note that the persistence of learning
+plateaus correlates with compromised functionality of the weights component.
+Recognizing the impaired performance of the weights component as a fundamental
+behavior drives learning plateaus, we have developed three strategies to
+expedite the learning of Transformers. The effectiveness of these strategies is
+further confirmed in natural language processing tasks. In conclusion, our
+research demonstrates the feasibility of cultivating a powerful in-context
+learning ability within AI systems in an eco-friendly manner.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Model Free Prediction with Uncertainty Assessment 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.12684v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.12684v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Yuling Jiao, Lican Kang, Jin Liu, Heng Peng, Heng Zuo
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Deep nonparametric regression, characterized by the utilization of deep
+neural networks to learn target functions, has emerged as a focus of research
+attention in recent years. Despite considerable progress in understanding
+convergence rates, the absence of asymptotic properties hinders rigorous
+statistical inference. To address this gap, we propose a novel framework that
+transforms the deep estimation paradigm into a platform conducive to
+conditional mean estimation, leveraging the conditional diffusion model.
+Theoretically, we develop an end-to-end convergence rate for the conditional
+diffusion model and establish the asymptotic normality of the generated
+samples. Consequently, we are equipped to construct confidence regions,
+facilitating robust statistical inference. Furthermore, through numerical
+experiments, we empirically validate the efficacy of our proposed methodology.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Generalised Diffusion Probabilistic Scale-Spaces 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2309.08511v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2309.08511v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Pascal Peter
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Diffusion probabilistic models excel at sampling new images from learned
+distributions. Originally motivated by drift-diffusion concepts from physics,
+they apply image perturbations such as noise and blur in a forward process that
+results in a tractable probability distribution. A corresponding learned
+reverse process generates images and can be conditioned on side information,
+which leads to a wide variety of practical applications. Most of the research
+focus currently lies on practice-oriented extensions. In contrast, the
+theoretical background remains largely unexplored, in particular the relations
+to drift-diffusion. In order to shed light on these connections to classical
+image filtering, we propose a generalised scale-space theory for diffusion
+probabilistic models. Moreover, we show conceptual and empirical connections to
+diffusion and osmosis filters.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ When is Tree Search Useful for LLM Planning? It Depends on the
+  Discriminator <span class="chip">ACL 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2402.10890v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2402.10890v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Ziru Chen, Michael White, Raymond Mooney, Ali Payani, Yu Su, Huan Sun
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  In this paper, we examine how large language models (LLMs) solve multi-step
+problems under a language agent framework with three components: a generator, a
+discriminator, and a planning method. We investigate the practical utility of
+two advanced planning methods, iterative correction and tree search. We present
+a comprehensive analysis of how discrimination accuracy affects the overall
+performance of agents when using these two methods or a simpler method,
+re-ranking. Experiments on two tasks, text-to-SQL parsing and mathematical
+reasoning, show that: (1) advanced planning methods demand discriminators with
+at least 90% accuracy to achieve significant improvements over re-ranking; (2)
+current LLMs' discrimination abilities have not met the needs of advanced
+planning methods to achieve such improvements; (3) with LLM-based
+discriminators, advanced planning methods may not adequately balance accuracy
+and efficiency. For example, compared to the other two methods, tree search is
+at least 10--20 times slower but leads to negligible performance gains, which
+hinders its real-world applications. Code and data are available at
+https://github.com/OSU-NLP-Group/llm-planning-eval.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>ACL 2024 main</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Leveraging Codebook Knowledge with NLI and Chat<span class="highlight-title">GPT</span> for Zero-Shot
+  Political Relation Classification <span class="chip">ACL 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2308.07876v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2308.07876v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Yibo Hu, Erick Skorupa Parolin, Latifur Khan, Patrick T. Brandt, Javier Osorio, Vito J. D'Orazio
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Is it possible accurately classify political relations within evolving event
+ontologies without extensive annotations? This study investigates zero-shot
+learning methods that use expert knowledge from existing annotation codebook,
+and evaluates the performance of advanced ChatGPT (GPT-3.5/4) and a natural
+language inference (NLI)-based model called ZSP. ChatGPT uses codebook's
+labeled summaries as prompts, whereas ZSP breaks down the classification task
+into context, event mode, and class disambiguation to refine task-specific
+hypotheses. This decomposition enhances interpretability, efficiency, and
+adaptability to schema changes. The experiments reveal ChatGPT's strengths and
+limitations, and crucially show ZSP's outperformance of dictionary-based
+methods and its competitive edge over some supervised models. These findings
+affirm the value of ZSP for validating event records and advancing ontology
+development. Our study underscores the efficacy of leveraging transfer learning
+and existing domain expertise to enhance research efficiency and scalability.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>ACL 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ RIFF: Learning to Rephrase Inputs for Few-shot Fine-tuning of Language
+  Models <span class="chip">ACL2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2403.02271v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2403.02271v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Saeed Najafi, Alona Fyshe
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Pre-trained Language Models (PLMs) can be accurately fine-tuned for
+downstream text processing tasks. Recently, researchers have introduced several
+parameter-efficient fine-tuning methods that optimize input prompts or adjust a
+small number of model parameters (e.g LoRA). In this study, we explore the
+impact of altering the input text of the original task in conjunction with
+parameter-efficient fine-tuning methods. To most effectively rewrite the input
+text, we train a few-shot paraphrase model with a Maximum-Marginal Likelihood
+objective. Using six few-shot text classification datasets, we show that
+enriching data with paraphrases at train and test time enhances the performance
+beyond what can be achieved with parameter-efficient fine-tuning alone. The
+code used for our experiments can be found at
+https://github.com/SaeedNajafi/RIFF.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Final Version (Findings of ACL2024)</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Simulating infinite-dimensional nonlinear diffusion bridges 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.18353v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.18353v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Gefan Yang, Elizabeth Louise Baker, Michael L. Severinsen, Christy Anna Hipsley, Stefan Sommer
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  The diffusion bridge is a type of diffusion process that conditions on
+hitting a specific state within a finite time period. It has broad applications
+in fields such as Bayesian inference, financial mathematics, control theory,
+and shape analysis. However, simulating the diffusion bridge for natural data
+can be challenging due to both the intractability of the drift term and
+continuous representations of the data. Although several methods are available
+to simulate finite-dimensional diffusion bridges, infinite-dimensional cases
+remain unresolved. In the paper, we present a solution to this problem by
+merging score-matching techniques with operator learning, enabling a direct
+approach to score-matching for the infinite-dimensional bridge. We construct
+the score to be discretization invariant, which is natural given the underlying
+spatially continuous process. We conduct a series of experiments, ranging from
+synthetic examples with closed-form solutions to the stochastic nonlinear
+evolution of real-world biological shape data, and our method demonstrates high
+efficacy, particularly due to its ability to adapt to any resolution without
+extra training.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ DPZero: Private Fine-Tuning of Language Models without Backpropagation <span class="chip">ICML 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2310.09639v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2310.09639v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Liang Zhang, Bingcong Li, Kiran Koshy Thekumparampil, Sewoong Oh, Niao He
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  The widespread practice of fine-tuning large language models (LLMs) on
+domain-specific data faces two major challenges in memory and privacy. First,
+as the size of LLMs continues to grow, the memory demands of gradient-based
+training methods via backpropagation become prohibitively high. Second, given
+the tendency of LLMs to memorize training data, it is important to protect
+potentially sensitive information in the fine-tuning data from being
+regurgitated. Zeroth-order methods, which rely solely on forward passes,
+substantially reduce memory consumption during training. However, directly
+combining them with standard differentially private gradient descent suffers
+more as model size grows. To bridge this gap, we introduce DPZero, a novel
+private zeroth-order algorithm with nearly dimension-independent rates. The
+memory efficiency of DPZero is demonstrated in privately fine-tuning RoBERTa
+and OPT on several downstream tasks. Our code is available at
+https://github.com/Liang137/DPZero.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>ICML 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ HoneyBee: A Scalable Modular Framework for Creating Multimodal Oncology
+  <span class="highlight-title">Dataset</span>s with Foundational Embedding Models 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.07460v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.07460v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Aakash Tripathi, Asim Waqas, Yasin Yilmaz, Ghulam Rasool
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Developing accurate machine learning models for oncology requires
+large-scale, high-quality multimodal datasets. However, creating such datasets
+remains challenging due to the complexity and heterogeneity of medical data. To
+address this challenge, we introduce HoneyBee, a scalable modular framework for
+building multimodal oncology datasets that leverages foundation models to
+generate representative embeddings. HoneyBee integrates various data
+modalities, including clinical diagnostic and pathology imaging data, medical
+notes, reports, records, and molecular data. It employs data preprocessing
+techniques and foundation models to generate embeddings that capture the
+essential features and relationships within the raw medical data. The generated
+embeddings are stored in a structured format using Hugging Face datasets and
+PyTorch dataloaders for accessibility. Vector databases enable efficient
+querying and retrieval for machine learning applications. We demonstrate the
+effectiveness of HoneyBee through experiments assessing the quality and
+representativeness of these embeddings. The framework is designed to be
+extensible to other medical domains and aims to accelerate oncology research by
+providing high-quality, machine learning-ready datasets. HoneyBee is an ongoing
+open-source effort, and the code, datasets, and models are available at the
+project repository.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Leveraging Temporal Graph Networks Using Module Decoupling 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2310.02721v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2310.02721v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Or Feldman, Chaim Baskin
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Modern approaches for learning on dynamic graphs have adopted the use of
+batches instead of applying updates one by one. The use of batches allows these
+techniques to become helpful in streaming scenarios where updates to graphs are
+received at extreme speeds. Using batches, however, forces the models to update
+infrequently, which results in the degradation of their performance. In this
+work, we suggest a decoupling strategy that enables the models to update
+frequently while using batches. By decoupling the core modules of temporal
+graph networks and implementing them using a minimal number of learnable
+parameters, we have developed the Lightweight Decoupled Temporal Graph Network
+(LDTGN), an exceptionally efficient model for learning on dynamic graphs. LDTG
+was validated on various dynamic graph benchmarks, providing comparable or
+state-of-the-art results with significantly higher throughput than previous
+art. Notably, our method outperforms previous approaches by more than 20\% on
+benchmarks that require rapid model update rates, such as USLegis or UNTrade.
+The code to reproduce our experiments is available at
+\href{https://orfeld415.github.io/module-decoupling}{this http url}.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Statistically Optimal Generative Modeling with Maximum Deviation from
+  the Empirical Distribution <span class="chip">ICML 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2307.16422v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2307.16422v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Elen Vardanyan, Sona Hunanyan, Tigran Galstyan, Arshak Minasyan, Arnak Dalalyan
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  This paper explores the problem of generative modeling, aiming to simulate
+diverse examples from an unknown distribution based on observed examples. While
+recent studies have focused on quantifying the statistical precision of popular
+algorithms, there is a lack of mathematical evaluation regarding the
+non-replication of observed examples and the creativity of the generative
+model. We present theoretical insights into this aspect, demonstrating that the
+Wasserstein GAN, constrained to left-invertible push-forward maps, generates
+distributions that avoid replication and significantly deviate from the
+empirical distribution. Importantly, we show that left-invertibility achieves
+this without compromising the statistical optimality of the resulting
+generator. Our most important contribution provides a finite-sample lower bound
+on the Wasserstein-1 distance between the generative distribution and the
+empirical one. We also establish a finite-sample upper bound on the distance
+between the generative distribution and the true data-generating one. Both
+bounds are explicit and show the impact of key parameters such as sample size,
+dimensions of the ambient and latent spaces, noise level, and smoothness
+measured by the Lipschitz constant.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>ICML 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Non-Linear Inference Time Intervention: Improving LLM Truthfulness 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2403.18680v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2403.18680v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Jakub Hoscilowicz, Adam Wiacek, Jan Chojnacki, Adam Cieslak, Leszek Michon, Vitalii Urbanevych, Artur Janicki
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  In this work, we explore LLM's internal representation space to identify
+attention heads that contain the most truthful and accurate information. We
+further developed the Inference Time Intervention (ITI) framework, which lets
+bias LLM without the need for fine-tuning. The improvement manifests in
+introducing a non-linear multi-token probing and multi-token intervention:
+Non-Linear ITI (NL-ITI), which significantly enhances performance on evaluation
+benchmarks. NL-ITI is tested on diverse multiple-choice datasets, including
+TruthfulQA, on which we report over 16% relative MC1 (accuracy of model
+pointing to the correct answer) improvement with respect to the baseline ITI
+results. Moreover, we achieved a 10% relative improvement over the recently
+released Truth Forest (TrFf) method that also focused on ITI improvement.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted on Interspeech 2024 Conference. Code is available at
+  https://github.com/Samsung/NL-ITI</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ CityLight: A Universal Model Towards Real-world City-scale Traffic
+  Signal Control Coordination 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.02126v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.02126v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Jinwei Zeng, Chao Yu, Xinyi Yang, Wenxuan Ao, Jian Yuan, Yong Li, Yu Wang, Huazhong Yang
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Traffic signal control (TSC) is a promising low-cost measure to enhance
+transportation efficiency without affecting existing road infrastructure. While
+various reinforcement learning-based TSC methods have been proposed and
+experimentally outperform conventional rule-based methods, none of them has
+been deployed in the real world. An essential gap lies in the
+oversimplification of the scenarios in terms of intersection heterogeneity and
+road network intricacy. To make TSC applicable in urban traffic management, we
+target TSC coordination in city-scale high-authenticity road networks, aiming
+to solve the three unique and important challenges: city-level scalability,
+heterogeneity of real-world intersections, and effective coordination among
+intricate neighbor connections. Since optimizing multiple agents in a
+parameter-sharing paradigm can boost the training efficiency and help achieve
+scalability, we propose our method, CityLight, based on the well-acknowledged
+optimization framework, parameter-sharing MAPPO. To ensure the unified policy
+network can learn to fit large-scale heterogeneous intersections and tackle the
+intricate between-neighbor coordination, CityLight proposes a universal
+representation module that consists of two key designs: heterogeneous
+intersection alignment and neighborhood impact alignment for coordination. To
+further boost coordination, CityLight adopts neighborhood-integrated rewards to
+transition from achieving local optimal to global optimal. Extensive
+experiments on datasets with hundreds to tens of thousands of real-world
+intersections and authentic traffic demands validate the surprising
+effectiveness and generalizability of CityLight, with an overall performance
+gain of 11.66% and a 22.59% improvement in transfer scenarios in terms of
+throughput.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Lever LM: Configuring In-Context Sequence to Lever Large Vision Language
+  Models 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2312.10104v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2312.10104v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Xu Yang, Yingzhe Peng, Haoxuan Ma, Shuo Xu, Chi Zhang, Yucheng Han, Hanwang Zhang
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  As Archimedes famously said, ``Give me a lever long enough and a fulcrum on
+which to place it, and I shall move the world'', in this study, we propose to
+use a tiny Language Model (LM), \eg, a Transformer with 67M parameters, to
+lever much larger Vision-Language Models (LVLMs) with 9B parameters.
+Specifically, we use this tiny \textbf{Lever-LM} to configure effective
+in-context demonstration (ICD) sequences to improve the In-Context Learinng
+(ICL) performance of LVLMs. Previous studies show that diverse ICD
+configurations like the selection and ordering of the demonstrations heavily
+affect the ICL performance, highlighting the significance of configuring
+effective ICD sequences. Motivated by this and by re-considering the the
+process of configuring ICD sequence, we find this is a mirror process of human
+sentence composition and further assume that effective ICD configurations may
+contain internal statistical patterns that can be captured by Lever-LM. Then a
+dataset with effective ICD sequences is constructed to train Lever-LM. After
+training, given novel queries, new ICD sequences are configured by the trained
+Lever-LM to solve vision-language tasks through ICL. Experiments show that
+these ICD sequences can improve the ICL performance of two LVLMs compared with
+some strong baselines in Visual Question Answering and Image Captioning,
+validating that Lever-LM can really capture the statistical patterns for
+levering LVLMs.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>17 pages, 6 figures</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Conformal Prediction for Deep Classifier via Label Ranking <span class="chip">ICML 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2310.06430v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2310.06430v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Jianguo Huang, Huajun Xi, Linjun Zhang, Huaxiu Yao, Yue Qiu, Hongxin Wei
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Conformal prediction is a statistical framework that generates prediction
+sets containing ground-truth labels with a desired coverage guarantee. The
+predicted probabilities produced by machine learning models are generally
+miscalibrated, leading to large prediction sets in conformal prediction. To
+address this issue, we propose a novel algorithm named $\textit{Sorted Adaptive
+Prediction Sets}$ (SAPS), which discards all the probability values except for
+the maximum softmax probability. The key idea behind SAPS is to minimize the
+dependence of the non-conformity score on the probability values while
+retaining the uncertainty information. In this manner, SAPS can produce compact
+prediction sets and communicate instance-wise uncertainty. Extensive
+experiments validate that SAPS not only lessens the prediction sets but also
+broadly enhances the conditional coverage rate of prediction sets.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted by ICML 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Learning to Generate Instruction Tuning <span class="highlight-title">Dataset</span>s for Zero-Shot Task
+  Adaptation <span class="chip">ACL</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2402.18334v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2402.18334v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Nihal V. Nayak, Yiyang Nan, Avi Trost, Stephen H. Bach
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  We introduce Bonito, an open-source model for conditional task generation
+that converts unannotated text into task-specific training datasets for
+instruction tuning. We aim to enable zero-shot task adaptation of large
+language models on users' specialized, private data. We train Bonito by
+fine-tuning a pretrained large language model on a new large-scale dataset with
+1.65M examples created by remixing existing instruction tuning datasets into
+meta-templates. The meta-templates for a dataset produce training examples
+where the input is the unannotated text and the task attribute and the output
+consists of the instruction and the response. We use Bonito to generate
+synthetic tasks for seven datasets from specialized domains with unannotated
+text across three task types -- yes-no question answering, extractive question
+answering, and natural language inference -- and adapt language models. We show
+that Bonito significantly improves the average performance of pretrained and
+instruction tuned models over the de facto self supervised baseline. For
+example, adapting Mistral-Instruct-v2 and instruction tuned variants of Mistral
+and Llama2 with Bonito improves the strong zero-shot performance by 22.1 F1
+points whereas the next word prediction objective undoes some of the benefits
+of instruction tuning and reduces the average performance by 0.8 F1 points. We
+conduct additional experiments with Bonito to understand the effects of the
+domain, the size of the training set, and the choice of alternative synthetic
+task generators. Overall, we show that learning with synthetic instruction
+tuning datasets is an effective way to adapt language models to new domains.
+The model, dataset, and code are available at
+https://github.com/BatsResearch/bonito.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>ACL Findings 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ BadRAG: Identifying Vulnerabilities in Retrieval Augmented Generation of
+  Large Language Models 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.00083v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.00083v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Jiaqi Xue, Mengxin Zheng, Yebowen Hu, Fei Liu, Xun Chen, Qian Lou
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Large Language Models (LLMs) are constrained by outdated information and a
+tendency to generate incorrect data, commonly referred to as "hallucinations."
+Retrieval-Augmented Generation (RAG) addresses these limitations by combining
+the strengths of retrieval-based methods and generative models. This approach
+involves retrieving relevant information from a large, up-to-date dataset and
+using it to enhance the generation process, leading to more accurate and
+contextually appropriate responses. Despite its benefits, RAG introduces a new
+attack surface for LLMs, particularly because RAG databases are often sourced
+from public data, such as the web. In this paper, we propose \TrojRAG{} to
+identify the vulnerabilities and attacks on retrieval parts (RAG database) and
+their indirect attacks on generative parts (LLMs). Specifically, we identify
+that poisoning several customized content passages could achieve a retrieval
+backdoor, where the retrieval works well for clean queries but always returns
+customized poisoned adversarial queries. Triggers and poisoned passages can be
+highly customized to implement various attacks. For example, a trigger could be
+a semantic group like "The Republican Party, Donald Trump, etc." Adversarial
+passages can be tailored to different contents, not only linked to the triggers
+but also used to indirectly attack generative LLMs without modifying them.
+These attacks can include denial-of-service attacks on RAG and semantic
+steering attacks on LLM generations conditioned by the triggers. Our
+experiments demonstrate that by just poisoning 10 adversarial passages can
+induce 98.2\% success rate to retrieve the adversarial passages. Then, these
+passages can increase the reject ratio of RAG-based GPT-4 from 0.01\% to 74.6\%
+or increase the rate of negative responses from 0.22\% to 72\% for targeted
+queries.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Minimizing $f$-Divergences by Interpolating Velocity Fields <span class="chip">ICML2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2305.15577v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2305.15577v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Song Liu, Jiahao Yu, Jack Simons, Mingxuan Yi, Mark Beaumont
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Many machine learning problems can be seen as approximating a \textit{target}
+distribution using a \textit{particle} distribution by minimizing their
+statistical discrepancy. Wasserstein Gradient Flow can move particles along a
+path that minimizes the $f$-divergence between the target and particle
+distributions. To move particles, we need to calculate the corresponding
+velocity fields derived from a density ratio function between these two
+distributions. Previous works estimated such density ratio functions and then
+differentiated the estimated ratios. These approaches may suffer from
+overfitting, leading to a less accurate estimate of the velocity fields.
+Inspired by non-parametric curve fitting, we directly estimate these velocity
+fields using interpolation techniques. We prove that our estimators are
+consistent under mild conditions. We validate their effectiveness using novel
+applications on domain adaptation and missing data imputation.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>This manuscript is an extended version of the ICML2024 version. The
+  code for reproducing our results can be found at
+  https://github.com/anewgithubname/gradest2</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Online Control in Population Dynamics 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.01799v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.01799v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Noah Golowich, Elad Hazan, Zhou Lu, Dhruv Rohatgi, Y. Jennifer Sun
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  The study of population dynamics originated with early sociological works but
+has since extended into many fields, including biology, epidemiology,
+evolutionary game theory, and economics. Most studies on population dynamics
+focus on the problem of prediction rather than control. Existing mathematical
+models for control in population dynamics are often restricted to specific,
+noise-free dynamics, while real-world population changes can be complex and
+adversarial.
+  To address this gap, we propose a new framework based on the paradigm of
+online control. We first characterize a set of linear dynamical systems that
+can naturally model evolving populations. We then give an efficient
+gradient-based controller for these systems, with near-optimal regret bounds
+with respect to a broad class of linear policies. Our empirical evaluations
+demonstrate the effectiveness of the proposed algorithm for control in
+population dynamics even for non-linear models such as SIR and replicator
+dynamics.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ AdaLomo: Low-memory Optimization with Adaptive Learning Rate <span class="chip">ACL 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2310.10195v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2310.10195v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Kai Lv, Hang Yan, Qipeng Guo, Haijun Lv, Xipeng Qiu
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Large language models have achieved remarkable success, but their extensive
+parameter size necessitates substantial memory for training, thereby setting a
+high threshold. While the recently proposed low-memory optimization (LOMO)
+reduces memory footprint, its optimization technique, akin to stochastic
+gradient descent, is sensitive to hyper-parameters and exhibits suboptimal
+convergence, failing to match the performance of the prevailing optimizer for
+large language models, AdamW. Through empirical analysis of the Adam optimizer,
+we found that, compared to momentum, the adaptive learning rate is more
+critical for bridging the gap. Building on this insight, we introduce the
+low-memory optimization with adaptive learning rate (AdaLomo), which offers an
+adaptive learning rate for each parameter. To maintain memory efficiency, we
+employ non-negative matrix factorization for the second-order moment estimation
+in the optimizer state. Additionally, we suggest the use of a grouped update
+normalization to stabilize convergence. Our experiments with instruction-tuning
+and further pre-training demonstrate that AdaLomo achieves results on par with
+AdamW, while significantly reducing memory requirements, thereby lowering the
+hardware barrier to training large language models. The code is accessible at
+https://github.com/OpenLMLab/LOMO.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>ACL 2024 camera ready version</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Leveraging KANs For Enhanced Deep Koopman Operator Discovery 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.02875v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.02875v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        George Nehma, Madhur Tiwari
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Multi-layer perceptrons (MLP's) have been extensively utilized in discovering
+Deep Koopman operators for linearizing nonlinear dynamics. With the emergence
+of Kolmogorov-Arnold Networks (KANs) as a more efficient and accurate
+alternative to the MLP Neural Network, we propose a comparison of the
+performance of each network type in the context of learning Koopman operators
+with control. In this work, we propose a KANs-based deep Koopman framework with
+applications to an orbital Two-Body Problem (2BP) and the pendulum for
+data-driven discovery of linear system dynamics. KANs were found to be superior
+in nearly all aspects of training; learning 31 times faster, being 15 times
+more parameter efficiency, and predicting 1.25 times more accurately as
+compared to the MLP Deep Neural Networks (DNNs) in the case of the 2BP. Thus,
+KANs shows potential for being an efficient tool in the development of Deep
+Koopman Theory.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>6 pages, 4 figures, 2 tables</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Detecting Model Misspecification in Amortized Bayesian Inference with
+  Neural Networks: An Extended Investigation 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.03154v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.03154v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Marvin Schmitt, Paul-Christian Bürkner, Ullrich Köthe, Stefan T. Radev
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Recent advances in probabilistic deep learning enable efficient amortized
+Bayesian inference in settings where the likelihood function is only implicitly
+defined by a simulation program (simulation-based inference; SBI). But how
+faithful is such inference if the simulation represents reality somewhat
+inaccurately, that is, if the true system behavior at test time deviates from
+the one seen during training? We conceptualize the types of such model
+misspecification arising in SBI and systematically investigate how the
+performance of neural posterior approximators gradually deteriorates as a
+consequence, making inference results less and less trustworthy. To notify
+users about this problem, we propose a new misspecification measure that can be
+trained in an unsupervised fashion (i.e., without training data from the true
+distribution) and reliably detects model misspecification at test time. Our
+experiments clearly demonstrate the utility of our new measure both on toy
+examples with an analytical ground-truth and on representative scientific tasks
+in cell biology, cognitive decision making, disease outbreak dynamics, and
+computer vision. We show how the proposed misspecification test warns users
+about suspicious outputs, raises an alarm when predictions are not trustworthy,
+and guides model designers in their search for better simulators.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Extended version of the conference paper
+  https://doi.org/10.1007/978-3-031-54605-1_35. arXiv admin note: text overlap
+  with arXiv:2112.08866</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Emulated Disalignment: Safety Alignment for Large Language Models May
+  Backfire! <span class="chip">ACL 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2402.12343v4">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2402.12343v4.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Zhanhui Zhou, Jie Liu, Zhichen Dong, Jiaheng Liu, Chao Yang, Wanli Ouyang, Yu Qiao
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Large language models (LLMs) undergo safety alignment to ensure safe
+conversations with humans. However, this paper introduces a training-free
+attack method capable of reversing safety alignment, converting the outcomes of
+stronger alignment into greater potential for harm by accessing only LLM output
+token distributions. Specifically, our method achieves this reversal by
+contrasting the output token distribution of a safety-aligned language model
+(e.g., Llama-2-chat) against its pre-trained version (e.g., Llama-2), so that
+the token predictions are shifted towards the opposite direction of safety
+alignment. We name this method emulated disalignment (ED) because sampling from
+this contrastive distribution provably emulates the result of fine-tuning to
+minimize a safety reward. Our experiments with ED across three evaluation
+datasets and four model families (Llama-1, Llama-2, Mistral, and Alpaca) show
+that ED doubles the harmfulness of pre-trained models and outperforms strong
+baselines, achieving the highest harmful rates in 43 out of 48 evaluation
+subsets by a large margin. Eventually, given ED's reliance on language model
+output token distributions, which particularly compromises open-source models,
+our findings highlight the need to reassess the open accessibility of language
+models, even if they have been safety-aligned. Code is available at
+https://github.com/ZHZisZZ/emulated-disalignment.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>ACL 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Comparing statistical and machine learning methods for time series
+  forecasting in data-driven logistics -- A simulation study 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2303.07139v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2303.07139v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Lena Schmid, Moritz Roidl, Markus Pauly
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Many planning and decision activities in logistics and supply chain
+management are based on forecasts of multiple time dependent factors.
+Therefore, the quality of planning depends on the quality of the forecasts. We
+compare various forecasting methods in terms of out of the box forecasting
+performance on a broad set of simulated time series. We simulate various linear
+and non-linear time series and look at the one step forecast performance of
+statistical learning methods.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Competition Report: Finding Universal Jailbreak Backdoors in Aligned
+  LLMs 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2404.14461v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2404.14461v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Javier Rando, Francesco Croce, Kryštof Mitka, Stepan Shabalin, Maksym Andriushchenko, Nicolas Flammarion, Florian Tramèr
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Large language models are aligned to be safe, preventing users from
+generating harmful content like misinformation or instructions for illegal
+activities. However, previous work has shown that the alignment process is
+vulnerable to poisoning attacks. Adversaries can manipulate the safety training
+data to inject backdoors that act like a universal sudo command: adding the
+backdoor string to any prompt enables harmful responses from models that,
+otherwise, behave safely. Our competition, co-located at IEEE SaTML 2024,
+challenged participants to find universal backdoors in several large language
+models. This report summarizes the key findings and promising ideas for future
+research.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Competition Report</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Provably Efficient Bayesian Optimization with Unknown Gaussian Process
+  Hyperparameter Estimation 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2306.06844v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2306.06844v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Huong Ha, Vu Nguyen, Hung Tran-The, Hongyu Zhang, Xiuzhen Zhang, Anton van den Hengel
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Gaussian process (GP) based Bayesian optimization (BO) is a powerful method
+for optimizing black-box functions efficiently. The practical performance and
+theoretical guarantees of this approach depend on having the correct GP
+hyperparameter values, which are usually unknown in advance and need to be
+estimated from the observed data. However, in practice, these estimations could
+be incorrect due to biased data sampling strategies used in BO. This can lead
+to degraded performance and break the sub-linear global convergence guarantee
+of BO. To address this issue, we propose a new BO method that can sub-linearly
+converge to the objective function's global optimum even when the true GP
+hyperparameters are unknown in advance and need to be estimated from the
+observed data. Our method uses a multi-armed bandit technique (EXP3) to add
+random data points to the BO process, and employs a novel training loss
+function for the GP hyperparameter estimation process that ensures consistent
+estimation. We further provide theoretical analysis of our proposed method.
+Finally, we demonstrate empirically that our method outperforms existing
+approaches on various synthetic and real-world problems.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>25 pages, 5 figures</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Towards Minimax Optimality of Model-based Robust Reinforcement Learning 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2302.05372v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2302.05372v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Pierre Clavier, Erwan Le Pennec, Matthieu Geist
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  We study the sample complexity of obtaining an $\epsilon$-optimal policy in
+\emph{Robust} discounted Markov Decision Processes (RMDPs), given only access
+to a generative model of the nominal kernel. This problem is widely studied in
+the non-robust case, and it is known that any planning approach applied to an
+empirical MDP estimated with $\tilde{\mathcal{O}}(\frac{H^3 \mid S \mid\mid A
+\mid}{\epsilon^2})$ samples provides an $\epsilon$-optimal policy, which is
+minimax optimal. Results in the robust case are much more scarce. For $sa$-
+(resp $s$-)rectangular uncertainty sets, the best known sample complexity is
+$\tilde{\mathcal{O}}(\frac{H^4 \mid S \mid^2\mid A \mid}{\epsilon^2})$ (resp.
+$\tilde{\mathcal{O}}(\frac{H^4 \mid S \mid^2\mid A \mid^2}{\epsilon^2})$), for
+specific algorithms and when the uncertainty set is based on the total
+variation (TV), the KL or the Chi-square divergences. In this paper, we
+consider uncertainty sets defined with an $L_p$-ball (recovering the TV case),
+and study the sample complexity of \emph{any} planning algorithm (with high
+accuracy guarantee on the solution) applied to an empirical RMDP estimated
+using the generative model. In the general case, we prove a sample complexity
+of $\tilde{\mathcal{O}}(\frac{H^4 \mid S \mid\mid A \mid}{\epsilon^2})$ for
+both the $sa$- and $s$-rectangular cases (improvements of $\mid S \mid$ and
+$\mid S \mid\mid A \mid$ respectively). When the size of the uncertainty is
+small enough, we improve the sample complexity to
+$\tilde{\mathcal{O}}(\frac{H^3 \mid S \mid\mid A \mid }{\epsilon^2})$,
+recovering the lower-bound for the non-robust case for the first time and a
+robust lower-bound when the size of the uncertainty is small enough.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ <span class="highlight-title">Dataset</span> Condensation for Time Series Classification via Dual Domain
+  Matching <span class="chip">KDD 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2403.07245v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2403.07245v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Zhanyu Liu, Ke Hao, Guanjie Zheng, Yanwei Yu
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Time series data has been demonstrated to be crucial in various research
+fields. The management of large quantities of time series data presents
+challenges in terms of deep learning tasks, particularly for training a deep
+neural network. Recently, a technique named \textit{Dataset Condensation} has
+emerged as a solution to this problem. This technique generates a smaller
+synthetic dataset that has comparable performance to the full real dataset in
+downstream tasks such as classification. However, previous methods are
+primarily designed for image and graph datasets, and directly adapting them to
+the time series dataset leads to suboptimal performance due to their inability
+to effectively leverage the rich information inherent in time series data,
+particularly in the frequency domain. In this paper, we propose a novel
+framework named Dataset \textit{\textbf{Cond}}ensation for
+\textit{\textbf{T}}ime \textit{\textbf{S}}eries
+\textit{\textbf{C}}lassification via Dual Domain Matching (\textbf{CondTSC})
+which focuses on the time series classification dataset condensation task.
+Different from previous methods, our proposed framework aims to generate a
+condensed dataset that matches the surrogate objectives in both the time and
+frequency domains. Specifically, CondTSC incorporates multi-view data
+augmentation, dual domain training, and dual surrogate objectives to enhance
+the dataset condensation process in the time and frequency domains. Through
+extensive experiments, we demonstrate the effectiveness of our proposed
+framework, which outperforms other baselines and learns a condensed synthetic
+dataset that exhibits desirable characteristics such as conforming to the
+distribution of the original data.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted by KDD 2024 research track</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Cons-training tensor networks 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.09005v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.09005v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Javier Lopez-Piqueres, Jing Chen
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  In this study, we introduce a novel family of tensor networks, termed
+\textit{constrained matrix product states} (MPS), designed to incorporate
+exactly arbitrary discrete linear constraints, including inequalities, into
+sparse block structures. These tensor networks are particularly tailored for
+modeling distributions with support strictly over the feasible space, offering
+benefits such as reducing the search space in optimization problems,
+alleviating overfitting, improving training efficiency, and decreasing model
+size. Central to our approach is the concept of a quantum region, an extension
+of quantum numbers traditionally used in U(1) symmetric tensor networks,
+adapted to capture any linear constraint, including the unconstrained scenario.
+We further develop a novel canonical form for these new MPS, which allow for
+the merging and factorization of tensor blocks according to quantum region
+fusion rules and permit optimal truncation schemes. Utilizing this canonical
+form, we apply an unsupervised training strategy to optimize arbitrary
+objective functions subject to discrete linear constraints. Our method's
+efficacy is demonstrated by solving the quadratic knapsack problem, achieving
+superior performance compared to a leading nonlinear integer programming
+solver. Additionally, we analyze the complexity and scalability of our
+approach, demonstrating its potential in addressing complex constrained
+combinatorial optimization problems.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>v2: mostly improved Fig 1 and 13 for clarity, improved exposition of
+  ideas, and fixed a couple of transcription bugs in the pseudo algo. 3</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Entropy-Regularized Token-Level Policy Optimization for Language Agent
+  Reinforcement 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2402.06700v4">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2402.06700v4.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Muning Wen, Junwei Liao, Cheng Deng, Jun Wang, Weinan Zhang, Ying Wen
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Large Language Models (LLMs) have shown promise as intelligent agents in
+interactive decision-making tasks. Traditional approaches often depend on
+meticulously designed prompts, high-quality examples, or additional reward
+models for in-context learning, supervised fine-tuning, or RLHF. Reinforcement
+learning (RL) presents a dynamic alternative for LLMs to overcome these
+dependencies by engaging directly with task-specific environments. Nonetheless,
+it faces significant hurdles: 1) instability stemming from the exponentially
+vast action space requiring exploration; 2) challenges in assigning token-level
+credit based on action-level reward signals, resulting in discord between
+maximizing rewards and accurately modeling corpus data. In response to these
+challenges, we introduce Entropy-Regularized Token-level Policy Optimization
+(ETPO), an entropy-augmented RL method tailored for optimizing LLMs at the
+token level. At the heart of ETPO is our novel per-token soft Bellman update,
+designed to harmonize the RL process with the principles of language modeling.
+This methodology decomposes the Q-function update from a coarse action-level
+view to a more granular token-level perspective, backed by theoretical proof of
+optimization consistency. Crucially, this decomposition renders linear time
+complexity in action exploration. We assess the effectiveness of ETPO within a
+simulated environment that models data science code generation as a series of
+multi-step interactive tasks; results underline ETPO's potential as a robust
+method for refining the interactive decision-making capabilities of language
+agents. For a more detailed preliminary work describing our motivation for
+token-level decomposition and applying it in PPO methods, please refer to
+arXiv:2405.15821.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ A Random Matrix Approach to Low-Multilinear-Rank Tensor Approximation 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2402.03169v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2402.03169v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Hugo Lebeau, Florent Chatelain, Romain Couillet
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  This work presents a comprehensive understanding of the estimation of a
+planted low-rank signal from a general spiked tensor model near the
+computational threshold. Relying on standard tools from the theory of large
+random matrices, we characterize the large-dimensional spectral behavior of the
+unfoldings of the data tensor and exhibit relevant signal-to-noise ratios
+governing the detectability of the principal directions of the signal. These
+results allow to accurately predict the reconstruction performance of truncated
+multilinear SVD (MLSVD) in the non-trivial regime. This is particularly
+important since it serves as an initialization of the higher-order orthogonal
+iteration (HOOI) scheme, whose convergence to the best low-multilinear-rank
+approximation depends entirely on its initialization. We give a sufficient
+condition for the convergence of HOOI and show that the number of iterations
+before convergence tends to $1$ in the large-dimensional limit.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Remaining useful life prediction of Lithium-ion batteries using
+  spatio-temporal multimodal attention networks 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2310.18924v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2310.18924v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Sungho Suh, Dhruv Aditya Mittal, Hymalai Bello, Bo Zhou, Mayank Shekhar Jha, Paul Lukowicz
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Lithium-ion batteries are widely used in various applications, including
+electric vehicles and renewable energy storage. The prediction of the remaining
+useful life (RUL) of batteries is crucial for ensuring reliable and efficient
+operation, as well as reducing maintenance costs. However, determining the life
+cycle of batteries in real-world scenarios is challenging, and existing methods
+have limitations in predicting the number of cycles iteratively. In addition,
+existing works often oversimplify the datasets, neglecting important features
+of the batteries such as temperature, internal resistance, and material type.
+To address these limitations, this paper proposes a two-stage RUL prediction
+scheme for Lithium-ion batteries using a spatio-temporal multimodal attention
+network (ST-MAN). The proposed ST-MAN is to capture the complex spatio-temporal
+dependencies in the battery data, including the features that are often
+neglected in existing works. Despite operating without prior knowledge of
+end-of-life (EOL) events, our method consistently achieves lower error rates,
+boasting mean absolute error (MAE) and mean square error (MSE) of 0.0275 and
+0.0014, respectively, compared to existing convolutional neural networks (CNN)
+and long short-term memory (LSTM)-based methods. The proposed method has the
+potential to improve the reliability and efficiency of battery operations and
+is applicable in various industries.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ NICE: To Optimize In-Context Examples or Not? <span class="chip">ACL 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2402.06733v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2402.06733v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Pragya Srivastava, Satvik Golechha, Amit Deshpande, Amit Sharma
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Recent work shows that in-context learning and optimization of in-context
+examples (ICE) can significantly improve the accuracy of large language models
+(LLMs) on a wide range of tasks, leading to an apparent consensus that ICE
+optimization is crucial for better performance. However, most of these studies
+assume a fixed or no instruction provided in the prompt. We challenge this
+consensus by investigating the necessity of optimizing ICE when task-specific
+instructions are provided and find that there are many tasks for which it
+yields diminishing returns. In particular, using a diverse set of tasks and a
+systematically created instruction set with gradually added details, we find
+that as the prompt instruction becomes more detailed, the returns on ICE
+optimization diminish. To characterize this behavior, we introduce a
+task-specific metric called Normalized Invariability to Choice of Examples
+(NICE) that quantifies the learnability of tasks from a given instruction, and
+provides a heuristic to help decide whether to optimize instructions or ICE for
+a new task. Given a task, the proposed metric can reliably predict the utility
+of optimizing ICE compared to using random ICE. Our code is available at
+https://github.com/microsoft/nice-icl.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted as a full paper (9 pages) at ACL 2024 (Main)</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Multi-Objective Hyperparameter Optimization in Machine Learning -- An
+  <span class="highlight-title">Overview</span> 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2206.07438v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2206.07438v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Florian Karl, Tobias Pielok, Julia Moosbauer, Florian Pfisterer, Stefan Coors, Martin Binder, Lennart Schneider, Janek Thomas, Jakob Richter, Michel Lang, Eduardo C. Garrido-Merchán, Juergen Branke, Bernd Bischl
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Hyperparameter optimization constitutes a large part of typical modern
+machine learning workflows. This arises from the fact that machine learning
+methods and corresponding preprocessing steps often only yield optimal
+performance when hyperparameters are properly tuned. But in many applications,
+we are not only interested in optimizing ML pipelines solely for predictive
+accuracy; additional metrics or constraints must be considered when determining
+an optimal configuration, resulting in a multi-objective optimization problem.
+This is often neglected in practice, due to a lack of knowledge and readily
+available software implementations for multi-objective hyperparameter
+optimization. In this work, we introduce the reader to the basics of
+multi-objective hyperparameter optimization and motivate its usefulness in
+applied ML. Furthermore, we provide an extensive survey of existing
+optimization strategies, both from the domain of evolutionary algorithms and
+Bayesian optimization. We illustrate the utility of MOO in several specific ML
+applications, considering objectives such as operating conditions, prediction
+time, sparseness, fairness, interpretability and robustness.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Published at ACM TELO</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Direct Preference Optimization with an Offset 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2402.10571v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2402.10571v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Afra Amini, Tim Vieira, Ryan Cotterell
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Direct preference optimization (DPO) is a successful fine-tuning strategy for
+aligning large language models with human preferences without the need to train
+a reward model or employ reinforcement learning. DPO, as originally formulated,
+relies on binary preference data and fine-tunes a language model to increase
+the likelihood of a preferred response over a dispreferred response. However,
+not all preference pairs are equal. Sometimes, the preferred response is only
+slightly better than the dispreferred one. In other cases, the preference is
+much stronger. For instance, if a response contains harmful or toxic content,
+the annotator will have a strong preference for that response. In this paper,
+we propose a generalization of DPO, termed DPO with an offset (ODPO), that does
+not treat every preference pair equally during fine-tuning. Intuitively, ODPO
+requires the difference between the likelihood of the preferred and
+dispreferred response to be greater than an offset value. The offset is
+determined based on the extent to which one response is preferred over another.
+Our experiments on various tasks suggest that ODPO significantly outperforms
+DPO in aligning language models, especially when the number of preference pairs
+is limited.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ ThermoHands: A Benchmark for 3D Hand Pose Estimation from Egocentric
+  Thermal Images 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2403.09871v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2403.09871v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Fangqiang Ding, Yunzhou Zhu, Xiangyu Wen, Gaowen Liu, Chris Xiaoxuan Lu
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  In this work, we present ThermoHands, a new benchmark for thermal image-based
+egocentric 3D hand pose estimation, aimed at overcoming challenges like varying
+lighting conditions and obstructions (e.g., handwear). The benchmark includes a
+multi-view and multi-spectral dataset collected from 28 subjects performing
+hand-object and hand-virtual interactions under diverse scenarios, accurately
+annotated with 3D hand poses through an automated process. We introduce a new
+baseline method, TherFormer, utilizing dual transformer modules for effective
+egocentric 3D hand pose estimation in thermal imagery. Our experimental results
+highlight TherFormer's leading performance and affirm thermal imaging's
+effectiveness in enabling robust 3D hand pose estimation in adverse conditions.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>15 pages, 6 figures, 4 tables</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Building Expressive and Tractable Probabilistic Generative Models: A
+  <span class="highlight-title">Review</span> 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2402.00759v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2402.00759v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Sahil Sidheekh, Sriraam Natarajan
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  We present a comprehensive survey of the advancements and techniques in the
+field of tractable probabilistic generative modeling, primarily focusing on
+Probabilistic Circuits (PCs). We provide a unified perspective on the inherent
+trade-offs between expressivity and tractability, highlighting the design
+principles and algorithmic extensions that have enabled building expressive and
+efficient PCs, and provide a taxonomy of the field. We also discuss recent
+efforts to build deep and hybrid PCs by fusing notions from deep neural models,
+and outline the challenges and open questions that can guide future research in
+this evolving field.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Benchmarking General Purpose In-Context Learning 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.17234v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.17234v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Fan Wang, Chuan Lin, Yang Cao, Yu Kang
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  In-context learning (ICL) is becoming increasingly appealing to the AI
+community due to its flexibility, generality, sample efficiency, and exemption
+from artificial optimization skills. It is desirable to further enhance the
+generality and capability of ICL, which gives rise to the concept of
+general-purpose in-context learning (GPICL). We aim to extend ICL to address a
+broader range of tasks with an extended learning horizon and higher improvement
+potential, albeit with relatively limited zero-shot generalization. To this
+end, we introduce two lightweight but insightful benchmarks specifically
+crafted to train and evaluate GPICL functionalities. Each benchmark includes a
+vast number of tasks characterized by significant task variance, featuring
+minimal transferable knowledge among tasks. These tasks are designed to
+facilitate lifelong in-context learning through continuous generation and
+interaction. These features pose significant challenges for models that rely on
+context or interactions to improve their proficiency, including language
+models, decision models, and world models. Our experiments reveal that the
+scale of parameters alone may not be crucial for ICL or GPICL, suggesting
+alternative approaches such as increasing the scale of contexts and memory
+states.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Dynamics and triggers of misinformation on vaccines 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2207.12264v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2207.12264v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Emanuele Brugnoli, Marco Delmastro
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  The Covid-19 pandemic has sparked renewed attention on the prevalence of
+misinformation online, whether intentional or not, underscoring the potential
+risks posed to individuals' quality of life associated with the dissemination
+of misconceptions and enduring myths on health-related subjects. In this study,
+we analyze 6 years (2016-2021) of Italian vaccine debate across diverse social
+media platforms (Facebook, Instagram, Twitter, YouTube), encompassing all major
+news sources - both questionable and reliable. We first use the symbolic
+transfer entropy analysis of news production time-series to dynamically
+determine which category of sources, questionable or reliable, causally drives
+the agenda on vaccines. Then, leveraging deep learning models capable to
+accurately classify vaccine-related content based on the conveyed stance and
+discussed topic, respectively, we evaluate the focus on various topics by news
+sources promoting opposing views and compare the resulting user engagement.
+Aside from providing valuable resources for further investigation of
+vaccine-related misinformation, particularly in a language (Italian) that
+receives less attention in scientific research compared to languages like
+English, our study uncovers misinformation not as a parasite of the news
+ecosystem that merely opposes the perspectives offered by mainstream media, but
+as an autonomous force capable of even overwhelming the production of
+vaccine-related content from the latter. While the pervasiveness of
+misinformation is evident in the significantly higher engagement of
+questionable sources compared to reliable ones, our findings underscore the
+importance of consistent and thorough pro-vax coverage. This is especially
+crucial in addressing the most sensitive topics where the risk of
+misinformation spreading and potentially exacerbating negative attitudes toward
+vaccines among the users involved is higher.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Label-Efficient Model Selection for Text Generation <span class="chip">ACL</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2402.07891v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2402.07891v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Shir Ashury-Tahan, Ariel Gera, Benjamin Sznajder, Leshem Choshen, Liat Ein-Dor, Eyal Shnarch
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Model selection for a given target task can be costly, as it may entail
+extensive annotation of the quality of outputs of different models. We
+introduce DiffUse, an efficient method to make an informed decision between
+candidate text generation models based on preference annotations. DiffUse
+reduces the required amount of annotations, thus saving valuable time and
+resources in performing evaluation. DiffUse intelligently selects instances by
+clustering embeddings that represent the semantic differences between model
+outputs. Thus, it is able to identify a subset of examples that are more
+informative for preference decisions. Our method is model-agnostic, and can be
+applied to any text generation model for selecting between models, prompts and
+configurations. Moreover, we propose a practical iterative approach for
+dynamically determining how many instances to annotate. In a series of
+experiments over hundreds of model pairs, we demonstrate that DiffUse can
+dramatically reduce the required number of annotations -- by up to 75% -- while
+maintaining high evaluation reliability.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted to ACL (main conference)</span>
+                                        </div>
+                                </details>
+                            </article>
+                    </div>
+                </details>
+            </article>
+            <article>
+                <details>
+                    <Summary>
+                        Multimedia <span class="chip" style="font-size: 60%">9</span>
+                    </Summary>
+                    <div class="details-content">
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ VidMuse: A Simple Video-to-Music Generation Framework with
+  Long-Short-Term Modeling 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.04321v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.04321v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Zeyue Tian, Zhaoyang Liu, Ruibin Yuan, Jiahao Pan, Xiaoqiang Huang, Qifeng Liu, Xu Tan, Qifeng Chen, Wei Xue, Yike Guo
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  In this work, we systematically study music generation conditioned solely on
+the video. First, we present a large-scale dataset comprising 190K video-music
+pairs, including various genres such as movie trailers, advertisements, and
+documentaries. Furthermore, we propose VidMuse, a simple framework for
+generating music aligned with video inputs. VidMuse stands out by producing
+high-fidelity music that is both acoustically and semantically aligned with the
+video. By incorporating local and global visual cues, VidMuse enables the
+creation of musically coherent audio tracks that consistently match the video
+content through Long-Short-Term modeling. Through extensive experiments,
+VidMuse outperforms existing models in terms of audio quality, diversity, and
+audio-visual alignment. The code and datasets will be available at
+https://github.com/ZeyueT/VidMuse/.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>The code and datasets will be available at
+  https://github.com/ZeyueT/VidMuse/</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ ReFiNe: Recursive Field Networks for Cross-modal Multi-scene
+  Representation <span class="chip">SIGGRAPH 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.04309v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.04309v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Sergey Zakharov, Katherine Liu, Adrien Gaidon, Rares Ambrus
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  The common trade-offs of state-of-the-art methods for multi-shape
+representation (a single model "packing" multiple objects) involve trading
+modeling accuracy against memory and storage. We show how to encode multiple
+shapes represented as continuous neural fields with a higher degree of
+precision than previously possible and with low memory usage. Key to our
+approach is a recursive hierarchical formulation that exploits object
+self-similarity, leading to a highly compressed and efficient shape latent
+space. Thanks to the recursive formulation, our method supports spatial and
+global-to-local latent feature fusion without needing to initialize and
+maintain auxiliary data structures, while still allowing for continuous field
+queries to enable applications such as raytracing. In experiments on a set of
+diverse datasets, we provide compelling qualitative results and demonstrate
+state-of-the-art multi-scene reconstruction and compression results with a
+single network per dataset.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>SIGGRAPH 2024. Project Page:
+  https://zakharos.github.io/projects/refine/</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Gear-NeRF: Free-Viewpoint Rendering and Tracking with Motion-aware
+  Spatio-Temporal Sampling <span class="chip">CVPR 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.03723v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.03723v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Xinhang Liu, Yu-Wing Tai, Chi-Keung Tang, Pedro Miraldo, Suhas Lohit, Moitreya Chatterjee
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Extensions of Neural Radiance Fields (NeRFs) to model dynamic scenes have
+enabled their near photo-realistic, free-viewpoint rendering. Although these
+methods have shown some potential in creating immersive experiences, two
+drawbacks limit their ubiquity: (i) a significant reduction in reconstruction
+quality when the computing budget is limited, and (ii) a lack of semantic
+understanding of the underlying scenes. To address these issues, we introduce
+Gear-NeRF, which leverages semantic information from powerful image
+segmentation models. Our approach presents a principled way for learning a
+spatio-temporal (4D) semantic embedding, based on which we introduce the
+concept of gears to allow for stratified modeling of dynamic regions of the
+scene based on the extent of their motion. Such differentiation allows us to
+adjust the spatio-temporal sampling resolution for each region in proportion to
+its motion scale, achieving more photo-realistic dynamic novel view synthesis.
+At the same time, almost for free, our approach enables free-viewpoint tracking
+of objects of interest - a functionality not yet achieved by existing
+NeRF-based methods. Empirical studies validate the effectiveness of our method,
+where we achieve state-of-the-art rendering and tracking performance on
+multiple challenging datasets.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Paper accepted to IEEE/CVF CVPR 2024 (Spotlight). Work done when XL
+  was an intern at MERL. Project Page Link:
+  https://merl.com/research/highlights/gear-nerf</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ JIGMARK: A Black-Box Approach for Enhancing Image Watermarks against
+  Diffusion Model Edits 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.03720v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.03720v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Minzhou Pan, Yi Zeng, Xue Lin, Ning Yu, Cho-Jui Hsieh, Peter Henderson, Ruoxi Jia
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  In this study, we investigate the vulnerability of image watermarks to
+diffusion-model-based image editing, a challenge exacerbated by the
+computational cost of accessing gradient information and the closed-source
+nature of many diffusion models. To address this issue, we introduce JIGMARK.
+This first-of-its-kind watermarking technique enhances robustness through
+contrastive learning with pairs of images, processed and unprocessed by
+diffusion models, without needing a direct backpropagation of the diffusion
+process. Our evaluation reveals that JIGMARK significantly surpasses existing
+watermarking solutions in resilience to diffusion-model edits, demonstrating a
+True Positive Rate more than triple that of leading baselines at a 1% False
+Positive Rate while preserving image quality. At the same time, it consistently
+improves the robustness against other conventional perturbations (like JPEG,
+blurring, etc.) and malicious watermark attacks over the state-of-the-art,
+often by a large margin. Furthermore, we propose the Human Aligned Variation
+(HAV) score, a new metric that surpasses traditional similarity measures in
+quantifying the number of image derivatives from image editing.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Recognizing Everything from All Modalities at Once: Grounded Multimodal
+  Universal Information Extraction 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.03701v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.03701v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Meishan Zhang, Hao Fei, Bin Wang, Shengqiong Wu, Yixin Cao, Fei Li, Min Zhang
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  In the field of information extraction (IE), tasks across a wide range of
+modalities and their combinations have been traditionally studied in isolation,
+leaving a gap in deeply recognizing and analyzing cross-modal information. To
+address this, this work for the first time introduces the concept of grounded
+Multimodal Universal Information Extraction (MUIE), providing a unified task
+framework to analyze any IE tasks over various modalities, along with their
+fine-grained groundings. To tackle MUIE, we tailor a multimodal large language
+model (MLLM), Reamo, capable of extracting and grounding information from all
+modalities, i.e., recognizing everything from all modalities at once. Reamo is
+updated via varied tuning strategies, equipping it with powerful capabilities
+for information recognition and fine-grained multimodal grounding. To address
+the absence of a suitable benchmark for grounded MUIE, we curate a
+high-quality, diverse, and challenging test set, which encompasses IE tasks
+across 9 common modality combinations with the corresponding multimodal
+groundings. The extensive comparison of Reamo with existing MLLMs integrated
+into pipeline approaches demonstrates its advantages across all evaluation
+dimensions, establishing a strong benchmark for the follow-up research. Our
+resources are publicly released at https://haofei.vip/MUIE.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ The Revolution of Multimodal Large Language Models: A <span class="highlight-title">Survey</span> <span class="chip">ACL 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2402.12451v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2402.12451v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Davide Caffagni, Federico Cocchi, Luca Barsellotti, Nicholas Moratelli, Sara Sarto, Lorenzo Baraldi, Lorenzo Baraldi, Marcella Cornia, Rita Cucchiara
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Connecting text and visual modalities plays an essential role in generative
+intelligence. For this reason, inspired by the success of large language
+models, significant research efforts are being devoted to the development of
+Multimodal Large Language Models (MLLMs). These models can seamlessly integrate
+visual and textual modalities, while providing a dialogue-based interface and
+instruction-following capabilities. In this paper, we provide a comprehensive
+review of recent visual-based MLLMs, analyzing their architectural choices,
+multimodal alignment strategies, and training techniques. We also conduct a
+detailed analysis of these models across a wide range of tasks, including
+visual grounding, image generation and editing, visual understanding, and
+domain-specific applications. Additionally, we compile and describe training
+datasets and evaluation benchmarks, conducting comparisons among existing
+models in terms of performance and computational requirements. Overall, this
+survey offers a comprehensive overview of the current state of the art, laying
+the groundwork for future MLLMs.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>ACL 2024 (Findings)</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Token-Level Contrastive Learning with Modality-Aware <span class="highlight-title">Prompt</span>ing for
+  Multimodal Intent Recognition <span class="chip">AAAI 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2312.14667v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2312.14667v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Qianrui Zhou, Hua Xu, Hao Li, Hanlei Zhang, Xiaohan Zhang, Yifan Wang, Kai Gao
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Multimodal intent recognition aims to leverage diverse modalities such as
+expressions, body movements and tone of speech to comprehend user's intent,
+constituting a critical task for understanding human language and behavior in
+real-world multimodal scenarios. Nevertheless, the majority of existing methods
+ignore potential correlations among different modalities and own limitations in
+effectively learning semantic features from nonverbal modalities. In this
+paper, we introduce a token-level contrastive learning method with
+modality-aware prompting (TCL-MAP) to address the above challenges. To
+establish an optimal multimodal semantic environment for text modality, we
+develop a modality-aware prompting module (MAP), which effectively aligns and
+fuses features from text, video and audio modalities with similarity-based
+modality alignment and cross-modality attention mechanism. Based on the
+modality-aware prompt and ground truth labels, the proposed token-level
+contrastive learning framework (TCL) constructs augmented samples and employs
+NT-Xent loss on the label token. Specifically, TCL capitalizes on the optimal
+textual semantic insights derived from intent labels to guide the learning
+processes of other modalities in return. Extensive experiments show that our
+method achieves remarkable improvements compared to state-of-the-art methods.
+Additionally, ablation analyses demonstrate the superiority of the
+modality-aware prompt over the handcrafted prompt, which holds substantial
+significance for multimodal prompt learning. The codes are released at
+https://github.com/thuiar/TCL-MAP.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted by AAAI 2024 (Main Track, Long Paper)</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ ConsistencyTTA: Accelerating Diffusion-Based Text-to-Audio Generation
+  with Consistency Distillation 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2309.10740v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2309.10740v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Yatong Bai, Trung Dang, Dung Tran, Kazuhito Koishida, Somayeh Sojoudi
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Diffusion models are instrumental in text-to-audio (TTA) generation.
+Unfortunately, they suffer from slow inference due to an excessive number of
+queries to the underlying denoising network per generation. To address this
+bottleneck, we introduce ConsistencyTTA, a framework requiring only a single
+non-autoregressive network query, thereby accelerating TTA by hundreds of
+times. We achieve so by proposing "CFG-aware latent consistency model," which
+adapts consistency generation into a latent space and incorporates
+classifier-free guidance (CFG) into model training. Moreover, unlike diffusion
+models, ConsistencyTTA can be finetuned closed-loop with audio-space text-aware
+metrics, such as CLAP score, to further enhance the generations. Our objective
+and subjective evaluation on the AudioCaps dataset shows that compared to
+diffusion-based counterparts, ConsistencyTTA reduces inference computation by
+400x while retaining generation quality and diversity.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ CMFeed: A Benchmark <span class="highlight-title">Dataset</span> for Controllable Multimodal Feedback
+  Synthesis 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2402.07640v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2402.07640v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Puneet Kumar, Sarthak Malik, Balasubramanian Raman, Xiaobai Li
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  The Controllable Multimodal Feedback Synthesis (CMFeed) dataset enables the
+generation of sentiment-controlled feedback from multimodal inputs. It contains
+images, text, human comments, comments' metadata and sentiment labels. Existing
+datasets for related tasks such as multimodal summarization, visual question
+answering, visual dialogue, and sentiment-aware text generation do not
+incorporate training models using human-generated outputs and their metadata, a
+gap that CMFeed addresses. This capability is critical for developing feedback
+systems that understand and replicate human-like spontaneous responses. Based
+on the CMFeed dataset, we define a novel task of controllable feedback
+synthesis to generate context-aware feedback aligned with the desired
+sentiment. We propose a benchmark feedback synthesis system comprising encoder,
+decoder, and controllability modules. It employs transformer and Faster R-CNN
+networks to extract features and generate sentiment-specific feedback,
+achieving a sentiment classification accuracy of 77.23%, which is 18.82% higher
+than models not leveraging the dataset's unique controllability features.
+Additionally, we incorporate a similarity module for relevance assessment
+through rank-based metrics.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                    </div>
+                </details>
+            </article>
+    </section>
+    <section class="day-container">
+        <div class="date">
+            <time datetime="2024-06-05T00:00:00Z">2024-06-05</time>
+        </div>
+            <article>
+                <details>
+                    <Summary>
+                        Computation and Language <span class="chip" style="font-size: 60%">150</span>
+                    </Summary>
+                    <div class="details-content">
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Wings: Learning Multimodal LLMs without Text-only Forgetting 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.03496v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.03496v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Yi-Kai Zhang, Shiyin Lu, Yang Li, Yanqing Ma, Qing-Guo Chen, Zhao Xu, Weihua Luo, Kaifu Zhang, De-Chuan Zhan, Han-Jia Ye
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Multimodal large language models (MLLMs), initiated with a trained LLM, first
+align images with text and then fine-tune on multimodal mixed inputs. However,
+the MLLM catastrophically forgets the text-only instructions, which do not
+include images and can be addressed within the initial LLM. In this paper, we
+present Wings, a novel MLLM that excels in both text-only dialogues and
+multimodal comprehension. Analyzing MLLM attention in multimodal instructions
+reveals that text-only forgetting is related to the attention shifts from
+pre-image to post-image text. From that, we construct extra modules that act as
+the boosted learner to compensate for the attention shift. The complementary
+visual and textual learners, like "wings" on either side, are connected in
+parallel within each layer's attention block. Initially, image and text inputs
+are aligned with visual learners operating alongside the main attention,
+balancing focus on visual elements. Textual learners are later collaboratively
+integrated with attention-based routing to blend the outputs of the visual and
+textual learners. We design the Low-Rank Residual Attention (LoRRA) to
+guarantee high efficiency for learners. Our experimental results demonstrate
+that Wings outperforms equally-scaled MLLMs in both text-only and visual
+question-answering tasks. On a newly constructed Interleaved Image-Text (IIT)
+benchmark, Wings exhibits superior performance from text-only-rich to
+multimodal-rich question-answering tasks.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Analyzing LLM Behavior in Dialogue Summarization: Unveiling
+  Circumstantial Hallucination Trends <span class="chip">ACL 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.03487v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.03487v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Sanjana Ramprasad, Elisa Ferracane, Zachary C. Lipton
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Recent advancements in large language models (LLMs) have considerably
+advanced the capabilities of summarization systems. However, they continue to
+face concerns about hallucinations. While prior work has evaluated LLMs
+extensively in news domains, most evaluation of dialogue summarization has
+focused on BART-based models, leaving a gap in our understanding of their
+faithfulness. Our work benchmarks the faithfulness of LLMs for dialogue
+summarization, using human annotations and focusing on identifying and
+categorizing span-level inconsistencies. Specifically, we focus on two
+prominent LLMs: GPT-4 and Alpaca-13B. Our evaluation reveals subtleties as to
+what constitutes a hallucination: LLMs often generate plausible inferences,
+supported by circumstantial evidence in the conversation, that lack direct
+evidence, a pattern that is less prevalent in older models. We propose a
+refined taxonomy of errors, coining the category of "Circumstantial Inference"
+to bucket these LLM behaviors and release the dataset. Using our taxonomy, we
+compare the behavioral differences between LLMs and older fine-tuned models.
+Additionally, we systematically assess the efficacy of automatic error
+detection methods on LLM summaries and find that they struggle to detect these
+nuanced errors. To address this, we introduce two prompt-based approaches for
+fine-grained error detection that outperform existing metrics, particularly for
+identifying "Circumstantial Inference."
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted at ACL 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ BIPED: Pedagogically Informed Tutoring System for ESL Education <span class="chip">ACL 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.03486v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.03486v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Soonwoo Kwon, Sojung Kim, Minju Park, Seunghyun Lee, Kyuseok Kim
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Large Language Models (LLMs) have a great potential to serve as readily
+available and cost-efficient Conversational Intelligent Tutoring Systems (CITS)
+for teaching L2 learners of English. Existing CITS, however, are designed to
+teach only simple concepts or lack the pedagogical depth necessary to address
+diverse learning strategies. To develop a more pedagogically informed CITS
+capable of teaching complex concepts, we construct a BIlingual
+PEDagogically-informed Tutoring Dataset (BIPED) of one-on-one, human-to-human
+English tutoring interactions. Through post-hoc analysis of the tutoring
+interactions, we come up with a lexicon of dialogue acts (34 tutor acts and 9
+student acts), which we use to further annotate the collected dataset. Based on
+a two-step framework of first predicting the appropriate tutor act then
+generating the corresponding response, we implemented two CITS models using
+GPT-4 and SOLAR-KO, respectively. We experimentally demonstrate that the
+implemented models not only replicate the style of human teachers but also
+employ diverse and contextually appropriate pedagogical strategies.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>ACL 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ QJL: 1-Bit Quantized JL Transform for KV Cache Quantization with Zero
+  Overhead 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.03482v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.03482v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Amir Zandieh, Majid Daliri, Insu Han
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Serving LLMs requires substantial memory due to the storage requirements of
+Key-Value (KV) embeddings in the KV cache, which grows with sequence length. An
+effective approach to compress KV cache is quantization. However, traditional
+quantization methods face significant memory overhead due to the need to store
+quantization constants (at least a zero point and a scale) in full precision
+per data block. Depending on the block size, this overhead can add 1 or 2 bits
+per quantized number. We introduce QJL, a new quantization approach that
+consists of a Johnson-Lindenstrauss (JL) transform followed by sign-bit
+quantization. In contrast to existing methods, QJL eliminates memory overheads
+by removing the need for storing quantization constants. We propose an
+asymmetric estimator for the inner product of two vectors and demonstrate that
+applying QJL to one vector and a standard JL transform without quantization to
+the other provides an unbiased estimator with minimal distortion. We have
+developed an efficient implementation of the QJL sketch and its corresponding
+inner product estimator, incorporating a lightweight CUDA kernel for optimized
+computation. When applied across various LLMs and NLP tasks to quantize the KV
+cache to only 3 bits, QJL demonstrates a more than fivefold reduction in KV
+cache memory usage without compromising accuracy, all while achieving faster
+runtime. Codes are available at \url{https://github.com/amirzandieh/QJL}.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>13 pages</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ MODABS: Multi-Objective Learning for Dynamic Aspect-Based Summarization 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.03479v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.03479v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Xiaobo Guo, Soroush Vosoughi
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  The rapid proliferation of online content necessitates effective
+summarization methods, among which dynamic aspect-based summarization stands
+out. Unlike its traditional counterpart, which assumes a fixed set of known
+aspects, this approach adapts to the varied aspects of the input text. We
+introduce a novel multi-objective learning framework employing a
+Longformer-Encoder-Decoder for this task. The framework optimizes aspect number
+prediction, minimizes disparity between generated and reference summaries for
+each aspect, and maximizes dissimilarity across aspect-specific summaries.
+Extensive experiments show our method significantly outperforms baselines on
+three diverse datasets, largely due to the effective alignment of generated and
+reference aspect counts without sacrificing single-aspect summarization
+quality.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Does your data spark joy? Performance gains from domain upsampling at
+  the end of training 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.03476v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.03476v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Cody Blakeney, Mansheej Paul, Brett W. Larsen, Sean Owen, Jonathan Frankle
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Pretraining datasets for large language models (LLMs) have grown to trillions
+of tokens composed of large amounts of CommonCrawl (CC) web scrape along with
+smaller, domain-specific datasets. It is expensive to understand the impact of
+these domain-specific datasets on model capabilities as training at large FLOP
+scales is required to reveal significant changes to difficult and emergent
+benchmarks. Given the increasing cost of experimenting with pretraining data,
+how does one determine the optimal balance between the diversity in general web
+scrapes and the information density of domain specific data? In this work, we
+show how to leverage the smaller domain specific datasets by upsampling them
+relative to CC at the end of training to drive performance improvements on
+difficult benchmarks. This simple technique allows us to improve up to 6.90 pp
+on MMLU, 8.26 pp on GSM8K, and 6.17 pp on HumanEval relative to the base data
+mix for a 7B model trained for 1 trillion (T) tokens, thus rivaling Llama-2
+(7B)$\unicode{x2014}$a model trained for twice as long. We experiment with
+ablating the duration of domain upsampling from 5% to 30% of training and find
+that 10% to 20% percent is optimal for navigating the tradeoff between general
+language modeling capabilities and targeted benchmarks. We also use domain
+upsampling to characterize at scale the utility of individual datasets for
+improving various benchmarks by removing them during this final phase of
+training. This tool opens up the ability to experiment with the impact of
+different pretraining datasets at scale, but at an order of magnitude lower
+cost compared to full pretraining runs.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>The first three authors contributed equally</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Using Synchronic Definitions and Semantic Relations to Classify Semantic
+  Change Types 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.03452v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.03452v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Pierluigi Cassotti, Stefano De Pascale, Nina Tahmasebi
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  There is abundant evidence of the fact that the way words change their
+meaning can be classified in different types of change, highlighting the
+relationship between the old and new meanings (among which generalization,
+specialization and co-hyponymy transfer). In this paper, we present a way of
+detecting these types of change by constructing a model that leverages
+information both from synchronic lexical relations and definitions of word
+meanings. Specifically, we use synset definitions and hierarchy information
+from WordNet and test it on a digitized version of Blank's (1997) dataset of
+semantic change types. Finally, we show how the sense relationships can improve
+models for both approximation of human judgments of semantic relatedness as
+well as binary Lexical Semantic Change Detection.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ What is the Best Way for Chat<span class="highlight-title">GPT</span> to Translate Poetry? <span class="chip">ACL 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.03450v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.03450v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Shanshan Wang, Derek F. Wong, Jingming Yao, Lidia S. Chao
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Machine translation (MT) has historically faced significant challenges when
+applied to literary works, particularly in the domain of poetry translation.
+The advent of Large Language Models such as ChatGPT holds potential for
+innovation in this field. This study examines ChatGPT's capabilities in
+English-Chinese poetry translation tasks, utilizing targeted prompts and small
+sample scenarios to ascertain optimal performance. Despite promising outcomes,
+our analysis reveals persistent issues in the translations generated by ChatGPT
+that warrant attention. To address these shortcomings, we propose an
+Explanation-Assisted Poetry Machine Translation (EAPMT) method, which leverages
+monolingual poetry explanation as a guiding information for the translation
+process. Furthermore, we refine existing evaluation criteria to better suit the
+nuances of modern poetry translation. We engaged a panel of professional poets
+for assessments, complemented evaluations by using GPT-4. The results from both
+human and machine evaluations demonstrate that our EAPMT method outperforms
+traditional translation methods of ChatGPT and the existing online systems.
+This paper validates the efficacy of our method and contributes a novel
+perspective to machine-assisted literary translation.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>19 pages, 1 figure. The paper has been accepted by ACL 2024(Main
+  Conference)</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ <span class="highlight-title">Pre-train</span>ed Large Language Models Use Fourier Features to Compute
+  Addition 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.03445v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.03445v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Tianyi Zhou, Deqing Fu, Vatsal Sharan, Robin Jia
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Pre-trained large language models (LLMs) exhibit impressive mathematical
+reasoning capabilities, yet how they compute basic arithmetic, such as
+addition, remains unclear. This paper shows that pre-trained LLMs add numbers
+using Fourier features -- dimensions in the hidden state that represent numbers
+via a set of features sparse in the frequency domain. Within the model, MLP and
+attention layers use Fourier features in complementary ways: MLP layers
+primarily approximate the magnitude of the answer using low-frequency features,
+while attention layers primarily perform modular addition (e.g., computing
+whether the answer is even or odd) using high-frequency features. Pre-training
+is crucial for this mechanism: models trained from scratch to add numbers only
+exploit low-frequency features, leading to lower accuracy. Introducing
+pre-trained token embeddings to a randomly initialized model rescues its
+performance. Overall, our analysis demonstrates that appropriate pre-trained
+representations (e.g., Fourier features) can unlock the ability of Transformers
+to learn precise mechanisms for algorithmic tasks.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Are language models rational? The case of coherence norms and belief
+  revision 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.03442v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.03442v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Thomas Hofweber, Peter Hase, Elias Stengel-Eskin, Mohit Bansal
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Do norms of rationality apply to machine learning models, in particular
+language models? In this paper we investigate this question by focusing on a
+special subset of rational norms: coherence norms. We consider both logical
+coherence norms as well as coherence norms tied to the strength of belief. To
+make sense of the latter, we introduce the Minimal Assent Connection (MAC) and
+propose a new account of credence, which captures the strength of belief in
+language models. This proposal uniformly assigns strength of belief simply on
+the basis of model internal next token probabilities. We argue that rational
+norms tied to coherence do apply to some language models, but not to others.
+This issue is significant since rationality is closely tied to predicting and
+explaining behavior, and thus it is connected to considerations about AI safety
+and alignment, as well as understanding model behavior more generally.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Cycles of Thought: Measuring LLM Confidence through Stable Explanations 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.03441v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.03441v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Evan Becker, Stefano Soatto
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  In many high-risk machine learning applications it is essential for a model
+to indicate when it is uncertain about a prediction. While large language
+models (LLMs) can reach and even surpass human-level accuracy on a variety of
+benchmarks, their overconfidence in incorrect responses is still a
+well-documented failure mode. Traditional methods for ML uncertainty
+quantification can be difficult to directly adapt to LLMs due to the
+computational cost of implementation and closed-source nature of many models. A
+variety of black-box methods have recently been proposed, but these often rely
+on heuristics such as self-verbalized confidence. We instead propose a
+framework for measuring an LLM's uncertainty with respect to the distribution
+of generated explanations for an answer. While utilizing explanations is not a
+new idea in and of itself, by interpreting each possible model+explanation pair
+as a test-time classifier we can calculate a posterior answer distribution over
+the most likely of these classifiers. We demonstrate how a specific instance of
+this framework using explanation entailment as our classifier likelihood
+improves confidence score metrics (in particular AURC and AUROC) over baselines
+across five different datasets. We believe these results indicate that our
+framework is both a well-principled and effective way of quantifying
+uncertainty in LLMs.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Automating Turkish Educational Quiz Generation Using Large Language
+  Models <span class="chip">SP</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.03397v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.03397v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Kamyar Zeinalipour, Yusuf Gökberk Keptiğ, Marco Maggini, Marco Gori
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Crafting quizzes from educational content is a pivotal activity that benefits
+both teachers and students by reinforcing learning and evaluating
+understanding. In this study, we introduce a novel approach to generate quizzes
+from Turkish educational texts, marking a pioneering endeavor in educational
+technology specifically tailored to the Turkish educational context. We present
+a specialized dataset, named the Turkish-Quiz-Instruct, comprising an extensive
+collection of Turkish educational texts accompanied by multiple-choice and
+short-answer quizzes. This research leverages the capabilities of Large
+Language Models (LLMs), including GPT-4-Turbo, GPT-3.5-Turbo,
+Llama-2-7b-chat-hf, and Llama-2-13b-chat-hf, to automatically generate quiz
+questions and answers from the Turkish educational content. Our work delineates
+the methodology for employing these LLMs in the context of Turkish educational
+material, thereby opening new avenues for automated Turkish quiz generation.
+The study not only demonstrates the efficacy of using such models for
+generating coherent and relevant quiz content but also sets a precedent for
+future research in the domain of automated educational content creation for
+languages other than English. The Turkish-Quiz-Instruct dataset is introduced
+as a valuable resource for researchers and practitioners aiming to explore the
+boundaries of educational technology and language-specific applications of LLMs
+in Turkish. By addressing the challenges of quiz generation in a non-English
+context specifically Turkish, this study contributes significantly to the field
+of Turkish educational technology, providing insights into the potential of
+leveraging LLMs for educational purposes across diverse linguistic landscapes.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted Paper for ISPR 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ IrokoBench: A New Benchmark for African Languages in the Age of Large
+  Language Models 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.03368v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.03368v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        David Ifeoluwa Adelani, Jessica Ojo, Israel Abebe Azime, Jian Yun Zhuang, Jesujoba O. Alabi, Xuanli He, Millicent Ochieng, Sara Hooker, Andiswa Bukula, En-Shiun Annie Lee, Chiamaka Chukwuneke, Happy Buzaaba, Blessing Sibanda, Godson Kalipe, Jonathan Mukiibi, Salomon Kabongo, Foutse Yuehgoh, Mmasibidi Setaka, Lolwethu Ndolela, Nkiruka Odu, Rooweither Mabuya, Shamsuddeen Hassan Muhammad, Salomey Osei, Sokhar Samb, Tadesse Kebede Guge, Pontus Stenetorp
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Despite the widespread adoption of Large language models (LLMs), their
+remarkable capabilities remain limited to a few high-resource languages.
+Additionally, many low-resource languages (e.g. African languages) are often
+evaluated only on basic text classification tasks due to the lack of
+appropriate or comprehensive benchmarks outside of high-resource languages. In
+this paper, we introduce IrokoBench -- a human-translated benchmark dataset for
+16 typologically-diverse low-resource African languages covering three tasks:
+natural language inference~(AfriXNLI), mathematical reasoning~(AfriMGSM), and
+multi-choice knowledge-based QA~(AfriMMLU). We use IrokoBench to evaluate
+zero-shot, few-shot, and translate-test settings~(where test sets are
+translated into English) across 10 open and four proprietary LLMs. Our
+evaluation reveals a significant performance gap between high-resource
+languages~(such as English and French) and low-resource African languages. We
+observe a significant performance gap between open and proprietary models, with
+the highest performing open model, Aya-101 only at 58\% of the best-performing
+proprietary model GPT-4o performance. Machine translating the test set to
+English before evaluation helped to close the gap for larger models that are
+English-centric, like LLaMa 3 70B. These findings suggest that more efforts are
+needed to develop and adapt LLMs for African languages.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Under review</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ LLM-based Rewriting of Inappropriate Argumentation using Reinforcement
+  Learning from Machine Feedback 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.03363v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.03363v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Timon Ziegenbein, Gabriella Skitalinskaya, Alireza Bayat Makou, Henning Wachsmuth
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Ensuring that online discussions are civil and productive is a major
+challenge for social media platforms. Such platforms usually rely both on users
+and on automated detection tools to flag inappropriate arguments of other
+users, which moderators then review. However, this kind of post-hoc moderation
+is expensive and time-consuming, and moderators are often overwhelmed by the
+amount and severity of flagged content. Instead, a promising alternative is to
+prevent negative behavior during content creation. This paper studies how
+inappropriate language in arguments can be computationally mitigated. We
+propose a reinforcement learning-based rewriting approach that balances content
+preservation and appropriateness based on existing classifiers, prompting an
+instruction-finetuned large language model (LLM) as our initial policy. Unlike
+related style transfer tasks, rewriting inappropriate arguments allows deleting
+and adding content permanently. It is therefore tackled on document level
+rather than sentence level. We evaluate different weighting schemes for the
+reward function in both absolute and relative human assessment studies.
+Systematic experiments on non-parallel data provide evidence that our approach
+can mitigate the inappropriateness of arguments while largely preserving their
+content. It significantly outperforms competitive baselines, including few-shot
+learning, prompting, and humans.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ The Challenges of Evaluating LLM Applications: An Analysis of Automated,
+  Human, and LLM-Based Approaches 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.03339v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.03339v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Bhashithe Abeysinghe, Ruhan Circi
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Chatbots have been an interesting application of natural language generation
+since its inception. With novel transformer based Generative AI methods,
+building chatbots have become trivial. Chatbots which are targeted at specific
+domains such as medicine, psychology, and general information retrieval are
+implemented rapidly. This, however, should not distract from the need to
+evaluate the chatbot responses. Especially because the natural language
+generation community does not entirely agree upon how to effectively evaluate
+such applications. With this work we discuss the issue further with the
+increasingly popular LLM based evaluations and how they correlate with human
+evaluations. Additionally, we introduce a comprehensive factored evaluation
+mechanism that can be utilized in conjunction with both human and LLM-based
+evaluations.
+  We present the results of an experimental evaluation conducted using this
+scheme in one of our chatbot implementations, and subsequently compare
+automated, traditional human evaluation, factored human evaluation, and
+factored LLM evaluation. Results show that factor based evaluation produces
+better insights on which aspects need to be improved in LLM applications and
+further strengthens the argument to use human evaluation in critical spaces
+where main functionality is not direct retrieval.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ The Good, the Bad, and the Hulk-like <span class="highlight-title">GPT</span>: Analyzing Emotional Decisions
+  of Large Language Models in Cooperation and Bargaining Games 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.03299v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.03299v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Mikhail Mozikov, Nikita Severin, Valeria Bodishtianu, Maria Glushanina, Mikhail Baklashkin, Andrey V. Savchenko, Ilya Makarov
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Behavior study experiments are an important part of society modeling and
+understanding human interactions. In practice, many behavioral experiments
+encounter challenges related to internal and external validity,
+reproducibility, and social bias due to the complexity of social interactions
+and cooperation in human user studies. Recent advances in Large Language Models
+(LLMs) have provided researchers with a new promising tool for the simulation
+of human behavior. However, existing LLM-based simulations operate under the
+unproven hypothesis that LLM agents behave similarly to humans as well as
+ignore a crucial factor in human decision-making: emotions.
+  In this paper, we introduce a novel methodology and the framework to study
+both, the decision-making of LLMs and their alignment with human behavior under
+emotional states. Experiments with GPT-3.5 and GPT-4 on four games from two
+different classes of behavioral game theory showed that emotions profoundly
+impact the performance of LLMs, leading to the development of more optimal
+strategies. While there is a strong alignment between the behavioral responses
+of GPT-3.5 and human participants, particularly evident in bargaining games,
+GPT-4 exhibits consistent behavior, ignoring induced emotions for rationality
+decisions. Surprisingly, emotional prompting, particularly with `anger'
+emotion, can disrupt the "superhuman" alignment of GPT-4, resembling human
+emotional responses.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ SpikeLM: Towards General Spike-Driven Language Modeling via Elastic
+  Bi-Spiking Mechanisms 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.03287v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.03287v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Xingrun Xing, Zheng Zhang, Ziyi Ni, Shitao Xiao, Yiming Ju, Siqi Fan, Yequan Wang, Jiajun Zhang, Guoqi Li
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Towards energy-efficient artificial intelligence similar to the human brain,
+the bio-inspired spiking neural networks (SNNs) have advantages of biological
+plausibility, event-driven sparsity, and binary activation. Recently,
+large-scale language models exhibit promising generalization capability, making
+it a valuable issue to explore more general spike-driven models. However, the
+binary spikes in existing SNNs fail to encode adequate semantic information,
+placing technological challenges for generalization. This work proposes the
+first fully spiking mechanism for general language tasks, including both
+discriminative and generative ones. Different from previous spikes with {0,1}
+levels, we propose a more general spike formulation with bi-directional,
+elastic amplitude, and elastic frequency encoding, while still maintaining the
+addition nature of SNNs. In a single time step, the spike is enhanced by
+direction and amplitude information; in spike frequency, a strategy to control
+spike firing rate is well designed. We plug this elastic bi-spiking mechanism
+in language modeling, named SpikeLM. It is the first time to handle general
+language tasks with fully spike-driven models, which achieve much higher
+accuracy than previously possible. SpikeLM also greatly bridges the performance
+gap between SNNs and ANNs in language modeling. Our code is available at
+https://github.com/Xingrun-Xing/SpikeLM.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ FusionBench: A Comprehensive Benchmark of Deep Model Fusion 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.03280v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.03280v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Anke Tang, Li Shen, Yong Luo, Han Hu, Bo Do, Dacheng Tao
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Deep model fusion is an emerging technique that unifies the predictions or
+parameters of several deep neural networks into a single model in a
+cost-effective and data-efficient manner. This enables the unified model to
+take advantage of the original models' strengths, potentially exceeding their
+performance. Although a variety of deep model fusion techniques have been
+introduced, their evaluations tend to be inconsistent and often inadequate to
+validate their effectiveness and robustness against distribution shifts. To
+address this issue, we introduce FusionBench, which is the first comprehensive
+benchmark dedicated to deep model fusion. FusionBench covers a wide range of
+tasks, including open-vocabulary image classification, text classification, and
+text-to-text generation. Each category includes up to eight tasks with
+corresponding task-specific models, featuring both full fine-tuning and LoRA
+fine-tuning, as well as models of different sizes, to ensure fair and balanced
+comparisons of various multi-task model fusion techniques across different
+tasks, model scales, and fine-tuning strategies. We implement and evaluate a
+broad spectrum of deep model fusion techniques. These techniques range from
+model ensemble methods, which combine the predictions to improve the overall
+performance, to model merging, which integrates different models into a single
+one, and model mixing methods, which upscale or recombine the components of the
+original models. FusionBench now contains 26 distinct tasks, 74 fine-tuned
+models, and 16 fusion techniques, and we are committed to consistently
+expanding the benchmark with more tasks, models, and fusion techniques. In
+addition, we offer a well-documented set of resources and guidelines to aid
+researchers in understanding and replicating the benchmark results. Homepage
+https://tanganke.github.io/fusion_bench/
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Project homepage: https://tanganke.github.io/fusion_bench/</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Large Language Models as Evaluators for Recommendation Explanations 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.03248v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.03248v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Xiaoyu Zhang, Yishan Li, Jiayin Wang, Bowen Sun, Weizhi Ma, Peijie Sun, Min Zhang
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  The explainability of recommender systems has attracted significant attention
+in academia and industry. Many efforts have been made for explainable
+recommendations, yet evaluating the quality of the explanations remains a
+challenging and unresolved issue. In recent years, leveraging LLMs as
+evaluators presents a promising avenue in Natural Language Processing tasks
+(e.g., sentiment classification, information extraction), as they perform
+strong capabilities in instruction following and common-sense reasoning.
+However, evaluating recommendation explanatory texts is different from these
+NLG tasks, as its criteria are related to human perceptions and are usually
+subjective. In this paper, we investigate whether LLMs can serve as evaluators
+of recommendation explanations. To answer the question, we utilize real user
+feedback on explanations given from previous work and additionally collect
+third-party annotations and LLM evaluations. We design and apply a 3-level meta
+evaluation strategy to measure the correlation between evaluator labels and the
+ground truth provided by users. Our experiments reveal that LLMs, such as GPT4,
+can provide comparable evaluations with appropriate prompts and settings. We
+also provide further insights into combining human labels with the LLM
+evaluation process and utilizing ensembles of multiple heterogeneous LLM
+evaluators to enhance the accuracy and stability of evaluations. Our study
+verifies that utilizing LLMs as evaluators can be an accurate, reproducible and
+cost-effective solution for evaluating recommendation explanation texts. Our
+code is available at https://github.com/Xiaoyu-SZ/LLMasEvaluator.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Document-level Claim Extraction and Decontextualisation for
+  Fact-Checking <span class="chip">ACL 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.03239v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.03239v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Zhenyun Deng, Michael Schlichtkrul, Andreas Vlachos
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Selecting which claims to check is a time-consuming task for human
+fact-checkers, especially from documents consisting of multiple sentences and
+containing multiple claims. However, existing claim extraction approaches focus
+more on identifying and extracting claims from individual sentences, e.g.,
+identifying whether a sentence contains a claim or the exact boundaries of the
+claim within a sentence. In this paper, we propose a method for document-level
+claim extraction for fact-checking, which aims to extract check-worthy claims
+from documents and decontextualise them so that they can be understood out of
+context. Specifically, we first recast claim extraction as extractive
+summarization in order to identify central sentences from documents, then
+rewrite them to include necessary context from the originating document through
+sentence decontextualisation. Evaluation with both automatic metrics and a
+fact-checking professional shows that our method is able to extract
+check-worthy claims from documents more accurately than previous work, while
+also improving evidence retrieval.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted to ACL 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Error-preserving Automatic Speech Recognition of Young English Learners'
+  Language <span class="chip">ACL 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.03235v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.03235v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Janick Michot, Manuela Hürlimann, Jan Deriu, Luzia Sauer, Katsiaryna Mlynchyk, Mark Cieliebak
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  One of the central skills that language learners need to practice is speaking
+the language. Currently, students in school do not get enough speaking
+opportunities and lack conversational practice. Recent advances in speech
+technology and natural language processing allow for the creation of novel
+tools to practice their speaking skills. In this work, we tackle the first
+component of such a pipeline, namely, the automated speech recognition module
+(ASR), which faces a number of challenges: first, state-of-the-art ASR models
+are often trained on adult read-aloud data by native speakers and do not
+transfer well to young language learners' speech. Second, most ASR systems
+contain a powerful language model, which smooths out errors made by the
+speakers. To give corrective feedback, which is a crucial part of language
+learning, the ASR systems in our setting need to preserve the errors made by
+the language learners. In this work, we build an ASR system that satisfies
+these requirements: it works on spontaneous speech by young language learners
+and preserves their errors. For this, we collected a corpus containing around
+85 hours of English audio spoken by learners in Switzerland from grades 4 to 6
+on different language learning tasks, which we used to train an ASR model. Our
+experiments show that our model benefits from direct fine-tuning on children's
+voices and has a much higher error preservation rate than other models.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted at ACL 2024 Main Conference</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Linking Named Entities in Diderot's \textit{Encyclopédie} to Wikidata 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.03221v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.03221v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Pierre Nugues
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Diderot's \textit{Encyclop\'edie} is a reference work from XVIIIth century in
+Europe that aimed at collecting the knowledge of its era. \textit{Wikipedia}
+has the same ambition with a much greater scope. However, the lack of digital
+connection between the two encyclopedias may hinder their comparison and the
+study of how knowledge has evolved. A key element of \textit{Wikipedia} is
+Wikidata that backs the articles with a graph of structured data. In this
+paper, we describe the annotation of more than 10,300 of the
+\textit{Encyclop\'edie} entries with Wikidata identifiers enabling us to
+connect these entries to the graph. We considered geographic and human
+entities. The \textit{Encyclop\'edie} does not contain biographic entries as
+they mostly appear as subentries of locations. We extracted all the geographic
+entries and we completely annotated all the entries containing a description of
+human entities. This represents more than 2,600 links referring to locations or
+human entities. In addition, we annotated more than 9,500 entries having a
+geographic content only. We describe the annotation process as well as
+application examples. This resource is available at
+https://github.com/pnugues/encyclopedie_1751
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>6 pages, 3 figures</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ ChatLang-8: An LLM-Based Synthetic Data Generation Framework for
+  Grammatical Error Correction 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.03202v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.03202v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Jeiyoon Park, Chanjun Park, Heuiseok Lim
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  We explore and improve the capabilities of LLMs to generate data for
+grammatical error correction (GEC). When merely producing parallel sentences,
+their patterns are too simplistic to be valuable as a corpus. To address this
+issue, we propose an automated framework that includes a Subject Selector,
+Grammar Selector, Prompt Manager, and Evaluator. Additionally, we introduce a
+new dataset for GEC tasks, named \textbf{ChatLang-8}, which encompasses eight
+types of subject nouns and 23 types of grammar. It consists of 1 million pairs
+featuring human-like grammatical errors. Our experiments reveal that ChatLang-8
+exhibits a more uniform pattern composition compared to existing GEC datasets.
+Furthermore, we observe improved model performance when using ChatLang-8
+instead of existing GEC datasets. The experimental results suggest that our
+framework and ChatLang-8 are valuable resources for enhancing ChatGPT's data
+generation capabilities.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>preprint</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Missci: Reconstructing Fallacies in Misrepresented Science <span class="chip">ACL 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.03181v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.03181v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Max Glockner, Yufang Hou, Preslav Nakov, Iryna Gurevych
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Health-related misinformation on social networks can lead to poor
+decision-making and real-world dangers. Such misinformation often misrepresents
+scientific publications and cites them as "proof" to gain perceived
+credibility. To effectively counter such claims automatically, a system must
+explain how the claim was falsely derived from the cited publication. Current
+methods for automated fact-checking or fallacy detection neglect to assess the
+(mis)used evidence in relation to misinformation claims, which is required to
+detect the mismatch between them. To address this gap, we introduce Missci, a
+novel argumentation theoretical model for fallacious reasoning together with a
+new dataset for real-world misinformation detection that misrepresents
+biomedical publications. Unlike previous fallacy detection datasets, Missci (i)
+focuses on implicit fallacies between the relevant content of the cited
+publication and the inaccurate claim, and (ii) requires models to verbalize the
+fallacious reasoning in addition to classifying it. We present Missci as a
+dataset to test the critical reasoning abilities of large language models
+(LLMs), that are required to reconstruct real-world fallacious arguments, in a
+zero-shot setting. We evaluate two representative LLMs and the impact of
+different levels of detail about the fallacy classes provided to the LLM via
+prompts. Our experiments and human evaluation show promising results for GPT 4,
+while also demonstrating the difficulty of this task.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>ACL 2024 (main)</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ StatBot.Swiss: Bilingual Open Data Exploration in Natural Language <span class="chip">ACL</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.03170v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.03170v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Farhad Nooralahzadeh, Yi Zhang, Ellery Smith, Sabine Maennel, Cyril Matthey-Doret, Raphaël de Fondville, Kurt Stockinger
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  The potential for improvements brought by Large Language Models (LLMs) in
+Text-to-SQL systems is mostly assessed on monolingual English datasets.
+However, LLMs' performance for other languages remains vastly unexplored. In
+this work, we release the StatBot.Swiss dataset, the first bilingual benchmark
+for evaluating Text-to-SQL systems based on real-world applications. The
+StatBot.Swiss dataset contains 455 natural language/SQL-pairs over 35 big
+databases with varying level of complexity for both English and German.
+  We evaluate the performance of state-of-the-art LLMs such as GPT-3.5-Turbo
+and mixtral-8x7b-instruct for the Text-to-SQL translation task using an
+in-context learning approach. Our experimental analysis illustrates that
+current LLMs struggle to generalize well in generating SQL queries on our novel
+bilingual dataset.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>This work is accepted at ACL Findings 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ CSS: Contrastive Semantic Similarity for Uncertainty Quantification of
+  LLMs <span class="chip">UAI</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.03158v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.03158v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Shuang Ao, Stefan Rueger, Advaith Siddharthan
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Despite the impressive capability of large language models (LLMs), knowing
+when to trust their generations remains an open challenge. The recent
+literature on uncertainty quantification of natural language generation (NLG)
+utilises a conventional natural language inference (NLI) classifier to measure
+the semantic dispersion of LLMs responses. These studies employ logits of NLI
+classifier for semantic clustering to estimate uncertainty. However, logits
+represent the probability of the predicted class and barely contain feature
+information for potential clustering. Alternatively, CLIP (Contrastive
+Language-Image Pre-training) performs impressively in extracting image-text
+pair features and measuring their similarity. To extend its usability, we
+propose Contrastive Semantic Similarity, the CLIP-based feature extraction
+module to obtain similarity features for measuring uncertainty for text pairs.
+We apply this method to selective NLG, which detects and rejects unreliable
+generations for better trustworthiness of LLMs. We conduct extensive
+experiments with three LLMs on several benchmark question-answering datasets
+with comprehensive evaluation metrics. Results show that our proposed method
+performs better in estimating reliable responses of LLMs than comparable
+baselines. Results show that our proposed method performs better in estimating
+reliable responses of LLMs than comparable baselines. The code are available at
+\url{https://github.com/AoShuang92/css_uq_llms}.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>The paper is accepted by The Conference on Uncertainty in Artificial
+  Intelligence (UAI), 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Which Side Are You On? A Multi-task <span class="highlight-title">Dataset</span> for End-to-End Argument
+  Summarisation and Evaluation <span class="chip">ACL 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.03151v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.03151v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Hao Li, Yuping Wu, Viktor Schlegel, Riza Batista-Navarro, Tharindu Madusanka, Iqra Zahid, Jiayan Zeng, Xiaochi Wang, Xinran He, Yizhi Li, Goran Nenadic
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  With the recent advances of large language models (LLMs), it is no longer
+infeasible to build an automated debate system that helps people to synthesise
+persuasive arguments. Previous work attempted this task by integrating multiple
+components. In our work, we introduce an argument mining dataset that captures
+the end-to-end process of preparing an argumentative essay for a debate, which
+covers the tasks of claim and evidence identification (Task 1 ED), evidence
+convincingness ranking (Task 2 ECR), argumentative essay summarisation and
+human preference ranking (Task 3 ASR) and metric learning for automated
+evaluation of resulting essays, based on human feedback along argument quality
+dimensions (Task 4 SQE). Our dataset contains 14k examples of claims that are
+fully annotated with the various properties supporting the aforementioned
+tasks. We evaluate multiple generative baselines for each of these tasks,
+including representative LLMs. We find, that while they show promising results
+on individual tasks in our benchmark, their end-to-end performance on all four
+tasks in succession deteriorates significantly, both in automated measures as
+well as in human-centred evaluation. This challenge presented by our proposed
+dataset motivates future research on end-to-end argument mining and
+summarisation. The repository of this project is available at
+https://github.com/HarrywillDr/ArgSum-Datatset
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Published on ACL 2024 Findings</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Towards Real-world Scenario: Imbalanced New Intent Discovery <span class="chip">ACL 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.03127v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.03127v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Shun Zhang, Chaoran Yan, Jian Yang, Jiaheng Liu, Ying Mo, Jiaqi Bai, Tongliang Li, Zhoujun Li
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  New Intent Discovery (NID) aims at detecting known and previously undefined
+categories of user intent by utilizing limited labeled and massive unlabeled
+data. Most prior works often operate under the unrealistic assumption that the
+distribution of both familiar and new intent classes is uniform, overlooking
+the skewed and long-tailed distributions frequently encountered in real-world
+scenarios. To bridge the gap, our work introduces the imbalanced new intent
+discovery (i-NID) task, which seeks to identify familiar and novel intent
+categories within long-tailed distributions. A new benchmark (ImbaNID-Bench)
+comprised of three datasets is created to simulate the real-world long-tail
+distributions. ImbaNID-Bench ranges from broad cross-domain to specific
+single-domain intent categories, providing a thorough representation of
+practical use cases. Besides, a robust baseline model ImbaNID is proposed to
+achieve cluster-friendly intent representations. It includes three stages:
+model pre-training, generation of reliable pseudo-labels, and robust
+representation learning that strengthens the model performance to handle the
+intricacies of real-world data distributions. Our extensive experiments on
+previous benchmarks and the newly established benchmark demonstrate the
+superior performance of ImbaNID in addressing the i-NID task, highlighting its
+potential as a powerful baseline for uncovering and categorizing user intents
+in imbalanced and long-tailed
+distributions\footnote{\url{https://github.com/Zkdc/i-NID}}.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>ACL 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Space Decomposition for Sentence Embedding <span class="chip">ACL</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.03125v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.03125v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Wuttikorn Ponwitayarat, Peerat Limkonchotiwat, Ekapol Chuangsuwanich, Sarana Nutanong
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Determining sentence pair similarity is crucial for various NLP tasks. A
+common technique to address this is typically evaluated on a continuous
+semantic textual similarity scale from 0 to 5. However, based on a linguistic
+observation in STS annotation guidelines, we found that the score in the range
+[4,5] indicates an upper-range sample, while the rest are lower-range samples.
+This necessitates a new approach to treating the upper-range and lower-range
+classes separately. In this paper, we introduce a novel embedding space
+decomposition method called MixSP utilizing a Mixture of Specialized
+Projectors, designed to distinguish and rank upper-range and lower-range
+samples accurately. The experimental results demonstrate that MixSP decreased
+the overlap representation between upper-range and lower-range classes
+significantly while outperforming competitors on STS and zero-shot benchmarks.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>ACL Finding 2024. The code and pre-trained models are available at
+  https://github.com/KornWtp/MixSP</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ FragRel: Exploiting Fragment-level Relations in the External Memory of
+  Large Language Models 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.03092v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.03092v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Xihang Yue, Linchao Zhu, Yi Yang
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  To process contexts with unlimited length using Large Language Models (LLMs),
+recent studies explore hierarchically managing the long text. Only several text
+fragments are taken from the external memory and passed into the temporary
+working memory, i.e., LLM's context window. However, existing approaches
+isolatedly handle the text fragments without considering their structural
+connections, thereby suffering limited capability on texts with intensive
+inter-relations, e.g., coherent stories and code repositories. This work
+attempts to resolve this by exploiting the fragment-level relations in external
+memory. First, we formulate the fragment-level relations and present several
+instantiations for different text types. Next, we introduce a relation-aware
+fragment assessment criteria upon previous independent fragment assessment.
+Finally, we present the fragment-connected Hierarchical Memory based LLM. We
+validate the benefits of involving these relations on long story understanding,
+repository-level code generation, and long-term chatting.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Cryptocurrency Frauds for Dummies: How Chat<span class="highlight-title">GPT</span> introduces us to fraud? 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.03079v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.03079v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Wail Zellagui, Abdessamad Imine, Yamina Tadjeddine
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Recent advances in the field of large language models (LLMs), particularly
+the ChatGPT family, have given rise to a powerful and versatile machine
+interlocutor, packed with knowledge and challenging our understanding of
+learning. This interlocutor is a double-edged sword: it can be harnessed for a
+wide variety of beneficial tasks, but it can also be used to cause harm. This
+study explores the complicated interaction between ChatGPT and the growing
+problem of cryptocurrency fraud. Although ChatGPT is known for its adaptability
+and ethical considerations when used for harmful purposes, we highlight the
+deep connection that may exist between ChatGPT and fraudulent actions in the
+volatile cryptocurrency ecosystem. Based on our categorization of
+cryptocurrency frauds, we show how to influence outputs, bypass ethical terms,
+and achieve specific fraud goals by manipulating ChatGPT prompts. Furthermore,
+our findings emphasize the importance of realizing that ChatGPT could be a
+valuable instructor even for novice fraudsters, as well as understanding and
+safely deploying complex language models, particularly in the context of
+cryptocurrency frauds. Finally, our study underlines the importance of using
+LLMs responsibly and ethically in the digital currency sector, identifying
+potential risks and resolving ethical issues. It should be noted that our work
+is not intended to encourage and promote fraud, but rather to raise awareness
+of the risks of fraud associated with the use of ChatGPT.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>To be published in ACM journal "Digital Government: Research and
+  Practice"</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Towards Detecting LLMs Hallucination via Markov Chain-based Multi-agent
+  Debate Framework 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.03075v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.03075v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Xiaoxi Sun, Jinpeng Li, Yan Zhong, Dongyan Zhao, Rui Yan
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  The advent of large language models (LLMs) has facilitated the development of
+natural language text generation. It also poses unprecedented challenges, with
+content hallucination emerging as a significant concern. Existing solutions
+often involve expensive and complex interventions during the training process.
+Moreover, some approaches emphasize problem disassembly while neglecting the
+crucial validation process, leading to performance degradation or limited
+applications. To overcome these limitations, we propose a Markov Chain-based
+multi-agent debate verification framework to enhance hallucination detection
+accuracy in concise claims. Our method integrates the fact-checking process,
+including claim detection, evidence retrieval, and multi-agent verification. In
+the verification stage, we deploy multiple agents through flexible Markov
+Chain-based debates to validate individual claims, ensuring meticulous
+verification outcomes. Experimental results across three generative tasks
+demonstrate that our approach achieves significant improvements over baselines.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>18 pages, 3 figures</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ How Truncating Weights Improves Reasoning in Language Models 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.03068v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.03068v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Lei Chen, Joan Bruna, Alberto Bietti
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  In addition to the ability to generate fluent text in various languages,
+large language models have been successful at tasks that involve basic forms of
+logical "reasoning" over their context. Recent work found that selectively
+removing certain components from weight matrices in pre-trained models can
+improve such reasoning capabilities. We investigate this phenomenon further by
+carefully studying how certain global associations tend to be stored in
+specific weight components or Transformer blocks, in particular feed-forward
+layers. Such associations may hurt predictions in reasoning tasks, and removing
+the corresponding components may then improve performance. We analyze how this
+arises during training, both empirically and theoretically, on a two-layer
+Transformer trained on a basic reasoning task with noise, a toy associative
+memory model, and on the Pythia family of pre-trained models tested on simple
+reasoning tasks.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ RadBARTsum: Domain Specific Adaption of Denoising Sequence-to-Sequence
+  Models for Abstractive Radiology Report Summarization 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.03062v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.03062v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Jinge Wu, Abul Hasan, Honghan Wu
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Radiology report summarization is a crucial task that can help doctors
+quickly identify clinically significant findings without the need to review
+detailed sections of reports. This study proposes RadBARTsum, a domain-specific
+and ontology facilitated adaptation of the BART model for abstractive radiology
+report summarization. The approach involves two main steps: 1) re-training the
+BART model on a large corpus of radiology reports using a novel entity masking
+strategy to improving biomedical domain knowledge learning, and 2) fine-tuning
+the model for the summarization task using the Findings and Background sections
+to predict the Impression section. Experiments are conducted using different
+masking strategies. Results show that the re-training process with domain
+knowledge facilitated masking improves performances consistently across various
+settings. This work contributes a domain-specific generative language model for
+radiology report summarization and a method for utilising medical knowledge to
+realise entity masking language model. The proposed approach demonstrates a
+promising direction of enhancing the efficiency of language models by deepening
+its understanding of clinical knowledge in radiology reports.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ StreamSpeech: Simultaneous Speech-to-Speech Translation with Multi-task
+  Learning <span class="chip">ACL 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.03049v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.03049v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Shaolei Zhang, Qingkai Fang, Shoutao Guo, Zhengrui Ma, Min Zhang, Yang Feng
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Simultaneous speech-to-speech translation (Simul-S2ST, a.k.a streaming speech
+translation) outputs target speech while receiving streaming speech inputs,
+which is critical for real-time communication. Beyond accomplishing translation
+between speech, Simul-S2ST requires a policy to control the model to generate
+corresponding target speech at the opportune moment within speech inputs,
+thereby posing a double challenge of translation and policy. In this paper, we
+propose StreamSpeech, a direct Simul-S2ST model that jointly learns translation
+and simultaneous policy in a unified framework of multi-task learning. Adhering
+to a multi-task learning approach, StreamSpeech can perform offline and
+simultaneous speech recognition, speech translation and speech synthesis via an
+"All-in-One" seamless model. Experiments on CVSS benchmark demonstrate that
+StreamSpeech achieves state-of-the-art performance in both offline S2ST and
+Simul-S2ST tasks. Besides, StreamSpeech is able to present high-quality
+intermediate results (i.e., ASR or translation results) during simultaneous
+translation process, offering a more comprehensive real-time communication
+experience.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted to ACL 2024 main conference, Project Page:
+  https://ictnlp.github.io/StreamSpeech-site/</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ From Tarzan to Tolkien: Controlling the Language Proficiency Level of
+  LLMs for Content Generation 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.03030v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.03030v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Ali Malik, Stephen Mayhew, Chris Piech, Klinton Bicknell
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  We study the problem of controlling the difficulty level of text generated by
+Large Language Models (LLMs) for contexts where end-users are not fully
+proficient, such as language learners. Using a novel framework, we evaluate the
+effectiveness of several key approaches for this task, including few-shot
+prompting, supervised finetuning, and reinforcement learning (RL), utilising
+both GPT-4 and open source alternatives like LLama2-7B and Mistral-7B.
+  Our findings reveal a large performance gap between GPT-4 and the open source
+models when using prompt-based strategies. However, we show how to bridge this
+gap with a careful combination of finetuning and RL alignment. Our best model,
+CALM (CEFR-Aligned Language Model), surpasses the performance of GPT-4 and
+other strategies, at only a fraction of the cost. We further validate the
+quality of our results through a small-scale human study.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Unveiling Selection Biases: Exploring Order and Token Sensitivity in
+  Large Language Models <span class="chip">ACL 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.03009v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.03009v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Sheng-Lun Wei, Cheng-Kuang Wu, Hen-Hsen Huang, Hsin-Hsi Chen
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  In this paper, we investigate the phenomena of "selection biases" in Large
+Language Models (LLMs), focusing on problems where models are tasked with
+choosing the optimal option from an ordered sequence. We delve into biases
+related to option order and token usage, which significantly impact LLMs'
+decision-making processes. We also quantify the impact of these biases through
+an extensive empirical analysis across multiple models and tasks. Furthermore,
+we propose mitigation strategies to enhance model performance. Our key
+contributions are threefold: 1) Precisely quantifying the influence of option
+order and token on LLMs, 2) Developing strategies to mitigate the impact of
+token and order sensitivity to enhance robustness, and 3) Offering a detailed
+analysis of sensitivity across models and tasks, which informs the creation of
+more stable and reliable LLM applications for selection problems.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted as a long findings paper at ACL 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ DriVLMe: Enhancing LLM-based Autonomous Driving Agents with Embodied and
+  Social Experiences <span class="chip">CVPR 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.03008v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.03008v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Yidong Huang, Jacob Sansom, Ziqiao Ma, Felix Gervits, Joyce Chai
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Recent advancements in foundation models (FMs) have unlocked new prospects in
+autonomous driving, yet the experimental settings of these studies are
+preliminary, over-simplified, and fail to capture the complexity of real-world
+driving scenarios in human environments. It remains under-explored whether FM
+agents can handle long-horizon navigation tasks with free-from dialogue and
+deal with unexpected situations caused by environmental dynamics or task
+changes. To explore the capabilities and boundaries of FMs faced with the
+challenges above, we introduce DriVLMe, a video-language-model-based agent to
+facilitate natural and effective communication between humans and autonomous
+vehicles that perceive the environment and navigate. We develop DriVLMe from
+both embodied experiences in a simulated environment and social experiences
+from real human dialogue. While DriVLMe demonstrates competitive performance in
+both open-loop benchmarks and closed-loop human studies, we reveal several
+limitations and challenges, including unacceptable inference time, imbalanced
+training data, limited visual understanding, challenges with multi-turn
+interactions, simplified language generation from robotic experiences, and
+difficulties in handling on-the-fly unexpected situations like environmental
+dynamics and task changes.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>First Vision and Language for Autonomous Driving and Robotics
+  Workshop (VLADR @ CVPR 2024)</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ BadAgent: Inserting and Activating Backdoor Attacks in LLM Agents <span class="chip">ACL 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.03007v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.03007v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Yifei Wang, Dizhan Xue, Shengjie Zhang, Shengsheng Qian
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  With the prosperity of large language models (LLMs), powerful LLM-based
+intelligent agents have been developed to provide customized services with a
+set of user-defined tools. State-of-the-art methods for constructing LLM agents
+adopt trained LLMs and further fine-tune them on data for the agent task.
+However, we show that such methods are vulnerable to our proposed backdoor
+attacks named BadAgent on various agent tasks, where a backdoor can be embedded
+by fine-tuning on the backdoor data. At test time, the attacker can manipulate
+the deployed LLM agents to execute harmful operations by showing the trigger in
+the agent input or environment. To our surprise, our proposed attack methods
+are extremely robust even after fine-tuning on trustworthy data. Though
+backdoor attacks have been studied extensively in natural language processing,
+to the best of our knowledge, we could be the first to study them on LLM agents
+that are more dangerous due to the permission to use external tools. Our work
+demonstrates the clear risk of constructing LLM agents based on untrusted LLMs
+or data. Our code is public at https://github.com/DPamK/BadAgent
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted by ACL 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Evaluation of data inconsistency for multi-modal sentiment analysis 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.03004v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.03004v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Yufei Wang, Mengyue Wu
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Emotion semantic inconsistency is an ubiquitous challenge in multi-modal
+sentiment analysis (MSA). MSA involves analyzing sentiment expressed across
+various modalities like text, audio, and videos. Each modality may convey
+distinct aspects of sentiment, due to subtle and nuanced expression of human
+beings, leading to inconsistency, which may hinder the prediction of artificial
+agents. In this work, we introduce a modality conflicting test set and assess
+the performance of both traditional multi-modal sentiment analysis models and
+multi-modal large language models (MLLMs). Our findings reveal significant
+performance degradation across traditional models when confronted with
+semantically conflicting data and point out the drawbacks of MLLMs when
+handling multi-modal emotion analysis. Our research presents a new challenge
+and offer valuable insights for the future development of sentiment analysis
+systems.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Readability-guided Idiom-aware Sentence Simplification (RISS) for
+  Chinese <span class="chip">CCL 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.02974v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.02974v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Jingshen Zhang, Xinglu Chen, Xinying Qiu, Zhimin Wang, Wenhe Feng
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Chinese sentence simplification faces challenges due to the lack of
+large-scale labeled parallel corpora and the prevalence of idioms. To address
+these challenges, we propose Readability-guided Idiom-aware Sentence
+Simplification (RISS), a novel framework that combines data augmentation
+techniques with lexcial simplification. RISS introduces two key components: (1)
+Readability-guided Paraphrase Selection (RPS), a method for mining high-quality
+sentence pairs, and (2) Idiom-aware Simplification (IAS), a model that enhances
+the comprehension and simplification of idiomatic expressions. By integrating
+RPS and IAS using multi-stage and multi-task learning strategies, RISS
+outperforms previous state-of-the-art methods on two Chinese sentence
+simplification datasets. Furthermore, RISS achieves additional improvements
+when fine-tuned on a small labeled dataset. Our approach demonstrates the
+potential for more effective and accessible Chinese text simplification.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted to the 23rd China National Conference on Computational
+  Linguistics (CCL 2024)</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Filtered not Mixed: Stochastic Filtering-Based Online Gating for Mixture
+  of Large Language Models 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.02969v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.02969v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Raeid Saqur, Anastasis Kratsios, Florian Krach, Yannick Limmer, Jacob-Junqi Tian, John Willes, Blanka Horvath, Frank Rudzicz
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  We propose MoE-F -- a formalised mechanism for combining $N$ pre-trained
+expert Large Language Models (LLMs) in online time-series prediction tasks by
+adaptively forecasting the best weighting of LLM predictions at every time
+step. Our mechanism leverages the conditional information in each expert's
+running performance to forecast the best combination of LLMs for predicting the
+time series in its next step. Diverging from static (learned) Mixture of
+Experts (MoE) methods, MoE-F employs time-adaptive stochastic filtering
+techniques to combine experts. By framing the expert selection problem as a
+finite state-space, continuous-time Hidden Markov model (HMM), we can leverage
+the Wohman-Shiryaev filter. Our approach first constructs $N$ parallel filters
+corresponding to each of the $N$ individual LLMs. Each filter proposes its best
+combination of LLMs, given the information that they have access to.
+Subsequently, the $N$ filter outputs are aggregated to optimize a lower bound
+for the loss of the aggregated LLMs, which can be optimized in closed-form,
+thus generating our ensemble predictor. Our contributions here are: (I) the
+MoE-F algorithm -- deployable as a plug-and-play filtering harness, (II)
+theoretical optimality guarantees of the proposed filtering-based gating
+algorithm, and (III) empirical evaluation and ablative results using state of
+the art foundational and MoE LLMs on a real-world Financial Market Movement
+task where MoE-F attains a remarkable 17% absolute and 48.5% relative F1
+measure improvement over the next best performing individual LLM expert.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>29 pages, 5 Appendix sections</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Docs2KG: Unified Knowledge Graph Construction from Heterogeneous
+  Documents Assisted by Large Language Models 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.02962v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.02962v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Qiang Sun, Yuanyi Luo, Wenxiao Zhang, Sirui Li, Jichunyang Li, Kai Niu, Xiangrui Kong, Wei Liu
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Even for a conservative estimate, 80% of enterprise data reside in
+unstructured files, stored in data lakes that accommodate heterogeneous
+formats. Classical search engines can no longer meet information seeking needs,
+especially when the task is to browse and explore for insight formulation. In
+other words, there are no obvious search keywords to use. Knowledge graphs, due
+to their natural visual appeals that reduce the human cognitive load, become
+the winning candidate for heterogeneous data integration and knowledge
+representation.
+  In this paper, we introduce Docs2KG, a novel framework designed to extract
+multimodal information from diverse and heterogeneous unstructured documents,
+including emails, web pages, PDF files, and Excel files. Dynamically generates
+a unified knowledge graph that represents the extracted key information,
+Docs2KG enables efficient querying and exploration of document data lakes.
+Unlike existing approaches that focus on domain-specific data sources or
+pre-designed schemas, Docs2KG offers a flexible and extensible solution that
+can adapt to various document structures and content types. The proposed
+framework unifies data processing supporting a multitude of downstream tasks
+with improved domain interpretability. Docs2KG is publicly accessible at
+https://docs2kg.ai4wa.com, and a demonstration video is available at
+https://docs2kg.ai4wa.com/Video.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Adversarial Moment-Matching Distillation of Large Language Models 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.02959v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.02959v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Chen Jia
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Knowledge distillation (KD) has been shown to be highly effective in guiding
+a student model with a larger teacher model and achieving practical benefits in
+improving the computational and memory efficiency for large language models
+(LLMs). State-of-the-art KD methods for LLMs mostly rely on minimizing explicit
+distribution distance between teacher and student probability predictions.
+Instead of optimizing these mandatory behaviour cloning objectives, we explore
+an imitation learning strategy for KD of LLMs. In particular, we minimize the
+imitation gap by matching the action-value moments of the teacher's behavior
+from both on- and off-policy perspectives. To achieve this action-value
+moment-matching goal, we propose an adversarial training algorithm to jointly
+estimate the moment-matching distance and optimize the student policy to
+minimize it. Results from both task-agnostic instruction-following experiments
+and task-specific experiments demonstrate the effectiveness of our method and
+achieve new state-of-the-art performance.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ PrE-Text: Training Language Models on Private Federated Data in the Age
+  of LLMs <span class="chip">ICML 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.02958v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.02958v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Charlie Hou, Akshat Shrivastava, Hongyuan Zhan, Rylan Conway, Trang Le, Adithya Sagar, Giulia Fanti, Daniel Lazar
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  On-device training is currently the most common approach for training machine
+learning (ML) models on private, distributed user data. Despite this, on-device
+training has several drawbacks: (1) most user devices are too small to train
+large models on-device, (2) on-device training is communication- and
+computation-intensive, and (3) on-device training can be difficult to debug and
+deploy. To address these problems, we propose Private Evolution-Text
+(PrE-Text), a method for generating differentially private (DP) synthetic
+textual data. First, we show that across multiple datasets, training small
+models (models that fit on user devices) with PrE-Text synthetic data
+outperforms small models trained on-device under practical privacy regimes
+($\epsilon=1.29$, $\epsilon=7.58$). We achieve these results while using
+9$\times$ fewer rounds, 6$\times$ less client computation per round, and
+100$\times$ less communication per round. Second, finetuning large models on
+PrE-Text's DP synthetic data improves large language model (LLM) performance on
+private data across the same range of privacy budgets. Altogether, these
+results suggest that training on DP synthetic data can be a better option than
+training a model on-device on private distributed data. Code is available at
+https://github.com/houcharlie/PrE-Text.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>ICML 2024 (Oral)</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ 4D ASR: Joint Beam Search Integrating CTC, Attention, Transducer, and
+  Mask Predict Decoders 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.02950v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.02950v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Yui Sudo, Muhammad Shakeel, Yosuke Fukumoto, Brian Yan, Jiatong Shi, Yifan Peng, Shinji Watanabe
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  End-to-end automatic speech recognition (E2E-ASR) can be classified into
+several network architectures, such as connectionist temporal classification
+(CTC), recurrent neural network transducer (RNN-T), attention-based
+encoder-decoder, and mask-predict models. Each network architecture has
+advantages and disadvantages, leading practitioners to switch between these
+different models depending on application requirements. Instead of building
+separate models, we propose a joint modeling scheme where four decoders (CTC,
+RNN-T, attention, and mask-predict) share the same encoder -- we refer to this
+as 4D modeling. The 4D model is trained using multitask learning, which will
+bring model regularization and maximize the model robustness thanks to their
+complementary properties. To efficiently train the 4D model, we introduce a
+two-stage training strategy that stabilizes multitask learning. In addition, we
+propose three novel one-pass beam search algorithms by combining three decoders
+(CTC, RNN-T, and attention) to further improve performance. These three beam
+search algorithms differ in which decoder is used as the primary decoder. We
+carefully evaluate the performance and computational tradeoffs associated with
+each algorithm. Experimental results demonstrate that the jointly trained 4D
+model outperforms the E2E-ASR models trained with only one individual decoder.
+Furthermore, we demonstrate that the proposed one-pass beam search algorithm
+outperforms the previously proposed CTC/attention decoding.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>submitted to IEEE/ACM Transactions on Audio Speech and Language
+  Processing</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ The Task-oriented Queries Benchmark (ToQB) 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.02943v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.02943v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Keun Soo Yim
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Task-oriented queries (e.g., one-shot queries to play videos, order food, or
+call a taxi) are crucial for assessing the quality of virtual assistants,
+chatbots, and other large language model (LLM)-based services. However, a
+standard benchmark for task-oriented queries is not yet available, as existing
+benchmarks in the relevant NLP (Natural Language Processing) fields have
+primarily focused on task-oriented dialogues. Thus, we present a new
+methodology for efficiently generating the Task-oriented Queries Benchmark
+(ToQB) using existing task-oriented dialogue datasets and an LLM service. Our
+methodology involves formulating the underlying NLP task to summarize the
+original intent of a speaker in each dialogue, detailing the key steps to
+perform the devised NLP task using an LLM service, and outlining a framework
+for automating a major part of the benchmark generation process. Through a case
+study encompassing three domains (i.e., two single-task domains and one
+multi-task domain), we demonstrate how to customize the LLM prompts (e.g.,
+omitting system utterances or speaker labels) for those three domains and
+characterize the generated task-oriented queries. The generated ToQB dataset is
+made available to the public. We further discuss new domains that can be added
+to ToQB by community contributors and its practical applications.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Data available on GitHub,
+  https://github.com/google/task-oriented-queries</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ SYN2REAL: Leveraging Task Arithmetic for Mitigating Synthetic-Real
+  Discrepancies in ASR Domain Adaptation 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.02925v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.02925v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Hsuan Su, Hua Farn, Shang-Tse Chen, Hung-yi Lee
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Recent advancements in large language models (LLMs) have introduced the 'task
+vector' concept, which has significantly impacted various domains but remains
+underexplored in speech recognition. This paper presents a novel 'SYN2REAL'
+task vector for domain adaptation in automatic speech recognition (ASR),
+specifically targeting text-only domains. Traditional fine-tuning on synthetic
+speech often results in performance degradation due to acoustic mismatches. To
+address this issue, we propose creating a 'SYN2REAL' vector by subtracting the
+parameter differences between models fine-tuned on real and synthetic speech.
+This vector effectively bridges the gap between the two domains. Experiments on
+the SLURP dataset demonstrate that our approach yields an average improvement
+of 11.15% in word error rate for unseen target domains, highlighting the
+potential of task vectors in enhancing speech domain adaptation.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Pruner-Zero: Evolving Symbolic Pruning Metric from scratch for Large
+  Language Models <span class="chip">ICML2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.02924v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.02924v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Peijie Dong, Lujun Li, Zhenheng Tang, Xiang Liu, Xinglin Pan, Qiang Wang, Xiaowen Chu
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Despite the remarkable capabilities, Large Language Models (LLMs) face
+deployment challenges due to their extensive size. Pruning methods drop a
+subset of weights to accelerate, but many of them require retraining, which is
+prohibitively expensive and computationally demanding. Recently, post-training
+pruning approaches introduced novel metrics, enabling the pruning of LLMs
+without retraining. However, these metrics require the involvement of human
+experts and tedious trial and error. To efficiently identify superior pruning
+metrics, we develop an automatic framework for searching symbolic pruning
+metrics using genetic programming. In particular, we devise an elaborate search
+space encompassing the existing pruning metrics to discover the potential
+symbolic pruning metric. We propose an opposing operation simplification
+strategy to increase the diversity of the population. In this way, Pruner-Zero
+allows auto-generation of symbolic pruning metrics. Based on the searched
+results, we explore the correlation between pruning metrics and performance
+after pruning and summarize some principles. Extensive experiments on LLaMA and
+LLaMA-2 on language modeling and zero-shot tasks demonstrate that our
+Pruner-Zero obtains superior performance than SOTA post-training pruning
+methods. Code at: \url{https://github.com/pprp/Pruner-Zero}.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted by ICML2024, 29 pages, 4 figures</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Text Injection for Neural Contextual Biasing 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.02921v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.02921v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Zhong Meng, Zelin Wu, Rohit Prabhavalkar, Cal Peyser, Weiran Wang, Nanxin Chen, Tara N. Sainath, Bhuvana Ramabhadran
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Neural contextual biasing effectively improves automatic speech recognition
+(ASR) for crucial phrases within a speaker's context, particularly those that
+are infrequent in the training data. This work proposes contextual text
+injection (CTI) to enhance contextual ASR. CTI leverages not only the paired
+speech-text data, but also a much larger corpus of unpaired text to optimize
+the ASR model and its biasing component. Unpaired text is converted into
+speech-like representations and used to guide the model's attention towards
+relevant bias phrases. Moreover, we introduce a contextual text-injected (CTI)
+minimum word error rate (MWER) training, which minimizes the expected WER
+caused by contextual biasing when unpaired text is injected into the model.
+Experiments show that CTI with 100 billion text sentences can achieve up to
+43.3% relative WER reduction from a strong neural biasing model. CTI-MWER
+provides a further relative improvement of 23.5%.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>5 pages, 1 figure</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ MultifacetEval: Multifaceted Evaluation to Probe LLMs in Mastering
+  Medical Knowledge <span class="chip">IJCAI 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.02919v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.02919v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Yuxuan Zhou, Xien Liu, Chen Ning, Ji Wu
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Large language models (LLMs) have excelled across domains, also delivering
+notable performance on the medical evaluation benchmarks, such as MedQA.
+However, there still exists a significant gap between the reported performance
+and the practical effectiveness in real-world medical scenarios. In this paper,
+we aim to explore the causes of this gap by employing a multifaceted
+examination schema to systematically probe the actual mastery of medical
+knowledge by current LLMs. Specifically, we develop a novel evaluation
+framework MultifacetEval to examine the degree and coverage of LLMs in encoding
+and mastering medical knowledge at multiple facets (comparison, rectification,
+discrimination, and verification) concurrently. Based on the MultifacetEval
+framework, we construct two multifaceted evaluation datasets: MultiDiseK (by
+producing questions from a clinical disease knowledge base) and MultiMedQA (by
+rephrasing each question from a medical benchmark MedQA into multifaceted
+questions). The experimental results on these multifaceted datasets demonstrate
+that the extent of current LLMs in mastering medical knowledge is far below
+their performance on existing medical benchmarks, suggesting that they lack
+depth, precision, and comprehensiveness in mastering medical knowledge.
+Consequently, current LLMs are not yet ready for application in real-world
+medical tasks. The codes and datasets are available at
+https://github.com/THUMLP/MultifacetEval.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted by IJCAI 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Improving In-Context Learning with Prediction Feedback for Sentiment
+  Analysis <span class="chip">ACL 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.02911v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.02911v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Hongling Xu, Qianlong Wang, Yice Zhang, Min Yang, Xi Zeng, Bing Qin, Ruifeng Xu
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Large language models (LLMs) have achieved promising results in sentiment
+analysis through the in-context learning (ICL) paradigm. However, their ability
+to distinguish subtle sentiments still remains a challenge. Inspired by the
+human ability to adjust understanding via feedback, this paper enhances ICL by
+incorporating prior predictions and feedback, aiming to rectify sentiment
+misinterpretation of LLMs. Specifically, the proposed framework consists of
+three steps: (1) acquiring prior predictions of LLMs, (2) devising predictive
+feedback based on correctness, and (3) leveraging a feedback-driven prompt to
+refine sentiment understanding. Experimental results across nine sentiment
+analysis datasets demonstrate the superiority of our framework over
+conventional ICL methods, with an average F1 improvement of 5.95%.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted by ACL 2024 (Findings)</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Open Grounded Planning: Challenges and Benchmark Construction <span class="chip">ACL 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.02903v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.02903v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Shiguang Guo, Ziliang Deng, Hongyu Lin, Yaojie Lu, Xianpei Han, Le Sun
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  The emergence of large language models (LLMs) has increasingly drawn
+attention to the use of LLMs for human-like planning. Existing work on
+LLM-based planning either focuses on leveraging the inherent language
+generation capabilities of LLMs to produce free-style plans, or employs
+reinforcement learning approaches to learn decision-making for a limited set of
+actions within restricted environments. However, both approaches exhibit
+significant discrepancies from the open and executable requirements in
+real-world planning. In this paper, we propose a new planning task--open
+grounded planning. The primary objective of open grounded planning is to ask
+the model to generate an executable plan based on a variable action set,
+thereby ensuring the executability of the produced plan. To this end, we
+establishes a benchmark for open grounded planning spanning a wide range of
+domains. Then we test current state-of-the-art LLMs along with five planning
+approaches, revealing that existing LLMs and methods still struggle to address
+the challenges posed by grounded planning in open domains. The outcomes of this
+paper define and establish a foundational dataset for open grounded planning,
+and shed light on the potential challenges and future directions of LLM-based
+planning.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accept to ACL 2024 main conference</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ S$^2$GSL: Incorporating Segment to Syntactic Enhanced Graph Structure
+  Learning for Aspect-based Sentiment Analysis 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.02902v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.02902v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Bingfeng Chen, Qihan Ouyang, Yongqi Luo, Boyan Xu, Ruichu Cai, Zhifeng Hao
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Previous graph-based approaches in Aspect based Sentiment Analysis(ABSA) have
+demonstrated impressive performance by utilizing graph neural networks and
+attention mechanisms to learn structures of static dependency trees and dynamic
+latent trees. However, incorporating both semantic and syntactic information
+simultaneously within complex global structures can introduce irrelevant
+contexts and syntactic dependencies during the process of graph structure
+learning, potentially resulting in inaccurate predictions. In order to address
+the issues above, we propose S$^2$GSL, incorporating Segment to Syntactic
+enhanced Graph Structure Learning for ABSA. Specifically,S$^2$GSL is featured
+with a segment-aware semantic graph learning and a syntax-based latent graph
+learning enabling the removal of irrelevant contexts and dependencies,
+respectively. We further propose a self-adaptive aggregation network that
+facilitates the fusion of two graph learning branches, thereby achieving
+complementarity across diverse structures. Experimental results on four
+benchmarks demonstrate the effectiveness of our framework.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Scaling Laws for Reward Model Overoptimization in Direct Alignment
+  Algorithms 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.02900v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.02900v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Rafael Rafailov, Yaswanth Chittepu, Ryan Park, Harshit Sikchi, Joey Hejna, Bradley Knox, Chelsea Finn, Scott Niekum
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Reinforcement Learning from Human Feedback (RLHF) has been crucial to the
+recent success of Large Language Models (LLMs), however, it is often a complex
+and brittle process. In the classical RLHF framework, a reward model is first
+trained to represent human preferences, which is in turn used by an online
+reinforcement learning (RL) algorithm to optimize the LLM. A prominent issue
+with such methods is \emph{reward over-optimization} or \emph{reward hacking},
+where performance as measured by the learned proxy reward model increases, but
+true quality plateaus or even deteriorates. Direct Alignment Algorithms (DDAs)
+like Direct Preference Optimization have emerged as alternatives to the
+classical RLHF pipeline by circumventing the reward modeling phase. However,
+although DAAs do not use a separate proxy reward model, they still commonly
+deteriorate from over-optimization. While the so-called reward hacking
+phenomenon is not well-defined for DAAs, we still uncover similar trends: at
+higher KL budgets, DAA algorithms exhibit similar degradation patterns to their
+classic RLHF counterparts. In particular, we find that DAA methods deteriorate
+not only across a wide range of KL budgets but also often before even a single
+epoch of the dataset is completed. Through extensive empirical experimentation,
+this work formulates and formalizes the reward over-optimization or hacking
+problem for DAAs and explores its consequences across objectives, training
+regimes, and model scales.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Language Model Can Do Knowledge Tracing: Simple but Effective Method to
+  Integrate Language Model and Knowledge Tracing Task 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.02893v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.02893v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Unggi Lee, Jiyeong Bae, Dohee Kim, Sookbun Lee, Jaekwon Park, Taekyung Ahn, Gunho Lee, Damji Stratton, Hyeoncheol Kim
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Knowledge Tracing (KT) is a critical task in online learning for modeling
+student knowledge over time. Despite the success of deep learning-based KT
+models, which rely on sequences of numbers as data, most existing approaches
+fail to leverage the rich semantic information in the text of questions and
+concepts. This paper proposes Language model-based Knowledge Tracing (LKT), a
+novel framework that integrates pre-trained language models (PLMs) with KT
+methods. By leveraging the power of language models to capture semantic
+representations, LKT effectively incorporates textual information and
+significantly outperforms previous KT models on large benchmark datasets.
+Moreover, we demonstrate that LKT can effectively address the cold-start
+problem in KT by leveraging the semantic knowledge captured by PLMs.
+Interpretability of LKT is enhanced compared to traditional KT models due to
+its use of text-rich data. We conducted the local interpretable model-agnostic
+explanation technique and analysis of attention scores to interpret the model
+performance further. Our work highlights the potential of integrating PLMs with
+KT and paves the way for future research in KT domain.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>11 pages, 5 figures, 3 tables</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ HYDRA: Model Factorization Framework for Black-Box LLM Personalization 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.02888v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.02888v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Yuchen Zhuang, Haotian Sun, Yue Yu, Qifan Wang, Chao Zhang, Bo Dai
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Personalization has emerged as a critical research area in modern intelligent
+systems, focusing on mining users' behavioral history and adapting to their
+preferences for delivering tailored experiences. Despite the remarkable
+few-shot capabilities exhibited by black-box large language models (LLMs), the
+inherent opacity of their model parameters presents significant challenges in
+aligning the generated output with individual expectations. Existing solutions
+have primarily focused on prompt design to incorporate user-specific profiles
+and behaviors; however, such approaches often struggle to generalize
+effectively due to their inability to capture shared knowledge among all users.
+To address these challenges, we propose HYDRA, a model factorization framework
+that captures both user-specific behavior patterns from historical data and
+shared general knowledge among all users to deliver personalized generation. In
+order to capture user-specific behavior patterns, we first train a reranker to
+prioritize the most useful information from top-retrieved relevant historical
+records. By combining the prioritized history with the corresponding query, we
+train an adapter to align the output with individual user-specific preferences,
+eliminating the reliance on access to inherent model parameters of black-box
+LLMs. Both the reranker and the adapter can be decomposed into a base model
+with multiple user-specific heads, resembling a hydra. The base model maintains
+shared knowledge across users, while the multiple personal heads capture
+user-specific preferences. Experimental results demonstrate that HYDRA
+outperforms existing state-of-the-art prompt-based methods by an average
+relative improvement of 9.01% across five diverse personalization tasks in the
+LaMP benchmark. Our implementation is available at
+https://github.com/night-chen/HYDRA.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>24 pages, 6 figures, work in progress</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ PLaD: Preference-based Large Language Model Distillation with
+  Pseudo-Preference Pairs <span class="chip">ACL 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.02886v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.02886v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Rongzhi Zhang, Jiaming Shen, Tianqi Liu, Haorui Wang, Zhen Qin, Feng Han, Jialu Liu, Simon Baumgartner, Michael Bendersky, Chao Zhang
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Large Language Models (LLMs) have exhibited impressive capabilities in
+various tasks, yet their vast parameter sizes restrict their applicability in
+resource-constrained settings. Knowledge distillation (KD) offers a viable
+solution by transferring expertise from large teacher models to compact student
+models. However, traditional KD techniques face specific challenges when
+applied to LLMs, including restricted access to LLM outputs, significant
+teacher-student capacity gaps, and the inherited mis-calibration issue. In this
+work, we present PLaD, a novel preference-based LLM distillation framework.
+PLaD exploits the teacher-student capacity discrepancy to generate
+pseudo-preference pairs where teacher outputs are preferred over student
+outputs. Then, PLaD leverages a ranking loss to re-calibrate student's
+estimation of sequence likelihood, which steers the student's focus towards
+understanding the relative quality of outputs instead of simply imitating the
+teacher. PLaD bypasses the need for access to teacher LLM's internal states,
+tackles the student's expressivity limitations, and mitigates the student
+mis-calibration issue. Through extensive experiments on two sequence generation
+tasks and with various LLMs, we demonstrate the effectiveness of our proposed
+PLaD framework.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Findings of ACL 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Outdated Issue Aware Decoding for Factual Knowledge Editing <span class="chip">ACL2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.02882v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.02882v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Zengkui Sun, Yijin Liu, Jiaan Wang, Fandong Meng, Jinan Xu, Yufeng Chen, Jie Zhou
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Recently, Knowledge Editing has received increasing attention, since it could
+update the specific knowledge from outdated ones in pretrained models without
+re-training. However, as pointed out by recent studies, existing related
+methods tend to merely memorize the superficial word composition of the edited
+knowledge, rather than truly learning and absorbing it. Consequently, on the
+reasoning questions, we discover that existing methods struggle to utilize the
+edited knowledge to reason the new answer, and tend to retain outdated
+responses, which are generated by the original models utilizing original
+knowledge. Nevertheless, the outdated responses are unexpected for the correct
+answers to reasoning questions, which we named as the outdated issue. To
+alleviate this issue, in this paper, we propose a simple yet effective decoding
+strategy, i.e., outDated ISsue aware deCOding (DISCO), to enhance the
+performance of edited models on reasoning questions. Specifically, we capture
+the difference in the probability distribution between the original and edited
+models. Further, we amplify the difference of the token prediction in the
+edited model to alleviate the outdated issue, and thus enhance the model
+performance w.r.t the edited knowledge. Experimental results suggest that
+applying DISCO could enhance edited models to reason, e.g., on reasoning
+questions, DISCO outperforms the prior SOTA method by 12.99 F1 scores, and
+reduces the ratio of the outdated issue to 5.78% on the zsRE dataset.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>ACL2024 Findings</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ LCS: A Language Converter Strategy for Zero-Shot Neural Machine
+  Translation <span class="chip">ACL2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.02876v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.02876v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Zengkui Sun, Yijin Liu, Fandong Meng, Jinan Xu, Yufeng Chen, Jie Zhou
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Multilingual neural machine translation models generally distinguish
+translation directions by the language tag (LT) in front of the source or
+target sentences. However, current LT strategies cannot indicate the desired
+target language as expected on zero-shot translation, i.e., the off-target
+issue. Our analysis reveals that the indication of the target language is
+sensitive to the placement of the target LT. For example, when placing the
+target LT on the decoder side, the indication would rapidly degrade along with
+decoding steps, while placing the target LT on the encoder side would lead to
+copying or paraphrasing the source input. To address the above issues, we
+propose a simple yet effective strategy named Language Converter Strategy
+(LCS). By introducing the target language embedding into the top encoder
+layers, LCS mitigates confusion in the encoder and ensures stable language
+indication for the decoder. Experimental results on MultiUN, TED, and OPUS-100
+datasets demonstrate that LCS could significantly mitigate the off-target
+issue, with language accuracy up to 95.28%, 96.21%, and 85.35% meanwhile
+outperforming the vanilla LT strategy by 3.07, 3,3, and 7.93 BLEU scores on
+zero-shot translation, respectively.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>ACL2024 Findings</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ NUMCoT: Numerals and Units of Measurement in Chain-of-Thought Reasoning
+  using Large Language Models <span class="chip">ACL 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.02864v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.02864v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Ancheng Xu, Minghuan Tan, Lei Wang, Min Yang, Ruifeng Xu
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Numeral systems and units of measurement are two conjoined topics in
+activities of human beings and have mutual effects with the languages
+expressing them. Currently, the evaluation of Large Language Models (LLMs)
+often involves mathematical reasoning, yet little attention is given to how
+minor changes in numbers or units can drastically alter the complexity of
+problems and the performance of LLMs. In this paper, we scrutinize existing
+LLMs on processing of numerals and units of measurement by constructing
+datasets with perturbations. We first anatomize the reasoning of math word
+problems to different sub-procedures like numeral conversions from language to
+numbers and measurement conversions based on units. Then we further annotate
+math word problems from ancient Chinese arithmetic works which are challenging
+in numerals and units of measurement. Experiments on perturbed datasets
+demonstrate that LLMs still encounter difficulties in handling numeral and
+measurement conversions.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Findings of ACL 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ LLM as a Scorer: The Impact of Output Order on Dialogue Evaluation <span class="chip">AAAI 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.02863v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.02863v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Yi-Pei Chen, KuanChao Chu, Hideki Nakayama
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  This research investigates the effect of prompt design on dialogue evaluation
+using large language models (LLMs). While LLMs are increasingly used for
+scoring various inputs, creating effective prompts for dialogue evaluation
+remains challenging due to model sensitivity and subjectivity in dialogue
+assessments. Our study experimented with different prompt structures, altering
+the sequence of output instructions and including explanatory reasons. We found
+that the order of presenting reasons and scores significantly influences LLMs'
+scoring, with a "reason-first" approach yielding more comprehensive
+evaluations. This insight is crucial for enhancing the accuracy and consistency
+of LLM-based evaluations.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Presented in AAAI 2024 Spring Symposium. The first two authors
+  contributed equally</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ The Heuristic Core: Understanding Subnetwork Generalization in
+  <span class="highlight-title">Pretrain</span>ed Language Models <span class="chip">ACL 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2403.03942v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2403.03942v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Adithya Bhaskar, Dan Friedman, Danqi Chen
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Prior work has found that pretrained language models (LMs) fine-tuned with
+different random seeds can achieve similar in-domain performance but generalize
+differently on tests of syntactic generalization. In this work, we show that,
+even within a single model, we can find multiple subnetworks that perform
+similarly in-domain, but generalize vastly differently. To better understand
+these phenomena, we investigate if they can be understood in terms of
+"competing subnetworks": the model initially represents a variety of distinct
+algorithms, corresponding to different subnetworks, and generalization occurs
+when it ultimately converges to one. This explanation has been used to account
+for generalization in simple algorithmic tasks ("grokking"). Instead of finding
+competing subnetworks, we find that all subnetworks -- whether they generalize
+or not -- share a set of attention heads, which we refer to as the heuristic
+core. Further analysis suggests that these attention heads emerge early in
+training and compute shallow, non-generalizing features. The model learns to
+generalize by incorporating additional attention heads, which depend on the
+outputs of the "heuristic" heads to compute higher-level features. Overall, our
+results offer a more detailed picture of the mechanisms for syntactic
+generalization in pretrained LMs.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted to ACL 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ RAFT: Adapting Language Model to Domain Specific RAG 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2403.10131v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2403.10131v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Tianjun Zhang, Shishir G. Patil, Naman Jain, Sheng Shen, Matei Zaharia, Ion Stoica, Joseph E. Gonzalez
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Pretraining Large Language Models (LLMs) on large corpora of textual data is
+now a standard paradigm. When using these LLMs for many downstream
+applications, it is common to additionally bake in new knowledge (e.g.,
+time-critical news, or private domain knowledge) into the pretrained model
+either through RAG-based-prompting, or fine-tuning. However, the optimal
+methodology for the model to gain such new knowledge remains an open question.
+In this paper, we present Retrieval Augmented FineTuning (RAFT), a training
+recipe that improves the model's ability to answer questions in a "open-book"
+in-domain settings. In RAFT, given a question, and a set of retrieved
+documents, we train the model to ignore those documents that don't help in
+answering the question, which we call, distractor documents. RAFT accomplishes
+this by citing verbatim the right sequence from the relevant document that
+would help answer the question. This coupled with RAFT's chain-of-thought-style
+response helps improve the model's ability to reason. In domain-specific RAG,
+RAFT consistently improves the model's performance across PubMed, HotpotQA, and
+Gorilla datasets, presenting a post-training recipe to improve pre-trained LLMs
+to in-domain RAG. RAFT's code and demo are open-sourced at
+github.com/ShishirPatil/gorilla.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ SPIN: Sparsifying and Integrating Internal Neurons in Large Language
+  Models for Text Classification <span class="chip">SP</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2311.15983v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2311.15983v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Difan Jiao, Yilun Liu, Zhenwei Tang, Daniel Matter, Jürgen Pfeffer, Ashton Anderson
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Among the many tasks that Large Language Models (LLMs) have revolutionized is
+text classification. Current text classification paradigms, however, rely
+solely on the output of the final layer in the LLM, with the rich information
+contained in internal neurons largely untapped. In this study, we present SPIN:
+a model-agnostic framework that sparsifies and integrates internal neurons of
+intermediate layers of LLMs for text classification. Specifically, SPIN
+sparsifies internal neurons by linear probing-based salient neuron selection
+layer by layer, avoiding noise from unrelated neurons and ensuring efficiency.
+The cross-layer salient neurons are then integrated to serve as multi-layered
+features for the classification head. Extensive experimental results show our
+proposed SPIN significantly improves text classification accuracy, efficiency,
+and interpretability.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>17 pages, 7 figures, 12 tables Code available at
+  https://github.com/difanj0713/SPIN</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ NEO-BENCH: Evaluating Robustness of Large Language Models with
+  Neologisms <span class="chip">ACL 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2402.12261v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2402.12261v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Jonathan Zheng, Alan Ritter, Wei Xu
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  The performance of Large Language Models (LLMs) degrades from the temporal
+drift between data used for model training and newer text seen during
+inference. One understudied avenue of language change causing data drift is the
+emergence of neologisms -- new word forms -- over time. We create a diverse
+resource of recent English neologisms by using several popular collection
+methods. We analyze temporal drift using neologisms by comparing sentences
+containing new words with near-identical sentences that replace neologisms with
+existing substitute words. Model performance is nearly halved in machine
+translation when a single neologism is introduced in a sentence. Motivated by
+these results, we construct a benchmark to evaluate LLMs' ability to generalize
+to neologisms with various natural language understanding tasks and model
+perplexity. Models with later knowledge cutoff dates yield lower perplexities
+and perform better in downstream tasks. LLMs are also affected differently
+based on the linguistic origins of words, indicating that neologisms are
+complex for static LLMs to address. We will release our benchmark and code for
+reproducing our experiments.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>accepted to ACL 2024 main conference, 9 pages</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ PartialFormer: Modeling Part Instead of Whole for Machine Translation <span class="chip">ACL2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2310.14921v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2310.14921v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Tong Zheng, Bei Li, Huiwen Bao, Jiale Wang, Weiqiao Shan, Tong Xiao, Jingbo Zhu
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  The design choices in Transformer feed-forward neural networks have resulted
+in significant computational and parameter overhead. In this work, we emphasize
+the importance of hidden dimensions in designing lightweight FFNs, a factor
+often overlooked in previous architectures. Guided by this principle, we
+introduce PartialFormer, a parameter-efficient Transformer architecture
+utilizing multiple smaller FFNs to reduce parameters and computation while
+maintaining essential hidden dimensions. These smaller FFNs are integrated into
+a multi-head attention mechanism for effective collaboration. We also propose a
+tailored head scaling strategy to enhance PartialFormer's capabilities.
+Furthermore, we present a residual-like attention calculation to improve depth
+scaling within PartialFormer. Extensive experiments on 9 translation tasks and
+1 abstractive summarization task validate the effectiveness of our
+PartialFormer approach on machine translation and summarization tasks. Our code
+would be available at: https://github.com/zhengkid/PartialFormer.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted by ACL2024 Findings</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ SaySelf: Teaching LLMs to Express Confidence with Self-Reflective
+  Rationales 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20974v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20974v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Tianyang Xu, Shujin Wu, Shizhe Diao, Xiaoze Liu, Xingyao Wang, Yangyi Chen, Jing Gao
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Large language models (LLMs) often generate inaccurate or fabricated
+information and generally fail to indicate their confidence, which limits their
+broader applications. Previous work elicits confidence from LLMs by direct or
+self-consistency prompting, or constructing specific datasets for supervised
+finetuning. The prompting-based approaches have inferior performance, and the
+training-based approaches are limited to binary or inaccurate group-level
+confidence estimates. In this work, we present the advanced SaySelf, a training
+framework that teaches LLMs to express more accurate fine-grained confidence
+estimates. In addition, beyond the confidence scores, SaySelf initiates the
+process of directing LLMs to produce self-reflective rationales that clearly
+identify gaps in their parametric knowledge and explain their uncertainty. This
+is achieved by using an LLM to automatically summarize the uncertainties in
+specific knowledge via natural language. The summarization is based on the
+analysis of the inconsistency in multiple sampled reasoning chains, and the
+resulting data is utilized for supervised fine-tuning. Moreover, we utilize
+reinforcement learning with a meticulously crafted reward function to calibrate
+the confidence estimates, motivating LLMs to deliver accurate, high-confidence
+predictions and to penalize overconfidence in erroneous outputs. Experimental
+results in both in-distribution and out-of-distribution datasets demonstrate
+the effectiveness of SaySelf in reducing the confidence calibration error and
+maintaining the task performance. We show that the generated self-reflective
+rationales are reasonable and can further contribute to the calibration. The
+code is made public at https://github.com/xu1868/SaySelf.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>The code is available at https://github.com/xu1868/SaySelf</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Stealthy Attack on Large Language Model based Recommendation <span class="chip">ACL 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2402.14836v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2402.14836v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Jinghao Zhang, Yuting Liu, Qiang Liu, Shu Wu, Guibing Guo, Liang Wang
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Recently, the powerful large language models (LLMs) have been instrumental in
+propelling the progress of recommender systems (RS). However, while these
+systems have flourished, their susceptibility to security threats has been
+largely overlooked. In this work, we reveal that the introduction of LLMs into
+recommendation models presents new security vulnerabilities due to their
+emphasis on the textual content of items. We demonstrate that attackers can
+significantly boost an item's exposure by merely altering its textual content
+during the testing phase, without requiring direct interference with the
+model's training process. Additionally, the attack is notably stealthy, as it
+does not affect the overall recommendation performance and the modifications to
+the text are subtle, making it difficult for users and platforms to detect. Our
+comprehensive experiments across four mainstream LLM-based recommendation
+models demonstrate the superior efficacy and stealthiness of our approach. Our
+work unveils a significant security gap in LLM-based recommendation systems and
+paves the way for future research on protecting these systems.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>ACL 2024 Main</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Context versus Prior Knowledge in Language Models <span class="chip">ACL 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2404.04633v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2404.04633v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Kevin Du, Vésteinn Snæbjarnarson, Niklas Stoehr, Jennifer C. White, Aaron Schein, Ryan Cotterell
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  To answer a question, language models often need to integrate prior knowledge
+learned during pretraining and new information presented in context. We
+hypothesize that models perform this integration in a predictable way across
+different questions and contexts: models will rely more on prior knowledge for
+questions about entities (e.g., persons, places, etc.) that they are more
+familiar with due to higher exposure in the training corpus, and be more easily
+persuaded by some contexts than others. To formalize this problem, we propose
+two mutual information-based metrics to measure a model's dependency on a
+context and on its prior about an entity: first, the persuasion score of a
+given context represents how much a model depends on the context in its
+decision, and second, the susceptibility score of a given entity represents how
+much the model can be swayed away from its original answer distribution about
+an entity. We empirically test our metrics for their validity and reliability.
+Finally, we explore and find a relationship between the scores and the model's
+expected familiarity with an entity, and provide two use cases to illustrate
+their benefits.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Long paper accepted at ACL 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Improved Techniques for Optimization-Based Jailbreaking on Large
+  Language Models 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.21018v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.21018v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Xiaojun Jia, Tianyu Pang, Chao Du, Yihao Huang, Jindong Gu, Yang Liu, Xiaochun Cao, Min Lin
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Large language models (LLMs) are being rapidly developed, and a key component
+of their widespread deployment is their safety-related alignment. Many
+red-teaming efforts aim to jailbreak LLMs, where among these efforts, the
+Greedy Coordinate Gradient (GCG) attack's success has led to a growing interest
+in the study of optimization-based jailbreaking techniques. Although GCG is a
+significant milestone, its attacking efficiency remains unsatisfactory. In this
+paper, we present several improved (empirical) techniques for
+optimization-based jailbreaks like GCG. We first observe that the single target
+template of "Sure" largely limits the attacking performance of GCG; given this,
+we propose to apply diverse target templates containing harmful self-suggestion
+and/or guidance to mislead LLMs. Besides, from the optimization aspects, we
+propose an automatic multi-coordinate updating strategy in GCG (i.e.,
+adaptively deciding how many tokens to replace in each step) to accelerate
+convergence, as well as tricks like easy-to-hard initialisation. Then, we
+combine these improved technologies to develop an efficient jailbreak method,
+dubbed I-GCG. In our experiments, we evaluate on a series of benchmarks (such
+as NeurIPS 2023 Red Teaming Track). The results demonstrate that our improved
+techniques can help GCG outperform state-of-the-art jailbreaking attacks and
+achieve nearly 100% attack success rate. The code is released at
+https://github.com/jiaxiaojunQAQ/I-GCG.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ NeuroPrune: A Neuro-inspired Topological Sparse Training Algorithm for
+  Large Language Models <span class="chip">ACL 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2404.01306v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2404.01306v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Amit Dhurandhar, Tejaswini Pedapati, Ronny Luss, Soham Dan, Aurelie Lozano, Payel Das, Georgios Kollias
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Transformer-based Language Models have become ubiquitous in Natural Language
+Processing (NLP) due to their impressive performance on various tasks. However,
+expensive training as well as inference remains a significant impediment to
+their widespread applicability. While enforcing sparsity at various levels of
+the model architecture has found promise in addressing scaling and efficiency
+issues, there remains a disconnect between how sparsity affects network
+topology. Inspired by brain neuronal networks, we explore sparsity approaches
+through the lens of network topology. Specifically, we exploit mechanisms seen
+in biological networks, such as preferential attachment and redundant synapse
+pruning, and show that principled, model-agnostic sparsity approaches are
+performant and efficient across diverse NLP tasks, spanning both classification
+(such as natural language inference) and generation (summarization, machine
+translation), despite our sole objective not being optimizing performance.
+NeuroPrune is competitive with (or sometimes superior to) baselines on
+performance and can be up to $10$x faster in terms of training time for a given
+level of sparsity, simultaneously exhibiting measurable improvements in
+inference time in many cases.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted at ACL 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Ranking Large Language Models without Ground Truth <span class="chip">ACL 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2402.14860v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2402.14860v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Amit Dhurandhar, Rahul Nair, Moninder Singh, Elizabeth Daly, Karthikeyan Natesan Ramamurthy
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Evaluation and ranking of large language models (LLMs) has become an
+important problem with the proliferation of these models and their impact.
+Evaluation methods either require human responses which are expensive to
+acquire or use pairs of LLMs to evaluate each other which can be unreliable. In
+this paper, we provide a novel perspective where, given a dataset of prompts
+(viz. questions, instructions, etc.) and a set of LLMs, we rank them without
+access to any ground truth or reference responses. Inspired by real life where
+both an expert and a knowledgeable person can identify a novice our main idea
+is to consider triplets of models, where each one of them evaluates the other
+two, correctly identifying the worst model in the triplet with high
+probability. We also analyze our idea and provide sufficient conditions for it
+to succeed. Applying this idea repeatedly, we propose two methods to rank LLMs.
+In experiments on different generative tasks (summarization, multiple-choice,
+and dialog), our methods reliably recover close to true rankings without
+reference data. This points to a viable low-resource mechanism for practical
+use.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted to ACL 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Computational Approaches for Integrating out Subjectivity in Cognate
+  Synonym Selection 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2404.19328v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2404.19328v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Luise Häuser, Gerhard Jäger, Alexandros Stamatakis
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Working with cognate data involves handling synonyms, that is, multiple words
+that describe the same concept in a language. In the early days of language
+phylogenetics it was recommended to select one synonym only. However, as we
+show here, binary character matrices, which are used as input for computational
+methods, do allow for representing the entire dataset including all synonyms.
+Here we address the question how one can and if one should include all synonyms
+or whether it is preferable to select synonyms a priori. To this end, we
+perform maximum likelihood tree inferences with the widely used RAxML-NG tool
+and show that it yields plausible trees when all synonyms are used as input.
+Furthermore, we show that a priori synonym selection can yield topologically
+substantially different trees and we therefore advise against doing so. To
+represent cognate data including all synonyms, we introduce two types of
+character matrices beyond the standard binary ones: probabilistic binary and
+probabilistic multi-valued character matrices. We further show that it is
+dataset-dependent for which character matrix type the inferred RAxML-NG tree is
+topologically closest to the gold standard. We also make available a Python
+interface for generating all of the above character matrix types for cognate
+data provided in CLDF format.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Experiments available on GitHub
+  (https://github.com/luisevonderwiese/synonyms,
+  https://github.com/luisevonderwiese/lingdata)</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ CR-UTP: Certified Robustness against Universal Text Perturbations on
+  Large Language Models <span class="chip">ACL</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.01873v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.01873v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Qian Lou, Xin Liang, Jiaqi Xue, Yancheng Zhang, Rui Xie, Mengxin Zheng
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  It is imperative to ensure the stability of every prediction made by a
+language model; that is, a language's prediction should remain consistent
+despite minor input variations, like word substitutions. In this paper, we
+investigate the problem of certifying a language model's robustness against
+Universal Text Perturbations (UTPs), which have been widely used in universal
+adversarial attacks and backdoor attacks. Existing certified robustness based
+on random smoothing has shown considerable promise in certifying the
+input-specific text perturbations (ISTPs), operating under the assumption that
+any random alteration of a sample's clean or adversarial words would negate the
+impact of sample-wise perturbations. However, with UTPs, masking only the
+adversarial words can eliminate the attack. A naive method is to simply
+increase the masking ratio and the likelihood of masking attack tokens, but it
+leads to a significant reduction in both certified accuracy and the certified
+radius due to input corruption by extensive masking. To solve this challenge,
+we introduce a novel approach, the superior prompt search method, designed to
+identify a superior prompt that maintains higher certified accuracy under
+extensive masking. Additionally, we theoretically motivate why ensembles are a
+particularly suitable choice as base prompts for random smoothing. The method
+is denoted by superior prompt ensembling technique. We also empirically confirm
+this technique, obtaining state-of-the-art results in multiple settings. These
+methodologies, for the first time, enable high certified accuracy against both
+UTPs and ISTPs. The source code of CR-UTP is available at \url
+{https://github.com/UCFML-Research/CR-UTP}.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted by ACL Findings 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Demonstrating Mutual Reinforcement Effect through Information Flow 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2403.02902v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2403.02902v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Chengguang Gan, Xuzheng He, Qinghao Zhang, Tatsunori Mori
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  The Mutual Reinforcement Effect (MRE) investigates the synergistic
+relationship between word-level and text-level classifications in text
+classification tasks. It posits that the performance of both classification
+levels can be mutually enhanced. However, this mechanism has not been
+adequately demonstrated or explained in prior research. To address this gap, we
+employ information flow analysis to observe and substantiate the MRE theory.
+Our experiments on six MRE hybrid datasets revealed the presence of MRE in the
+model and its impact. Additionally, we conducted fine-tuning experiments, whose
+results were consistent with those of the information flow experiments. The
+convergence of findings from both experiments corroborates the existence of
+MRE. Furthermore, we extended the application of MRE to prompt learning,
+utilizing word-level information as a verbalizer to bolster the model's
+prediction of text-level classification labels. In our final experiment, the
+F1-score significantly surpassed the baseline in five out of six datasets,
+further validating the notion that word-level information enhances the language
+model's comprehension of the text as a whole.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>The co-authors have requested that the manuscript be withdrawn. And
+  the paper has major flaws</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Learning to Edit: Aligning LLMs with Knowledge Editing <span class="chip">ACL 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2402.11905v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2402.11905v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Yuxin Jiang, Yufei Wang, Chuhan Wu, Wanjun Zhong, Xingshan Zeng, Jiahui Gao, Liangyou Li, Xin Jiang, Lifeng Shang, Ruiming Tang, Qun Liu, Wei Wang
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Knowledge editing techniques, aiming to efficiently modify a minor proportion
+of knowledge in large language models (LLMs) without negatively impacting
+performance across other inputs, have garnered widespread attention. However,
+existing methods predominantly rely on memorizing the updated knowledge,
+impeding LLMs from effectively combining the new knowledge with their inherent
+knowledge when answering questions. To this end, we propose a Learning to Edit
+(LTE) framework, focusing on teaching LLMs to apply updated knowledge into
+input questions, inspired by the philosophy of "Teach a man to fish." LTE
+features a two-phase process: (i) the Alignment Phase, which fine-tunes LLMs on
+a meticulously curated parallel dataset to make reliable, in-scope edits while
+preserving out-of-scope information and linguistic proficiency; and (ii) the
+Inference Phase, which employs a retrieval-based mechanism for real-time and
+mass knowledge editing. By comparing our approach with seven advanced baselines
+across four popular knowledge editing benchmarks and two LLM architectures, we
+demonstrate LTE's superiority in knowledge editing performance, robustness in
+both batch and sequential editing, minimal interference on general tasks, and
+rapid editing speeds. The data and code are available at
+https://github.com/YJiangcm/LTE.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>17 pages, 8 figures, 9 tables. ACL 2024 main camera-ready version</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Luna: An Evaluation Foundation Model to Catch Language Model
+  Hallucinations with High Accuracy and Low Cost 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.00975v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.00975v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Masha Belyi, Robert Friel, Shuai Shao, Atindriyo Sanyal
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Retriever Augmented Generation (RAG) systems have become pivotal in enhancing
+the capabilities of language models by incorporating external knowledge
+retrieval mechanisms. However, a significant challenge in deploying these
+systems in industry applications is the detection and mitigation of
+hallucinations: instances where the model generates information that is not
+grounded in the retrieved context. Addressing this issue is crucial for
+ensuring the reliability and accuracy of responses generated by large language
+models (LLMs) in diverse industry settings. Current hallucination detection
+techniques fail to deliver accuracy, low latency, and low cost simultaneously.
+We introduce Luna: a DeBERTA-large (440M) encoder, finetuned for hallucination
+detection in RAG settings. We demonstrate that Luna outperforms GPT-3.5 and
+commercial evaluation frameworks on the hallucination detection task, with 97%
+and 91% reduction in cost and latency, respectively. Luna is lightweight and
+generalizes across multiple industry verticals and out-of-domain data, making
+it an ideal candidate for industry LLM applications.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ FollowBench: A Multi-level Fine-grained Constraints Following Benchmark
+  for Large Language Models <span class="chip">ACL 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2310.20410v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2310.20410v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Yuxin Jiang, Yufei Wang, Xingshan Zeng, Wanjun Zhong, Liangyou Li, Fei Mi, Lifeng Shang, Xin Jiang, Qun Liu, Wei Wang
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  The ability to follow instructions is crucial for Large Language Models
+(LLMs) to handle various real-world applications. Existing benchmarks primarily
+focus on evaluating pure response quality, rather than assessing whether the
+response follows constraints stated in the instruction. To fill this research
+gap, in this paper, we propose FollowBench, a Multi-level Fine-grained
+Constraints Following Benchmark for LLMs. FollowBench comprehensively includes
+five different types (i.e., Content, Situation, Style, Format, and Example) of
+fine-grained constraints. To enable a precise constraint following estimation
+on diverse difficulties, we introduce a Multi-level mechanism that
+incrementally adds a single constraint to the initial instruction at each
+increased level. To assess whether LLMs' outputs have satisfied every
+individual constraint, we propose to prompt strong LLMs with
+constraint-evolution paths to handle challenging open-ended instructions. By
+evaluating 13 closed-source and open-source popular LLMs on FollowBench, we
+highlight the weaknesses of LLMs in instruction following and point towards
+potential avenues for future work. The data and code are publicly available at
+https://github.com/YJiangcm/FollowBench.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>22 pages, 11 figures, 16 tables. ACL 2024 main camera-ready version</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Entity Matching using Large Language Models 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2310.11244v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2310.11244v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Ralph Peeters, Christian Bizer
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Entity Matching is the task of deciding whether two entity descriptions refer
+to the same real-world entity and is a central step in most data integration
+pipelines. Many state-of-the-art entity matching methods rely on pre-trained
+language models (PLMs) such as BERT or RoBERTa. Two major drawbacks of these
+models for entity matching are that (i) the models require significant amounts
+of task-specific training data and (ii) the fine-tuned models are not robust
+concerning out-of-distribution entities. This paper investigates using
+generative large language models (LLMs) as a less task-specific training
+data-dependent and more robust alternative to PLM-based matchers. Our study
+covers hosted and open-source LLMs, which can be run locally. We evaluate these
+models in a zero-shot scenario and a scenario where task-specific training data
+is available. We compare different prompt designs and the prompt sensitivity of
+the models and show that there is no single best prompt but needs to be tuned
+for each model/dataset combination. We further investigate (i) the selection of
+in-context demonstrations, (ii) the generation of matching rules, as well as
+(iii) fine-tuning a hosted LLM using the same pool of training data. Our
+experiments show that the best LLMs require no or only a few training examples
+to perform similarly to PLMs that were fine-tuned using thousands of examples.
+LLM-based matchers further exhibit higher robustness to unseen entities. We
+show that GPT4 can generate structured explanations for matching decisions. The
+model can automatically identify potential causes of matching errors by
+analyzing explanations of wrong decisions. We demonstrate that the model can
+generate meaningful textual descriptions of the identified error classes, which
+can help data engineers improve entity matching pipelines.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ I-LLM: Efficient Integer-Only Inference for Fully-Quantized Low-Bit
+  Large Language Models 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.17849v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.17849v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Xing Hu, Yuan Cheng, Dawei Yang, Zhihang Yuan, Jiangyong Yu, Chen Xu, Sifan Zhou
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Post-training quantization (PTQ) serves as a potent technique to accelerate
+the inference of large language models (LLMs). Nonetheless, existing works
+still necessitate a considerable number of floating-point (FP) operations
+during inference, including additional quantization and de-quantization, as
+well as non-linear operators such as RMSNorm and Softmax. This limitation
+hinders the deployment of LLMs on the edge and cloud devices. In this paper, we
+identify the primary obstacle to integer-only quantization for LLMs lies in the
+large fluctuation of activations across channels and tokens in both linear and
+non-linear operations. To address this issue, we propose I-LLM, a novel
+integer-only fully-quantized PTQ framework tailored for LLMs. Specifically, (1)
+we develop Fully-Smooth Block-Reconstruction (FSBR) to aggressively smooth
+inter-channel variations of all activations and weights. (2) to alleviate
+degradation caused by inter-token variations, we introduce a novel approach
+called Dynamic Integer-only MatMul (DI-MatMul). This method enables dynamic
+quantization in full-integer matrix multiplication by dynamically quantizing
+the input and outputs with integer-only operations. (3) we design
+DI-ClippedSoftmax, DI-Exp, and DI-Normalization, which utilize bit shift to
+execute non-linear operators efficiently while maintaining accuracy. The
+experiment shows that our I-LLM achieves comparable accuracy to the FP baseline
+and outperforms non-integer quantization methods. For example, I-LLM can
+operate at W4A4 with negligible loss of accuracy. To our knowledge, we are the
+first to bridge the gap between integer-only quantization and LLMs. We've
+published our code on anonymous.4open.science, aiming to contribute to the
+advancement of this field.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Large Language Models Can Infer Psychological Dispositions of Social
+  Media Users 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2309.08631v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2309.08631v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Heinrich Peters, Sandra Matz
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Large Language Models (LLMs) demonstrate increasingly human-like abilities
+across a wide variety of tasks. In this paper, we investigate whether LLMs like
+ChatGPT can accurately infer the psychological dispositions of social media
+users and whether their ability to do so varies across socio-demographic
+groups. Specifically, we test whether GPT-3.5 and GPT-4 can derive the Big Five
+personality traits from users' Facebook status updates in a zero-shot learning
+scenario. Our results show an average correlation of r = .29 (range = [.22,
+.33]) between LLM-inferred and self-reported trait scores - a level of accuracy
+that is similar to that of supervised machine learning models specifically
+trained to infer personality. Our findings also highlight heterogeneity in the
+accuracy of personality inferences across different age groups and gender
+categories: predictions were found to be more accurate for women and younger
+individuals on several traits, suggesting a potential bias stemming from the
+underlying training data or differences in online self-expression. The ability
+of LLMs to infer psychological dispositions from user-generated text has the
+potential to democratize access to cheap and scalable psychometric assessments
+for both researchers and practitioners. On the one hand, this democratization
+might facilitate large-scale research of high ecological validity and spark
+innovation in personalized services. On the other hand, it also raises ethical
+concerns regarding user privacy and self-determination, highlighting the need
+for stringent ethical frameworks and regulation.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Active Preference Optimization for Sample Efficient RLHF 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2402.10500v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2402.10500v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Nirjhar Das, Souradip Chakraborty, Aldo Pacchiano, Sayak Ray Chowdhury
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Reinforcement Learning from Human Feedback (RLHF) is pivotal in aligning
+Large Language Models (LLMs) with human preferences. Although aligned
+generative models have shown remarkable abilities in various tasks, their
+reliance on high-quality human preference data creates a costly bottleneck in
+the practical application of RLHF. One primary reason is that current methods
+rely on uniformly picking prompt-generation pairs from a dataset of
+prompt-generations, to collect human feedback, resulting in sub-optimal
+alignment under a constrained budget, which highlights the criticality of
+adaptive strategies in efficient alignment. Recent works [Mehta et al., 2023,
+Muldrew et al., 2024] have tried to address this problem by designing various
+heuristics based on generation uncertainty. However, either the assumptions in
+[Mehta et al., 2023] are restrictive, or [Muldrew et al., 2024] do not provide
+any rigorous theoretical guarantee. To address these, we reformulate RLHF
+within contextual preference bandit framework, treating prompts as contexts,
+and develop an active-learning algorithm, $\textit{Active Preference
+Optimization}$ ($\texttt{APO}$), which enhances model alignment by querying
+preference data from the most important samples, achieving superior performance
+for small sample budget. We analyze the theoretical performance guarantees of
+$\texttt{APO}$ under the BTL preference model showing that the suboptimality
+gap of the policy learned via $\texttt{APO}$ scales as $O(1/\sqrt{T})$ for a
+budget of $T$. We also show that collecting preference data by choosing prompts
+randomly leads to a policy that suffers a constant sub-optimality. We perform
+detailed experimental evaluations on practical preference datasets to validate
+$\texttt{APO}$'s efficacy over the existing methods, establishing it as a
+sample-efficient and practical solution of alignment in a cost-effective and
+scalable manner.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>New experimental results added. Some reorganization</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ LlamaCare: A Large Medical Language Model for Enhancing Healthcare
+  Knowledge Sharing 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.02350v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.02350v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Maojun Sun
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Large language models (LLMs) have shown amazing capabilities in knowledge
+memorization and the present. However, when it comes to domain-specific
+knowledge and downstream tasks like medical, general LLMs are often unable to
+give precise answers. In addition, when people want LLMs to answer
+classification questions, they usually go through instruction tuning first.
+However, LLMs do not always give a direct index of the categorization after
+instruction tuning. In this paper, we proposed LlamaCare, a fine-tuned medical
+language model, and Extended Classification Integration(ECI), a module to
+handle classification problems of LLMs. Our contributions are : (i) We
+fine-tuned a large language model of medical knowledge with very low carbon
+emissions and achieved similar performance with ChatGPT by a 24G GPU. (ii) We
+solved the problem of redundant categorical answers and improved the
+performance of LLMs by proposing a new module called Extended Classification
+Integration. (iii) We released our processed data for one-shot and few-shot
+training for some benchmarks such as PubMedQA and USMLE 1-3 step. Our method
+achieves a close performance comparable to some state-of-the-art models with
+the same quantity of parameters on benchmarks, while being more environmentally
+friendly by using less GPU computation time. Our models, codes, and datasets
+can be found at \url{https://github.com/Stephen-SMJ/LLamaCare}.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ All Language Models Large and Small 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2402.12061v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2402.12061v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Zhixun Chen, Yali Du, David Mguni
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Many leading language models (LMs) use high-intensity computational resources
+both during training and execution. This poses the challenge of lowering
+resource costs for deployment and faster execution of decision-making tasks
+among others. We introduce a novel plug-and-play LM framework named Language
+Optimising Network Distribution (LONDI) framework. LONDI learns to selectively
+employ large LMs only where complex decision-making and reasoning are required
+while using low-resource LMs (i.e. LMs require less GPU usage, but may not be
+able to solve the problem alone) everywhere else. LONDI consists of a system of
+two (off-)policy networks, an LM, a large LM (LLM), and a reinforcement
+learning module that uses switching controls to quickly learn which system
+states to call the LLM. We then introduce a variant of LONDI that maintains
+budget constraints on LLM calls and hence its resource usage. Theoretically, we
+prove LONDI learns the subset of system states to activate the LLM required to
+solve the task. We then prove that LONDI converges to optimal solutions while
+also preserving budgetary constraints on LLM calls almost surely enabling it to
+solve various tasks while significantly lowering computational costs. We test
+LONDI's performance in a range of tasks in ScienceWorld and BabyAI-Text and
+demonstrate that LONDI can solve tasks only solvable by resource-intensive LLMs
+while reducing GPU usage by up to 30%.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Uncertainty Estimation on Sequential Labeling via Uncertainty
+  Transmission 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2311.08726v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2311.08726v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Jianfeng He, Linlin Yu, Shuo Lei, Chang-Tien Lu, Feng Chen
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Sequential labeling is a task predicting labels for each token in a sequence,
+such as Named Entity Recognition (NER). NER tasks aim to extract entities and
+predict their labels given a text, which is important in information
+extraction. Although previous works have shown great progress in improving NER
+performance, uncertainty estimation on NER (UE-NER) is still underexplored but
+essential. This work focuses on UE-NER, which aims to estimate uncertainty
+scores for the NER predictions. Previous uncertainty estimation models often
+overlook two unique characteristics of NER: the connection between entities
+(i.e., one entity embedding is learned based on the other ones) and wrong span
+cases in the entity extraction subtask. Therefore, we propose a Sequential
+Labeling Posterior Network (SLPN) to estimate uncertainty scores for the
+extracted entities, considering uncertainty transmitted from other tokens.
+Moreover, we have defined an evaluation strategy to address the specificity of
+wrong-span cases. Our SLPN has achieved significant improvements on three
+datasets, such as a 5.54-point improvement in AUPR on the MIT-Restaurant
+dataset. Our code is available at
+\url{https://github.com/he159ok/UncSeqLabeling_SLPN}.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>13 pages, 2 figures</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Interpretability Illusions in the Generalization of Simplified Models <span class="chip">ICML 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2312.03656v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2312.03656v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Dan Friedman, Andrew Lampinen, Lucas Dixon, Danqi Chen, Asma Ghandeharioun
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  A common method to study deep learning systems is to use simplified model
+representations--for example, using singular value decomposition to visualize
+the model's hidden states in a lower dimensional space. This approach assumes
+that the results of these simplifications are faithful to the original model.
+Here, we illustrate an important caveat to this assumption: even if the
+simplified representations can accurately approximate the full model on the
+training set, they may fail to accurately capture the model's behavior out of
+distribution. We illustrate this by training Transformer models on controlled
+datasets with systematic generalization splits, including the Dyck
+balanced-parenthesis languages and a code completion task. We simplify these
+models using tools like dimensionality reduction and clustering, and then
+explicitly test how these simplified proxies match the behavior of the original
+model. We find consistent generalization gaps: cases in which the simplified
+proxies are more faithful to the original model on the in-distribution
+evaluations and less faithful on various tests of systematic generalization.
+This includes cases where the original model generalizes systematically but the
+simplified proxies fail, and cases where the simplified proxies generalize
+better. Together, our results raise questions about the extent to which
+mechanistic interpretations derived using tools like SVD can reliably predict
+what a model will do in novel situations.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>ICML 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ MCTS: A Multi-Reference Chinese Text Simplification <span class="highlight-title">Dataset</span> <span class="chip">COLING 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2306.02796v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2306.02796v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Ruining Chong, Luming Lu, Liner Yang, Jinran Nie, Zhenghao Liu, Shuo Wang, Shuhan Zhou, Yaoxin Li, Erhong Yang
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Text simplification aims to make the text easier to understand by applying
+rewriting transformations. There has been very little research on Chinese text
+simplification for a long time. The lack of generic evaluation data is an
+essential reason for this phenomenon. In this paper, we introduce MCTS, a
+multi-reference Chinese text simplification dataset. We describe the annotation
+process of the dataset and provide a detailed analysis. Furthermore, we
+evaluate the performance of several unsupervised methods and advanced large
+language models. We additionally provide Chinese text simplification parallel
+data that can be used for training, acquired by utilizing machine translation
+and English text simplification. We hope to build a basic understanding of
+Chinese text simplification through the foundational work and provide
+references for future research. All of the code and data are released at
+https://github.com/blcuicall/mcts/.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted to COLING 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Linear <span class="highlight-title">Transformer</span>s with Learnable Kernel Functions are Better
+  In-Context Models 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2402.10644v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2402.10644v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Yaroslav Aksenov, Nikita Balagansky, Sofia Maria Lo Cicero Vaina, Boris Shaposhnikov, Alexey Gorbatovski, Daniil Gavrilov
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Advancing the frontier of subquadratic architectures for Language Models
+(LMs) is crucial in the rapidly evolving field of natural language processing.
+Current innovations, including State Space Models, were initially celebrated
+for surpassing Transformer performance on language modeling tasks. However,
+these models have revealed deficiencies in essential In-Context Learning
+capabilities - a domain where the Transformer traditionally shines. The Based
+model emerged as a hybrid solution, blending a Linear Transformer with a kernel
+inspired by the Taylor expansion of exponential functions, augmented by
+convolutional networks. Mirroring the Transformer's in-context adeptness, it
+became a strong contender in the field. In our work, we present a singular,
+elegant alteration to the Based kernel that amplifies its In-Context Learning
+abilities evaluated with the Multi-Query Associative Recall task and overall
+language modeling process, as demonstrated on the Pile dataset.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ JumpCoder: Go Beyond Autoregressive Coder via Online Modification <span class="chip">ACL 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2401.07870v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2401.07870v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Mouxiang Chen, Hao Tian, Zhongxin Liu, Xiaoxue Ren, Jianling Sun
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  While existing code large language models (code LLMs) exhibit impressive
+capabilities in code generation, their autoregressive sequential generation
+inherently lacks reversibility. This limitation hinders them from timely
+correcting previous missing statements during coding as humans do, often
+leading to error propagation and suboptimal performance. We introduce
+JumpCoder, a novel model-agnostic framework that enables human-like online
+modification and non-sequential generation to augment code LLMs. The key idea
+behind JumpCoder is to insert new code into the currently generated code when
+necessary during generation, which is achieved through an auxiliary infilling
+model that works in tandem with the code LLM. Since identifying the best infill
+position beforehand is intractable, we adopt an \textit{infill-first,
+judge-later} strategy, which experiments with filling at the $k$ most critical
+positions following the generation of each line, and uses an Abstract Syntax
+Tree (AST) parser alongside the Generation Model Scoring to effectively judge
+the validity of each potential infill. Extensive experiments using six
+state-of-the-art code LLMs across multiple and multilingual benchmarks
+consistently indicate significant improvements over all baselines. Our code is
+public at https://github.com/Keytoyze/JumpCoder.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>ACL 2024 (main)</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ The Unreasonable Effectiveness of Easy Training Data for Hard Tasks <span class="chip">ACL 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2401.06751v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2401.06751v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Peter Hase, Mohit Bansal, Peter Clark, Sarah Wiegreffe
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  How can we train models to perform well on hard test data when hard training
+data is by definition difficult to label correctly? This question has been
+termed the scalable oversight problem and has drawn increasing attention as
+language models have continually improved. In this paper, we present the
+surprising conclusion that current pretrained language models often generalize
+relatively well from easy to hard data, even performing as well as oracle
+models finetuned on hard data. We demonstrate this kind of easy-to-hard
+generalization using simple finetuning methods like in-context learning, linear
+classifier heads, and QLoRA for seven different measures of datapoint hardness,
+including six empirically diverse human hardness measures (like grade level)
+and one model-based measure (loss-based). Furthermore, we show that even if one
+cares most about model performance on hard data, it can be better to collect
+easy data rather than hard data for finetuning, since hard data is generally
+noisier and costlier to collect. Our experiments use open models up to 70b in
+size and four publicly available question-answering datasets with questions
+ranging in difficulty from 3rd grade science questions to college level STEM
+questions and general-knowledge trivia. We conclude that easy-to-hard
+generalization in LMs is surprisingly strong for the tasks studied. Our code is
+available at: https://github.com/allenai/easy-to-hard-generalization
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>ACL 2024. 23 pages, 20 figures</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Towards Better Understanding of Contrastive Sentence Representation
+  Learning: A Unified Paradigm for Gradient <span class="chip">ACL 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2402.18281v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2402.18281v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Mingxin Li, Richong Zhang, Zhijie Nie
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Sentence Representation Learning (SRL) is a crucial task in Natural Language
+Processing (NLP), where contrastive Self-Supervised Learning (SSL) is currently
+a mainstream approach. However, the reasons behind its remarkable effectiveness
+remain unclear. Specifically, many studies have investigated the similarities
+between contrastive and non-contrastive SSL from a theoretical perspective.
+Such similarities can be verified in classification tasks, where the two
+approaches achieve comparable performance. But in ranking tasks (i.e., Semantic
+Textual Similarity (STS) in SRL), contrastive SSL significantly outperforms
+non-contrastive SSL. Therefore, two questions arise: First, *what commonalities
+enable various contrastive losses to achieve superior performance in STS?*
+Second, *how can we make non-contrastive SSL also effective in STS?* To address
+these questions, we start from the perspective of gradients and discover that
+four effective contrastive losses can be integrated into a unified paradigm,
+which depends on three components: the **Gradient Dissipation**, the
+**Weight**, and the **Ratio**. Then, we conduct an in-depth analysis of the
+roles these components play in optimization and experimentally demonstrate
+their significance for model performance. Finally, by adjusting these
+components, we enable non-contrastive SSL to achieve outstanding performance in
+STS.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted at ACL 2024 Main Conference</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Mitigating Hallucinations in Large Vision-Language Models with
+  Instruction Contrastive Decoding <span class="chip">ACL 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2403.18715v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2403.18715v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Xintong Wang, Jingheng Pan, Liang Ding, Chris Biemann
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Large Vision-Language Models (LVLMs) are increasingly adept at generating
+contextually detailed and coherent responses from visual inputs. However, their
+application in multimodal decision-making and open-ended generation is hindered
+by a notable rate of hallucinations, where generated text inaccurately
+represents the visual contents. To address this issue, this paper introduces
+the Instruction Contrastive Decoding (ICD) method, a novel approach designed to
+reduce hallucinations during LVLM inference. Our method is inspired by our
+observation that what we call disturbance instructions significantly exacerbate
+hallucinations in multimodal fusion modules. ICD contrasts distributions from
+standard and instruction disturbance, thereby increasing alignment uncertainty
+and effectively subtracting hallucinated concepts from the original
+distribution. Through comprehensive experiments on discriminative benchmarks
+(POPE and MME) and a generative benchmark (LLaVa-Bench), we demonstrate that
+ICD significantly mitigates both object-level and attribute-level
+hallucinations. Moreover, our method not only addresses hallucinations but also
+significantly enhances the general perception and recognition capabilities of
+LVLMs.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted to Findings of ACL 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Can Large Language Models be Good Emotional Supporter? Mitigating
+  Preference Bias on Emotional Support Conversation <span class="chip">ACL 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2402.13211v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2402.13211v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Dongjin Kang, Sunghwan Kim, Taeyoon Kwon, Seungjun Moon, Hyunsouk Cho, Youngjae Yu, Dongha Lee, Jinyoung Yeo
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Emotional Support Conversation (ESC) is a task aimed at alleviating
+individuals' emotional distress through daily conversation. Given its inherent
+complexity and non-intuitive nature, ESConv dataset incorporates support
+strategies to facilitate the generation of appropriate responses. Recently,
+despite the remarkable conversational ability of large language models (LLMs),
+previous studies have suggested that they often struggle with providing useful
+emotional support. Hence, this work initially analyzes the results of LLMs on
+ESConv, revealing challenges in selecting the correct strategy and a notable
+preference for a specific strategy. Motivated by these, we explore the impact
+of the inherent preference in LLMs on providing emotional support, and
+consequently, we observe that exhibiting high preference for specific
+strategies hinders effective emotional support, aggravating its robustness in
+predicting the appropriate strategy. Moreover, we conduct a methodological
+study to offer insights into the necessary approaches for LLMs to serve as
+proficient emotional supporters. Our findings emphasize that (1) low preference
+for specific strategies hinders the progress of emotional support, (2) external
+assistance helps reduce preference bias, and (3) existing LLMs alone cannot
+become good emotional supporters. These insights suggest promising avenues for
+future research to enhance the emotional intelligence of LLMs.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted to ACL 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ BanglaAutoKG: Automatic Bangla Knowledge Graph Construction with
+  Semantic Neural Graph Filtering <span class="chip">LREC</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2404.03528v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2404.03528v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Azmine Toushik Wasi, Taki Hasan Rafi, Raima Islam, Dong-Kyu Chae
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Knowledge Graphs (KGs) have proven essential in information processing and
+reasoning applications because they link related entities and give context-rich
+information, supporting efficient information retrieval and knowledge
+discovery; presenting information flow in a very effective manner. Despite
+being widely used globally, Bangla is relatively underrepresented in KGs due to
+a lack of comprehensive datasets, encoders, NER (named entity recognition)
+models, POS (part-of-speech) taggers, and lemmatizers, hindering efficient
+information processing and reasoning applications in the language. Addressing
+the KG scarcity in Bengali, we propose BanglaAutoKG, a pioneering framework
+that is able to automatically construct Bengali KGs from any Bangla text. We
+utilize multilingual LLMs to understand various languages and correlate
+entities and relations universally. By employing a translation dictionary to
+identify English equivalents and extracting word features from pre-trained BERT
+models, we construct the foundational KG. To reduce noise and align word
+embeddings with our goal, we employ graph-based polynomial filters. Lastly, we
+implement a GNN-based semantic filter, which elevates contextual understanding
+and trims unnecessary edges, culminating in the formation of the definitive KG.
+Empirical findings and case studies demonstrate the universal effectiveness of
+our model, capable of autonomously constructing semantically enriched KGs from
+any text.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>7 pages, 3 figures. Accepted to LREC-COLING 2024. Read in ACL
+  Anthology: https://aclanthology.org/2024.lrec-main.189/</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Experiential Co-Learning of Software-Developing Agents <span class="chip">ACL 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2312.17025v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2312.17025v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Chen Qian, Yufan Dang, Jiahao Li, Wei Liu, Zihao Xie, Yifei Wang, Weize Chen, Cheng Yang, Xin Cong, Xiaoyin Che, Zhiyuan Liu, Maosong Sun
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Recent advancements in large language models (LLMs) have brought significant
+changes to various domains, especially through LLM-driven autonomous agents. A
+representative scenario is in software development, where LLM agents
+demonstrate efficient collaboration, task division, and assurance of software
+quality, markedly reducing the need for manual involvement. However, these
+agents frequently perform a variety of tasks independently, without benefiting
+from past experiences, which leads to repeated mistakes and inefficient
+attempts in multi-step task execution. To this end, we introduce Experiential
+Co-Learning, a novel LLM-agent learning framework in which instructor and
+assistant agents gather shortcut-oriented experiences from their historical
+trajectories and use these past experiences for future task execution. The
+extensive experiments demonstrate that the framework enables agents to tackle
+unseen software-developing tasks more effectively. We anticipate that our
+insights will guide LLM agents towards enhanced autonomy and contribute to
+their evolutionary growth in cooperative learning. The code and data are
+available at https://github.com/OpenBMB/ChatDev.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted to ACL 2024, https://github.com/OpenBMB/ChatDev</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Self-Augmented In-Context Learning for Unsupervised Word Translation <span class="chip">ACL 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2402.10024v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2402.10024v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Yaoyiran Li, Anna Korhonen, Ivan Vulić
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Recent work has shown that, while large language models (LLMs) demonstrate
+strong word translation or bilingual lexicon induction (BLI) capabilities in
+few-shot setups, they still cannot match the performance of 'traditional'
+mapping-based approaches in the unsupervised scenario where no seed translation
+pairs are available, especially for lower-resource languages. To address this
+challenge with LLMs, we propose self-augmented in-context learning (SAIL) for
+unsupervised BLI: starting from a zero-shot prompt, SAIL iteratively induces a
+set of high-confidence word translation pairs for in-context learning (ICL)
+from an LLM, which it then reapplies to the same LLM in the ICL fashion. Our
+method shows substantial gains over zero-shot prompting of LLMs on two
+established BLI benchmarks spanning a wide range of language pairs, also
+outperforming mapping-based baselines across the board. In addition to
+achieving state-of-the-art unsupervised BLI performance, we also conduct
+comprehensive analyses on SAIL and discuss its limitations.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>ACL 2024 Main Conference; 11 Pages, 3 Figures, 9 Tables</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ M2SA: Multimodal and Multilingual Model for Sentiment Analysis of Tweets 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2404.01753v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2404.01753v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Gaurish Thakkar, Sherzod Hakimov, Marko Tadić
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  In recent years, multimodal natural language processing, aimed at learning
+from diverse data types, has garnered significant attention. However, there
+needs to be more clarity when it comes to analysing multimodal tasks in
+multi-lingual contexts. While prior studies on sentiment analysis of tweets
+have predominantly focused on the English language, this paper addresses this
+gap by transforming an existing textual Twitter sentiment dataset into a
+multimodal format through a straightforward curation process. Our work opens up
+new avenues for sentiment-related research within the research community.
+Additionally, we conduct baseline experiments utilising this augmented dataset
+and report the findings. Notably, our evaluations reveal that when comparing
+unimodal and multimodal configurations, using a sentiment-tuned large language
+model as a text encoder performs exceptionally well.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ ChatDev: Communicative Agents for Software Development <span class="chip">ACL 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2307.07924v5">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2307.07924v5.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Chen Qian, Wei Liu, Hongzhang Liu, Nuo Chen, Yufan Dang, Jiahao Li, Cheng Yang, Weize Chen, Yusheng Su, Xin Cong, Juyuan Xu, Dahai Li, Zhiyuan Liu, Maosong Sun
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Software development is a complex task that necessitates cooperation among
+multiple members with diverse skills. Numerous studies used deep learning to
+improve specific phases in a waterfall model, such as design, coding, and
+testing. However, the deep learning model in each phase requires unique
+designs, leading to technical inconsistencies across various phases, which
+results in a fragmented and ineffective development process. In this paper, we
+introduce ChatDev, a chat-powered software development framework in which
+specialized agents driven by large language models (LLMs) are guided in what to
+communicate (via chat chain) and how to communicate (via communicative
+dehallucination). These agents actively contribute to the design, coding, and
+testing phases through unified language-based communication, with solutions
+derived from their multi-turn dialogues. We found their utilization of natural
+language is advantageous for system design, and communicating in programming
+language proves helpful in debugging. This paradigm demonstrates how linguistic
+communication facilitates multi-agent collaboration, establishing language as a
+unifying bridge for autonomous task-solving among LLM agents. The code and data
+are available at https://github.com/OpenBMB/ChatDev.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted to ACL 2024; https://github.com/OpenBMB/ChatDev</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Abstract Meaning Representation-Based Logic-Driven Data Augmentation for
+  Logical Reasoning <span class="chip">ACL 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2305.12599v5">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2305.12599v5.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Qiming Bao, Alex Yuxuan Peng, Zhenyun Deng, Wanjun Zhong, Gael Gendron, Timothy Pistotti, Neset Tan, Nathan Young, Yang Chen, Yonghua Zhu, Paul Denny, Michael Witbrock, Jiamou Liu
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Combining large language models with logical reasoning enhances their
+capacity to address problems in a robust and reliable manner. Nevertheless, the
+intricate nature of logical reasoning poses challenges when gathering reliable
+data from the web to build comprehensive training datasets, subsequently
+affecting performance on downstream tasks. To address this, we introduce a
+novel logic-driven data augmentation approach, AMR-LDA. AMR-LDA converts the
+original text into an Abstract Meaning Representation (AMR) graph, a structured
+semantic representation that encapsulates the logical structure of the
+sentence, upon which operations are performed to generate logically modified
+AMR graphs. The modified AMR graphs are subsequently converted back into text
+to create augmented data. Notably, our methodology is architecture-agnostic and
+enhances both generative large language models, such as GPT-3.5 and GPT-4,
+through prompt augmentation, and discriminative large language models through
+contrastive learning with logic-driven data augmentation. Empirical evidence
+underscores the efficacy of our proposed method with improvement in performance
+across seven downstream tasks, such as reading comprehension requiring logical
+reasoning, textual entailment, and natural language inference. Furthermore, our
+method leads on the ReClor
+leaderboard\footnote{\url{https://eval.ai/web/challenges/challenge-page/503/leaderboard/1347}}.
+The source code and data are publicly
+available\footnote{\href{https://github.com/Strong-AI-Lab/Logical-Equivalence-driven-AMR-Data-Augmentation-for-Representation-Learning}{AMR-LDA
+GitHub Repository}}.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>21 pages, 8 figures, the Findings of ACL 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Optimal Transport Guided Correlation Assignment for Multimodal Entity
+  Linking <span class="chip">ACL 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.01934v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.01934v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Zefeng Zhang, Jiawei Sheng, Chuang Zhang, Yunzhi Liang, Wenyuan Zhang, Siqi Wang, Tingwen Liu
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Multimodal Entity Linking (MEL) aims to link ambiguous mentions in multimodal
+contexts to entities in a multimodal knowledge graph. A pivotal challenge is to
+fully leverage multi-element correlations between mentions and entities to
+bridge modality gap and enable fine-grained semantic matching. Existing methods
+attempt several local correlative mechanisms, relying heavily on the
+automatically learned attention weights, which may over-concentrate on partial
+correlations. To mitigate this issue, we formulate the correlation assignment
+problem as an optimal transport (OT) problem, and propose a novel MEL
+framework, namely OT-MEL, with OT-guided correlation assignment. Thereby, we
+exploit the correlation between multimodal features to enhance multimodal
+fusion, and the correlation between mentions and entities to enhance
+fine-grained matching. To accelerate model prediction, we further leverage
+knowledge distillation to transfer OT assignment knowledge to attention
+mechanism. Experimental results show that our model significantly outperforms
+previous state-of-the-art baselines and confirm the effectiveness of the
+OT-guided correlation assignment.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Findings of ACL 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ PreFLMR: Scaling Up Fine-Grained Late-Interaction Multi-modal Retrievers <span class="chip">ACL 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2402.08327v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2402.08327v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Weizhe Lin, Jingbiao Mei, Jinghong Chen, Bill Byrne
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Large Multimodal Models (LMMs) excel in natural language and visual
+understanding but are challenged by exacting tasks such as Knowledge-based
+Visual Question Answering (KB-VQA) which involve the retrieval of relevant
+information from document collections to use in shaping answers to questions.
+We present an extensive training and evaluation framework, M2KR, for KB-VQA.
+M2KR contains a collection of vision and language tasks which we have
+incorporated into a single suite of benchmark tasks for training and evaluating
+general-purpose multi-modal retrievers. We use M2KR to develop PreFLMR, a
+pre-trained version of the recently developed Fine-grained Late-interaction
+Multi-modal Retriever (FLMR) approach to KB-VQA, and we report new
+state-of-the-art results across a range of tasks. We also present
+investigations into the scaling behaviors of PreFLMR intended to be useful in
+future developments in general-purpose multi-modal retrievers.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>ACL 2024; Project page: https://preflmr.github.io/</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ TruthX: Alleviating Hallucinations by Editing Large Language Models in
+  Truthful Space <span class="chip">ACL 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2402.17811v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2402.17811v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Shaolei Zhang, Tian Yu, Yang Feng
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Large Language Models (LLMs) sometimes suffer from producing hallucinations,
+especially LLMs may generate untruthful responses despite knowing the correct
+knowledge. Activating the truthfulness within LLM is the key to fully unlocking
+LLM's knowledge potential. In this paper, we propose TruthX, an inference-time
+intervention method to activate the truthfulness of LLM by identifying and
+editing the features within LLM's internal representations that govern the
+truthfulness. TruthX employs an auto-encoder to map LLM's representations into
+semantic and truthful latent spaces respectively, and applies contrastive
+learning to identify a truthful editing direction within the truthful space.
+During inference, by editing LLM's internal representations in truthful space,
+TruthX effectively enhances the truthfulness of LLM. Experiments show that
+TruthX improves the truthfulness of 13 advanced LLMs by an average of 20% on
+TruthfulQA benchmark. Further analyses suggest that TruthX can control LLM to
+produce truthful or hallucinatory responses via editing only one vector in
+LLM's internal representations.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted to ACL 2024 main conference, Project Page:
+  https://ictnlp.github.io/TruthX-site/</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Ranking Entities along Conceptual Space Dimensions with LLMs: An
+  Analysis of Fine-Tuning Strategies <span class="chip">ACL 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2402.15337v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2402.15337v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Nitesh Kumar, Usashi Chatterjee, Steven Schockaert
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Conceptual spaces represent entities in terms of their primitive semantic
+features. Such representations are highly valuable but they are notoriously
+difficult to learn, especially when it comes to modelling perceptual and
+subjective features. Distilling conceptual spaces from Large Language Models
+(LLMs) has recently emerged as a promising strategy, but existing work has been
+limited to probing pre-trained LLMs using relatively simple zero-shot
+strategies. We focus in particular on the task of ranking entities according to
+a given conceptual space dimension. Unfortunately, we cannot directly fine-tune
+LLMs on this task, because ground truth rankings for conceptual space
+dimensions are rare. We therefore use more readily available features as
+training data and analyse whether the ranking capabilities of the resulting
+models transfer to perceptual and subjective features. We find that this is
+indeed the case, to some extent, but having at least some perceptual and
+subjective features in the training data seems essential for achieving the best
+results.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted in ACL 2024 (Findings)</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Challenging the Validity of Personality Tests for Large Language Models <span class="chip">NeurIPS 2023</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2311.05297v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2311.05297v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Tom Sühr, Florian E. Dorner, Samira Samadi, Augustin Kelava
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  With large language models (LLMs) like GPT-4 appearing to behave increasingly
+human-like in text-based interactions, it has become popular to attempt to
+evaluate personality traits of LLMs using questionnaires originally developed
+for humans. While reusing measures is a resource-efficient way to evaluate
+LLMs, careful adaptations are usually required to ensure that assessment
+results are valid even across human subpopulations. In this work, we provide
+evidence that LLMs' responses to personality tests systematically deviate from
+human responses, implying that the results of these tests cannot be interpreted
+in the same way. Concretely, reverse-coded items ("I am introverted" vs. "I am
+extraverted") are often both answered affirmatively. Furthermore, variation
+across prompts designed to "steer" LLMs to simulate particular personality
+types does not follow the clear separation into five independent personality
+factors from human samples. In light of these results, we believe that it is
+important to investigate tests' validity for LLMs before drawing strong
+conclusions about potentially ill-defined concepts like LLMs' "personality".
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>A less extensive and shorter version of this work has been accepted
+  at Socially Responsible Language Modelling Research (SoLaR) 2023 Workshop at
+  NeurIPS 2023</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Label-Efficient Model Selection for Text Generation <span class="chip">ACL</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2402.07891v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2402.07891v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Shir Ashury-Tahan, Ariel Gera, Benjamin Sznajder, Leshem Choshen, Liat Ein-Dor, Eyal Shnarch
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Model selection for a given target task can be costly, as it may entail
+extensive annotation of the quality of outputs of different models. We
+introduce DiffUse, an efficient method to make an informed decision between
+candidate text generation models based on preference annotations. DiffUse
+reduces the required amount of annotations, thus saving valuable time and
+resources in performing evaluation. DiffUse intelligently selects instances by
+clustering embeddings that represent the semantic differences between model
+outputs. Thus, it is able to identify a subset of examples that are more
+informative for preference decisions. Our method is model-agnostic, and can be
+applied to any text generation model for selecting between models, prompts and
+configurations. Moreover, we propose a practical iterative approach for
+dynamically determining how many instances to annotate. In a series of
+experiments over hundreds of model pairs, we demonstrate that DiffUse can
+dramatically reduce the required number of annotations -- by up to 75% -- while
+maintaining high evaluation reliability.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted to ACL (main conference)</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Text Embedding Inversion Security for Multilingual Language Models 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2401.12192v4">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2401.12192v4.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Yiyi Chen, Heather Lent, Johannes Bjerva
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Textual data is often represented as real-numbered embeddings in NLP,
+particularly with the popularity of large language models (LLMs) and Embeddings
+as a Service (EaaS). However, storing sensitive information as embeddings can
+be susceptible to security breaches, as research shows that text can be
+reconstructed from embeddings, even without knowledge of the underlying model.
+While defence mechanisms have been explored, these are exclusively focused on
+English, leaving other languages potentially exposed to attacks. This work
+explores LLM security through multilingual embedding inversion. We define the
+problem of black-box multilingual and cross-lingual inversion attacks, and
+explore their potential implications. Our findings suggest that multilingual
+LLMs may be more vulnerable to inversion attacks, in part because English-based
+defences may be ineffective. To alleviate this, we propose a simple masking
+defense effective for both monolingual and multilingual models. This study is
+the first to investigate multilingual inversion attacks, shedding light on the
+differences in attacks and defenses across monolingual and multilingual
+settings.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>18 pages, 17 Tables, 6 Figures</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ EIT: Enhanced Interactive <span class="highlight-title">Transformer</span> <span class="chip">ACL2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2212.10197v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2212.10197v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Tong Zheng, Bei Li, Huiwen Bao, Tong Xiao, Jingbo Zhu
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Two principles: the complementary principle and the consensus principle are
+widely acknowledged in the literature of multi-view learning. However, the
+current design of multi-head self-attention, an instance of multi-view
+learning, prioritizes the complementarity while ignoring the consensus. To
+address this problem, we propose an enhanced multi-head self-attention (EMHA).
+First, to satisfy the complementary principle, EMHA removes the one-to-one
+mapping constraint among queries and keys in multiple subspaces and allows each
+query to attend to multiple keys. On top of that, we develop a method to fully
+encourage consensus among heads by introducing two interaction models, namely
+inner-subspace interaction and cross-subspace interaction. Extensive
+experiments on a wide range of language tasks (e.g., machine translation,
+abstractive summarization and grammar correction, language modeling), show its
+superiority, with a very modest increase in model size. Our code would be
+available at: https://github.com/zhengkid/EIT-Enhanced-Interactive-Transformer.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted by ACL2024 Main</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Political Compass or Spinning Arrow? Towards More Meaningful Evaluations
+  for Values and Opinions in Large Language Models <span class="chip">ACL 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2402.16786v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2402.16786v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Paul Röttger, Valentin Hofmann, Valentina Pyatkin, Musashi Hinck, Hannah Rose Kirk, Hinrich Schütze, Dirk Hovy
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Much recent work seeks to evaluate values and opinions in large language
+models (LLMs) using multiple-choice surveys and questionnaires. Most of this
+work is motivated by concerns around real-world LLM applications. For example,
+politically-biased LLMs may subtly influence society when they are used by
+millions of people. Such real-world concerns, however, stand in stark contrast
+to the artificiality of current evaluations: real users do not typically ask
+LLMs survey questions. Motivated by this discrepancy, we challenge the
+prevailing constrained evaluation paradigm for values and opinions in LLMs and
+explore more realistic unconstrained evaluations. As a case study, we focus on
+the popular Political Compass Test (PCT). In a systematic review, we find that
+most prior work using the PCT forces models to comply with the PCT's
+multiple-choice format. We show that models give substantively different
+answers when not forced; that answers change depending on how models are
+forced; and that answers lack paraphrase robustness. Then, we demonstrate that
+models give different answers yet again in a more realistic open-ended answer
+setting. We distill these findings into recommendations and open challenges in
+evaluating values and opinions in LLMs.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted at ACL 2024 (Main Conference)</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ WaveCoder: Widespread And Versatile Enhancement For Code Large Language
+  Models By Instruction Tuning 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2312.14187v4">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2312.14187v4.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Zhaojian Yu, Xin Zhang, Ning Shang, Yangyu Huang, Can Xu, Yishujie Zhao, Wenxiang Hu, Qiufeng Yin
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Recent work demonstrates that, after being fine-tuned on a high-quality
+instruction dataset, the resulting model can obtain impressive capabilities to
+address a wide range of tasks. However, existing methods for instruction data
+generation often produce duplicate data and are not controllable enough on data
+quality. In this paper, we extend the generalization of instruction tuning by
+classifying the instruction data to 4 code-related tasks and propose a
+LLM-based Generator-Discriminator data process framework to generate diverse,
+high-quality instruction data from open source code. Hence, we introduce
+CodeOcean, a dataset comprising 20,000 instruction instances across 4 universal
+code-related tasks,which is aimed at augmenting the effectiveness of
+instruction tuning and improving the generalization ability of fine-tuned
+model. Subsequently, we present WaveCoder, a fine-tuned Code LLM with
+Widespread And Versatile Enhanced instruction tuning. This model is
+specifically designed for enhancing instruction tuning of Code Language Models
+(LLMs). Our experiments demonstrate that Wavecoder models outperform other
+open-source models in terms of generalization ability across different
+code-related tasks at the same level of fine-tuning scale. Moreover, Wavecoder
+exhibits high efficiency in previous code generation tasks. This paper thus
+offers a significant contribution to the field of instruction data generation
+and fine-tuning models, providing new insights and tools for enhancing
+performance in code-related tasks.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Correctable Landmark Discovery via Large Models for Vision-Language
+  Navigation 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.18721v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.18721v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Bingqian Lin, Yunshuang Nie, Ziming Wei, Yi Zhu, Hang Xu, Shikui Ma, Jianzhuang Liu, Xiaodan Liang
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Vision-Language Navigation (VLN) requires the agent to follow language
+instructions to reach a target position. A key factor for successful navigation
+is to align the landmarks implied in the instruction with diverse visual
+observations. However, previous VLN agents fail to perform accurate modality
+alignment especially in unexplored scenes, since they learn from limited
+navigation data and lack sufficient open-world alignment knowledge. In this
+work, we propose a new VLN paradigm, called COrrectable LaNdmark DiScOvery via
+Large ModEls (CONSOLE). In CONSOLE, we cast VLN as an open-world sequential
+landmark discovery problem, by introducing a novel correctable landmark
+discovery scheme based on two large models ChatGPT and CLIP. Specifically, we
+use ChatGPT to provide rich open-world landmark cooccurrence commonsense, and
+conduct CLIP-driven landmark discovery based on these commonsense priors. To
+mitigate the noise in the priors due to the lack of visual constraints, we
+introduce a learnable cooccurrence scoring module, which corrects the
+importance of each cooccurrence according to actual observations for accurate
+landmark discovery. We further design an observation enhancement strategy for
+an elegant combination of our framework with different VLN agents, where we
+utilize the corrected landmark features to obtain enhanced observation features
+for action decision. Extensive experimental results on multiple popular VLN
+benchmarks (R2R, REVERIE, R4R, RxR) show the significant superiority of CONSOLE
+over strong baselines. Especially, our CONSOLE establishes the new
+state-of-the-art results on R2R and R4R in unseen scenarios. Code is available
+at https://github.com/expectorlin/CONSOLE.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted by TPAMI 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Outliers and Calibration Sets have Diminishing Effect on Quantization of
+  Modern LLMs 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20835v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20835v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Davide Paglieri, Saurabh Dash, Tim Rocktäschel, Jack Parker-Holder
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Post-Training Quantization (PTQ) enhances the efficiency of Large Language
+Models (LLMs) by enabling faster operation and compatibility with more
+accessible hardware through reduced memory usage, at the cost of small
+performance drops. We explore the role of calibration sets in PTQ, specifically
+their effect on hidden activations in various notable open-source LLMs.
+Calibration sets are crucial for evaluating activation magnitudes and
+identifying outliers, which can distort the quantization range and negatively
+impact performance. Our analysis reveals a marked contrast in quantization
+effectiveness across models. The older OPT model, upon which much of the
+quantization literature is based, shows significant performance deterioration
+and high susceptibility to outliers with varying calibration sets. In contrast,
+newer models like Llama-2 7B, Llama-3 8B, Command-R 35B, and Mistral 7B
+demonstrate strong robustness, with Mistral 7B showing near-immunity to
+outliers and stable activations. These findings suggest a shift in PTQ
+strategies might be needed. As advancements in pre-training methods reduce the
+relevance of outliers, there is an emerging need to reassess the fundamentals
+of current quantization literature. The emphasis should pivot towards
+optimizing inference speed, rather than primarily focusing on outlier
+preservation, to align with the evolving characteristics of state-of-the-art
+LLMs.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Gated Linear Attention <span class="highlight-title">Transformer</span>s with Hardware-Efficient Training <span class="chip">ICML</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2312.06635v5">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2312.06635v5.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Songlin Yang, Bailin Wang, Yikang Shen, Rameswar Panda, Yoon Kim
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Transformers with linear attention allow for efficient parallel training but
+can simultaneously be formulated as an RNN with 2D (matrix-valued) hidden
+states, thus enjoying linear-time inference complexity. However, linear
+attention generally underperforms ordinary softmax attention. Moreover, current
+implementations of linear attention lack I/O-awareness and are thus slower than
+highly optimized implementations of softmax attention. This work describes a
+hardware-efficient algorithm for linear attention that trades off memory
+movement against parallelizability. The resulting implementation, dubbed
+FLASHLINEARATTENTION, is faster than FLASHATTENTION-2 (Dao, 2023) as a
+standalone layer even on short sequence lengths (e.g., 1K). We then generalize
+this algorithm to a more expressive variant of linear attention with
+data-dependent gates. When used as a replacement for the standard attention
+layer in Transformers, the resulting gated linear attention (GLA) Transformer
+is found to perform competitively against the LLaMA-architecture Transformer
+(Touvron et al., 2023) as well recent linear-time-inference baselines such as
+RetNet (Sun et al., 2023a) and Mamba (Gu & Dao, 2023) on moderate-scale
+language modeling experiments. GLA Transformer is especially effective at
+length generalization, enabling a model trained on 2K to generalize to
+sequences longer than 20K without significant perplexity degradations. For
+training speed, the GLA Transformer has higher throughput than a
+similarly-sized Mamba model.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>ICML cameray ready</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Rewards-in-Context: Multi-objective Alignment of Foundation Models with
+  Dynamic Preference Adjustment <span class="chip">ICML 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2402.10207v5">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2402.10207v5.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Rui Yang, Xiaoman Pan, Feng Luo, Shuang Qiu, Han Zhong, Dong Yu, Jianshu Chen
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  We consider the problem of multi-objective alignment of foundation models
+with human preferences, which is a critical step towards helpful and harmless
+AI systems. However, it is generally costly and unstable to fine-tune large
+foundation models using reinforcement learning (RL), and the
+multi-dimensionality, heterogeneity, and conflicting nature of human
+preferences further complicate the alignment process. In this paper, we
+introduce Rewards-in-Context (RiC), which conditions the response of a
+foundation model on multiple rewards in its prompt context and applies
+supervised fine-tuning for alignment. The salient features of RiC are
+simplicity and adaptivity, as it only requires supervised fine-tuning of a
+single foundation model and supports dynamic adjustment for user preferences
+during inference time. Inspired by the analytical solution of an abstracted
+convex optimization problem, our dynamic inference-time adjustment method
+approaches the Pareto-optimal solution for multiple objectives. Empirical
+evidence demonstrates the efficacy of our method in aligning both Large
+Language Models (LLMs) and diffusion models to accommodate diverse rewards with
+only around 10% GPU hours compared with multi-objective RL baseline.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted by ICML 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Batch-ICL: Effective, Efficient, and Order-Agnostic In-Context Learning <span class="chip">ACL 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2401.06469v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2401.06469v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Kaiyi Zhang, Ang Lv, Yuhan Chen, Hansen Ha, Tao Xu, Rui Yan
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  In this paper, by treating in-context learning (ICL) as a meta-optimization
+process, we explain why LLMs are sensitive to the order of ICL examples. This
+understanding leads us to the development of Batch-ICL, an effective,
+efficient, and order-agnostic inference algorithm for ICL. Differing from the
+standard N-shot learning approach, Batch-ICL employs $N$ separate 1-shot
+forward computations and aggregates the resulting meta-gradients. These
+aggregated meta-gradients are then applied to the forward computation of a
+zero-shot query to generate the final prediction. This batch processing
+approach renders the LLM agnostic to the order of ICL examples. Through
+extensive experiments and analysis, we demonstrate that Batch-ICL consistently
+outperforms most permutations of ICL examples. In some cases, it even exceeds
+the performance of the best order for standard ICL, all while reducing the
+computational resources required. Furthermore, we develop a novel variant of
+Batch-ICL featuring multiple "epochs" of meta-optimization. This variant
+implicitly explores permutations of ICL examples, further enhancing ICL
+performance.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>This paper has been accepted by ACL 2024 (Findings)</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ PyramidInfer: Pyramid KV Cache Compression for High-throughput LLM
+  Inference <span class="chip">ACL 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.12532v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.12532v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Dongjie Yang, XiaoDong Han, Yan Gao, Yao Hu, Shilin Zhang, Hai Zhao
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Large Language Models (LLMs) have shown remarkable comprehension abilities
+but face challenges in GPU memory usage during inference, hindering their
+scalability for real-time applications like chatbots. To accelerate inference,
+we store computed keys and values (KV cache) in the GPU memory. Existing
+methods study the KV cache compression to reduce memory by pruning the
+pre-computed KV cache. However, they neglect the inter-layer dependency between
+layers and huge memory consumption in pre-computation. To explore these
+deficiencies, we find that the number of crucial keys and values that influence
+future generations decreases layer by layer and we can extract them by the
+consistency in attention weights. Based on the findings, we propose
+PyramidInfer, a method that compresses the KV cache by layer-wise retaining
+crucial context. PyramidInfer saves significant memory by computing fewer keys
+and values without sacrificing performance. Experimental results show
+PyramidInfer improves 2.2x throughput compared to Accelerate with over 54% GPU
+memory reduction in KV cache.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted by ACL 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Language Model Decoding as Direct Metrics Optimization 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2310.01041v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2310.01041v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Haozhe Ji, Pei Ke, Hongning Wang, Minlie Huang
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Despite the remarkable advances in language modeling, current mainstream
+decoding methods still struggle to generate texts that align with human texts
+across different aspects. In particular, sampling-based methods produce
+less-repetitive texts which are often disjunctive in discourse, while
+search-based methods maintain topic coherence at the cost of increased
+repetition. Overall, these methods fall short in achieving holistic alignment
+across a broad range of aspects. In this work, we frame decoding from a
+language model as an optimization problem with the goal of strictly matching
+the expected performance with human texts measured by multiple metrics of
+desired aspects simultaneously. The resulting decoding distribution enjoys an
+analytical solution that scales the input language model distribution via a
+sequence-level energy function defined by these metrics. And most importantly,
+we prove that this induced distribution is guaranteed to improve the perplexity
+on human texts, which suggests a better approximation to the underlying
+distribution of human texts. To facilitate tractable sampling from this
+globally normalized distribution, we adopt the Sampling-Importance-Resampling
+technique. Experiments on various domains and model scales demonstrate the
+superiority of our method in metrics alignment with human texts and human
+evaluation over strong baselines.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>33 pages, 3 figures</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Pipeline Parallelism with Controllable Memory 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.15362v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.15362v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Penghui Qi, Xinyi Wan, Nyamdavaa Amar, Min Lin
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Pipeline parallelism has been widely explored, but most existing schedules
+lack a systematic methodology. In this paper, we propose a framework to
+decompose pipeline schedules as repeating a building block and we show that the
+lifespan of the building block decides the peak activation memory of the
+pipeline schedule. Guided by the observations, we find that almost all existing
+pipeline schedules, to the best of our knowledge, are memory inefficient. To
+address this, we introduce a family of memory efficient building blocks with
+controllable activation memory, which can reduce the peak activation memory to
+1/2 of 1F1B without sacrificing efficiency, and even to 1/3 with comparable
+throughput. We can also achieve almost zero pipeline bubbles while maintaining
+the same activation memory as 1F1B. Our evaluations demonstrate that in pure
+pipeline parallelism settings, our methods outperform 1F1B by from 7% to 55% in
+terms of throughput. When employing a grid search over hybrid parallelism
+hyperparameters in practical scenarios, our proposed methods demonstrate a 16%
+throughput improvement over the 1F1B baseline for large language models.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Towards Efficient Exact Optimization of Language Model Alignment 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2402.00856v4">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2402.00856v4.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Haozhe Ji, Cheng Lu, Yilin Niu, Pei Ke, Hongning Wang, Jun Zhu, Jie Tang, Minlie Huang
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  The alignment of language models with human preferences is vital for their
+application in real-world tasks. The problem is formulated as optimizing the
+model's policy to maximize the expected reward that reflects human preferences
+with minimal deviation from the initial policy. While considered as a
+straightforward solution, reinforcement learning (RL) suffers from high
+variance in policy updates, which impedes efficient policy improvement.
+Recently, direct preference optimization (DPO) was proposed to directly
+optimize the policy from preference data. However, we show that DPO derived
+based on the optimal solution of the problem leads to a compromised
+mean-seeking approximation of the optimal solution in practice. In this paper,
+we propose efficient exact optimization (EXO) of the alignment objective. EXO
+is guaranteed to optimize in the same direction as RL algorithms asymptotically
+for arbitrary policy parametrization. This leads to the same mode-seeking
+solution, while enables efficient optimization by circumventing the
+complexities of RL. We also compare our method to DPO with both theoretical and
+empirical analyses, and further demonstrate the advantages of our method over
+existing approaches on realistic human preference data. Code is available at
+https://github.com/haozheji/exact-optimization.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>24 pages, 9 figures</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Analyzing Social Biases in Japanese Large Language Models 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.02050v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.02050v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Hitomi Yanaka, Namgi Han, Ryoma Kumon, Jie Lu, Masashi Takeshita, Ryo Sekizawa, Taisei Kato, Hiromi Arai
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  With the development of Large Language Models (LLMs), social biases in the
+LLMs have become a crucial issue. While various benchmarks for social biases
+have been provided across languages, the extent to which Japanese LLMs exhibit
+social biases has not been fully investigated. In this study, we construct the
+Japanese Bias Benchmark dataset for Question Answering (JBBQ) based on the
+English bias benchmark BBQ, and analyze social biases in Japanese LLMs. The
+results show that while current Japanese LLMs improve their accuracies on JBBQ
+by instruction-tuning, their bias scores become larger. In addition, augmenting
+their prompts with warning about social biases reduces the effect of biases in
+some models.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ WilKE: Wise-Layer Knowledge Editor for Lifelong Knowledge Editing <span class="chip">ACL</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2402.10987v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2402.10987v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Chenhui Hu, Pengfei Cao, Yubo Chen, Kang Liu, Jun Zhao
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Knowledge editing aims to rectify inaccuracies in large language models
+(LLMs) without costly retraining for outdated or erroneous knowledge. However,
+current knowledge editing methods primarily focus on single editing, failing to
+meet the requirements for lifelong editing. This study reveals a performance
+degradation encountered by knowledge editing in lifelong editing, characterized
+by toxicity buildup and toxicity flash, with the primary cause identified as
+pattern unmatch. We introduce a knowledge editing approach named Wise-Layer
+Knowledge Editor (WilKE), which selects editing layer based on the pattern
+matching degree of editing knowledge across different layers in language
+models. Experimental results demonstrate that, in lifelong editing, WilKE
+exhibits an average improvement of 46.2% and 67.8% on editing GPT2-XL and GPT-J
+relative to state-of-the-art knowledge editing methods.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>To be published in ACL Findings 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Error Analysis <span class="highlight-title">Prompt</span>ing Enables Human-Like Translation Evaluation in
+  Large Language Models <span class="chip">ACL 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2303.13809v4">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2303.13809v4.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Qingyu Lu, Baopu Qiu, Liang Ding, Kanjian Zhang, Tom Kocmi, Dacheng Tao
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Generative large language models (LLMs), e.g., ChatGPT, have demonstrated
+remarkable proficiency across several NLP tasks, such as machine translation,
+text summarization. Recent research (Kocmi and Federmann, 2023) has shown that
+utilizing LLMs for assessing the quality of machine translation (MT) achieves
+state-of-the-art performance at the system level but \textit{performs poorly at
+the segment level}. To further improve the performance of LLMs on MT quality
+assessment, we investigate several prompting designs, and propose a new
+prompting method called \textbf{\texttt{Error Analysis Prompting}} (EAPrompt)
+by combining Chain-of-Thoughts (Wei et al., 2022) and Error Analysis (Lu et
+al., 2023). This technique emulates the commonly accepted human evaluation
+framework - Multidimensional Quality Metrics (MQM, Freitag et al. (2021)) and
+\textit{produces explainable and reliable MT evaluations at both the system and
+segment level}. Experimental Results from the WMT22 metrics shared task
+validate the effectiveness of EAPrompt on various LLMs, with different
+structures. Further analysis confirms that EAPrompt effectively distinguishes
+major errors from minor ones, while also sharing a similar distribution of the
+number of errors with MQM. These findings highlight the potential of EAPrompt
+as a human-like evaluator prompting technique for MT evaluation.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Findings of ACL 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Dishonesty in Helpful and Harmless Alignment 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.01931v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.01931v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Youcheng Huang, Jingkun Tang, Duanyu Feng, Zheng Zhang, Wenqiang Lei, Jiancheng Lv, Anthony G. Cohn
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  People tell lies when seeking rewards. Large language models (LLMs) are
+aligned to human values with reinforcement learning where they get rewards if
+they satisfy human preference. We find that this also induces dishonesty in
+helpful and harmless alignment where LLMs tell lies in generating harmless
+responses. Using the latest interpreting tools, we detect dishonesty, show how
+LLMs can be harmful if their honesty is increased, and analyze such conflicts
+at the parameter-level. Given these preliminaries and the hypothesis that
+reward-seeking stimulates dishonesty, we theoretically show that the dishonesty
+can in-turn decrease the alignment performances and augment reward-seeking
+alignment with representation regularization. Extensive results, including
+GPT-4 annotated win-rates, perplexities, and cases studies demonstrate that we
+can train more honest, helpful, and harmless LLMs. We will make all our codes
+and results be open-sourced upon this paper's acceptance.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Is Knowledge All Large Language Models Needed for Causal Reasoning? 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2401.00139v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2401.00139v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Hengrui Cai, Shengjie Liu, Rui Song
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  This paper explores the causal reasoning of large language models (LLMs) to
+enhance their interpretability and reliability in advancing artificial
+intelligence. Despite the proficiency of LLMs in a range of tasks, their
+potential for understanding causality requires further exploration. We propose
+a novel causal attribution model that utilizes ``do-operators" for constructing
+counterfactual scenarios, allowing us to systematically quantify the influence
+of input numerical data and LLMs' pre-existing knowledge on their causal
+reasoning processes. Our newly developed experimental setup assesses LLMs'
+reliance on contextual information and inherent knowledge across various
+domains. Our evaluation reveals that LLMs' causal reasoning ability mainly
+depends on the context and domain-specific knowledge provided. In the absence
+of such knowledge, LLMs can still maintain a degree of causal reasoning using
+the available numerical data, albeit with limitations in the calculations. This
+motivates the proposed fine-tuned LLM for pairwise causal discovery,
+effectively leveraging both knowledge and numerical information.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>A Python implementation of our proposed method is available at
+  https://github.com/ncsulsj/Causal_LLM</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Blinded by Generated Contexts: How Language Models Merge Generated and
+  Retrieved Contexts When Knowledge Conflicts? <span class="chip">ACL 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2401.11911v5">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2401.11911v5.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Hexiang Tan, Fei Sun, Wanli Yang, Yuanzhuo Wang, Qi Cao, Xueqi Cheng
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  While auxiliary information has become a key to enhancing Large Language
+Models (LLMs), relatively little is known about how LLMs merge these contexts,
+specifically contexts generated by LLMs and those retrieved from external
+sources. To investigate this, we formulate a systematic framework to identify
+whether LLMs' responses are attributed to either generated or retrieved
+contexts. To easily trace the origin of the response, we construct datasets
+with conflicting contexts, i.e., each question is paired with both generated
+and retrieved contexts, yet only one of them contains the correct answer. Our
+experiments reveal a significant bias in several LLMs (GPT-4/3.5 and Llama2) to
+favor generated contexts, even when they provide incorrect information. We
+further identify two key factors contributing to this bias: i) contexts
+generated by LLMs typically show greater similarity to the questions,
+increasing their likelihood of being selected; ii) the segmentation process
+used in retrieved contexts disrupts their completeness, thereby hindering their
+full utilization in LLMs. Our analysis enhances the understanding of how LLMs
+merge diverse contexts, offers valuable insights for advancing current LLM
+augmentation methods, and highlights the risk of generated misinformation for
+retrieval-augmented LLMs.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted at ACL 2024 Main</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Prioritizing Safeguarding Over Autonomy: Risks of LLM Agents for Science 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2402.04247v4">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2402.04247v4.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Xiangru Tang, Qiao Jin, Kunlun Zhu, Tongxin Yuan, Yichi Zhang, Wangchunshu Zhou, Meng Qu, Yilun Zhao, Jian Tang, Zhuosheng Zhang, Arman Cohan, Zhiyong Lu, Mark Gerstein
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Intelligent agents powered by large language models (LLMs) have demonstrated
+substantial promise in autonomously conducting experiments and facilitating
+scientific discoveries across various disciplines. While their capabilities are
+promising, these agents, called scientific LLM agents, also introduce novel
+vulnerabilities that demand careful consideration for safety. However, there
+exists a notable gap in the literature, as there has been no comprehensive
+exploration of these vulnerabilities. This perspective paper fills this gap by
+conducting a thorough examination of vulnerabilities in LLM-based agents within
+scientific domains, shedding light on potential risks associated with their
+misuse and emphasizing the need for safety measures. We begin by providing a
+comprehensive overview of the potential risks inherent to scientific LLM
+agents, taking into account user intent, the specific scientific domain, and
+their potential impact on the external environment. Then, we delve into the
+origins of these vulnerabilities and provide a scoping review of the limited
+existing works. Based on our analysis, we propose a triadic framework involving
+human regulation, agent alignment, and an understanding of environmental
+feedback (agent regulation) to mitigate these identified risks. Furthermore, we
+highlight the limitations and challenges associated with safeguarding
+scientific agents and advocate for the development of improved models, robust
+benchmarks, and comprehensive regulations to address these issues effectively.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Not All Attention is Needed: Parameter and Computation Efficient
+  Transfer Learning for Multi-modal Large Language Models 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2403.15226v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2403.15226v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Qiong Wu, Weihao Ye, Yiyi Zhou, Xiaoshuai Sun, Rongrong Ji
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  In this paper, we propose a novel parameter and computation efficient tuning
+method for Multi-modal Large Language Models (MLLMs), termed Efficient
+Attention Skipping (EAS). Concretely, we first reveal that multi-head
+attentions (MHAs), the main computational overhead of MLLMs, are often
+redundant to downstream tasks. Based on this observation, EAS evaluates the
+attention redundancy and skips the less important MHAs to speed up inference.
+Besides, we also propose a novel propagation-of-information adapter (PIA) to
+serve the attention skipping of EAS and keep parameter efficiency, which can be
+further re-parameterized into feed-forward networks (FFNs) for zero-extra
+latency. To validate EAS, we apply it to a recently proposed MLLM called LaVIN
+and a classic VL pre-trained model called METER, and conduct extensive
+experiments on a set of benchmarks. The experiments show that EAS not only
+retains high performance and parameter efficiency, but also greatly speeds up
+inference speed. For instance, LaVIN-EAS can obtain 89.98\% accuracy on
+ScineceQA while speeding up inference by 2.2 times to LaVIN
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Mitigating Biases for Instruction-following Language Models via Bias
+  Neurons Elimination <span class="chip">ACL 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2311.09627v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2311.09627v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Nakyeong Yang, Taegwan Kang, Jungkyu Choi, Honglak Lee, Kyomin Jung
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Instruction-following language models often show undesirable biases. These
+undesirable biases may be accelerated in the real-world usage of language
+models, where a wide range of instructions is used through zero-shot example
+prompting. To solve this problem, we first define the bias neuron, which
+significantly affects biased outputs, and prove its existence empirically.
+Furthermore, we propose a novel and practical bias mitigation method, CRISPR,
+to eliminate bias neurons of language models in instruction-following settings.
+CRISPR automatically determines biased outputs and categorizes neurons that
+affect the biased outputs as bias neurons using an explainability method.
+Experimental results demonstrate the effectiveness of our method in mitigating
+biases under zero-shot instruction-following settings without losing the
+model's task performance and existing knowledge. The experimental results
+reveal the generalizability of our method as it shows robustness under various
+instructions and datasets. Surprisingly, our method can mitigate the bias in
+language models by eliminating only a few neurons (at least three).
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>accepted to ACL 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Assessing Political Bias in Large Language Models 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.13041v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.13041v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Luca Rettenberger, Markus Reischl, Mark Schutera
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  The assessment of bias within Large Language Models (LLMs) has emerged as a
+critical concern in the contemporary discourse surrounding Artificial
+Intelligence (AI) in the context of their potential impact on societal
+dynamics. Recognizing and considering political bias within LLM applications is
+especially important when closing in on the tipping point toward performative
+prediction. Then, being educated about potential effects and the societal
+behavior LLMs can drive at scale due to their interplay with human operators.
+In this way, the upcoming elections of the European Parliament will not remain
+unaffected by LLMs. We evaluate the political bias of the currently most
+popular open-source LLMs (instruct or assistant models) concerning political
+issues within the European Union (EU) from a German voter's perspective. To do
+so, we use the "Wahl-O-Mat," a voting advice application used in Germany. From
+the voting advice of the "Wahl-O-Mat" we quantize the degree of alignment of
+LLMs with German political parties. We show that larger models, such as
+Llama3-70B, tend to align more closely with left-leaning political parties,
+while smaller models often remain neutral, particularly when prompted in
+English. The central finding is that LLMs are similarly biased, with low
+variances in the alignment concerning a specific party. Our findings underline
+the importance of rigorously assessing and making bias transparent in LLMs to
+safeguard the integrity and trustworthiness of applications that employ the
+capabilities of performative prediction and the invisible hand of machine
+learning prediction and language generation.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ BoNBoN Alignment for Large Language Models and the Sweetness of
+  Best-of-n Sampling 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.00832v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.00832v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Lin Gui, Cristina Gârbacea, Victor Veitch
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  This paper concerns the problem of aligning samples from large language
+models to human preferences using best-of-$n$ sampling, where we draw $n$
+samples, rank them, and return the best one. We consider two fundamental
+problems. First: what is the relationship between best-of-$n$ and approaches to
+alignment that train LLMs to output samples with a high expected reward (e.g.,
+RLHF or DPO)? To answer this, we embed both the best-of-$n$ distribution and
+the sampling distributions learned by alignment procedures in a common class of
+tiltings of the base LLM distribution. We then show that, within this class,
+best-of-$n$ is essentially optimal in terms of the trade-off between win-rate
+against the base model vs KL distance from the base model. That is, best-of-$n$
+is the best choice of alignment distribution if the goal is to maximize win
+rate. However, best-of-$n$ requires drawing $n$ samples for each inference, a
+substantial cost. To avoid this, the second problem we consider is how to
+fine-tune a LLM to mimic the best-of-$n$ sampling distribution. We derive
+BoNBoN Alignment to achieve this by exploiting the special structure of the
+best-of-$n$ distribution. Experiments show that BoNBoN alignment yields
+substantial improvements in producing a model that is preferred to the base
+policy while minimally affecting off-target aspects.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ RetrievalQA: Assessing Adaptive Retrieval-Augmented Generation for
+  Short-form Open-Domain Question Answering <span class="chip">ACL 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2402.16457v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2402.16457v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Zihan Zhang, Meng Fang, Ling Chen
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Adaptive retrieval-augmented generation (ARAG) aims to dynamically determine
+the necessity of retrieval for queries instead of retrieving indiscriminately
+to enhance the efficiency and relevance of the sourced information. However,
+previous works largely overlook the evaluation of ARAG approaches, leading to
+their effectiveness being understudied. This work presents a benchmark,
+RetrievalQA, comprising 1,271 short-form questions covering new world and
+long-tail knowledge. The knowledge necessary to answer the questions is absent
+from LLMs; therefore, external information must be retrieved to answer
+correctly. This makes RetrievalQA a suitable testbed to evaluate existing ARAG
+methods. We observe that calibration-based methods heavily rely on threshold
+tuning, while vanilla prompting is inadequate for guiding LLMs to make reliable
+retrieval decisions. Based on our findings, we propose Time-Aware Adaptive
+Retrieval (TA-ARE), a simple yet effective method that helps LLMs assess the
+necessity of retrieval without calibration or additional training. The dataset
+and code will be available at https://github.com/hyintell/RetrievalQA
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Findings of ACL 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ StableSSM: Alleviating the Curse of Memory in State-space Models through
+  Stable Reparameterization <span class="chip">ICML 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2311.14495v4">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2311.14495v4.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Shida Wang, Qianxiao Li
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  In this paper, we investigate the long-term memory learning capabilities of
+state-space models (SSMs) from the perspective of parameterization. We prove
+that state-space models without any reparameterization exhibit a memory
+limitation similar to that of traditional RNNs: the target relationships that
+can be stably approximated by state-space models must have an exponential
+decaying memory. Our analysis identifies this "curse of memory" as a result of
+the recurrent weights converging to a stability boundary, suggesting that a
+reparameterization technique can be effective. To this end, we introduce a
+class of reparameterization techniques for SSMs that effectively lift its
+memory limitations. Besides improving approximation capabilities, we further
+illustrate that a principled choice of reparameterization scheme can also
+enhance optimization stability. We validate our findings using synthetic
+datasets, language models and image classifications.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>28 pages, 7 figures, ICML 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Exploring and Improving Drafts in Blockwise Parallel Decoding 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2404.09221v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2404.09221v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Taehyeon Kim, Ananda Theertha Suresh, Kishore Papineni, Michael Riley, Sanjiv Kumar, Adrian Benton
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Despite the remarkable strides made by autoregressive language models, their
+potential is often hampered by the slow inference speeds inherent in sequential
+token generation. Blockwise parallel decoding (BPD) was proposed by Stern et
+al. as a method to improve inference speed of language models by simultaneously
+predicting multiple future tokens, termed block drafts, which are subsequently
+verified and conditionally accepted by the autoregressive model. This paper
+contributes to the understanding and improvement of block drafts in two ways.
+First, we analyze the token distributions produced by multiple prediction
+heads. Secondly, we leverage this analysis to develop algorithms to improve BPD
+inference speed by refining the block drafts using n-gram and neural language
+models. Experiments demonstrate that refined block drafts yield a +5-21%
+increase in block efficiency (i.e., the number of accepted tokens from the
+block draft) across diverse datasets.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Towards Safer Large Language Models through Machine Unlearning <span class="chip">ACL 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2402.10058v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2402.10058v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Zheyuan Liu, Guangyao Dou, Zhaoxuan Tan, Yijun Tian, Meng Jiang
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  The rapid advancement of Large Language Models (LLMs) has demonstrated their
+vast potential across various domains, attributed to their extensive
+pretraining knowledge and exceptional generalizability. However, LLMs often
+encounter challenges in generating harmful content when faced with problematic
+prompts. To address this problem, existing work attempted to implement a
+gradient ascent based approach to prevent LLMs from producing harmful output.
+While these methods can be effective, they frequently impact the model utility
+in responding to normal prompts. To address this gap, we introduce Selective
+Knowledge negation Unlearning (SKU), a novel unlearning framework for LLMs,
+designed to eliminate harmful knowledge while preserving utility on normal
+prompts. Specifically, SKU is consisted of two stages: harmful knowledge
+acquisition stage and knowledge negation stage. The first stage aims to
+identify and acquire harmful knowledge within the model, whereas the second is
+dedicated to remove this knowledge. SKU selectively isolates and removes
+harmful knowledge in model parameters, ensuring the model's performance remains
+robust on normal prompts. Our experiments conducted across various LLM
+architectures demonstrate that SKU identifies a good balance point between
+removing harmful information and preserving utility.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted by ACL 2024 Findings</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ MR-GSM8K: A Meta-Reasoning Benchmark for Large Language Model Evaluation 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2312.17080v4">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2312.17080v4.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Zhongshen Zeng, Pengguang Chen, Shu Liu, Haiyun Jiang, Jiaya Jia
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  In this work, we introduce a novel evaluation paradigm for Large Language
+Models (LLMs) that compels them to transition from a traditional
+question-answering role, akin to a student, to a solution-scoring role, akin to
+a teacher. This paradigm, focusing on "reasoning about reasoning," hence termed
+meta-reasoning, shifts the emphasis from result-oriented assessments, which
+often neglect the reasoning process, to a more comprehensive evaluation that
+effectively distinguishes between the cognitive capabilities of different
+models. By applying this paradigm in the GSM8K dataset, we have developed the
+MR-GSM8K benchmark. Our extensive analysis includes several state-of-the-art
+models from both open-source and commercial domains, uncovering fundamental
+deficiencies in their training and evaluation methodologies. Notably, while
+models like Deepseek-v2 and Claude3-Sonnet closely competed with GPT-4 in
+GSM8K, their performance disparities expanded dramatically in MR-GSM8K, with
+differences widening to over 20 absolute points, underscoring the significant
+challenge posed by our meta-reasoning approach.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Code: https://github.com/dvlab-research/MR-GSM8K</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ MMLU-Pro: A More Robust and Challenging Multi-Task Language
+  Understanding Benchmark 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.01574v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.01574v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Yubo Wang, Xueguang Ma, Ge Zhang, Yuansheng Ni, Abhranil Chandra, Shiguang Guo, Weiming Ren, Aaran Arulraj, Xuan He, Ziyan Jiang, Tianle Li, Max Ku, Kai Wang, Alex Zhuang, Rongqi Fan, Xiang Yue, Wenhu Chen
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  In the age of large-scale language models, benchmarks like the Massive
+Multitask Language Understanding (MMLU) have been pivotal in pushing the
+boundaries of what AI can achieve in language comprehension and reasoning
+across diverse domains. However, as models continue to improve, their
+performance on these benchmarks has begun to plateau, making it increasingly
+difficult to discern differences in model capabilities. This paper introduces
+MMLU-Pro, an enhanced dataset designed to extend the mostly knowledge-driven
+MMLU benchmark by integrating more challenging, reasoning-focused questions and
+expanding the choice set from four to ten options. Additionally, MMLU-Pro
+eliminates the trivial and noisy questions in MMLU. Our experimental results
+show that MMLU-Pro not only raises the challenge, causing a significant drop in
+accuracy by 16% to 33% compared to MMLU but also demonstrates greater stability
+under varying prompts. With 24 different prompt styles tested, the sensitivity
+of model scores to prompt variations decreased from 4-5% in MMLU to just 2% in
+MMLU-Pro. Additionally, we found that models utilizing Chain of Thought (CoT)
+reasoning achieved better performance on MMLU-Pro compared to direct answering,
+which is in stark contrast to the findings on the original MMLU, indicating
+that MMLU-Pro includes more complex reasoning questions. Our assessments
+confirm that MMLU-Pro is a more discriminative benchmark to better track
+progress in the field.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Adapting Open-Source Large Language Models for Cost-Effective,
+  Expert-Level Clinical Note Generation with On-Policy Reinforcement Learning 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.00715v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.00715v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Hanyin Wang, Chufan Gao, Bolun Liu, Qiping Xu, Guleid Hussein, Mohamad El Labban, Kingsley Iheasirim, Hariprasad Korsapati, Chuck Outcalt, Jimeng Sun
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Proprietary Large Language Models (LLMs) such as GPT-4 and Gemini have
+demonstrated promising capabilities in clinical text summarization tasks.
+However, due to patient data privacy concerns and computational costs, many
+healthcare providers prefer using small, locally-hosted models over external
+generic LLMs. This study presents a comprehensive domain- and task-specific
+adaptation process for the open-source LLaMA-2 13 billion parameter model,
+enabling it to generate high-quality clinical notes from outpatient
+patient-doctor dialogues. Our process incorporates continued pre-training,
+supervised fine-tuning, and reinforcement learning from both AI and human
+feedback. We introduced a new approach, DistillDirect, for performing on-policy
+reinforcement learning with Gemini 1.0 Pro as the teacher model. Our resulting
+model, LLaMA-Clinic, can generate clinical notes comparable in quality to those
+authored by physicians. In a blinded physician reader study, the majority
+(90.4%) of individual evaluations rated the notes generated by LLaMA-Clinic as
+"acceptable" or higher across all three criteria: real-world readiness,
+completeness, and accuracy. In the more challenging "Assessment and Plan"
+section, LLaMA-Clinic scored higher (4.2/5) in real-world readiness than
+physician-authored notes (4.1/5). Our cost analysis for inference shows that
+our LLaMA-Clinic model achieves a 4.375-fold cost reduction compared to an
+external generic LLM service. Additionally, we highlight key considerations for
+future clinical note-generation tasks, emphasizing the importance of
+pre-defining a best-practice note format, rather than relying on LLMs to
+determine this for clinical practice. We have made our newly created synthetic
+clinic dialogue-note dataset and the physician feedback dataset publicly
+available to foster future research.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Code Comparison Tuning for Code Large Language Models 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2403.19121v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2403.19121v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Yufan Jiang, Qiaozhi He, Xiaomin Zhuang, Zhihua Wu
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  We present Code Comparison Tuning (CCT), a simple and effective tuning method
+for code large language models (Code LLMs) to better handle subtle code errors.
+Specifically, we integrate the concept of comparison into instruction tuning,
+both at the token and sequence levels, enabling the model to discern even the
+slightest deviations in code. To compare the original code with an erroneous
+version containing manually added code errors, we use token-level preference
+loss for detailed token-level comparisons. Additionally, we combine code
+segments to create a new instruction tuning sample for sequence-level
+comparisons, enhancing the model's bug-fixing capability. Experimental results
+on the HumanEvalFix benchmark show that CCT surpasses instruction tuning in
+pass@1 scores by up to 4 points across diverse code LLMs, and extensive
+analysis demonstrates the effectiveness of our method.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Preprint</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ SqueezeLLM: Dense-and-Sparse Quantization <span class="chip">ICML 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2306.07629v4">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2306.07629v4.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Sehoon Kim, Coleman Hooper, Amir Gholami, Zhen Dong, Xiuyu Li, Sheng Shen, Michael W. Mahoney, Kurt Keutzer
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Generative Large Language Models (LLMs) have demonstrated remarkable results
+for a wide range of tasks. However, deploying these models for inference has
+been a significant challenge due to their unprecedented resource requirements.
+This has forced existing deployment frameworks to use multi-GPU inference
+pipelines, which are often complex and costly, or to use smaller and less
+performant models. In this work, we demonstrate that the main bottleneck for
+generative inference with LLMs is memory bandwidth, rather than compute,
+specifically for single batch inference. While quantization has emerged as a
+promising solution by representing weights with reduced precision, previous
+efforts have often resulted in notable performance degradation. To address
+this, we introduce SqueezeLLM, a post-training quantization framework that not
+only enables lossless compression to ultra-low precisions of up to 3-bit, but
+also achieves higher quantization performance under the same memory constraint.
+Our framework incorporates two novel ideas: (i) sensitivity-based non-uniform
+quantization, which searches for the optimal bit precision assignment based on
+second-order information; and (ii) the Dense-and-Sparse decomposition that
+stores outliers and sensitive weight values in an efficient sparse format. When
+applied to the LLaMA models, our 3-bit quantization significantly reduces the
+perplexity gap from the FP16 baseline by up to 2.1x as compared to the
+state-of-the-art methods with the same memory requirement. Furthermore, when
+deployed on an A6000 GPU, our quantized models achieve up to 2.3x speedup
+compared to the baseline. Our code is available at
+https://github.com/SqueezeAILab/SqueezeLLM.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>ICML 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ An LLM Compiler for Parallel Function Calling <span class="chip">ICML 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2312.04511v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2312.04511v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Sehoon Kim, Suhong Moon, Ryan Tabrizi, Nicholas Lee, Michael W. Mahoney, Kurt Keutzer, Amir Gholami
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  The reasoning capabilities of the recent LLMs enable them to execute external
+function calls to overcome their inherent limitations, such as knowledge
+cutoffs, poor arithmetic skills, or lack of access to private data. This
+development has allowed LLMs to select and coordinate multiple functions based
+on the context to tackle more complex problems. However, current methods for
+function calling often require sequential reasoning and acting for each
+function which can result in high latency, cost, and sometimes inaccurate
+behavior. To address this, we introduce LLMCompiler, which executes functions
+in parallel to efficiently orchestrate multiple function calls. Drawing
+inspiration from the principles of classical compilers, LLMCompiler enables
+parallel function calling with three components: (i) a Function Calling
+Planner, formulating execution plans for function calling; (ii) a Task Fetching
+Unit, dispatching function calling tasks; and (iii) an Executor, executing
+these tasks in parallel. LLMCompiler automatically generates an optimized
+orchestration for the function calls and can be used with both open-source and
+closed-source models. We have benchmarked LLMCompiler on a range of tasks with
+different patterns of function calling. We observe consistent latency speedup
+of up to 3.7x, cost savings of up to 6.7x, and accuracy improvement of up to
+~9% compared to ReAct. Our code is available at
+https://github.com/SqueezeAILab/LLMCompiler.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>ICML 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Forward-Backward Reasoning in Large Language Models for Mathematical
+  Verification <span class="chip">ACL 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2308.07758v6">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2308.07758v6.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Weisen Jiang, Han Shi, Longhui Yu, Zhengying Liu, Yu Zhang, Zhenguo Li, James T. Kwok
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Self-Consistency samples diverse reasoning chains with answers and chooses
+the final answer by majority voting. It is based on forward reasoning and
+cannot further improve performance by sampling more reasoning chains when
+saturated. To further boost performance, we introduce backward reasoning to
+verify candidate answers. Specifically, for mathematical tasks, we mask a
+number in the question and ask the LLM to answer a backward question created by
+a simple template, i.e., to predict the masked number when a candidate answer
+is provided. Instead of using forward or backward reasoning alone, we propose
+FOBAR to combine FOrward and BAckward Reasoning for verification. Extensive
+experiments on six standard mathematical data sets and three LLMs show that
+FOBAR achieves state-of-the-art performance. In particular, FOBAR outperforms
+Self-Consistency, which uses forward reasoning alone, demonstrating that
+combining forward and forward reasoning is better. In addition, FOBAR performs
+better than existing verification methods, showing the effectiveness of the
+simple template used in backward reasoning and the proposed combination.
+Extensions to non-mathematical problems are also discussed and validated
+empirically.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted by Findings of ACL 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ LaMP: When Large Language Models Meet Personalization 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2304.11406v4">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2304.11406v4.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Alireza Salemi, Sheshera Mysore, Michael Bendersky, Hamed Zamani
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  This paper highlights the importance of personalization in large language
+models and introduces the LaMP benchmark -- a novel benchmark for training and
+evaluating language models for producing personalized outputs. LaMP offers a
+comprehensive evaluation framework with diverse language tasks and multiple
+entries for each user profile. It consists of seven personalized tasks,
+spanning three text classification and four text generation tasks. We
+additionally propose two retrieval augmentation approaches that retrieve
+personal items from each user profile for personalizing language model outputs.
+To this aim, we study various retrieval models, including term matching,
+semantic matching, and time-aware methods. Extensive experiments on LaMP for
+zero-shot and fine-tuned language models demonstrate the efficacy of the
+proposed retrieval augmentation approach and highlight the impact of
+personalization in various natural language tasks.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Multimodal Reasoning with Multimodal Knowledge Graph <span class="chip">ACL 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.02030v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.02030v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Junlin Lee, Yequan Wang, Jing Li, Min Zhang
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Multimodal reasoning with large language models (LLMs) often suffers from
+hallucinations and the presence of deficient or outdated knowledge within LLMs.
+Some approaches have sought to mitigate these issues by employing textual
+knowledge graphs, but their singular modality of knowledge limits comprehensive
+cross-modal understanding. In this paper, we propose the Multimodal Reasoning
+with Multimodal Knowledge Graph (MR-MKG) method, which leverages multimodal
+knowledge graphs (MMKGs) to learn rich and semantic knowledge across
+modalities, significantly enhancing the multimodal reasoning capabilities of
+LLMs. In particular, a relation graph attention network is utilized for
+encoding MMKGs and a cross-modal alignment module is designed for optimizing
+image-text alignment. A MMKG-grounded dataset is constructed to equip LLMs with
+initial expertise in multimodal reasoning through pretraining. Remarkably,
+MR-MKG achieves superior performance while training on only a small fraction of
+parameters, approximately 2.25% of the LLM's parameter size. Experimental
+results on multimodal question answering and multimodal analogy reasoning tasks
+demonstrate that our MR-MKG method outperforms previous state-of-the-art
+models.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted by ACL 2024 (Main Conference)</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Analysis of Utterance Embeddings and Clustering Methods Related to
+  Intent Induction for Task-Oriented Dialogue 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2212.02021v5">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2212.02021v5.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Jeiyoon Park, Yoonna Jang, Chanhee Lee, Heuiseok Lim
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  The focus of this work is to investigate unsupervised approaches to overcome
+quintessential challenges in designing task-oriented dialog schema: assigning
+intent labels to each dialog turn (intent clustering) and generating a set of
+intents based on the intent clustering methods (intent induction). We postulate
+there are two salient factors for automatic induction of intents: (1)
+clustering algorithm for intent labeling and (2) user utterance embedding
+space. We compare existing off-the-shelf clustering models and embeddings based
+on DSTC11 evaluation. Our extensive experiments demonstrate that the combined
+selection of utterance embedding and clustering method in the intent induction
+task should be carefully considered. We also present that pretrained MiniLM
+with Agglomerative clustering shows significant improvement in NMI, ARI, F1,
+accuracy and example coverage in intent induction tasks. The source codes are
+available at https://github.com/Jeiyoon/dstc11-track2.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>The Eleventh Dialog System Technology Challenge (DSTC11)</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Building Efficient and Effective OpenQA Systems for Low-Resource
+  Languages 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2401.03590v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2401.03590v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Emrah Budur, Rıza Özçelik, Dilara Soylu, Omar Khattab, Tunga Güngör, Christopher Potts
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Question answering (QA) is the task of answering questions posed in natural
+language with free-form natural language answers extracted from a given
+passage. In the OpenQA variant, only a question text is given, and the system
+must retrieve relevant passages from an unstructured knowledge source and use
+them to provide answers, which is the case in the mainstream QA systems on the
+Web. QA systems currently are mostly limited to the English language due to the
+lack of large-scale labeled QA datasets in non-English languages. In this
+paper, we show that effective, low-cost OpenQA systems can be developed for
+low-resource contexts. The key ingredients are (1) weak supervision using
+machine-translated labeled datasets and (2) a relevant unstructured knowledge
+source in the target language context. Furthermore, we show that only a few
+hundred gold assessment examples are needed to reliably evaluate these systems.
+We apply our method to Turkish as a challenging case study, since English and
+Turkish are typologically very distinct and Turkish has limited resources for
+QA. We present SQuAD-TR, a machine translation of SQuAD2.0, and we build our
+OpenQA system by adapting ColBERT-QA and retraining it over Turkish resources
+and SQuAD-TR using two versions of Wikipedia dumps spanning two years. We
+obtain a performance improvement of 24-32% in the Exact Match (EM) score and
+22-29% in the F1 score compared to the BM25-based and DPR-based baseline QA
+reader models. Our results show that SQuAD-TR makes OpenQA feasible for
+Turkish, which we hope encourages researchers to build OpenQA systems in other
+low-resource languages. We make all the code, models, and the dataset publicly
+available at https://github.com/boun-tabi/SQuAD-TR.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Simul-LLM: A Framework for Exploring High-Quality Simultaneous
+  Translation with Large Language Models <span class="chip">ACL 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2312.04691v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2312.04691v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Victor Agostinelli, Max Wild, Matthew Raffel, Kazi Ahmed Asif Fuad, Lizhong Chen
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Large language models (LLMs) with billions of parameters and pretrained on
+massive amounts of data are now capable of near or better than state-of-the-art
+performance in a variety of downstream natural language processing tasks.
+Neural machine translation (NMT) is one such task that LLMs have been applied
+to with great success. However, little research has focused on applying LLMs to
+the more difficult subset of NMT called simultaneous translation (SimulMT),
+where translation begins before the entire source context is available to the
+model. In this paper, we address key challenges facing LLMs fine-tuned for
+SimulMT, validate classical SimulMT concepts and practices in the context of
+LLMs, explore adapting LLMs that are fine-tuned for NMT to the task of SimulMT,
+and introduce Simul-LLM, the first open-source fine-tuning and evaluation
+pipeline development framework for LLMs focused on SimulMT.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>ACL 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Differentially Private Knowledge Distillation via Synthetic Text
+  Generation 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2403.00932v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2403.00932v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        James Flemings, Murali Annavaram
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Large Language models (LLMs) are achieving state-of-the-art performance in
+many different downstream tasks. However, the increasing urgency of data
+privacy puts pressure on practitioners to train LLMs with Differential Privacy
+(DP) on private data. Concurrently, the exponential growth in parameter size of
+LLMs necessitates model compression before deployment of LLMs on
+resource-constrained devices or latency-sensitive applications. Differential
+privacy and model compression generally must trade off utility loss to achieve
+their objectives. Moreover, simultaneously applying both schemes can compound
+the utility degradation. To this end, we propose DistilDP: a novel
+differentially private knowledge distillation algorithm that exploits synthetic
+data generated by a differentially private teacher LLM. The knowledge of a
+teacher LLM is transferred onto the student in two ways: one way from the
+synthetic data itself -- the hard labels, and the other way by the output
+distribution of the teacher evaluated on the synthetic data -- the soft labels.
+Furthermore, if the teacher and student share a similar architectural
+structure, we can further distill knowledge by aligning the hidden
+representations between both. Our experimental results demonstrate that
+DistilDP can substantially improve the utility over existing baselines, at
+least $9.0$ PPL on the Big Patent dataset, with strong privacy parameters,
+$\epsilon=2$. These promising results progress privacy-preserving compression
+of autoregressive LLMs. Our code can be accessed here:
+https://github.com/james-flemings/dp_compress.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Patent<span class="highlight-title">GPT</span>: A Large Language Model for Intellectual Property 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2404.18255v5">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2404.18255v5.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Zilong Bai, Ruiji Zhang, Linqing Chen, Qijun Cai, Yuan Zhong, Cong Wang, Yan Fang, Jie Fang, Jing Sun, Weikuan Wang, Lizhi Zhou, Haoran Hua, Tian Qiu, Chaochao Wang, Cheng Sun, Jianping Lu, Yixin Wang, Yubin Xia, Meng Hu, Haowen Liu, Peng Xu, Licong Xu, Fu Bian, Xiaolong Gu, Lisha Zhang, Weilei Wang, Changyang Tu
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  In recent years, large language models(LLMs) have attracted significant
+attention due to their exceptional performance across a multitude of natural
+language process tasks, and have been widely applied in various fields.
+However, the application of large language models in the Intellectual Property
+(IP) domain is challenging due to the strong need for specialized knowledge,
+privacy protection, processing of extremely long text in this field. In this
+technical report, we present for the first time a low-cost, standardized
+procedure for training IP-oriented LLMs, meeting the unique requirements of the
+IP domain. Using this standard process, we have trained the PatentGPT series
+models based on open-source pretrained models. By evaluating them on the
+open-source IP-oriented benchmark MOZIP, our domain-specific LLMs outperforms
+GPT-4, indicating the effectiveness of the proposed training procedure and the
+expertise of the PatentGPT models in the IP domain. Remarkably, our model
+surpassed GPT-4 on the 2019 China Patent Agent Qualification Examination,
+scoring 65 and matching human expert levels. Additionally, the PatentGPT model,
+which utilizes the SMoE architecture, achieves performance comparable to that
+of GPT-4 in the IP domain and demonstrates a better cost-performance ratio on
+long-text tasks, potentially serving as an alternative to GPT-4 within the IP
+domain.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>19 pages, 9 figures</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Enhancing Consistency and Role-Specific Knowledge Capturing by
+  Rebuilding Fictional Character's Persona 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.19778v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.19778v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Jeiyoon Park, Chanjun Park, Heuiseok Lim
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  With the recent introduction of Assistants API, it is expected that
+document-based language models will be actively used in various domains,
+especially Role-playing. However, a key challenge lies in utilizing
+protagonist's persona: Assistants API often fails to achieve with its search
+because the information extraction part is different each time and it often
+omits important information such as protagonist's backstory or relationships.
+It is hard to maintain a consistent persona simply by using the persona
+document as input to the Assistants API. To address the challenge of achieving
+stable persona consistency, we propose CharacterGPT, a novel persona
+reconstruction framework to alleviate the shortcomings of the Assistants API.
+Our method involves Character Persona Training (CPT), an effective persona
+rebuilding process that updates the character persona by extracting the
+character's traits from given summary of the novel for each character as if the
+story in a novel progresses. In our experiments, we ask each character to take
+the Big Five Inventory personality test in various settings and analyze the
+results. To assess whether it can think outside the box, we let each character
+generate short novels. Extensive experiments and human evaluation demonstrate
+that CharacterGPT presents new possibilities for role-playing agent research.
+Code and results are available at: https://github.com/Jeiyoon/charactergpt
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>preprint</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ MultiPA: A Multi-task Speech Pronunciation Assessment Model for Open
+  Response Scenarios <span class="chip">INTERSPEECH 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2308.12490v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2308.12490v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Yu-Wen Chen, Zhou Yu, Julia Hirschberg
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Pronunciation assessment models designed for open response scenarios enable
+users to practice language skills in a manner similar to real-life
+communication. However, previous open-response pronunciation assessment models
+have predominantly focused on a single pronunciation task, such as
+sentence-level accuracy, rather than offering a comprehensive assessment in
+various aspects. We propose MultiPA, a Multitask Pronunciation Assessment model
+that provides sentence-level accuracy, fluency, prosody, and word-level
+accuracy assessment for open responses. We examined the correlation between
+different pronunciation tasks and showed the benefits of multi-task learning.
+Our model reached the state-of-the-art performance on existing in-domain data
+sets and effectively generalized to an out-of-domain dataset that we newly
+collected. The experimental results demonstrate the practical utility of our
+model in real-world applications.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>INTERSPEECH 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                    </div>
+                </details>
+            </article>
+            <article>
+                <details>
+                    <Summary>
+                        Computer Vision and Pattern Recognition <span class="chip" style="font-size: 60%">139</span>
+                    </Summary>
+                    <div class="details-content">
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Convolutional Neural Networks and Vision <span class="highlight-title">Transformer</span>s for Fashion MNIST
+  Classification: A Literature <span class="highlight-title">Review</span> 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.03478v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.03478v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Sonia Bbouzidi, Ghazala Hcini, Imen Jdey, Fadoua Drira
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Our review explores the comparative analysis between Convolutional Neural
+Networks (CNNs) and Vision Transformers (ViTs) in the domain of image
+classification, with a particular focus on clothing classification within the
+e-commerce sector. Utilizing the Fashion MNIST dataset, we delve into the
+unique attributes of CNNs and ViTs. While CNNs have long been the cornerstone
+of image classification, ViTs introduce an innovative self-attention mechanism
+enabling nuanced weighting of different input data components. Historically,
+transformers have primarily been associated with Natural Language Processing
+(NLP) tasks. Through a comprehensive examination of existing literature, our
+aim is to unveil the distinctions between ViTs and CNNs in the context of image
+classification. Our analysis meticulously scrutinizes state-of-the-art
+methodologies employing both architectures, striving to identify the factors
+influencing their performance. These factors encompass dataset characteristics,
+image dimensions, the number of target classes, hardware infrastructure, and
+the specific architectures along with their respective top results. Our key
+goal is to determine the most appropriate architecture between ViT and CNN for
+classifying images in the Fashion MNIST dataset within the e-commerce industry,
+while taking into account specific conditions and needs. We highlight the
+importance of combining these two architectures with different forms to enhance
+overall performance. By uniting these architectures, we can take advantage of
+their unique strengths, which may lead to more precise and reliable models for
+e-commerce applications. CNNs are skilled at recognizing local patterns, while
+ViTs are effective at grasping overall context, making their combination a
+promising strategy for boosting image classification performance.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ AD-H: Autonomous Driving with Hierarchical Agents 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.03474v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.03474v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Zaibin Zhang, Shiyu Tang, Yuanhang Zhang, Talas Fu, Yifan Wang, Yang Liu, Dong Wang, Jing Shao, Lijun Wang, Huchuan Lu
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Due to the impressive capabilities of multimodal large language models
+(MLLMs), recent works have focused on employing MLLM-based agents for
+autonomous driving in large-scale and dynamic environments. However, prevalent
+approaches often directly translate high-level instructions into low-level
+vehicle control signals, which deviates from the inherent language generation
+paradigm of MLLMs and fails to fully harness their emergent powers. As a
+result, the generalizability of these methods is highly restricted by
+autonomous driving datasets used during fine-tuning. To tackle this challenge,
+we propose to connect high-level instructions and low-level control signals
+with mid-level language-driven commands, which are more fine-grained than
+high-level instructions but more universal and explainable than control
+signals, and thus can effectively bridge the gap in between. We implement this
+idea through a hierarchical multi-agent driving system named AD-H, including a
+MLLM planner for high-level reasoning and a lightweight controller for
+low-level execution. The hierarchical design liberates the MLLM from low-level
+control signal decoding and therefore fully releases their emergent capability
+in high-level perception, reasoning, and planning. We build a new dataset with
+action hierarchy annotations. Comprehensive closed-loop evaluations demonstrate
+several key advantages of our proposed AD-H system. First, AD-H can notably
+outperform state-of-the-art methods in achieving exceptional driving
+performance, even exhibiting self-correction capabilities during vehicle
+operation, a scenario not encountered in the training dataset. Second, AD-H
+demonstrates superior generalization under long-horizon instructions and novel
+environmental conditions, significantly surpassing current state-of-the-art
+methods. We will make our data and code publicly accessible at
+https://github.com/zhangzaibin/AD-H
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Polarization Wavefront Lidar: Learning Large Scene Reconstruction from
+  Polarized Wavefronts <span class="chip">CVPR 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.03461v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.03461v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Dominik Scheuble, Chenyang Lei, Seung-Hwan Baek, Mario Bijelic, Felix Heide
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Lidar has become a cornerstone sensing modality for 3D vision, especially for
+large outdoor scenarios and autonomous driving. Conventional lidar sensors are
+capable of providing centimeter-accurate distance information by emitting laser
+pulses into a scene and measuring the time-of-flight (ToF) of the reflection.
+However, the polarization of the received light that depends on the surface
+orientation and material properties is usually not considered. As such, the
+polarization modality has the potential to improve scene reconstruction beyond
+distance measurements. In this work, we introduce a novel long-range
+polarization wavefront lidar sensor (PolLidar) that modulates the polarization
+of the emitted and received light. Departing from conventional lidar sensors,
+PolLidar allows access to the raw time-resolved polarimetric wavefronts. We
+leverage polarimetric wavefronts to estimate normals, distance, and material
+properties in outdoor scenarios with a novel learned reconstruction method. To
+train and evaluate the method, we introduce a simulated and real-world
+long-range dataset with paired raw lidar data, ground truth distance, and
+normal maps. We find that the proposed method improves normal and distance
+reconstruction by 53\% mean angular error and 41\% mean absolute error compared
+to existing shape-from-polarization (SfP) and ToF methods. Code and data are
+open-sourced at https://light.princeton.edu/pollidar.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted at CVPR 2024; Project Website:
+  https://light.princeton.edu/publication/pollidar</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ LW-DETR: A <span class="highlight-title">Transformer</span> Replacement to YOLO for Real-Time Detection 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.03459v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.03459v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Qiang Chen, Xiangbo Su, Xinyu Zhang, Jian Wang, Jiahui Chen, Yunpeng Shen, Chuchu Han, Ziliang Chen, Weixiang Xu, Fanrong Li, Shan Zhang, Kun Yao, Errui Ding, Gang Zhang, Jingdong Wang
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  In this paper, we present a light-weight detection transformer, LW-DETR,
+which outperforms YOLOs for real-time object detection. The architecture is a
+simple stack of a ViT encoder, a projector, and a shallow DETR decoder. Our
+approach leverages recent advanced techniques, such as training-effective
+techniques, e.g., improved loss and pretraining, and interleaved window and
+global attentions for reducing the ViT encoder complexity. We improve the ViT
+encoder by aggregating multi-level feature maps, and the intermediate and final
+feature maps in the ViT encoder, forming richer feature maps, and introduce
+window-major feature map organization for improving the efficiency of
+interleaved attention computation. Experimental results demonstrate that the
+proposed approach is superior over existing real-time detectors, e.g., YOLO and
+its variants, on COCO and other benchmark datasets. Code and models are
+available at (https://github.com/Atten4Vis/LW-DETR).
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ FILS: <span class="highlight-title">Self-Supervised</span> Video Feature Prediction In Semantic Language
+  Space 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.03447v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.03447v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Mona Ahmadian, Frank Guerin, Andrew Gilbert
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  This paper demonstrates a self-supervised approach for learning semantic
+video representations. Recent vision studies show that a masking strategy for
+vision and natural language supervision has contributed to developing
+transferable visual pretraining. Our goal is to achieve a more semantic video
+representation by leveraging the text related to the video content during the
+pretraining in a fully self-supervised manner. To this end, we present FILS, a
+novel self-supervised video Feature prediction In semantic Language Space
+(FILS). The vision model can capture valuable structured information by
+correctly predicting masked feature semantics in language space. It is learned
+using a patch-wise video-text contrastive strategy, in which the text
+representations act as prototypes for transforming vision features into a
+language space, which are then used as targets for semantically meaningful
+feature prediction using our masked encoder-decoder structure. FILS
+demonstrates remarkable transferability on downstream action recognition tasks,
+achieving state-of-the-art on challenging egocentric datasets, like
+Epic-Kitchens, Something-SomethingV2, Charades-Ego, and EGTEA, using ViT-Base.
+Our efficient method requires less computation and smaller batches compared to
+previous works.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Text-to-Events: Synthetic Event Camera Streams from Conditional Text
+  Input 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.03439v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.03439v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Joachim Ott, Zuowen Wang, Shih-Chii Liu
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Event cameras are advantageous for tasks that require vision sensors with
+low-latency and sparse output responses. However, the development of deep
+network algorithms using event cameras has been slow because of the lack of
+large labelled event camera datasets for network training. This paper reports a
+method for creating new labelled event datasets by using a text-to-X model,
+where X is one or multiple output modalities, in the case of this work, events.
+Our proposed text-to-events model produces synthetic event frames directly from
+text prompts. It uses an autoencoder which is trained to produce sparse event
+frames representing event camera outputs. By combining the pretrained
+autoencoder with a diffusion model architecture, the new text-to-events model
+is able to generate smooth synthetic event streams of moving objects. The
+autoencoder was first trained on an event camera dataset of diverse scenes. In
+the combined training with the diffusion model, the DVS gesture dataset was
+used. We demonstrate that the model can generate realistic event sequences of
+human gestures prompted by different text statements. The classification
+accuracy of the generated sequences, using a classifier trained on the real
+dataset, ranges between 42% to 92%, depending on the gesture group. The results
+demonstrate the capability of this method in synthesizing event datasets.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ CattleFace-RGBT: RGB-T Cattle Facial Landmark Benchmark 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.03431v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.03431v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Ethan Coffman, Reagan Clark, Nhat-Tan Bui, Trong Thang Pham, Beth Kegley, Jeremy G. Powell, Jiangchao Zhao, Ngan Le
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  To address this challenge, we introduce CattleFace-RGBT, a RGB-T Cattle
+Facial Landmark dataset consisting of 2,300 RGB-T image pairs, a total of 4,600
+images. Creating a landmark dataset is time-consuming, but AI-assisted
+annotation can help. However, applying AI to thermal images is challenging due
+to suboptimal results from direct thermal training and infeasible RGB-thermal
+alignment due to different camera views. Therefore, we opt to transfer models
+trained on RGB to thermal images and refine them using our AI-assisted
+annotation tool following a semi-automatic annotation approach. Accurately
+localizing facial key points on both RGB and thermal images enables us to not
+only discern the cattle's respiratory signs but also measure temperatures to
+assess the animal's thermal state. To the best of our knowledge, this is the
+first dataset for the cattle facial landmark on RGB-T images. We conduct
+benchmarking of the CattleFace-RGBT dataset across various backbone
+architectures, with the objective of establishing baselines for future
+research, analysis, and comparison. The dataset and models are at
+https://github.com/UARK-AICV/CattleFace-RGBT-benchmark
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Computation-Efficient Era: A Comprehensive <span class="highlight-title">Survey</span> of State Space Models
+  in Medical Image Analysis 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.03430v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.03430v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Moein Heidari, Sina Ghorbani Kolahi, Sanaz Karimijafarbigloo, Bobby Azad, Afshin Bozorgpour, Soheila Hatami, Reza Azad, Ali Diba, Ulas Bagci, Dorit Merhof, Ilker Hacihaliloglu
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Sequence modeling plays a vital role across various domains, with recurrent
+neural networks being historically the predominant method of performing these
+tasks. However, the emergence of transformers has altered this paradigm due to
+their superior performance. Built upon these advances, transformers have
+conjoined CNNs as two leading foundational models for learning visual
+representations. However, transformers are hindered by the $\mathcal{O}(N^2)$
+complexity of their attention mechanisms, while CNNs lack global receptive
+fields and dynamic weight allocation. State Space Models (SSMs), specifically
+the \textit{\textbf{Mamba}} model with selection mechanisms and hardware-aware
+architecture, have garnered immense interest lately in sequential modeling and
+visual representation learning, challenging the dominance of transformers by
+providing infinite context lengths and offering substantial efficiency
+maintaining linear complexity in the input sequence. Capitalizing on the
+advances in computer vision, medical imaging has heralded a new epoch with
+Mamba models. Intending to help researchers navigate the surge, this survey
+seeks to offer an encyclopedic review of Mamba models in medical imaging.
+Specifically, we start with a comprehensive theoretical review forming the
+basis of SSMs, including Mamba architecture and its alternatives for sequence
+modeling paradigms in this context. Next, we offer a structured classification
+of Mamba models in the medical field and introduce a diverse categorization
+scheme based on their application, imaging modalities, and targeted organs.
+Finally, we summarize key challenges, discuss different future research
+directions of the SSMs in the medical domain, and propose several directions to
+fulfill the demands of this field. In addition, we have compiled the studies
+discussed in this paper along with their open-source implementations on our
+GitHub repository.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>This is the first version of our survey, and the paper is currently
+  under review</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Post-hoc Part-prototype Networks <span class="chip">ICML 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.03421v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.03421v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Andong Tan, Fengtao Zhou, Hao Chen
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Post-hoc explainability methods such as Grad-CAM are popular because they do
+not influence the performance of a trained model. However, they mainly reveal
+"where" a model looks at for a given input, fail to explain "what" the model
+looks for (e.g., what is important to classify a bird image to a Scott
+Oriole?). Existing part-prototype networks leverage part-prototypes (e.g.,
+characteristic Scott Oriole's wing and head) to answer both "where" and "what",
+but often under-perform their black box counterparts in the accuracy.
+Therefore, a natural question is: can one construct a network that answers both
+"where" and "what" in a post-hoc manner to guarantee the model's performance?
+To this end, we propose the first post-hoc part-prototype network via
+decomposing the classification head of a trained model into a set of
+interpretable part-prototypes. Concretely, we propose an unsupervised prototype
+discovery and refining strategy to obtain prototypes that can precisely
+reconstruct the classification head, yet being interpretable. Besides
+guaranteeing the performance, we show that our network offers more faithful
+explanations qualitatively and yields even better part-prototypes
+quantitatively than prior part-prototype networks.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>ICML 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ CoFie: Learning Compact Neural Surface Representations with Coordinate
+  Fields 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.03417v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.03417v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Hanwen Jiang, Haitao Yang, Georgios Pavlakos, Qixing Huang
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  This paper introduces CoFie, a novel local geometry-aware neural surface
+representation. CoFie is motivated by the theoretical analysis of local SDFs
+with quadratic approximation. We find that local shapes are highly compressive
+in an aligned coordinate frame defined by the normal and tangent directions of
+local shapes. Accordingly, we introduce Coordinate Field, which is a
+composition of coordinate frames of all local shapes. The Coordinate Field is
+optimizable and is used to transform the local shapes from the world coordinate
+frame to the aligned shape coordinate frame. It largely reduces the complexity
+of local shapes and benefits the learning of MLP-based implicit
+representations. Moreover, we introduce quadratic layers into the MLP to
+enhance expressiveness concerning local shape geometry. CoFie is a
+generalizable surface representation. It is trained on a curated set of 3D
+shapes and works on novel shape instances during testing. When using the same
+amount of parameters with prior works, CoFie reduces the shape error by 48% and
+56% on novel instances of both training and unseen shape categories. Moreover,
+CoFie demonstrates comparable performance to prior works when using only 70%
+fewer parameters.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Project page: https://hwjiang1510.github.io/CoFie/</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ UnWave-Net: Unrolled Wavelet Network for Compton Tomography Image
+  Reconstruction <span class="chip">MICCAI 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.03413v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.03413v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Ishak Ayad, Cécilia Tarpau, Javier Cebeiro, Maï K. Nguyen
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Computed tomography (CT) is a widely used medical imaging technique to scan
+internal structures of a body, typically involving collimation and mechanical
+rotation. Compton scatter tomography (CST) presents an interesting alternative
+to conventional CT by leveraging Compton physics instead of collimation to
+gather information from multiple directions. While CST introduces new imaging
+opportunities with several advantages such as high sensitivity, compactness,
+and entirely fixed systems, image reconstruction remains an open problem due to
+the mathematical challenges of CST modeling. In contrast, deep unrolling
+networks have demonstrated potential in CT image reconstruction, despite their
+computationally intensive nature. In this study, we investigate the efficiency
+of unrolling networks for CST image reconstruction. To address the important
+computational cost required for training, we propose UnWave-Net, a novel
+unrolled wavelet-based reconstruction network. This architecture includes a
+non-local regularization term based on wavelets, which captures long-range
+dependencies within images and emphasizes the multi-scale components of the
+wavelet transform. We evaluate our approach using a CST of circular geometry
+which stays completely static during data acquisition, where UnWave-Net
+facilitates image reconstruction in the absence of a specific reconstruction
+formula. Our method outperforms existing approaches and achieves
+state-of-the-art performance in terms of SSIM and PSNR, and offers an improved
+computational efficiency compared to traditional unrolling networks.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>This paper has been early accepted by MICCAI 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Interactive Text-to-Image Retrieval with Large Language Models: A
+  Plug-and-Play Approach <span class="chip">ACL 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.03411v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.03411v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Saehyung Lee, Sangwon Yu, Junsung Park, Jihun Yi, Sungroh Yoon
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  In this paper, we primarily address the issue of dialogue-form context query
+within the interactive text-to-image retrieval task. Our methodology, PlugIR,
+actively utilizes the general instruction-following capability of LLMs in two
+ways. First, by reformulating the dialogue-form context, we eliminate the
+necessity of fine-tuning a retrieval model on existing visual dialogue data,
+thereby enabling the use of any arbitrary black-box model. Second, we construct
+the LLM questioner to generate non-redundant questions about the attributes of
+the target image, based on the information of retrieval candidate images in the
+current context. This approach mitigates the issues of noisiness and redundancy
+in the generated questions. Beyond our methodology, we propose a novel
+evaluation metric, Best log Rank Integral (BRI), for a comprehensive assessment
+of the interactive retrieval system. PlugIR demonstrates superior performance
+compared to both zero-shot and fine-tuned baselines in various benchmarks.
+Additionally, the two methodologies comprising PlugIR can be flexibly applied
+together or separately in various situations. Our codes are available at
+https://github.com/Saehyung-Lee/PlugIR.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>To appear in ACL 2024 Main</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Gaussian Representation for Deformable Image Registration 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.03394v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.03394v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Jihe Li, Fabian Zhang, Xia Li, Tianhao Zhang, Ye Zhang, Joachim Buhmann
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Deformable image registration (DIR) is a fundamental task in radiotherapy,
+with existing methods often struggling to balance computational efficiency,
+registration accuracy, and speed effectively. We introduce a novel DIR approach
+employing parametric 3D Gaussian control points achieving a better tradeoff. It
+provides an explicit and flexible representation for spatial deformation fields
+between 3D volumetric medical images, producing a displacement vector field
+(DVF) across all volumetric positions. The movement of individual voxels is
+derived using linear blend skinning (LBS) through localized interpolation of
+transformations associated with neighboring Gaussians. This interpolation
+strategy not only simplifies the determination of voxel motions but also acts
+as an effective regularization technique. Our approach incorporates a unified
+optimization process through backpropagation, enabling iterative learning of
+both the parameters of the 3D Gaussians and their transformations.
+Additionally, the density of Gaussians is adjusted adaptively during the
+learning phase to accommodate varying degrees of motion complexity. We
+validated our approach on the 4D-CT lung DIR-Lab and cardiac ACDC datasets,
+achieving an average target registration error (TRE) of 1.06 mm within a
+much-improved processing time of 2.43 seconds for the DIR-Lab dataset over
+existing methods, demonstrating significant advancements in both accuracy and
+efficiency.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ SelfReDepth: <span class="highlight-title">Self-Supervised</span> Real-Time Depth Restoration for
+  Consumer-Grade Sensors 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.03388v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.03388v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Alexandre Duarte, Francisco Fernandes, João M. Pereira, Catarina Moreira, Jacinto C. Nascimento, Joaquim Jorge
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Depth maps produced by consumer-grade sensors suffer from inaccurate
+measurements and missing data from either system or scene-specific sources.
+Data-driven denoising algorithms can mitigate such problems. However, they
+require vast amounts of ground truth depth data. Recent research has tackled
+this limitation using self-supervised learning techniques, but it requires
+multiple RGB-D sensors. Moreover, most existing approaches focus on denoising
+single isolated depth maps or specific subjects of interest, highlighting a
+need for methods to effectively denoise depth maps in real-time dynamic
+environments. This paper extends state-of-the-art approaches for
+depth-denoising commodity depth devices, proposing SelfReDepth, a
+self-supervised deep learning technique for depth restoration, via denoising
+and hole-filling by inpainting full-depth maps captured with RGB-D sensors. The
+algorithm targets depth data in video streams, utilizing multiple sequential
+depth frames coupled with color data to achieve high-quality depth videos with
+temporal coherence. Finally, SelfReDepth is designed to be compatible with
+various RGB-D sensors and usable in real-time scenarios as a pre-processing
+step before applying other depth-dependent algorithms. Our results demonstrate
+our approach's real-time performance on real-world datasets. They show that it
+outperforms state-of-the-art denoising and restoration performance at over
+30fps on Commercial Depth Cameras, with potential benefits for augmented and
+mixed-reality applications.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>13pp, 5 figures, 1 table</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ SuperFormer: Volumetric <span class="highlight-title">Transformer</span> Architectures for MRI
+  Super-Resolution 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.03359v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.03359v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Cristhian Forigua, Maria Escobar, Pablo Arbelaez
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  This paper presents a novel framework for processing volumetric medical
+information using Visual Transformers (ViTs). First, We extend the
+state-of-the-art Swin Transformer model to the 3D medical domain. Second, we
+propose a new approach for processing volumetric information and encoding
+position in ViTs for 3D applications. We instantiate the proposed framework and
+present SuperFormer, a volumetric transformer-based approach for Magnetic
+Resonance Imaging (MRI) Super-Resolution. Our method leverages the 3D
+information of the MRI domain and uses a local self-attention mechanism with a
+3D relative positional encoding to recover anatomical details. In addition, our
+approach takes advantage of multi-domain information from volume and feature
+domains and fuses them to reconstruct the High-Resolution MRI. We perform an
+extensive validation on the Human Connectome Project dataset and demonstrate
+the superiority of volumetric transformers over 3D CNN-based methods. Our code
+and pretrained models are available at
+https://github.com/BCV-Uniandes/SuperFormer.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ A Flexible Recursive Network for Video Stereo Matching Based on Residual
+  Estimation 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.03333v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.03333v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Youchen Zhao, Guorong Luo, Hua Zhong, Haixiong Li
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Due to the high similarity of disparity between consecutive frames in video
+sequences, the area where disparity changes is defined as the residual map,
+which can be calculated. Based on this, we propose RecSM, a network based on
+residual estimation with a flexible recursive structure for video stereo
+matching. The RecSM network accelerates stereo matching using a Multi-scale
+Residual Estimation Module (MREM), which employs the temporal context as a
+reference and rapidly calculates the disparity for the current frame by
+computing only the residual values between the current and previous frames. To
+further reduce the error of estimated disparities, we use the Disparity
+Optimization Module (DOM) and Temporal Attention Module (TAM) to enforce
+constraints between each module, and together with MREM, form a flexible
+Stackable Computation Structure (SCS), which allows for the design of different
+numbers of SCS based on practical scenarios. Experimental results demonstrate
+that with a stack count of 3, RecSM achieves a 4x speed improvement compared to
+ACVNet, running at 0.054 seconds based on one NVIDIA RTX 2080TI GPU, with an
+accuracy decrease of only 0.7%. Code is available at
+https://github.com/Y0uchenZ/RecSM.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ EngineBench: Flow Reconstruction in the Transparent Combustion Chamber
+  III Optical Engine 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.03325v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.03325v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Samuel J. Baker, Michael A. Hobley, Isabel Scherl, Xiaohang Fang, Felix C. P. Leach, Martin H. Davy
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  We present EngineBench, the first machine learning (ML) oriented database to
+use high quality experimental data for the study of turbulent flows inside
+combustion machinery. Prior datasets for ML in fluid mechanics are synthetic or
+use overly simplistic geometries. EngineBench is comprised of real-world
+particle image velocimetry (PIV) data that captures the turbulent airflow
+patterns in a specially-designed optical engine. However, in PIV data from
+internal flows, such as from engines, it is often challenging to achieve a full
+field of view and large occlusions can be present. In order to design optimal
+combustion systems, insight into the turbulent flows in these obscured areas is
+needed, which can be provided via inpainting models. Here we propose a novel
+inpainting task using random edge gaps, a technique that emphasises realism by
+introducing occlusions at random sizes and orientations at the edges of the PIV
+images. We test five ML methods on random edge gaps using pixel-wise,
+vector-based, and multi-scale performance metrics. We find that UNet-based
+models are more accurate than the industry-norm non-parametric approach and the
+context encoder at this task on both small and large gap sizes. The dataset and
+inpainting task presented in this paper support the development of more
+general-purpose pre-trained ML models for engine design problems. The method
+comparisons allow for more informed selection of ML models for problems in
+experimental flow diagnostics. All data and code are publicly available at
+https://eng.ox.ac.uk/tpsrg/research/enginebench/.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Comparative Benchmarking of Failure Detection Methods in Medical Image
+  Segmentation: Unveiling the Role of Confidence Aggregation 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.03323v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.03323v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Maximilian Zenk, David Zimmerer, Fabian Isensee, Jeremias Traub, Tobias Norajitra, Paul F. Jäger, Klaus Maier-Hein
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Semantic segmentation is an essential component of medical image analysis
+research, with recent deep learning algorithms offering out-of-the-box
+applicability across diverse datasets. Despite these advancements, segmentation
+failures remain a significant concern for real-world clinical applications,
+necessitating reliable detection mechanisms. This paper introduces a
+comprehensive benchmarking framework aimed at evaluating failure detection
+methodologies within medical image segmentation. Through our analysis, we
+identify the strengths and limitations of current failure detection metrics,
+advocating for the risk-coverage analysis as a holistic evaluation approach.
+Utilizing a collective dataset comprising five public 3D medical image
+collections, we assess the efficacy of various failure detection strategies
+under realistic test-time distribution shifts. Our findings highlight the
+importance of pixel confidence aggregation and we observe superior performance
+of the pairwise Dice score (Roy et al., 2019) between ensemble predictions,
+positioning it as a simple and robust baseline for failure detection in medical
+image segmentation. To promote ongoing research, we make the benchmarking
+framework available to the community.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>This work has been submitted for possible publication. Copyright may
+  be transferred without notice, after which this version may no longer be
+  accessible</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Learning Visual <span class="highlight-title">Prompt</span>s for Guiding the Attention of Vision <span class="highlight-title">Transformer</span>s <span class="chip">CVPR 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.03303v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.03303v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Razieh Rezaei, Masoud Jalili Sabet, Jindong Gu, Daniel Rueckert, Philip Torr, Ashkan Khakzar
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Visual prompting infuses visual information into the input image to adapt
+models toward specific predictions and tasks. Recently, manually crafted
+markers such as red circles are shown to guide the model to attend to a target
+region on the image. However, these markers only work on models trained with
+data containing those markers. Moreover, finding these prompts requires
+guesswork or prior knowledge of the domain on which the model is trained. This
+work circumvents manual design constraints by proposing to learn the visual
+prompts for guiding the attention of vision transformers. The learned visual
+prompt, added to any input image would redirect the attention of the
+pre-trained vision transformer to its spatial location on the image.
+Specifically, the prompt is learned in a self-supervised manner without
+requiring annotations and without fine-tuning the vision transformer. Our
+experiments demonstrate the effectiveness of the proposed optimization-based
+visual prompting strategy across various pre-trained vision encoders.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Short version (4-pages) accepted as a spotlight paper at T4V
+  workshop, CVPR 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ L-PR: Exploiting LiDAR Fiducial Marker for Unordered Low Overlap
+  Multiview Point Cloud Registration 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.03298v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.03298v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Yibo Liu, Jinjun Shan, Amaldev Haridevan, Shuo Zhang, Kejian Lin
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Point cloud registration is a prerequisite for many applications in computer
+vision and robotics. Most existing methods focus on pairwise registration of
+two point clouds with high overlap. Although there have been some methods for
+low overlap cases, they struggle in degraded scenarios. This paper introduces a
+novel framework named L-PR, designed to register unordered low overlap
+multiview point clouds leveraging LiDAR fiducial markers. We refer to them as
+LiDAR fiducial markers, but they are the same as the popular AprilTag and ArUco
+markers, thin sheets of paper that do not affect the 3D geometry of the
+environment. We first propose an improved adaptive threshold marker detection
+method to provide robust detection results when the viewpoints among point
+clouds change dramatically. Then, we formulate the unordered multiview point
+cloud registration problem as a maximum a-posteriori (MAP) problem and develop
+a framework consisting of two levels of graphs to address it. The first-level
+graph, constructed as a weighted graph, is designed to efficiently and
+optimally infer initial values of scan poses from the unordered set. The
+second-level graph is constructed as a factor graph. By globally optimizing the
+variables on the graph, including scan poses, marker poses, and marker corner
+positions, we tackle the MAP problem. We conduct qualitative and quantitative
+experiments to demonstrate that the proposed method exhibits superiority over
+competitors in four aspects: registration accuracy, instance reconstruction
+quality, localization accuracy, and robustness to the degraded scene. To
+benefit the community, we open-source our method and dataset at
+https://github.com/yorklyb/LiDAR-SFM.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>8 pages</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Text-to-Image Rectified Flow as Plug-and-Play Priors 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.03293v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.03293v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Xiaofeng Yang, Cheng Chen, Xulei Yang, Fayao Liu, Guosheng Lin
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Large-scale diffusion models have achieved remarkable performance in
+generative tasks. Beyond their initial training applications, these models have
+proven their ability to function as versatile plug-and-play priors. For
+instance, 2D diffusion models can serve as loss functions to optimize 3D
+implicit models. Rectified flow, a novel class of generative models, enforces a
+linear progression from the source to the target distribution and has
+demonstrated superior performance across various domains. Compared to
+diffusion-based methods, rectified flow approaches surpass in terms of
+generation quality and efficiency, requiring fewer inference steps. In this
+work, we present theoretical and experimental evidence demonstrating that
+rectified flow based methods offer similar functionalities to diffusion models
+- they can also serve as effective priors. Besides the generative capabilities
+of diffusion priors, motivated by the unique time-symmetry properties of
+rectified flow models, a variant of our method can additionally perform image
+inversion. Experimentally, our rectified flow-based priors outperform their
+diffusion counterparts - the SDS and VSD losses - in text-to-3D generation. Our
+method also displays competitive performance in image inversion and editing.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Code: https://github.com/yangxiaofeng/rectified_flow_prior</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ VWise: A novel benchmark for evaluating scene classification for
+  vehicular applications 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.03273v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.03273v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Pedro Azevedo, Emanuella Araújo, Gabriel Pierre, Willams de Lima Costa, João Marcelo Teixeira, Valter Ferreira, Roberto Jones, Veronica Teichrieb
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Current datasets for vehicular applications are mostly collected in North
+America or Europe. Models trained or evaluated on these datasets might suffer
+from geographical bias when deployed in other regions. Specifically, for scene
+classification, a highway in a Latin American country differs drastically from
+an Autobahn, for example, both in design and maintenance levels. We propose
+VWise, a novel benchmark for road-type classification and scene classification
+tasks, in addition to tasks focused on external contexts related to vehicular
+applications in LatAm. We collected over 520 video clips covering diverse urban
+and rural environments across Latin American countries, annotated with six
+classes of road types. We also evaluated several state-of-the-art
+classification models in baseline experiments, obtaining over 84% accuracy.
+With this dataset, we aim to enhance research on vehicular tasks in Latin
+America.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Image Copy-Move Forgery Detection and Localization Scheme: How to Avoid
+  Missed Detection and False Alarm 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.03271v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.03271v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Li Jiang, Zhaowei Lu, Yuebing Gao, Yifan Wang
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Image copy-move is an operation that replaces one part of the image with
+another part of the same image, which can be used for illegal purposes due to
+the potential semantic changes. Recent studies have shown that keypoint-based
+algorithms achieved excellent and robust localization performance even when
+small or smooth tampered areas were involved. However, when the input image is
+low-resolution, most existing keypoint-based algorithms are difficult to
+generate sufficient keypoints, resulting in more missed detections. In
+addition, existing algorithms are usually unable to distinguish between Similar
+but Genuine Objects (SGO) images and tampered images, resulting in more false
+alarms. This is mainly due to the lack of further verification of local
+homography matrix in forgery localization stage. To tackle these problems, this
+paper firstly proposes an excessive keypoint extraction strategy to overcome
+missed detection. Subsequently, a group matching algorithm is used to speed up
+the matching of excessive keypoints. Finally, a new iterative forgery
+localization algorithm is introduced to quickly form pixel-level localization
+results while ensuring a lower false alarm. Extensive experimental results show
+that our scheme has superior performance than state-of-the-art algorithms in
+overcoming missed detection and false alarm. Our code is available at
+https://github.com/LUZW1998/CMFDL.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Deep Generative Models for Proton Zero Degree Calorimeter Simulations in
+  ALICE, CERN 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.03263v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.03263v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Patryk Będkowski, Jan Dubiński, Kamil Deja, Przemysław Rokita
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Simulating detector responses is a crucial part of understanding the
+inner-workings of particle collisions in the Large Hadron Collider at CERN. The
+current reliance on statistical Monte-Carlo simulations strains CERN's
+computational grid, underscoring the urgency for more efficient alternatives.
+Addressing these challenges, recent proposals advocate for generative machine
+learning methods. In this study, we present an innovative deep learning
+simulation approach tailored for the proton Zero Degree Calorimeter in the
+ALICE experiment. Leveraging a Generative Adversarial Network model with
+Selective Diversity Increase loss, we directly simulate calorimeter responses.
+To enhance its capabilities in modeling a broad range of calorimeter response
+intensities, we expand the SDI-GAN architecture with additional regularization.
+Moreover, to improve the spatial fidelity of the generated data, we introduce
+an auxiliary regressor network. Our method offers a significant speedup when
+comparing to the traditional Monte-Carlo based approaches.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>8 pages, 3 figures, PP-RAI 2024 conference</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ ADer: A Comprehensive Benchmark for Multi-class Visual Anomaly Detection 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.03262v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.03262v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Jiangning Zhang, Haoyang He, Zhenye Gan, Qingdong He, Yuxuan Cai, Zhucun Xue, Yabiao Wang, Chengjie Wang, Lei Xie, Yong Liu
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Visual anomaly detection aims to identify anomalous regions in images through
+unsupervised learning paradigms, with increasing application demand and value
+in fields such as industrial inspection and medical lesion detection. Despite
+significant progress in recent years, there is a lack of comprehensive
+benchmarks to adequately evaluate the performance of various mainstream methods
+across different datasets under the practical multi-class setting. The absence
+of standardized experimental setups can lead to potential biases in training
+epochs, resolution, and metric results, resulting in erroneous conclusions.
+This paper addresses this issue by proposing a comprehensive visual anomaly
+detection benchmark, \textbf{\textit{ADer}}, which is a modular framework that
+is highly extensible for new methods. The benchmark includes multiple datasets
+from industrial and medical domains, implementing fifteen state-of-the-art
+methods and nine comprehensive metrics. Additionally, we have open-sourced the
+GPU-assisted \href{https://pypi.org/project/ADEval}{ADEval} package to address
+the slow evaluation problem of metrics like time-consuming mAU-PRO on
+large-scale data, significantly reducing evaluation time by more than
+\textit{1000-fold}. Through extensive experimental results, we objectively
+reveal the strengths and weaknesses of different methods and provide insights
+into the challenges and future directions of multi-class visual anomaly
+detection. We hope that \textbf{\textit{ADer}} will become a valuable resource
+for researchers and practitioners in the field, promoting the development of
+more robust and generalizable anomaly detection systems. Full codes have been
+attached in Appendix and open-sourced at
+\url{https://github.com/zhangzjn/ader}.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ <span class="highlight-title">Prompt</span>-based Visual Alignment for Zero-shot Policy Transfer <span class="chip">ICML2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.03250v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.03250v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Haihan Gao, Rui Zhang, Qi Yi, Hantao Yao, Haochen Li, Jiaming Guo, Shaohui Peng, Yunkai Gao, QiCheng Wang, Xing Hu, Yuanbo Wen, Zihao Zhang, Zidong Du, Ling Li, Qi Guo, Yunji Chen
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Overfitting in RL has become one of the main obstacles to applications in
+reinforcement learning(RL). Existing methods do not provide explicit semantic
+constrain for the feature extractor, hindering the agent from learning a
+unified cross-domain representation and resulting in performance degradation on
+unseen domains. Besides, abundant data from multiple domains are needed. To
+address these issues, in this work, we propose prompt-based visual alignment
+(PVA), a robust framework to mitigate the detrimental domain bias in the image
+for zero-shot policy transfer. Inspired that Visual-Language Model (VLM) can
+serve as a bridge to connect both text space and image space, we leverage the
+semantic information contained in a text sequence as an explicit constraint to
+train a visual aligner. Thus, the visual aligner can map images from multiple
+domains to a unified domain and achieve good generalization performance. To
+better depict semantic information, prompt tuning is applied to learn a
+sequence of learnable tokens. With explicit constraints of semantic
+information, PVA can learn unified cross-domain representation under limited
+access to cross-domain data and achieves great zero-shot generalization ability
+in unseen domains. We verify PVA on a vision-based autonomous driving task with
+CARLA simulator. Experiments show that the agent generalizes well on unseen
+domains under limited access to multi-domain data.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>This paper has been accepted by ICML2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Generative Diffusion Models for Fast Simulations of Particle Collisions
+  at CERN 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.03233v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.03233v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Mikołaj Kita, Jan Dubiński, Przemysław Rokita, Kamil Deja
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  In High Energy Physics simulations play a crucial role in unraveling the
+complexities of particle collision experiments within CERN's Large Hadron
+Collider. Machine learning simulation methods have garnered attention as
+promising alternatives to traditional approaches. While existing methods mainly
+employ Variational Autoencoders (VAEs) or Generative Adversarial Networks
+(GANs), recent advancements highlight the efficacy of diffusion models as
+state-of-the-art generative machine learning methods. We present the first
+simulation for Zero Degree Calorimeter (ZDC) at the ALICE experiment based on
+diffusion models, achieving the highest fidelity compared to existing
+baselines. We perform an analysis of trade-offs between generation times and
+the simulation quality. The results indicate a significant potential of latent
+diffusion model due to its rapid generation time.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Global Clipper: Enhancing Safety and Reliability of <span class="highlight-title">Transformer</span>-based
+  Object Detection Models <span class="chip">IJCAI</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.03229v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.03229v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Qutub Syed Sha, Michael Paulitsch, Karthik Pattabiraman, Korbinian Hagn, Fabian Oboril, Cornelius Buerkle, Kay-Ulrich Scholl, Gereon Hinz, Alois Knoll
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  As transformer-based object detection models progress, their impact in
+critical sectors like autonomous vehicles and aviation is expected to grow.
+Soft errors causing bit flips during inference have significantly impacted DNN
+performance, altering predictions. Traditional range restriction solutions for
+CNNs fall short for transformers. This study introduces the Global Clipper and
+Global Hybrid Clipper, effective mitigation strategies specifically designed
+for transformer-based models. It significantly enhances their resilience to
+soft errors and reduces faulty inferences to ~ 0\%. We also detail extensive
+testing across over 64 scenarios involving two transformer models (DINO-DETR
+and Lite-DETR) and two CNN models (YOLOv3 and SSD) using three datasets,
+totalling approximately 3.3 million inferences, to assess model robustness
+comprehensively. Moreover, the paper explores unique aspects of attention
+blocks in transformers and their operational differences from CNNs.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted at IJCAI-AISafety'24 Workshop</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Interactive Image Selection and Training for Brain Tumor Segmentation
+  Network 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.03225v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.03225v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Matheus A. Cerqueira, Flávia Sprenger, Bernardo C. A. Teixeira, Alexandre X. Falcão
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Medical image segmentation is a relevant problem, with deep learning being an
+exponent. However, the necessity of a high volume of fully annotated images for
+training massive models can be a problem, especially for applications whose
+images present a great diversity, such as brain tumors, which can occur in
+different sizes and shapes. In contrast, a recent methodology, Feature Learning
+from Image Markers (FLIM), has involved an expert in the learning loop,
+producing small networks that require few images to train the convolutional
+layers. In this work, We employ an interactive method for image selection and
+training based on FLIM, exploring the user's knowledge. The results
+demonstrated that with our methodology, we could choose a small set of images
+to train the encoder of a U-shaped network, obtaining performance equal to
+manual selection and even surpassing the same U-shaped network trained with
+backpropagation and all training images.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>5 pages, 4 figures, and 3 tables</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Searching Priors Makes Text-to-Video Synthesis Better 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.03215v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.03215v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Haoran Cheng, Liang Peng, Linxuan Xia, Yuepeng Hu, Hengjia Li, Qinglin Lu, Xiaofei He, Boxi Wu
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Significant advancements in video diffusion models have brought substantial
+progress to the field of text-to-video (T2V) synthesis. However, existing T2V
+synthesis model struggle to accurately generate complex motion dynamics,
+leading to a reduction in video realism. One possible solution is to collect
+massive data and train the model on it, but this would be extremely expensive.
+To alleviate this problem, in this paper, we reformulate the typical T2V
+generation process as a search-based generation pipeline. Instead of scaling up
+the model training, we employ existing videos as the motion prior database.
+Specifically, we divide T2V generation process into two steps: (i) For a given
+prompt input, we search existing text-video datasets to find videos with text
+labels that closely match the prompt motions. We propose a tailored search
+algorithm that emphasizes object motion features. (ii) Retrieved videos are
+processed and distilled into motion priors to fine-tune a pre-trained base T2V
+model, followed by generating desired videos using input prompt. By utilizing
+the priors gleaned from the searched videos, we enhance the realism of the
+generated videos' motion. All operations can be finished on a single NVIDIA RTX
+4090 GPU. We validate our method against state-of-the-art T2V models across
+diverse prompt inputs. The code will be public.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Identification of Stone Deterioration Patterns with Large Multimodal
+  Models 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.03207v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.03207v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Daniele Corradetti, Jose Delgado Rodrigues
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  The conservation of stone-based cultural heritage sites is a critical concern
+for preserving cultural and historical landmarks. With the advent of Large
+Multimodal Models, as GPT-4omni (OpenAI), Claude 3 Opus (Anthropic) and Gemini
+1.5 Pro (Google), it is becoming increasingly important to define the
+operational capabilities of these models. In this work, we systematically
+evaluate the abilities of the main foundational multimodal models to recognise
+and classify anomalies and deterioration patterns of the stone elements that
+are useful in the practice of conservation and restoration of world heritage.
+After defining a taxonomy of the main stone deterioration patterns and
+anomalies, we asked the foundational models to identify a curated selection of
+354 highly representative images of stone-built heritage, offering them a
+careful selection of labels to choose from. The result, which varies depending
+on the type of pattern, allowed us to identify the strengths and weaknesses of
+these models in the field of heritage conservation and restoration.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>10 pages, 5 figures, submitted to Journal of Cultural Heritage</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Writing Order Recovery in Complex and Long Static Handwriting 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.03194v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.03194v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Moises Diaz, Gioele Crispo, Antonio Parziale, Angelo Marcelli, Miguel A. Ferrer
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  The order in which the trajectory is executed is a powerful source of
+information for recognizers. However, there is still no general approach for
+recovering the trajectory of complex and long handwriting from static images.
+Complex specimens can result in multiple pen-downs and in a high number of
+trajectory crossings yielding agglomerations of pixels (also known as
+clusters). While the scientific literature describes a wide range of approaches
+for recovering the writing order in handwriting, these approaches nevertheless
+lack a common evaluation metric. In this paper, we introduce a new system to
+estimate the order recovery of thinned static trajectories, which allows to
+effectively resolve the clusters and select the order of the executed
+pen-downs. We evaluate how knowing the starting points of the pen-downs affects
+the quality of the recovered writing. Once the stability and sensitivity of the
+system is analyzed, we describe a series of experiments with three publicly
+available databases, showing competitive results in all cases. We expect the
+proposed system, whose code is made publicly available to the research
+community, to reduce potential confusion when the order of complex trajectories
+are recovered, and this will in turn make the trajectories recovered to be
+viable for further applications, such as velocity estimation.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Situation Monitor: Diversity-Driven Zero-Shot Out-of-Distribution
+  Detection using Budding Ensemble Architecture for Object Detection <span class="chip">CVPR</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.03188v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.03188v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Qutub Syed, Michael Paulitsch, Korbinian Hagn, Neslihan Kose Cihangir, Kay-Ulrich Scholl, Fabian Oboril, Gereon Hinz, Alois Knoll
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  We introduce Situation Monitor, a novel zero-shot Out-of-Distribution (OOD)
+detection approach for transformer-based object detection models to enhance
+reliability in safety-critical machine learning applications such as autonomous
+driving. The Situation Monitor utilizes the Diversity-based Budding Ensemble
+Architecture (DBEA) and increases the OOD performance by integrating a
+diversity loss into the training process on top of the budding ensemble
+architecture, detecting Far-OOD samples and minimizing false positives on
+Near-OOD samples. Moreover, utilizing the resulting DBEA increases the model's
+OOD performance and improves the calibration of confidence scores, particularly
+concerning the intersection over union of the detected objects. The DBEA model
+achieves these advancements with a 14% reduction in trainable parameters
+compared to the vanilla model. This signifies a substantial improvement in
+efficiency without compromising the model's ability to detect OOD instances and
+calibrate the confidence scores accurately.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Paper accepted at CVPR SAIAD Workshop</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Ouroboros3D: Image-to-3D Generation via 3D-aware Recursive Diffusion 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.03184v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.03184v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Hao Wen, Zehuan Huang, Yaohui Wang, Xinyuan Chen, Yu Qiao, Lu Sheng
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Existing single image-to-3D creation methods typically involve a two-stage
+process, first generating multi-view images, and then using these images for 3D
+reconstruction. However, training these two stages separately leads to
+significant data bias in the inference phase, thus affecting the quality of
+reconstructed results. We introduce a unified 3D generation framework, named
+Ouroboros3D, which integrates diffusion-based multi-view image generation and
+3D reconstruction into a recursive diffusion process. In our framework, these
+two modules are jointly trained through a self-conditioning mechanism, allowing
+them to adapt to each other's characteristics for robust inference. During the
+multi-view denoising process, the multi-view diffusion model uses the 3D-aware
+maps rendered by the reconstruction module at the previous timestep as
+additional conditions. The recursive diffusion framework with 3D-aware feedback
+unites the entire process and improves geometric consistency.Experiments show
+that our framework outperforms separation of these two stages and existing
+methods that combine them at the inference phase. Project page:
+https://costwen.github.io/Ouroboros3D/
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>See our project page at https://costwen.github.io/Ouroboros3D/</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Geometric Localization of Homology Cycles 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.03183v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.03183v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Amritendu Dhar, Vijay Natarajan, Abhishek Rathod
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Computing an optimal cycle in a given homology class, also referred to as the
+homology localization problem, is known to be an NP-hard problem in general.
+Furthermore, there is currently no known optimality criterion that localizes
+classes geometrically and admits a stability property under the setting of
+persistent homology. We present a geometric optimization of the cycles that is
+computable in polynomial time and is stable in an approximate sense. Tailoring
+our search criterion to different settings, we obtain various optimization
+problems like optimal homologous cycle, minimum homology basis, and minimum
+persistent homology basis. In practice, the (trivial) exact algorithm is
+computationally expensive despite having a worst case polynomial runtime.
+Therefore, we design approximation algorithms for the above problems and study
+their performance experimentally. These algorithms have reasonable runtimes for
+moderate sized datasets and the cycles computed by these algorithms are
+consistently of high quality as demonstrated via experiments on multiple
+datasets.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>To Appear in CCCG 2024 : Proc. 36th Canadian Conference on
+  Computational Geometry</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ FAPNet: An Effective Frequency Adaptive Point-based Eye Tracker <span class="chip">CVPR</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.03177v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.03177v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Xiaopeng Lin, Hongwei Ren, Bojun Cheng
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Eye tracking is crucial for human-computer interaction in different domains.
+Conventional cameras encounter challenges such as power consumption and image
+quality during different eye movements, prompting the need for advanced
+solutions with ultra-fast, low-power, and accurate eye trackers. Event cameras,
+fundamentally designed to capture information about moving objects, exhibit low
+power consumption and high temporal resolution. This positions them as an
+alternative to traditional cameras in the realm of eye tracking. Nevertheless,
+existing event-based eye tracking networks neglect the pivotal sparse and
+fine-grained temporal information in events, resulting in unsatisfactory
+performance. Moreover, the energy-efficient features are further compromised by
+the use of excessively complex models, hindering efficient deployment on edge
+devices. In this paper, we utilize Point Cloud as the event representation to
+harness the high temporal resolution and sparse characteristics of events in
+eye tracking tasks. We rethink the point-based architecture PEPNet with
+preprocessing the long-term relationships between samples, leading to the
+innovative design of FAPNet. A frequency adaptive mechanism is designed to
+realize adaptive tracking according to the speed of the pupil movement and the
+Inter Sample LSTM module is introduced to utilize the temporal correlation
+between samples. In the Event-based Eye Tracking Challenge, we utilize vanilla
+PEPNet, which is the former work to achieve the $p_{10}$ accuracy of 97.95\%.
+On the SEET synthetic dataset, FAPNet can achieve state-of-the-art while
+consuming merely 10\% of the PEPNet's computational resources. Notably, the
+computational demand of FAPNet is independent of the sensor's spatial
+resolution, enhancing its applicability on resource-limited edge devices.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted by CVPRW 2024 (AIS)</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ MMCL: Boosting Deformable DETR-Based Detectors with Multi-Class
+  Min-Margin Contrastive Learning for Superior Prohibited Item Detection 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.03176v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.03176v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Mingyuan Li, Tong Jia, Hui Lu, Bowen Ma, Hao Wang, Dongyue Chen
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Prohibited Item detection in X-ray images is one of the most effective
+security inspection methods.However, differing from natural light images, the
+unique overlapping phenomena in X-ray images lead to the coupling of foreground
+and background features, thereby lowering the accuracy of general object
+detectors.Therefore, we propose a Multi-Class Min-Margin Contrastive Learning
+(MMCL) method that, by clarifying the category semantic information of content
+queries under the deformable DETR architecture, aids the model in extracting
+specific category foreground information from coupled features.Specifically,
+after grouping content queries by the number of categories, we employ the
+Multi-Class Inter-Class Exclusion (MIE) loss to push apart content queries from
+different groups. Concurrently, the Intra-Class Min-Margin Clustering (IMC)
+loss is utilized to attract content queries within the same group, while
+ensuring the preservation of necessary disparity. As training, the inherent
+Hungarian matching of the model progressively strengthens the alignment between
+each group of queries and the semantic features of their corresponding category
+of objects. This evolving coherence ensures a deep-seated grasp of category
+characteristics, consequently bolstering the anti-overlapping detection
+capabilities of models.MMCL is versatile and can be easily plugged into any
+deformable DETR-based model with dozens of lines of code. Extensive experiments
+on the PIXray and OPIXray datasets demonstrate that MMCL significantly enhances
+the performance of various state-of-the-art models without increasing
+complexity. The code has been released at
+https://github.com/anonymity0403/MMCL.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>14 pages, 6 figures</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Dynamic 3D Gaussian Fields for Urban Areas 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.03175v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.03175v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Tobias Fischer, Jonas Kulhanek, Samuel Rota Bulò, Lorenzo Porzi, Marc Pollefeys, Peter Kontschieder
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  We present an efficient neural 3D scene representation for novel-view
+synthesis (NVS) in large-scale, dynamic urban areas. Existing works are not
+well suited for applications like mixed-reality or closed-loop simulation due
+to their limited visual quality and non-interactive rendering speeds. Recently,
+rasterization-based approaches have achieved high-quality NVS at impressive
+speeds. However, these methods are limited to small-scale, homogeneous data,
+i.e. they cannot handle severe appearance and geometry variations due to
+weather, season, and lighting and do not scale to larger, dynamic areas with
+thousands of images. We propose 4DGF, a neural scene representation that scales
+to large-scale dynamic urban areas, handles heterogeneous input data, and
+substantially improves rendering speeds. We use 3D Gaussians as an efficient
+geometry scaffold while relying on neural fields as a compact and flexible
+appearance model. We integrate scene dynamics via a scene graph at global scale
+while modeling articulated motions on a local level via deformations. This
+decomposed approach enables flexible scene composition suitable for real-world
+applications. In experiments, we surpass the state-of-the-art by over 3 dB in
+PSNR and more than 200 times in rendering speed.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Project page is available at https://tobiasfshr.github.io/pub/4dgf/</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Multi-Task Multi-Scale Contrastive Knowledge Distillation for Efficient
+  Medical Image Segmentation 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.03173v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.03173v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Risab Biswas
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  This thesis aims to investigate the feasibility of knowledge transfer between
+neural networks for medical image segmentation tasks, specifically focusing on
+the transfer from a larger multi-task "Teacher" network to a smaller "Student"
+network. In the context of medical imaging, where the data volumes are often
+limited, leveraging knowledge from a larger pre-trained network could be
+useful. The primary objective is to enhance the performance of a smaller
+student model by incorporating knowledge representations acquired by a teacher
+model that adopts a multi-task pre-trained architecture trained on CT images,
+to a more resource-efficient student network, which can essentially be a
+smaller version of the same, trained on a mere 50% of the data than that of the
+teacher model.
+  To facilitate knowledge transfer between the two models, we devised an
+architecture incorporating multi-scale feature distillation and supervised
+contrastive learning. Our study aims to improve the student model's performance
+by integrating knowledge representations from the teacher model. We investigate
+whether this approach is particularly effective in scenarios with limited
+computational resources and limited training data availability. To assess the
+impact of multi-scale feature distillation, we conducted extensive experiments.
+We also conducted a detailed ablation study to determine whether it is
+essential to distil knowledge at various scales, including low-level features
+from encoder layers, for effective knowledge transfer. In addition, we examine
+different losses in the knowledge distillation process to gain insights into
+their effects on overall performance.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Master's thesis</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Sample-specific Masks for Visual Reprogramming-based <span class="highlight-title">Prompt</span>ing 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.03150v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.03150v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Chengyi Cai, Zesheng Ye, Lei Feng, Jianzhong Qi, Feng Liu
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Visual reprogramming (VR) is a prompting technique that aims to re-purpose a
+pre-trained model (e.g., a classifier on ImageNet) to target tasks (e.g.,
+medical data prediction) by learning a small-scale pattern added into input
+images instead of tuning considerable parameters within the model. The location
+of the pattern within input samples is usually determined by a pre-defined mask
+shared across all samples. In this paper, we show that the shared mask
+potentially limits VR's generalization and increases its approximation error
+due to the lack of sample-level adaptation. Motivated by this finding, we
+design a new framework for VR called sample-specific multi-channel masks (SMM).
+Specifically, SMM employs a lightweight ConvNet and patch-wise interpolation to
+generate sample-specific three-channel masks instead of a shared and
+pre-defined mask. Since we generate different masks for individual samples, SMM
+is theoretically shown to reduce approximation error for the target tasks
+compared with existing state-of-the-art VR methods. We also empirically
+demonstrate its performance gain on both ResNet and ViT. The success of SMM
+further highlights the broader applicability of VR in leveraging the latent
+knowledge of pre-trained models for various target tasks. Our code is available
+at https://github.com/tmlr-group/SMM.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Tiny models from tiny data: Textual and null-text inversion for few-shot
+  distillation 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.03146v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.03146v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Erik Landolsi, Fredrik Kahl
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Few-shot image classification involves classifying images using very few
+training examples. Recent vision foundation models show excellent few-shot
+transfer abilities, but are large and slow at inference. Using knowledge
+distillation, the capabilities of high-performing but slow models can be
+transferred to tiny, efficient models. However, common distillation methods
+require a large set of unlabeled data, which is not available in the few-shot
+setting. To overcome this lack of data, there has been a recent interest in
+using synthetic data.
+  We expand on this work by presenting a novel diffusion model inversion
+technique (TINT) combining the diversity of textual inversion with the
+specificity of null-text inversion. Using this method in a few-shot
+distillation pipeline leads to state-of-the-art accuracy among small student
+models on popular benchmarks, while being significantly faster than prior work.
+This allows us to push even tiny models to high accuracy using only a tiny
+application-specific dataset, albeit relying on extra data for pre-training.
+  Popular few-shot benchmarks involve evaluation over a large number of
+episodes, which is computationally cumbersome for methods involving synthetic
+data generation. Therefore, we also present a theoretical analysis on how the
+variance of the accuracy estimator depends on the number of episodes and query
+examples, and use these results to lower the computational effort required for
+method evaluation. In addition, to further motivate the use of generative
+models in few-shot distillation, we demonstrate that our method performs better
+compared to training on real data mined from the dataset used to train the
+diffusion model.
+  Source code will be made available at https://github.com/pixwse/tiny2.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>21 pages (9 main pages + references and appendix)</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ ZeroPur: Succinct Training-Free Adversarial Purification 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.03143v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.03143v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Xiuli Bi, Zonglin Yang, Bo Liu, Xiaodong Cun, Chi-Man Pun, Pietro Lio, Bin Xiao
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Adversarial purification is a kind of defense technique that can defend
+various unseen adversarial attacks without modifying the victim classifier.
+Existing methods often depend on external generative models or cooperation
+between auxiliary functions and victim classifiers. However, retraining
+generative models, auxiliary functions, or victim classifiers relies on the
+domain of the fine-tuned dataset and is computation-consuming. In this work, we
+suppose that adversarial images are outliers of the natural image manifold and
+the purification process can be considered as returning them to this manifold.
+Following this assumption, we present a simple adversarial purification method
+without further training to purify adversarial images, called ZeroPur. ZeroPur
+contains two steps: given an adversarial example, Guided Shift obtains the
+shifted embedding of the adversarial example by the guidance of its blurred
+counterparts; after that, Adaptive Projection constructs a directional vector
+by this shifted embedding to provide momentum, projecting adversarial images
+onto the manifold adaptively. ZeroPur is independent of external models and
+requires no retraining of victim classifiers or auxiliary functions, relying
+solely on victim classifiers themselves to achieve purification. Extensive
+experiments on three datasets (CIFAR-10, CIFAR-100, and ImageNet-1K) using
+various classifier architectures (ResNet, WideResNet) demonstrate that our
+method achieves state-of-the-art robust performance. The code will be publicly
+available.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>16 pages, 5 figures, under review</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Enhanced Automotive Object Detection via RGB-D Fusion in a DiffusionDet
+  Framework 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.03129v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.03129v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Eliraz Orfaig, Inna Stainvas, Igal Bilik
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Vision-based autonomous driving requires reliable and efficient object
+detection. This work proposes a DiffusionDet-based framework that exploits data
+fusion from the monocular camera and depth sensor to provide the RGB and depth
+(RGB-D) data. Within this framework, ground truth bounding boxes are randomly
+reshaped as part of the training phase, allowing the model to learn the reverse
+diffusion process of noise addition. The system methodically enhances a
+randomly generated set of boxes at the inference stage, guiding them toward
+accurate final detections. By integrating the textural and color features from
+RGB images with the spatial depth information from the LiDAR sensors, the
+proposed framework employs a feature fusion that substantially enhances object
+detection of automotive targets. The $2.3$ AP gain in detecting automotive
+targets is achieved through comprehensive experiments using the KITTI dataset.
+Specifically, the improved performance of the proposed approach in detecting
+small objects is demonstrated.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ VQUNet: Vector Quantization U-Net for Defending Adversarial Atacks by
+  Regularizing Unwanted Noise 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.03117v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.03117v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Zhixun He, Mukesh Singhal
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Deep Neural Networks (DNN) have become a promising paradigm when developing
+Artificial Intelligence (AI) and Machine Learning (ML) applications. However,
+DNN applications are vulnerable to fake data that are crafted with adversarial
+attack algorithms. Under adversarial attacks, the prediction accuracy of DNN
+applications suffers, making them unreliable. In order to defend against
+adversarial attacks, we introduce a novel noise-reduction procedure, Vector
+Quantization U-Net (VQUNet), to reduce adversarial noise and reconstruct data
+with high fidelity. VQUNet features a discrete latent representation learning
+through a multi-scale hierarchical structure for both noise reduction and data
+reconstruction. The empirical experiments show that the proposed VQUNet
+provides better robustness to the target DNN models, and it outperforms other
+state-of-the-art noise-reduction-based defense methods under various
+adversarial attacks for both Fashion-MNIST and CIFAR10 datasets. When there is
+no adversarial attack, the defense method has less than 1% accuracy degradation
+for both datasets.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>8 pages, 6 figures</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Enhancing 3D Lane Detection and Topology Reasoning with 2D Lane Priors 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.03105v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.03105v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Han Li, Zehao Huang, Zitian Wang, Wenge Rong, Naiyan Wang, Si Liu
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  3D lane detection and topology reasoning are essential tasks in autonomous
+driving scenarios, requiring not only detecting the accurate 3D coordinates on
+lane lines, but also reasoning the relationship between lanes and traffic
+elements. Current vision-based methods, whether explicitly constructing BEV
+features or not, all establish the lane anchors/queries in 3D space while
+ignoring the 2D lane priors. In this study, we propose Topo2D, a novel
+framework based on Transformer, leveraging 2D lane instances to initialize 3D
+queries and 3D positional embeddings. Furthermore, we explicitly incorporate 2D
+lane features into the recognition of topology relationships among lane
+centerlines and between lane centerlines and traffic elements. Topo2D achieves
+44.5% OLS on multi-view topology reasoning benchmark OpenLane-V2 and 62.6%
+F-Socre on single-view 3D lane detection benchmark OpenLane, exceeding the
+performance of existing state-of-the-art methods.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>20 pages, 9 figures, 6 tables</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ EpidermaQuant: Unsupervised detection and quantification of epidermal
+  differentiation markers on H-DAB-stained images of reconstructed human
+  epidermis 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.03103v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.03103v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Dawid Zamojski, Agnieszka Gogler, Dorota Scieglinska, Michal Marczyk
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  The integrity of the reconstructed human epidermis generated in vitro could
+be assessed using histological analyses combined with immunohistochemical
+staining of keratinocyte differentiation markers. Computer-based analysis of
+scanned tissue saves the expert time and may improve the accuracy of
+quantification by eliminating interrater reliability issues. However, technical
+differences during the preparation and capture of stained images and the
+presence of multiple artifacts may influence the outcome of computational
+methods. Using a dataset with 598 unannotated images showing cross-sections of
+in vitro reconstructed human epidermis stained with DAB-based
+immunohistochemistry reaction to visualize 4 different keratinocyte
+differentiation marker proteins (filaggrin, keratin 10, Ki67, HSPA2) and
+counterstained with hematoxylin, we developed an unsupervised method for the
+detection and quantification of immunohistochemical staining. The proposed
+pipeline includes the following steps: (i) color normalization to reduce the
+variability of pixel intensity values in different samples; (ii) color
+deconvolution to acquire color channels of the stains used; (iii) morphological
+operations to find the background area of the image; (iv) automatic image
+rotation; and (v) finding markers of human epidermal differentiation with
+clustering. Also, we created a method to exclude images without DAB-stained
+areas. The most effective combination of methods includes: (i) Reinhard's
+normalization; (ii) Ruifrok and Johnston color deconvolution method; (iii)
+proposed image rotation method based on boundary distribution of image
+intensity; (iv) k-means clustering using DAB stain intensity. These results
+should enhance the performance of quantitative analysis of protein markers in
+reconstructed human epidermis samples and enable comparison of their spatial
+distribution between different experimental conditions.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ EgoSurgery-Tool: A <span class="highlight-title">Dataset</span> of Surgical Tool and Hand Detection from
+  Egocentric Open Surgery Videos 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.03095v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.03095v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Ryo Fujii, Hideo Saito, Hiroyuki Kajita
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Surgical tool detection is a fundamental task for understanding egocentric
+open surgery videos. However, detecting surgical tools presents significant
+challenges due to their highly imbalanced class distribution, similar shapes
+and similar textures, and heavy occlusion. The lack of a comprehensive
+large-scale dataset compounds these challenges. In this paper, we introduce
+EgoSurgery-Tool, an extension of the existing EgoSurgery-Phase dataset, which
+contains real open surgery videos captured using an egocentric camera attached
+to the surgeon's head, along with phase annotations. EgoSurgery-Tool has been
+densely annotated with surgical tools and comprises over 49K surgical tool
+bounding boxes across 15 categories, constituting a large-scale surgical tool
+detection dataset. EgoSurgery-Tool also provides annotations for hand detection
+with over 46K hand-bounding boxes, capturing hand-object interactions that are
+crucial for understanding activities in egocentric open surgery.
+EgoSurgery-Tool is superior to existing datasets due to its larger scale,
+greater variety of surgical tools, more annotations, and denser scenes. We
+conduct a comprehensive analysis of EgoSurgery-Tool using nine popular object
+detectors to assess their effectiveness in both surgical tool and hand
+detection. The dataset will be released at
+https://github.com/Fujiry0/EgoSurgery.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Lossless Image Compression Using Multi-level Dictionaries: Binary Images 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.03087v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.03087v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Samar Agnihotri, Renu Rameshan, Ritwik Ghosal
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Lossless image compression is required in various applications to reduce
+storage or transmission costs of images, while requiring the reconstructed
+images to have zero information loss compared to the original. Existing
+lossless image compression methods either have simple design but poor
+compression performance, or complex design, better performance, but with no
+performance guarantees. In our endeavor to develop a lossless image compression
+method with low complexity and guaranteed performance, we argue that
+compressibility of a color image is essentially derived from the patterns in
+its spatial structure, intensity variations, and color variations. Thus, we
+divide the overall design of a lossless image compression scheme into three
+parts that exploit corresponding redundancies. We further argue that the
+binarized version of an image captures its fundamental spatial structure and in
+this work, we propose a scheme for lossless compression of binary images.
+  The proposed scheme first learns dictionaries of $16\times16$, $8\times8$,
+$4\times4$, and $2\times 2$ square pixel patterns from various datasets of
+binary images. It then uses these dictionaries to encode binary images. These
+dictionaries have various interesting properties that are further exploited to
+construct an efficient scheme. Our preliminary results show that the proposed
+scheme consistently outperforms existing conventional and learning based
+lossless compression approaches, and provides, on average, as much as
+$1.5\times$ better performance than a common general purpose lossless
+compression scheme (WebP), more than $3\times$ better performance than a state
+of the art learning based scheme, and better performance than a specialized
+scheme for binary image compression (JBIG2).
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>11 pages, 7 figures, and 5 tables</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Exploiting LMM-based knowledge for image classification tasks <span class="chip">AAAI 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.03071v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.03071v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Maria Tzelepi, Vasileios Mezaris
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  In this paper we address image classification tasks leveraging knowledge
+encoded in Large Multimodal Models (LMMs). More specifically, we use the
+MiniGPT-4 model to extract semantic descriptions for the images, in a
+multimodal prompting fashion. In the current literature, vision language models
+such as CLIP, among other approaches, are utilized as feature extractors, using
+only the image encoder, for solving image classification tasks. In this paper,
+we propose to additionally use the text encoder to obtain the text embeddings
+corresponding to the MiniGPT-4-generated semantic descriptions. Thus, we use
+both the image and text embeddings for solving the image classification task.
+The experimental evaluation on three datasets validates the improved
+classification performance achieved by exploiting LMM-based knowledge.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted for publication, 25th Int. Conf. on Engineering Applications
+  of Neural Networks (EANN/EAAAI 2024), Corfu, Greece, June 2024. This is the
+  "submitted manuscript"</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ A-Bench: Are LMMs Masters at Evaluating AI-generated Images? 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.03070v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.03070v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Zicheng Zhang, Haoning Wu, Chunyi Li, Yingjie Zhou, Wei Sun, Xiongkuo Min, Zijian Chen, Xiaohong Liu, Weisi Lin, Guangtao Zhai
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  How to accurately and efficiently assess AI-generated images (AIGIs) remains
+a critical challenge for generative models. Given the high costs and extensive
+time commitments required for user studies, many researchers have turned
+towards employing large multi-modal models (LMMs) as AIGI evaluators, the
+precision and validity of which are still questionable. Furthermore,
+traditional benchmarks often utilize mostly natural-captured content rather
+than AIGIs to test the abilities of LMMs, leading to a noticeable gap for
+AIGIs. Therefore, we introduce A-Bench in this paper, a benchmark designed to
+diagnose whether LMMs are masters at evaluating AIGIs. Specifically, A-Bench is
+organized under two key principles: 1) Emphasizing both high-level semantic
+understanding and low-level visual quality perception to address the intricate
+demands of AIGIs. 2) Various generative models are utilized for AIGI creation,
+and various LMMs are employed for evaluation, which ensures a comprehensive
+validation scope. Ultimately, 2,864 AIGIs from 16 text-to-image models are
+sampled, each paired with question-answers annotated by human experts, and
+tested across 18 leading LMMs. We hope that A-Bench will significantly enhance
+the evaluation process and promote the generation quality for AIGIs. The
+benchmark is available at https://github.com/Q-Future/A-Bench.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Decision Boundary-aware Knowledge Consolidation Generates Better
+  Instance-Incremental Learner 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.03065v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.03065v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Qiang Nie, Weifu Fu, Yuhuan Lin, Jialin Li, Yifeng Zhou, Yong Liu, Lei Zhu, Chengjie Wang
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Instance-incremental learning (IIL) focuses on learning continually with data
+of the same classes. Compared to class-incremental learning (CIL), the IIL is
+seldom explored because IIL suffers less from catastrophic forgetting (CF).
+However, besides retaining knowledge, in real-world deployment scenarios where
+the class space is always predefined, continual and cost-effective model
+promotion with the potential unavailability of previous data is a more
+essential demand. Therefore, we first define a new and more practical IIL
+setting as promoting the model's performance besides resisting CF with only new
+observations. Two issues have to be tackled in the new IIL setting: 1) the
+notorious catastrophic forgetting because of no access to old data, and 2)
+broadening the existing decision boundary to new observations because of
+concept drift. To tackle these problems, our key insight is to moderately
+broaden the decision boundary to fail cases while retain old boundary. Hence,
+we propose a novel decision boundary-aware distillation method with
+consolidating knowledge to teacher to ease the student learning new knowledge.
+We also establish the benchmarks on existing datasets Cifar-100 and ImageNet.
+Notably, extensive experiments demonstrate that the teacher model can be a
+better incremental learner than the student model, which overturns previous
+knowledge distillation-based methods treating student as the main role.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>14 pages</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Adapter-X: A Novel General Parameter-Efficient Fine-Tuning Framework for
+  Vision 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.03051v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.03051v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Minglei Li, Peng Ye, Yongqi Huang, Lin Zhang, Tao Chen, Tong He, Jiayuan Fan, Wanli Ouyang
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Parameter-efficient fine-tuning (PEFT) has become increasingly important as
+foundation models continue to grow in both popularity and size. Adapter has
+been particularly well-received due to their potential for parameter reduction
+and adaptability across diverse tasks. However, striking a balance between high
+efficiency and robust generalization across tasks remains a challenge for
+adapter-based methods. We analyze existing methods and find that: 1) parameter
+sharing is the key to reducing redundancy; 2) more tunable parameters, dynamic
+allocation, and block-specific design are keys to improving performance.
+Unfortunately, no previous work considers all these factors. Inspired by this
+insight, we introduce a novel framework named Adapter-X. First, a Sharing
+Mixture of Adapters (SMoA) module is proposed to fulfill token-level dynamic
+allocation, increased tunable parameters, and inter-block sharing at the same
+time. Second, some block-specific designs like Prompt Generator (PG) are
+introduced to further enhance the ability of adaptation. Extensive experiments
+across 2D image and 3D point cloud modalities demonstrate that Adapter-X
+represents a significant milestone as it is the first to outperform full
+fine-tuning in both 2D image and 3D point cloud modalities with significantly
+fewer parameters, i.e., only 0.20% and 1.88% of original trainable parameters
+for 2D and 3D classification tasks. Our code will be publicly available.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Giving each task what it needs -- leveraging structured sparsity for
+  tailored multi-task learning 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.03048v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.03048v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Richa Upadhyay, Ronald Phlypo, Rajkumar Saini, Marcus Liwicki
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Every task demands distinct feature representations, ranging from low-level
+to high-level attributes, so it is vital to address the specific needs of each
+task, especially in the Multi-task Learning (MTL) framework. This work,
+therefore, introduces Layer-Optimized Multi-Task (LOMT) models that utilize
+structured sparsity to refine feature selection for individual tasks and
+enhance the performance of all tasks in a multi-task scenario. Structured or
+group sparsity systematically eliminates parameters from trivial channels and,
+eventually, entire layers within a convolution neural network during training.
+Consequently, the remaining layers provide the most optimal features for a
+given task. In this two-step approach, we subsequently leverage this
+sparsity-induced optimal layer information to build the LOMT models by
+connecting task-specific decoders to these strategically identified layers,
+deviating from conventional approaches that uniformly connect decoders at the
+end of the network. This tailored architecture optimizes the network, focusing
+on essential features while reducing redundancy. We validate the efficacy of
+the proposed approach on two datasets, ie NYU-v2 and CelebAMask-HD datasets,
+for multiple heterogeneous tasks. A detailed performance analysis of the LOMT
+models, in contrast to the conventional MTL models, reveals that the LOMT
+models outperform for most task combinations. The excellent qualitative and
+quantitative outcomes highlight the effectiveness of employing structured
+sparsity for optimal layer (or feature) selection.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Follow-Your-Pose v2: Multiple-Condition Guided Character Image Animation
+  for Stable Pose Control 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.03035v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.03035v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Jingyun Xue, Hongfa Wang, Qi Tian, Yue Ma, Andong Wang, Zhiyuan Zhao, Shaobo Min, Wenzhe Zhao, Kaihao Zhang, Heung-Yeung Shum, Wei Liu, Mengyang Liu, Wenhan Luo
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Pose-controllable character video generation is in high demand with extensive
+applications for fields such as automatic advertising and content creation on
+social media platforms. While existing character image animation methods using
+pose sequences and reference images have shown promising performance, they tend
+to struggle with incoherent animation in complex scenarios, such as multiple
+character animation and body occlusion. Additionally, current methods request
+large-scale high-quality videos with stable backgrounds and temporal
+consistency as training datasets, otherwise, their performance will greatly
+deteriorate. These two issues hinder the practical utilization of character
+image animation tools. In this paper, we propose a practical and robust
+framework Follow-Your-Pose v2, which can be trained on noisy open-sourced
+videos readily available on the internet. Multi-condition guiders are designed
+to address the challenges of background stability, body occlusion in
+multi-character generation, and consistency of character appearance. Moreover,
+to fill the gap of fair evaluation of multi-character pose animation, we
+propose a new benchmark comprising approximately 4,000 frames. Extensive
+experiments demonstrate that our approach outperforms state-of-the-art methods
+by a margin of over 35\% across 2 datasets and on 7 metrics. Meanwhile,
+qualitative assessments reveal a significant improvement in the quality of
+generated video, particularly in scenarios involving complex backgrounds and
+body occlusion of multi-character, suggesting the superiority of our approach.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Instructing <span class="highlight-title">Prompt</span>-to-<span class="highlight-title">Prompt</span> Generation for Zero-Shot Learning 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.03032v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.03032v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Man Liu, Huihui Bai, Feng Li, Chunjie Zhang, Yunchao Wei, Meng Wang, Tat-Seng Chua, Yao Zhao
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Zero-shot learning (ZSL) aims to explore the semantic-visual interactions to
+discover comprehensive knowledge transferred from seen categories to classify
+unseen categories. Recently, prompt engineering has emerged in ZSL,
+demonstrating impressive potential as it enables the zero-shot transfer of
+diverse visual concepts to downstream tasks. However, these methods are still
+not well generalized to broad unseen domains. A key reason is that the fixed
+adaption of learnable prompts on seen domains makes it tend to over-emphasize
+the primary visual features observed during training. In this work, we propose
+a \textbf{P}rompt-to-\textbf{P}rompt generation methodology (\textbf{P2P}),
+which addresses this issue by further embracing the instruction-following
+technique to distill instructive visual prompts for comprehensive transferable
+knowledge discovery. The core of P2P is to mine semantic-related instruction
+from prompt-conditioned visual features and text instruction on modal-sharing
+semantic concepts and then inversely rectify the visual representations with
+the guidance of the learned instruction prompts. This enforces the compensation
+for missing visual details to primary contexts and further eliminates the
+cross-modal disparity, endowing unseen domain generalization. Through extensive
+experimental results, we demonstrate the efficacy of P2P in achieving superior
+performance over state-of-the-art methods.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Puzzle Pieces Picker: Deciphering Ancient Chinese Characters with
+  Radical Reconstruction <span class="chip">ICDAR 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.03019v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.03019v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Pengjie Wang, Kaile Zhang, Xinyu Wang, Shengwei Han, Yongge Liu, Lianwen Jin, Xiang Bai, Yuliang Liu
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Oracle Bone Inscriptions is one of the oldest existing forms of writing in
+the world. However, due to the great antiquity of the era, a large number of
+Oracle Bone Inscriptions (OBI) remain undeciphered, making it one of the global
+challenges in the field of paleography today. This paper introduces a novel
+approach, namely Puzzle Pieces Picker (P$^3$), to decipher these enigmatic
+characters through radical reconstruction. We deconstruct OBI into foundational
+strokes and radicals, then employ a Transformer model to reconstruct them into
+their modern (conterpart)\textcolor{blue}{counterparts}, offering a
+groundbreaking solution to ancient script analysis. To further this endeavor, a
+new Ancient Chinese Character Puzzles (ACCP) dataset was developed, comprising
+an extensive collection of character images from seven key historical stages,
+annotated with detailed radical sequences. The experiments have showcased
+considerable promising insights, underscoring the potential and effectiveness
+of our approach in deciphering the intricacies of ancient Chinese scripts.
+Through this novel dataset and methodology, we aim to bridge the gap between
+traditional philology and modern document analysis techniques, offering new
+insights into the rich history of Chinese linguistic heritage.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>ICDAR 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ DifAttack++: Query-Efficient Black-Box Adversarial Attack via
+  Hierarchical Disentangled Feature Space in Cross Domain 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.03017v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.03017v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Jun Liu, Jiantao Zhou, Jiandian Zeng, Jinyu Tian
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  This work investigates efficient score-based black-box adversarial attacks
+with a high Attack Success Rate (ASR) and good generalizability. We design a
+novel attack method based on a \textit{Hierarchical} \textbf{Di}sentangled
+\textbf{F}eature space and \textit{cross domain}, called \textbf{DifAttack++},
+which differs significantly from the existing ones operating over the entire
+feature space. Specifically, DifAttack++ firstly disentangles an image's latent
+feature into an \textit{adversarial feature} (AF) and a \textit{visual feature}
+(VF) via an autoencoder equipped with our specially designed
+\textbf{H}ierarchical \textbf{D}ecouple-\textbf{F}usion (HDF) module, where the
+AF dominates the adversarial capability of an image, while the VF largely
+determines its visual appearance. We train such autoencoders for the clean and
+adversarial image domains respectively, meanwhile realizing feature
+disentanglement, by using pairs of clean images and their Adversarial Examples
+(AEs) generated from available surrogate models via white-box attack methods.
+Eventually, in the black-box attack stage, DifAttack++ iteratively optimizes
+the AF according to the query feedback from the victim model until a successful
+AE is generated, while keeping the VF unaltered. Extensive experimental results
+demonstrate that our method achieves superior ASR and query efficiency than
+SOTA methods, meanwhile exhibiting much better visual quality of AEs. The code
+is available at https://github.com/csjunjun/DifAttack.git.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>arXiv admin note: substantial text overlap with arXiv:2309.14585</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Balancing Performance and Efficiency in Zero-shot Robotic Navigation 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.03015v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.03015v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Dmytro Kuzmenko, Nadiya Shvai
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  We present an optimization study of the Vision-Language Frontier Maps (VLFM)
+applied to the Object Goal Navigation task in robotics. Our work evaluates the
+efficiency and performance of various vision-language models, object detectors,
+segmentation models, and multi-modal comprehension and Visual Question
+Answering modules. Using the $\textit{val-mini}$ and $\textit{val}$ splits of
+Habitat-Matterport 3D dataset, we conduct experiments on a desktop with limited
+VRAM. We propose a solution that achieves a higher success rate (+1.55%)
+improving over the VLFM BLIP-2 baseline without substantial success-weighted
+path length loss while requiring $\textbf{2.3 times}$ less video memory. Our
+findings provide insights into balancing model performance and computational
+efficiency, suggesting effective deployment strategies for resource-limited
+environments.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Submitted to ICTERI 2024 Posters Track</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ DriVLMe: Enhancing LLM-based Autonomous Driving Agents with Embodied and
+  Social Experiences <span class="chip">CVPR 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.03008v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.03008v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Yidong Huang, Jacob Sansom, Ziqiao Ma, Felix Gervits, Joyce Chai
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Recent advancements in foundation models (FMs) have unlocked new prospects in
+autonomous driving, yet the experimental settings of these studies are
+preliminary, over-simplified, and fail to capture the complexity of real-world
+driving scenarios in human environments. It remains under-explored whether FM
+agents can handle long-horizon navigation tasks with free-from dialogue and
+deal with unexpected situations caused by environmental dynamics or task
+changes. To explore the capabilities and boundaries of FMs faced with the
+challenges above, we introduce DriVLMe, a video-language-model-based agent to
+facilitate natural and effective communication between humans and autonomous
+vehicles that perceive the environment and navigate. We develop DriVLMe from
+both embodied experiences in a simulated environment and social experiences
+from real human dialogue. While DriVLMe demonstrates competitive performance in
+both open-loop benchmarks and closed-loop human studies, we reveal several
+limitations and challenges, including unacceptable inference time, imbalanced
+training data, limited visual understanding, challenges with multi-turn
+interactions, simplified language generation from robotic experiences, and
+difficulties in handling on-the-fly unexpected situations like environmental
+dynamics and task changes.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>First Vision and Language for Autonomous Driving and Robotics
+  Workshop (VLADR @ CVPR 2024)</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Phy-Diff: Physics-guided Hourglass Diffusion Model for Diffusion MRI
+  Synthesis <span class="chip">MICCAI 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.03002v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.03002v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Juanhua Zhang, Ruodan Yan, Alessandro Perelli, Xi Chen, Chao Li
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Diffusion MRI (dMRI) is an important neuroimaging technique with high
+acquisition costs. Deep learning approaches have been used to enhance dMRI and
+predict diffusion biomarkers through undersampled dMRI. To generate more
+comprehensive raw dMRI, generative adversarial network based methods are
+proposed to include b-values and b-vectors as conditions, but they are limited
+by unstable training and less desirable diversity. The emerging diffusion model
+(DM) promises to improve generative performance. However, it remains
+challenging to include essential information in conditioning DM for more
+relevant generation, i.e., the physical principles of dMRI and white matter
+tract structures. In this study, we propose a physics-guided diffusion model to
+generate high-quality dMRI. Our model introduces the physical principles of
+dMRI in the noise evolution in the diffusion process and introduce a
+query-based conditional mapping within the difussion model. In addition, to
+enhance the anatomical fine detials of the generation, we introduce the XTRACT
+atlas as prior of white matter tracts by adopting an adapter technique. Our
+experiment results show that our method outperforms other state-of-the-art
+methods and has the potential to advance dMRI enhancement.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted by MICCAI 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ EdgeSync: Faster Edge-model Updating via Adaptive Continuous Learning
+  for Video Data Drift 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.03001v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.03001v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Peng Zhao, Runchu Dong, Guiqin Wang, Cong Zhao
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Real-time video analytics systems typically place models with fewer weights
+on edge devices to reduce latency. The distribution of video content features
+may change over time for various reasons (i.e. light and weather change) ,
+leading to accuracy degradation of existing models, to solve this problem,
+recent work proposes a framework that uses a remote server to continually train
+and adapt the lightweight model at edge with the help of complex model.
+However, existing analytics approaches leave two challenges untouched: firstly,
+retraining task is compute-intensive, resulting in large model update delays;
+secondly, new model may not fit well enough with the data distribution of the
+current video stream. To address these challenges, in this paper, we present
+EdgeSync, EdgeSync filters the samples by considering both timeliness and
+inference results to make training samples more relevant to the current video
+content as well as reduce the update delay, to improve the quality of training,
+EdgeSync also designs a training management module that can efficiently adjusts
+the model training time and training order on the runtime. By evaluating real
+datasets with complex scenes, our method improves about 3.4% compared to
+existing methods and about 10% compared to traditional means.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Quantifying Task Priority for Multi-Task Optimization 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.02996v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.02996v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Wooseong Jeong, Kuk-Jin Yoon
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  The goal of multi-task learning is to learn diverse tasks within a single
+unified network. As each task has its own unique objective function, conflicts
+emerge during training, resulting in negative transfer among them. Earlier
+research identified these conflicting gradients in shared parameters between
+tasks and attempted to realign them in the same direction. However, we prove
+that such optimization strategies lead to sub-optimal Pareto solutions due to
+their inability to accurately determine the individual contributions of each
+parameter across various tasks. In this paper, we propose the concept of task
+priority to evaluate parameter contributions across different tasks. To learn
+task priority, we identify the type of connections related to links between
+parameters influenced by task-specific losses during backpropagation. The
+strength of connections is gauged by the magnitude of parameters to determine
+task priority. Based on these, we present a new method named connection
+strength-based optimization for multi-task learning which consists of two
+phases. The first phase learns the task priority within the network, while the
+second phase modifies the gradients while upholding this priority. This
+ultimately leads to finding new Pareto optimal solutions for multiple tasks.
+Through extensive experiments, we show that our approach greatly enhances
+multi-task performance in comparison to earlier gradient manipulation methods.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ A Human-Annotated Video <span class="highlight-title">Dataset</span> for Training and Evaluation of
+  360-Degree Video Summarization Methods 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.02991v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.02991v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Ioannis Kontostathis, Evlampios Apostolidis, Vasileios Mezaris
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  In this paper we introduce a new dataset for 360-degree video summarization:
+the transformation of 360-degree video content to concise 2D-video summaries
+that can be consumed via traditional devices, such as TV sets and smartphones.
+The dataset includes ground-truth human-generated summaries, that can be used
+for training and objectively evaluating 360-degree video summarization methods.
+Using this dataset, we train and assess two state-of-the-art summarization
+methods that were originally proposed for 2D-video summarization, to serve as a
+baseline for future comparisons with summarization methods that are
+specifically tailored to 360-degree video. Finally, we present an interactive
+tool that was developed to facilitate the data annotation process and can
+assist other annotation activities that rely on video fragment selection.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted for publication, 1st Int. Workshop on Video for Immersive
+  Experiences (Video4IMX-2024) at ACM IMX 2024, Stockholm, Sweden, June 2024.
+  This is the "accepted version"</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Predicting Genetic Mutation from Whole Slide Images via
+  Biomedical-Linguistic Knowledge Enhanced Multi-label Classification 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.02990v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.02990v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Gexin Huang, Chenfei Wu, Mingjie Li, Xiaojun Chang, Ling Chen, Ying Sun, Shen Zhao, Xiaodan Liang, Liang Lin
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Predicting genetic mutations from whole slide images is indispensable for
+cancer diagnosis. However, existing work training multiple binary
+classification models faces two challenges: (a) Training multiple binary
+classifiers is inefficient and would inevitably lead to a class imbalance
+problem. (b) The biological relationships among genes are overlooked, which
+limits the prediction performance. To tackle these challenges, we innovatively
+design a Biological-knowledge enhanced PathGenomic multi-label Transformer to
+improve genetic mutation prediction performances. BPGT first establishes a
+novel gene encoder that constructs gene priors by two carefully designed
+modules: (a) A gene graph whose node features are the genes' linguistic
+descriptions and the cancer phenotype, with edges modeled by genes' pathway
+associations and mutation consistencies. (b) A knowledge association module
+that fuses linguistic and biomedical knowledge into gene priors by
+transformer-based graph representation learning, capturing the intrinsic
+relationships between different genes' mutations. BPGT then designs a label
+decoder that finally performs genetic mutation prediction by two tailored
+modules: (a) A modality fusion module that firstly fuses the gene priors with
+critical regions in WSIs and obtains gene-wise mutation logits. (b) A
+comparative multi-label loss that emphasizes the inherent comparisons among
+mutation status to enhance the discrimination capabilities. Sufficient
+experiments on The Cancer Genome Atlas benchmark demonstrate that BPGT
+outperforms the state-of-the-art.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>16 pages, 8 figures, and 3 tables</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Enhancing Multimodal Large Language Models with Multi-instance Visual
+  <span class="highlight-title">Prompt</span> Generator for Visual Representation Enrichment 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.02987v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.02987v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Wenliang Zhong, Wenyi Wu, Qi Li, Rob Barton, Boxin Du, Shioulin Sam, Karim Bouyarmane, Ismail Tutar, Junzhou Huang
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Multimodal Large Language Models (MLLMs) have achieved SOTA performance in
+various visual language tasks by fusing the visual representations with LLMs
+leveraging some visual adapters. In this paper, we first establish that
+adapters using query-based Transformers such as Q-former is a simplified
+Multi-instance Learning method without considering instance
+heterogeneity/correlation. We then propose a general component termed
+Multi-instance Visual Prompt Generator (MIVPG) to incorporate enriched visual
+representations into LLMs by taking advantage of instance correlation between
+images or patches for the same sample. Quantatitive evaluation on three public
+vision-language (VL) datasets from different scenarios shows that the proposed
+MIVPG improves Q-former in main VL tasks.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ <span class="highlight-title">Self-Supervised</span> Skeleton Action Representation Learning: A Benchmark and
+  Beyond 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.02978v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.02978v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Jiahang Zhang, Lilang Lin, Shuai Yang, Jiaying Liu
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Self-supervised learning (SSL), which aims to learn meaningful prior
+representations from unlabeled data, has been proven effective for
+label-efficient skeleton-based action understanding. Different from the image
+domain, skeleton data possesses sparser spatial structures and diverse
+representation forms, with the absence of background clues and the additional
+temporal dimension. This presents the new challenges for the pretext task
+design of spatial-temporal motion representation learning. Recently, many
+endeavors have been made for skeleton-based SSL and remarkable progress has
+been achieved. However, a systematic and thorough review is still lacking. In
+this paper, we conduct, for the first time, a comprehensive survey on
+self-supervised skeleton-based action representation learning, where various
+literature is organized according to their pre-training pretext task
+methodologies. Following the taxonomy of context-based, generative learning,
+and contrastive learning approaches, we make a thorough review and benchmark of
+existing works and shed light on the future possible directions. Our
+investigation demonstrates that most SSL works rely on the single paradigm,
+learning representations of a single level, and are evaluated on the action
+recognition task solely, which leaves the generalization power of skeleton SSL
+models under-explored. To this end, a novel and effective SSL method for
+skeleton is further proposed, which integrates multiple pretext tasks to
+jointly learn versatile representations of different granularity, substantially
+boosting the generalization capacity for different downstream tasks. Extensive
+experiments under three large-scale datasets demonstrate that the proposed
+method achieves the superior generalization performance on various downstream
+tasks, including recognition, retrieval, detection, and few-shot learning.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Sparse Color-Code Net: Real-Time RGB-Based 6D Object Pose Estimation on
+  Edge Devices 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.02977v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.02977v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Xingjian Yang, Zhitao Yu, Ashis G. Banerjee
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  As robotics and augmented reality applications increasingly rely on precise
+and efficient 6D object pose estimation, real-time performance on edge devices
+is required for more interactive and responsive systems. Our proposed Sparse
+Color-Code Net (SCCN) embodies a clear and concise pipeline design to
+effectively address this requirement. SCCN performs pixel-level predictions on
+the target object in the RGB image, utilizing the sparsity of essential object
+geometry features to speed up the Perspective-n-Point (PnP) computation
+process. Additionally, it introduces a novel pixel-level geometry-based object
+symmetry representation that seamlessly integrates with the initial pose
+predictions, effectively addressing symmetric object ambiguities. SCCN notably
+achieves an estimation rate of 19 frames per second (FPS) and 6 FPS on the
+benchmark LINEMOD dataset and the Occlusion LINEMOD dataset, respectively, for
+an NVIDIA Jetson AGX Xavier, while consistently maintaining high estimation
+accuracy at these rates.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted for publication in the Proceedings of the 2024 IEEE 20th
+  International Conference on Automation Science and Engineering</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ DA-Flow: Dual Attention Normalizing Flow for Skeleton-based Video
+  Anomaly Detection 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.02976v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.02976v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Ruituo Wu, Yang Chen, Jian Xiao, Bing Li, Jicong Fan, Frédéric Dufaux, Ce Zhu, Yipeng Liu
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Cooperation between temporal convolutional networks (TCN) and graph
+convolutional networks (GCN) as a processing module has shown promising results
+in skeleton-based video anomaly detection (SVAD). However, to maintain a
+lightweight model with low computational and storage complexity, shallow GCN
+and TCN blocks are constrained by small receptive fields and a lack of
+cross-dimension interaction capture. To tackle this limitation, we propose a
+lightweight module called the Dual Attention Module (DAM) for capturing
+cross-dimension interaction relationships in spatio-temporal skeletal data. It
+employs the frame attention mechanism to identify the most significant frames
+and the skeleton attention mechanism to capture broader relationships across
+fixed partitions with minimal parameters and flops. Furthermore, the proposed
+Dual Attention Normalizing Flow (DA-Flow) integrates the DAM as a
+post-processing unit after GCN within the normalizing flow framework.
+Simulations show that the proposed model is robust against noise and negative
+samples. Experimental results show that DA-Flow reaches competitive or better
+performance than the existing state-of-the-art (SOTA) methods in terms of the
+micro AUC metric with the fewest number of parameters. Moreover, we found that
+even without training, simply using random projection without dimensionality
+reduction on skeleton data enables substantial anomaly detection capabilities.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Event3DGS: Event-based 3D Gaussian Splatting for Fast Egomotion 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.02972v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.02972v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Tianyi Xiong, Jiayi Wu, Botao He, Cornelia Fermuller, Yiannis Aloimonos, Heng Huang, Christopher A. Metzler
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  The recent emergence of 3D Gaussian splatting (3DGS) leverages the advantage
+of explicit point-based representations, which significantly improves the
+rendering speed and quality of novel-view synthesis. However, 3D radiance field
+rendering in environments with high-dynamic motion or challenging illumination
+condition remains problematic in real-world robotic tasks. The reason is that
+fast egomotion is prevalent real-world robotic tasks, which induces motion
+blur, leading to inaccuracies and artifacts in the reconstructed structure. To
+alleviate this problem, we propose Event3DGS, the first method that learns
+Gaussian Splatting solely from raw event streams. By exploiting the high
+temporal resolution of event cameras and explicit point-based representation,
+Event3DGS can reconstruct high-fidelity 3D structures solely from the event
+streams under fast egomotion. Our sparsity-aware sampling and progressive
+training approaches allow for better reconstruction quality and consistency. To
+further enhance the fidelity of appearance, we explicitly incorporate the
+motion blur formation process into a differentiable rasterizer, which is used
+with a limited set of blurred RGB images to refine the appearance. Extensive
+experiments on multiple datasets validate the superior rendering quality of
+Event3DGS compared with existing approaches, with over 95% lower training time
+and faster rendering speed in orders of magnitude.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Adversarial Generation of Hierarchical Gaussians for 3D Generative Model 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.02968v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.02968v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Sangeek Hyun, Jae-Pil Heo
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Most advances in 3D Generative Adversarial Networks (3D GANs) largely depend
+on ray casting-based volume rendering, which incurs demanding rendering costs.
+One promising alternative is rasterization-based 3D Gaussian Splatting (3D-GS),
+providing a much faster rendering speed and explicit 3D representation. In this
+paper, we exploit Gaussian as a 3D representation for 3D GANs by leveraging its
+efficient and explicit characteristics. However, in an adversarial framework,
+we observe that a na\"ive generator architecture suffers from training
+instability and lacks the capability to adjust the scale of Gaussians. This
+leads to model divergence and visual artifacts due to the absence of proper
+guidance for initialized positions of Gaussians and densification to manage
+their scales adaptively. To address these issues, we introduce a generator
+architecture with a hierarchical multi-scale Gaussian representation that
+effectively regularizes the position and scale of generated Gaussians.
+Specifically, we design a hierarchy of Gaussians where finer-level Gaussians
+are parameterized by their coarser-level counterparts; the position of
+finer-level Gaussians would be located near their coarser-level counterparts,
+and the scale would monotonically decrease as the level becomes finer, modeling
+both coarse and fine details of the 3D scene. Experimental results demonstrate
+that ours achieves a significantly faster rendering speed (x100) compared to
+state-of-the-art 3D consistent GANs with comparable 3D generation capability.
+Project page: https://hse1032.github.io/gsgan.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Project page: https://hse1032.github.io/gsgan</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Understanding the Impact of Negative <span class="highlight-title">Prompt</span>s: When and How Do They Take
+  Effect? 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.02965v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.02965v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Yuanhao Ban, Ruochen Wang, Tianyi Zhou, Minhao Cheng, Boqing Gong, Cho-Jui Hsieh
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  The concept of negative prompts, emerging from conditional generation models
+like Stable Diffusion, allows users to specify what to exclude from the
+generated images.%, demonstrating significant practical efficacy. Despite the
+widespread use of negative prompts, their intrinsic mechanisms remain largely
+unexplored. This paper presents the first comprehensive study to uncover how
+and when negative prompts take effect. Our extensive empirical analysis
+identifies two primary behaviors of negative prompts. Delayed Effect: The
+impact of negative prompts is observed after positive prompts render
+corresponding content. Deletion Through Neutralization: Negative prompts delete
+concepts from the generated image through a mutual cancellation effect in
+latent space with positive prompts. These insights reveal significant potential
+real-world applications; for example, we demonstrate that negative prompts can
+facilitate object inpainting with minimal alterations to the background via a
+simple adaptive algorithm. We believe our findings will offer valuable insights
+for the community in capitalizing on the potential of negative prompts.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ AVFF: Audio-Visual Feature Fusion for Video Deepfake Detection <span class="chip">CVPR 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.02951v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.02951v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Trevine Oorloff, Surya Koppisetti, Nicolò Bonettini, Divyaraj Solanki, Ben Colman, Yaser Yacoob, Ali Shahriyari, Gaurav Bharaj
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  With the rapid growth in deepfake video content, we require improved and
+generalizable methods to detect them. Most existing detection methods either
+use uni-modal cues or rely on supervised training to capture the dissonance
+between the audio and visual modalities. While the former disregards the
+audio-visual correspondences entirely, the latter predominantly focuses on
+discerning audio-visual cues within the training corpus, thereby potentially
+overlooking correspondences that can help detect unseen deepfakes. We present
+Audio-Visual Feature Fusion (AVFF), a two-stage cross-modal learning method
+that explicitly captures the correspondence between the audio and visual
+modalities for improved deepfake detection. The first stage pursues
+representation learning via self-supervision on real videos to capture the
+intrinsic audio-visual correspondences. To extract rich cross-modal
+representations, we use contrastive learning and autoencoding objectives, and
+introduce a novel audio-visual complementary masking and feature fusion
+strategy. The learned representations are tuned in the second stage, where
+deepfake classification is pursued via supervised learning on both real and
+fake videos. Extensive experiments and analysis suggest that our novel
+representation learning paradigm is highly discriminative in nature. We report
+98.6% accuracy and 99.1% AUC on the FakeAVCeleb dataset, outperforming the
+current audio-visual state-of-the-art by 14.9% and 9.9%, respectively.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted to CVPR 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Radiomics-guided Multimodal Self-attention Network for Predicting
+  Pathological Complete Response in Breast MRI 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.02936v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.02936v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Jonghun Kim, Hyunjin Park
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Breast cancer is the most prevalent cancer among women and predicting
+pathologic complete response (pCR) after anti-cancer treatment is crucial for
+patient prognosis and treatment customization. Deep learning has shown promise
+in medical imaging diagnosis, particularly when utilizing multiple imaging
+modalities to enhance accuracy. This study presents a model that predicts pCR
+in breast cancer patients using dynamic contrast-enhanced (DCE) magnetic
+resonance imaging (MRI) and apparent diffusion coefficient (ADC) maps.
+Radiomics features are established hand-crafted features of the tumor region
+and thus could be useful in medical image analysis. Our approach extracts
+features from both DCE MRI and ADC using an encoder with a self-attention
+mechanism, leveraging radiomics to guide feature extraction from tumor-related
+regions. Our experimental results demonstrate the superior performance of our
+model in predicting pCR compared to other baseline methods.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>5 pages, 5 figures, IEEE ISBI 2024 proceedings</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ P2PFormer: A Primitive-to-polygon Method for Regular Building Contour
+  Extraction from Remote Sensing Images 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.02930v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.02930v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Tao Zhang, Shiqing Wei, Yikang Zhou, Muying Luo, Wenling You, Shunping Ji
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Extracting building contours from remote sensing imagery is a significant
+challenge due to buildings' complex and diverse shapes, occlusions, and noise.
+Existing methods often struggle with irregular contours, rounded corners, and
+redundancy points, necessitating extensive post-processing to produce regular
+polygonal building contours. To address these challenges, we introduce a novel,
+streamlined pipeline that generates regular building contours without
+post-processing. Our approach begins with the segmentation of generic geometric
+primitives (which can include vertices, lines, and corners), followed by the
+prediction of their sequence. This allows for the direct construction of
+regular building contours by sequentially connecting the segmented primitives.
+Building on this pipeline, we developed P2PFormer, which utilizes a
+transformer-based architecture to segment geometric primitives and predict
+their order. To enhance the segmentation of primitives, we introduce a unique
+representation called group queries. This representation comprises a set of
+queries and a singular query position, which improve the focus on multiple
+midpoints of primitives and their efficient linkage. Furthermore, we propose an
+innovative implicit update strategy for the query position embedding aimed at
+sharpening the focus of queries on the correct positions and, consequently,
+enhancing the quality of primitive segmentation. Our experiments demonstrate
+that P2PFormer achieves new state-of-the-art performance on the WHU, CrowdAI,
+and WHU-Mix datasets, surpassing the previous SOTA PolyWorld by a margin of 2.7
+AP and 6.5 AP75 on the largest CrowdAI dataset. We intend to make the code and
+trained weights publicly available to promote their use and facilitate further
+research.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Exploring Data Efficiency in Zero-Shot Learning with Diffusion Models 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.02929v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.02929v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Zihan Ye, Shreyank N. Gowda, Xiaobo Jin, Xiaowei Huang, Haotian Xu, Yaochu Jin, Kaizhu Huang
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Zero-Shot Learning (ZSL) aims to enable classifiers to identify unseen
+classes by enhancing data efficiency at the class level. This is achieved by
+generating image features from pre-defined semantics of unseen classes.
+However, most current approaches heavily depend on the number of samples from
+seen classes, i.e. they do not consider instance-level effectiveness. In this
+paper, we demonstrate that limited seen examples generally result in
+deteriorated performance of generative models. To overcome these challenges, we
+propose ZeroDiff, a Diffusion-based Generative ZSL model. This unified
+framework incorporates diffusion models to improve data efficiency at both the
+class and instance levels. Specifically, for instance-level effectiveness,
+ZeroDiff utilizes a forward diffusion chain to transform limited data into an
+expanded set of noised data. For class-level effectiveness, we design a
+two-branch generation structure that consists of a Diffusion-based Feature
+Generator (DFG) and a Diffusion-based Representation Generator (DRG). DFG
+focuses on learning and sampling the distribution of cross-entropy-based
+features, whilst DRG learns the supervised contrastive-based representation to
+boost the zero-shot capabilities of DFG. Additionally, we employ three
+discriminators to evaluate generated features from various aspects and
+introduce a Wasserstein-distance-based mutual learning loss to transfer
+knowledge among discriminators, thereby enhancing guidance for generation.
+Demonstrated through extensive experiments on three popular ZSL benchmarks, our
+ZeroDiff not only achieves significant improvements over existing ZSL methods
+but also maintains robust performance even with scarce training data. Code will
+be released upon acceptance.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ U-KAN Makes Strong Backbone for Medical Image Segmentation and
+  Generation 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.02918v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.02918v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Chenxin Li, Xinyu Liu, Wuyang Li, Cheng Wang, Hengyu Liu, Yixuan Yuan
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  U-Net has become a cornerstone in various visual applications such as image
+segmentation and diffusion probability models. While numerous innovative
+designs and improvements have been introduced by incorporating transformers or
+MLPs, the networks are still limited to linearly modeling patterns as well as
+the deficient interpretability. To address these challenges, our intuition is
+inspired by the impressive results of the Kolmogorov-Arnold Networks (KANs) in
+terms of accuracy and interpretability, which reshape the neural network
+learning via the stack of non-linear learnable activation functions derived
+from the Kolmogorov-Anold representation theorem. Specifically, in this paper,
+we explore the untapped potential of KANs in improving backbones for vision
+tasks. We investigate, modify and re-design the established U-Net pipeline by
+integrating the dedicated KAN layers on the tokenized intermediate
+representation, termed U-KAN. Rigorous medical image segmentation benchmarks
+verify the superiority of U-KAN by higher accuracy even with less computation
+cost. We further delved into the potential of U-KAN as an alternative U-Net
+noise predictor in diffusion models, demonstrating its applicability in
+generating task-oriented model architectures. These endeavours unveil valuable
+insights and sheds light on the prospect that with U-KAN, you can make strong
+backbone for medical image segmentation and generation. Project page:
+https://yes-ukan.github.io/
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>arXiv admin note: text overlap with arXiv:2405.14399,
+  arXiv:2203.04967 by other authors</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Visual-Text Cross Alignment: Refining the Similarity Score in
+  Vision-Language Models <span class="chip">ICML 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.02915v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.02915v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Jinhao Li, Haopeng Li, Sarah Erfani, Lei Feng, James Bailey, Feng Liu
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  It has recently been discovered that using a pre-trained vision-language
+model (VLM), e.g., CLIP, to align a whole query image with several finer text
+descriptions generated by a large language model can significantly enhance
+zero-shot performance. However, in this paper, we empirically find that the
+finer descriptions tend to align more effectively with local areas of the query
+image rather than the whole image, and then we theoretically validate this
+finding. Thus, we present a method called weighted visual-text cross alignment
+(WCA). This method begins with a localized visual prompting technique, designed
+to identify local visual areas within the query image. The local visual areas
+are then cross-aligned with the finer descriptions by creating a similarity
+matrix using the pre-trained VLM. To determine how well a query image aligns
+with each category, we develop a score function based on the weighted
+similarities in this matrix. Extensive experiments demonstrate that our method
+significantly improves zero-shot performance across various datasets, achieving
+results that are even comparable to few-shot learning methods.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>22 pages, 16 figures, published to ICML 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ A <span class="highlight-title">Self-Supervised</span> Denoising Strategy for Underwater Acoustic Camera
+  Imageries 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.02914v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.02914v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Xiaoteng Zhou, Katsunori Mizuno, Yilong Zhang
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  In low-visibility marine environments characterized by turbidity and
+darkness, acoustic cameras serve as visual sensors capable of generating
+high-resolution 2D sonar images. However, acoustic camera images are interfered
+with by complex noise and are difficult to be directly ingested by downstream
+visual algorithms. This paper introduces a novel strategy for denoising
+acoustic camera images using deep learning techniques, which comprises two
+principal components: a self-supervised denoising framework and a fine
+feature-guided block. Additionally, the study explores the relationship between
+the level of image denoising and the improvement in feature-matching
+performance. Experimental results show that the proposed denoising strategy can
+effectively filter acoustic camera images without prior knowledge of the noise
+model. The denoising process is nearly end-to-end without complex parameter
+tuning and post-processing. It successfully removes noise while preserving fine
+feature details, thereby enhancing the performance of local feature matching.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>8 pages</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Language-guided Detection and Mitigation of Unknown <span class="highlight-title">Dataset</span> Bias 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.02889v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.02889v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Zaiying Zhao, Soichiro Kumano, Toshihiko Yamasaki
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Dataset bias is a significant problem in training fair classifiers. When
+attributes unrelated to classification exhibit strong biases towards certain
+classes, classifiers trained on such dataset may overfit to these bias
+attributes, substantially reducing the accuracy for minority groups. Mitigation
+techniques can be categorized according to the availability of bias information
+(\ie, prior knowledge). Although scenarios with unknown biases are better
+suited for real-world settings, previous work in this field often suffers from
+a lack of interpretability regarding biases and lower performance. In this
+study, we propose a framework to identify potential biases as keywords without
+prior knowledge based on the partial occurrence in the captions. We further
+propose two debiasing methods: (a) handing over to an existing debiasing
+approach which requires prior knowledge by assigning pseudo-labels, and (b)
+employing data augmentation via text-to-image generative models, using acquired
+bias keywords as prompts. Despite its simplicity, experimental results show
+that our framework not only outperforms existing methods without prior
+knowledge, but also is even comparable with a method that assumes prior
+knowledge.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ PosterLLaVa: Constructing a Unified Multi-modal Layout Generator with
+  LLM 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.02884v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.02884v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Tao Yang, Yingmin Luo, Zhongang Qi, Yang Wu, Ying Shan, Chang Wen Chen
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Layout generation is the keystone in achieving automated graphic design,
+requiring arranging the position and size of various multi-modal design
+elements in a visually pleasing and constraint-following manner. Previous
+approaches are either inefficient for large-scale applications or lack
+flexibility for varying design requirements. Our research introduces a unified
+framework for automated graphic layout generation, leveraging the multi-modal
+large language model (MLLM) to accommodate diverse design tasks. In contrast,
+our data-driven method employs structured text (JSON format) and visual
+instruction tuning to generate layouts under specific visual and textual
+constraints, including user-defined natural language specifications. We
+conducted extensive experiments and achieved state-of-the-art (SOTA)
+performance on public multi-modal layout generation benchmarks, demonstrating
+the effectiveness of our method. Moreover, recognizing existing datasets'
+limitations in capturing the complexity of real-world graphic designs, we
+propose two new datasets for much more challenging tasks (user-constrained
+generation and complicated poster), further validating our model's utility in
+real-life settings. Marking by its superior accessibility and adaptability,
+this approach further automates large-scale graphic design tasks. The code and
+datasets will be publicly available on
+https://github.com/posterllava/PosterLLaVA.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Inv-Adapter: ID Customization Generation via Image Inversion and
+  Lightweight Adapter 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.02881v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.02881v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Peng Xing, Ning Wang, Jianbo Ouyang, Zechao Li
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  The remarkable advancement in text-to-image generation models significantly
+boosts the research in ID customization generation. However, existing
+personalization methods cannot simultaneously satisfy high fidelity and
+high-efficiency requirements. Their main bottleneck lies in the prompt image
+encoder, which produces weak alignment signals with the text-to-image model and
+significantly increased model size. Towards this end, we propose a lightweight
+Inv-Adapter, which first extracts diffusion-domain representations of ID images
+utilizing a pre-trained text-to-image model via DDIM image inversion, without
+additional image encoder. Benefiting from the high alignment of the extracted
+ID prompt features and the intermediate features of the text-to-image model, we
+then embed them efficiently into the base text-to-image model by carefully
+designing a lightweight attention adapter. We conduct extensive experiments to
+assess ID fidelity, generation loyalty, speed, and training parameters, all of
+which show that the proposed Inv-Adapter is highly competitive in ID
+customization generation and model scale.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>technical report</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Controllable Talking Face Generation by Implicit Facial Keypoints
+  Editing 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.02880v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.02880v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Dong Zhao, Jiaying Shi, Wenjun Li, Shudong Wang, Shenghui Xu, Zhaoming Pan
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Audio-driven talking face generation has garnered significant interest within
+the domain of digital human research. Existing methods are encumbered by
+intricate model architectures that are intricately dependent on each other,
+complicating the process of re-editing image or video inputs. In this work, we
+present ControlTalk, a talking face generation method to control face
+expression deformation based on driven audio, which can construct the head pose
+and facial expression including lip motion for both single image or sequential
+video inputs in a unified manner. By utilizing a pre-trained video synthesis
+renderer and proposing the lightweight adaptation, ControlTalk achieves precise
+and naturalistic lip synchronization while enabling quantitative control over
+mouth opening shape. Our experiments show that our method is superior to
+state-of-the-art performance on widely used benchmarks, including HDTF and
+MEAD. The parameterized adaptation demonstrates remarkable generalization
+capabilities, effectively handling expression deformation across same-ID and
+cross-ID scenarios, and extending its utility to out-of-domain portraits,
+regardless of languages.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Second-order differential operators, stochastic differential equations
+  and Brownian motions on embedded manifolds 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.02879v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.02879v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Du Nguyen, Stefan Sommer
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  We specify the conditions when a manifold M embedded in an inner product
+space E is an invariant manifold of a stochastic differential equation (SDE) on
+E, linking it with the notion of second-order differential operators on M. When
+M is given a Riemannian metric, we derive a simple formula for the
+Laplace-Beltrami operator in terms of the gradient and Hessian on E and
+construct the Riemannian Brownian motions on M as solutions of conservative
+Stratonovich and Ito SDEs on E. We derive explicitly the SDE for Brownian
+motions on several important manifolds in applications, including
+left-invariant matrix Lie groups using embedded coordinates. Numerically, we
+propose three simulation schemes to solve SDEs on manifolds. In addition to the
+stochastic projection method, to simulate Riemannian Brownian motions, we
+construct a second-order tangent retraction of the Levi-Civita connection using
+a given E-tubular retraction. We also propose the retractive Euler-Maruyama
+method to solve a SDE, taking into account the second-order term of a tangent
+retraction. We provide software to implement the methods in the paper,
+including Brownian motions of the manifolds discussed. We verify numerically
+that on several compact Riemannian manifolds, the long-term limit of Brownian
+simulation converges to the uniform distributions, suggesting a method to
+sample Riemannian uniform distributions
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Rethinking Guidance Information to Utilize Unlabeled Samples:A Label
+  Encoding Perspective <span class="chip">ICML 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.02862v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.02862v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Yulong Zhang, Yuan Yao, Shuhao Chen, Pengrong Jin, Yu Zhang, Jian Jin, Jiangang Lu
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Empirical Risk Minimization (ERM) is fragile in scenarios with insufficient
+labeled samples. A vanilla extension of ERM to unlabeled samples is Entropy
+Minimization (EntMin), which employs the soft-labels of unlabeled samples to
+guide their learning. However, EntMin emphasizes prediction discriminability
+while neglecting prediction diversity. To alleviate this issue, in this paper,
+we rethink the guidance information to utilize unlabeled samples. By analyzing
+the learning objective of ERM, we find that the guidance information for
+labeled samples in a specific category is the corresponding label encoding.
+Inspired by this finding, we propose a Label-Encoding Risk Minimization (LERM).
+It first estimates the label encodings through prediction means of unlabeled
+samples and then aligns them with their corresponding ground-truth label
+encodings. As a result, the LERM ensures both prediction discriminability and
+diversity, and it can be integrated into existing methods as a plugin.
+Theoretically, we analyze the relationships between LERM and ERM as well as
+EntMin. Empirically, we verify the superiority of the LERM under several label
+insufficient scenarios. The codes are available at
+https://github.com/zhangyl660/LERM.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted to ICML 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Zero-Shot Image Segmentation via Recursive Normalized Cut on Diffusion
+  Features 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.02842v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.02842v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Paul Couairon, Mustafa Shukor, Jean-Emmanuel Haugeard, Matthieu Cord, Nicolas Thome
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Foundation models have emerged as powerful tools across various domains
+including language, vision, and multimodal tasks. While prior works have
+addressed unsupervised image segmentation, they significantly lag behind
+supervised models. In this paper, we use a diffusion UNet encoder as a
+foundation vision encoder and introduce DiffCut, an unsupervised zero-shot
+segmentation method that solely harnesses the output features from the final
+self-attention block. Through extensive experimentation, we demonstrate that
+the utilization of these diffusion features in a graph based segmentation
+algorithm, significantly outperforms previous state-of-the-art methods on
+zero-shot segmentation. Specifically, we leverage a recursive Normalized Cut
+algorithm that softly regulates the granularity of detected objects and
+produces well-defined segmentation maps that precisely capture intricate image
+details. Our work highlights the remarkably accurate semantic knowledge
+embedded within diffusion UNet encoders that could then serve as foundation
+vision encoders for downstream tasks. Project page at
+https://diffcut-segmentation.github.io
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Conditional Idempotent Generative Networks 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.02841v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.02841v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Niccolò Ronchetti
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  We propose Conditional Idempotent Generative Networks (CIGN), a novel
+approach that expands upon Idempotent Generative Networks (IGN) to enable
+conditional generation. While IGNs offer efficient single-pass generation, they
+lack the ability to control the content of the generated data. CIGNs address
+this limitation by incorporating conditioning mechanisms, allowing users to
+steer the generation process towards specific types of data.
+  We establish the theoretical foundations for CIGNs, outlining their scope,
+loss function design, and evaluation metrics. We then present two potential
+architectures for implementing CIGNs: channel conditioning and filter
+conditioning. Finally, we discuss experimental results on the MNIST dataset,
+demonstrating the effectiveness of both approaches. Our findings pave the way
+for further exploration of CIGNs on larger datasets and with more powerful
+computing resources to determine the optimal implementation strategy.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>22 pages, 8 figures</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ DREW : Towards Robust Data Provenance by Leveraging Error-Controlled
+  Watermarking 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.02836v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.02836v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Mehrdad Saberi, Vinu Sankar Sadasivan, Arman Zarei, Hessam Mahdavifar, Soheil Feizi
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Identifying the origin of data is crucial for data provenance, with
+applications including data ownership protection, media forensics, and
+detecting AI-generated content. A standard approach involves embedding-based
+retrieval techniques that match query data with entries in a reference dataset.
+However, this method is not robust against benign and malicious edits. To
+address this, we propose Data Retrieval with Error-corrected codes and
+Watermarking (DREW). DREW randomly clusters the reference dataset, injects
+unique error-controlled watermark keys into each cluster, and uses these keys
+at query time to identify the appropriate cluster for a given sample. After
+locating the relevant cluster, embedding vector similarity retrieval is
+performed within the cluster to find the most accurate matches. The integration
+of error control codes (ECC) ensures reliable cluster assignments, enabling the
+method to perform retrieval on the entire dataset in case the ECC algorithm
+cannot detect the correct cluster with high confidence. This makes DREW
+maintain baseline performance, while also providing opportunities for
+performance improvements due to the increased likelihood of correctly matching
+queries to their origin when performing retrieval on a smaller subset of the
+dataset. Depending on the watermark technique used, DREW can provide
+substantial improvements in retrieval accuracy (up to 40\% for some datasets
+and modification types) across multiple datasets and state-of-the-art embedding
+models (e.g., DinoV2, CLIP), making our method a promising solution for secure
+and reliable source identification. The code is available at
+https://github.com/mehrdadsaberi/DREW
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ DenoDet: Attention as Deformable Multi-Subspace Feature Denoising for
+  Target Detection in SAR Images 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.02833v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.02833v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Yimian Dai, Minrui Zou, Yuxuan Li, Xiang Li, Kang Ni, Jian Yang
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Synthetic Aperture Radar (SAR) target detection has long been impeded by
+inherent speckle noise and the prevalence of diminutive, ambiguous targets.
+While deep neural networks have advanced SAR target detection, their intrinsic
+low-frequency bias and static post-training weights falter with coherent noise
+and preserving subtle details across heterogeneous terrains. Motivated by
+traditional SAR image denoising, we propose DenoDet, a network aided by
+explicit frequency domain transform to calibrate convolutional biases and pay
+more attention to high-frequencies, forming a natural multi-scale subspace
+representation to detect targets from the perspective of multi-subspace
+denoising. We design TransDeno, a dynamic frequency domain attention module
+that performs as a transform domain soft thresholding operation, dynamically
+denoising across subspaces by preserving salient target signals and attenuating
+noise. To adaptively adjust the granularity of subspace processing, we also
+propose a deformable group fully-connected layer (DeGroFC) that dynamically
+varies the group conditioned on the input features. Without bells and whistles,
+our plug-and-play TransDeno sets state-of-the-art scores on multiple SAR target
+detection datasets. The code is available at https://github.com/GrokCV/GrokSAR.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Distilling Aggregated Knowledge for Weakly-Supervised Video Anomaly
+  Detection 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.02831v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.02831v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Jash Dalvi, Ali Dabouei, Gunjan Dhanuka, Min Xu
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Video anomaly detection aims to develop automated models capable of
+identifying abnormal events in surveillance videos. The benchmark setup for
+this task is extremely challenging due to: i) the limited size of the training
+sets, ii) weak supervision provided in terms of video-level labels, and iii)
+intrinsic class imbalance induced by the scarcity of abnormal events. In this
+work, we show that distilling knowledge from aggregated representations of
+multiple backbones into a relatively simple model achieves state-of-the-art
+performance. In particular, we develop a bi-level distillation approach along
+with a novel disentangled cross-attention-based feature aggregation network.
+Our proposed approach, DAKD (Distilling Aggregated Knowledge with Disentangled
+Attention), demonstrates superior performance compared to existing methods
+across multiple benchmark datasets. Notably, we achieve significant
+improvements of 1.36%, 0.78%, and 7.02% on the UCF-Crime, ShanghaiTech, and
+XD-Violence datasets, respectively.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Push Past Green: Learning to Look Behind Plant Foliage by Moving It 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2307.03175v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2307.03175v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Xiaoyu Zhang, Saurabh Gupta
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Autonomous agriculture applications (e.g., inspection, phenotyping, plucking
+fruits) require manipulating the plant foliage to look behind the leaves and
+the branches. Partial visibility, extreme clutter, thin structures, and unknown
+geometry and dynamics for plants make such manipulation challenging. We tackle
+these challenges through data-driven methods. We use self-supervision to train
+SRPNet, a neural network that predicts what space is revealed on execution of a
+candidate action on a given plant. We use SRPNet with the cross-entropy method
+to predict actions that are effective at revealing space beneath plant foliage.
+Furthermore, as SRPNet does not just predict how much space is revealed but
+also where it is revealed, we can execute a sequence of actions that
+incrementally reveal more and more space beneath the plant foliage. We
+experiment with a synthetic (vines) and a real plant (Dracaena) on a physical
+test-bed across 5 settings including 2 settings that test generalization to
+novel plant configurations. Our experiments reveal the effectiveness of our
+overall method, PPG, over a competitive hand-crafted exploration method, and
+the effectiveness of SRPNet over a hand-crafted dynamics model and relevant
+ablations.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted by Conference on Robot Learning (CoRL) 2023. for project
+  website with video, see https://sites.google.com/view/pushpastgreen/</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Confronting Reward Overoptimization for Diffusion Models: A Perspective
+  of Inductive and Primacy Biases <span class="chip">ICML 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2402.08552v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2402.08552v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Ziyi Zhang, Sen Zhang, Yibing Zhan, Yong Luo, Yonggang Wen, Dacheng Tao
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Bridging the gap between diffusion models and human preferences is crucial
+for their integration into practical generative workflows. While optimizing
+downstream reward models has emerged as a promising alignment strategy,
+concerns arise regarding the risk of excessive optimization with learned reward
+models, which potentially compromises ground-truth performance. In this work,
+we confront the reward overoptimization problem in diffusion model alignment
+through the lenses of both inductive and primacy biases. We first identify a
+mismatch between current methods and the temporal inductive bias inherent in
+the multi-step denoising process of diffusion models, as a potential source of
+reward overoptimization. Then, we surprisingly discover that dormant neurons in
+our critic model act as a regularization against reward overoptimization while
+active neurons reflect primacy bias. Motivated by these observations, we
+propose Temporal Diffusion Policy Optimization with critic active neuron Reset
+(TDPO-R), a policy gradient algorithm that exploits the temporal inductive bias
+of diffusion models and mitigates the primacy bias stemming from active
+neurons. Empirical results demonstrate the superior efficacy of our methods in
+mitigating reward overoptimization. Code is avaliable at
+https://github.com/ZiyiZhang27/tdpo.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted to ICML 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Diffusion Meets DAgger: Supercharging Eye-in-hand Imitation Learning 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2402.17768v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2402.17768v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Xiaoyu Zhang, Matthew Chang, Pranav Kumar, Saurabh Gupta
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  A common failure mode for policies trained with imitation is compounding
+execution errors at test time. When the learned policy encounters states that
+are not present in the expert demonstrations, the policy fails, leading to
+degenerate behavior. The Dataset Aggregation, or DAgger approach to this
+problem simply collects more data to cover these failure states. However, in
+practice, this is often prohibitively expensive. In this work, we propose
+Diffusion Meets DAgger (DMD), a method to reap the benefits of DAgger without
+the cost for eye-in-hand imitation learning problems. Instead of collecting new
+samples to cover out-of-distribution states, DMD uses recent advances in
+diffusion models to synthesize these samples. This leads to robust performance
+from few demonstrations. We compare DMD against behavior cloning baseline
+across four tasks: pushing, stacking, pouring, and shirt hanging. In pushing,
+DMD achieves 80% success rate with as few as 8 expert demonstrations, where
+naive behavior cloning reaches only 20%. In stacking, DMD succeeds on average
+92% of the time across 5 cups, versus 40% for BC. When pouring coffee beans,
+DMD transfers to another cup successfully 80% of the time. Finally, DMD attains
+90% success rate for hanging shirt on a clothing rack.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted by Robotics: Science and Systems (RSS) 2024. project website
+  with video, see https://sites.google.com/view/diffusion-meets-dagger</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Fiducial Tag Localization on a 3D LiDAR Prior Map 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2209.01072v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2209.01072v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Yibo Liu, Jinjun Shan, Hunter Schofield
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  The LiDAR fiducial tag, akin to the well-known AprilTag used in camera
+applications, serves as a convenient resource to impart artificial features to
+the LiDAR sensor, facilitating robotics applications. Unfortunately, the
+existing LiDAR fiducial tag localization methods do not apply to 3D LiDAR maps
+while resolving this problem is beneficial to LiDAR-based relocalization and
+navigation. In this paper, we develop a novel approach to directly localize
+fiducial tags on a 3D LiDAR prior map, returning the tag poses (labeled by ID
+number) and vertex locations (labeled by index) w.r.t. the global coordinate
+system of the map. In particular, considering that fiducial tags are thin sheet
+objects indistinguishable from the attached planes, we design a new pipeline
+that gradually analyzes the 3D point cloud of the map from the intensity and
+geometry perspectives, extracting potential tag-containing point clusters.
+Then, we introduce an intermediate-plane-based method to further check if each
+potential cluster has a tag and compute the vertex locations and tag pose if
+found. We conduct both qualitative and quantitative experiments to demonstrate
+that our approach is the first method applicable to localize tags on a 3D LiDAR
+map while achieving better accuracy compared to previous methods. The
+open-source implementation of this work is available at:
+https://github.com/York-SDCNLab/Marker-Detection-General.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>6 pages</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Once-for-All: Controllable Generative Image Compression with Dynamic
+  Granularity Adaption 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.00758v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.00758v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Anqi Li, Yuxi Liu, Huihui Bai, Feng Li, Runmin Cong, Meng Wang, Yao Zhao
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Although recent generative image compression methods have demonstrated
+impressive potential in optimizing the rate-distortion-perception trade-off,
+they still face the critical challenge of flexible rate adaption to diverse
+compression necessities and scenarios. To overcome this challenge, this paper
+proposes a Controllable Generative Image Compression framework, Control-GIC,
+the first capable of fine-grained bitrate adaption across a broad spectrum
+while ensuring high-fidelity and generality compression. We base Control-GIC on
+a VQGAN framework representing an image as a sequence of variable-length codes
+(i.e. VQ-indices), which can be losslessly compressed and exhibits a direct
+positive correlation with the bitrates. Therefore, drawing inspiration from the
+classical coding principle, we naturally correlate the information density of
+local image patches with their granular representations, to achieve dynamic
+adjustment of the code quantity following different granularity decisions. This
+implies we can flexibly determine a proper allocation of granularity for the
+patches to acquire desirable compression rates. We further develop a
+probabilistic conditional decoder that can trace back to historic encoded
+multi-granularity representations according to transmitted codes, and then
+reconstruct hierarchical granular features in the formalization of conditional
+probability, enabling more informative aggregation to improve reconstruction
+realism. Our experiments show that Control-GIC allows highly flexible and
+controllable bitrate adaption and even once compression on an entire dataset to
+fulfill constrained bitrate conditions. Experimental results demonstrate its
+superior performance over recent state-of-the-art methods.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Mixture of Gaussian-distributed Prototypes with Generative Modelling for
+  Interpretable and Trustworthy Image Recognition 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2312.00092v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2312.00092v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Chong Wang, Yuanhong Chen, Fengbei Liu, Yuyuan Liu, Davis James McCarthy, Helen Frazer, Gustavo Carneiro
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Prototypical-part methods, e.g., ProtoPNet, enhance interpretability in image
+recognition by linking predictions to training prototypes, thereby offering
+intuitive insights into their decision-making. Existing methods, which rely on
+a point-based learning of prototypes, typically face two critical issues: 1)
+the learned prototypes have limited representation power and are not suitable
+to detect Out-of-Distribution (OoD) inputs, reducing their decision
+trustworthiness; and 2) the necessary projection of the learned prototypes back
+into the space of training images causes a drastic degradation in the
+predictive performance. Furthermore, current prototype learning adopts an
+aggressive approach that considers only the most active object parts during
+training, while overlooking sub-salient object regions which still hold crucial
+classification information. In this paper, we present a new generative paradigm
+to learn prototype distributions, termed as Mixture of Gaussian-distributed
+Prototypes (MGProto). The distribution of prototypes from MGProto enables both
+interpretable image classification and trustworthy recognition of OoD inputs.
+The optimisation of MGProto naturally projects the learned prototype
+distributions back into the training image space, thereby addressing the
+performance degradation caused by prototype projection. Additionally, we
+develop a novel and effective prototype mining strategy that considers not only
+the most active but also sub-salient object parts. To promote model
+compactness, we further propose to prune MGProto by removing prototypes with
+low importance priors. Experiments on CUB-200-2011, Stanford Cars, Stanford
+Dogs, and Oxford-IIIT Pets datasets show that MGProto achieves state-of-the-art
+image recognition and OoD detection performances, while providing encouraging
+interpretability results.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ How to Train Neural Field Representations: A Comprehensive Study and
+  Benchmark 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2312.10531v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2312.10531v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Samuele Papa, Riccardo Valperga, David Knigge, Miltiadis Kofinas, Phillip Lippe, Jan-Jakob Sonke, Efstratios Gavves
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Neural fields (NeFs) have recently emerged as a versatile method for modeling
+signals of various modalities, including images, shapes, and scenes.
+Subsequently, a number of works have explored the use of NeFs as
+representations for downstream tasks, e.g. classifying an image based on the
+parameters of a NeF that has been fit to it. However, the impact of the NeF
+hyperparameters on their quality as downstream representation is scarcely
+understood and remains largely unexplored. This is in part caused by the large
+amount of time required to fit datasets of neural fields.
+  In this work, we propose a JAX-based library that leverages parallelization
+to enable fast optimization of large-scale NeF datasets, resulting in a
+significant speed-up. With this library, we perform a comprehensive study that
+investigates the effects of different hyperparameters on fitting NeFs for
+downstream tasks. In particular, we explore the use of a shared initialization,
+the effects of overtraining, and the expressiveness of the network
+architectures used. Our study provides valuable insights on how to train NeFs
+and offers guidance for optimizing their effectiveness in downstream
+applications. Finally, based on the proposed library and our analysis, we
+propose Neural Field Arena, a benchmark consisting of neural field variants of
+popular vision datasets, including MNIST, CIFAR, variants of ImageNet, and
+ShapeNetv2. Our library and the Neural Field Arena will be open-sourced to
+introduce standardized benchmarking and promote further research on neural
+fields.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ AFF-ttention! Affordances and Attention models for Short-Term Object
+  Interaction Anticipation 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.01194v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.01194v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Lorenzo Mur-Labadia, Ruben Martinez-Cantin, Josechu Guerrero, Giovanni Maria Farinella, Antonino Furnari
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Short-Term object-interaction Anticipation consists of detecting the location
+of the next-active objects, the noun and verb categories of the interaction,
+and the time to contact from the observation of egocentric video. This ability
+is fundamental for wearable assistants or human robot interaction to understand
+the user goals, but there is still room for improvement to perform STA in a
+precise and reliable way. In this work, we improve the performance of STA
+predictions with two contributions: 1. We propose STAformer, a novel
+attention-based architecture integrating frame guided temporal pooling, dual
+image-video attention, and multiscale feature fusion to support STA predictions
+from an image-input video pair. 2. We introduce two novel modules to ground STA
+predictions on human behavior by modeling affordances.First, we integrate an
+environment affordance model which acts as a persistent memory of interactions
+that can take place in a given physical scene. Second, we predict interaction
+hotspots from the observation of hands and object trajectories, increasing
+confidence in STA predictions localized around the hotspot. Our results show
+significant relative Overall Top-5 mAP improvements of up to +45% on Ego4D and
++42% on a novel set of curated EPIC-Kitchens STA labels. We will release the
+code, annotations, and pre extracted affordances on Ego4D and EPIC- Kitchens to
+encourage future research in this area.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ CoopHash: Cooperative Learning of Multipurpose Descriptor and
+  Contrastive Pair Generator via Variational MCMC Teaching for Supervised Image
+  Hashing 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2210.04288v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2210.04288v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Khoa D. Doan, Jianwen Xie, Yaxuan Zhu, Yang Zhao, Ping Li
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Leveraging supervised information can lead to superior retrieval performance
+in the image hashing domain but the performance degrades significantly without
+enough labeled data. One effective solution to boost performance is to employ
+generative models, such as Generative Adversarial Networks (GANs), to generate
+synthetic data in an image hashing model. However, GAN-based methods are
+difficult to train, which prevents the hashing approaches from jointly training
+the generative models and the hash functions. This limitation results in
+sub-optimal retrieval performance. To overcome this limitation, we propose a
+novel framework, the generative cooperative hashing network, which is based on
+energy-based cooperative learning. This framework jointly learns a powerful
+generative representation of the data and a robust hash function via two
+components: a top-down contrastive pair generator that synthesizes contrastive
+images and a bottom-up multipurpose descriptor that simultaneously represents
+the images from multiple perspectives, including probability density, hash
+code, latent code, and category. The two components are jointly learned via a
+novel likelihood-based cooperative learning scheme. We conduct experiments on
+several real-world datasets and show that the proposed method outperforms the
+competing hashing supervised methods, achieving up to 10\% relative improvement
+over the current state-of-the-art supervised hashing methods, and exhibits a
+significantly better performance in out-of-distribution retrieval.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Robust CLIP: Unsupervised Adversarial Fine-Tuning of Vision Embeddings
+  for Robust Large Vision-Language Models <span class="chip">ICML 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2402.12336v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2402.12336v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Christian Schlarmann, Naman Deep Singh, Francesco Croce, Matthias Hein
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Multi-modal foundation models like OpenFlamingo, LLaVA, and GPT-4 are
+increasingly used for various real-world tasks. Prior work has shown that these
+models are highly vulnerable to adversarial attacks on the vision modality.
+These attacks can be leveraged to spread fake information or defraud users, and
+thus pose a significant risk, which makes the robustness of large multi-modal
+foundation models a pressing problem. The CLIP model, or one of its variants,
+is used as a frozen vision encoder in many large vision-language models
+(LVLMs), e.g. LLaVA and OpenFlamingo. We propose an unsupervised adversarial
+fine-tuning scheme to obtain a robust CLIP vision encoder, which yields
+robustness on all vision down-stream tasks (LVLMs, zero-shot classification)
+that rely on CLIP. In particular, we show that stealth-attacks on users of
+LVLMs by a malicious third party providing manipulated images are no longer
+possible once one replaces the original CLIP model with our robust one. No
+retraining or fine-tuning of the down-stream LVLMs is required. The code and
+robust models are available at https://github.com/chs20/RobustVLM
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>ICML 2024 Oral</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ The Chosen One: Consistent Characters in Text-to-Image Diffusion Models <span class="chip">SIGGRAPH 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2311.10093v4">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2311.10093v4.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Omri Avrahami, Amir Hertz, Yael Vinker, Moab Arar, Shlomi Fruchter, Ohad Fried, Daniel Cohen-Or, Dani Lischinski
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Recent advances in text-to-image generation models have unlocked vast
+potential for visual creativity. However, the users that use these models
+struggle with the generation of consistent characters, a crucial aspect for
+numerous real-world applications such as story visualization, game development,
+asset design, advertising, and more. Current methods typically rely on multiple
+pre-existing images of the target character or involve labor-intensive manual
+processes. In this work, we propose a fully automated solution for consistent
+character generation, with the sole input being a text prompt. We introduce an
+iterative procedure that, at each stage, identifies a coherent set of images
+sharing a similar identity and extracts a more consistent identity from this
+set. Our quantitative analysis demonstrates that our method strikes a better
+balance between prompt alignment and identity consistency compared to the
+baseline methods, and these findings are reinforced by a user study. To
+conclude, we showcase several practical applications of our approach.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted to SIGGRAPH 2024. Project page is available at
+  https://omriavrahami.com/the-chosen-one/</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Pulmonary Embolism Mortality Prediction Using Multimodal Learning Based
+  on Computed Tomography Angiography and Clinical Data 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.01302v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.01302v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Zhusi Zhong, Helen Zhang, Fayez H. Fayad, Andrew C. Lancaster, John Sollee, Shreyas Kulkarni, Cheng Ting Lin, Jie Li, Xinbo Gao, Scott Collins, Colin Greineder, Sun H. Ahn, Harrison X. Bai, Zhicheng Jiao, Michael K. Atalay
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Purpose: Pulmonary embolism (PE) is a significant cause of mortality in the
+United States. The objective of this study is to implement deep learning (DL)
+models using Computed Tomography Pulmonary Angiography (CTPA), clinical data,
+and PE Severity Index (PESI) scores to predict PE mortality. Materials and
+Methods: 918 patients (median age 64 years, range 13-99 years, 52% female) with
+3,978 CTPAs were identified via retrospective review across three institutions.
+To predict survival, an AI model was used to extract disease-related imaging
+features from CTPAs. Imaging features and/or clinical variables were then
+incorporated into DL models to predict survival outcomes. Four models were
+developed as follows: (1) using CTPA imaging features only; (2) using clinical
+variables only; (3) multimodal, integrating both CTPA and clinical variables;
+and (4) multimodal fused with calculated PESI score. Performance and
+contribution from each modality were evaluated using concordance index
+(c-index) and Net Reclassification Improvement, respectively. Performance was
+compared to PESI predictions using the Wilcoxon signed-rank test. Kaplan-Meier
+analysis was performed to stratify patients into high- and low-risk groups.
+Additional factor-risk analysis was conducted to account for right ventricular
+(RV) dysfunction. Results: For both data sets, the PESI-fused and multimodal
+models achieved higher c-indices than PESI alone. Following stratification of
+patients into high- and low-risk groups by multimodal and PESI-fused models,
+mortality outcomes differed significantly (both p<0.001). A strong correlation
+was found between high-risk grouping and RV dysfunction. Conclusions: Multiomic
+DL models incorporating CTPA features, clinical data, and PESI achieved higher
+c-indices than PESI alone for PE survival prediction.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Mitigating Hallucinations in Large Vision-Language Models with
+  Instruction Contrastive Decoding <span class="chip">ACL 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2403.18715v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2403.18715v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Xintong Wang, Jingheng Pan, Liang Ding, Chris Biemann
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Large Vision-Language Models (LVLMs) are increasingly adept at generating
+contextually detailed and coherent responses from visual inputs. However, their
+application in multimodal decision-making and open-ended generation is hindered
+by a notable rate of hallucinations, where generated text inaccurately
+represents the visual contents. To address this issue, this paper introduces
+the Instruction Contrastive Decoding (ICD) method, a novel approach designed to
+reduce hallucinations during LVLM inference. Our method is inspired by our
+observation that what we call disturbance instructions significantly exacerbate
+hallucinations in multimodal fusion modules. ICD contrasts distributions from
+standard and instruction disturbance, thereby increasing alignment uncertainty
+and effectively subtracting hallucinated concepts from the original
+distribution. Through comprehensive experiments on discriminative benchmarks
+(POPE and MME) and a generative benchmark (LLaVa-Bench), we demonstrate that
+ICD significantly mitigates both object-level and attribute-level
+hallucinations. Moreover, our method not only addresses hallucinations but also
+significantly enhances the general perception and recognition capabilities of
+LVLMs.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted to Findings of ACL 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ FindingEmo: An Image <span class="highlight-title">Dataset</span> for Emotion Recognition in the Wild 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2402.01355v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2402.01355v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Laurent Mertens, Elahe' Yargholi, Hans Op de Beeck, Jan Van den Stock, Joost Vennekens
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  We introduce FindingEmo, a new image dataset containing annotations for 25k
+images, specifically tailored to Emotion Recognition. Contrary to existing
+datasets, it focuses on complex scenes depicting multiple people in various
+naturalistic, social settings, with images being annotated as a whole, thereby
+going beyond the traditional focus on faces or single individuals. Annotated
+dimensions include Valence, Arousal and Emotion label, with annotations
+gathered using Prolific. Together with the annotations, we release the list of
+URLs pointing to the original images, as well as all associated source code.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>33 pages, 21 figures, 12 tables</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ IterMask2: Iterative Unsupervised Anomaly Segmentation via Spatial and
+  Frequency Masking for Brain Lesions in MRI 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.02422v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.02422v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Ziyun Liang, Xiaoqing Guo, J. Alison Noble, Konstantinos Kamnitsas
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Unsupervised anomaly segmentation approaches to pathology segmentation train
+a model on images of healthy subjects, that they define as the 'normal' data
+distribution. At inference, they aim to segment any pathologies in new images
+as 'anomalies', as they exhibit patterns that deviate from those in 'normal'
+training data. Prevailing methods follow the 'corrupt-and-reconstruct'
+paradigm. They intentionally corrupt an input image, reconstruct it to follow
+the learned 'normal' distribution, and subsequently segment anomalies based on
+reconstruction error. Corrupting an input image, however, inevitably leads to
+suboptimal reconstruction even of normal regions, causing false positives. To
+alleviate this, we propose a novel iterative spatial mask-refining strategy
+IterMask2. We iteratively mask areas of the image, reconstruct them, and update
+the mask based on reconstruction error. This iterative process progressively
+adds information about areas that are confidently normal as per the model. The
+increasing content guides reconstruction of nearby masked areas, improving
+reconstruction of normal tissue under these areas, reducing false positives. We
+also use high-frequency image content as an auxiliary input to provide
+additional structural information for masked areas. This further improves
+reconstruction error of normal in comparison to anomalous areas, facilitating
+segmentation of the latter. We conduct experiments on several brain lesion
+datasets and demonstrate effectiveness of our method. Code is available at:
+https://github.com/ZiyunLiang/IterMask2
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Position: Quo Vadis, Unsupervised Time Series Anomaly Detection? <span class="chip">ICML 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.02678v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.02678v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        M. Saquib Sarfraz, Mei-Yen Chen, Lukas Layer, Kunyu Peng, Marios Koulakis
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  The current state of machine learning scholarship in Timeseries Anomaly
+Detection (TAD) is plagued by the persistent use of flawed evaluation metrics,
+inconsistent benchmarking practices, and a lack of proper justification for the
+choices made in novel deep learning-based model designs. Our paper presents a
+critical analysis of the status quo in TAD, revealing the misleading track of
+current research and highlighting problematic methods, and evaluation
+practices. Our position advocates for a shift in focus from solely pursuing
+novel model designs to improving benchmarking practices, creating non-trivial
+datasets, and critically evaluating the utility of complex methods against
+simpler baselines. Our findings demonstrate the need for rigorous evaluation
+protocols, the creation of simple baselines, and the revelation that
+state-of-the-art deep anomaly detection models effectively learn linear
+mappings. These findings suggest the need for more exploration and development
+of simple and interpretable TAD methods. The increment of model complexity in
+the state-of-the-art deep-learning based models unfortunately offers very
+little improvement. We offer insights and suggestions for the field to move
+forward.
+  Code: https://github.com/ssarfraz/QuoVadisTAD
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>ICML 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ AnoVox: A Benchmark for Multimodal Anomaly Detection in Autonomous
+  Driving 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.07865v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.07865v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Daniel Bogdoll, Iramm Hamdard, Lukas Namgyu Rößler, Felix Geisler, Muhammed Bayram, Felix Wang, Jan Imhof, Miguel de Campos, Anushervon Tabarov, Yitian Yang, Hanno Gottschalk, J. Marius Zöllner
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  The scale-up of autonomous vehicles depends heavily on their ability to deal
+with anomalies, such as rare objects on the road. In order to handle such
+situations, it is necessary to detect anomalies in the first place. Anomaly
+detection for autonomous driving has made great progress in the past years but
+suffers from poorly designed benchmarks with a strong focus on camera data. In
+this work, we propose AnoVox, the largest benchmark for ANOmaly detection in
+autonomous driving to date. AnoVox incorporates large-scale multimodal sensor
+data and spatial VOXel ground truth, allowing for the comparison of methods
+independent of their used sensor. We propose a formal definition of normality
+and provide a compliant training dataset. AnoVox is the first benchmark to
+contain both content and temporal anomalies.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Daniel Bogdoll, Iramm Hamdard, and Lukas Namgyu R\"o{\ss}ler
+  contributed equally</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Unified-modal Salient Object Detection via Adaptive <span class="highlight-title">Prompt</span> Learning 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2311.16835v5">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2311.16835v5.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Kunpeng Wang, Chenglong Li, Zhengzheng Tu, Zhengyi Liu, Bin Luo
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Existing single-modal and multi-modal salient object detection (SOD) methods
+focus on designing specific architectures tailored for their respective tasks.
+However, developing completely different models for different tasks leads to
+labor and time consumption, as well as high computational and practical
+deployment costs. In this paper, we attempt to address both single-modal and
+multi-modal SOD in a unified framework called UniSOD, which fully exploits the
+overlapping prior knowledge between different tasks. Nevertheless, assigning
+appropriate strategies to modality variable inputs is challenging. To this end,
+UniSOD learns modality-aware prompts with task-specific hints through adaptive
+prompt learning, which are plugged into the proposed pre-trained baseline SOD
+model to handle corresponding tasks, while only requiring few learnable
+parameters compared to training the entire model. Each modality-aware prompt is
+generated from a switchable prompt generation block, which adaptively performs
+structural switching based on single-modal and multi-modal inputs without human
+intervention. Through end-to-end joint training, UniSOD achieves overall
+performance improvement on 14 benchmark datasets for RGB, RGB-D, and RGB-T SOD,
+which demonstrates that our method effectively and efficiently unifies
+single-modal and multi-modal SOD tasks.The code and results are available at
+https://github.com/Angknpng/UniSOD.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>13 pages, 11 figures</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ PLA4D: Pixel-Level Alignments for Text-to-4D Gaussian Splatting 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.19957v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.19957v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Qiaowei Miao, Yawei Luo, Yi Yang
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  As text-conditioned diffusion models (DMs) achieve breakthroughs in image,
+video, and 3D generation, the research community's focus has shifted to the
+more challenging task of text-to-4D synthesis, which introduces a temporal
+dimension to generate dynamic 3D objects. In this context, we identify Score
+Distillation Sampling (SDS), a widely used technique for text-to-3D synthesis,
+as a significant hindrance to text-to-4D performance due to its Janus-faced and
+texture-unrealistic problems coupled with high computational costs. In this
+paper, we propose \textbf{P}ixel-\textbf{L}evel \textbf{A}lignments for
+Text-to-\textbf{4D} Gaussian Splatting (\textbf{PLA4D}), a novel method that
+utilizes text-to-video frames as explicit pixel alignment targets to generate
+static 3D objects and inject motion into them. Specifically, we introduce Focal
+Alignment to calibrate camera poses for rendering and GS-Mesh Contrastive
+Learning to distill geometry priors from rendered image contrasts at the pixel
+level. Additionally, we develop Motion Alignment using a deformation network to
+drive changes in Gaussians and implement Reference Refinement for smooth 4D
+object surfaces. These techniques enable 4D Gaussian Splatting to align
+geometry, texture, and motion with generated videos at the pixel level.
+Compared to previous methods, PLA4D produces synthesized outputs with better
+texture details in less time and effectively mitigates the Janus-faced problem.
+PLA4D is fully implemented using open-source models, offering an accessible,
+user-friendly, and promising direction for 4D digital content creation. Our
+project page: https://miaoqiaowei.github.io/PLA4D/.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Cobra: Extending Mamba to Multi-Modal Large Language Model for Efficient
+  Inference 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2403.14520v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2403.14520v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Han Zhao, Min Zhang, Wei Zhao, Pengxiang Ding, Siteng Huang, Donglin Wang
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  In recent years, the application of multimodal large language models (MLLM)
+in various fields has achieved remarkable success. However, as the foundation
+model for many downstream tasks, current MLLMs are composed of the well-known
+Transformer network, which has a less efficient quadratic computation
+complexity. To improve the efficiency of such basic models, we propose Cobra, a
+linear computational complexity MLLM. Specifically, Cobra integrates the
+efficient Mamba language model into the visual modality. Moreover, we explore
+and study various modal fusion schemes to create an effective multi-modal
+Mamba. Extensive experiments demonstrate that (1) Cobra achieves extremely
+competitive performance with current computationally efficient state-of-the-art
+methods, e.g., LLaVA-Phi, TinyLLaVA, and MobileVLM v2, and has faster speed due
+to Cobra's linear sequential modeling. (2) Interestingly, the results of
+closed-set challenging prediction benchmarks show that Cobra performs well in
+overcoming visual illusions and spatial relationship judgments. (3) Notably,
+Cobra even achieves comparable performance to LLaVA with about 43% of the
+number of parameters. We will make all codes of Cobra open-source and hope that
+the proposed method can facilitate future research on complexity problems in
+MLLM. Our project page is available at: https://sites.google.com/view/cobravlm.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Update ablation results</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ AD3: Implicit Action is the Key for World Models to Distinguish the
+  Diverse Visual Distractors 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2403.09976v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2403.09976v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Yucen Wang, Shenghua Wan, Le Gan, Shuai Feng, De-Chuan Zhan
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Model-based methods have significantly contributed to distinguishing
+task-irrelevant distractors for visual control. However, prior research has
+primarily focused on heterogeneous distractors like noisy background videos,
+leaving homogeneous distractors that closely resemble controllable agents
+largely unexplored, which poses significant challenges to existing methods. To
+tackle this problem, we propose Implicit Action Generator (IAG) to learn the
+implicit actions of visual distractors, and present a new algorithm named
+implicit Action-informed Diverse visual Distractors Distinguisher (AD3), that
+leverages the action inferred by IAG to train separated world models. Implicit
+actions effectively capture the behavior of background distractors, aiding in
+distinguishing the task-irrelevant components, and the agent can optimize the
+policy within the task-relevant state space. Our method achieves superior
+performance on various visual control tasks featuring both heterogeneous and
+homogeneous distractors. The indispensable role of implicit actions learned by
+IAG is also empirically validated.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ SSR-2D: Semantic 3D Scene Reconstruction from 2D Images 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2302.03640v4">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2302.03640v4.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Junwen Huang, Alexey Artemov, Yujin Chen, Shuaifeng Zhi, Kai Xu, Matthias Nießner
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Most deep learning approaches to comprehensive semantic modeling of 3D indoor
+spaces require costly dense annotations in the 3D domain. In this work, we
+explore a central 3D scene modeling task, namely, semantic scene reconstruction
+without using any 3D annotations. The key idea of our approach is to design a
+trainable model that employs both incomplete 3D reconstructions and their
+corresponding source RGB-D images, fusing cross-domain features into volumetric
+embeddings to predict complete 3D geometry, color, and semantics with only 2D
+labeling which can be either manual or machine-generated. Our key technical
+innovation is to leverage differentiable rendering of color and semantics to
+bridge 2D observations and unknown 3D space, using the observed RGB images and
+2D semantics as supervision, respectively. We additionally develop a learning
+pipeline and corresponding method to enable learning from imperfect predicted
+2D labels, which could be additionally acquired by synthesizing in an augmented
+set of virtual training views complementing the original real captures,
+enabling more efficient self-supervision loop for semantics. As a result, our
+end-to-end trainable solution jointly addresses geometry completion,
+colorization, and semantic mapping from limited RGB-D images, without relying
+on any 3D ground-truth information. Our method achieves the state-of-the-art
+performance of semantic scene completion on two large-scale benchmark datasets
+MatterPort3D and ScanNet, surpasses baselines even with costly 3D annotations
+in predicting both geometry and semantics. To our knowledge, our method is also
+the first 2D-driven method addressing completion and semantic segmentation of
+real-world 3D scans simultaneously.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ ContrastAlign: Toward Robust BEV Feature Alignment via Contrastive
+  Learning for Multi-Modal 3D Object Detection 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.16873v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.16873v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Ziying Song, Feiyang Jia, Hongyu Pan, Yadan Luo, Caiyan Jia, Guoxin Zhang, Lin Liu, Yang Ji, Lei Yang, Li Wang
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  In the field of 3D object detection tasks, fusing heterogeneous features from
+LiDAR and camera sensors into a unified Bird's Eye View (BEV) representation is
+a widely adopted paradigm. However, existing methods are often compromised by
+imprecise sensor calibration, resulting in feature misalignment in LiDAR-camera
+BEV fusion. Moreover, such inaccuracies result in errors in depth estimation
+for the camera branch, ultimately causing misalignment between LiDAR and camera
+BEV features. In this work, we propose a novel ContrastAlign approach that
+utilizes contrastive learning to enhance the alignment of heterogeneous
+modalities, thereby improving the robustness of the fusion process.
+Specifically, our approach includes the L-Instance module, which directly
+outputs LiDAR instance features within LiDAR BEV features. Then, we introduce
+the C-Instance module, which predicts camera instance features through RoI
+(Region of Interest) pooling on the camera BEV features. We propose the
+InstanceFusion module, which utilizes contrastive learning to generate similar
+instance features across heterogeneous modalities. We then use graph matching
+to calculate the similarity between the neighboring camera instance features
+and the similarity instance features to complete the alignment of instance
+features. Our method achieves state-of-the-art performance, with an mAP of
+70.3%, surpassing BEVFusion by 1.8% on the nuScenes validation set.
+Importantly, our method outperforms BEVFusion by 7.3% under conditions with
+misalignment noise.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Key-Locked Rank One Editing for Text-to-Image Personalization <span class="chip">SIGGRAPH 2023</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2305.01644v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2305.01644v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Yoad Tewel, Rinon Gal, Gal Chechik, Yuval Atzmon
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Text-to-image models (T2I) offer a new level of flexibility by allowing users
+to guide the creative process through natural language. However, personalizing
+these models to align with user-provided visual concepts remains a challenging
+problem. The task of T2I personalization poses multiple hard challenges, such
+as maintaining high visual fidelity while allowing creative control, combining
+multiple personalized concepts in a single image, and keeping a small model
+size. We present Perfusion, a T2I personalization method that addresses these
+challenges using dynamic rank-1 updates to the underlying T2I model. Perfusion
+avoids overfitting by introducing a new mechanism that "locks" new concepts'
+cross-attention Keys to their superordinate category. Additionally, we develop
+a gated rank-1 approach that enables us to control the influence of a learned
+concept during inference time and to combine multiple concepts. This allows
+runtime-efficient balancing of visual-fidelity and textual-alignment with a
+single 100KB trained model, which is five orders of magnitude smaller than the
+current state of the art. Moreover, it can span different operating points
+across the Pareto front without additional training. Finally, we show that
+Perfusion outperforms strong baselines in both qualitative and quantitative
+terms. Importantly, key-locking leads to novel results compared to traditional
+approaches, allowing to portray personalized object interactions in
+unprecedented ways, even in one-shot settings.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted to SIGGRAPH 2023. Project page is in
+  https://research.nvidia.com/labs/par/Perfusion/</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Distribution-Aware Data Expansion with Diffusion Models 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2403.06741v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2403.06741v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Haowei Zhu, Ling Yang, Jun-Hai Yong, Hongzhi Yin, Jiawei Jiang, Meng Xiao, Wentao Zhang, Bin Wang
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  The scale and quality of a dataset significantly impact the performance of
+deep models. However, acquiring large-scale annotated datasets is both a costly
+and time-consuming endeavor. To address this challenge, dataset expansion
+technologies aim to automatically augment datasets, unlocking the full
+potential of deep models. Current data expansion techniques include image
+transformation and image synthesis methods. Transformation-based methods
+introduce only local variations, leading to limited diversity. In contrast,
+synthesis-based methods generate entirely new content, greatly enhancing
+informativeness. However, existing synthesis methods carry the risk of
+distribution deviations, potentially degrading model performance with
+out-of-distribution samples. In this paper, we propose DistDiff, a
+training-free data expansion framework based on the distribution-aware
+diffusion model. DistDiff constructs hierarchical prototypes to approximate the
+real data distribution, optimizing latent data points within diffusion models
+with hierarchical energy guidance. We demonstrate its capability to generate
+distribution-consistent samples, significantly improving data expansion tasks.
+DistDiff consistently enhances accuracy across a diverse range of datasets
+compared to models trained solely on original data. Furthermore, our approach
+consistently outperforms existing synthesis-based techniques and demonstrates
+compatibility with widely adopted transformation-based augmentation methods.
+Additionally, the expanded dataset exhibits robustness across various
+architectural frameworks. Our code is available at
+https://github.com/haoweiz23/DistDiff
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Project: https://github.com/haoweiz23/DistDiff</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Correctable Landmark Discovery via Large Models for Vision-Language
+  Navigation 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.18721v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.18721v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Bingqian Lin, Yunshuang Nie, Ziming Wei, Yi Zhu, Hang Xu, Shikui Ma, Jianzhuang Liu, Xiaodan Liang
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Vision-Language Navigation (VLN) requires the agent to follow language
+instructions to reach a target position. A key factor for successful navigation
+is to align the landmarks implied in the instruction with diverse visual
+observations. However, previous VLN agents fail to perform accurate modality
+alignment especially in unexplored scenes, since they learn from limited
+navigation data and lack sufficient open-world alignment knowledge. In this
+work, we propose a new VLN paradigm, called COrrectable LaNdmark DiScOvery via
+Large ModEls (CONSOLE). In CONSOLE, we cast VLN as an open-world sequential
+landmark discovery problem, by introducing a novel correctable landmark
+discovery scheme based on two large models ChatGPT and CLIP. Specifically, we
+use ChatGPT to provide rich open-world landmark cooccurrence commonsense, and
+conduct CLIP-driven landmark discovery based on these commonsense priors. To
+mitigate the noise in the priors due to the lack of visual constraints, we
+introduce a learnable cooccurrence scoring module, which corrects the
+importance of each cooccurrence according to actual observations for accurate
+landmark discovery. We further design an observation enhancement strategy for
+an elegant combination of our framework with different VLN agents, where we
+utilize the corrected landmark features to obtain enhanced observation features
+for action decision. Extensive experimental results on multiple popular VLN
+benchmarks (R2R, REVERIE, R4R, RxR) show the significant superiority of CONSOLE
+over strong baselines. Especially, our CONSOLE establishes the new
+state-of-the-art results on R2R and R4R in unseen scenarios. Code is available
+at https://github.com/expectorlin/CONSOLE.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted by TPAMI 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Revisiting the Trade-off between Accuracy and Robustness via Weight
+  Distribution of Filters 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2306.03430v4">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2306.03430v4.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Xingxing Wei, Shiji Zhao, Bo li
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Adversarial attacks have been proven to be potential threats to Deep Neural
+Networks (DNNs), and many methods are proposed to defend against adversarial
+attacks. However, while enhancing the robustness, the clean accuracy will
+decline to a certain extent, implying a trade-off existed between the accuracy
+and robustness. In this paper, to meet the trade-off problem, we theoretically
+explore the underlying reason for the difference of the filters' weight
+distribution between standard-trained and robust-trained models and then argue
+that this is an intrinsic property for static neural networks, thus they are
+difficult to fundamentally improve the accuracy and adversarial robustness at
+the same time. Based on this analysis, we propose a sample-wise dynamic network
+architecture named Adversarial Weight-Varied Network (AW-Net), which focuses on
+dealing with clean and adversarial examples with a "divide and rule" weight
+strategy. The AW-Net adaptively adjusts the network's weights based on
+regulation signals generated by an adversarial router, which is directly
+influenced by the input sample. Benefiting from the dynamic network
+architecture, clean and adversarial examples can be processed with different
+network weights, which provides the potential to enhance both accuracy and
+adversarial robustness. A series of experiments demonstrate that our AW-Net is
+architecture-friendly to handle both clean and adversarial examples and can
+achieve better trade-off performance than state-of-the-art robust models.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted by TPAMI2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ EgoExoLearn: A <span class="highlight-title">Dataset</span> for Bridging Asynchronous Ego- and Exo-centric
+  View of Procedural Activities in Real World <span class="chip">CVPR 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2403.16182v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2403.16182v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Yifei Huang, Guo Chen, Jilan Xu, Mingfang Zhang, Lijin Yang, Baoqi Pei, Hongjie Zhang, Lu Dong, Yali Wang, Limin Wang, Yu Qiao
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Being able to map the activities of others into one's own point of view is
+one fundamental human skill even from a very early age. Taking a step toward
+understanding this human ability, we introduce EgoExoLearn, a large-scale
+dataset that emulates the human demonstration following process, in which
+individuals record egocentric videos as they execute tasks guided by
+demonstration videos. Focusing on the potential applications in daily
+assistance and professional support, EgoExoLearn contains egocentric and
+demonstration video data spanning 120 hours captured in daily life scenarios
+and specialized laboratories. Along with the videos we record high-quality gaze
+data and provide detailed multimodal annotations, formulating a playground for
+modeling the human ability to bridge asynchronous procedural actions from
+different viewpoints. To this end, we present benchmarks such as cross-view
+association, cross-view action planning, and cross-view referenced skill
+assessment, along with detailed analysis. We expect EgoExoLearn can serve as an
+important resource for bridging the actions across views, thus paving the way
+for creating AI agents capable of seamlessly learning by observing humans in
+the real world. Code and data can be found at:
+https://github.com/OpenGVLab/EgoExoLearn
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>CVPR 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Synergistic Integration of Coordinate Network and Tensorial Feature for
+  Improving Neural Radiance Fields from Sparse Inputs <span class="chip">ICML2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.07857v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.07857v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Mingyu Kim, Jun-Seong Kim, Se-Young Yun, Jin-Hwa Kim
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  The multi-plane representation has been highlighted for its fast training and
+inference across static and dynamic neural radiance fields. This approach
+constructs relevant features via projection onto learnable grids and
+interpolating adjacent vertices. However, it has limitations in capturing
+low-frequency details and tends to overuse parameters for low-frequency
+features due to its bias toward fine details, despite its multi-resolution
+concept. This phenomenon leads to instability and inefficiency when training
+poses are sparse. In this work, we propose a method that synergistically
+integrates multi-plane representation with a coordinate-based MLP network known
+for strong bias toward low-frequency signals. The coordinate-based network is
+responsible for capturing low-frequency details, while the multi-plane
+representation focuses on capturing fine-grained details. We demonstrate that
+using residual connections between them seamlessly preserves their own inherent
+properties. Additionally, the proposed progressive training scheme accelerates
+the disentanglement of these two features. We demonstrate empirically that our
+proposed method not only outperforms baseline models for both static and
+dynamic NeRFs with sparse inputs, but also achieves comparable results with
+fewer parameters.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>ICML2024 ; Project page is accessible at
+  https://mingyukim87.github.io/SynergyNeRF ; Code is available at
+  https://github.com/MingyuKim87/SynergyNeRF</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ PixelLM: Pixel Reasoning with Large Multimodal Model <span class="chip">CVPR 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2312.02228v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2312.02228v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Zhongwei Ren, Zhicheng Huang, Yunchao Wei, Yao Zhao, Dongmei Fu, Jiashi Feng, Xiaojie Jin
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  While large multimodal models (LMMs) have achieved remarkable progress,
+generating pixel-level masks for image reasoning tasks involving multiple
+open-world targets remains a challenge. To bridge this gap, we introduce
+PixelLM, an effective and efficient LMM for pixel-level reasoning and
+understanding. Central to PixelLM is a novel, lightweight pixel decoder and a
+comprehensive segmentation codebook. The decoder efficiently produces masks
+from the hidden embeddings of the codebook tokens, which encode detailed
+target-relevant information. With this design, PixelLM harmonizes with the
+structure of popular LMMs and avoids the need for additional costly
+segmentation models. Furthermore, we propose a target refinement loss to
+enhance the model's ability to differentiate between multiple targets, leading
+to substantially improved mask quality. To advance research in this area, we
+construct MUSE, a high-quality multi-target reasoning segmentation benchmark.
+PixelLM excels across various pixel-level image reasoning and understanding
+tasks, outperforming well-established methods in multiple benchmarks, including
+MUSE, single- and multi-referring segmentation. Comprehensive ablations confirm
+the efficacy of each proposed component. All code, models, and datasets will be
+publicly available.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>(Accepted by CVPR 2024) Code and models are released at:
+  https://pixellm.github.io/</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ FUSU: A Multi-temporal-source Land Use Change Segmentation <span class="highlight-title">Dataset</span> for
+  Fine-grained Urban Semantic Understanding 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.19055v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.19055v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Shuai Yuan, Guancong Lin, Lixian Zhang, Runmin Dong, Jinxiao Zhang, Shuang Chen, Juepeng Zheng, Jie Wang, Haohuan Fu
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Fine urban change segmentation using multi-temporal remote sensing images is
+essential for understanding human-environment interactions in urban areas.
+Despite advances in remote sensing data for urban monitoring, coarse-grained
+classification systems and the lack of continuous temporal observations hinder
+the application of deep learning to urban change analysis. To address this, we
+introduce FUSU, a multi-source, multi-temporal change segmentation dataset for
+Fine-grained Urban Semantic Understanding. FUSU features the most detailed land
+use classification system to date, with 17 classes and 30 billion pixels of
+annotations. It includes bi-temporal high-resolution satellite images with
+20-50 cm ground sample distance and monthly optical and radar satellite time
+series, covering 847 km2 across five urban areas in China. The fine-grained
+pixel-wise annotations and high spatial-temporal resolution data provide a
+robust foundation for deep learning models to understand urbanization and land
+use changes. To fully leverage FUSU, we propose a unified time-series
+architecture for both change detection and segmentation and then benchmark FUSU
+on various methods for several tasks. Dataset and code will be available at:
+https://github.com/yuanshuai0914/FUSU.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Few-shot Object Localization 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2403.12466v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2403.12466v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Yunhan Ren, Bo Li, Chengyang Zhang, Yong Zhang, Baocai Yin
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Existing object localization methods are tailored to locate specific classes
+of objects, relying heavily on abundant labeled data for model optimization.
+However, acquiring large amounts of labeled data is challenging in many
+real-world scenarios, significantly limiting the broader application of
+localization models. To bridge this research gap, this paper defines a novel
+task named Few-Shot Object Localization (FSOL), which aims to achieve precise
+localization with limited samples. This task achieves generalized object
+localization by leveraging a small number of labeled support samples to query
+the positional information of objects within corresponding images. To advance
+this field, we design an innovative high-performance baseline model. This model
+integrates a dual-path feature augmentation module to enhance shape association
+and gradient differences between supports and query images, alongside a self
+query module to explore the association between feature maps and query images.
+Experimental results demonstrate a significant performance improvement of our
+approach in the FSOL task, establishing an efficient benchmark for further
+research. All codes and data are available at https://github.com/Ryh1218/FSOL.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ KerasCV and KerasNLP: Vision and Language Power-Ups 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20247v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20247v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Matthew Watson, Divyashree Shivakumar Sreepathihalli, Francois Chollet, Martin Gorner, Kiranbir Sodhia, Ramesh Sampath, Tirth Patel, Haifeng Jin, Neel Kovelamudi, Gabriel Rasskin, Samaneh Saadat, Luke Wood, Chen Qian, Jonathan Bischof, Ian Stenbit, Abheesht Sharma, Anshuman Mishra
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  We present the Keras domain packages KerasCV and KerasNLP, extensions of the
+Keras API for Computer Vision and Natural Language Processing workflows,
+capable of running on either JAX, TensorFlow, or PyTorch. These domain packages
+are designed to enable fast experimentation, with a focus on ease-of-use and
+performance. We adopt a modular, layered design: at the library's lowest level
+of abstraction, we provide building blocks for creating models and data
+preprocessing pipelines, and at the library's highest level of abstraction, we
+provide pretrained ``task" models for popular architectures such as Stable
+Diffusion, YOLOv8, GPT2, BERT, Mistral, CLIP, Gemma, T5, etc. Task models have
+built-in preprocessing, pretrained weights, and can be fine-tuned on raw
+inputs. To enable efficient training, we support XLA compilation for all
+models, and run all preprocessing via a compiled graph of TensorFlow operations
+using the tf.data API. The libraries are fully open-source (Apache 2.0 license)
+and available on GitHub.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Submitted to Journal of Machine Learning Open Source Software</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Randomized Principal Component Analysis for Hyperspectral Image
+  Classification 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2403.09117v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2403.09117v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Mustafa Ustuner
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  The high-dimensional feature space of the hyperspectral imagery poses major
+challenges to the processing and analysis of the hyperspectral data sets. In
+such a case, dimensionality reduction is necessary to decrease the
+computational complexity. The random projections open up new ways of
+dimensionality reduction, especially for large data sets. In this paper, the
+principal component analysis (PCA) and randomized principal component analysis
+(R-PCA) for the classification of hyperspectral images using support vector
+machines (SVM) and light gradient boosting machines (LightGBM) have been
+investigated. In this experimental research, the number of features was reduced
+to 20 and 30 for classification of two hyperspectral datasets (Indian Pines and
+Pavia University). The experimental results demonstrated that PCA outperformed
+R-PCA for SVM for both datasets, but received close accuracy values for
+LightGBM. The highest classification accuracies were obtained as 0.9925 and
+0.9639 by LightGBM with original features for the Pavia University and Indian
+Pines, respectively.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>5 pages, I have submitted this paper to M2GARSS 2024, 2024 IEEE
+  Mediterranean and Middle-East Geoscience and Remote Sensing Symposium</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Efficient HDR Reconstruction from Real-World Raw Images 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2306.10311v5">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2306.10311v5.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Qirui Yang, Yihao Liu, Qihua Chen, Huanjing Yue, Kun Li, Jingyu Yang
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  The widespread usage of high-definition screens on edge devices stimulates a
+strong demand for efficient high dynamic range (HDR) algorithms. However, many
+existing HDR methods either deliver unsatisfactory results or consume too much
+computational and memory resources, hindering their application to
+high-resolution images (usually with more than 12 megapixels) in practice. In
+addition, existing HDR dataset collection methods often are labor-intensive. In
+this work, in a new aspect, we discover an excellent opportunity for HDR
+reconstructing directly from raw images and investigating novel neural network
+structures that benefit the deployment of mobile devices. Our key insights are
+threefold: (1) we develop a lightweight-efficient HDR model, RepUNet, using the
+structural re-parameterization technique to achieve fast and robust HDR; (2) we
+design a new computational raw HDR data formation pipeline and construct a
+real-world raw HDR dataset, RealRaw-HDR; (3) we propose a plug-and-play motion
+alignment loss to mitigate motion ghosting under limited bandwidth conditions.
+Our model contains less than 830K parameters and takes less than 3 ms to
+process an image of 4K resolution using one RTX 3090 GPU. While being highly
+efficient, our model also outperforms the state-of-the-art HDR methods in terms
+of PSNR, SSIM, and a color difference metric.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ PixOOD: Pixel-Level Out-of-Distribution Detection 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.19882v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.19882v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Tomáš Vojíř, Jan Šochman, Jiří Matas
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  We propose a dense image prediction out-of-distribution detection algorithm,
+called PixOOD, which does not require training on samples of anomalous data and
+is not designed for a specific application which avoids traditional training
+biases. In order to model the complex intra-class variability of the
+in-distribution data at the pixel level, we propose an online data condensation
+algorithm which is more robust than standard K-means and is easily trainable
+through SGD. We evaluate PixOOD on a wide range of problems. It achieved
+state-of-the-art results on four out of seven datasets, while being competitive
+on the rest. The source code is available at https://github.com/vojirt/PixOOD.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>under review</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Zero-shot High-fidelity and Pose-controllable Character Animation 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2404.13680v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2404.13680v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Bingwen Zhu, Fanyi Wang, Tianyi Lu, Peng Liu, Jingwen Su, Jinxiu Liu, Yanhao Zhang, Zuxuan Wu, Guo-Jun Qi, Yu-Gang Jiang
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Image-to-video (I2V) generation aims to create a video sequence from a single
+image, which requires high temporal coherence and visual fidelity. However,
+existing approaches suffer from inconsistency of character appearances and poor
+preservation of fine details. Moreover, they require a large amount of video
+data for training, which can be computationally demanding. To address these
+limitations, we propose PoseAnimate, a novel zero-shot I2V framework for
+character animation. PoseAnimate contains three key components: 1) a Pose-Aware
+Control Module (PACM) that incorporates diverse pose signals into text
+embeddings, to preserve character-independent content and maintain precise
+alignment of actions. 2) a Dual Consistency Attention Module (DCAM) that
+enhances temporal consistency and retains character identity and intricate
+background details. 3) a Mask-Guided Decoupling Module (MGDM) that refines
+distinct feature perception abilities, improving animation fidelity by
+decoupling the character and background. We also propose a Pose Alignment
+Transition Algorithm (PATA) to ensure smooth action transition. Extensive
+experiment results demonstrate that our approach outperforms the
+state-of-the-art training-based methods in terms of character consistency and
+detail fidelity. Moreover, it maintains a high level of temporal coherence
+throughout the generated animations.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>10 pages, 5 figures</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ An open <span class="highlight-title">dataset</span> for oracle bone script recognition and decipherment 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2401.15365v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2401.15365v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Pengjie Wang, Kaile Zhang, Xinyu Wang, Shengwei Han, Yongge Liu, Jinpeng Wan, Haisu Guan, Zhebin Kuang, Lianwen Jin, Xiang Bai, Yuliang Liu
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Oracle Bone Script (OBS), one of the earliest known forms of ancient Chinese
+writing, holds invaluable insights into the humanities and geography of the
+Shang Dynasty, dating back 3,000 years. The immense historical and cultural
+significance of these writings cannot be overstated. However, the passage of
+time has obscured much of their meaning, presenting a significant challenge in
+deciphering these ancient texts. With the advent of Artificial Intelligence
+(AI), employing AI to assist in interpreting OBS has become a feasible option.
+Yet, progress in this area has been hindered by a lack of high-quality
+datasets. To address this issue, this paper details the creation of the
+HUST-OBS dataset. This dataset encompasses 77,064 images of 1,588 individual
+deciphered scripts and 62,989 images of 9,411 undeciphered characters, with a
+total of 140,053 images, compiled from diverse sources. Additionally, all
+images and labels have been reviewed and corrected by experts in oracle bone
+studies. The hope is that this dataset could inspire and assist future research
+in deciphering those unknown OBS. All the codes and datasets are available at
+https://github.com/Pengjie-W/HUST-OBC.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ EffiVED:Efficient Video Editing via Text-instruction Diffusion Models 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2403.11568v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2403.11568v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Zhenghao Zhang, Zuozhuo Dai, Long Qin, Weizhi Wang
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Large-scale text-to-video models have shown remarkable abilities, but their
+direct application in video editing remains challenging due to limited
+available datasets. Current video editing methods commonly require per-video
+fine-tuning of diffusion models or specific inversion optimization to ensure
+high-fidelity edits. In this paper, we introduce EffiVED, an efficient
+diffusion-based model that directly supports instruction-guided video editing.
+To achieve this, we present two efficient workflows to gather video editing
+pairs, utilizing augmentation and fundamental vision-language techniques. These
+workflows transform vast image editing datasets and open-world videos into a
+high-quality dataset for training EffiVED. Experimental results reveal that
+EffiVED not only generates high-quality editing videos but also executes
+rapidly. Finally, we demonstrate that our data collection method significantly
+improves editing performance and can potentially tackle the scarcity of video
+editing data. Code can be found at https://github.com/alibaba/EffiVED.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Towards More General Video-based Deepfake Detection through Facial
+  Feature Guided Adaptation for Foundation Model 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2404.05583v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2404.05583v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Yue-Hua Han, Tai-Ming Huang, Shu-Tzu Lo, Po-Han Huang, Kai-Lung Hua, Jun-Cheng Chen
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  With the rise of deep learning, generative models have enabled the creation
+of highly realistic synthetic images, presenting challenges due to their
+potential misuse. While research in Deepfake detection has grown rapidly in
+response, many detection methods struggle with unseen Deepfakes generated by
+new synthesis techniques. To address this generalisation challenge, we propose
+a novel Deepfake detection approach by adapting the Foundation Models with rich
+information encoded inside, specifically using the image encoder from CLIP
+which has demonstrated strong zero-shot capability for downstream tasks.
+Inspired by the recent advances of parameter efficient fine-tuning, we propose
+a novel side-network-based decoder to extract spatial and temporal cues from
+the given video clip, with the promotion of the Facial Component Guidance (FCG)
+to encourage the spatial feature to include features of key facial parts for
+more robust and general Deepfake detection. Through extensive cross-dataset
+evaluations, our approach exhibits superior effectiveness in identifying unseen
+Deepfake samples, achieving notable performance improvement even with limited
+training samples and manipulation types. Our model secures an average
+performance enhancement of 0.9\% AUROC in cross-dataset assessments comparing
+with state-of-the-art methods, especially a significant lead of achieving 4.4\%
+improvement on the challenging DFDC dataset.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Enhancing Temporal Consistency in Video Editing by Reconstructing Videos
+  with 3D Gaussian Splatting 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.02541v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.02541v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Inkyu Shin, Qihang Yu, Xiaohui Shen, In So Kweon, Kuk-Jin Yoon, Liang-Chieh Chen
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Recent advancements in zero-shot video diffusion models have shown promise
+for text-driven video editing, but challenges remain in achieving high temporal
+consistency. To address this, we introduce Video-3DGS, a 3D Gaussian Splatting
+(3DGS)-based video refiner designed to enhance temporal consistency in
+zero-shot video editors. Our approach utilizes a two-stage 3D Gaussian
+optimizing process tailored for editing dynamic monocular videos. In the first
+stage, Video-3DGS employs an improved version of COLMAP, referred to as
+MC-COLMAP, which processes original videos using a Masked and Clipped approach.
+For each video clip, MC-COLMAP generates the point clouds for dynamic
+foreground objects and complex backgrounds. These point clouds are utilized to
+initialize two sets of 3D Gaussians (Frg-3DGS and Bkg-3DGS) aiming to represent
+foreground and background views. Both foreground and background views are then
+merged with a 2D learnable parameter map to reconstruct full views. In the
+second stage, we leverage the reconstruction ability developed in the first
+stage to impose the temporal constraints on the video diffusion model. To
+demonstrate the efficacy of Video-3DGS on both stages, we conduct extensive
+experiments across two related tasks: Video Reconstruction and Video Editing.
+Video-3DGS trained with 3k iterations significantly improves video
+reconstruction quality (+3 PSNR, +7 PSNR increase) and training efficiency
+(x1.9, x4.5 times faster) over NeRF-based and 3DGS-based state-of-art methods
+on DAVIS dataset, respectively. Moreover, it enhances video editing by ensuring
+temporal consistency across 58 dynamic monocular videos.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Project page at this https://video-3dgs-project.github.io/</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ SAM-LAD: Segment Anything Model Meets Zero-Shot Logic Anomaly Detection 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.00625v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.00625v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Yun Peng, Xiao Lin, Nachuan Ma, Jiayuan Du, Chuangwei Liu, Chengju Liu, Qijun Chen
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Visual anomaly detection is vital in real-world applications, such as
+industrial defect detection and medical diagnosis. However, most existing
+methods focus on local structural anomalies and fail to detect higher-level
+functional anomalies under logical conditions. Although recent studies have
+explored logical anomaly detection, they can only address simple anomalies like
+missing or addition and show poor generalizability due to being heavily
+data-driven. To fill this gap, we propose SAM-LAD, a zero-shot, plug-and-play
+framework for logical anomaly detection in any scene. First, we obtain a query
+image's feature map using a pre-trained backbone. Simultaneously, we retrieve
+the reference images and their corresponding feature maps via the nearest
+neighbor search of the query image. Then, we introduce the Segment Anything
+Model (SAM) to obtain object masks of the query and reference images. Each
+object mask is multiplied with the entire image's feature map to obtain object
+feature maps. Next, an Object Matching Model (OMM) is proposed to match objects
+in the query and reference images. To facilitate object matching, we further
+propose a Dynamic Channel Graph Attention (DCGA) module, treating each object
+as a keypoint and converting its feature maps into feature vectors. Finally,
+based on the object matching relations, an Anomaly Measurement Model (AMM) is
+proposed to detect objects with logical anomalies. Structural anomalies in the
+objects can also be detected. We validate our proposed SAM-LAD using various
+benchmarks, including industrial datasets (MVTec Loco AD, MVTec AD), and the
+logical dataset (DigitAnatomy). Extensive experimental results demonstrate that
+SAM-LAD outperforms existing SoTA methods, particularly in detecting logical
+anomalies.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Exploring Real World Map Change Generalization of Prior-Informed HD Map
+  Prediction Models <span class="chip">CVPR 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.01961v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.01961v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Samuel M. Bateman, Ning Xu, H. Charles Zhao, Yael Ben Shalom, Vince Gong, Greg Long, Will Maddern
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Building and maintaining High-Definition (HD) maps represents a large barrier
+to autonomous vehicle deployment. This, along with advances in modern online
+map detection models, has sparked renewed interest in the online mapping
+problem. However, effectively predicting online maps at a high enough quality
+to enable safe, driverless deployments remains a significant challenge. Recent
+work on these models proposes training robust online mapping systems using low
+quality map priors with synthetic perturbations in an attempt to simulate
+out-of-date HD map priors. In this paper, we investigate how models trained on
+these synthetically perturbed map priors generalize to performance on
+deployment-scale, real world map changes. We present a large-scale experimental
+study to determine which synthetic perturbations are most useful in
+generalizing to real world HD map changes, evaluated using multiple years of
+real-world autonomous driving data. We show there is still a substantial
+sim2real gap between synthetic prior perturbations and observed real-world
+changes, which limits the utility of current prior-informed HD map prediction
+models.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted to CVPR 2024, Workshop on Autonomous Driving</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Model Synthesis for Zero-Shot Model Attribution 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2307.15977v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2307.15977v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Tianyun Yang, Juan Cao, Danding Wang, Chang Xu
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Nowadays, generative models are shaping various fields such as art, design,
+and human-computer interaction, yet accompanied by challenges related to
+copyright infringement and content management. In response, existing research
+seeks to identify the unique fingerprints on the images they generate, which
+can be leveraged to attribute the generated images to their source models.
+Existing methods, however, are constrained to identifying models within a
+static set included in the classifier training, failing to adapt to newly
+emerged unseen models dynamically. To bridge this gap, we aim to develop a
+generalized model fingerprint extractor capable of zero-shot attribution,
+effectively attributes unseen models without exposure during training. Central
+to our method is a model synthesis technique, which generates numerous
+synthetic models mimicking the fingerprint patterns of real-world generative
+models. The design of the synthesis technique is motivated by observations on
+how the basic generative model's architecture building blocks and parameters
+influence fingerprint patterns, and it is validated through two designed
+metrics that examine synthetic models' fidelity and diversity. Our experiments
+demonstrate that this fingerprint extractor, trained solely on synthetic
+models, achieves impressive zero-shot generalization on a wide range of
+real-world generative models, improving model identification and verification
+accuracy on unseen models by over 40% and 15%, respectively, compared to
+existing approaches.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>under review</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Bidirectional Autoregressive Diffusion Model for Dance Generation 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2402.04356v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2402.04356v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Canyu Zhang, Youbao Tang, Ning Zhang, Ruei-Sung Lin, Mei Han, Jing Xiao, Song Wang
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Dance serves as a powerful medium for expressing human emotions, but the
+lifelike generation of dance is still a considerable challenge. Recently,
+diffusion models have showcased remarkable generative abilities across various
+domains. They hold promise for human motion generation due to their adaptable
+many-to-many nature. Nonetheless, current diffusion-based motion generation
+models often create entire motion sequences directly and unidirectionally,
+lacking focus on the motion with local and bidirectional enhancement. When
+choreographing high-quality dance movements, people need to take into account
+not only the musical context but also the nearby music-aligned dance motions.
+To authentically capture human behavior, we propose a Bidirectional
+Autoregressive Diffusion Model (BADM) for music-to-dance generation, where a
+bidirectional encoder is built to enforce that the generated dance is
+harmonious in both the forward and backward directions. To make the generated
+dance motion smoother, a local information decoder is built for local motion
+enhancement. The proposed framework is able to generate new motions based on
+the input conditions and nearby motions, which foresees individual motion
+slices iteratively and consolidates all predictions. To further refine the
+synchronicity between the generated dance and the beat, the beat information is
+incorporated as an input to generate better music-aligned dance movements.
+Experimental results demonstrate that the proposed model achieves
+state-of-the-art performance compared to existing unidirectional approaches on
+the prominent benchmark for music-to-dance generation.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ From Seedling to Harvest: The GrowingSoy <span class="highlight-title">Dataset</span> for Weed Detection in
+  Soy Crops via Instance Segmentation 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.00313v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.00313v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Raul Steinmetz, Victor A. Kich, Henrique Krever, Joao D. Rigo Mazzarolo, Ricardo B. Grando, Vinicius Marini, Celio Trois, Ard Nieuwenhuizen
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Deep learning, particularly Convolutional Neural Networks (CNNs), has gained
+significant attention for its effectiveness in computer vision, especially in
+agricultural tasks. Recent advancements in instance segmentation have improved
+image classification accuracy. In this work, we introduce a comprehensive
+dataset for training neural networks to detect weeds and soy plants through
+instance segmentation. Our dataset covers various stages of soy growth,
+offering a chronological perspective on weed invasion's impact, with 1,000
+meticulously annotated images. We also provide 6 state of the art models,
+trained in this dataset, that can understand and detect soy and weed in every
+stage of the plantation process. By using this dataset for weed and soy
+segmentation, we achieved a segmentation average precision of 79.1% and an
+average recall of 69.2% across all plant classes, with the YOLOv8X model.
+Moreover, the YOLOv8M model attained 78.7% mean average precision (mAp-50) in
+caruru weed segmentation, 69.7% in grassy weed segmentation, and 90.1% in soy
+plant segmentation.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>11th IEEE International Conference on Cybernetics and Intelligent
+  Systems (CIS)</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Test-Time Degradation Adaptation for Open-Set Image Restoration 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2312.02197v4">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2312.02197v4.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Yuanbiao Gou, Haiyu Zhao, Boyun Li, Xinyan Xiao, Xi Peng
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  In contrast to close-set scenarios that restore images from a predefined set
+of degradations, open-set image restoration aims to handle the unknown
+degradations that were unforeseen during the pretraining phase, which is
+less-touched as far as we know. This work study this challenging problem and
+reveal its essence as unidentified distribution shifts between the test and
+training data. Recently, test-time adaptation has emerged as a fundamental
+method to address this inherent disparities. Inspired by it, we propose a
+test-time degradation adaptation framework for open-set image restoration,
+which consists of three components, \textit{i.e.}, i) a pre-trained and
+degradation-agnostic diffusion model for generating clean images, ii) a
+test-time degradation adapter adapts the unknown degradations based on the
+input image during the testing phase, and iii) the adapter-guided image
+restoration guides the model through the adapter to produce the corresponding
+clean image. Through experiments on multiple degradations, we show that our
+method achieves comparable even better performance than those task-specific
+methods. The code is available at
+https://github.com/XLearning-SCU/2024-ICML-TAO.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Contextual Hourglass Network for Semantic Segmentation of High
+  Resolution Aerial Imagery 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/1810.12813v4">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/1810.12813v4.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Panfeng Li, Youzuo Lin, Emily Schultz-Fellenz
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Semantic segmentation for aerial imagery is a challenging and important
+problem in remotely sensed imagery analysis. In recent years, with the success
+of deep learning, various convolutional neural network (CNN) based models have
+been developed. However, due to the varying sizes of the objects and imbalanced
+class labels, it can be challenging to obtain accurate pixel-wise semantic
+segmentation results. To address those challenges, we develop a novel semantic
+segmentation method and call it Contextual Hourglass Network. In our method, in
+order to improve the robustness of the prediction, we design a new contextual
+hourglass module which incorporates attention mechanism on processed
+low-resolution featuremaps to exploit the contextual semantics. We further
+exploit the stacked encoder-decoder structure by connecting multiple contextual
+hourglass modules from end to end. This architecture can effectively extract
+rich multi-scale features and add more feedback loops for better learning
+contextual semantics through intermediate supervision. To demonstrate the
+efficacy of our semantic segmentation method, we test it on Potsdam and
+Vaihingen datasets. Through the comparisons to other baseline methods, our
+method yields the best results on overall performance.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted by 2024 5th International Conference on Electronic
+  Communication and Artificial Intelligence</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ CODIS: Benchmarking Context-Dependent Visual Comprehension for
+  Multimodal Large Language Models 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2402.13607v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2402.13607v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Fuwen Luo, Chi Chen, Zihao Wan, Zhaolu Kang, Qidong Yan, Yingjie Li, Xiaolong Wang, Siyu Wang, Ziyue Wang, Xiaoyue Mi, Peng Li, Ning Ma, Maosong Sun, Yang Liu
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Multimodal large language models (MLLMs) have demonstrated promising results
+in a variety of tasks that combine vision and language. As these models become
+more integral to research and applications, conducting comprehensive
+evaluations of their capabilities has grown increasingly important. However,
+most existing benchmarks fail to consider that, in certain situations, images
+need to be interpreted within a broader context. In this work, we introduce a
+new benchmark, named as CODIS, designed to assess the ability of models to use
+context provided in free-form text to enhance visual comprehension. Our
+findings indicate that MLLMs consistently fall short of human performance on
+this benchmark. Further analysis confirms that these models struggle to
+effectively extract and utilize contextual information to improve their
+understanding of images. This underscores the pressing need to enhance the
+ability of MLLMs to comprehend visuals in a context-dependent manner. View our
+project website at https://thunlp-mt.github.io/CODIS.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ FuRL: Visual-Language Models as Fuzzy Rewards for Reinforcement Learning <span class="chip">ICML 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.00645v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.00645v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Yuwei Fu, Haichao Zhang, Di Wu, Wei Xu, Benoit Boulet
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  In this work, we investigate how to leverage pre-trained visual-language
+models (VLM) for online Reinforcement Learning (RL). In particular, we focus on
+sparse reward tasks with pre-defined textual task descriptions. We first
+identify the problem of reward misalignment when applying VLM as a reward in RL
+tasks. To address this issue, we introduce a lightweight fine-tuning method,
+named Fuzzy VLM reward-aided RL (FuRL), based on reward alignment and relay RL.
+Specifically, we enhance the performance of SAC/DrQ baseline agents on sparse
+reward tasks by fine-tuning VLM representations and using relay RL to avoid
+local minima. Extensive experiments on the Meta-world benchmark tasks
+demonstrate the efficacy of the proposed method. Code is available at:
+https://github.com/fuyw/FuRL.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>ICML 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                    </div>
+                </details>
+            </article>
+            <article>
+                <details>
+                    <Summary>
+                        Information Retrieval <span class="chip" style="font-size: 60%">21</span>
+                    </Summary>
+                    <div class="details-content">
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Large Language Models as Evaluators for Recommendation Explanations 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.03248v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.03248v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Xiaoyu Zhang, Yishan Li, Jiayin Wang, Bowen Sun, Weizhi Ma, Peijie Sun, Min Zhang
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  The explainability of recommender systems has attracted significant attention
+in academia and industry. Many efforts have been made for explainable
+recommendations, yet evaluating the quality of the explanations remains a
+challenging and unresolved issue. In recent years, leveraging LLMs as
+evaluators presents a promising avenue in Natural Language Processing tasks
+(e.g., sentiment classification, information extraction), as they perform
+strong capabilities in instruction following and common-sense reasoning.
+However, evaluating recommendation explanatory texts is different from these
+NLG tasks, as its criteria are related to human perceptions and are usually
+subjective. In this paper, we investigate whether LLMs can serve as evaluators
+of recommendation explanations. To answer the question, we utilize real user
+feedback on explanations given from previous work and additionally collect
+third-party annotations and LLM evaluations. We design and apply a 3-level meta
+evaluation strategy to measure the correlation between evaluator labels and the
+ground truth provided by users. Our experiments reveal that LLMs, such as GPT4,
+can provide comparable evaluations with appropriate prompts and settings. We
+also provide further insights into combining human labels with the LLM
+evaluation process and utilizing ensembles of multiple heterogeneous LLM
+evaluators to enhance the accuracy and stability of evaluations. Our study
+verifies that utilizing LLMs as evaluators can be an accurate, reproducible and
+cost-effective solution for evaluating recommendation explanation texts. Our
+code is available at https://github.com/Xiaoyu-SZ/LLMasEvaluator.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Linking Named Entities in Diderot's \textit{Encyclopédie} to Wikidata 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.03221v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.03221v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Pierre Nugues
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Diderot's \textit{Encyclop\'edie} is a reference work from XVIIIth century in
+Europe that aimed at collecting the knowledge of its era. \textit{Wikipedia}
+has the same ambition with a much greater scope. However, the lack of digital
+connection between the two encyclopedias may hinder their comparison and the
+study of how knowledge has evolved. A key element of \textit{Wikipedia} is
+Wikidata that backs the articles with a graph of structured data. In this
+paper, we describe the annotation of more than 10,300 of the
+\textit{Encyclop\'edie} entries with Wikidata identifiers enabling us to
+connect these entries to the graph. We considered geographic and human
+entities. The \textit{Encyclop\'edie} does not contain biographic entries as
+they mostly appear as subentries of locations. We extracted all the geographic
+entries and we completely annotated all the entries containing a description of
+human entities. This represents more than 2,600 links referring to locations or
+human entities. In addition, we annotated more than 9,500 entries having a
+geographic content only. We describe the annotation process as well as
+application examples. This resource is available at
+https://github.com/pnugues/encyclopedie_1751
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>6 pages, 3 figures</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Text-like Encoding of Collaborative Information in Large Language Models
+  for Recommendation <span class="chip">ACL 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.03210v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.03210v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Yang Zhang, Keqin Bao, Ming Yan, Wenjie Wang, Fuli Feng, Xiangnan He
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  When adapting Large Language Models for Recommendation (LLMRec), it is
+crucial to integrate collaborative information. Existing methods achieve this
+by learning collaborative embeddings in LLMs' latent space from scratch or by
+mapping from external models. However, they fail to represent the information
+in a text-like format, which may not align optimally with LLMs. To bridge this
+gap, we introduce BinLLM, a novel LLMRec method that seamlessly integrates
+collaborative information through text-like encoding. BinLLM converts
+collaborative embeddings from external models into binary sequences -- a
+specific text format that LLMs can understand and operate on directly,
+facilitating the direct usage of collaborative information in text-like format
+by LLMs. Additionally, BinLLM provides options to compress the binary sequence
+using dot-decimal notation to avoid excessively long lengths. Extensive
+experiments validate that BinLLM introduces collaborative information in a
+manner better aligned with LLMs, resulting in enhanced performance. We release
+our code at https://github.com/zyang1580/BinLLM.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted by ACL 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ CAPRI-FAIR: Integration of Multi-sided Fairness in Contextual POI
+  Recommendation Framework 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.03109v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.03109v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Francis Zac dela Cruz, Flora D. Salim, Yonchanok Khaokaew, Jeffrey Chan
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Point-of-interest (POI) recommendation, a form of context-aware
+recommendation, takes into account spatio-temporal constraints and contexts
+like distance, peak business hours, and previous user check-ins. Given the
+ability of these kinds of systems to influence not just the consumer's travel
+experience, but also the POI's business, it is important to consider fairness
+from multiple perspectives. Unfortunately, these systems tend to provide less
+accurate recommendations to inactive users, and less exposure to unpopular
+POIs. The goal of this paper is to develop a post-filter methodology that
+incorporates provider and consumer fairness factors into pre-existing
+recommendation models, to satisfy fairness metrics like item exposure, and
+performance metrics like precision and distance, making the system more
+sustainable to both consumers and providers. Experiments have shown that using
+a linear scoring model for provider fairness in re-scoring recommended items
+yields the best tradeoff between performance and long-tail exposure, in some
+cases without a significant decrease in precision. When attempting to address
+consumer fairness by recommending more popular POIs to inactive users, the
+result was an increase in precision for only some recommendation models and
+datasets. Finally, when considering the tradeoff between both parameters, the
+combinations that reached the Pareto front of consumer and provider fairness,
+unfortunately, achieved the lowest precision values. We find that the nature of
+this tradeoff depends heavily on the model and the dataset.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Exploring User Retrieval Integration towards Large Language Models for
+  Cross-Domain Sequential Recommendation 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.03085v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.03085v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Tingjia Shen, Hao Wang, Jiaqing Zhang, Sirui Zhao, Liangyue Li, Zulong Chen, Defu Lian, Enhong Chen
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Cross-Domain Sequential Recommendation (CDSR) aims to mine and transfer
+users' sequential preferences across different domains to alleviate the
+long-standing cold-start issue. Traditional CDSR models capture collaborative
+information through user and item modeling while overlooking valuable semantic
+information. Recently, Large Language Model (LLM) has demonstrated powerful
+semantic reasoning capabilities, motivating us to introduce them to better
+capture semantic information. However, introducing LLMs to CDSR is non-trivial
+due to two crucial issues: seamless information integration and domain-specific
+generation. To this end, we propose a novel framework named URLLM, which aims
+to improve the CDSR performance by exploring the User Retrieval approach and
+domain grounding on LLM simultaneously. Specifically, we first present a novel
+dual-graph sequential model to capture the diverse information, along with an
+alignment and contrastive learning method to facilitate domain knowledge
+transfer. Subsequently, a user retrieve-generation model is adopted to
+seamlessly integrate the structural information into LLM, fully harnessing its
+emergent inferencing ability. Furthermore, we propose a domain-specific
+strategy and a refinement module to prevent out-of-domain generation. Extensive
+experiments on Amazon demonstrated the information integration and
+domain-specific generation ability of URLLM in comparison to state-of-the-art
+baselines. Our code is available at https://github.com/TingJShen/URLLM
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>10 pages, 5 figures</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Path-Specific Causal Reasoning for Fairness-aware Cognitive Diagnosis <span class="chip">KDD'2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.03064v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.03064v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Dacao Zhang, Kun Zhang, Le Wu, Mi Tian, Richang Hong, Meng Wang
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Cognitive Diagnosis~(CD), which leverages students and exercise data to
+predict students' proficiency levels on different knowledge concepts, is one of
+fundamental components in Intelligent Education. Due to the scarcity of
+student-exercise interaction data, most existing methods focus on making the
+best use of available data, such as exercise content and student
+information~(e.g., educational context). Despite the great progress, the abuse
+of student sensitive information has not been paid enough attention. Due to the
+important position of CD in Intelligent Education, employing sensitive
+information when making diagnosis predictions will cause serious social issues.
+Moreover, data-driven neural networks are easily misled by the shortcut between
+input data and output prediction, exacerbating this problem. Therefore, it is
+crucial to eliminate the negative impact of sensitive information in CD models.
+In response, we argue that sensitive attributes of students can also provide
+useful information, and only the shortcuts directly related to the sensitive
+information should be eliminated from the diagnosis process. Thus, we employ
+causal reasoning and design a novel Path-Specific Causal Reasoning Framework
+(PSCRF) to achieve this goal. Specifically, we first leverage an encoder to
+extract features and generate embeddings for general information and sensitive
+information of students. Then, we design a novel attribute-oriented predictor
+to decouple the sensitive attributes, in which fairness-related sensitive
+features will be eliminated and other useful information will be retained.
+Finally, we designed a multi-factor constraint to ensure the performance of
+fairness and diagnosis performance simultaneously. Extensive experiments over
+real-world datasets (e.g., PISA dataset) demonstrate the effectiveness of our
+proposed PSCRF.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accpeted by KDD'2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Docs2KG: Unified Knowledge Graph Construction from Heterogeneous
+  Documents Assisted by Large Language Models 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.02962v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.02962v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Qiang Sun, Yuanyi Luo, Wenxiao Zhang, Sirui Li, Jichunyang Li, Kai Niu, Xiangrui Kong, Wei Liu
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Even for a conservative estimate, 80% of enterprise data reside in
+unstructured files, stored in data lakes that accommodate heterogeneous
+formats. Classical search engines can no longer meet information seeking needs,
+especially when the task is to browse and explore for insight formulation. In
+other words, there are no obvious search keywords to use. Knowledge graphs, due
+to their natural visual appeals that reduce the human cognitive load, become
+the winning candidate for heterogeneous data integration and knowledge
+representation.
+  In this paper, we introduce Docs2KG, a novel framework designed to extract
+multimodal information from diverse and heterogeneous unstructured documents,
+including emails, web pages, PDF files, and Excel files. Dynamically generates
+a unified knowledge graph that represents the extracted key information,
+Docs2KG enables efficient querying and exploration of document data lakes.
+Unlike existing approaches that focus on domain-specific data sources or
+pre-designed schemas, Docs2KG offers a flexible and extensible solution that
+can adapt to various document structures and content types. The proposed
+framework unifies data processing supporting a multitude of downstream tasks
+with improved domain interpretability. Docs2KG is publicly accessible at
+https://docs2kg.ai4wa.com, and a demonstration video is available at
+https://docs2kg.ai4wa.com/Video.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ The Task-oriented Queries Benchmark (ToQB) 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.02943v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.02943v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Keun Soo Yim
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Task-oriented queries (e.g., one-shot queries to play videos, order food, or
+call a taxi) are crucial for assessing the quality of virtual assistants,
+chatbots, and other large language model (LLM)-based services. However, a
+standard benchmark for task-oriented queries is not yet available, as existing
+benchmarks in the relevant NLP (Natural Language Processing) fields have
+primarily focused on task-oriented dialogues. Thus, we present a new
+methodology for efficiently generating the Task-oriented Queries Benchmark
+(ToQB) using existing task-oriented dialogue datasets and an LLM service. Our
+methodology involves formulating the underlying NLP task to summarize the
+original intent of a speaker in each dialogue, detailing the key steps to
+perform the devised NLP task using an LLM service, and outlining a framework
+for automating a major part of the benchmark generation process. Through a case
+study encompassing three domains (i.e., two single-task domains and one
+multi-task domain), we demonstrate how to customize the LLM prompts (e.g.,
+omitting system utterances or speaker labels) for those three domains and
+characterize the generated task-oriented queries. The generated ToQB dataset is
+made available to the public. We further discuss new domains that can be added
+to ToQB by community contributors and its practical applications.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Data available on GitHub,
+  https://github.com/google/task-oriented-queries</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ A Bi-metric Framework for Fast Similarity Search 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.02891v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.02891v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Haike Xu, Sandeep Silwal, Piotr Indyk
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  We propose a new "bi-metric" framework for designing nearest neighbor data
+structures. Our framework assumes two dissimilarity functions: a ground-truth
+metric that is accurate but expensive to compute, and a proxy metric that is
+cheaper but less accurate. In both theory and practice, we show how to
+construct data structures using only the proxy metric such that the query
+procedure achieves the accuracy of the expensive metric, while only using a
+limited number of calls to both metrics. Our theoretical results instantiate
+this framework for two popular nearest neighbor search algorithms: DiskANN and
+Cover Tree. In both cases we show that, as long as the proxy metric used to
+construct the data structure approximates the ground-truth metric up to a
+bounded factor, our data structure achieves arbitrarily good approximation
+guarantees with respect to the ground-truth metric. On the empirical side, we
+apply the framework to the text retrieval problem with two dissimilarity
+functions evaluated by ML models with vastly different computational costs. We
+observe that for almost all data sets in the MTEB benchmark, our approach
+achieves a considerably better accuracy-efficiency tradeoff than the
+alternatives, such as re-ranking.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Item-Language Model for Conversational Recommendation 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.02844v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.02844v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Li Yang, Anushya Subbiah, Hardik Patel, Judith Yue Li, Yanwei Song, Reza Mirghaderi, Vikram Aggarwal
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Large-language Models (LLMs) have been extremely successful at tasks like
+complex dialogue understanding, reasoning and coding due to their emergent
+abilities. These emergent abilities have been extended with multi-modality to
+include image, audio, and video capabilities. Recommender systems, on the other
+hand, have been critical for information seeking and item discovery needs.
+Recently, there have been attempts to apply LLMs for recommendations. One
+difficulty of current attempts is that the underlying LLM is usually not
+trained on the recommender system data, which largely contains user interaction
+signals and is often not publicly available. Another difficulty is user
+interaction signals often have a different pattern from natural language text,
+and it is currently unclear if the LLM training setup can learn more
+non-trivial knowledge from interaction signals compared with traditional
+recommender system methods. Finally, it is difficult to train multiple LLMs for
+different use-cases, and to retain the original language and reasoning
+abilities when learning from recommender system data. To address these three
+limitations, we propose an Item-Language Model (ILM), which is composed of an
+item encoder to produce text-aligned item representations that encode user
+interaction signals, and a frozen LLM that can understand those item
+representations with preserved pretrained knowledge. We conduct extensive
+experiments which demonstrate both the importance of the language-alignment and
+of user interaction knowledge in the item encoder.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>15 pages, 3 figures</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Stealthy Attack on Large Language Model based Recommendation <span class="chip">ACL 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2402.14836v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2402.14836v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Jinghao Zhang, Yuting Liu, Qiang Liu, Shu Wu, Guibing Guo, Liang Wang
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Recently, the powerful large language models (LLMs) have been instrumental in
+propelling the progress of recommender systems (RS). However, while these
+systems have flourished, their susceptibility to security threats has been
+largely overlooked. In this work, we reveal that the introduction of LLMs into
+recommendation models presents new security vulnerabilities due to their
+emphasis on the textual content of items. We demonstrate that attackers can
+significantly boost an item's exposure by merely altering its textual content
+during the testing phase, without requiring direct interference with the
+model's training process. Additionally, the attack is notably stealthy, as it
+does not affect the overall recommendation performance and the modifications to
+the text are subtle, making it difficult for users and platforms to detect. Our
+comprehensive experiments across four mainstream LLM-based recommendation
+models demonstrate the superior efficacy and stealthiness of our approach. Our
+work unveils a significant security gap in LLM-based recommendation systems and
+paves the way for future research on protecting these systems.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>ACL 2024 Main</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ CoopHash: Cooperative Learning of Multipurpose Descriptor and
+  Contrastive Pair Generator via Variational MCMC Teaching for Supervised Image
+  Hashing 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2210.04288v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2210.04288v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Khoa D. Doan, Jianwen Xie, Yaxuan Zhu, Yang Zhao, Ping Li
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Leveraging supervised information can lead to superior retrieval performance
+in the image hashing domain but the performance degrades significantly without
+enough labeled data. One effective solution to boost performance is to employ
+generative models, such as Generative Adversarial Networks (GANs), to generate
+synthetic data in an image hashing model. However, GAN-based methods are
+difficult to train, which prevents the hashing approaches from jointly training
+the generative models and the hash functions. This limitation results in
+sub-optimal retrieval performance. To overcome this limitation, we propose a
+novel framework, the generative cooperative hashing network, which is based on
+energy-based cooperative learning. This framework jointly learns a powerful
+generative representation of the data and a robust hash function via two
+components: a top-down contrastive pair generator that synthesizes contrastive
+images and a bottom-up multipurpose descriptor that simultaneously represents
+the images from multiple perspectives, including probability density, hash
+code, latent code, and category. The two components are jointly learned via a
+novel likelihood-based cooperative learning scheme. We conduct experiments on
+several real-world datasets and show that the proposed method outperforms the
+competing hashing supervised methods, achieving up to 10\% relative improvement
+over the current state-of-the-art supervised hashing methods, and exhibits a
+significantly better performance in out-of-distribution retrieval.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Visualization for Recommendation Explainability: A <span class="highlight-title">Survey</span> and New
+  Perspectives 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2305.11755v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2305.11755v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Mohamed Amine Chatti, Mouadh Guesmi, Arham Muslim
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Providing system-generated explanations for recommendations represents an
+important step towards transparent and trustworthy recommender systems.
+Explainable recommender systems provide a human-understandable rationale for
+their outputs. Over the last two decades, explainable recommendation has
+attracted much attention in the recommender systems research community. This
+paper aims to provide a comprehensive review of research efforts on visual
+explanation in recommender systems. More concretely, we systematically review
+the literature on explanations in recommender systems based on four dimensions,
+namely explanation goal, explanation scope, explanation style, and explanation
+format. Recognizing the importance of visualization, we approach the
+recommender system literature from the angle of explanatory visualizations,
+that is using visualizations as a display style of explanation. As a result, we
+derive a set of guidelines that might be constructive for designing explanatory
+visualizations in recommender systems and identify perspectives for future work
+in this field. The aim of this review is to help recommendation researchers and
+practitioners better understand the potential of visually explainable
+recommendation research and to support them in the systematic design of visual
+explanations in current and future recommender systems.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>This article has been accepted for publication in the ACM
+  Transactions on Interactive Intelligent Systems</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ BanglaAutoKG: Automatic Bangla Knowledge Graph Construction with
+  Semantic Neural Graph Filtering <span class="chip">LREC</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2404.03528v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2404.03528v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Azmine Toushik Wasi, Taki Hasan Rafi, Raima Islam, Dong-Kyu Chae
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Knowledge Graphs (KGs) have proven essential in information processing and
+reasoning applications because they link related entities and give context-rich
+information, supporting efficient information retrieval and knowledge
+discovery; presenting information flow in a very effective manner. Despite
+being widely used globally, Bangla is relatively underrepresented in KGs due to
+a lack of comprehensive datasets, encoders, NER (named entity recognition)
+models, POS (part-of-speech) taggers, and lemmatizers, hindering efficient
+information processing and reasoning applications in the language. Addressing
+the KG scarcity in Bengali, we propose BanglaAutoKG, a pioneering framework
+that is able to automatically construct Bengali KGs from any Bangla text. We
+utilize multilingual LLMs to understand various languages and correlate
+entities and relations universally. By employing a translation dictionary to
+identify English equivalents and extracting word features from pre-trained BERT
+models, we construct the foundational KG. To reduce noise and align word
+embeddings with our goal, we employ graph-based polynomial filters. Lastly, we
+implement a GNN-based semantic filter, which elevates contextual understanding
+and trims unnecessary edges, culminating in the formation of the definitive KG.
+Empirical findings and case studies demonstrate the universal effectiveness of
+our model, capable of autonomously constructing semantically enriched KGs from
+any text.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>7 pages, 3 figures. Accepted to LREC-COLING 2024. Read in ACL
+  Anthology: https://aclanthology.org/2024.lrec-main.189/</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Self-Augmented In-Context Learning for Unsupervised Word Translation <span class="chip">ACL 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2402.10024v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2402.10024v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Yaoyiran Li, Anna Korhonen, Ivan Vulić
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Recent work has shown that, while large language models (LLMs) demonstrate
+strong word translation or bilingual lexicon induction (BLI) capabilities in
+few-shot setups, they still cannot match the performance of 'traditional'
+mapping-based approaches in the unsupervised scenario where no seed translation
+pairs are available, especially for lower-resource languages. To address this
+challenge with LLMs, we propose self-augmented in-context learning (SAIL) for
+unsupervised BLI: starting from a zero-shot prompt, SAIL iteratively induces a
+set of high-confidence word translation pairs for in-context learning (ICL)
+from an LLM, which it then reapplies to the same LLM in the ICL fashion. Our
+method shows substantial gains over zero-shot prompting of LLMs on two
+established BLI benchmarks spanning a wide range of language pairs, also
+outperforming mapping-based baselines across the board. In addition to
+achieving state-of-the-art unsupervised BLI performance, we also conduct
+comprehensive analyses on SAIL and discuss its limitations.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>ACL 2024 Main Conference; 11 Pages, 3 Figures, 9 Tables</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Towards Open-world Cross-Domain Sequential Recommendation: A
+  Model-Agnostic Contrastive Denoising Approach 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2311.04760v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2311.04760v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Wujiang Xu, Xuying Ning, Wenfang Lin, Mingming Ha, Qiongxu Ma, Qianqiao Liang, Xuewen Tao, Linxun Chen, Bing Han, Minnan Luo
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Cross-domain sequential recommendation (CDSR) aims to address the data
+sparsity problems that exist in traditional sequential recommendation (SR)
+systems.
+  The existing approaches aim to design a specific cross-domain unit that can
+transfer and propagate information across multiple domains by relying on
+overlapping users with abundant behaviors. However, in real-world recommender
+systems, CDSR scenarios usually consist of a majority of long-tailed users with
+sparse behaviors and cold-start users who only exist in one domain. This leads
+to a drop in the performance of existing CDSR methods in the real-world
+industry platform. Therefore, improving the consistency and effectiveness of
+models in open-world CDSR scenarios is crucial for constructing CDSR models
+(\textit{1st} CH). Recently, some SR approaches have utilized auxiliary
+behaviors to complement the information for long-tailed users. However, these
+multi-behavior SR methods cannot deliver promising performance in CDSR, as they
+overlook the semantic gap between target and auxiliary behaviors, as well as
+user interest deviation across domains (\textit{2nd} CH).
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Multi-Sample Dynamic Time Warping for Few-Shot Keyword Spotting 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2404.14903v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2404.14903v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Kevin Wilkinghoff, Alessia Cornaggia-Urrigshardt
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  In multi-sample keyword spotting, each keyword class is represented by
+multiple spoken instances, called samples. A na\"ive approach to detect
+keywords in a target sequence consists of querying all samples of all classes
+using sub-sequence dynamic time warping. However, the resulting processing time
+increases linearly with respect to the number of samples belonging to each
+class. Alternatively, only a single Fr\'echet mean can be queried for each
+class, resulting in reduced processing time but usually also in worse detection
+performance as the variability of the query samples is not captured
+sufficiently well. In this work, multi-sample dynamic time warping is proposed
+to compute class-specific cost-tensors that include the variability of all
+query samples. To significantly reduce the computational complexity during
+inference, these cost tensors are converted to cost matrices before applying
+dynamic time warping. In experimental evaluations for few-shot keyword
+spotting, it is shown that this method yields a very similar performance as
+using all individual query samples as templates while having a runtime that is
+only slightly slower than when using Fr\'echet means.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted for presentation at EUSIPCO 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Poisoning Attacks and Defenses in Recommender Systems: A <span class="highlight-title">Survey</span> 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.01022v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.01022v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Zongwei Wang, Junliang Yu, Min Gao, Wei Yuan, Guanhua Ye, Shazia Sadiq, Hongzhi Yin
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Modern recommender systems (RS) have profoundly enhanced user experience
+across digital platforms, yet they face significant threats from poisoning
+attacks. These attacks, aimed at manipulating recommendation outputs for
+unethical gains, exploit vulnerabilities in RS through injecting malicious data
+or intervening model training. This survey presents a unique perspective by
+examining these threats through the lens of an attacker, offering fresh
+insights into their mechanics and impacts. Concretely, we detail a systematic
+pipeline that encompasses four stages of a poisoning attack: setting attack
+goals, assessing attacker capabilities, analyzing victim architecture, and
+implementing poisoning strategies. The pipeline not only aligns with various
+attack tactics but also serves as a comprehensive taxonomy to pinpoint focuses
+of distinct poisoning attacks. Correspondingly, we further classify defensive
+strategies into two main categories: poisoning data filtering and robust
+training from the defender's perspective. Finally, we highlight existing
+limitations and suggest innovative directions for further exploration in this
+field.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>22 pages, 8 figures</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Pairwise Ranking Loss for Multi-Task Learning in Recommender Systems 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.02163v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.02163v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Furkan Durmus, Hasan Saribas, Said Aldemir, Junyan Yang, Hakan Cevikalp
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Multi-Task Learning (MTL) plays a crucial role in real-world advertising
+applications such as recommender systems, aiming to achieve robust
+representations while minimizing resource consumption. MTL endeavors to
+simultaneously optimize multiple tasks to construct a unified model serving
+diverse objectives. In online advertising systems, tasks like Click-Through
+Rate (CTR) and Conversion Rate (CVR) are often treated as MTL problems
+concurrently. However, it has been overlooked that a conversion ($y_{cvr}=1$)
+necessitates a preceding click ($y_{ctr}=1$). In other words, while certain CTR
+tasks are associated with corresponding conversions, others lack such
+associations. Moreover, the likelihood of noise is significantly higher in CTR
+tasks where conversions do not occur compared to those where they do, and
+existing methods lack the ability to differentiate between these two scenarios.
+In this study, exposure labels corresponding to conversions are regarded as
+definitive indicators, and a novel task-specific loss is introduced by
+calculating a \textbf{p}air\textbf{wise} \textbf{r}anking (PWiseR) loss between
+model predictions, manifesting as pairwise ranking loss, to encourage the model
+to rely more on them. To demonstrate the effect of the proposed loss function,
+experiments were conducted on different MTL and Single-Task Learning (STL)
+models using four distinct public MTL datasets, namely Alibaba FR, NL, US, and
+CCP, along with a proprietary industrial dataset. The results indicate that our
+proposed loss function outperforms the BCE loss function in most cases in terms
+of the AUC metric.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ What's happening in your neighborhood? A Weakly Supervised Approach to
+  Detect Local News 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2301.08146v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2301.08146v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Deven Santosh Shah, Shiying He, Gosuddin Kamaruddin Siddiqi, Radhika Bansal
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Local news articles are a subset of news that impact users in a geographical
+area, such as a city, county, or state. Detecting local news (Step 1) and
+subsequently deciding its geographical location as well as radius of impact
+(Step 2) are two important steps towards accurate local news recommendation.
+Naive rule-based methods, such as detecting city names from the news title,
+tend to give erroneous results due to lack of understanding of the news
+content. Empowered by the latest development in natural language processing, we
+develop an integrated pipeline that enables automatic local news detection and
+content-based local news recommendations. In this paper, we focus on Step 1 of
+the pipeline, which highlights: (1) a weakly supervised framework incorporated
+with domain knowledge and auto data processing, and (2) scalability to
+multi-lingual settings. Compared with Stanford CoreNLP NER model, our pipeline
+has higher precision and recall evaluated on a real-world and human-labeled
+dataset. This pipeline has potential to more precise local news to users, helps
+local businesses get more exposure, and gives people more information about
+their neighborhood safety.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>8 pages, 2 figures, 5 tables</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ DRAGIN: Dynamic Retrieval Augmented Generation based on the Information
+  Needs of Large Language Models 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2403.10081v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2403.10081v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Weihang Su, Yichen Tang, Qingyao Ai, Zhijing Wu, Yiqun Liu
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Dynamic retrieval augmented generation (RAG) paradigm actively decides when
+and what to retrieve during the text generation process of Large Language
+Models (LLMs). There are two key elements of this paradigm: identifying the
+optimal moment to activate the retrieval module (deciding when to retrieve) and
+crafting the appropriate query once retrieval is triggered (determining what to
+retrieve). However, current dynamic RAG methods fall short in both aspects.
+Firstly, the strategies for deciding when to retrieve often rely on static
+rules. Moreover, the strategies for deciding what to retrieve typically limit
+themselves to the LLM's most recent sentence or the last few tokens, while the
+LLM's real-time information needs may span across the entire context. To
+overcome these limitations, we introduce a new framework, DRAGIN, i.e., Dynamic
+Retrieval Augmented Generation based on the real-time Information Needs of
+LLMs. Our framework is specifically designed to make decisions on when and what
+to retrieve based on the LLM's real-time information needs during the text
+generation process. We evaluate DRAGIN along with existing methods
+comprehensively over 4 knowledge-intensive generation datasets. Experimental
+results show that DRAGIN achieves superior performance on all tasks,
+demonstrating the effectiveness of our method. We have open-sourced all the
+code, data, and models in GitHub: https://github.com/oneal2000/DRAGIN/tree/main
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                    </div>
+                </details>
+            </article>
+            <article>
+                <details>
+                    <Summary>
+                        Machine Learning <span class="chip" style="font-size: 60%">150</span>
+                    </Summary>
+                    <div class="details-content">
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Wings: Learning Multimodal LLMs without Text-only Forgetting 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.03496v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.03496v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Yi-Kai Zhang, Shiyin Lu, Yang Li, Yanqing Ma, Qing-Guo Chen, Zhao Xu, Weihua Luo, Kaifu Zhang, De-Chuan Zhan, Han-Jia Ye
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Multimodal large language models (MLLMs), initiated with a trained LLM, first
+align images with text and then fine-tune on multimodal mixed inputs. However,
+the MLLM catastrophically forgets the text-only instructions, which do not
+include images and can be addressed within the initial LLM. In this paper, we
+present Wings, a novel MLLM that excels in both text-only dialogues and
+multimodal comprehension. Analyzing MLLM attention in multimodal instructions
+reveals that text-only forgetting is related to the attention shifts from
+pre-image to post-image text. From that, we construct extra modules that act as
+the boosted learner to compensate for the attention shift. The complementary
+visual and textual learners, like "wings" on either side, are connected in
+parallel within each layer's attention block. Initially, image and text inputs
+are aligned with visual learners operating alongside the main attention,
+balancing focus on visual elements. Textual learners are later collaboratively
+integrated with attention-based routing to blend the outputs of the visual and
+textual learners. We design the Low-Rank Residual Attention (LoRRA) to
+guarantee high efficiency for learners. Our experimental results demonstrate
+that Wings outperforms equally-scaled MLLMs in both text-only and visual
+question-answering tasks. On a newly constructed Interleaved Image-Text (IIT)
+benchmark, Wings exhibits superior performance from text-only-rich to
+multimodal-rich question-answering tasks.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Grokking Modular Polynomials 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.03495v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.03495v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Darshil Doshi, Tianyu He, Aritra Das, Andrey Gromov
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Neural networks readily learn a subset of the modular arithmetic tasks, while
+failing to generalize on the rest. This limitation remains unmoved by the
+choice of architecture and training strategies. On the other hand, an
+analytical solution for the weights of Multi-layer Perceptron (MLP) networks
+that generalize on the modular addition task is known in the literature. In
+this work, we (i) extend the class of analytical solutions to include modular
+multiplication as well as modular addition with many terms. Additionally, we
+show that real networks trained on these datasets learn similar solutions upon
+generalization (grokking). (ii) We combine these "expert" solutions to
+construct networks that generalize on arbitrary modular polynomials. (iii) We
+hypothesize a classification of modular polynomials into learnable and
+non-learnable via neural networks training; and provide experimental evidence
+supporting our claims.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>7+4 pages, 3 figures, 2 tables</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Solving Poisson Equations using Neural Walk-on-Spheres <span class="chip">ICML 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.03494v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.03494v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Hong Chul Nam, Julius Berner, Anima Anandkumar
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  We propose Neural Walk-on-Spheres (NWoS), a novel neural PDE solver for the
+efficient solution of high-dimensional Poisson equations. Leveraging stochastic
+representations and Walk-on-Spheres methods, we develop novel losses for neural
+networks based on the recursive solution of Poisson equations on spheres inside
+the domain. The resulting method is highly parallelizable and does not require
+spatial gradients for the loss. We provide a comprehensive comparison against
+competing methods based on PINNs, the Deep Ritz method, and (backward)
+stochastic differential equations. In several challenging, high-dimensional
+numerical examples, we demonstrate the superiority of NWoS in accuracy, speed,
+and computational costs. Compared to commonly used PINNs, our approach can
+reduce memory usage and errors by orders of magnitude. Furthermore, we apply
+NWoS to problems in PDE-constrained optimization and molecular dynamics to show
+its efficiency in practical applications.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted at ICML 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Highway Value Iteration Networks <span class="chip">ICML 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.03485v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.03485v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Yuhui Wang, Weida Li, Francesco Faccio, Qingyuan Wu, Jürgen Schmidhuber
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Value iteration networks (VINs) enable end-to-end learning for planning tasks
+by employing a differentiable "planning module" that approximates the value
+iteration algorithm. However, long-term planning remains a challenge because
+training very deep VINs is difficult. To address this problem, we embed highway
+value iteration -- a recent algorithm designed to facilitate long-term credit
+assignment -- into the structure of VINs. This improvement augments the
+"planning module" of the VIN with three additional components: 1) an "aggregate
+gate," which constructs skip connections to improve information flow across
+many layers; 2) an "exploration module," crafted to increase the diversity of
+information and gradient flow in spatial dimensions; 3) a "filter gate"
+designed to ensure safe exploration. The resulting novel highway VIN can be
+trained effectively with hundreds of layers using standard backpropagation. In
+long-term planning tasks requiring hundreds of planning steps, deep highway
+VINs outperform both traditional VINs and several advanced, very deep NNs.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>ICML 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ QJL: 1-Bit Quantized JL Transform for KV Cache Quantization with Zero
+  Overhead 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.03482v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.03482v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Amir Zandieh, Majid Daliri, Insu Han
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Serving LLMs requires substantial memory due to the storage requirements of
+Key-Value (KV) embeddings in the KV cache, which grows with sequence length. An
+effective approach to compress KV cache is quantization. However, traditional
+quantization methods face significant memory overhead due to the need to store
+quantization constants (at least a zero point and a scale) in full precision
+per data block. Depending on the block size, this overhead can add 1 or 2 bits
+per quantized number. We introduce QJL, a new quantization approach that
+consists of a Johnson-Lindenstrauss (JL) transform followed by sign-bit
+quantization. In contrast to existing methods, QJL eliminates memory overheads
+by removing the need for storing quantization constants. We propose an
+asymmetric estimator for the inner product of two vectors and demonstrate that
+applying QJL to one vector and a standard JL transform without quantization to
+the other provides an unbiased estimator with minimal distortion. We have
+developed an efficient implementation of the QJL sketch and its corresponding
+inner product estimator, incorporating a lightweight CUDA kernel for optimized
+computation. When applied across various LLMs and NLP tasks to quantize the KV
+cache to only 3 bits, QJL demonstrates a more than fivefold reduction in KV
+cache memory usage without compromising accuracy, all while achieving faster
+runtime. Codes are available at \url{https://github.com/amirzandieh/QJL}.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>13 pages</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Convolutional Neural Networks and Vision <span class="highlight-title">Transformer</span>s for Fashion MNIST
+  Classification: A Literature <span class="highlight-title">Review</span> 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.03478v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.03478v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Sonia Bbouzidi, Ghazala Hcini, Imen Jdey, Fadoua Drira
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Our review explores the comparative analysis between Convolutional Neural
+Networks (CNNs) and Vision Transformers (ViTs) in the domain of image
+classification, with a particular focus on clothing classification within the
+e-commerce sector. Utilizing the Fashion MNIST dataset, we delve into the
+unique attributes of CNNs and ViTs. While CNNs have long been the cornerstone
+of image classification, ViTs introduce an innovative self-attention mechanism
+enabling nuanced weighting of different input data components. Historically,
+transformers have primarily been associated with Natural Language Processing
+(NLP) tasks. Through a comprehensive examination of existing literature, our
+aim is to unveil the distinctions between ViTs and CNNs in the context of image
+classification. Our analysis meticulously scrutinizes state-of-the-art
+methodologies employing both architectures, striving to identify the factors
+influencing their performance. These factors encompass dataset characteristics,
+image dimensions, the number of target classes, hardware infrastructure, and
+the specific architectures along with their respective top results. Our key
+goal is to determine the most appropriate architecture between ViT and CNN for
+classifying images in the Fashion MNIST dataset within the e-commerce industry,
+while taking into account specific conditions and needs. We highlight the
+importance of combining these two architectures with different forms to enhance
+overall performance. By uniting these architectures, we can take advantage of
+their unique strengths, which may lead to more precise and reliable models for
+e-commerce applications. CNNs are skilled at recognizing local patterns, while
+ViTs are effective at grasping overall context, making their combination a
+promising strategy for boosting image classification performance.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Does your data spark joy? Performance gains from domain upsampling at
+  the end of training 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.03476v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.03476v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Cody Blakeney, Mansheej Paul, Brett W. Larsen, Sean Owen, Jonathan Frankle
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Pretraining datasets for large language models (LLMs) have grown to trillions
+of tokens composed of large amounts of CommonCrawl (CC) web scrape along with
+smaller, domain-specific datasets. It is expensive to understand the impact of
+these domain-specific datasets on model capabilities as training at large FLOP
+scales is required to reveal significant changes to difficult and emergent
+benchmarks. Given the increasing cost of experimenting with pretraining data,
+how does one determine the optimal balance between the diversity in general web
+scrapes and the information density of domain specific data? In this work, we
+show how to leverage the smaller domain specific datasets by upsampling them
+relative to CC at the end of training to drive performance improvements on
+difficult benchmarks. This simple technique allows us to improve up to 6.90 pp
+on MMLU, 8.26 pp on GSM8K, and 6.17 pp on HumanEval relative to the base data
+mix for a 7B model trained for 1 trillion (T) tokens, thus rivaling Llama-2
+(7B)$\unicode{x2014}$a model trained for twice as long. We experiment with
+ablating the duration of domain upsampling from 5% to 30% of training and find
+that 10% to 20% percent is optimal for navigating the tradeoff between general
+language modeling capabilities and targeted benchmarks. We also use domain
+upsampling to characterize at scale the utility of individual datasets for
+improving various benchmarks by removing them during this final phase of
+training. This tool opens up the ability to experiment with the impact of
+different pretraining datasets at scale, but at an order of magnitude lower
+cost compared to full pretraining runs.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>The first three authors contributed equally</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Solving Differential Equations using Physics-Informed Deep Equilibrium
+  Models 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.03472v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.03472v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Bruno Machado Pacheco, Eduardo Camponogara
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  This paper introduces Physics-Informed Deep Equilibrium Models (PIDEQs) for
+solving initial value problems (IVPs) of ordinary differential equations
+(ODEs). Leveraging recent advancements in deep equilibrium models (DEQs) and
+physics-informed neural networks (PINNs), PIDEQs combine the implicit output
+representation of DEQs with physics-informed training techniques. We validate
+PIDEQs using the Van der Pol oscillator as a benchmark problem, demonstrating
+their efficiency and effectiveness in solving IVPs. Our analysis includes key
+hyperparameter considerations for optimizing PIDEQ performance. By bridging
+deep learning and physics-based modeling, this work advances computational
+techniques for solving IVPs, with implications for scientific computing and
+engineering applications.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted at CASE 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Node-wise Filtering in Graph Neural Networks: A Mixture of Experts
+  Approach 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.03464v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.03464v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Haoyu Han, Juanhui Li, Wei Huang, Xianfeng Tang, Hanqing Lu, Chen Luo, Hui Liu, Jiliang Tang
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Graph Neural Networks (GNNs) have proven to be highly effective for node
+classification tasks across diverse graph structural patterns. Traditionally,
+GNNs employ a uniform global filter, typically a low-pass filter for homophilic
+graphs and a high-pass filter for heterophilic graphs. However, real-world
+graphs often exhibit a complex mix of homophilic and heterophilic patterns,
+rendering a single global filter approach suboptimal. In this work, we
+theoretically demonstrate that a global filter optimized for one pattern can
+adversely affect performance on nodes with differing patterns. To address this,
+we introduce a novel GNN framework Node-MoE that utilizes a mixture of experts
+to adaptively select the appropriate filters for different nodes. Extensive
+experiments demonstrate the effectiveness of Node-MoE on both homophilic and
+heterophilic graphs.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ The PESQetarian: On the Relevance of Goodhart's Law for Speech
+  Enhancement 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.03460v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.03460v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Danilo de Oliveira, Simon Welker, Julius Richter, Timo Gerkmann
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  To obtain improved speech enhancement models, researchers often focus on
+increasing performance according to specific instrumental metrics. However,
+when the same metric is used in a loss function to optimize models, it may be
+detrimental to aspects that the given metric does not see. The goal of this
+paper is to illustrate the risk of overfitting a speech enhancement model to
+the metric used for evaluation. For this, we introduce enhancement models that
+exploit the widely used PESQ measure. Our "PESQetarian" model achieves 3.82
+PESQ on VB-DMD while scoring very poorly in a listening experiment. While the
+obtained PESQ value of 3.82 would imply "state-of-the-art" PESQ-performance on
+the VB-DMD benchmark, our examples show that when optimizing w.r.t. a metric,
+an isolated evaluation on the same metric may be misleading. Instead, other
+metrics should be included in the evaluation and the resulting performance
+predictions should be confirmed by listening.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted at Interspeech 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Distributional Adversarial Loss 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.03458v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.03458v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Saba Ahmadi, Siddharth Bhandari, Avrim Blum, Chen Dan, Prabhav Jain
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  A major challenge in defending against adversarial attacks is the enormous
+space of possible attacks that even a simple adversary might perform. To
+address this, prior work has proposed a variety of defenses that effectively
+reduce the size of this space. These include randomized smoothing methods that
+add noise to the input to take away some of the adversary's impact. Another
+approach is input discretization which limits the adversary's possible number
+of actions.
+  Motivated by these two approaches, we introduce a new notion of adversarial
+loss which we call distributional adversarial loss, to unify these two forms of
+effectively weakening an adversary. In this notion, we assume for each original
+example, the allowed adversarial perturbation set is a family of distributions
+(e.g., induced by a smoothing procedure), and the adversarial loss over each
+example is the maximum loss over all the associated distributions. The goal is
+to minimize the overall adversarial loss.
+  We show generalization guarantees for our notion of adversarial loss in terms
+of the VC-dimension of the hypothesis class and the size of the set of allowed
+adversarial distributions associated with each input. We also investigate the
+role of randomness in achieving robustness against adversarial attacks in the
+methods described above. We show a general derandomization technique that
+preserves the extent of a randomized classifier's robustness against
+adversarial attacks. We corroborate the procedure experimentally via
+derandomizing the Random Projection Filters framework of
+\cite{dong2023adversarial}. Our procedure also improves the robustness of the
+model against various adversarial attacks.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ FILS: <span class="highlight-title">Self-Supervised</span> Video Feature Prediction In Semantic Language
+  Space 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.03447v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.03447v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Mona Ahmadian, Frank Guerin, Andrew Gilbert
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  This paper demonstrates a self-supervised approach for learning semantic
+video representations. Recent vision studies show that a masking strategy for
+vision and natural language supervision has contributed to developing
+transferable visual pretraining. Our goal is to achieve a more semantic video
+representation by leveraging the text related to the video content during the
+pretraining in a fully self-supervised manner. To this end, we present FILS, a
+novel self-supervised video Feature prediction In semantic Language Space
+(FILS). The vision model can capture valuable structured information by
+correctly predicting masked feature semantics in language space. It is learned
+using a patch-wise video-text contrastive strategy, in which the text
+representations act as prototypes for transforming vision features into a
+language space, which are then used as targets for semantically meaningful
+feature prediction using our masked encoder-decoder structure. FILS
+demonstrates remarkable transferability on downstream action recognition tasks,
+achieving state-of-the-art on challenging egocentric datasets, like
+Epic-Kitchens, Something-SomethingV2, Charades-Ego, and EGTEA, using ViT-Base.
+Our efficient method requires less computation and smaller batches compared to
+previous works.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ <span class="highlight-title">Pre-train</span>ed Large Language Models Use Fourier Features to Compute
+  Addition 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.03445v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.03445v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Tianyi Zhou, Deqing Fu, Vatsal Sharan, Robin Jia
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Pre-trained large language models (LLMs) exhibit impressive mathematical
+reasoning capabilities, yet how they compute basic arithmetic, such as
+addition, remains unclear. This paper shows that pre-trained LLMs add numbers
+using Fourier features -- dimensions in the hidden state that represent numbers
+via a set of features sparse in the frequency domain. Within the model, MLP and
+attention layers use Fourier features in complementary ways: MLP layers
+primarily approximate the magnitude of the answer using low-frequency features,
+while attention layers primarily perform modular addition (e.g., computing
+whether the answer is even or odd) using high-frequency features. Pre-training
+is crucial for this mechanism: models trained from scratch to add numbers only
+exploit low-frequency features, leading to lower accuracy. Introducing
+pre-trained token embeddings to a randomly initialized model rescues its
+performance. Overall, our analysis demonstrates that appropriate pre-trained
+representations (e.g., Fourier features) can unlock the ability of Transformers
+to learn precise mechanisms for algorithmic tasks.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Cycles of Thought: Measuring LLM Confidence through Stable Explanations 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.03441v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.03441v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Evan Becker, Stefano Soatto
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  In many high-risk machine learning applications it is essential for a model
+to indicate when it is uncertain about a prediction. While large language
+models (LLMs) can reach and even surpass human-level accuracy on a variety of
+benchmarks, their overconfidence in incorrect responses is still a
+well-documented failure mode. Traditional methods for ML uncertainty
+quantification can be difficult to directly adapt to LLMs due to the
+computational cost of implementation and closed-source nature of many models. A
+variety of black-box methods have recently been proposed, but these often rely
+on heuristics such as self-verbalized confidence. We instead propose a
+framework for measuring an LLM's uncertainty with respect to the distribution
+of generated explanations for an answer. While utilizing explanations is not a
+new idea in and of itself, by interpreting each possible model+explanation pair
+as a test-time classifier we can calculate a posterior answer distribution over
+the most likely of these classifiers. We demonstrate how a specific instance of
+this framework using explanation entailment as our classifier likelihood
+improves confidence score metrics (in particular AURC and AUROC) over baselines
+across five different datasets. We believe these results indicate that our
+framework is both a well-principled and effective way of quantifying
+uncertainty in LLMs.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Transfer Learning for Latent Variable Network Models 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.03437v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.03437v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Akhil Jalan, Arya Mazumdar, Soumendu Sundar Mukherjee, Purnamrita Sarkar
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  We study transfer learning for estimation in latent variable network models.
+In our setting, the conditional edge probability matrices given the latent
+variables are represented by $P$ for the source and $Q$ for the target. We wish
+to estimate $Q$ given two kinds of data: (1) edge data from a subgraph induced
+by an $o(1)$ fraction of the nodes of $Q$, and (2) edge data from all of $P$.
+If the source $P$ has no relation to the target $Q$, the estimation error must
+be $\Omega(1)$. However, we show that if the latent variables are shared, then
+vanishing error is possible. We give an efficient algorithm that utilizes the
+ordering of a suitably defined graph distance. Our algorithm achieves $o(1)$
+error and does not assume a parametric form on the source or target networks.
+Next, for the specific case of Stochastic Block Models we prove a minimax lower
+bound and show that a simple algorithm achieves this rate. Finally, we
+empirically demonstrate our algorithm's use on real-world and simulated graph
+transfer problems.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Unified PAC-Bayesian Study of Pessimism for Offline Policy Learning with
+  Regularized Importance Sampling <span class="chip">UAI 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.03434v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.03434v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Imad Aouali, Victor-Emmanuel Brunel, David Rohde, Anna Korba
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Off-policy learning (OPL) often involves minimizing a risk estimator based on
+importance weighting to correct bias from the logging policy used to collect
+data. However, this method can produce an estimator with a high variance. A
+common solution is to regularize the importance weights and learn the policy by
+minimizing an estimator with penalties derived from generalization bounds
+specific to the estimator. This approach, known as pessimism, has gained recent
+attention but lacks a unified framework for analysis. To address this gap, we
+introduce a comprehensive PAC-Bayesian framework to examine pessimism with
+regularized importance weighting. We derive a tractable PAC-Bayesian
+generalization bound that universally applies to common importance weight
+regularizations, enabling their comparison within a single framework. Our
+empirical results challenge common understanding, demonstrating the
+effectiveness of standard IW regularization techniques.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted at UAI 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ HelloFresh: LLM Evaluations on Streams of Real-World Human Editorial
+  Actions across X Community Notes and Wikipedia edits <span class="chip">ACL 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.03428v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.03428v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Tim Franzmeyer, Aleksandar Shtedritski, Samuel Albanie, Philip Torr, João F. Henriques, Jakob N. Foerster
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Benchmarks have been essential for driving progress in machine learning. A
+better understanding of LLM capabilities on real world tasks is vital for safe
+development. Designing adequate LLM benchmarks is challenging: Data from
+real-world tasks is hard to collect, public availability of static evaluation
+data results in test data contamination and benchmark overfitting, and
+periodically generating new evaluation data is tedious and may result in
+temporally inconsistent results. We introduce HelloFresh, based on continuous
+streams of real-world data generated by intrinsically motivated human labelers.
+It covers recent events from X (formerly Twitter) community notes and edits of
+Wikipedia pages, mitigating the risk of test data contamination and benchmark
+overfitting. Any X user can propose an X note to add additional context to a
+misleading post (formerly tweet); if the community classifies it as helpful, it
+is shown with the post. Similarly, Wikipedia relies on community-based
+consensus, allowing users to edit articles or revert edits made by other users.
+Verifying whether an X note is helpful or whether a Wikipedia edit should be
+accepted are hard tasks that require grounding by querying the web. We backtest
+state-of-the-art LLMs supplemented with simple web search access and find that
+HelloFresh yields a temporally consistent ranking. To enable continuous
+evaluation on HelloFresh, we host a public leaderboard and periodically updated
+evaluation data at https://tinyurl.com/hello-fresh-LLM.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>ACL 2024 Findings</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Methods for Class-Imbalanced Learning with Support Vector Machines: A
+  <span class="highlight-title">Review</span> and an Empirical Evaluation 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.03398v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.03398v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Salim rezvani, Farhad Pourpanah, Chee Peng Lim, Q. M. Jonathan Wu
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  This paper presents a review on methods for class-imbalanced learning with
+the Support Vector Machine (SVM) and its variants. We first explain the
+structure of SVM and its variants and discuss their inefficiency in learning
+with class-imbalanced data sets. We introduce a hierarchical categorization of
+SVM-based models with respect to class-imbalanced learning. Specifically, we
+categorize SVM-based models into re-sampling, algorithmic, and fusion methods,
+and discuss the principles of the representative models in each category. In
+addition, we conduct a series of empirical evaluations to compare the
+performances of various representative SVM-based models in each category using
+benchmark imbalanced data sets, ranging from low to high imbalanced ratios. Our
+findings reveal that while algorithmic methods are less time-consuming owing to
+no data pre-processing requirements, fusion methods, which combine both
+re-sampling and algorithmic approaches, generally perform the best, but with a
+higher computational load. A discussion on research gaps and future research
+directions is provided.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted in Soft Computing</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Noisy Data Visualization using Functional Data Analysis 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.03396v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.03396v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Haozhe Chen, Andres Felipe Duque Correa, Guy Wolf, Kevin R. Moon
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Data visualization via dimensionality reduction is an important tool in
+exploratory data analysis. However, when the data are noisy, many existing
+methods fail to capture the underlying structure of the data. The method called
+Empirical Intrinsic Geometry (EIG) was previously proposed for performing
+dimensionality reduction on high dimensional dynamical processes while
+theoretically eliminating all noise. However, implementing EIG in practice
+requires the construction of high-dimensional histograms, which suffer from the
+curse of dimensionality. Here we propose a new data visualization method called
+Functional Information Geometry (FIG) for dynamical processes that adapts the
+EIG framework while using approaches from functional data analysis to mitigate
+the curse of dimensionality. We experimentally demonstrate that the resulting
+method outperforms a variant of EIG designed for visualization in terms of
+capturing the true structure, hyperparameter robustness, and computational
+speed. We then use our method to visualize EEG brain measurements of sleep
+activity.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Author, Content or Sharers? Estimating Spread Dynamics with Bayesian
+  Mixture Hawkes <span class="chip">ECML-PKDD</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.03390v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.03390v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Pio Calderon, Marian-Andrei Rizoiu
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  The spread of content on social media is shaped by intertwining factors on
+three levels: the source, the content itself, and the pathways of content
+spread. At the lowest level, the popularity of the sharing user determines its
+eventual reach. However, higher-level factors such as the nature of the online
+item and the credibility of its source also play crucial roles in determining
+how widely and rapidly the online item spreads. In this work, we propose the
+Bayesian Mixture Hawkes (BMH) model to jointly learn the influence of source,
+content and spread. We formulate the BMH model as a hierarchical mixture model
+of separable Hawkes processes, accommodating different classes of Hawkes
+dynamics and the influence of feature sets on these classes. We test the BMH
+model on two learning tasks, cold-start popularity prediction and temporal
+profile generalization performance, applying to two real-world retweet cascade
+datasets referencing articles from controversial and traditional media
+publishers. The BMH model outperforms the state-of-the-art models and
+predictive baselines on both datasets and utilizes cascade- and item-level
+information better than the alternatives. Lastly, we perform a counter-factual
+analysis where we apply the trained publisher-level BMH models to a set of
+article headlines and show that effectiveness of headline writing style
+(neutral, clickbait, inflammatory) varies across publishers. The BMH model
+unveils differences in style effectiveness between controversial and reputable
+publishers, where we find clickbait to be notably more effective for reputable
+publishers as opposed to controversial ones, which links to the latter's
+overuse of clickbait.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>accepted in the European Conference on Machine Learning and
+  Principles and Practice of Knowledge Discovery in Databases (ECML-PKDD) 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Learning Long Range Dependencies on Graphs via Random Walks 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.03386v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.03386v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Dexiong Chen, Till Hendrik Schulz, Karsten Borgwardt
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Message-passing graph neural networks (GNNs), while excelling at capturing
+local relationships, often struggle with long-range dependencies on graphs.
+Conversely, graph transformers (GTs) enable information exchange between all
+nodes but oversimplify the graph structure by treating them as a set of
+fixed-length vectors. This work proposes a novel architecture, NeuralWalker,
+that overcomes the limitations of both methods by combining random walks with
+message passing. NeuralWalker achieves this by treating random walks as
+sequences, allowing for the application of recent advances in sequence models
+in order to capture long-range dependencies within these walks. Based on this
+concept, we propose a framework that offers (1) more expressive graph
+representations through random walk sequences, (2) the ability to utilize any
+sequence model for capturing long-range dependencies, and (3) the flexibility
+by integrating various GNN and GT architectures. Our experimental evaluations
+demonstrate that NeuralWalker achieves significant performance improvements on
+19 graph and node benchmark datasets, notably outperforming existing methods by
+up to 13% on the PascalVoc-SP and COCO-SP datasets. Code is available at
+https://github.com/BorgwardtLab/NeuralWalker.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Training of Physical Neural Networks 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.03372v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.03372v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Ali Momeni, Babak Rahmani, Benjamin Scellier, Logan G. Wright, Peter L. McMahon, Clara C. Wanjura, Yuhang Li, Anas Skalli, Natalia G. Berloff, Tatsuhiro Onodera, Ilker Oguz, Francesco Morichetti, Philipp del Hougne, Manuel Le Gallo, Abu Sebastian, Azalia Mirhoseini, Cheng Zhang, Danijela Marković, Daniel Brunner, Christophe Moser, Sylvain Gigan, Florian Marquardt, Aydogan Ozcan, Julie Grollier, Andrea J. Liu, Demetri Psaltis, Andrea Alù, Romain Fleury
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Physical neural networks (PNNs) are a class of neural-like networks that
+leverage the properties of physical systems to perform computation. While PNNs
+are so far a niche research area with small-scale laboratory demonstrations,
+they are arguably one of the most underappreciated important opportunities in
+modern AI. Could we train AI models 1000x larger than current ones? Could we do
+this and also have them perform inference locally and privately on edge
+devices, such as smartphones or sensors? Research over the past few years has
+shown that the answer to all these questions is likely "yes, with enough
+research": PNNs could one day radically change what is possible and practical
+for AI systems. To do this will however require rethinking both how AI models
+work, and how they are trained - primarily by considering the problems through
+the constraints of the underlying hardware physics. To train PNNs at large
+scale, many methods including backpropagation-based and backpropagation-free
+approaches are now being explored. These methods have various trade-offs, and
+so far no method has been shown to scale to the same scale and performance as
+the backpropagation algorithm widely used in deep learning today. However, this
+is rapidly changing, and a diverse ecosystem of training techniques provides
+clues for how PNNs may one day be utilized to create both more efficient
+realizations of current-scale AI models, and to enable unprecedented-scale
+models.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>29 pages, 4 figures</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Posterior and variational inference for deep neural networks with
+  heavy-tailed weights 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.03369v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.03369v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Ismaël Castillo, Paul Egels
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  We consider deep neural networks in a Bayesian framework with a prior
+distribution sampling the network weights at random. Following a recent idea of
+Agapiou and Castillo (2023), who show that heavy-tailed prior distributions
+achieve automatic adaptation to smoothness, we introduce a simple Bayesian deep
+learning prior based on heavy-tailed weights and ReLU activation. We show that
+the corresponding posterior distribution achieves near-optimal minimax
+contraction rates, simultaneously adaptive to both intrinsic dimension and
+smoothness of the underlying function, in a variety of contexts including
+nonparametric regression, geometric data and Besov spaces. While most works so
+far need a form of model selection built-in within the prior distribution, a
+key aspect of our approach is that it does not require to sample
+hyperparameters to learn the architecture of the network. We also provide
+variational Bayes counterparts of the results, that show that mean-field
+variational approximations still benefit from near-optimal theoretical support.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>41 pages</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ What Matters in Hierarchical Search for Combinatorial Reasoning
+  Problems? <span class="chip">ICLR
+  2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.03361v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.03361v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Michał Zawalski, Gracjan Góral, Michał Tyrolski, Emilia Wiśnios, Franciszek Budrowski, Łukasz Kuciński, Piotr Miłoś
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Efficiently tackling combinatorial reasoning problems, particularly the
+notorious NP-hard tasks, remains a significant challenge for AI research.
+Recent efforts have sought to enhance planning by incorporating hierarchical
+high-level search strategies, known as subgoal methods. While promising, their
+performance against traditional low-level planners is inconsistent, raising
+questions about their application contexts. In this study, we conduct an
+in-depth exploration of subgoal-planning methods for combinatorial reasoning.
+We identify the attributes pivotal for leveraging the advantages of high-level
+search: hard-to-learn value functions, complex action spaces, presence of dead
+ends in the environment, or using data collected from diverse experts. We
+propose a consistent evaluation methodology to achieve meaningful comparisons
+between methods and reevaluate the state-of-the-art algorithms.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted for Generative Models for Decision Making Workshop at ICLR
+  2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Cooperative learning of Pl@ntNet's Artificial Intelligence algorithm:
+  how does it work and how can we improve it? 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.03356v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.03356v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Tanguy Lefort, Antoine Affouard, Benjamin Charlier, Jean-Christophe Lombardo, Mathias Chouet, Hervé Goëau, Joseph Salmon, Pierre Bonnet, Alexis Joly
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Deep learning models for plant species identification rely on large annotated
+datasets. The PlantNet system enables global data collection by allowing users
+to upload and annotate plant observations, leading to noisy labels due to
+diverse user skills. Achieving consensus is crucial for training, but the vast
+scale of collected data makes traditional label aggregation strategies
+challenging. Existing methods either retain all observations, resulting in
+noisy training data or selectively keep those with sufficient votes, discarding
+valuable information. Additionally, as many species are rarely observed, user
+expertise can not be evaluated as an inter-user agreement: otherwise, botanical
+experts would have a lower weight in the AI training step than the average
+user. Our proposed label aggregation strategy aims to cooperatively train plant
+identification AI models. This strategy estimates user expertise as a trust
+score per user based on their ability to identify plant species from
+crowdsourced data. The trust score is recursively estimated from correctly
+identified species given the current estimated labels. This interpretable score
+exploits botanical experts' knowledge and the heterogeneity of users.
+Subsequently, our strategy removes unreliable observations but retains those
+with limited trusted annotations, unlike other approaches. We evaluate
+PlantNet's strategy on a released large subset of the PlantNet database focused
+on European flora, comprising over 6M observations and 800K users. We
+demonstrate that estimating users' skills based on the diversity of their
+expertise enhances labeling performance. Our findings emphasize the synergy of
+human annotation and data filtering in improving AI performance for a refined
+dataset. We explore incorporating AI-based votes alongside human input. This
+can further enhance human-AI interactions to detect unreliable observations.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Position: A Call to Action for a Human-Centered AutoML Paradigm 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.03348v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.03348v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Marius Lindauer, Florian Karl, Anne Klier, Julia Moosbauer, Alexander Tornede, Andreas Mueller, Frank Hutter, Matthias Feurer, Bernd Bischl
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Automated machine learning (AutoML) was formed around the fundamental
+objectives of automatically and efficiently configuring machine learning (ML)
+workflows, aiding the research of new ML algorithms, and contributing to the
+democratization of ML by making it accessible to a broader audience. Over the
+past decade, commendable achievements in AutoML have primarily focused on
+optimizing predictive performance. This focused progress, while substantial,
+raises questions about how well AutoML has met its broader, original goals. In
+this position paper, we argue that a key to unlocking AutoML's full potential
+lies in addressing the currently underexplored aspect of user interaction with
+AutoML systems, including their diverse roles, expectations, and expertise. We
+envision a more human-centered approach in future AutoML research, promoting
+the collaborative design of ML systems that tightly integrates the
+complementary strengths of human expertise and AutoML methodologies.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Normalizing Flows for Conformal Regression <span class="chip">UAI 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.03346v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.03346v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Nicolo Colombo
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Conformal Prediction (CP) algorithms estimate the uncertainty of a prediction
+model by calibrating its outputs on labeled data. The same calibration scheme
+usually applies to any model and data without modifications. The obtained
+prediction intervals are valid by construction but could be inefficient, i.e.
+unnecessarily big, if the prediction errors are not uniformly distributed over
+the input space.
+  We present a general scheme to localize the intervals by training the
+calibration process. The standard prediction error is replaced by an optimized
+distance metric that depends explicitly on the object attributes. Learning the
+optimal metric is equivalent to training a Normalizing Flow that acts on the
+joint distribution of the errors and the inputs. Unlike the Error Re-weighting
+CP algorithm of Papadopoulos et al. (2008), the framework allows estimating the
+gap between nominal and empirical conditional validity. The approach is
+compatible with existing locally-adaptive CP strategies based on re-weighting
+the calibration samples and applies to any point-prediction model without
+retraining.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>To be presented at the 40th Conference on Uncertainty in Artificial
+  Intelligence (UAI 2024). 13 pages, 2 figures</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Feature Contamination: Neural Networks Learn Uncorrelated Features and
+  Fail to Generalize <span class="chip">ICML 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.03345v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.03345v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Tianren Zhang, Chujie Zhao, Guanyu Chen, Yizhou Jiang, Feng Chen
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Learning representations that generalize under distribution shifts is
+critical for building robust machine learning models. However, despite
+significant efforts in recent years, algorithmic advances in this direction
+have been limited. In this work, we seek to understand the fundamental
+difficulty of out-of-distribution generalization with deep neural networks. We
+first empirically show that perhaps surprisingly, even allowing a neural
+network to explicitly fit the representations obtained from a teacher network
+that can generalize out-of-distribution is insufficient for the generalization
+of the student network. Then, by a theoretical study of two-layer ReLU networks
+optimized by stochastic gradient descent (SGD) under a structured feature
+model, we identify a fundamental yet unexplored feature learning proclivity of
+neural networks, feature contamination: neural networks can learn uncorrelated
+features together with predictive features, resulting in generalization failure
+under distribution shifts. Notably, this mechanism essentially differs from the
+prevailing narrative in the literature that attributes the generalization
+failure to spurious correlations. Overall, our results offer new insights into
+the non-linear feature learning dynamics of neural networks and highlight the
+necessity of considering inductive biases in out-of-distribution
+generalization.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>ICML 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Tackling GenAI Copyright Issues: Originality Estimation and
+  Genericization 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.03341v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.03341v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Hiroaki Chiba-Okabe, Weijie J. Su
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  The rapid progress of generative AI technology has sparked significant
+copyright concerns, leading to numerous lawsuits filed against AI developers.
+While some studies explore methods to mitigate copyright risks by steering the
+outputs of generative models away from those resembling copyrighted data,
+little attention has been paid to the question of how much of a resemblance is
+undesirable; more original or unique data are afforded stronger protection, and
+the threshold level of resemblance for constituting infringement
+correspondingly lower. Here, leveraging this principle, we propose a
+genericization method that modifies the outputs of a generative model to make
+them more generic and less likely to infringe copyright. To achieve this, we
+introduce a metric for quantifying the level of originality of data in a manner
+that is consistent with the legal framework. This metric can be practically
+estimated by drawing samples from a generative model, which is then used for
+the genericization process. Experiments demonstrate that our genericization
+method successfully modifies the output of a text-to-image generative model so
+that it produces more generic, copyright-compliant images.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>15 pages, 6 figures</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Identifying latent state transition in non-linear dynamical systems 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.03337v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.03337v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Çağlar Hızlı, Çağatay Yıldız, Matthias Bethge, ST John, Pekka Marttinen
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  This work aims to improve generalization and interpretability of dynamical
+systems by recovering the underlying lower-dimensional latent states and their
+time evolutions. Previous work on disentangled representation learning within
+the realm of dynamical systems focused on the latent states, possibly with
+linear transition approximations. As such, they cannot identify nonlinear
+transition dynamics, and hence fail to reliably predict complex future
+behavior. Inspired by the advances in nonlinear ICA, we propose a state-space
+modeling framework in which we can identify not just the latent states but also
+the unknown transition function that maps the past states to the present. We
+introduce a practical algorithm based on variational auto-encoders and
+empirically demonstrate in realistic synthetic settings that we can (i) recover
+latent state dynamics with high accuracy, (ii) correspondingly achieve high
+future prediction accuracy, and (iii) adapt fast to new environments.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Reparameterization invariance in approximate Bayesian inference 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.03334v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.03334v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Hrittik Roy, Marco Miani, Carl Henrik Ek, Philipp Hennig, Marvin Pförtner, Lukas Tatzel, Søren Hauberg
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Current approximate posteriors in Bayesian neural networks (BNNs) exhibit a
+crucial limitation: they fail to maintain invariance under reparameterization,
+i.e. BNNs assign different posterior densities to different parametrizations of
+identical functions. This creates a fundamental flaw in the application of
+Bayesian principles as it breaks the correspondence between uncertainty over
+the parameters with uncertainty over the parametrized function. In this paper,
+we investigate this issue in the context of the increasingly popular linearized
+Laplace approximation. Specifically, it has been observed that linearized
+predictives alleviate the common underfitting problems of the Laplace
+approximation. We develop a new geometric view of reparametrizations from which
+we explain the success of linearization. Moreover, we demonstrate that these
+reparameterization invariance properties can be extended to the original neural
+network predictive using a Riemannian diffusion process giving a
+straightforward algorithm for approximate posterior sampling, which empirically
+improves posterior fit.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ UDQL: Bridging The Gap between MSE Loss and The Optimal Value Function
+  in Offline Reinforcement Learning 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.03324v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.03324v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Yu Zhang, Rui Yu, Zhipeng Yao, Wenyuan Zhang, Jun Wang, Liming Zhang
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  The Mean Square Error (MSE) is commonly utilized to estimate the solution of
+the optimal value function in the vast majority of offline reinforcement
+learning (RL) models and has achieved outstanding performance. However, we find
+that its principle can lead to overestimation phenomenon for the value
+function. In this paper, we first theoretically analyze overestimation
+phenomenon led by MSE and provide the theoretical upper bound of the
+overestimated error. Furthermore, to address it, we propose a novel Bellman
+underestimated operator to counteract overestimation phenomenon and then prove
+its contraction characteristics. At last, we propose the offline RL algorithm
+based on underestimated operator and diffusion policy model. Extensive
+experimental results on D4RL tasks show that our method can outperform
+state-of-the-art offline RL algorithms, which demonstrates that our theoretical
+analysis and underestimation way are effective for offline RL tasks.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Reproducibility study of FairAC 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.03314v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.03314v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Gijs de Jong, Macha J. Meijer, Derck W. E. Prinzhorn, Harold Ruiter
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  This work aims to reproduce the findings of the paper "Fair Attribute
+Completion on Graph with Missing Attributes" written by Guo, Chu, and Li
+arXiv:2302.12977 by investigating the claims made in the paper. This paper
+suggests that the results of the original paper are reproducible and thus, the
+claims hold. However, the claim that FairAC is a generic framework for many
+downstream tasks is very broad and could therefore only be partially tested.
+Moreover, we show that FairAC is generalizable to various datasets and
+sensitive attributes and show evidence that the improvement in group fairness
+of the FairAC framework does not come at the expense of individual fairness.
+Lastly, the codebase of FairAC has been refactored and is now easily applicable
+for various datasets and models.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>14 pages, 2 figures, accepted at TMLR</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Embarrassingly Parallel GFlowNets <span class="chip">ICML 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.03288v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.03288v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Tiago da Silva, Luiz Max Carvalho, Amauri Souza, Samuel Kaski, Diego Mesquita
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  GFlowNets are a promising alternative to MCMC sampling for discrete
+compositional random variables. Training GFlowNets requires repeated
+evaluations of the unnormalized target distribution or reward function.
+However, for large-scale posterior sampling, this may be prohibitive since it
+incurs traversing the data several times. Moreover, if the data are distributed
+across clients, employing standard GFlowNets leads to intensive client-server
+communication. To alleviate both these issues, we propose embarrassingly
+parallel GFlowNet (EP-GFlowNet). EP-GFlowNet is a provably correct
+divide-and-conquer method to sample from product distributions of the form
+$R(\cdot) \propto R_1(\cdot) ... R_N(\cdot)$ -- e.g., in parallel or federated
+Bayes, where each $R_n$ is a local posterior defined on a data partition.
+First, in parallel, we train a local GFlowNet targeting each $R_n$ and send the
+resulting models to the server. Then, the server learns a global GFlowNet by
+enforcing our newly proposed \emph{aggregating balance} condition, requiring a
+single communication step. Importantly, EP-GFlowNets can also be applied to
+multi-objective optimization and model reuse. Our experiments illustrate the
+EP-GFlowNets's effectiveness on many tasks, including parallel Bayesian
+phylogenetics, multi-objective multiset, sequence generation, and federated
+Bayesian structure learning.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted to ICML 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ SpikeLM: Towards General Spike-Driven Language Modeling via Elastic
+  Bi-Spiking Mechanisms 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.03287v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.03287v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Xingrun Xing, Zheng Zhang, Ziyi Ni, Shitao Xiao, Yiming Ju, Siqi Fan, Yequan Wang, Jiajun Zhang, Guoqi Li
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Towards energy-efficient artificial intelligence similar to the human brain,
+the bio-inspired spiking neural networks (SNNs) have advantages of biological
+plausibility, event-driven sparsity, and binary activation. Recently,
+large-scale language models exhibit promising generalization capability, making
+it a valuable issue to explore more general spike-driven models. However, the
+binary spikes in existing SNNs fail to encode adequate semantic information,
+placing technological challenges for generalization. This work proposes the
+first fully spiking mechanism for general language tasks, including both
+discriminative and generative ones. Different from previous spikes with {0,1}
+levels, we propose a more general spike formulation with bi-directional,
+elastic amplitude, and elastic frequency encoding, while still maintaining the
+addition nature of SNNs. In a single time step, the spike is enhanced by
+direction and amplitude information; in spike frequency, a strategy to control
+spike firing rate is well designed. We plug this elastic bi-spiking mechanism
+in language modeling, named SpikeLM. It is the first time to handle general
+language tasks with fully spike-driven models, which achieve much higher
+accuracy than previously possible. SpikeLM also greatly bridges the performance
+gap between SNNs and ANNs in language modeling. Our code is available at
+https://github.com/Xingrun-Xing/SpikeLM.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ FusionBench: A Comprehensive Benchmark of Deep Model Fusion 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.03280v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.03280v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Anke Tang, Li Shen, Yong Luo, Han Hu, Bo Do, Dacheng Tao
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Deep model fusion is an emerging technique that unifies the predictions or
+parameters of several deep neural networks into a single model in a
+cost-effective and data-efficient manner. This enables the unified model to
+take advantage of the original models' strengths, potentially exceeding their
+performance. Although a variety of deep model fusion techniques have been
+introduced, their evaluations tend to be inconsistent and often inadequate to
+validate their effectiveness and robustness against distribution shifts. To
+address this issue, we introduce FusionBench, which is the first comprehensive
+benchmark dedicated to deep model fusion. FusionBench covers a wide range of
+tasks, including open-vocabulary image classification, text classification, and
+text-to-text generation. Each category includes up to eight tasks with
+corresponding task-specific models, featuring both full fine-tuning and LoRA
+fine-tuning, as well as models of different sizes, to ensure fair and balanced
+comparisons of various multi-task model fusion techniques across different
+tasks, model scales, and fine-tuning strategies. We implement and evaluate a
+broad spectrum of deep model fusion techniques. These techniques range from
+model ensemble methods, which combine the predictions to improve the overall
+performance, to model merging, which integrates different models into a single
+one, and model mixing methods, which upscale or recombine the components of the
+original models. FusionBench now contains 26 distinct tasks, 74 fine-tuned
+models, and 16 fusion techniques, and we are committed to consistently
+expanding the benchmark with more tasks, models, and fusion techniques. In
+addition, we offer a well-documented set of resources and guidelines to aid
+researchers in understanding and replicating the benchmark results. Homepage
+https://tanganke.github.io/fusion_bench/
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Project homepage: https://tanganke.github.io/fusion_bench/</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Using GNN property predictors as molecule generators 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.03278v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.03278v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Félix Therrien, Edward H. Sargent, Oleksandr Voznyy
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Graph neural networks (GNNs) have emerged as powerful tools to accurately
+predict materials and molecular properties in computational discovery
+pipelines. In this article, we exploit the invertible nature of these neural
+networks to directly generate molecular structures with desired electronic
+properties. Starting from a random graph or an existing molecule, we perform a
+gradient ascent while holding the GNN weights fixed in order to optimize its
+input, the molecular graph, towards the target property. Valence rules are
+enforced strictly through a judicious graph construction. The method relies
+entirely on the property predictor; no additional training is required on
+molecular structures. We demonstrate the application of this method by
+generating molecules with specific DFT-verified energy gaps and octanol-water
+partition coefficients (logP). Our approach hits target properties with rates
+comparable to or better than state-of-the-art generative models while
+consistently generating more diverse molecules.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>7 pages, 2 figures, 2 tables</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Revisiting Scalable Hessian Diagonal Approximations for Applications in
+  Reinforcement Learning <span class="chip">ICML 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.03276v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.03276v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Mohamed Elsayed, Homayoon Farrahi, Felix Dangel, A. Rupam Mahmood
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Second-order information is valuable for many applications but challenging to
+compute. Several works focus on computing or approximating Hessian diagonals,
+but even this simplification introduces significant additional costs compared
+to computing a gradient. In the absence of efficient exact computation schemes
+for Hessian diagonals, we revisit an early approximation scheme proposed by
+Becker and LeCun (1989, BL89), which has a cost similar to gradients and
+appears to have been overlooked by the community. We introduce HesScale, an
+improvement over BL89, which adds negligible extra computation. On small
+networks, we find that this improvement is of higher quality than all
+alternatives, even those with theoretical guarantees, such as unbiasedness,
+while being much cheaper to compute. We use this insight in reinforcement
+learning problems where small networks are used and demonstrate HesScale in
+second-order optimization and scaling the step-size parameter. In our
+experiments, HesScale optimizes faster than existing methods and improves
+stability through step-size scaling. These findings are promising for scaling
+second-order methods in larger models in the future.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Published in the Proceedings of the 41st International Conference on
+  Machine Learning (ICML 2024). Code is available at
+  https://github.com/mohmdelsayed/HesScale. arXiv admin note: substantial text
+  overlap with arXiv:2210.11639</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Multi-Microphone Speech Emotion Recognition using the Hierarchical
+  Token-semantic Audio <span class="highlight-title">Transformer</span> Architecture 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.03272v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.03272v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Ohad Cohen, Gershon Hazan, Sharon Gannot
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Most emotion recognition systems fail in real-life situations (in the wild
+scenarios) where the audio is contaminated by reverberation. Our study explores
+new methods to alleviate the performance degradation of Speech Emotion
+Recognition (SER) algorithms and develop a more robust system for adverse
+conditions. We propose processing multi-microphone signals to address these
+challenges and improve emotion classification accuracy. We adopt a
+state-of-the-art transformer model, the Hierarchical Token-semantic Audio
+Transformer (HTS-AT), to handle multi-channel audio inputs. We evaluate two
+strategies: averaging mel-spectrograms across channels and summing
+patch-embedded representations. Our multimicrophone model achieves superior
+performance compared to single-channel baselines when tested on real-world
+reverberant environments.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ No-Regret Algorithms for Safe Bayesian Optimization with Monotonicity
+  Constraints 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.03264v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.03264v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Arpan Losalka, Jonathan Scarlett
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  We consider the problem of sequentially maximizing an unknown function $f$
+over a set of actions of the form $(s,\mathbf{x})$, where the selected actions
+must satisfy a safety constraint with respect to an unknown safety function
+$g$. We model $f$ and $g$ as lying in a reproducing kernel Hilbert space
+(RKHS), which facilitates the use of Gaussian process methods. While existing
+works for this setting have provided algorithms that are guaranteed to identify
+a near-optimal safe action, the problem of attaining low cumulative regret has
+remained largely unexplored, with a key challenge being that expanding the safe
+region can incur high regret. To address this challenge, we show that if $g$ is
+monotone with respect to just the single variable $s$ (with no such constraint
+on $f$), sublinear regret becomes achievable with our proposed algorithm. In
+addition, we show that a modified version of our algorithm is able to attain
+sublinear regret (for suitably defined notions of regret) for the task of
+finding a near-optimal $s$ corresponding to every $\mathbf{x}$, as opposed to
+only finding the global safe optimum. Our findings are supported with empirical
+evaluations on various objective and safety functions.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Deep Generative Models for Proton Zero Degree Calorimeter Simulations in
+  ALICE, CERN 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.03263v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.03263v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Patryk Będkowski, Jan Dubiński, Kamil Deja, Przemysław Rokita
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Simulating detector responses is a crucial part of understanding the
+inner-workings of particle collisions in the Large Hadron Collider at CERN. The
+current reliance on statistical Monte-Carlo simulations strains CERN's
+computational grid, underscoring the urgency for more efficient alternatives.
+Addressing these challenges, recent proposals advocate for generative machine
+learning methods. In this study, we present an innovative deep learning
+simulation approach tailored for the proton Zero Degree Calorimeter in the
+ALICE experiment. Leveraging a Generative Adversarial Network model with
+Selective Diversity Increase loss, we directly simulate calorimeter responses.
+To enhance its capabilities in modeling a broad range of calorimeter response
+intensities, we expand the SDI-GAN architecture with additional regularization.
+Moreover, to improve the spatial fidelity of the generated data, we introduce
+an auxiliary regressor network. Our method offers a significant speedup when
+comparing to the traditional Monte-Carlo based approaches.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>8 pages, 3 figures, PP-RAI 2024 conference</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Feature learning in finite-width Bayesian deep linear networks with
+  multiple outputs and convolutional layers 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.03260v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.03260v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Federico Bassetti, Marco Gherardi, Alessandro Ingrosso, Mauro Pastore, Pietro Rotondo
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Deep linear networks have been extensively studied, as they provide
+simplified models of deep learning. However, little is known in the case of
+finite-width architectures with multiple outputs and convolutional layers. In
+this manuscript, we provide rigorous results for the statistics of functions
+implemented by the aforementioned class of networks, thus moving closer to a
+complete characterization of feature learning in the Bayesian setting. Our
+results include: (i) an exact and elementary non-asymptotic integral
+representation for the joint prior distribution over the outputs, given in
+terms of a mixture of Gaussians; (ii) an analytical formula for the posterior
+distribution in the case of squared error loss function (Gaussian likelihood);
+(iii) a quantitative description of the feature learning infinite-width regime,
+using large deviation theory. From a physical perspective, deep architectures
+with multiple outputs or convolutional layers represent different
+manifestations of kernel shape renormalization, and our work provides a
+dictionary that translates this physics intuition and terminology into rigorous
+Bayesian statistics.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Relaxed Quantile Regression: Prediction Intervals for Asymmetric Noise <span class="chip">ICML</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.03258v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.03258v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Thomas Pouplin, Alan Jeffares, Nabeel Seedat, Mihaela van der Schaar
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Constructing valid prediction intervals rather than point estimates is a
+well-established approach for uncertainty quantification in the regression
+setting. Models equipped with this capacity output an interval of values in
+which the ground truth target will fall with some prespecified probability.
+This is an essential requirement in many real-world applications where simple
+point predictions' inability to convey the magnitude and frequency of errors
+renders them insufficient for high-stakes decisions. Quantile regression is a
+leading approach for obtaining such intervals via the empirical estimation of
+quantiles in the (non-parametric) distribution of outputs. This method is
+simple, computationally inexpensive, interpretable, assumption-free, and
+effective. However, it does require that the specific quantiles being learned
+are chosen a priori. This results in (a) intervals that are arbitrarily
+symmetric around the median which is sub-optimal for realistic skewed
+distributions, or (b) learning an excessive number of intervals. In this work,
+we propose Relaxed Quantile Regression (RQR), a direct alternative to quantile
+regression based interval construction that removes this arbitrary constraint
+whilst maintaining its strengths. We demonstrate that this added flexibility
+results in intervals with an improvement in desirable qualities (e.g. mean
+width) whilst retaining the essential coverage guarantees of quantile
+regression.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted at International Conference on Machine Learning (ICML) 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ On the Maximal Local Disparity of Fairness-Aware Classifiers 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.03255v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.03255v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Jinqiu Jin, Haoxuan Li, Fuli Feng
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Fairness has become a crucial aspect in the development of trustworthy
+machine learning algorithms. Current fairness metrics to measure the violation
+of demographic parity have the following drawbacks: (i) the average difference
+of model predictions on two groups cannot reflect their distribution disparity,
+and (ii) the overall calculation along all possible predictions conceals the
+extreme local disparity at or around certain predictions. In this work, we
+propose a novel fairness metric called Maximal Cumulative ratio Disparity along
+varying Predictions' neighborhood (MCDP), for measuring the maximal local
+disparity of the fairness-aware classifiers. To accurately and efficiently
+calculate the MCDP, we develop a provably exact and an approximate calculation
+algorithm that greatly reduces the computational complexity with low estimation
+error. We further propose a bi-level optimization algorithm using a
+differentiable approximation of the MCDP for improving the algorithmic
+fairness. Extensive experiments on both tabular and image datasets validate
+that our fair training algorithm can achieve superior fairness-accuracy
+trade-offs.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Exploring Higher Order Structures in Graph Explanantions 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.03253v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.03253v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Akshit Sinha, Sreeram Vennam, Charu Sharma, Ponnurangam Kumaraguru
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Recent advancements in graph learning contributed to explaining predictions
+generated by Graph Neural Networks. However, existing methodologies often fall
+short when applied to real-world datasets. We introduce HOGE, a framework to
+capture higher-order structures using cell complexes, which excel at modeling
+higher-order relationships. In the real world, higher-order structures are
+ubiquitous like in molecules or social networks, thus our work significantly
+enhances the practical applicability of graph explanations. HOGE produces
+clearer and more accurate explanations compared to prior methods. Our method
+can be integrated with all existing graph explainers, ensuring seamless
+integration into current frameworks. We evaluate on GraphXAI benchmark
+datasets, HOGE achieves improved or comparable performance with minimal
+computational overhead. Ablation studies show that the performance gain
+observed can be attributed to the higher-order structures that come from
+introducing cell complexes.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Near-field Beamforming for Extremely Large-scale MIMO Based on
+  Unsupervised Deep Learning 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.03249v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.03249v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Jiali Nie, Yuanhao Cui, Zhaohui Yang, Weijie Yuan, Xiaojun Jing
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Extremely Large-scale Array (ELAA) is considered a frontier technology for
+future communication systems, pivotal in improving wireless systems' rate and
+spectral efficiency. However, as ELAA employs a multitude of antennas operating
+at higher frequencies, users are typically situated in the near-field region
+where the spherical wavefront propagates. This inevitably leads to a
+significant increase in the overhead of beam training, requiring complex
+two-dimensional beam searching in both the angle domain and the distance
+domain. To address this problem, we propose a near-field beamforming method
+based on unsupervised deep learning. Our convolutional neural network
+efficiently extracts complex channel state information features by
+strategically selecting padding and kernel size. We optimize the beamformers to
+maximize achievable rates in a multi-user network without relying on predefined
+custom codebooks. Upon deployment, the model requires solely the input of
+pre-estimated channel state information to derive the optimal beamforming
+vector. Simulation results show that our proposed scheme can obtain stable
+beamforming gain compared with the baseline scheme. Furthermore, owing to the
+inherent traits of deep learning methodologies, this approach substantially
+diminishes the beam training costs in near-field regions.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Variational Pseudo Marginal Methods for Jet Reconstruction in Particle
+  Physics 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.03242v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.03242v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Hanming Yang, Antonio Khalil Moretti, Sebastian Macaluso, Philippe Chlenski, Christian A. Naesseth, Itsik Pe'er
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Reconstructing jets, which provide vital insights into the properties and
+histories of subatomic particles produced in high-energy collisions, is a main
+problem in data analyses in collider physics. This intricate task deals with
+estimating the latent structure of a jet (binary tree) and involves parameters
+such as particle energy, momentum, and types. While Bayesian methods offer a
+natural approach for handling uncertainty and leveraging prior knowledge, they
+face significant challenges due to the super-exponential growth of potential
+jet topologies as the number of observed particles increases. To address this,
+we introduce a Combinatorial Sequential Monte Carlo approach for inferring jet
+latent structures. As a second contribution, we leverage the resulting
+estimator to develop a variational inference algorithm for parameter learning.
+Building on this, we introduce a variational family using a pseudo-marginal
+framework for a fully Bayesian treatment of all variables, unifying the
+generative model with the inference process. We illustrate our method's
+effectiveness through experiments using data generated with a collider physics
+generative model, highlighting superior speed and accuracy across a range of
+tasks.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Fine-Grained Causal Dynamics Learning with Quantization for Improving
+  Robustness in Reinforcement Learning <span class="chip">ICML 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.03234v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.03234v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Inwoo Hwang, Yunhyeok Kwak, Suhyung Choi, Byoung-Tak Zhang, Sanghack Lee
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Causal dynamics learning has recently emerged as a promising approach to
+enhancing robustness in reinforcement learning (RL). Typically, the goal is to
+build a dynamics model that makes predictions based on the causal relationships
+among the entities. Despite the fact that causal connections often manifest
+only under certain contexts, existing approaches overlook such fine-grained
+relationships and lack a detailed understanding of the dynamics. In this work,
+we propose a novel dynamics model that infers fine-grained causal structures
+and employs them for prediction, leading to improved robustness in RL. The key
+idea is to jointly learn the dynamics model with a discrete latent variable
+that quantizes the state-action space into subgroups. This leads to recognizing
+meaningful context that displays sparse dependencies, where causal structures
+are learned for each subgroup throughout the training. Experimental results
+demonstrate the robustness of our method to unseen states and locally spurious
+correlations in downstream tasks where fine-grained causal reasoning is
+crucial. We further illustrate the effectiveness of our subgroup-based approach
+with quantization in discovering fine-grained causal relationships compared to
+prior methods.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>ICML 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ CommonPower: Supercharging Machine Learning for Smart Grids 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.03231v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.03231v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Michael Eichelbeck, Hannah Markgraf, Matthias Althoff
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  The growing complexity of power system management has led to an increased
+interest in the use of reinforcement learning (RL). However, no tool for
+comprehensive and realistic benchmarking of RL in smart grids exists. One
+prerequisite for such a comparison is a safeguarding mechanism since vanilla RL
+controllers can not guarantee the satisfaction of system constraints. Other
+central requirements include flexible modeling of benchmarking scenarios,
+credible baselines, and the possibility to investigate the impact of forecast
+uncertainties. Our Python tool CommonPower is the first modular framework
+addressing these needs. CommonPower offers a unified interface for single-agent
+and multi-agent RL training algorithms and includes a built-in model predictive
+control approach based on a symbolic representation of the system equations.
+This makes it possible to combine model predictive controllers with RL
+controllers in the same system. Leveraging the symbolic system model,
+CommonPower facilitates the study of safeguarding strategies via the flexible
+formulation of safety layers. Furthermore equipped with a generic forecasting
+interface, CommonPower constitutes a versatile tool significantly augmenting
+the exploration of safe RL controllers in smart grids on several dimensions.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>For the corresponding code repository, see
+  https://github.com/TUMcps/commonpower</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Defending Large Language Models Against Attacks With Residual Stream
+  Activation Analysis 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.03230v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.03230v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Amelia Kawasaki, Andrew Davis, Houssam Abbas
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  The widespread adoption of Large Language Models (LLMs), exemplified by
+OpenAI's ChatGPT, brings to the forefront the imperative to defend against
+adversarial threats on these models. These attacks, which manipulate an LLM's
+output by introducing malicious inputs, undermine the model's integrity and the
+trust users place in its outputs. In response to this challenge, our paper
+presents an innovative defensive strategy, given white box access to an LLM,
+that harnesses residual activation analysis between transformer layers of the
+LLM. We apply an established methodology for analyzing distinctive activation
+patterns in the residual streams for a novel result of attack prompt
+classification. We curate multiple datasets to demonstrate how this method of
+classification has high accuracy across multiple types of attack scenarios,
+including our newly-created attack dataset. Furthermore, we enhance the model's
+resilience by integrating safety fine-tuning techniques for LLMs in order to
+measure its effect on our capability to detect attacks. The results underscore
+the effectiveness of our approach in enhancing the detection and mitigation of
+adversarial inputs, advancing the security framework within which LLMs operate.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Global Clipper: Enhancing Safety and Reliability of <span class="highlight-title">Transformer</span>-based
+  Object Detection Models <span class="chip">IJCAI</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.03229v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.03229v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Qutub Syed Sha, Michael Paulitsch, Karthik Pattabiraman, Korbinian Hagn, Fabian Oboril, Cornelius Buerkle, Kay-Ulrich Scholl, Gereon Hinz, Alois Knoll
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  As transformer-based object detection models progress, their impact in
+critical sectors like autonomous vehicles and aviation is expected to grow.
+Soft errors causing bit flips during inference have significantly impacted DNN
+performance, altering predictions. Traditional range restriction solutions for
+CNNs fall short for transformers. This study introduces the Global Clipper and
+Global Hybrid Clipper, effective mitigation strategies specifically designed
+for transformer-based models. It significantly enhances their resilience to
+soft errors and reduces faulty inferences to ~ 0\%. We also detail extensive
+testing across over 64 scenarios involving two transformer models (DINO-DETR
+and Lite-DETR) and two CNN models (YOLOv3 and SSD) using three datasets,
+totalling approximately 3.3 million inferences, to assess model robustness
+comprehensively. Moreover, the paper explores unique aspects of attention
+blocks in transformers and their operational differences from CNNs.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted at IJCAI-AISafety'24 Workshop</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Choice of PEFT Technique in Continual Learning: <span class="highlight-title">Prompt</span> Tuning is Not All
+  You Need 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.03216v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.03216v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Martin Wistuba, Prabhu Teja Sivaprasad, Lukas Balles, Giovanni Zappella
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Recent Continual Learning (CL) methods have combined pretrained Transformers
+with prompt tuning, a parameter-efficient fine-tuning (PEFT) technique. We
+argue that the choice of prompt tuning in prior works was an undefended and
+unablated decision, which has been uncritically adopted by subsequent research,
+but warrants further research to understand its implications. In this paper, we
+conduct this research and find that the choice of prompt tuning as a PEFT
+method hurts the overall performance of the CL system. To illustrate this, we
+replace prompt tuning with LoRA in two state-of-the-art continual learning
+methods: Learning to Prompt and S-Prompts. These variants consistently achieve
+higher accuracy across a wide range of domain-incremental and class-incremental
+benchmarks, while being competitive in inference speed. Our work highlights a
+crucial argument: unexamined choices can hinder progress in the field, and
+rigorous ablations, such as the PEFT method, are required to drive meaningful
+adoption of CL techniques in real-world applications.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Inferring the time-varying coupling of dynamical systems with temporal
+  convolutional autoencoders 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.03212v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.03212v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Josuan Calderon, Gordon J. Berman
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Most approaches for assessing causality in complex dynamical systems fail
+when the interactions between variables are inherently non-linear and
+non-stationary. Here we introduce Temporal Autoencoders for Causal Inference
+(TACI), a methodology that combines a new surrogate data metric for assessing
+causal interactions with a novel two-headed machine learning architecture to
+identify and measure the direction and strength of time-varying causal
+interactions. Through tests on both synthetic and real-world datasets, we
+demonstrate TACI's ability to accurately quantify dynamic causal interactions
+across a variety of systems. Our findings display the method's effectiveness
+compared to existing approaches and also highlight our approach's potential to
+build a deeper understanding of the mechanisms that underlie time-varying
+interactions in physical and biological systems.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Challenges and Considerations in the Evaluation of Bayesian Causal
+  Discovery 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.03209v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.03209v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Amir Mohammad Karimi Mamaghan, Panagiotis Tigas, Karl Henrik Johansson, Yarin Gal, Yashas Annadani, Stefan Bauer
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Representing uncertainty in causal discovery is a crucial component for
+experimental design, and more broadly, for safe and reliable causal decision
+making. Bayesian Causal Discovery (BCD) offers a principled approach to
+encapsulating this uncertainty. Unlike non-Bayesian causal discovery, which
+relies on a single estimated causal graph and model parameters for assessment,
+evaluating BCD presents challenges due to the nature of its inferred quantity -
+the posterior distribution. As a result, the research community has proposed
+various metrics to assess the quality of the approximate posterior. However,
+there is, to date, no consensus on the most suitable metric(s) for evaluation.
+In this work, we reexamine this question by dissecting various metrics and
+understanding their limitations. Through extensive empirical evaluation, we
+find that many existing metrics fail to exhibit a strong correlation with the
+quality of approximation to the true posterior, especially in scenarios with
+low sample sizes where BCD is most desirable. We highlight the suitability (or
+lack thereof) of these metrics under two distinct factors: the identifiability
+of the underlying causal model and the quantity of available data. Both factors
+affect the entropy of the true posterior, indicating that the current metrics
+are less fitting in settings of higher entropy. Our findings underline the
+importance of a more nuanced evaluation of new methods by taking into account
+the nature of the true posterior, as well as guide and motivate the development
+of new evaluation procedures for this challenge.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Graph Neural Network Explanations are Fragile 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.03193v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.03193v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Jiate Li, Meng Pang, Yun Dong, Jinyuan Jia, Binghui Wang
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Explainable Graph Neural Network (GNN) has emerged recently to foster the
+trust of using GNNs. Existing GNN explainers are developed from various
+perspectives to enhance the explanation performance. We take the first step to
+study GNN explainers under adversarial attack--We found that an adversary
+slightly perturbing graph structure can ensure GNN model makes correct
+predictions, but the GNN explainer yields a drastically different explanation
+on the perturbed graph. Specifically, we first formulate the attack problem
+under a practical threat model (i.e., the adversary has limited knowledge about
+the GNN explainer and a restricted perturbation budget). We then design two
+methods (i.e., one is loss-based and the other is deduction-based) to realize
+the attack. We evaluate our attacks on various GNN explainers and the results
+show these explainers are fragile.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>17 pages, 64 figures</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Initialization-enhanced Physics-Informed Neural Network with Domain
+  Decomposition (IDPINN) 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.03172v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.03172v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Chenhao Si, Ming Yan
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  We propose a new physics-informed neural network framework, IDPINN, based on
+the enhancement of initialization and domain decomposition to improve
+prediction accuracy. We train a PINN using a small dataset to obtain an initial
+network structure, including the weighted matrix and bias, which initializes
+the PINN for each subdomain. Moreover, we leverage the smoothness condition on
+the interface to enhance the prediction performance. We numerically evaluated
+it on several forward problems and demonstrated the benefits of IDPINN in terms
+of accuracy.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>20 pages, 14 figures</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ High-Dimensional Kernel Methods under Covariate Shift: Data-Dependent
+  Implicit Regularization <span class="chip">ICML 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.03171v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.03171v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Yihang Chen, Fanghui Liu, Taiji Suzuki, Volkan Cevher
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  This paper studies kernel ridge regression in high dimensions under covariate
+shifts and analyzes the role of importance re-weighting. We first derive the
+asymptotic expansion of high dimensional kernels under covariate shifts. By a
+bias-variance decomposition, we theoretically demonstrate that the re-weighting
+strategy allows for decreasing the variance. For bias, we analyze the
+regularization of the arbitrary or well-chosen scale, showing that the bias can
+behave very differently under different regularization scales. In our analysis,
+the bias and variance can be characterized by the spectral decay of a
+data-dependent regularized kernel: the original kernel matrix associated with
+an additional re-weighting matrix, and thus the re-weighting strategy can be
+regarded as a data-dependent regularization for better understanding. Besides,
+our analysis provides asymptotic expansion of kernel functions/vectors under
+covariate shift, which has its own interest.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>ICML 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Topological Neural Networks go Persistent, Equivariant, and Continuous <span class="chip">ICML 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.03164v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.03164v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Yogesh Verma, Amauri H Souza, Vikas Garg
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Topological Neural Networks (TNNs) incorporate higher-order relational
+information beyond pairwise interactions, enabling richer representations than
+Graph Neural Networks (GNNs). Concurrently, topological descriptors based on
+persistent homology (PH) are being increasingly employed to augment the GNNs.
+We investigate the benefits of integrating these two paradigms. Specifically,
+we introduce TopNets as a broad framework that subsumes and unifies various
+methods in the intersection of GNNs/TNNs and PH such as (generalizations of)
+RePHINE and TOGL. TopNets can also be readily adapted to handle (symmetries in)
+geometric complexes, extending the scope of TNNs and PH to spatial settings.
+Theoretically, we show that PH descriptors can provably enhance the
+expressivity of simplicial message-passing networks. Empirically, (continuous
+and E(n)-equivariant extensions of) TopNets achieve strong performance across
+diverse tasks, including antibody design, molecular dynamics simulation, and
+drug property prediction.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted to ICML 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Ethical considerations of use of hold-out sets in clinical prediction
+  model management 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.03161v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.03161v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Louis Chislett, Louis JM Aslett, Alisha R Davies, Catalina A Vallejos, James Liley
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Clinical prediction models are statistical or machine learning models used to
+quantify the risk of a certain health outcome using patient data. These can
+then inform potential interventions on patients, causing an effect called
+performative prediction: predictions inform interventions which influence the
+outcome they were trying to predict, leading to a potential underestimation of
+risk in some patients if a model is updated on this data. One suggested
+resolution to this is the use of hold-out sets, in which a set of patients do
+not receive model derived risk scores, such that a model can be safely
+retrained. We present an overview of clinical and research ethics regarding
+potential implementation of hold-out sets for clinical prediction models in
+health settings. We focus on the ethical principles of beneficence,
+non-maleficence, autonomy and justice. We also discuss informed consent,
+clinical equipoise, and truth-telling. We present illustrative cases of
+potential hold-out set implementations and discuss statistical issues arising
+from different hold-out set sampling methods. We also discuss differences
+between hold-out sets and randomised control trials, in terms of ethics and
+statistical issues. Finally, we give practical recommendations for researchers
+interested in the use hold-out sets for clinical prediction models.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ A Combination Model Based on Sequential General Variational Mode
+  Decomposition Method for Time Series Prediction 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.03157v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.03157v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Wei Chen, Yuanyuan Yang, Jianyu Liu
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Accurate prediction of financial time series is a key concern for market
+economy makers and investors. The article selects online store sales and
+Australian beer sales as representatives of non-stationary, trending, and
+seasonal financial time series, and constructs a new SGVMD-ARIMA combination
+model in a non-linear combination way to predict financial time series. The
+ARIMA model, LSTM model, and other classic decomposition prediction models are
+used as control models to compare the accuracy of different models. The
+empirical results indicate that the constructed combination prediction model
+has universal advantages over the single prediction model and linear
+combination prediction model of the control group. Within the prediction
+interval, our proposed combination model has improved advantages over
+traditional decomposition prediction control group models.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Detecting Model Misspecification in Amortized Bayesian Inference with
+  Neural Networks: An Extended Investigation 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.03154v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.03154v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Marvin Schmitt, Paul-Christian Bürkner, Ullrich Köthe, Stefan T. Radev
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Recent advances in probabilistic deep learning enable efficient amortized
+Bayesian inference in settings where the likelihood function is only implicitly
+defined by a simulation program (simulation-based inference; SBI). But how
+faithful is such inference if the simulation represents reality somewhat
+inaccurately, that is, if the true system behavior at test time deviates from
+the one seen during training? We conceptualize the types of such model
+misspecification arising in SBI and systematically investigate how the
+performance of neural posterior approximators gradually deteriorates as a
+consequence, making inference results less and less trustworthy. To notify
+users about this problem, we propose a new misspecification measure that can be
+trained in an unsupervised fashion (i.e., without training data from the true
+distribution) and reliably detects model misspecification at test time. Our
+experiments clearly demonstrate the utility of our new measure both on toy
+examples with an analytical ground-truth and on representative scientific tasks
+in cell biology, cognitive decision making, disease outbreak dynamics, and
+computer vision. We show how the proposed misspecification test warns users
+about suspicious outputs, raises an alarm when predictions are not trustworthy,
+and guides model designers in their search for better simulators.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Extended version of the conference paper
+  https://doi.org/10.1007/978-3-031-54605-1_35. arXiv admin note: text overlap
+  with arXiv:2112.08866</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Dynamic Spectral Clustering with Provable Approximation Guarantee <span class="chip">ICML'24</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.03152v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.03152v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Steinar Laenen, He Sun
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  This paper studies clustering algorithms for dynamically evolving graphs
+$\{G_t\}$, in which new edges (and potential new vertices) are added into a
+graph, and the underlying cluster structure of the graph can gradually change.
+The paper proves that, under some mild condition on the cluster-structure, the
+clusters of the final graph $G_T$ of $n_T$ vertices at time $T$ can be well
+approximated by a dynamic variant of the spectral clustering algorithm. The
+algorithm runs in amortised update time $O(1)$ and query time $o(n_T)$.
+Experimental studies on both synthetic and real-world datasets further confirm
+the practicality of our designed algorithm.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>This work is accepted at the 41st International Conference on Machine
+  Learning (ICML'24)</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Which Side Are You On? A Multi-task <span class="highlight-title">Dataset</span> for End-to-End Argument
+  Summarisation and Evaluation <span class="chip">ACL 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.03151v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.03151v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Hao Li, Yuping Wu, Viktor Schlegel, Riza Batista-Navarro, Tharindu Madusanka, Iqra Zahid, Jiayan Zeng, Xiaochi Wang, Xinran He, Yizhi Li, Goran Nenadic
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  With the recent advances of large language models (LLMs), it is no longer
+infeasible to build an automated debate system that helps people to synthesise
+persuasive arguments. Previous work attempted this task by integrating multiple
+components. In our work, we introduce an argument mining dataset that captures
+the end-to-end process of preparing an argumentative essay for a debate, which
+covers the tasks of claim and evidence identification (Task 1 ED), evidence
+convincingness ranking (Task 2 ECR), argumentative essay summarisation and
+human preference ranking (Task 3 ASR) and metric learning for automated
+evaluation of resulting essays, based on human feedback along argument quality
+dimensions (Task 4 SQE). Our dataset contains 14k examples of claims that are
+fully annotated with the various properties supporting the aforementioned
+tasks. We evaluate multiple generative baselines for each of these tasks,
+including representative LLMs. We find, that while they show promising results
+on individual tasks in our benchmark, their end-to-end performance on all four
+tasks in succession deteriorates significantly, both in automated measures as
+well as in human-centred evaluation. This challenge presented by our proposed
+dataset motivates future research on end-to-end argument mining and
+summarisation. The repository of this project is available at
+https://github.com/HarrywillDr/ArgSum-Datatset
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Published on ACL 2024 Findings</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Sample-specific Masks for Visual Reprogramming-based <span class="highlight-title">Prompt</span>ing 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.03150v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.03150v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Chengyi Cai, Zesheng Ye, Lei Feng, Jianzhong Qi, Feng Liu
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Visual reprogramming (VR) is a prompting technique that aims to re-purpose a
+pre-trained model (e.g., a classifier on ImageNet) to target tasks (e.g.,
+medical data prediction) by learning a small-scale pattern added into input
+images instead of tuning considerable parameters within the model. The location
+of the pattern within input samples is usually determined by a pre-defined mask
+shared across all samples. In this paper, we show that the shared mask
+potentially limits VR's generalization and increases its approximation error
+due to the lack of sample-level adaptation. Motivated by this finding, we
+design a new framework for VR called sample-specific multi-channel masks (SMM).
+Specifically, SMM employs a lightweight ConvNet and patch-wise interpolation to
+generate sample-specific three-channel masks instead of a shared and
+pre-defined mask. Since we generate different masks for individual samples, SMM
+is theoretically shown to reduce approximation error for the target tasks
+compared with existing state-of-the-art VR methods. We also empirically
+demonstrate its performance gain on both ResNet and ViT. The success of SMM
+further highlights the broader applicability of VR in leveraging the latent
+knowledge of pre-trained models for various target tasks. Our code is available
+at https://github.com/tmlr-group/SMM.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Aligning <span class="highlight-title">Transformer</span>s with Weisfeiler-Leman <span class="chip">ICML 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.03148v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.03148v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Luis Müller, Christopher Morris
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Graph neural network architectures aligned with the $k$-dimensional
+Weisfeiler--Leman ($k$-WL) hierarchy offer theoretically well-understood
+expressive power. However, these architectures often fail to deliver
+state-of-the-art predictive performance on real-world graphs, limiting their
+practical utility. While recent works aligning graph transformer architectures
+with the $k$-WL hierarchy have shown promising empirical results, employing
+transformers for higher orders of $k$ remains challenging due to a prohibitive
+runtime and memory complexity of self-attention as well as impractical
+architectural assumptions, such as an infeasible number of attention heads.
+Here, we advance the alignment of transformers with the $k$-WL hierarchy,
+showing stronger expressivity results for each $k$, making them more feasible
+in practice. In addition, we develop a theoretical framework that allows the
+study of established positional encodings such as Laplacian PEs and SPE. We
+evaluate our transformers on the large-scale PCQM4Mv2 dataset, showing
+competitive predictive performance with the state-of-the-art and demonstrating
+strong downstream performance when fine-tuning them on small-scale molecular
+datasets. Our code is available at
+https://github.com/luis-mueller/wl-transformers.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted at ICML 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Tiny models from tiny data: Textual and null-text inversion for few-shot
+  distillation 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.03146v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.03146v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Erik Landolsi, Fredrik Kahl
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Few-shot image classification involves classifying images using very few
+training examples. Recent vision foundation models show excellent few-shot
+transfer abilities, but are large and slow at inference. Using knowledge
+distillation, the capabilities of high-performing but slow models can be
+transferred to tiny, efficient models. However, common distillation methods
+require a large set of unlabeled data, which is not available in the few-shot
+setting. To overcome this lack of data, there has been a recent interest in
+using synthetic data.
+  We expand on this work by presenting a novel diffusion model inversion
+technique (TINT) combining the diversity of textual inversion with the
+specificity of null-text inversion. Using this method in a few-shot
+distillation pipeline leads to state-of-the-art accuracy among small student
+models on popular benchmarks, while being significantly faster than prior work.
+This allows us to push even tiny models to high accuracy using only a tiny
+application-specific dataset, albeit relying on extra data for pre-training.
+  Popular few-shot benchmarks involve evaluation over a large number of
+episodes, which is computationally cumbersome for methods involving synthetic
+data generation. Therefore, we also present a theoretical analysis on how the
+variance of the accuracy estimator depends on the number of episodes and query
+examples, and use these results to lower the computational effort required for
+method evaluation. In addition, to further motivate the use of generative
+models in few-shot distillation, we demonstrate that our method performs better
+compared to training on real data mined from the dataset used to train the
+diffusion model.
+  Source code will be made available at https://github.com/pixwse/tiny2.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>21 pages (9 main pages + references and appendix)</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ E(n) Equivariant Message Passing Cellular Networks 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.03145v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.03145v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Veljko Kovac, Erik J. Bekkers, Pietro Liò, Floor Eijkelboom
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  This paper introduces E(n) Equivariant Message Passing Cellular Networks
+(EMPCNs), an extension of E(n) Equivariant Graph Neural Networks to
+CW-complexes. Our approach addresses two aspects of geometric message passing
+networks: 1) enhancing their expressiveness by incorporating arbitrary cells,
+and 2) achieving this in a computationally efficient way with a decoupled
+EMPCNs technique. We demonstrate that EMPCNs achieve close to state-of-the-art
+performance on multiple tasks without the need for steerability, including
+many-body predictions and motion capture. Moreover, ablation studies confirm
+that decoupled EMPCNs exhibit stronger generalization capabilities than their
+non-topologically informed counterparts. These findings show that EMPCNs can be
+used as a scalable and expressive framework for higher-order message passing in
+geometric and topological graphs
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ A Combination Model for Time Series Prediction using LSTM via Extracting
+  Dynamic Features Based on Spatial Smoothing and Sequential General
+  Variational Mode Decomposition 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.03144v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.03144v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Jianyu Liu, Wei Chen, Yong Zhang, Zhenfeng Chen, Bin Wan, Jinwei Hu
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  In order to solve the problems such as difficult to extract effective
+features and low accuracy of sales volume prediction caused by complex
+relationships such as market sales volume in time series prediction, we
+proposed a time series prediction method of market sales volume based on
+Sequential General VMD and spatial smoothing Long short-term memory neural
+network (SS-LSTM) combination model. Firstly, the spatial smoothing algorithm
+is used to decompose and calculate the sample data of related industry sectors
+affected by the linkage effect of market sectors, extracting modal features
+containing information via Sequential General VMD on overall market and
+specific price trends; Then, according to the background of different Market
+data sets, LSTM network is used to model and predict the price of fundamental
+data and modal characteristics. The experimental results of data prediction
+with seasonal and periodic trends show that this method can achieve higher
+price prediction accuracy and more accurate accuracy in specific market
+contexts compared to traditional prediction methods Describe the changes in
+market sales volume.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ On the Power of Randomization in Fair Classification and Representation 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.03142v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.03142v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Sushant Agarwal, Amit Deshpande
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Fair classification and fair representation learning are two important
+problems in supervised and unsupervised fair machine learning, respectively.
+Fair classification asks for a classifier that maximizes accuracy on a given
+data distribution subject to fairness constraints. Fair representation maps a
+given data distribution over the original feature space to a distribution over
+a new representation space such that all classifiers over the representation
+satisfy fairness. In this paper, we examine the power of randomization in both
+these problems to minimize the loss of accuracy that results when we impose
+fairness constraints. Previous work on fair classification has characterized
+the optimal fair classifiers on a given data distribution that maximize
+accuracy subject to fairness constraints, e.g., Demographic Parity (DP), Equal
+Opportunity (EO), and Predictive Equality (PE). We refine these
+characterizations to demonstrate when the optimal randomized fair classifiers
+can surpass their deterministic counterparts in accuracy. We also show how the
+optimal randomized fair classifier that we characterize can be obtained as a
+solution to a convex optimization problem. Recent work has provided techniques
+to construct fair representations for a given data distribution such that any
+classifier over this representation satisfies DP. However, the classifiers on
+these fair representations either come with no or weak accuracy guarantees when
+compared to the optimal fair classifier on the original data distribution.
+Extending our ideas for randomized fair classification, we improve on these
+works, and construct DP-fair, EO-fair, and PE-fair representations that have
+provably optimal accuracy and suffer no accuracy loss compared to the optimal
+DP-fair, EO-fair, and PE-fair classifiers respectively on the original data
+distribution.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Appeared in ACM FAccT 2022</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Floating Anchor Diffusion Model for Multi-motif Scaffolding <span class="chip">ICML 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.03141v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.03141v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Ke Liu, Weian Mao, Shuaike Shen, Xiaoran Jiao, Zheng Sun, Hao Chen, Chunhua Shen
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Motif scaffolding seeks to design scaffold structures for constructing
+proteins with functions derived from the desired motif, which is crucial for
+the design of vaccines and enzymes. Previous works approach the problem by
+inpainting or conditional generation. Both of them can only scaffold motifs
+with fixed positions, and the conditional generation cannot guarantee the
+presence of motifs. However, prior knowledge of the relative motif positions in
+a protein is not readily available, and constructing a protein with multiple
+functions in one protein is more general and significant because of the
+synergies between functions. We propose a Floating Anchor Diffusion (FADiff)
+model. FADiff allows motifs to float rigidly and independently in the process
+of diffusion, which guarantees the presence of motifs and automates the motif
+position design. Our experiments demonstrate the efficacy of FADiff with high
+success rates and designable novel scaffolds. To the best of our knowledge,
+FADiff is the first work to tackle the challenge of scaffolding multiple motifs
+without relying on the expertise of relative motif positions in the protein.
+Code is available at https://github.com/aim-uofa/FADiff.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>ICML 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Push Past Green: Learning to Look Behind Plant Foliage by Moving It 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2307.03175v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2307.03175v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Xiaoyu Zhang, Saurabh Gupta
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Autonomous agriculture applications (e.g., inspection, phenotyping, plucking
+fruits) require manipulating the plant foliage to look behind the leaves and
+the branches. Partial visibility, extreme clutter, thin structures, and unknown
+geometry and dynamics for plants make such manipulation challenging. We tackle
+these challenges through data-driven methods. We use self-supervision to train
+SRPNet, a neural network that predicts what space is revealed on execution of a
+candidate action on a given plant. We use SRPNet with the cross-entropy method
+to predict actions that are effective at revealing space beneath plant foliage.
+Furthermore, as SRPNet does not just predict how much space is revealed but
+also where it is revealed, we can execute a sequence of actions that
+incrementally reveal more and more space beneath the plant foliage. We
+experiment with a synthetic (vines) and a real plant (Dracaena) on a physical
+test-bed across 5 settings including 2 settings that test generalization to
+novel plant configurations. Our experiments reveal the effectiveness of our
+overall method, PPG, over a competitive hand-crafted exploration method, and
+the effectiveness of SRPNet over a hand-crafted dynamics model and relevant
+ablations.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted by Conference on Robot Learning (CoRL) 2023. for project
+  website with video, see https://sites.google.com/view/pushpastgreen/</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Future Directions in the Theory of Graph Machine Learning <span class="chip">ICML 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2402.02287v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2402.02287v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Christopher Morris, Fabrizio Frasca, Nadav Dym, Haggai Maron, İsmail İlkan Ceylan, Ron Levie, Derek Lim, Michael Bronstein, Martin Grohe, Stefanie Jegelka
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Machine learning on graphs, especially using graph neural networks (GNNs),
+has seen a surge in interest due to the wide availability of graph data across
+a broad spectrum of disciplines, from life to social and engineering sciences.
+Despite their practical success, our theoretical understanding of the
+properties of GNNs remains highly incomplete. Recent theoretical advancements
+primarily focus on elucidating the coarse-grained expressive power of GNNs,
+predominantly employing combinatorial techniques. However, these studies do not
+perfectly align with practice, particularly in understanding the generalization
+behavior of GNNs when trained with stochastic first-order optimization
+techniques. In this position paper, we argue that the graph machine learning
+community needs to shift its attention to developing a balanced theory of graph
+machine learning, focusing on a more thorough understanding of the interplay of
+expressive power, generalization, and optimization.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>ICML 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ On the Universality of Coupling-based Normalizing Flows 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2402.06578v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2402.06578v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Felix Draxler, Stefan Wahl, Christoph Schnörr, Ullrich Köthe
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  We present a novel theoretical framework for understanding the expressive
+power of normalizing flows. Despite their prevalence in scientific
+applications, a comprehensive understanding of flows remains elusive due to
+their restricted architectures. Existing theorems fall short as they require
+the use of arbitrarily ill-conditioned neural networks, limiting practical
+applicability. We propose a distributional universality theorem for
+well-conditioned coupling-based normalizing flows such as RealNVP. In addition,
+we show that volume-preserving normalizing flows are not universal, what
+distribution they learn instead, and how to fix their expressivity. Our results
+support the general wisdom that affine and related couplings are expressive and
+in general outperform volume-preserving flows, bridging a gap between empirical
+results and theoretical understanding.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Proceedings of the 41 st International Conference on Machine
+  Learning, Vienna, Austria. PMLR 235, 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ The Heuristic Core: Understanding Subnetwork Generalization in
+  <span class="highlight-title">Pretrain</span>ed Language Models <span class="chip">ACL 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2403.03942v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2403.03942v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Adithya Bhaskar, Dan Friedman, Danqi Chen
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Prior work has found that pretrained language models (LMs) fine-tuned with
+different random seeds can achieve similar in-domain performance but generalize
+differently on tests of syntactic generalization. In this work, we show that,
+even within a single model, we can find multiple subnetworks that perform
+similarly in-domain, but generalize vastly differently. To better understand
+these phenomena, we investigate if they can be understood in terms of
+"competing subnetworks": the model initially represents a variety of distinct
+algorithms, corresponding to different subnetworks, and generalization occurs
+when it ultimately converges to one. This explanation has been used to account
+for generalization in simple algorithmic tasks ("grokking"). Instead of finding
+competing subnetworks, we find that all subnetworks -- whether they generalize
+or not -- share a set of attention heads, which we refer to as the heuristic
+core. Further analysis suggests that these attention heads emerge early in
+training and compute shallow, non-generalizing features. The model learns to
+generalize by incorporating additional attention heads, which depend on the
+outputs of the "heuristic" heads to compute higher-level features. Overall, our
+results offer a more detailed picture of the mechanisms for syntactic
+generalization in pretrained LMs.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted to ACL 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Confronting Reward Overoptimization for Diffusion Models: A Perspective
+  of Inductive and Primacy Biases <span class="chip">ICML 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2402.08552v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2402.08552v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Ziyi Zhang, Sen Zhang, Yibing Zhan, Yong Luo, Yonggang Wen, Dacheng Tao
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Bridging the gap between diffusion models and human preferences is crucial
+for their integration into practical generative workflows. While optimizing
+downstream reward models has emerged as a promising alignment strategy,
+concerns arise regarding the risk of excessive optimization with learned reward
+models, which potentially compromises ground-truth performance. In this work,
+we confront the reward overoptimization problem in diffusion model alignment
+through the lenses of both inductive and primacy biases. We first identify a
+mismatch between current methods and the temporal inductive bias inherent in
+the multi-step denoising process of diffusion models, as a potential source of
+reward overoptimization. Then, we surprisingly discover that dormant neurons in
+our critic model act as a regularization against reward overoptimization while
+active neurons reflect primacy bias. Motivated by these observations, we
+propose Temporal Diffusion Policy Optimization with critic active neuron Reset
+(TDPO-R), a policy gradient algorithm that exploits the temporal inductive bias
+of diffusion models and mitigates the primacy bias stemming from active
+neurons. Empirical results demonstrate the superior efficacy of our methods in
+mitigating reward overoptimization. Code is avaliable at
+https://github.com/ZiyiZhang27/tdpo.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted to ICML 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Diffusion Meets DAgger: Supercharging Eye-in-hand Imitation Learning 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2402.17768v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2402.17768v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Xiaoyu Zhang, Matthew Chang, Pranav Kumar, Saurabh Gupta
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  A common failure mode for policies trained with imitation is compounding
+execution errors at test time. When the learned policy encounters states that
+are not present in the expert demonstrations, the policy fails, leading to
+degenerate behavior. The Dataset Aggregation, or DAgger approach to this
+problem simply collects more data to cover these failure states. However, in
+practice, this is often prohibitively expensive. In this work, we propose
+Diffusion Meets DAgger (DMD), a method to reap the benefits of DAgger without
+the cost for eye-in-hand imitation learning problems. Instead of collecting new
+samples to cover out-of-distribution states, DMD uses recent advances in
+diffusion models to synthesize these samples. This leads to robust performance
+from few demonstrations. We compare DMD against behavior cloning baseline
+across four tasks: pushing, stacking, pouring, and shirt hanging. In pushing,
+DMD achieves 80% success rate with as few as 8 expert demonstrations, where
+naive behavior cloning reaches only 20%. In stacking, DMD succeeds on average
+92% of the time across 5 cups, versus 40% for BC. When pouring coffee beans,
+DMD transfers to another cup successfully 80% of the time. Finally, DMD attains
+90% success rate for hanging shirt on a clothing rack.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted by Robotics: Science and Systems (RSS) 2024. project website
+  with video, see https://sites.google.com/view/diffusion-meets-dagger</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ SPIN: Sparsifying and Integrating Internal Neurons in Large Language
+  Models for Text Classification <span class="chip">SP</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2311.15983v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2311.15983v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Difan Jiao, Yilun Liu, Zhenwei Tang, Daniel Matter, Jürgen Pfeffer, Ashton Anderson
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Among the many tasks that Large Language Models (LLMs) have revolutionized is
+text classification. Current text classification paradigms, however, rely
+solely on the output of the final layer in the LLM, with the rich information
+contained in internal neurons largely untapped. In this study, we present SPIN:
+a model-agnostic framework that sparsifies and integrates internal neurons of
+intermediate layers of LLMs for text classification. Specifically, SPIN
+sparsifies internal neurons by linear probing-based salient neuron selection
+layer by layer, avoiding noise from unrelated neurons and ensuring efficiency.
+The cross-layer salient neurons are then integrated to serve as multi-layered
+features for the classification head. Extensive experimental results show our
+proposed SPIN significantly improves text classification accuracy, efficiency,
+and interpretability.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>17 pages, 7 figures, 12 tables Code available at
+  https://github.com/difanj0713/SPIN</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ SaySelf: Teaching LLMs to Express Confidence with Self-Reflective
+  Rationales 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20974v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20974v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Tianyang Xu, Shujin Wu, Shizhe Diao, Xiaoze Liu, Xingyao Wang, Yangyi Chen, Jing Gao
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Large language models (LLMs) often generate inaccurate or fabricated
+information and generally fail to indicate their confidence, which limits their
+broader applications. Previous work elicits confidence from LLMs by direct or
+self-consistency prompting, or constructing specific datasets for supervised
+finetuning. The prompting-based approaches have inferior performance, and the
+training-based approaches are limited to binary or inaccurate group-level
+confidence estimates. In this work, we present the advanced SaySelf, a training
+framework that teaches LLMs to express more accurate fine-grained confidence
+estimates. In addition, beyond the confidence scores, SaySelf initiates the
+process of directing LLMs to produce self-reflective rationales that clearly
+identify gaps in their parametric knowledge and explain their uncertainty. This
+is achieved by using an LLM to automatically summarize the uncertainties in
+specific knowledge via natural language. The summarization is based on the
+analysis of the inconsistency in multiple sampled reasoning chains, and the
+resulting data is utilized for supervised fine-tuning. Moreover, we utilize
+reinforcement learning with a meticulously crafted reward function to calibrate
+the confidence estimates, motivating LLMs to deliver accurate, high-confidence
+predictions and to penalize overconfidence in erroneous outputs. Experimental
+results in both in-distribution and out-of-distribution datasets demonstrate
+the effectiveness of SaySelf in reducing the confidence calibration error and
+maintaining the task performance. We show that the generated self-reflective
+rationales are reasonable and can further contribute to the calibration. The
+code is made public at https://github.com/xu1868/SaySelf.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>The code is available at https://github.com/xu1868/SaySelf</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Locality-Sensitive Hashing-Based Efficient Point <span class="highlight-title">Transformer</span> with
+  Applications in High-Energy Physics <span class="chip">ICML 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2402.12535v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2402.12535v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Siqi Miao, Zhiyuan Lu, Mia Liu, Javier Duarte, Pan Li
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  This study introduces a novel transformer model optimized for large-scale
+point cloud processing in scientific domains such as high-energy physics (HEP)
+and astrophysics. Addressing the limitations of graph neural networks and
+standard transformers, our model integrates local inductive bias and achieves
+near-linear complexity with hardware-friendly regular operations. One
+contribution of this work is the quantitative analysis of the error-complexity
+tradeoff of various sparsification techniques for building efficient
+transformers. Our findings highlight the superiority of using
+locality-sensitive hashing (LSH), especially OR & AND-construction LSH, in
+kernel approximation for large-scale point cloud data with local inductive
+bias. Based on this finding, we propose LSH-based Efficient Point Transformer
+(HEPT), which combines E$^2$LSH with OR & AND constructions and is built upon
+regular computations. HEPT demonstrates remarkable performance on two critical
+yet time-consuming HEP tasks, significantly outperforming existing GNNs and
+transformers in accuracy and computational speed, marking a significant
+advancement in geometric deep learning and large-scale scientific data
+processing. Our code is available at https://github.com/Graph-COM/HEPT.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted to ICML 2024 (Oral)</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ TKAN: Temporal Kolmogorov-Arnold Networks 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.07344v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.07344v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Remi Genet, Hugo Inzirillo
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Recurrent Neural Networks (RNNs) have revolutionized many areas of machine
+learning, particularly in natural language and data sequence processing. Long
+Short-Term Memory (LSTM) has demonstrated its ability to capture long-term
+dependencies in sequential data. Inspired by the Kolmogorov-Arnold Networks
+(KANs) a promising alternatives to Multi-Layer Perceptrons (MLPs), we proposed
+a new neural networks architecture inspired by KAN and the LSTM, the Temporal
+Kolomogorov-Arnold Networks (TKANs). TKANs combined the strenght of both
+networks, it is composed of Recurring Kolmogorov-Arnold Networks (RKANs) Layers
+embedding memory management. This innovation enables us to perform multi-step
+time series forecasting with enhanced accuracy and efficiency. By addressing
+the limitations of traditional models in handling complex sequential patterns,
+the TKAN architecture offers significant potential for advancements in fields
+requiring more than one step ahead forecasting.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Intersectional Unfairness Discovery <span class="chip">ICML-2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20790v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20790v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Gezheng Xu, Qi Chen, Charles Ling, Boyu Wang, Changjian Shui
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  AI systems have been shown to produce unfair results for certain subgroups of
+population, highlighting the need to understand bias on certain sensitive
+attributes. Current research often falls short, primarily focusing on the
+subgroups characterized by a single sensitive attribute, while neglecting the
+nature of intersectional fairness of multiple sensitive attributes. This paper
+focuses on its one fundamental aspect by discovering diverse high-bias
+subgroups under intersectional sensitive attributes. Specifically, we propose a
+Bias-Guided Generative Network (BGGN). By treating each bias value as a reward,
+BGGN efficiently generates high-bias intersectional sensitive attributes.
+Experiments on real-world text and image datasets demonstrate a diverse and
+efficient discovery of BGGN. To further evaluate the generated unseen but
+possible unfair intersectional sensitive attributes, we formulate them as
+prompts and use modern generative AI to produce new texts and images. The
+results of frequently generating biased data provides new insights of
+discovering potential unfairness in popular modern generative AI systems.
+Warning: This paper contains generative examples that are offensive in nature.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>ICML-2024 camera-ready</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Trust Regions for Explanations via Black-Box Probabilistic Certification <span class="chip">ICML 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2402.11168v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2402.11168v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Amit Dhurandhar, Swagatam Haldar, Dennis Wei, Karthikeyan Natesan Ramamurthy
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Given the black box nature of machine learning models, a plethora of
+explainability methods have been developed to decipher the factors behind
+individual decisions. In this paper, we introduce a novel problem of black box
+(probabilistic) explanation certification. We ask the question: Given a black
+box model with only query access, an explanation for an example and a quality
+metric (viz. fidelity, stability), can we find the largest hypercube (i.e.,
+$\ell_{\infty}$ ball) centered at the example such that when the explanation is
+applied to all examples within the hypercube, (with high probability) a quality
+criterion is met (viz. fidelity greater than some value)? Being able to
+efficiently find such a \emph{trust region} has multiple benefits: i) insight
+into model behavior in a \emph{region}, with a \emph{guarantee}; ii)
+ascertained \emph{stability} of the explanation; iii) \emph{explanation reuse},
+which can save time, energy and money by not having to find explanations for
+every example; and iv) a possible \emph{meta-metric} to compare explanation
+methods. Our contributions include formalizing this problem, proposing
+solutions, providing theoretical guarantees for these solutions that are
+computable, and experimentally showing their efficacy on synthetic and real
+data.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted to ICML 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Improved Techniques for Optimization-Based Jailbreaking on Large
+  Language Models 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.21018v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.21018v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Xiaojun Jia, Tianyu Pang, Chao Du, Yihao Huang, Jindong Gu, Yang Liu, Xiaochun Cao, Min Lin
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Large language models (LLMs) are being rapidly developed, and a key component
+of their widespread deployment is their safety-related alignment. Many
+red-teaming efforts aim to jailbreak LLMs, where among these efforts, the
+Greedy Coordinate Gradient (GCG) attack's success has led to a growing interest
+in the study of optimization-based jailbreaking techniques. Although GCG is a
+significant milestone, its attacking efficiency remains unsatisfactory. In this
+paper, we present several improved (empirical) techniques for
+optimization-based jailbreaks like GCG. We first observe that the single target
+template of "Sure" largely limits the attacking performance of GCG; given this,
+we propose to apply diverse target templates containing harmful self-suggestion
+and/or guidance to mislead LLMs. Besides, from the optimization aspects, we
+propose an automatic multi-coordinate updating strategy in GCG (i.e.,
+adaptively deciding how many tokens to replace in each step) to accelerate
+convergence, as well as tricks like easy-to-hard initialisation. Then, we
+combine these improved technologies to develop an efficient jailbreak method,
+dubbed I-GCG. In our experiments, we evaluate on a series of benchmarks (such
+as NeurIPS 2023 Red Teaming Track). The results demonstrate that our improved
+techniques can help GCG outperform state-of-the-art jailbreaking attacks and
+achieve nearly 100% attack success rate. The code is released at
+https://github.com/jiaxiaojunQAQ/I-GCG.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Fault Tolerant ML: Efficient Meta-Aggregation and Synchronous Training 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.14759v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.14759v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Tehila Dahan, Kfir Y. Levy
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  In this paper, we investigate the challenging framework of Byzantine-robust
+training in distributed machine learning (ML) systems, focusing on enhancing
+both efficiency and practicality. As distributed ML systems become integral for
+complex ML tasks, ensuring resilience against Byzantine failures-where workers
+may contribute incorrect updates due to malice or error-gains paramount
+importance. Our first contribution is the introduction of the Centered Trimmed
+Meta Aggregator (CTMA), an efficient meta-aggregator that upgrades baseline
+aggregators to optimal performance levels, while requiring low computational
+demands. Additionally, we propose harnessing a recently developed gradient
+estimation technique based on a double-momentum strategy within the Byzantine
+context. Our paper highlights its theoretical and practical advantages for
+Byzantine-robust training, especially in simplifying the tuning process and
+reducing the reliance on numerous hyperparameters. The effectiveness of this
+technique is supported by theoretical insights within the stochastic convex
+optimization (SCO) framework and corroborated by empirical evidence.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ A Temporal Kolmogorov-Arnold <span class="highlight-title">Transformer</span> for Time Series Forecasting 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.02486v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.02486v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Remi Genet, Hugo Inzirillo
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Capturing complex temporal patterns and relationships within multivariate
+data streams is a difficult task. We propose the Temporal Kolmogorov-Arnold
+Transformer (TKAT), a novel attention-based architecture designed to address
+this task using Temporal Kolmogorov-Arnold Networks (TKANs). Inspired by the
+Temporal Fusion Transformer (TFT), TKAT emerges as a powerful encoder-decoder
+model tailored to handle tasks in which the observed part of the features is
+more important than the a priori known part. This new architecture combined the
+theoretical foundation of the Kolmogorov-Arnold representation with the power
+of transformers. TKAT aims to simplify the complex dependencies inherent in
+time series, making them more "interpretable". The use of transformer
+architecture in this framework allows us to capture long-range dependencies
+through self-attention mechanisms.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>arXiv admin note: text overlap with arXiv:2405.07344</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ HarmonyDream: Task Harmonization Inside World Models <span class="chip">ICML 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2310.00344v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2310.00344v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Haoyu Ma, Jialong Wu, Ningya Feng, Chenjun Xiao, Dong Li, Jianye Hao, Jianmin Wang, Mingsheng Long
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Model-based reinforcement learning (MBRL) holds the promise of
+sample-efficient learning by utilizing a world model, which models how the
+environment works and typically encompasses components for two tasks:
+observation modeling and reward modeling. In this paper, through a dedicated
+empirical investigation, we gain a deeper understanding of the role each task
+plays in world models and uncover the overlooked potential of sample-efficient
+MBRL by mitigating the domination of either observation or reward modeling. Our
+key insight is that while prevalent approaches of explicit MBRL attempt to
+restore abundant details of the environment via observation models, it is
+difficult due to the environment's complexity and limited model capacity. On
+the other hand, reward models, while dominating implicit MBRL and adept at
+learning compact task-centric dynamics, are inadequate for sample-efficient
+learning without richer learning signals. Motivated by these insights and
+discoveries, we propose a simple yet effective approach, HarmonyDream, which
+automatically adjusts loss coefficients to maintain task harmonization, i.e. a
+dynamic equilibrium between the two tasks in world model learning. Our
+experiments show that the base MBRL method equipped with HarmonyDream gains
+10%-69% absolute performance boosts on visual robotic tasks and sets a new
+state-of-the-art result on the Atari 100K benchmark. Code is available at
+https://github.com/thuml/HarmonyDream.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>ICML 2024. Code is available at https://github.com/thuml/HarmonyDream</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ HAAQI-Net: A Non-intrusive Neural Music Audio Quality Assessment Model
+  for Hearing Aids 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2401.01145v4">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2401.01145v4.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Dyah A. M. G. Wisnu, Stefano Rini, Ryandhimas E. Zezario, Hsin-Min Wang, Yu Tsao
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  This paper introduces HAAQI-Net, a non-intrusive deep learning model for
+music audio quality assessment tailored for hearing aid users. Unlike
+traditional methods like the Hearing Aid Audio Quality Index (HAAQI), which
+rely on intrusive comparisons to a reference signal, HAAQI-Net offers a more
+accessible and efficient alternative. Using a bidirectional Long Short-Term
+Memory (BLSTM) architecture with attention mechanisms and features from the
+pre-trained BEATs model, HAAQI-Net predicts HAAQI scores directly from music
+audio clips and hearing loss patterns. Results show HAAQI-Net's effectiveness,
+with predicted scores achieving a Linear Correlation Coefficient (LCC) of
+0.9368, a Spearman's Rank Correlation Coefficient (SRCC) of 0.9486, and a Mean
+Squared Error (MSE) of 0.0064, reducing inference time from 62.52 seconds to
+2.54 seconds. Although effective, feature extraction via the large BEATs model
+incurs computational overhead. To address this, a knowledge distillation
+strategy creates a student distillBEATs model, distilling information from the
+teacher BEATs model during HAAQI-Net training, reducing required parameters.
+The distilled HAAQI-Net maintains strong performance with an LCC of 0.9071, an
+SRCC of 0.9307, and an MSE of 0.0091, while reducing parameters by 75.85% and
+inference time by 96.46%. This reduction enhances HAAQI-Net's efficiency and
+scalability, making it viable for real-world music audio quality assessment in
+hearing aid settings. This work also opens avenues for further research into
+optimizing deep learning models for specific applications, contributing to
+audio signal processing and quality assessment by providing insights into
+developing efficient and accurate models for practical applications in hearing
+aid technology.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ NeuroPrune: A Neuro-inspired Topological Sparse Training Algorithm for
+  Large Language Models <span class="chip">ACL 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2404.01306v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2404.01306v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Amit Dhurandhar, Tejaswini Pedapati, Ronny Luss, Soham Dan, Aurelie Lozano, Payel Das, Georgios Kollias
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Transformer-based Language Models have become ubiquitous in Natural Language
+Processing (NLP) due to their impressive performance on various tasks. However,
+expensive training as well as inference remains a significant impediment to
+their widespread applicability. While enforcing sparsity at various levels of
+the model architecture has found promise in addressing scaling and efficiency
+issues, there remains a disconnect between how sparsity affects network
+topology. Inspired by brain neuronal networks, we explore sparsity approaches
+through the lens of network topology. Specifically, we exploit mechanisms seen
+in biological networks, such as preferential attachment and redundant synapse
+pruning, and show that principled, model-agnostic sparsity approaches are
+performant and efficient across diverse NLP tasks, spanning both classification
+(such as natural language inference) and generation (summarization, machine
+translation), despite our sole objective not being optimizing performance.
+NeuroPrune is competitive with (or sometimes superior to) baselines on
+performance and can be up to $10$x faster in terms of training time for a given
+level of sparsity, simultaneously exhibiting measurable improvements in
+inference time in many cases.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted at ACL 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Distribution-Free Conformal Joint Prediction Regions for Neural Marked
+  Temporal Point Processes 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2401.04612v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2401.04612v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Victor Dheur, Tanguy Bosser, Rafael Izbicki, Souhaib Ben Taieb
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Sequences of labeled events observed at irregular intervals in continuous
+time are ubiquitous across various fields. Temporal Point Processes (TPPs)
+provide a mathematical framework for modeling these sequences, enabling
+inferences such as predicting the arrival time of future events and their
+associated label, called mark. However, due to model misspecification or lack
+of training data, these probabilistic models may provide a poor approximation
+of the true, unknown underlying process, with prediction regions extracted from
+them being unreliable estimates of the underlying uncertainty. This paper
+develops more reliable methods for uncertainty quantification in neural TPP
+models via the framework of conformal prediction. A primary objective is to
+generate a distribution-free joint prediction region for an event's arrival
+time and mark, with a finite-sample marginal coverage guarantee. A key
+challenge is to handle both a strictly positive, continuous response and a
+categorical response, without distributional assumptions. We first consider a
+simple but conservative approach that combines individual prediction regions
+for the event's arrival time and mark. Then, we introduce a more effective
+method based on bivariate highest density regions derived from the joint
+predictive density of arrival times and marks. By leveraging the dependencies
+between these two variables, this method excludes unlikely combinations of the
+two, resulting in sharper prediction regions while still attaining the
+pre-specified coverage level. We also explore the generation of individual
+univariate prediction regions for events' arrival times and marks through
+conformal regression and classification techniques. Moreover, we evaluate the
+stronger notion of conditional coverage. Finally, through extensive
+experimentation on both simulated and real-world datasets, we assess the
+validity and efficiency of these methods.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Compressed Federated Reinforcement Learning with a Generative Model <span class="chip">ECML-PKDD 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2404.10635v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2404.10635v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Ali Beikmohammadi, Sarit Khirirat, Sindri Magnússon
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Reinforcement learning has recently gained unprecedented popularity, yet it
+still grapples with sample inefficiency. Addressing this challenge, federated
+reinforcement learning (FedRL) has emerged, wherein agents collaboratively
+learn a single policy by aggregating local estimations. However, this
+aggregation step incurs significant communication costs. In this paper, we
+propose CompFedRL, a communication-efficient FedRL approach incorporating both
+\textit{periodic aggregation} and (direct/error-feedback) compression
+mechanisms. Specifically, we consider compressed federated $Q$-learning with a
+generative model setup, where a central server learns an optimal $Q$-function
+by periodically aggregating compressed $Q$-estimates from local agents. For the
+first time, we characterize the impact of these two mechanisms (which have
+remained elusive) by providing a finite-time analysis of our algorithm,
+demonstrating strong convergence behaviors when utilizing either direct or
+error-feedback compression. Our bounds indicate improved solution accuracy
+concerning the number of agents and other federated hyperparameters while
+simultaneously reducing communication costs. To corroborate our theory, we also
+conduct in-depth numerical experiments to verify our findings, considering
+Top-$K$ and Sparsified-$K$ sparsification operators.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>European Conference on Machine Learning and Principles and Practice
+  of Knowledge Discovery in Databases (ECML-PKDD 2024)</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ A General Framework for Learning from Weak Supervision 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2402.01922v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2402.01922v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Hao Chen, Jindong Wang, Lei Feng, Xiang Li, Yidong Wang, Xing Xie, Masashi Sugiyama, Rita Singh, Bhiksha Raj
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Weakly supervised learning generally faces challenges in applicability to
+various scenarios with diverse weak supervision and in scalability due to the
+complexity of existing algorithms, thereby hindering the practical deployment.
+This paper introduces a general framework for learning from weak supervision
+(GLWS) with a novel algorithm. Central to GLWS is an Expectation-Maximization
+(EM) formulation, adeptly accommodating various weak supervision sources,
+including instance partial labels, aggregate statistics, pairwise observations,
+and unlabeled data. We further present an advanced algorithm that significantly
+simplifies the EM computational demands using a Non-deterministic Finite
+Automaton (NFA) along with a forward-backward algorithm, which effectively
+reduces time complexity from quadratic or factorial often required in existing
+solutions to linear scale. The problem of learning from arbitrary weak
+supervision is therefore converted to the NFA modeling of them. GLWS not only
+enhances the scalability of machine learning models but also demonstrates
+superior performance and versatility across 11 weak supervision scenarios. We
+hope our work paves the way for further advancements and practical deployment
+in this field.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>24 pages, 20 tables, 9 figures</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ A Neuro-Symbolic Framework for Answering Graph Pattern Queries in
+  Knowledge Graphs 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2310.04598v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2310.04598v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Tamara Cucumides, Daniel Daza, Pablo Barceló, Michael Cochez, Floris Geerts, Juan L Reutter, Miguel Romero
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  The challenge of answering graph queries over incomplete knowledge graphs is
+gaining significant attention in the machine learning community. Neuro-symbolic
+models have emerged as a promising approach, combining good performance with
+high interpretability. These models utilize trained architectures to execute
+atomic queries and integrate modules that mimic symbolic query operators.
+However, most neuro-symbolic query processors are constrained to tree-like
+graph pattern queries. These queries admit a bottom-up execution with constant
+values or anchors at the leaves and the target variable at the root. While
+expressive, tree-like queries fail to capture critical properties in knowledge
+graphs, such as the existence of multiple edges between entities or the
+presence of triangles. We introduce a framework for answering arbitrary graph
+pattern queries over incomplete knowledge graphs, encompassing both cyclic
+queries and tree-like queries with existentially quantified leaves. These
+classes of queries are vital for practical applications but are beyond the
+scope of most current neuro-symbolic models. Our approach employs an
+approximation scheme that facilitates acyclic traversals for cyclic patterns,
+thereby embedding additional symbolic bias into the query execution process.
+Our experimental evaluation demonstrates that our framework performs
+competitively on three datasets, effectively handling cyclic queries through
+our approximation strategy. Additionally, it maintains the performance of
+existing neuro-symbolic models on anchored tree-like queries and extends their
+capabilities to queries with existentially quantified variables.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Ranking Large Language Models without Ground Truth <span class="chip">ACL 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2402.14860v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2402.14860v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Amit Dhurandhar, Rahul Nair, Moninder Singh, Elizabeth Daly, Karthikeyan Natesan Ramamurthy
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Evaluation and ranking of large language models (LLMs) has become an
+important problem with the proliferation of these models and their impact.
+Evaluation methods either require human responses which are expensive to
+acquire or use pairs of LLMs to evaluate each other which can be unreliable. In
+this paper, we provide a novel perspective where, given a dataset of prompts
+(viz. questions, instructions, etc.) and a set of LLMs, we rank them without
+access to any ground truth or reference responses. Inspired by real life where
+both an expert and a knowledgeable person can identify a novice our main idea
+is to consider triplets of models, where each one of them evaluates the other
+two, correctly identifying the worst model in the triplet with high
+probability. We also analyze our idea and provide sufficient conditions for it
+to succeed. Applying this idea repeatedly, we propose two methods to rank LLMs.
+In experiments on different generative tasks (summarization, multiple-choice,
+and dialog), our methods reliably recover close to true rankings without
+reference data. This points to a viable low-resource mechanism for practical
+use.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted to ACL 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ CR-UTP: Certified Robustness against Universal Text Perturbations on
+  Large Language Models <span class="chip">ACL</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.01873v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.01873v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Qian Lou, Xin Liang, Jiaqi Xue, Yancheng Zhang, Rui Xie, Mengxin Zheng
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  It is imperative to ensure the stability of every prediction made by a
+language model; that is, a language's prediction should remain consistent
+despite minor input variations, like word substitutions. In this paper, we
+investigate the problem of certifying a language model's robustness against
+Universal Text Perturbations (UTPs), which have been widely used in universal
+adversarial attacks and backdoor attacks. Existing certified robustness based
+on random smoothing has shown considerable promise in certifying the
+input-specific text perturbations (ISTPs), operating under the assumption that
+any random alteration of a sample's clean or adversarial words would negate the
+impact of sample-wise perturbations. However, with UTPs, masking only the
+adversarial words can eliminate the attack. A naive method is to simply
+increase the masking ratio and the likelihood of masking attack tokens, but it
+leads to a significant reduction in both certified accuracy and the certified
+radius due to input corruption by extensive masking. To solve this challenge,
+we introduce a novel approach, the superior prompt search method, designed to
+identify a superior prompt that maintains higher certified accuracy under
+extensive masking. Additionally, we theoretically motivate why ensembles are a
+particularly suitable choice as base prompts for random smoothing. The method
+is denoted by superior prompt ensembling technique. We also empirically confirm
+this technique, obtaining state-of-the-art results in multiple settings. These
+methodologies, for the first time, enable high certified accuracy against both
+UTPs and ISTPs. The source code of CR-UTP is available at \url
+{https://github.com/UCFML-Research/CR-UTP}.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted by ACL Findings 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Conformal Validity Guarantees Exist for Any Data Distribution (and How
+  to Find Them) <span class="chip">ICML 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.06627v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.06627v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Drew Prinster, Samuel Stanton, Anqi Liu, Suchi Saria
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  As artificial intelligence (AI) / machine learning (ML) gain widespread
+adoption, practitioners are increasingly seeking means to quantify and control
+the risk these systems incur. This challenge is especially salient when such
+systems have autonomy to collect their own data, such as in black-box
+optimization and active learning, where their actions induce sequential
+feedback-loop shifts in the data distribution. Conformal prediction is a
+promising approach to uncertainty and risk quantification, but prior variants'
+validity guarantees have assumed some form of ``quasi-exchangeability'' on the
+data distribution, thereby excluding many types of sequential shifts. In this
+paper we prove that conformal prediction can theoretically be extended to
+\textit{any} joint data distribution, not just exchangeable or
+quasi-exchangeable ones. Although the most general case is exceedingly
+impractical to compute, for concrete practical applications we outline a
+procedure for deriving specific conformal algorithms for any data distribution,
+and we use this procedure to derive tractable algorithms for a series of
+AI/ML-agent-induced covariate shifts. We evaluate the proposed algorithms
+empirically on synthetic black-box optimization and active learning tasks.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>ICML 2024. Code available at
+  https://github.com/drewprinster/conformal-mfcs</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Error Feedback Can Accurately Compress Preconditioners 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2306.06098v5">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2306.06098v5.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Ionut-Vlad Modoranu, Aleksei Kalinov, Eldar Kurtic, Elias Frantar, Dan Alistarh
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Leveraging second-order information about the loss at the scale of deep
+networks is one of the main lines of approach for improving the performance of
+current optimizers for deep learning. Yet, existing approaches for accurate
+full-matrix preconditioning, such as Full-Matrix Adagrad (GGT) or Matrix-Free
+Approximate Curvature (M-FAC) suffer from massive storage costs when applied
+even to small-scale models, as they must store a sliding window of gradients,
+whose memory requirements are multiplicative in the model dimension. In this
+paper, we address this issue via a novel and efficient error-feedback technique
+that can be applied to compress preconditioners by up to two orders of
+magnitude in practice, without loss of convergence. Specifically, our approach
+compresses the gradient information via sparsification or low-rank compression
+\emph{before} it is fed into the preconditioner, feeding the compression error
+back into future iterations. Experiments on deep neural networks show that this
+approach can compress full-matrix preconditioners to up to 99\% sparsity
+without accuracy loss, effectively removing the memory overhead of full-matrix
+preconditioners such as GGT and M-FAC. Our code is available at
+\url{https://github.com/IST-DASLab/EFCP}.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Entity Matching using Large Language Models 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2310.11244v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2310.11244v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Ralph Peeters, Christian Bizer
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Entity Matching is the task of deciding whether two entity descriptions refer
+to the same real-world entity and is a central step in most data integration
+pipelines. Many state-of-the-art entity matching methods rely on pre-trained
+language models (PLMs) such as BERT or RoBERTa. Two major drawbacks of these
+models for entity matching are that (i) the models require significant amounts
+of task-specific training data and (ii) the fine-tuned models are not robust
+concerning out-of-distribution entities. This paper investigates using
+generative large language models (LLMs) as a less task-specific training
+data-dependent and more robust alternative to PLM-based matchers. Our study
+covers hosted and open-source LLMs, which can be run locally. We evaluate these
+models in a zero-shot scenario and a scenario where task-specific training data
+is available. We compare different prompt designs and the prompt sensitivity of
+the models and show that there is no single best prompt but needs to be tuned
+for each model/dataset combination. We further investigate (i) the selection of
+in-context demonstrations, (ii) the generation of matching rules, as well as
+(iii) fine-tuning a hosted LLM using the same pool of training data. Our
+experiments show that the best LLMs require no or only a few training examples
+to perform similarly to PLMs that were fine-tuned using thousands of examples.
+LLM-based matchers further exhibit higher robustness to unseen entities. We
+show that GPT4 can generate structured explanations for matching decisions. The
+model can automatically identify potential causes of matching errors by
+analyzing explanations of wrong decisions. We demonstrate that the model can
+generate meaningful textual descriptions of the identified error classes, which
+can help data engineers improve entity matching pipelines.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ CoopHash: Cooperative Learning of Multipurpose Descriptor and
+  Contrastive Pair Generator via Variational MCMC Teaching for Supervised Image
+  Hashing 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2210.04288v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2210.04288v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Khoa D. Doan, Jianwen Xie, Yaxuan Zhu, Yang Zhao, Ping Li
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Leveraging supervised information can lead to superior retrieval performance
+in the image hashing domain but the performance degrades significantly without
+enough labeled data. One effective solution to boost performance is to employ
+generative models, such as Generative Adversarial Networks (GANs), to generate
+synthetic data in an image hashing model. However, GAN-based methods are
+difficult to train, which prevents the hashing approaches from jointly training
+the generative models and the hash functions. This limitation results in
+sub-optimal retrieval performance. To overcome this limitation, we propose a
+novel framework, the generative cooperative hashing network, which is based on
+energy-based cooperative learning. This framework jointly learns a powerful
+generative representation of the data and a robust hash function via two
+components: a top-down contrastive pair generator that synthesizes contrastive
+images and a bottom-up multipurpose descriptor that simultaneously represents
+the images from multiple perspectives, including probability density, hash
+code, latent code, and category. The two components are jointly learned via a
+novel likelihood-based cooperative learning scheme. We conduct experiments on
+several real-world datasets and show that the proposed method outperforms the
+competing hashing supervised methods, achieving up to 10\% relative improvement
+over the current state-of-the-art supervised hashing methods, and exhibits a
+significantly better performance in out-of-distribution retrieval.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Robust CLIP: Unsupervised Adversarial Fine-Tuning of Vision Embeddings
+  for Robust Large Vision-Language Models <span class="chip">ICML 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2402.12336v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2402.12336v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Christian Schlarmann, Naman Deep Singh, Francesco Croce, Matthias Hein
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Multi-modal foundation models like OpenFlamingo, LLaVA, and GPT-4 are
+increasingly used for various real-world tasks. Prior work has shown that these
+models are highly vulnerable to adversarial attacks on the vision modality.
+These attacks can be leveraged to spread fake information or defraud users, and
+thus pose a significant risk, which makes the robustness of large multi-modal
+foundation models a pressing problem. The CLIP model, or one of its variants,
+is used as a frozen vision encoder in many large vision-language models
+(LVLMs), e.g. LLaVA and OpenFlamingo. We propose an unsupervised adversarial
+fine-tuning scheme to obtain a robust CLIP vision encoder, which yields
+robustness on all vision down-stream tasks (LVLMs, zero-shot classification)
+that rely on CLIP. In particular, we show that stealth-attacks on users of
+LVLMs by a malicious third party providing manipulated images are no longer
+possible once one replaces the original CLIP model with our robust one. No
+retraining or fine-tuning of the down-stream LVLMs is required. The code and
+robust models are available at https://github.com/chs20/RobustVLM
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>ICML 2024 Oral</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Sampling in Unit Time with Kernel Fisher-Rao Flow <span class="chip">ICML 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2401.03892v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2401.03892v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Aimee Maurais, Youssef Marzouk
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  We introduce a new mean-field ODE and corresponding interacting particle
+systems (IPS) for sampling from an unnormalized target density. The IPS are
+gradient-free, available in closed form, and only require the ability to sample
+from a reference density and compute the (unnormalized) target-to-reference
+density ratio. The mean-field ODE is obtained by solving a Poisson equation for
+a velocity field that transports samples along the geometric mixture of the two
+densities, which is the path of a particular Fisher-Rao gradient flow. We
+employ a RKHS ansatz for the velocity field, which makes the Poisson equation
+tractable and enables discretization of the resulting mean-field ODE over
+finite samples. The mean-field ODE can be additionally be derived from a
+discrete-time perspective as the limit of successive linearizations of the
+Monge-Amp\`ere equations within a framework known as sample-driven optimal
+transport. We introduce a stochastic variant of our approach and demonstrate
+empirically that our IPS can produce high-quality samples from varied target
+distributions, outperforming comparable gradient-free particle systems and
+competitive with gradient-based alternatives.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>To appear at ICML 2024. Updated with additional numerical examples</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ The AI Community Building the Future? A Quantitative Analysis of
+  Development Activity on Hugging Face Hub 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.13058v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.13058v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Cailean Osborne, Jennifer Ding, Hannah Rose Kirk
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Open model developers have emerged as key actors in the political economy of
+artificial intelligence (AI), but we still have a limited understanding of
+collaborative practices in the open AI ecosystem. This paper responds to this
+gap with a three-part quantitative analysis of development activity on the
+Hugging Face (HF) Hub, a popular platform for building, sharing, and
+demonstrating models. First, various types of activity across 348,181 model,
+65,761 dataset, and 156,642 space repositories exhibit right-skewed
+distributions. Activity is extremely imbalanced between repositories; for
+example, over 70% of models have 0 downloads, while 1% account for 99% of
+downloads. Furthermore, licenses matter: there are statistically significant
+differences in collaboration patterns in model repositories with permissive,
+restrictive, and no licenses. Second, we analyse a snapshot of the social
+network structure of collaboration in model repositories, finding that the
+community has a core-periphery structure, with a core of prolific developers
+and a majority of isolate developers (89%). Upon removing the isolate
+developers from the network, collaboration is characterised by high reciprocity
+regardless of developers' network positions. Third, we examine model adoption
+through the lens of model usage in spaces, finding that a minority of models,
+developed by a handful of companies, are widely used on the HF Hub. Overall,
+activity on the HF Hub is characterised by Pareto distributions, congruent with
+OSS development patterns on platforms like GitHub. We conclude with
+recommendations for researchers, companies, and policymakers to advance our
+understanding of open AI development.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>27 pages, 5 figures, 9 tables</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ I-LLM: Efficient Integer-Only Inference for Fully-Quantized Low-Bit
+  Large Language Models 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.17849v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.17849v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Xing Hu, Yuan Cheng, Dawei Yang, Zhihang Yuan, Jiangyong Yu, Chen Xu, Sifan Zhou
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Post-training quantization (PTQ) serves as a potent technique to accelerate
+the inference of large language models (LLMs). Nonetheless, existing works
+still necessitate a considerable number of floating-point (FP) operations
+during inference, including additional quantization and de-quantization, as
+well as non-linear operators such as RMSNorm and Softmax. This limitation
+hinders the deployment of LLMs on the edge and cloud devices. In this paper, we
+identify the primary obstacle to integer-only quantization for LLMs lies in the
+large fluctuation of activations across channels and tokens in both linear and
+non-linear operations. To address this issue, we propose I-LLM, a novel
+integer-only fully-quantized PTQ framework tailored for LLMs. Specifically, (1)
+we develop Fully-Smooth Block-Reconstruction (FSBR) to aggressively smooth
+inter-channel variations of all activations and weights. (2) to alleviate
+degradation caused by inter-token variations, we introduce a novel approach
+called Dynamic Integer-only MatMul (DI-MatMul). This method enables dynamic
+quantization in full-integer matrix multiplication by dynamically quantizing
+the input and outputs with integer-only operations. (3) we design
+DI-ClippedSoftmax, DI-Exp, and DI-Normalization, which utilize bit shift to
+execute non-linear operators efficiently while maintaining accuracy. The
+experiment shows that our I-LLM achieves comparable accuracy to the FP baseline
+and outperforms non-integer quantization methods. For example, I-LLM can
+operate at W4A4 with negligible loss of accuracy. To our knowledge, we are the
+first to bridge the gap between integer-only quantization and LLMs. We've
+published our code on anonymous.4open.science, aiming to contribute to the
+advancement of this field.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Large Language Models Can Infer Psychological Dispositions of Social
+  Media Users 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2309.08631v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2309.08631v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Heinrich Peters, Sandra Matz
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Large Language Models (LLMs) demonstrate increasingly human-like abilities
+across a wide variety of tasks. In this paper, we investigate whether LLMs like
+ChatGPT can accurately infer the psychological dispositions of social media
+users and whether their ability to do so varies across socio-demographic
+groups. Specifically, we test whether GPT-3.5 and GPT-4 can derive the Big Five
+personality traits from users' Facebook status updates in a zero-shot learning
+scenario. Our results show an average correlation of r = .29 (range = [.22,
+.33]) between LLM-inferred and self-reported trait scores - a level of accuracy
+that is similar to that of supervised machine learning models specifically
+trained to infer personality. Our findings also highlight heterogeneity in the
+accuracy of personality inferences across different age groups and gender
+categories: predictions were found to be more accurate for women and younger
+individuals on several traits, suggesting a potential bias stemming from the
+underlying training data or differences in online self-expression. The ability
+of LLMs to infer psychological dispositions from user-generated text has the
+potential to democratize access to cheap and scalable psychometric assessments
+for both researchers and practitioners. On the one hand, this democratization
+might facilitate large-scale research of high ecological validity and spark
+innovation in personalized services. On the other hand, it also raises ethical
+concerns regarding user privacy and self-determination, highlighting the need
+for stringent ethical frameworks and regulation.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Remove that Square Root: A New Efficient Scale-Invariant Version of
+  AdaGrad 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2403.02648v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2403.02648v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Sayantan Choudhury, Nazarii Tupitsa, Nicolas Loizou, Samuel Horvath, Martin Takac, Eduard Gorbunov
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Adaptive methods are extremely popular in machine learning as they make
+learning rate tuning less expensive. This paper introduces a novel optimization
+algorithm named KATE, which presents a scale-invariant adaptation of the
+well-known AdaGrad algorithm. We prove the scale-invariance of KATE for the
+case of Generalized Linear Models. Moreover, for general smooth non-convex
+problems, we establish a convergence rate of $O \left(\frac{\log T}{\sqrt{T}}
+\right)$ for KATE, matching the best-known ones for AdaGrad and Adam. We also
+compare KATE to other state-of-the-art adaptive algorithms Adam and AdaGrad in
+numerical experiments with different problems, including complex machine
+learning tasks like image classification and text classification on real data.
+The results indicate that KATE consistently outperforms AdaGrad and
+matches/surpasses the performance of Adam in all considered scenarios.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>27 pages, 12 figures</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Grokfast: Accelerated Grokking by Amplifying Slow Gradients 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20233v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20233v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Jaerin Lee, Bong Gyun Kang, Kihoon Kim, Kyoung Mu Lee
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  One puzzling artifact in machine learning dubbed grokking is where delayed
+generalization is achieved tenfolds of iterations after near perfect
+overfitting to the training data. Focusing on the long delay itself on behalf
+of machine learning practitioners, our goal is to accelerate generalization of
+a model under grokking phenomenon. By regarding a series of gradients of a
+parameter over training iterations as a random signal over time, we can
+spectrally decompose the parameter trajectories under gradient descent into two
+components: the fast-varying, overfitting-yielding component and the
+slow-varying, generalization-inducing component. This analysis allows us to
+accelerate the grokking phenomenon more than $\times 50$ with only a few lines
+of code that amplifies the slow-varying components of gradients. The
+experiments show that our algorithm applies to diverse tasks involving images,
+languages, and graphs, enabling practical availability of this peculiar
+artifact of sudden generalization. Our code is available at
+https://github.com/ironjr/grokfast.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>17 pages, 13 figures. Typo fixed. Project page:
+  https://jaerinlee.com/research/grokfast</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Active Preference Optimization for Sample Efficient RLHF 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2402.10500v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2402.10500v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Nirjhar Das, Souradip Chakraborty, Aldo Pacchiano, Sayak Ray Chowdhury
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Reinforcement Learning from Human Feedback (RLHF) is pivotal in aligning
+Large Language Models (LLMs) with human preferences. Although aligned
+generative models have shown remarkable abilities in various tasks, their
+reliance on high-quality human preference data creates a costly bottleneck in
+the practical application of RLHF. One primary reason is that current methods
+rely on uniformly picking prompt-generation pairs from a dataset of
+prompt-generations, to collect human feedback, resulting in sub-optimal
+alignment under a constrained budget, which highlights the criticality of
+adaptive strategies in efficient alignment. Recent works [Mehta et al., 2023,
+Muldrew et al., 2024] have tried to address this problem by designing various
+heuristics based on generation uncertainty. However, either the assumptions in
+[Mehta et al., 2023] are restrictive, or [Muldrew et al., 2024] do not provide
+any rigorous theoretical guarantee. To address these, we reformulate RLHF
+within contextual preference bandit framework, treating prompts as contexts,
+and develop an active-learning algorithm, $\textit{Active Preference
+Optimization}$ ($\texttt{APO}$), which enhances model alignment by querying
+preference data from the most important samples, achieving superior performance
+for small sample budget. We analyze the theoretical performance guarantees of
+$\texttt{APO}$ under the BTL preference model showing that the suboptimality
+gap of the policy learned via $\texttt{APO}$ scales as $O(1/\sqrt{T})$ for a
+budget of $T$. We also show that collecting preference data by choosing prompts
+randomly leads to a policy that suffers a constant sub-optimality. We perform
+detailed experimental evaluations on practical preference datasets to validate
+$\texttt{APO}$'s efficacy over the existing methods, establishing it as a
+sample-efficient and practical solution of alignment in a cost-effective and
+scalable manner.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>New experimental results added. Some reorganization</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ All Language Models Large and Small 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2402.12061v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2402.12061v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Zhixun Chen, Yali Du, David Mguni
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Many leading language models (LMs) use high-intensity computational resources
+both during training and execution. This poses the challenge of lowering
+resource costs for deployment and faster execution of decision-making tasks
+among others. We introduce a novel plug-and-play LM framework named Language
+Optimising Network Distribution (LONDI) framework. LONDI learns to selectively
+employ large LMs only where complex decision-making and reasoning are required
+while using low-resource LMs (i.e. LMs require less GPU usage, but may not be
+able to solve the problem alone) everywhere else. LONDI consists of a system of
+two (off-)policy networks, an LM, a large LM (LLM), and a reinforcement
+learning module that uses switching controls to quickly learn which system
+states to call the LLM. We then introduce a variant of LONDI that maintains
+budget constraints on LLM calls and hence its resource usage. Theoretically, we
+prove LONDI learns the subset of system states to activate the LLM required to
+solve the task. We then prove that LONDI converges to optimal solutions while
+also preserving budgetary constraints on LLM calls almost surely enabling it to
+solve various tasks while significantly lowering computational costs. We test
+LONDI's performance in a range of tasks in ScienceWorld and BabyAI-Text and
+demonstrate that LONDI can solve tasks only solvable by resource-intensive LLMs
+while reducing GPU usage by up to 30%.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Interpretability Illusions in the Generalization of Simplified Models <span class="chip">ICML 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2312.03656v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2312.03656v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Dan Friedman, Andrew Lampinen, Lucas Dixon, Danqi Chen, Asma Ghandeharioun
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  A common method to study deep learning systems is to use simplified model
+representations--for example, using singular value decomposition to visualize
+the model's hidden states in a lower dimensional space. This approach assumes
+that the results of these simplifications are faithful to the original model.
+Here, we illustrate an important caveat to this assumption: even if the
+simplified representations can accurately approximate the full model on the
+training set, they may fail to accurately capture the model's behavior out of
+distribution. We illustrate this by training Transformer models on controlled
+datasets with systematic generalization splits, including the Dyck
+balanced-parenthesis languages and a code completion task. We simplify these
+models using tools like dimensionality reduction and clustering, and then
+explicitly test how these simplified proxies match the behavior of the original
+model. We find consistent generalization gaps: cases in which the simplified
+proxies are more faithful to the original model on the in-distribution
+evaluations and less faithful on various tests of systematic generalization.
+This includes cases where the original model generalizes systematically but the
+simplified proxies fail, and cases where the simplified proxies generalize
+better. Together, our results raise questions about the extent to which
+mechanistic interpretations derived using tools like SVD can reliably predict
+what a model will do in novel situations.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>ICML 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ DRED: Zero-Shot Transfer in Reinforcement Learning via Data-Regularised
+  Environment Design <span class="chip">ICML 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2402.03479v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2402.03479v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Samuel Garcin, James Doran, Shangmin Guo, Christopher G. Lucas, Stefano V. Albrecht
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Autonomous agents trained using deep reinforcement learning (RL) often lack
+the ability to successfully generalise to new environments, even when these
+environments share characteristics with the ones they have encountered during
+training. In this work, we investigate how the sampling of individual
+environment instances, or levels, affects the zero-shot generalisation (ZSG)
+ability of RL agents. We discover that, for deep actor-critic architectures
+sharing their base layers, prioritising levels according to their value loss
+minimises the mutual information between the agent's internal representation
+and the set of training levels in the generated training data. This provides a
+novel theoretical justification for the regularisation achieved by certain
+adaptive sampling strategies. We then turn our attention to unsupervised
+environment design (UED) methods, which assume control over level generation.
+We find that existing UED methods can significantly shift the training
+distribution, which translates to low ZSG performance. To prevent both
+overfitting and distributional shift, we introduce data-regularised environment
+design (DRED). DRED generates levels using a generative model trained to
+approximate the ground truth distribution of an initial set of level
+parameters. Through its grounding, DRED achieves significant improvements in
+ZSG over adaptive level sampling strategies and UED methods. Our code and
+experimental data are available at https://github.com/uoe-agents/dred.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>To appear in ICML 2024. A preliminary version of this work
+  (arXiv:2310.03494) was presented at the ALOE workshop, NeurIPS 2023. arXiv
+  admin note: text overlap with arXiv:2310.03494</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Isolated pulsar population synthesis with simulation-based inference 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2312.14848v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2312.14848v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Vanessa Graber, Michele Ronchi, Celsa Pardo-Araujo, Nanda Rea
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  We combine pulsar population synthesis with simulation-based inference (SBI)
+to constrain the magnetorotational properties of isolated Galactic radio
+pulsars. We first develop a framework to model neutron star birth properties
+and their dynamical and magnetorotational evolution. We specifically sample
+initial magnetic field strengths, $B$, and spin periods, $P$, from lognormal
+distributions and capture the late-time magnetic field decay with a power law.
+Each lognormal is described by a mean, $\mu_{\log B}, \mu_{\log P}$, and
+standard deviation, $\sigma_{\log B}, \sigma_{\log P}$, while the power law is
+characterized by the index, $a_{\rm late}$. We subsequently model the stars'
+radio emission and observational biases to mimic detections with three radio
+surveys, and we produce a large database of synthetic $P$--$\dot{P}$ diagrams
+by varying our five magnetorotational input parameters. We then follow an SBI
+approach that focuses on neural posterior estimation and train deep neural
+networks to infer the parameters' posterior distributions. After successfully
+validating these individual neural density estimators on simulated data, we use
+an ensemble of networks to infer the posterior distributions for the observed
+pulsar population. We obtain $\mu_{\log B} = 13.10^{+0.08}_{-0.10}$,
+$\sigma_{\log B} = 0.45^{+0.05}_{-0.05}$ and $\mu_{\log P} =
+-1.00^{+0.26}_{-0.21}$, $\sigma_{\log P} = 0.38^{+0.33}_{-0.18}$ for the
+lognormal distributions and $a_{\rm late} = -1.80^{+0.65}_{-0.61}$ for the
+power law at the $95\%$ credible interval. We contrast our results with
+previous studies and highlight uncertainties of the inferred $a_{\rm late}$
+value. Our approach represents a crucial step toward robust statistical
+inference for complex population synthesis frameworks and forms the basis for
+future multiwavelength analyses of Galactic pulsars.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>31 pages, 16 figures, 5 tables, 2 appendices; published version</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ The Chosen One: Consistent Characters in Text-to-Image Diffusion Models <span class="chip">SIGGRAPH 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2311.10093v4">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2311.10093v4.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Omri Avrahami, Amir Hertz, Yael Vinker, Moab Arar, Shlomi Fruchter, Ohad Fried, Daniel Cohen-Or, Dani Lischinski
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Recent advances in text-to-image generation models have unlocked vast
+potential for visual creativity. However, the users that use these models
+struggle with the generation of consistent characters, a crucial aspect for
+numerous real-world applications such as story visualization, game development,
+asset design, advertising, and more. Current methods typically rely on multiple
+pre-existing images of the target character or involve labor-intensive manual
+processes. In this work, we propose a fully automated solution for consistent
+character generation, with the sole input being a text prompt. We introduce an
+iterative procedure that, at each stage, identifies a coherent set of images
+sharing a similar identity and extracts a more consistent identity from this
+set. Our quantitative analysis demonstrates that our method strikes a better
+balance between prompt alignment and identity consistency compared to the
+baseline methods, and these findings are reinforced by a user study. To
+conclude, we showcase several practical applications of our approach.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted to SIGGRAPH 2024. Project page is available at
+  https://omriavrahami.com/the-chosen-one/</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ cDVGAN: One Flexible Model for Multi-class Gravitational Wave Signal and
+  Glitch Generation 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2401.16356v4">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2401.16356v4.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Tom Dooney, Lyana Curier, Daniel Tan, Melissa Lopez, Chris Van Den Broeck, Stefano Bromuri
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Simulating realistic time-domain observations of gravitational waves (GWs)
+and GW detector glitches can help in advancing GW data analysis. Simulated data
+can be used in downstream tasks by augmenting datasets for signal searches,
+balancing data sets for machine learning, and validating detection schemes. In
+this work, we present Conditional Derivative GAN (cDVGAN), a novel conditional
+model in the Generative Adversarial Network framework for simulating multiple
+classes of time-domain observations that represent gravitational waves (GWs)
+and detector glitches. cDVGAN can also generate generalized hybrid samples that
+span the variation between classes through interpolation in the conditioned
+class vector. cDVGAN introduces an additional player into the typical 2-player
+adversarial game of GANs, where an auxiliary discriminator analyzes the
+first-order derivative time-series. Our results show that this provides
+synthetic data that better captures the features of the original data. cDVGAN
+conditions on three classes, two denoised from LIGO blip and tomte glitch
+events from its 3rd observing run (O3), and the third representing binary black
+hole (BBH) mergers. Our proposed cDVGAN outperforms 4 different baseline GAN
+models in replicating the features of the three classes. Specifically, our
+experiments show that training convolutional neural networks (CNNs) with our
+cDVGAN-generated data improves the detection of samples embedded in detector
+noise beyond the synthetic data from other state-of-the-art GAN models. Our
+best synthetic dataset yields as much as a 4.2% increase in
+area-under-the-curve (AUC) performance compared to synthetic datasets from
+baseline GANs. Moreover, training the CNN with hybrid samples from our cDVGAN
+outperforms CNNs trained only on the standard classes, when identifying real
+samples embedded in LIGO detector background (4% AUC improvement for cDVGAN).
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>20 pages, 17 figures, 5 tables</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Accounting for multiplicity in machine learning benchmark performance 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2303.07272v4">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2303.07272v4.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Kajsa Møllersen, Einar Holsbø
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Machine learning methods are commonly evaluated and compared by their
+performance on data sets from public repositories. This allows for multiple
+methods, oftentimes several thousands, to be evaluated under identical
+conditions and across time. The highest ranked performance on a problem is
+referred to as state-of-the-art (SOTA) performance, and is used, among other
+things, as a reference point for publication of new methods. Using the
+highest-ranked performance as an estimate for SOTA is a biased estimator,
+giving overly optimistic results. The mechanisms at play are those of
+multiplicity, a topic that is well-studied in the context of multiple
+comparisons and multiple testing, but has, as far as the authors are aware of,
+been nearly absent from the discussion regarding SOTA estimates. The optimistic
+state-of-the-art estimate is used as a standard for evaluating new methods, and
+methods with substantial inferior results are easily overlooked. In this
+article, we provide a probability distribution for the case of multiple
+classifiers so that known analyses methods can be engaged and a better SOTA
+estimate can be provided. We demonstrate the impact of multiplicity through a
+simulated example with independent classifiers. We show how classifier
+dependency impacts the variance, but also that the impact is limited when the
+accuracy is high. Finally, we discuss three real-world examples; Kaggle
+competitions that demonstrate various aspects.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Negative impact of heavy-tailed uncertainty and error distributions on
+  the reliability of calibration statistics for machine learning regression
+  tasks 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2402.10043v4">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2402.10043v4.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Pascal Pernot
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Average calibration of the (variance-based) prediction uncertainties of
+machine learning regression tasks can be tested in two ways: one is to estimate
+the calibration error (CE) as the difference between the mean absolute error
+(MSE) and the mean variance (MV); the alternative is to compare the mean
+squared z-scores (ZMS) to 1. The problem is that both approaches might lead to
+different conclusions, as illustrated in this study for an ensemble of datasets
+from the recent machine learning uncertainty quantification (ML-UQ) literature.
+It is shown that the estimation of MV, MSE and their confidence intervals
+becomes unreliable for heavy-tailed uncertainty and error distributions, which
+seems to be a frequent feature of ML-UQ datasets. By contrast, the ZMS
+statistic is less sensitive and offers the most reliable approach in this
+context. Unfortunately, the same problem is expected to affect also conditional
+calibrations statistics, such as the popular ENCE, and very likely post-hoc
+calibration methods based on similar statistics. Several solutions to
+circumvent the outlined problems are proposed.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Recurrent Distance Filtering for Graph Representation Learning <span class="chip">ICML 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2312.01538v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2312.01538v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Yuhui Ding, Antonio Orvieto, Bobby He, Thomas Hofmann
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Graph neural networks based on iterative one-hop message passing have been
+shown to struggle in harnessing the information from distant nodes effectively.
+Conversely, graph transformers allow each node to attend to all other nodes
+directly, but lack graph inductive bias and have to rely on ad-hoc positional
+encoding. In this paper, we propose a new architecture to reconcile these
+challenges. Our approach stems from the recent breakthroughs in long-range
+modeling provided by deep state-space models: for a given target node, our
+model aggregates other nodes by their shortest distances to the target and uses
+a linear RNN to encode the sequence of hop representations. The linear RNN is
+parameterized in a particular diagonal form for stable long-range signal
+propagation and is theoretically expressive enough to encode the neighborhood
+hierarchy. With no need for positional encoding, we empirically show that the
+performance of our model is comparable to or better than that of
+state-of-the-art graph transformers on various benchmarks, with a significantly
+reduced computational cost. Our code is open-source at
+https://github.com/skeletondyh/GRED.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>ICML 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Can Implicit Bias Imply Adversarial Robustness? 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.15942v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.15942v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Hancheng Min, René Vidal
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  The implicit bias of gradient-based training algorithms has been considered
+mostly beneficial as it leads to trained networks that often generalize well.
+However, Frei et al. (2023) show that such implicit bias can harm adversarial
+robustness. Specifically, they show that if the data consists of clusters with
+small inter-cluster correlation, a shallow (two-layer) ReLU network trained by
+gradient flow generalizes well, but it is not robust to adversarial attacks of
+small radius. Moreover, this phenomenon occurs despite the existence of a much
+more robust classifier that can be explicitly constructed from a shallow
+network. In this paper, we extend recent analyses of neuron alignment to show
+that a shallow network with a polynomial ReLU activation (pReLU) trained by
+gradient flow not only generalizes well but is also robust to adversarial
+attacks. Our results highlight the importance of the interplay between data
+structure and architecture design in the implicit bias and robustness of
+trained networks.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>icml 2024 camera-ready</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Linear <span class="highlight-title">Transformer</span>s with Learnable Kernel Functions are Better
+  In-Context Models 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2402.10644v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2402.10644v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Yaroslav Aksenov, Nikita Balagansky, Sofia Maria Lo Cicero Vaina, Boris Shaposhnikov, Alexey Gorbatovski, Daniil Gavrilov
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Advancing the frontier of subquadratic architectures for Language Models
+(LMs) is crucial in the rapidly evolving field of natural language processing.
+Current innovations, including State Space Models, were initially celebrated
+for surpassing Transformer performance on language modeling tasks. However,
+these models have revealed deficiencies in essential In-Context Learning
+capabilities - a domain where the Transformer traditionally shines. The Based
+model emerged as a hybrid solution, blending a Linear Transformer with a kernel
+inspired by the Taylor expansion of exponential functions, augmented by
+convolutional networks. Mirroring the Transformer's in-context adeptness, it
+became a strong contender in the field. In our work, we present a singular,
+elegant alteration to the Based kernel that amplifies its In-Context Learning
+abilities evaluated with the Multi-Query Associative Recall task and overall
+language modeling process, as demonstrated on the Pile dataset.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ The Unreasonable Effectiveness of Easy Training Data for Hard Tasks <span class="chip">ACL 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2401.06751v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2401.06751v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Peter Hase, Mohit Bansal, Peter Clark, Sarah Wiegreffe
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  How can we train models to perform well on hard test data when hard training
+data is by definition difficult to label correctly? This question has been
+termed the scalable oversight problem and has drawn increasing attention as
+language models have continually improved. In this paper, we present the
+surprising conclusion that current pretrained language models often generalize
+relatively well from easy to hard data, even performing as well as oracle
+models finetuned on hard data. We demonstrate this kind of easy-to-hard
+generalization using simple finetuning methods like in-context learning, linear
+classifier heads, and QLoRA for seven different measures of datapoint hardness,
+including six empirically diverse human hardness measures (like grade level)
+and one model-based measure (loss-based). Furthermore, we show that even if one
+cares most about model performance on hard data, it can be better to collect
+easy data rather than hard data for finetuning, since hard data is generally
+noisier and costlier to collect. Our experiments use open models up to 70b in
+size and four publicly available question-answering datasets with questions
+ranging in difficulty from 3rd grade science questions to college level STEM
+questions and general-knowledge trivia. We conclude that easy-to-hard
+generalization in LMs is surprisingly strong for the tasks studied. Our code is
+available at: https://github.com/allenai/easy-to-hard-generalization
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>ACL 2024. 23 pages, 20 figures</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Partial-Label Learning with a Reject Option 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2402.00592v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2402.00592v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Tobias Fuchs, Florian Kalinke, Klemens Böhm
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  In real-world applications, one often encounters ambiguously labeled data,
+where different annotators assign conflicting class labels. Partial-label
+learning allows training classifiers in this weakly supervised setting, where
+state-of-the-art methods already show good predictive performance. However,
+even the best algorithms give incorrect predictions, which can have severe
+consequences when they impact actions or decisions. We propose a novel
+risk-consistent partial-label learning algorithm with a reject option, that is,
+the algorithm can reject unsure predictions. Extensive experiments on
+artificial and real-world datasets show that our method provides the best
+trade-off between the number and accuracy of non-rejected predictions when
+compared to our competitors, which use confidence thresholds for rejecting
+unsure predictions instead. When evaluated without the reject option, our
+nearest neighbor-based approach also achieves competitive prediction
+performance.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ MoMo: Momentum Models for Adaptive Learning Rates 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2305.07583v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2305.07583v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Fabian Schaipp, Ruben Ohana, Michael Eickenberg, Aaron Defazio, Robert M. Gower
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Training a modern machine learning architecture on a new task requires
+extensive learning-rate tuning, which comes at a high computational cost. Here
+we develop new Polyak-type adaptive learning rates that can be used on top of
+any momentum method, and require less tuning to perform well. We first develop
+MoMo, a Momentum Model based adaptive learning rate for SGD-M (stochastic
+gradient descent with momentum). MoMo uses momentum estimates of the losses and
+gradients sampled at each iteration to build a model of the loss function. Our
+model makes use of any known lower bound of the loss function by using
+truncation, e.g. most losses are lower-bounded by zero. The model is then
+approximately minimized at each iteration to compute the next step. We show how
+MoMo can be used in combination with any momentum-based method, and showcase
+this by developing MoMo-Adam, which is Adam with our new model-based adaptive
+learning rate. We show that MoMo attains a $\mathcal{O}(1/\sqrt{K})$
+convergence rate for convex problems with interpolation, needing knowledge of
+no problem-specific quantities other than the optimal value. Additionally, for
+losses with unknown lower bounds, we develop on-the-fly estimates of a lower
+bound, that are incorporated in our model. We show that MoMo and MoMo-Adam
+improve over SGD-M and Adam in terms of robustness to hyperparameter tuning for
+training image classifiers on MNIST, CIFAR, and Imagenet, for recommender
+systems on Criteo, for a transformer model on the translation task IWSLT14, and
+for a diffusion model.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ How do <span class="highlight-title">Transformer</span>s perform In-Context Autoregressive Learning? <span class="chip">ICML 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2402.05787v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2402.05787v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Michael E. Sander, Raja Giryes, Taiji Suzuki, Mathieu Blondel, Gabriel Peyré
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Transformers have achieved state-of-the-art performance in language modeling
+tasks. However, the reasons behind their tremendous success are still unclear.
+In this paper, towards a better understanding, we train a Transformer model on
+a simple next token prediction task, where sequences are generated as a
+first-order autoregressive process $s_{t+1} = W s_t$. We show how a trained
+Transformer predicts the next token by first learning $W$ in-context, then
+applying a prediction mapping. We call the resulting procedure in-context
+autoregressive learning. More precisely, focusing on commuting orthogonal
+matrices $W$, we first show that a trained one-layer linear Transformer
+implements one step of gradient descent for the minimization of an inner
+objective function, when considering augmented tokens. When the tokens are not
+augmented, we characterize the global minima of a one-layer diagonal linear
+multi-head Transformer. Importantly, we exhibit orthogonality between heads and
+show that positional encoding captures trigonometric relations in the data. On
+the experimental side, we consider the general case of non-commuting orthogonal
+matrices and generalize our theoretical findings.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>20 pages ICML 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ BanglaAutoKG: Automatic Bangla Knowledge Graph Construction with
+  Semantic Neural Graph Filtering <span class="chip">LREC</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2404.03528v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2404.03528v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Azmine Toushik Wasi, Taki Hasan Rafi, Raima Islam, Dong-Kyu Chae
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Knowledge Graphs (KGs) have proven essential in information processing and
+reasoning applications because they link related entities and give context-rich
+information, supporting efficient information retrieval and knowledge
+discovery; presenting information flow in a very effective manner. Despite
+being widely used globally, Bangla is relatively underrepresented in KGs due to
+a lack of comprehensive datasets, encoders, NER (named entity recognition)
+models, POS (part-of-speech) taggers, and lemmatizers, hindering efficient
+information processing and reasoning applications in the language. Addressing
+the KG scarcity in Bengali, we propose BanglaAutoKG, a pioneering framework
+that is able to automatically construct Bengali KGs from any Bangla text. We
+utilize multilingual LLMs to understand various languages and correlate
+entities and relations universally. By employing a translation dictionary to
+identify English equivalents and extracting word features from pre-trained BERT
+models, we construct the foundational KG. To reduce noise and align word
+embeddings with our goal, we employ graph-based polynomial filters. Lastly, we
+implement a GNN-based semantic filter, which elevates contextual understanding
+and trims unnecessary edges, culminating in the formation of the definitive KG.
+Empirical findings and case studies demonstrate the universal effectiveness of
+our model, capable of autonomously constructing semantically enriched KGs from
+any text.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>7 pages, 3 figures. Accepted to LREC-COLING 2024. Read in ACL
+  Anthology: https://aclanthology.org/2024.lrec-main.189/</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Experiential Co-Learning of Software-Developing Agents <span class="chip">ACL 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2312.17025v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2312.17025v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Chen Qian, Yufan Dang, Jiahao Li, Wei Liu, Zihao Xie, Yifei Wang, Weize Chen, Cheng Yang, Xin Cong, Xiaoyin Che, Zhiyuan Liu, Maosong Sun
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Recent advancements in large language models (LLMs) have brought significant
+changes to various domains, especially through LLM-driven autonomous agents. A
+representative scenario is in software development, where LLM agents
+demonstrate efficient collaboration, task division, and assurance of software
+quality, markedly reducing the need for manual involvement. However, these
+agents frequently perform a variety of tasks independently, without benefiting
+from past experiences, which leads to repeated mistakes and inefficient
+attempts in multi-step task execution. To this end, we introduce Experiential
+Co-Learning, a novel LLM-agent learning framework in which instructor and
+assistant agents gather shortcut-oriented experiences from their historical
+trajectories and use these past experiences for future task execution. The
+extensive experiments demonstrate that the framework enables agents to tackle
+unseen software-developing tasks more effectively. We anticipate that our
+insights will guide LLM agents towards enhanced autonomy and contribute to
+their evolutionary growth in cooperative learning. The code and data are
+available at https://github.com/OpenBMB/ChatDev.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted to ACL 2024, https://github.com/OpenBMB/ChatDev</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Self-Augmented In-Context Learning for Unsupervised Word Translation <span class="chip">ACL 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2402.10024v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2402.10024v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Yaoyiran Li, Anna Korhonen, Ivan Vulić
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Recent work has shown that, while large language models (LLMs) demonstrate
+strong word translation or bilingual lexicon induction (BLI) capabilities in
+few-shot setups, they still cannot match the performance of 'traditional'
+mapping-based approaches in the unsupervised scenario where no seed translation
+pairs are available, especially for lower-resource languages. To address this
+challenge with LLMs, we propose self-augmented in-context learning (SAIL) for
+unsupervised BLI: starting from a zero-shot prompt, SAIL iteratively induces a
+set of high-confidence word translation pairs for in-context learning (ICL)
+from an LLM, which it then reapplies to the same LLM in the ICL fashion. Our
+method shows substantial gains over zero-shot prompting of LLMs on two
+established BLI benchmarks spanning a wide range of language pairs, also
+outperforming mapping-based baselines across the board. In addition to
+achieving state-of-the-art unsupervised BLI performance, we also conduct
+comprehensive analyses on SAIL and discuss its limitations.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>ACL 2024 Main Conference; 11 Pages, 3 Figures, 9 Tables</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Olfactory Label Prediction on Aroma-Chemical Pairs 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2312.16124v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2312.16124v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Laura Sisson, Aryan Amit Barsainyan, Mrityunjay Sharma, Ritesh Kumar
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  The application of deep learning techniques on aroma-chemicals has resulted
+in models more accurate than human experts at predicting olfactory qualities.
+However, public research in this domain has been limited to predicting the
+qualities of single molecules, whereas in industry applications, perfumers and
+food scientists are often concerned with blends of many molecules. In this
+paper, we apply both existing and novel approaches to a dataset we gathered
+consisting of labeled pairs of molecules. We present graph neural network
+models capable of accurately predicting the odor qualities arising from blends
+of aroma-chemicals, with an analysis of how variations in architecture can lead
+to significant differences in predictive power.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Listenable Maps for Audio Classifiers <span class="chip">ICML 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2403.13086v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2403.13086v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Francesco Paissan, Mirco Ravanelli, Cem Subakan
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Despite the impressive performance of deep learning models across diverse
+tasks, their complexity poses challenges for interpretation. This challenge is
+particularly evident for audio signals, where conveying interpretations becomes
+inherently difficult. To address this issue, we introduce Listenable Maps for
+Audio Classifiers (L-MAC), a posthoc interpretation method that generates
+faithful and listenable interpretations. L-MAC utilizes a decoder on top of a
+pretrained classifier to generate binary masks that highlight relevant portions
+of the input audio. We train the decoder with a loss function that maximizes
+the confidence of the classifier decision on the masked-in portion of the audio
+while minimizing the probability of model output for the masked-out portion.
+Quantitative evaluations on both in-domain and out-of-domain data demonstrate
+that L-MAC consistently produces more faithful interpretations than several
+gradient and masking-based methodologies. Furthermore, a user study confirms
+that, on average, users prefer the interpretations generated by the proposed
+technique.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted to ICML 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ IterMask2: Iterative Unsupervised Anomaly Segmentation via Spatial and
+  Frequency Masking for Brain Lesions in MRI 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.02422v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.02422v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Ziyun Liang, Xiaoqing Guo, J. Alison Noble, Konstantinos Kamnitsas
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Unsupervised anomaly segmentation approaches to pathology segmentation train
+a model on images of healthy subjects, that they define as the 'normal' data
+distribution. At inference, they aim to segment any pathologies in new images
+as 'anomalies', as they exhibit patterns that deviate from those in 'normal'
+training data. Prevailing methods follow the 'corrupt-and-reconstruct'
+paradigm. They intentionally corrupt an input image, reconstruct it to follow
+the learned 'normal' distribution, and subsequently segment anomalies based on
+reconstruction error. Corrupting an input image, however, inevitably leads to
+suboptimal reconstruction even of normal regions, causing false positives. To
+alleviate this, we propose a novel iterative spatial mask-refining strategy
+IterMask2. We iteratively mask areas of the image, reconstruct them, and update
+the mask based on reconstruction error. This iterative process progressively
+adds information about areas that are confidently normal as per the model. The
+increasing content guides reconstruction of nearby masked areas, improving
+reconstruction of normal tissue under these areas, reducing false positives. We
+also use high-frequency image content as an auxiliary input to provide
+additional structural information for masked areas. This further improves
+reconstruction error of normal in comparison to anomalous areas, facilitating
+segmentation of the latter. We conduct experiments on several brain lesion
+datasets and demonstrate effectiveness of our method. Code is available at:
+https://github.com/ZiyunLiang/IterMask2
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Position: Quo Vadis, Unsupervised Time Series Anomaly Detection? <span class="chip">ICML 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.02678v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.02678v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        M. Saquib Sarfraz, Mei-Yen Chen, Lukas Layer, Kunyu Peng, Marios Koulakis
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  The current state of machine learning scholarship in Timeseries Anomaly
+Detection (TAD) is plagued by the persistent use of flawed evaluation metrics,
+inconsistent benchmarking practices, and a lack of proper justification for the
+choices made in novel deep learning-based model designs. Our paper presents a
+critical analysis of the status quo in TAD, revealing the misleading track of
+current research and highlighting problematic methods, and evaluation
+practices. Our position advocates for a shift in focus from solely pursuing
+novel model designs to improving benchmarking practices, creating non-trivial
+datasets, and critically evaluating the utility of complex methods against
+simpler baselines. Our findings demonstrate the need for rigorous evaluation
+protocols, the creation of simple baselines, and the revelation that
+state-of-the-art deep anomaly detection models effectively learn linear
+mappings. These findings suggest the need for more exploration and development
+of simple and interpretable TAD methods. The increment of model complexity in
+the state-of-the-art deep-learning based models unfortunately offers very
+little improvement. We offer insights and suggestions for the field to move
+forward.
+  Code: https://github.com/ssarfraz/QuoVadisTAD
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>ICML 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ <span class="highlight-title">Self-Supervised</span> Interpretable End-to-End Learning via Latent Functional
+  Modularity <span class="chip">ICML 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2403.18947v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2403.18947v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Hyunki Seong, David Hyunchul Shim
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  We introduce MoNet, a novel functionally modular network for self-supervised
+and interpretable end-to-end learning. By leveraging its functional modularity
+with a latent-guided contrastive loss function, MoNet efficiently learns
+task-specific decision-making processes in latent space without requiring
+task-level supervision. Moreover, our method incorporates an online, post-hoc
+explainability approach that enhances the interpretability of end-to-end
+inferences without compromising sensorimotor control performance. In real-world
+indoor environments, MoNet demonstrates effective visual autonomous navigation,
+outperforming baseline models by 7% to 28% in task specificity analysis. We
+further explore the interpretability of our network through post-hoc analysis
+of perceptual saliency maps and latent decision vectors. This provides valuable
+insights into the incorporation of explainable artificial intelligence into
+robotic learning, encompassing both perceptual and behavioral perspectives.
+Supplementary materials are available at
+https://sites.google.com/view/monet-lgc.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>12 pages, 9 figures. Accepted at ICML 2024. Camera-ready version</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ An Information Theoretic Approach to Machine Unlearning 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2402.01401v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2402.01401v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Jack Foster, Kyle Fogarty, Stefan Schoepf, Cengiz Öztireli, Alexandra Brintrup
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  To comply with AI and data regulations, the need to forget private or
+copyrighted information from trained machine learning models is increasingly
+important. The key challenge in unlearning is forgetting the necessary data in
+a timely manner, while preserving model performance. In this work, we address
+the zero-shot unlearning scenario, whereby an unlearning algorithm must be able
+to remove data given only a trained model and the data to be forgotten. We
+explore unlearning from an information theoretic perspective, connecting the
+influence of a sample to the information gain a model receives by observing it.
+From this, we derive a simple but principled zero-shot unlearning method based
+on the geometry of the model. Our approach takes the form of minimising the
+gradient of a learned function with respect to a small neighbourhood around a
+target forget point. This induces a smoothing effect, causing forgetting by
+moving the boundary of the classifier. We explore the intuition behind why this
+approach can jointly unlearn forget samples while preserving general model
+performance through a series of low-dimensional experiments. We perform
+extensive empirical evaluation of our method over a range of contemporary
+benchmarks, verifying that our method is competitive with state-of-the-art
+performance under the strict constraints of zero-shot unlearning.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Updated, new low-dimensional experiments and updated perspective on
+  unlearning from an information theoretic view</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ CKGConv: General Graph Convolution with Continuous Kernels <span class="chip">ICML</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2404.13604v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2404.13604v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Liheng Ma, Soumyasundar Pal, Yitian Zhang, Jiaming Zhou, Yingxue Zhang, Mark Coates
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  The existing definitions of graph convolution, either from spatial or
+spectral perspectives, are inflexible and not unified. Defining a general
+convolution operator in the graph domain is challenging due to the lack of
+canonical coordinates, the presence of irregular structures, and the properties
+of graph symmetries. In this work, we propose a novel and general graph
+convolution framework by parameterizing the kernels as continuous functions of
+pseudo-coordinates derived via graph positional encoding. We name this
+Continuous Kernel Graph Convolution (CKGConv). Theoretically, we demonstrate
+that CKGConv is flexible and expressive. CKGConv encompasses many existing
+graph convolutions, and exhibits a stronger expressiveness, as powerful as
+graph transformers in terms of distinguishing non-isomorphic graphs.
+Empirically, we show that CKGConv-based Networks outperform existing graph
+convolutional networks and perform comparably to the best graph transformers
+across a variety of graph datasets. The code and models are publicly available
+at https://github.com/networkslab/CKGConv.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>On International Conference on Machine Learning (ICML) 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Conditional Wasserstein Distances with Applications in Bayesian OT Flow
+  Matching 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2403.18705v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2403.18705v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Jannis Chemseddine, Paul Hagemann, Gabriele Steidl, Christian Wald
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  In inverse problems, many conditional generative models approximate the
+posterior measure by minimizing a distance between the joint measure and its
+learned approximation. While this approach also controls the distance between
+the posterior measures in the case of the Kullback--Leibler divergence, this is
+in general not hold true for the Wasserstein distance. In this paper, we
+introduce a conditional Wasserstein distance via a set of restricted couplings
+that equals the expected Wasserstein distance of the posteriors. Interestingly,
+the dual formulation of the conditional Wasserstein-1 flow resembles losses in
+the conditional Wasserstein GAN literature in a quite natural way. We derive
+theoretical properties of the conditional Wasserstein distance, characterize
+the corresponding geodesics and velocity fields as well as the flow ODEs.
+Subsequently, we propose to approximate the velocity fields by relaxing the
+conditional Wasserstein distance. Based on this, we propose an extension of OT
+Flow Matching for solving Bayesian inverse problems and demonstrate its
+numerical advantages on an inverse problem and class-conditional image
+generation.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>This paper supersedes arXiv:2310.13433</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Reinforcement Learning for Node Selection in Branch-and-Bound 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2310.00112v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2310.00112v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Alexander Mattick, Christopher Mutschler
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  A big challenge in branch and bound lies in identifying the optimal node
+within the search tree from which to proceed. Current state-of-the-art
+selectors utilize either hand-crafted ensembles that automatically switch
+between naive sub-node selectors, or learned node selectors that rely on
+individual node data. We propose a novel simulation technique that uses
+reinforcement learning (RL) while considering the entire tree state, rather
+than just isolated nodes. To achieve this, we train a graph neural network that
+produces a probability distribution based on the path from the model's root to
+its "to-be-selected" leaves. Modelling node-selection as a probability
+distribution allows us to train the model using state-of-the-art RL techniques
+that capture both intrinsic node-quality and node-evaluation costs. Our method
+induces a high quality node selection policy on a set of varied and complex
+problem sets, despite only being trained on specially designed, synthetic
+travelling salesmen problem (TSP) instances. Using such a fixed pretrained
+policy shows significant improvements on several benchmarks in optimality gap
+reductions and per-node efficiency under strict time constraints.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Optimization without Retraction on the Random Generalized Stiefel
+  Manifold <span class="chip">ICML 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.01702v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.01702v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Simon Vary, Pierre Ablin, Bin Gao, P. -A. Absil
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Optimization over the set of matrices $X$ that satisfy $X^\top B X = I_p$,
+referred to as the generalized Stiefel manifold, appears in many applications
+involving sampled covariance matrices such as the canonical correlation
+analysis (CCA), independent component analysis (ICA), and the generalized
+eigenvalue problem (GEVP). Solving these problems is typically done by
+iterative methods that require a fully formed $B$. We propose a cheap
+stochastic iterative method that solves the optimization problem while having
+access only to a random estimates of $B$. Our method does not enforce the
+constraint in every iteration; instead, it produces iterations that converge to
+critical points on the generalized Stiefel manifold defined in expectation. The
+method has lower per-iteration cost, requires only matrix multiplications, and
+has the same convergence rates as its Riemannian optimization counterparts that
+require the full matrix $B$. Experiments demonstrate its effectiveness in
+various machine learning applications involving generalized orthogonality
+constraints, including CCA, ICA, and the GEVP.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>This v2 is the camera-ready version for ICML 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ AD3: Implicit Action is the Key for World Models to Distinguish the
+  Diverse Visual Distractors 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2403.09976v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2403.09976v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Yucen Wang, Shenghua Wan, Le Gan, Shuai Feng, De-Chuan Zhan
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Model-based methods have significantly contributed to distinguishing
+task-irrelevant distractors for visual control. However, prior research has
+primarily focused on heterogeneous distractors like noisy background videos,
+leaving homogeneous distractors that closely resemble controllable agents
+largely unexplored, which poses significant challenges to existing methods. To
+tackle this problem, we propose Implicit Action Generator (IAG) to learn the
+implicit actions of visual distractors, and present a new algorithm named
+implicit Action-informed Diverse visual Distractors Distinguisher (AD3), that
+leverages the action inferred by IAG to train separated world models. Implicit
+actions effectively capture the behavior of background distractors, aiding in
+distinguishing the task-irrelevant components, and the agent can optimize the
+policy within the task-relevant state space. Our method achieves superior
+performance on various visual control tasks featuring both heterogeneous and
+homogeneous distractors. The indispensable role of implicit actions learned by
+IAG is also empirically validated.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ AROMA: Preserving Spatial Structure for Latent PDE Modeling with Local
+  Neural Fields 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.02176v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.02176v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Louis Serrano, Thomas X Wang, Etienne Le Naour, Jean-Noël Vittaut, Patrick Gallinari
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  We present AROMA (Attentive Reduced Order Model with Attention), a framework
+designed to enhance the modeling of partial differential equations (PDEs) using
+local neural fields. Our flexible encoder-decoder architecture can obtain
+smooth latent representations of spatial physical fields from a variety of data
+types, including irregular-grid inputs and point clouds. This versatility
+eliminates the need for patching and allows efficient processing of diverse
+geometries. The sequential nature of our latent representation can be
+interpreted spatially and permits the use of a conditional transformer for
+modeling the temporal dynamics of PDEs. By employing a diffusion-based
+formulation, we achieve greater stability and enable longer rollouts compared
+to conventional MSE training. AROMA's superior performance in simulating 1D and
+2D equations underscores the efficacy of our approach in capturing complex
+dynamical behaviors.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Convergence of Some Convex Message Passing Algorithms to a Fixed Point <span class="chip">ICML 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2403.07004v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2403.07004v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Vaclav Voracek, Tomas Werner
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  A popular approach to the MAP inference problem in graphical models is to
+minimize an upper bound obtained from a dual linear programming or Lagrangian
+relaxation by (block-)coordinate descent. This is also known as
+convex/convergent message passing; examples are max-sum diffusion and
+sequential tree-reweighted message passing (TRW-S). Convergence properties of
+these methods are currently not fully understood. They have been proved to
+converge to the set characterized by local consistency of active constraints,
+with unknown convergence rate; however, it was not clear if the iterates
+converge at all (to any point). We prove a stronger result (conjectured before
+but never proved): the iterates converge to a fixed point of the method.
+Moreover, we show that the algorithm terminates within
+$\mathcal{O}(1/\varepsilon)$ iterations. We first prove this for a version of
+coordinate descent applied to a general piecewise-affine convex objective. Then
+we show that several convex message passing methods are special cases of this
+method. Finally, we show that a slightly different version of coordinate
+descent can cycle.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>ICML 2024; comments are welcome</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Effects of Exponential Gaussian Distribution on (Double Sampling)
+  Randomized Smoothing <span class="chip">ICML 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.02309v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.02309v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Youwei Shu, Xi Xiao, Derui Wang, Yuxin Cao, Siji Chen, Jason Xue, Linyi Li, Bo Li
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Randomized Smoothing (RS) is currently a scalable certified defense method
+providing robustness certification against adversarial examples. Although
+significant progress has been achieved in providing defenses against $\ell_p$
+adversaries, the interaction between the smoothing distribution and the
+robustness certification still remains vague. In this work, we comprehensively
+study the effect of two families of distributions, named Exponential Standard
+Gaussian (ESG) and Exponential General Gaussian (EGG) distributions, on
+Randomized Smoothing and Double Sampling Randomized Smoothing (DSRS). We derive
+an analytic formula for ESG's certified radius, which converges to the origin
+formula of RS as the dimension $d$ increases. Additionally, we prove that EGG
+can provide tighter constant factors than DSRS in providing $\Omega(\sqrt{d})$
+lower bounds of $\ell_2$ certified radius, and thus further addresses the curse
+of dimensionality in RS. Our experiments on real-world datasets confirm our
+theoretical analysis of the ESG distributions, that they provide almost the
+same certification under different exponents $\eta$ for both RS and DSRS. In
+addition, EGG brings a significant improvement to the DSRS certification, but
+the mechanism can be different when the classifier properties are different.
+Compared to the primitive DSRS, the increase in certified accuracy provided by
+EGG is prominent, up to 6.4% on ImageNet.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>ICML 2024 Poster</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ DoubleML -- An Object-Oriented Implementation of Double Machine Learning
+  in R 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2103.09603v6">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2103.09603v6.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Philipp Bach, Victor Chernozhukov, Malte S. Kurz, Martin Spindler, Sven Klaassen
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  The R package DoubleML implements the double/debiased machine learning
+framework of Chernozhukov et al. (2018). It provides functionalities to
+estimate parameters in causal models based on machine learning methods. The
+double machine learning framework consist of three key ingredients: Neyman
+orthogonality, high-quality machine learning estimation and sample splitting.
+Estimation of nuisance components can be performed by various state-of-the-art
+machine learning methods that are available in the mlr3 ecosystem. DoubleML
+makes it possible to perform inference in a variety of causal models, including
+partially linear and interactive regression models and their extensions to
+instrumental variable estimation. The object-oriented implementation of
+DoubleML enables a high flexibility for the model specification and makes it
+easily extendable. This paper serves as an introduction to the double machine
+learning framework and the R package DoubleML. In reproducible code examples
+with simulated and real data sets, we demonstrate how DoubleML users can
+perform valid inference based on machine learning methods.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>56 pages, 8 Figures, 1 Table; Updated version for DoubleML 1.0.0;
+  Updated version due to changes in R package paradox (for parameter tuning
+  with mlr3)</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Almost exact recovery in noisy semi-supervised learning 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2007.14717v4">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2007.14717v4.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Konstantin Avrachenkov, Maximilien Dreveton
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Graph-based semi-supervised learning methods combine the graph structure and
+labeled data to classify unlabeled data. In this work, we study the effect of a
+noisy oracle on classification. In particular, we derive the Maximum A
+Posteriori (MAP) estimator for clustering a Degree Corrected Stochastic Block
+Model (DC-SBM) when a noisy oracle reveals a fraction of the labels. We then
+propose an algorithm derived from a continuous relaxation of the MAP, and we
+establish its consistency. Numerical experiments show that our approach
+achieves promising performance on synthetic and real data sets, even in the
+case of very noisy labeled data.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ PEARL: Zero-shot Cross-task Preference Alignment and Robust Reward
+  Learning for Robotic Manipulation <span class="chip">ICML 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2306.03615v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2306.03615v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Runze Liu, Yali Du, Fengshuo Bai, Jiafei Lyu, Xiu Li
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  In preference-based Reinforcement Learning (RL), obtaining a large number of
+preference labels are both time-consuming and costly. Furthermore, the queried
+human preferences cannot be utilized for the new tasks. In this paper, we
+propose Zero-shot Cross-task Preference Alignment and Robust Reward Learning
+(PEARL), which learns policies from cross-task preference transfer without any
+human labels of the target task. Our contributions include two novel components
+that facilitate the transfer and learning process. The first is Cross-task
+Preference Alignment (CPA), which transfers the preferences between tasks via
+optimal transport. The key idea of CPA is to use Gromov-Wasserstein distance to
+align the trajectories between tasks, and the solved optimal transport matrix
+serves as the correspondence between trajectories. The target task preferences
+are computed as the weighted sum of source task preference labels with the
+correspondence as weights. Moreover, to ensure robust learning from these
+transferred labels, we introduce Robust Reward Learning (RRL), which considers
+both reward mean and uncertainty by modeling rewards as Gaussian distributions.
+Empirical results on robotic manipulation tasks from Meta-World and Robomimic
+demonstrate that our method is capable of transferring preference labels across
+tasks accurately and then learns well-behaved policies. Notably, our approach
+significantly exceeds existing methods when there are few human preferences.
+The code and videos of our method are available at:
+https://sites.google.com/view/pearl-preference.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted to ICML 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Adaptively Perturbed Mirror Descent for Learning in Games <span class="chip">ICML 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2305.16610v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2305.16610v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Kenshi Abe, Kaito Ariu, Mitsuki Sakamoto, Atsushi Iwasaki
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  This paper proposes a payoff perturbation technique for the Mirror Descent
+(MD) algorithm in games where the gradient of the payoff functions is monotone
+in the strategy profile space, potentially containing additive noise. The
+optimistic family of learning algorithms, exemplified by optimistic MD,
+successfully achieves {\it last-iterate} convergence in scenarios devoid of
+noise, leading the dynamics to a Nash equilibrium. A recent re-emerging trend
+underscores the promise of the perturbation approach, where payoff functions
+are perturbed based on the distance from an anchoring, or {\it slingshot},
+strategy. In response, we propose {\it Adaptively Perturbed MD} (APMD), which
+adjusts the magnitude of the perturbation by repeatedly updating the slingshot
+strategy at a predefined interval. This innovation empowers us to find a Nash
+equilibrium of the underlying game with guaranteed rates. Empirical
+demonstrations affirm that our algorithm exhibits significantly accelerated
+convergence.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted in ICML 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ UniAP: Unifying Inter- and Intra-Layer Automatic Parallelism by Mixed
+  Integer Quadratic Programming 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2307.16375v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2307.16375v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Hao Lin, Ke Wu, Jie Li, Jun Li, Wu-Jun Li
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Distributed learning is commonly used for training deep learning models,
+especially large models. In distributed learning, manual parallelism (MP)
+methods demand considerable human effort and have limited flexibility. Hence,
+automatic parallelism (AP) methods have recently been proposed for automating
+the parallel strategy optimization process. Existing AP methods suffer from
+sub-optimal solutions because they do not jointly optimize the two categories
+of parallel strategies (i.e., inter-layer parallelism and intra-layer
+parallelism). In this paper, we propose a novel AP method called UniAP, which
+unifies inter- and intra-layer automatic parallelism by mixed integer quadratic
+programming. To the best of our knowledge, UniAP is the first parallel method
+that can jointly optimize the two categories of parallel strategies to find an
+optimal solution. Experimental results show that UniAP outperforms
+state-of-the-art methods by up to 3.80$\times$ in throughput and reduces
+strategy optimization time by up to 107$\times$ across five Transformer-based
+models.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>27 pages, 8 figures</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Large Language Model Meets Graph Neural Network in Knowledge
+  Distillation 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2402.05894v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2402.05894v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Shengxiang Hu, Guobing Zou, Song Yang, Yanglan Gan, Bofeng Zhang, Yixin Chen
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  In service-oriented architectures, accurately predicting the Quality of
+Service (QoS) is crucial for maintaining reliability and enhancing user
+satisfaction. However, significant challenges remain due to existing methods
+always overlooking high-order latent collaborative relationships between users
+and services and failing to dynamically adjust feature learning for every
+specific user-service invocation, which are critical for learning accurate
+features. Additionally, reliance on RNNs for capturing QoS evolution hampers
+models' ability to detect long-term trends due to difficulties in managing
+long-range dependencies. To address these challenges, we propose the
+\underline{T}arget-Prompt \underline{O}nline \underline{G}raph
+\underline{C}ollaborative \underline{L}earning (TOGCL) framework for
+temporal-aware QoS prediction. TOGCL leverages a dynamic user-service
+invocation graph to model historical interactions, providing a comprehensive
+representation of user-service relationships. Building on this graph, it
+develops a target-prompt graph attention network to extract online deep latent
+features of users and services at each time slice, simultaneously considering
+implicit collaborative relationships between target users/services and their
+neighbors, as well as relevant historical QoS values. Additionally, a
+multi-layer Transformer encoder is employed to uncover temporal feature
+evolution patterns of users and services, leading to temporal-aware QoS
+prediction. Extensive experiments conducted on the WS-DREAM dataset demonstrate
+that our proposed TOGCL framework significantly outperforms state-of-the-art
+methods across multiple metrics, achieving improvements of up to 38.80\%. These
+results underscore the effectiveness of the TOGCL framework for precise
+temporal QoS prediction.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>14 pages, 6 figures, 4 tables</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ The No Free Lunch Theorem, Kolmogorov Complexity, and the Role of
+  Inductive Biases in Machine Learning <span class="chip">ICML</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2304.05366v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2304.05366v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Micah Goldblum, Marc Finzi, Keefer Rowan, Andrew Gordon Wilson
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  No free lunch theorems for supervised learning state that no learner can
+solve all problems or that all learners achieve exactly the same accuracy on
+average over a uniform distribution on learning problems. Accordingly, these
+theorems are often referenced in support of the notion that individual problems
+require specially tailored inductive biases. While virtually all uniformly
+sampled datasets have high complexity, real-world problems disproportionately
+generate low-complexity data, and we argue that neural network models share
+this same preference, formalized using Kolmogorov complexity. Notably, we show
+that architectures designed for a particular domain, such as computer vision,
+can compress datasets on a variety of seemingly unrelated domains. Our
+experiments show that pre-trained and even randomly initialized language models
+prefer to generate low-complexity sequences. Whereas no free lunch theorems
+seemingly indicate that individual problems require specialized learners, we
+explain how tasks that often require human intervention such as picking an
+appropriately sized model when labeled data is scarce or plentiful can be
+automated into a single learning algorithm. These observations justify the
+trend in deep learning of unifying seemingly disparate problems with an
+increasingly small set of machine learning models.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Published at the International Conference on Machine Learning (ICML)
+  2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Double-I Watermark: Protecting Model Copyright for LLM Fine-tuning 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2402.14883v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2402.14883v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Shen Li, Liuyi Yao, Jinyang Gao, Lan Zhang, Yaliang Li
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  To support various applications, a prevalent and efficient approach for
+business owners is leveraging their valuable datasets to fine-tune a
+pre-trained LLM through the API provided by LLM owners or cloud servers.
+However, this process carries a substantial risk of model misuse, potentially
+resulting in severe economic consequences for business owners. Thus,
+safeguarding the copyright of these customized models during LLM fine-tuning
+has become an urgent practical requirement, but there are limited existing
+solutions to provide such protection. To tackle this pressing issue, we propose
+a novel watermarking approach named ``Double-I watermark''. Specifically, based
+on the instruct-tuning data, two types of backdoor data paradigms are
+introduced with trigger in the instruction and the input, respectively. By
+leveraging LLM's learning capability to incorporate customized backdoor samples
+into the dataset, the proposed approach effectively injects specific
+watermarking information into the customized model during fine-tuning, which
+makes it easy to inject and verify watermarks in commercial scenarios. We
+evaluate the proposed "Double-I watermark" under various fine-tuning methods,
+demonstrating its harmlessness, robustness, uniqueness, imperceptibility, and
+validity through both quantitative and qualitative analyses.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Temporal Graph Rewiring with Expander Graphs 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.02362v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.02362v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Katarina Petrović, Shenyang Huang, Farimah Poursafaei, Petar Veličković
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Evolving relations in real-world networks are often modelled by temporal
+graphs. Graph rewiring techniques have been utilised on Graph Neural Networks
+(GNNs) to improve expressiveness and increase model performance. In this work,
+we propose Temporal Graph Rewiring (TGR), the first approach for graph rewiring
+on temporal graphs. TGR enables communication between temporally distant nodes
+in a continuous time dynamic graph by utilising expander graph propagation to
+construct a message passing highway for message passing between distant nodes.
+Expander graphs are suitable candidates for rewiring as they help overcome the
+oversquashing problem often observed in GNNs. On the public tgbl-wiki
+benchmark, we show that TGR improves the performance of a widely used TGN model
+by a significant margin. Our code repository is accessible at
+https://github.com/kpetrovicc/TGR.git .
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>10 pages, 2 figures</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Disentanglement Learning via Topology 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2308.12696v4">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2308.12696v4.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Nikita Balabin, Daria Voronkova, Ilya Trofimov, Evgeny Burnaev, Serguei Barannikov
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  We propose TopDis (Topological Disentanglement), a method for learning
+disentangled representations via adding a multi-scale topological loss term.
+Disentanglement is a crucial property of data representations substantial for
+the explainability and robustness of deep learning models and a step towards
+high-level cognition. The state-of-the-art methods are based on VAE and
+encourage the joint distribution of latent variables to be factorized. We take
+a different perspective on disentanglement by analyzing topological properties
+of data manifolds. In particular, we optimize the topological similarity for
+data manifolds traversals. To the best of our knowledge, our paper is the first
+one to propose a differentiable topological loss for disentanglement learning.
+Our experiments have shown that the proposed TopDis loss improves
+disentanglement scores such as MIG, FactorVAE score, SAP score, and DCI
+disentanglement score with respect to state-of-the-art results while preserving
+the reconstruction quality. Our method works in an unsupervised manner,
+permitting us to apply it to problems without labeled factors of variation. The
+TopDis loss works even when factors of variation are correlated. Additionally,
+we show how to use the proposed topological loss to find disentangled
+directions in a trained GAN.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ TruthX: Alleviating Hallucinations by Editing Large Language Models in
+  Truthful Space <span class="chip">ACL 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2402.17811v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2402.17811v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Shaolei Zhang, Tian Yu, Yang Feng
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Large Language Models (LLMs) sometimes suffer from producing hallucinations,
+especially LLMs may generate untruthful responses despite knowing the correct
+knowledge. Activating the truthfulness within LLM is the key to fully unlocking
+LLM's knowledge potential. In this paper, we propose TruthX, an inference-time
+intervention method to activate the truthfulness of LLM by identifying and
+editing the features within LLM's internal representations that govern the
+truthfulness. TruthX employs an auto-encoder to map LLM's representations into
+semantic and truthful latent spaces respectively, and applies contrastive
+learning to identify a truthful editing direction within the truthful space.
+During inference, by editing LLM's internal representations in truthful space,
+TruthX effectively enhances the truthfulness of LLM. Experiments show that
+TruthX improves the truthfulness of 13 advanced LLMs by an average of 20% on
+TruthfulQA benchmark. Further analyses suggest that TruthX can control LLM to
+produce truthful or hallucinatory responses via editing only one vector in
+LLM's internal representations.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted to ACL 2024 main conference, Project Page:
+  https://ictnlp.github.io/TruthX-site/</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ The Copycat Perceptron: Smashing Barriers Through Collective Learning 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2308.03743v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2308.03743v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Giovanni Catania, Aurélien Decelle, Beatriz Seoane
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  We characterize the equilibrium properties of a model of $y$ coupled binary
+perceptrons in the teacher-student scenario, subject to a suitable cost
+function, with an explicit ferromagnetic coupling proportional to the Hamming
+distance between the students' weights. In contrast to recent works, we analyze
+a more general setting in which thermal noise is present that affects each
+student's generalization performance. In the nonzero temperature regime, we
+find that the coupling of replicas leads to a bend of the phase diagram towards
+smaller values of $\alpha$: This suggests that the free entropy landscape gets
+smoother around the solution with perfect generalization (i.e., the teacher) at
+a fixed fraction of examples, allowing standard thermal updating algorithms
+such as Simulated Annealing to easily reach the teacher solution and avoid
+getting trapped in metastable states as it happens in the unreplicated case,
+even in the computationally \textit{easy} regime of the inference phase
+diagram. These results provide additional analytic and numerical evidence for
+the recently conjectured Bayes-optimal property of Replicated Simulated
+Annealing (RSA) for a sufficient number of replicas. From a learning
+perspective, these results also suggest that multiple students working together
+(in this case reviewing the same data) are able to learn the same rule both
+significantly faster and with fewer examples, a property that could be
+exploited in the context of cooperative and federated learning.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>2 figures in the main, 5 figures in the appendix</span>
+                                        </div>
+                                </details>
+                            </article>
+                    </div>
+                </details>
+            </article>
+            <article>
+                <details>
+                    <Summary>
+                        Multimedia <span class="chip" style="font-size: 60%">8</span>
+                    </Summary>
+                    <div class="details-content">
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Save It for the "Hot" Day: An LLM-Empowered Visual Analytics System for
+  Heat Risk Management 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.03317v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.03317v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Haobo Li, Wong Kam-Kwai, Yan Luo, Juntong Chen, Chengzhong Liu, Yaxuan Zhang, Alexis Kai Hon Lau, Huamin Qu, Dongyu Liu
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  The escalating frequency and intensity of heat-related climate events,
+particularly heatwaves, emphasize the pressing need for advanced heat risk
+management strategies. Current approaches, primarily relying on numerical
+models, face challenges in spatial-temporal resolution and in capturing the
+dynamic interplay of environmental, social, and behavioral factors affecting
+heat risks. This has led to difficulties in translating risk assessments into
+effective mitigation actions. Recognizing these problems, we introduce a novel
+approach leveraging the burgeoning capabilities of Large Language Models (LLMs)
+to extract rich and contextual insights from news reports. We hence propose an
+LLM-empowered visual analytics system, Havior, that integrates the precise,
+data-driven insights of numerical models with nuanced news report information.
+This hybrid approach enables a more comprehensive assessment of heat risks and
+better identification, assessment, and mitigation of heat-related threats. The
+system incorporates novel visualization designs, such as "thermoglyph" and news
+glyph, enhancing intuitive understanding and analysis of heat risks. The
+integration of LLM-based techniques also enables advanced information retrieval
+and semantic knowledge extraction that can be guided by experts' analytics
+needs. Our case studies on two cities that faced significant heatwave events
+and interviews with five experts have demonstrated the usefulness of our system
+in providing in-depth and actionable insights for heat risk management.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Globally and Locally Optimized Pannini Projection for High FoV Rendering
+  of 360-degree Images 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.03282v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.03282v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Falah Jabar, Joao Ascenso, Maria Paula Queluz
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  To render a spherical (360 degree or omnidirectional) image on planar
+displays, a 2D image -- called as viewport -- must be obtained by projecting a
+sphere region on a plane, according to the users viewing direction and a
+predefined field of view (FoV). However, any sphere to plan projection
+introduces geometric distortions, such as object stretching and/or bending of
+straight lines, which intensity increases with the considered FoV. In this
+paper, a fully automatic content-aware projection is proposed, aiming to reduce
+the geometric distortions when high FoVs are used. This new projection is based
+on the Pannini projection, whose parameters are firstly globally optimized
+according to the image content, followed by a local conformality improvement of
+relevant viewport objects. A crowdsourcing subjective test showed that the
+proposed projection is the most preferred solution among the considered
+state-of-the-art sphere to plan projections, producing viewports with a more
+pleasant visual quality.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>15 pages, 12 figures, to be published in Signal Processing: Image
+  Communication</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Exploiting LMM-based knowledge for image classification tasks <span class="chip">AAAI 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.03071v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.03071v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Maria Tzelepi, Vasileios Mezaris
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  In this paper we address image classification tasks leveraging knowledge
+encoded in Large Multimodal Models (LMMs). More specifically, we use the
+MiniGPT-4 model to extract semantic descriptions for the images, in a
+multimodal prompting fashion. In the current literature, vision language models
+such as CLIP, among other approaches, are utilized as feature extractors, using
+only the image encoder, for solving image classification tasks. In this paper,
+we propose to additionally use the text encoder to obtain the text embeddings
+corresponding to the MiniGPT-4-generated semantic descriptions. Thus, we use
+both the image and text embeddings for solving the image classification task.
+The experimental evaluation on three datasets validates the improved
+classification performance achieved by exploiting LMM-based knowledge.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted for publication, 25th Int. Conf. on Engineering Applications
+  of Neural Networks (EANN/EAAAI 2024), Corfu, Greece, June 2024. This is the
+  "submitted manuscript"</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ A Human-Annotated Video <span class="highlight-title">Dataset</span> for Training and Evaluation of
+  360-Degree Video Summarization Methods 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.02991v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.02991v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Ioannis Kontostathis, Evlampios Apostolidis, Vasileios Mezaris
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  In this paper we introduce a new dataset for 360-degree video summarization:
+the transformation of 360-degree video content to concise 2D-video summaries
+that can be consumed via traditional devices, such as TV sets and smartphones.
+The dataset includes ground-truth human-generated summaries, that can be used
+for training and objectively evaluating 360-degree video summarization methods.
+Using this dataset, we train and assess two state-of-the-art summarization
+methods that were originally proposed for 2D-video summarization, to serve as a
+baseline for future comparisons with summarization methods that are
+specifically tailored to 360-degree video. Finally, we present an interactive
+tool that was developed to facilitate the data annotation process and can
+assist other annotation activities that rely on video fragment selection.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted for publication, 1st Int. Workshop on Video for Immersive
+  Experiences (Video4IMX-2024) at ACM IMX 2024, Stockholm, Sweden, June 2024.
+  This is the "accepted version"</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ AVFF: Audio-Visual Feature Fusion for Video Deepfake Detection <span class="chip">CVPR 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.02951v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.02951v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Trevine Oorloff, Surya Koppisetti, Nicolò Bonettini, Divyaraj Solanki, Ben Colman, Yaser Yacoob, Ali Shahriyari, Gaurav Bharaj
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  With the rapid growth in deepfake video content, we require improved and
+generalizable methods to detect them. Most existing detection methods either
+use uni-modal cues or rely on supervised training to capture the dissonance
+between the audio and visual modalities. While the former disregards the
+audio-visual correspondences entirely, the latter predominantly focuses on
+discerning audio-visual cues within the training corpus, thereby potentially
+overlooking correspondences that can help detect unseen deepfakes. We present
+Audio-Visual Feature Fusion (AVFF), a two-stage cross-modal learning method
+that explicitly captures the correspondence between the audio and visual
+modalities for improved deepfake detection. The first stage pursues
+representation learning via self-supervision on real videos to capture the
+intrinsic audio-visual correspondences. To extract rich cross-modal
+representations, we use contrastive learning and autoencoding objectives, and
+introduce a novel audio-visual complementary masking and feature fusion
+strategy. The learned representations are tuned in the second stage, where
+deepfake classification is pursued via supervised learning on both real and
+fake videos. Extensive experiments and analysis suggest that our novel
+representation learning paradigm is highly discriminative in nature. We report
+98.6% accuracy and 99.1% AUC on the FakeAVCeleb dataset, outperforming the
+current audio-visual state-of-the-art by 14.9% and 9.9%, respectively.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted to CVPR 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Once-for-All: Controllable Generative Image Compression with Dynamic
+  Granularity Adaption 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.00758v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.00758v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Anqi Li, Yuxi Liu, Huihui Bai, Feng Li, Runmin Cong, Meng Wang, Yao Zhao
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Although recent generative image compression methods have demonstrated
+impressive potential in optimizing the rate-distortion-perception trade-off,
+they still face the critical challenge of flexible rate adaption to diverse
+compression necessities and scenarios. To overcome this challenge, this paper
+proposes a Controllable Generative Image Compression framework, Control-GIC,
+the first capable of fine-grained bitrate adaption across a broad spectrum
+while ensuring high-fidelity and generality compression. We base Control-GIC on
+a VQGAN framework representing an image as a sequence of variable-length codes
+(i.e. VQ-indices), which can be losslessly compressed and exhibits a direct
+positive correlation with the bitrates. Therefore, drawing inspiration from the
+classical coding principle, we naturally correlate the information density of
+local image patches with their granular representations, to achieve dynamic
+adjustment of the code quantity following different granularity decisions. This
+implies we can flexibly determine a proper allocation of granularity for the
+patches to acquire desirable compression rates. We further develop a
+probabilistic conditional decoder that can trace back to historic encoded
+multi-granularity representations according to transmitted codes, and then
+reconstruct hierarchical granular features in the formalization of conditional
+probability, enabling more informative aggregation to improve reconstruction
+realism. Our experiments show that Control-GIC allows highly flexible and
+controllable bitrate adaption and even once compression on an entire dataset to
+fulfill constrained bitrate conditions. Experimental results demonstrate its
+superior performance over recent state-of-the-art methods.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Mitigating Hallucinations in Large Vision-Language Models with
+  Instruction Contrastive Decoding <span class="chip">ACL 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2403.18715v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2403.18715v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Xintong Wang, Jingheng Pan, Liang Ding, Chris Biemann
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Large Vision-Language Models (LVLMs) are increasingly adept at generating
+contextually detailed and coherent responses from visual inputs. However, their
+application in multimodal decision-making and open-ended generation is hindered
+by a notable rate of hallucinations, where generated text inaccurately
+represents the visual contents. To address this issue, this paper introduces
+the Instruction Contrastive Decoding (ICD) method, a novel approach designed to
+reduce hallucinations during LVLM inference. Our method is inspired by our
+observation that what we call disturbance instructions significantly exacerbate
+hallucinations in multimodal fusion modules. ICD contrasts distributions from
+standard and instruction disturbance, thereby increasing alignment uncertainty
+and effectively subtracting hallucinated concepts from the original
+distribution. Through comprehensive experiments on discriminative benchmarks
+(POPE and MME) and a generative benchmark (LLaVa-Bench), we demonstrate that
+ICD significantly mitigates both object-level and attribute-level
+hallucinations. Moreover, our method not only addresses hallucinations but also
+significantly enhances the general perception and recognition capabilities of
+LVLMs.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted to Findings of ACL 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Not All Attention is Needed: Parameter and Computation Efficient
+  Transfer Learning for Multi-modal Large Language Models 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2403.15226v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2403.15226v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Qiong Wu, Weihao Ye, Yiyi Zhou, Xiaoshuai Sun, Rongrong Ji
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  In this paper, we propose a novel parameter and computation efficient tuning
+method for Multi-modal Large Language Models (MLLMs), termed Efficient
+Attention Skipping (EAS). Concretely, we first reveal that multi-head
+attentions (MHAs), the main computational overhead of MLLMs, are often
+redundant to downstream tasks. Based on this observation, EAS evaluates the
+attention redundancy and skips the less important MHAs to speed up inference.
+Besides, we also propose a novel propagation-of-information adapter (PIA) to
+serve the attention skipping of EAS and keep parameter efficiency, which can be
+further re-parameterized into feed-forward networks (FFNs) for zero-extra
+latency. To validate EAS, we apply it to a recently proposed MLLM called LaVIN
+and a classic VL pre-trained model called METER, and conduct extensive
+experiments on a set of benchmarks. The experiments show that EAS not only
+retains high performance and parameter efficiency, but also greatly speeds up
+inference speed. For instance, LaVIN-EAS can obtain 89.98\% accuracy on
+ScineceQA while speeding up inference by 2.2 times to LaVIN
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                    </div>
+                </details>
+            </article>
+    </section>
+    <section class="day-container">
+        <div class="date">
+            <time datetime="2024-06-04T00:00:00Z">2024-06-04</time>
+        </div>
+            <article>
+                <details>
+                    <Summary>
+                        Computation and Language <span class="chip" style="font-size: 60%">10</span>
+                    </Summary>
+                    <div class="details-content">
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ To Believe or Not to Believe Your LLM 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.02543v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.02543v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Yasin Abbasi Yadkori, Ilja Kuzborskij, András György, Csaba Szepesvári
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  We explore uncertainty quantification in large language models (LLMs), with
+the goal to identify when uncertainty in responses given a query is large. We
+simultaneously consider both epistemic and aleatoric uncertainties, where the
+former comes from the lack of knowledge about the ground truth (such as about
+facts or the language), and the latter comes from irreducible randomness (such
+as multiple possible answers). In particular, we derive an
+information-theoretic metric that allows to reliably detect when only epistemic
+uncertainty is large, in which case the output of the model is unreliable. This
+condition can be computed based solely on the output of the model obtained
+simply by some special iterative prompting based on the previous responses.
+Such quantification, for instance, allows to detect hallucinations (cases when
+epistemic uncertainty is high) in both single- and multi-answer responses. This
+is in contrast to many standard uncertainty quantification strategies (such as
+thresholding the log-likelihood of a response) where hallucinations in the
+multi-answer case cannot be detected. We conduct a series of experiments which
+demonstrate the advantage of our formulation. Further, our investigations shed
+some light on how the probabilities assigned to a given output by an LLM can be
+amplified by iterative prompting, which might be of independent interest.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Parrot: Multilingual Visual Instruction Tuning 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.02539v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.02539v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Hai-Long Sun, Da-Wei Zhou, Yang Li, Shiyin Lu, Chao Yi, Qing-Guo Chen, Zhao Xu, Weihua Luo, Kaifu Zhang, De-Chuan Zhan, Han-Jia Ye
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  The rapid development of Multimodal Large Language Models (MLLMs) like GPT-4V
+has marked a significant step towards artificial general intelligence. Existing
+methods mainly focus on aligning vision encoders with LLMs through supervised
+fine-tuning (SFT) to endow LLMs with multimodal abilities, making MLLMs'
+inherent ability to react to multiple languages progressively deteriorate as
+the training process evolves. We empirically find that the imbalanced SFT
+datasets, primarily composed of English-centric image-text pairs, lead to
+significantly reduced performance in non-English languages. This is due to the
+failure of aligning the vision encoder and LLM with multilingual tokens during
+the SFT process. In this paper, we introduce Parrot, a novel method that
+utilizes textual guidance to drive visual token alignment at the language
+level. Parrot makes the visual tokens condition on diverse language inputs and
+uses Mixture-of-Experts (MoE) to promote the alignment of multilingual tokens.
+Specifically, to enhance non-English visual tokens alignment, we compute the
+cross-attention using the initial visual features and textual embeddings, the
+result of which is then fed into the MoE router to select the most relevant
+experts. The selected experts subsequently convert the initial visual tokens
+into language-specific visual tokens. Moreover, considering the current lack of
+benchmarks for evaluating multilingual capabilities within the field, we
+collect and make available a Massive Multilingual Multimodal Benchmark which
+includes 6 languages, 15 categories, and 12,000 questions, named as MMMB. Our
+method not only demonstrates state-of-the-art performance on multilingual
+MMBench and MMMB, but also excels across a broad range of multimodal tasks.
+Both the source code and the training dataset of Parrot will be made publicly
+available.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ TopViewRS: Vision-Language Models as Top-View Spatial Reasoners 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.02537v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.02537v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Chengzu Li, Caiqi Zhang, Han Zhou, Nigel Collier, Anna Korhonen, Ivan Vulić
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Top-view perspective denotes a typical way in which humans read and reason
+over different types of maps, and it is vital for localization and navigation
+of humans as well as of `non-human' agents, such as the ones backed by large
+Vision-Language Models (VLMs). Nonetheless, spatial reasoning capabilities of
+modern VLMs remain unattested and underexplored. In this work, we thus study
+their capability to understand and reason over spatial relations from the top
+view. The focus on top view also enables controlled evaluations at different
+granularity of spatial reasoning; we clearly disentangle different abilities
+(e.g., recognizing particular objects versus understanding their relative
+positions). We introduce the TopViewRS (Top-View Reasoning in Space) dataset,
+consisting of 11,384 multiple-choice questions with either realistic or
+semantic top-view map as visual input. We then use it to study and evaluate
+VLMs across 4 perception and reasoning tasks with different levels of
+complexity. Evaluation of 10 representative open- and closed-source VLMs
+reveals the gap of more than 50% compared to average human performance, and it
+is even lower than the random baseline in some cases. Although additional
+experiments show that Chain-of-Thought reasoning can boost model capabilities
+by 5.82% on average, the overall performance of VLMs remains limited. Our
+findings underscore the critical need for enhanced model capability in top-view
+spatial reasoning and set a foundation for further research towards human-level
+proficiency of VLMs in real-world multimodal tasks.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>9 pages, 3 figures, 3 tables (21 pages, 4 figures, 15 tables
+  including references and appendices)</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Mitigate Position Bias in Large Language Models via Scaling a Single
+  Dimension 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.02536v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.02536v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Yijiong Yu, Huiqiang Jiang, Xufang Luo, Qianhui Wu, Chin-Yew Lin, Dongsheng Li, Yuqing Yang, Yongfeng Huang, Lili Qiu
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Large Language Models (LLMs) are increasingly applied in various real-world
+scenarios due to their excellent generalization capabilities and robust
+generative abilities. However, they exhibit position bias, also known as "lost
+in the middle", a phenomenon that is especially pronounced in long-context
+scenarios, which indicates the placement of the key information in different
+positions of a prompt can significantly affect accuracy. This paper first
+explores the micro-level manifestations of position bias, concluding that
+attention weights are a micro-level expression of position bias. It further
+identifies that, in addition to position embeddings, causal attention mask also
+contributes to position bias by creating position-specific hidden states. Based
+on these insights, we propose a method to mitigate position bias by scaling
+this positional hidden states. Experiments on the NaturalQuestions
+Multi-document QA, KV retrieval, LongBench and timeline reorder tasks, using
+various models including RoPE models, context windowextended models, and Alibi
+models, demonstrate the effectiveness and generalizability of our approach. Our
+method can improve performance by up to 15.2% by modifying just one dimension
+of hidden states. Our code is available at https://aka.ms/PositionalHidden.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ SpecExec: Massively Parallel Speculative Decoding for Interactive LLM
+  Inference on Consumer Devices 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.02532v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.02532v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Ruslan Svirschevski, Avner May, Zhuoming Chen, Beidi Chen, Zhihao Jia, Max Ryabinin
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  As large language models gain widespread adoption, running them efficiently
+becomes crucial. Recent works on LLM inference use speculative decoding to
+achieve extreme speedups. However, most of these works implicitly design their
+algorithms for high-end datacenter hardware. In this work, we ask the opposite
+question: how fast can we run LLMs on consumer machines? Consumer GPUs can no
+longer fit the largest available models (50B+ parameters) and must offload them
+to RAM or SSD. When running with offloaded parameters, the inference engine can
+process batches of hundreds or thousands of tokens at the same time as just one
+token, making it a natural fit for speculative decoding. We propose SpecExec
+(Speculative Execution), a simple parallel decoding method that can generate up
+to 20 tokens per target model iteration for popular LLM families. It utilizes
+the high spikiness of the token probabilities distribution in modern LLMs and a
+high degree of alignment between model output probabilities. SpecExec takes the
+most probable tokens continuation from the draft model to build a "cache" tree
+for the target model, which then gets validated in a single pass. Using
+SpecExec, we demonstrate inference of 50B+ parameter LLMs on consumer GPUs with
+RAM offloading at 4-6 tokens per second with 4-bit quantization or 2-3 tokens
+per second with 16-bit weights.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>preprint. arXiv admin note: text overlap with arXiv:2312.17238 by
+  other authors</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Scalable MatMul-free Language Modeling 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.02528v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.02528v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Rui-Jie Zhu, Yu Zhang, Ethan Sifferman, Tyler Sheaves, Yiqiao Wang, Dustin Richmond, Peng Zhou, Jason K. Eshraghian
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Matrix multiplication (MatMul) typically dominates the overall computational
+cost of large language models (LLMs). This cost only grows as LLMs scale to
+larger embedding dimensions and context lengths. In this work, we show that
+MatMul operations can be completely eliminated from LLMs while maintaining
+strong performance at billion-parameter scales. Our experiments show that our
+proposed MatMul-free models achieve performance on-par with state-of-the-art
+Transformers that require far more memory during inference at a scale up to at
+least 2.7B parameters. We investigate the scaling laws and find that the
+performance gap between our MatMul-free models and full precision Transformers
+narrows as the model size increases. We also provide a GPU-efficient
+implementation of this model which reduces memory usage by up to 61% over an
+unoptimized baseline during training. By utilizing an optimized kernel during
+inference, our model's memory consumption can be reduced by more than 10x
+compared to unoptimized models. To properly quantify the efficiency of our
+architecture, we build a custom hardware solution on an FPGA which exploits
+lightweight operations beyond what GPUs are capable of. We processed
+billion-parameter scale models at 13W beyond human readable throughput, moving
+LLMs closer to brain-like efficiency. This work not only shows how far LLMs can
+be stripped back while still performing effectively, but also points at the
+types of operations future accelerators should be optimized for in processing
+the next generation of lightweight LLMs. Our code implementation is available
+at \url{https://github.com/ridgerchu/matmulfreellm}.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ CheckEmbed: Effective Verification of LLM Solutions to Open-Ended Tasks 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.02524v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.02524v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Maciej Besta, Lorenzo Paleari, Ales Kubicek, Piotr Nyczyk, Robert Gerstenberger, Patrick Iff, Tomasz Lehmann, Hubert Niewiadomski, Torsten Hoefler
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Large Language Models (LLMs) are revolutionizing various domains, yet
+verifying their answers remains a significant challenge, especially for
+intricate open-ended tasks such as consolidation, summarization, and extraction
+of knowledge. In this work, we propose CheckEmbed: an accurate, scalable, and
+simple LLM verification approach. CheckEmbed is driven by a straightforward yet
+powerful idea: in order to compare LLM solutions to one another or to the
+ground-truth, compare their corresponding answer-level embeddings obtained with
+a model such as GPT Text Embedding Large. This reduces a complex textual answer
+to a single embedding, facilitating straightforward, fast, and meaningful
+verification. We develop a comprehensive verification pipeline implementing the
+CheckEmbed methodology. The CheckEmbed pipeline also comes with metrics for
+assessing the truthfulness of the LLM answers, such as embedding heatmaps and
+their summaries. We show how to use these metrics for deploying practical
+engines that decide whether an LLM answer is satisfactory or not. We apply the
+pipeline to real-world document analysis tasks, including term extraction and
+document summarization, showcasing significant improvements in accuracy,
+cost-effectiveness, and runtime performance compared to existing token-,
+sentence-, and fact-level schemes such as BERTScore or SelfCheckGPT.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Deterministic Reversible Data Augmentation for Neural Machine
+  Translation <span class="chip">ACL 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.02517v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.02517v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Jiashu Yao, Heyan Huang, Zeming Liu, Yuhang Guo
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Data augmentation is an effective way to diversify corpora in machine
+translation, but previous methods may introduce semantic inconsistency between
+original and augmented data because of irreversible operations and random
+subword sampling procedures. To generate both symbolically diverse and
+semantically consistent augmentation data, we propose Deterministic Reversible
+Data Augmentation (DRDA), a simple but effective data augmentation method for
+neural machine translation. DRDA adopts deterministic segmentations and
+reversible operations to generate multi-granularity subword representations and
+pulls them closer together with multi-view techniques. With no extra corpora or
+model changes required, DRDA outperforms strong baselines on several
+translation tasks with a clear margin (up to 4.3 BLEU gain over Transformer)
+and exhibits good robustness in noisy, low-resource, and cross-domain datasets.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Findings of ACL 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Large Language Models Spot Phishing Emails with Surprising Accuracy: A
+  Comparative Analysis of Performance 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2404.15485v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2404.15485v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Het Patel, Umair Rehman, Farkhund Iqbal
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Phishing, a prevalent cybercrime tactic for decades, remains a significant
+threat in today's digital world. By leveraging clever social engineering
+elements and modern technology, cybercrime targets many individuals,
+businesses, and organizations to exploit trust and security. These
+cyber-attackers are often disguised in many trustworthy forms to appear as
+legitimate sources. By cleverly using psychological elements like urgency,
+fear, social proof, and other manipulative strategies, phishers can lure
+individuals into revealing sensitive and personalized information. Building on
+this pervasive issue within modern technology, this paper aims to analyze the
+effectiveness of 15 Large Language Models (LLMs) in detecting phishing
+attempts, specifically focusing on a randomized set of "419 Scam" emails. The
+objective is to determine which LLMs can accurately detect phishing emails by
+analyzing a text file containing email metadata based on predefined criteria.
+The experiment concluded that the following models, ChatGPT 3.5,
+GPT-3.5-Turbo-Instruct, and ChatGPT, were the most effective in detecting
+phishing emails.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>7 pages, 3 figures</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Long Is More for Alignment: A Simple but Tough-to-Beat Baseline for
+  Instruction Fine-Tuning <span class="chip">ICML 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2402.04833v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2402.04833v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Hao Zhao, Maksym Andriushchenko, Francesco Croce, Nicolas Flammarion
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  There is a consensus that instruction fine-tuning of LLMs requires
+high-quality data, but what are they? LIMA (NeurIPS 2023) and AlpaGasus (ICLR
+2024) are state-of-the-art methods for selecting such high-quality examples,
+either via manual curation or using GPT-3.5-Turbo as a quality scorer. We show
+that the extremely simple baseline of selecting the 1,000 instructions with
+longest responses -- that intuitively contain more learnable information and
+are harder to overfit -- from standard datasets can consistently outperform
+these sophisticated methods according to GPT-4 and PaLM-2 as judges, while
+remaining competitive on the Open LLM benchmarks that test factual knowledge.
+We demonstrate this for several LLMs (Llama-2-7B, Llama-2-13B, Mistral-7B-v0.1)
+and datasets (Alpaca-52k, Evol-Instruct-70k). In addition, a lightweight
+refinement of such long instructions can further improve the abilities of the
+fine-tuned LLMs, and allows us to obtain competitive results on MT-Bench and
+the 2nd highest-ranked Llama-2-7B-based model on AlpacaEval 2.0, while training
+on only 1,000 examples and no extra preference data. We also conduct a thorough
+analysis of our models to ensure that their enhanced performance is not simply
+due to GPT-4's preference for longer responses. Overall, our findings suggest
+that fine-tuning on the longest responses should be the default baseline for
+any work on instruction fine-tuning. We provide our code at
+https://github.com/tml-epfl/long-is-more-for-alignment.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted at ICML 2024. This camera-ready version adds MT-Bench
+  evaluations, a human study, more thorough analysis of length bias. Code at
+  https://github.com/tml-epfl/long-is-more-for-alignment</span>
+                                        </div>
+                                </details>
+                            </article>
+                    </div>
+                </details>
+            </article>
+            <article>
+                <details>
+                    <Summary>
+                        Computer Vision and Pattern Recognition <span class="chip" style="font-size: 60%">150</span>
+                    </Summary>
+                    <div class="details-content">
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ VHS: High-Resolution Iterative Stereo Matching with Visual Hull Priors 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.02552v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.02552v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Markus Plack, Hannah Dröge, Leif Van Holland, Matthias B. Hullin
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  We present a stereo-matching method for depth estimation from high-resolution
+images using visual hulls as priors, and a memory-efficient technique for the
+correlation computation. Our method uses object masks extracted from
+supplementary views of the scene to guide the disparity estimation, effectively
+reducing the search space for matches. This approach is specifically tailored
+to stereo rigs in volumetric capture systems, where an accurate depth plays a
+key role in the downstream reconstruction task. To enable training and
+regression at high resolutions targeted by recent systems, our approach extends
+a sparse correlation computation into a hybrid sparse-dense scheme suitable for
+application in leading recurrent network architectures. We evaluate the
+performance-efficiency trade-off of our method compared to state-of-the-art
+methods, and demonstrate the efficacy of the visual hull guidance. In addition,
+we propose a training scheme for a further reduction of memory requirements
+during optimization, facilitating training on high-resolution data.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Dreamguider: Improved Training free Diffusion-based Conditional
+  Generation 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.02549v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.02549v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Nithin Gopalakrishnan Nair, Vishal M Patel
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Diffusion models have emerged as a formidable tool for training-free
+conditional generation.However, a key hurdle in inference-time guidance
+techniques is the need for compute-heavy backpropagation through the diffusion
+network for estimating the guidance direction. Moreover, these techniques often
+require handcrafted parameter tuning on a case-by-case basis. Although some
+recent works have introduced minimal compute methods for linear inverse
+problems, a generic lightweight guidance solution to both linear and non-linear
+guidance problems is still missing. To this end, we propose Dreamguider, a
+method that enables inference-time guidance without compute-heavy
+backpropagation through the diffusion network. The key idea is to regulate the
+gradient flow through a time-varying factor. Moreover, we propose an empirical
+guidance scale that works for a wide variety of tasks, hence removing the need
+for handcrafted parameter tuning. We further introduce an effective lightweight
+augmentation strategy that significantly boosts the performance during
+inference-time guidance. We present experiments using Dreamguider on multiple
+tasks across multiple datasets and models to show the effectiveness of the
+proposed modules. To facilitate further research, we will make the code public
+after the review process.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Open-YOLO 3D: Towards Fast and Accurate Open-Vocabulary 3D Instance
+  Segmentation 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.02548v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.02548v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Mohamed El Amine Boudjoghra, Angela Dai, Jean Lahoud, Hisham Cholakkal, Rao Muhammad Anwer, Salman Khan, Fahad Shahbaz Khan
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Recent works on open-vocabulary 3D instance segmentation show strong promise,
+but at the cost of slow inference speed and high computation requirements. This
+high computation cost is typically due to their heavy reliance on 3D clip
+features, which require computationally expensive 2D foundation models like
+Segment Anything (SAM) and CLIP for multi-view aggregation into 3D. As a
+consequence, this hampers their applicability in many real-world applications
+that require both fast and accurate predictions. To this end, we propose a fast
+yet accurate open-vocabulary 3D instance segmentation approach, named Open-YOLO
+3D, that effectively leverages only 2D object detection from multi-view RGB
+images for open-vocabulary 3D instance segmentation. We address this task by
+generating class-agnostic 3D masks for objects in the scene and associating
+them with text prompts. We observe that the projection of class-agnostic 3D
+point cloud instances already holds instance information; thus, using SAM might
+only result in redundancy that unnecessarily increases the inference time. We
+empirically find that a better performance of matching text prompts to 3D masks
+can be achieved in a faster fashion with a 2D object detector. We validate our
+Open-YOLO 3D on two benchmarks, ScanNet200 and Replica, under two scenarios:
+(i) with ground truth masks, where labels are required for given object
+proposals, and (ii) with class-agnostic 3D proposals generated from a 3D
+proposal network. Our Open-YOLO 3D achieves state-of-the-art performance on
+both datasets while obtaining up to $\sim$16$\times$ speedup compared to the
+best existing method in literature. On ScanNet200 val. set, our Open-YOLO 3D
+achieves mean average precision (mAP) of 24.7\% while operating at 22 seconds
+per scene. Code and model are available at github.com/aminebdj/OpenYOLO3D.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Leveraging Visual Tokens for Extended Text Contexts in Multi-Modal
+  Learning 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.02547v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.02547v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Alex Jinpeng Wang, Linjie Li, Yiqi Lin, Min Li, Lijuan Wang, Mike Zheng Shou
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Training models with longer in-context lengths is a significant challenge for
+multimodal model due to substantial GPU memory and computational costs. This
+exploratory study does not present state-of-the-art models; rather, it
+introduces an innovative method designed to increase in-context text length in
+multi-modality large language models (MLLMs) efficiently. We present Visualized
+In-Context Text Processing (VisInContext), which processes long in-context text
+using visual tokens. This technique significantly reduces GPU memory usage and
+floating point operations (FLOPs) for both training and inferenceing stage. For
+instance, our method expands the pre-training in-context text length from 256
+to 2048 tokens with nearly same FLOPs for a 56 billion parameter MOE model.
+Experimental results demonstrate that model trained with VisInContext delivers
+superior performance on common downstream benchmarks for in-context few-shot
+evaluation. Additionally, VisInContext is complementary to existing methods for
+increasing in-context text length and enhances document understanding
+capabilities, showing great potential in document QA tasks and sequential
+document retrieval.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>12 pages. The website is
+  \url{https://fingerrec.github.io/visincontext}</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Enhancing Temporal Consistency in Video Editing by Reconstructing Videos
+  with 3D Gaussian Splatting 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.02541v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.02541v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Inkyu Shin, Qihang Yu, Xiaohui Shen, In So Kweon, Kuk-Jin Yoon, Liang-Chieh Chen
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Recent advancements in zero-shot video diffusion models have shown promise
+for text-driven video editing, but challenges remain in achieving high temporal
+consistency. To address this, we introduce Video-3DGS, a 3D Gaussian Splatting
+(3DGS)-based video refiner designed to enhance temporal consistency in
+zero-shot video editors. Our approach utilizes a two-stage 3D Gaussian
+optimizing process tailored for editing dynamic monocular videos. In the first
+stage, Video-3DGS employs an improved version of COLMAP, referred to as
+MC-COLMAP, which processes original videos using a Masked and Clipped approach.
+For each video clip, MC-COLMAP generates the point clouds for dynamic
+foreground objects and complex backgrounds. These point clouds are utilized to
+initialize two sets of 3D Gaussians (Frg-3DGS and Bkg-3DGS) aiming to represent
+foreground and background views. Both foreground and background views are then
+merged with a 2D learnable parameter map to reconstruct full views. In the
+second stage, we leverage the reconstruction ability developed in the first
+stage to impose the temporal constraints on the video diffusion model. To
+demonstrate the efficacy of Video-3DGS on both stages, we conduct extensive
+experiments across two related tasks: Video Reconstruction and Video Editing.
+Video-3DGS trained with 3k iterations significantly improves video
+reconstruction quality (+3 PSNR, +7 PSNR increase) and training efficiency
+(x1.9, x4.5 times faster) over NeRF-based and 3DGS-based state-of-art methods
+on DAVIS dataset, respectively. Moreover, it enhances video editing by ensuring
+temporal consistency across 58 dynamic monocular videos.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ ViDiT-Q: Efficient and Accurate Quantization of Diffusion <span class="highlight-title">Transformer</span>s
+  for Image and Video Generation 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.02540v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.02540v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Tianchen Zhao, Tongcheng Fang, Enshu Liu, Wan Rui, Widyadewi Soedarmadji, Shiyao Li, Zinan Lin, Guohao Dai, Shengen Yan, Huazhong Yang, Xuefei Ning, Yu Wang
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Diffusion transformers (DiTs) have exhibited remarkable performance in visual
+generation tasks, such as generating realistic images or videos based on
+textual instructions. However, larger model sizes and multi-frame processing
+for video generation lead to increased computational and memory costs, posing
+challenges for practical deployment on edge devices. Post-Training Quantization
+(PTQ) is an effective method for reducing memory costs and computational
+complexity. When quantizing diffusion transformers, we find that applying
+existing diffusion quantization methods designed for U-Net faces challenges in
+preserving quality. After analyzing the major challenges for quantizing
+diffusion transformers, we design an improved quantization scheme: "ViDiT-Q":
+Video and Image Diffusion Transformer Quantization) to address these issues.
+Furthermore, we identify highly sensitive layers and timesteps hinder
+quantization for lower bit-widths. To tackle this, we improve ViDiT-Q with a
+novel metric-decoupled mixed-precision quantization method (ViDiT-Q-MP). We
+validate the effectiveness of ViDiT-Q across a variety of text-to-image and
+video models. While baseline quantization methods fail at W8A8 and produce
+unreadable content at W4A8, ViDiT-Q achieves lossless W8A8 quantization.
+ViDiTQ-MP achieves W4A8 with negligible visual quality degradation, resulting
+in a 2.5x memory optimization and a 1.5x latency speedup.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Project Page: https://a-suozhang.xyz/viditq.github.io/</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Parrot: Multilingual Visual Instruction Tuning 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.02539v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.02539v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Hai-Long Sun, Da-Wei Zhou, Yang Li, Shiyin Lu, Chao Yi, Qing-Guo Chen, Zhao Xu, Weihua Luo, Kaifu Zhang, De-Chuan Zhan, Han-Jia Ye
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  The rapid development of Multimodal Large Language Models (MLLMs) like GPT-4V
+has marked a significant step towards artificial general intelligence. Existing
+methods mainly focus on aligning vision encoders with LLMs through supervised
+fine-tuning (SFT) to endow LLMs with multimodal abilities, making MLLMs'
+inherent ability to react to multiple languages progressively deteriorate as
+the training process evolves. We empirically find that the imbalanced SFT
+datasets, primarily composed of English-centric image-text pairs, lead to
+significantly reduced performance in non-English languages. This is due to the
+failure of aligning the vision encoder and LLM with multilingual tokens during
+the SFT process. In this paper, we introduce Parrot, a novel method that
+utilizes textual guidance to drive visual token alignment at the language
+level. Parrot makes the visual tokens condition on diverse language inputs and
+uses Mixture-of-Experts (MoE) to promote the alignment of multilingual tokens.
+Specifically, to enhance non-English visual tokens alignment, we compute the
+cross-attention using the initial visual features and textual embeddings, the
+result of which is then fed into the MoE router to select the most relevant
+experts. The selected experts subsequently convert the initial visual tokens
+into language-specific visual tokens. Moreover, considering the current lack of
+benchmarks for evaluating multilingual capabilities within the field, we
+collect and make available a Massive Multilingual Multimodal Benchmark which
+includes 6 languages, 15 categories, and 12,000 questions, named as MMMB. Our
+method not only demonstrates state-of-the-art performance on multilingual
+MMBench and MMMB, but also excels across a broad range of multimodal tasks.
+Both the source code and the training dataset of Parrot will be made publicly
+available.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ TopViewRS: Vision-Language Models as Top-View Spatial Reasoners 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.02537v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.02537v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Chengzu Li, Caiqi Zhang, Han Zhou, Nigel Collier, Anna Korhonen, Ivan Vulić
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Top-view perspective denotes a typical way in which humans read and reason
+over different types of maps, and it is vital for localization and navigation
+of humans as well as of `non-human' agents, such as the ones backed by large
+Vision-Language Models (VLMs). Nonetheless, spatial reasoning capabilities of
+modern VLMs remain unattested and underexplored. In this work, we thus study
+their capability to understand and reason over spatial relations from the top
+view. The focus on top view also enables controlled evaluations at different
+granularity of spatial reasoning; we clearly disentangle different abilities
+(e.g., recognizing particular objects versus understanding their relative
+positions). We introduce the TopViewRS (Top-View Reasoning in Space) dataset,
+consisting of 11,384 multiple-choice questions with either realistic or
+semantic top-view map as visual input. We then use it to study and evaluate
+VLMs across 4 perception and reasoning tasks with different levels of
+complexity. Evaluation of 10 representative open- and closed-source VLMs
+reveals the gap of more than 50% compared to average human performance, and it
+is even lower than the random baseline in some cases. Although additional
+experiments show that Chain-of-Thought reasoning can boost model capabilities
+by 5.82% on average, the overall performance of VLMs remains limited. Our
+findings underscore the critical need for enhanced model capability in top-view
+spatial reasoning and set a foundation for further research towards human-level
+proficiency of VLMs in real-world multimodal tasks.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>9 pages, 3 figures, 3 tables (21 pages, 4 figures, 15 tables
+  including references and appendices)</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Enhancing 2D Representation Learning with a 3D Prior 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.02535v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.02535v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Mehmet Aygün, Prithviraj Dhar, Zhicheng Yan, Oisin Mac Aodha, Rakesh Ranjan
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Learning robust and effective representations of visual data is a fundamental
+task in computer vision. Traditionally, this is achieved by training models
+with labeled data which can be expensive to obtain. Self-supervised learning
+attempts to circumvent the requirement for labeled data by learning
+representations from raw unlabeled visual data alone. However, unlike humans
+who obtain rich 3D information from their binocular vision and through motion,
+the majority of current self-supervised methods are tasked with learning from
+monocular 2D image collections. This is noteworthy as it has been demonstrated
+that shape-centric visual processing is more robust compared to texture-biased
+automated methods. Inspired by this, we propose a new approach for
+strengthening existing self-supervised methods by explicitly enforcing a strong
+3D structural prior directly into the model during training. Through
+experiments, across a range of datasets, we demonstrate that our 3D aware
+representations are more robust compared to conventional self-supervised
+baselines.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Enhancing predictive imaging biomarker discovery through treatment
+  effect analysis 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.02534v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.02534v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Shuhan Xiao, Lukas Klein, Jens Petersen, Philipp Vollmuth, Paul F. Jaeger, Klaus H. Maier-Hein
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Identifying predictive biomarkers, which forecast individual treatment
+effectiveness, is crucial for personalized medicine and informs decision-making
+across diverse disciplines. These biomarkers are extracted from pre-treatment
+data, often within randomized controlled trials, and have to be distinguished
+from prognostic biomarkers, which are independent of treatment assignment. Our
+study focuses on the discovery of predictive imaging biomarkers, aiming to
+leverage pre-treatment images to unveil new causal relationships. Previous
+approaches relied on labor-intensive handcrafted or manually derived features,
+which may introduce biases. In response, we present a new task of discovering
+predictive imaging biomarkers directly from the pre-treatment images to learn
+relevant image features. We propose an evaluation protocol for this task to
+assess a model's ability to identify predictive imaging biomarkers and
+differentiate them from prognostic ones. It employs statistical testing and a
+comprehensive analysis of image feature attribution. We explore the suitability
+of deep learning models originally designed for estimating the conditional
+average treatment effect (CATE) for this task, which previously have been
+primarily assessed for the precision of CATE estimation, overlooking the
+evaluation of imaging biomarker discovery. Our proof-of-concept analysis
+demonstrates promising results in discovering and validating predictive imaging
+biomarkers from synthetic outcomes and real-world image datasets.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>19 pages, 12 figures</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ SatSplatYOLO: 3D Gaussian Splatting-based Virtual Object Detection
+  Ensembles for Satellite Feature Recognition 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.02533v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.02533v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Van Minh Nguyen, Emma Sandidge, Trupti Mahendrakar, Ryan T. White
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  On-orbit servicing (OOS), inspection of spacecraft, and active debris removal
+(ADR). Such missions require precise rendezvous and proximity operations in the
+vicinity of non-cooperative, possibly unknown, resident space objects. Safety
+concerns with manned missions and lag times with ground-based control
+necessitate complete autonomy. In this article, we present an approach for
+mapping geometries and high-confidence detection of components of unknown,
+non-cooperative satellites on orbit. We implement accelerated 3D Gaussian
+splatting to learn a 3D representation of the satellite, render virtual views
+of the target, and ensemble the YOLOv5 object detector over the virtual views,
+resulting in reliable, accurate, and precise satellite component detections.
+The full pipeline capable of running on-board and stand to enable downstream
+machine intelligence tasks necessary for autonomous guidance, navigation, and
+control tasks.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ ReLUs Are Sufficient for Learning Implicit Neural Representations <span class="chip">ICML 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.02529v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.02529v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Joseph Shenouda, Yamin Zhou, Robert D. Nowak
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Motivated by the growing theoretical understanding of neural networks that
+employ the Rectified Linear Unit (ReLU) as their activation function, we
+revisit the use of ReLU activation functions for learning implicit neural
+representations (INRs). Inspired by second order B-spline wavelets, we
+incorporate a set of simple constraints to the ReLU neurons in each layer of a
+deep neural network (DNN) to remedy the spectral bias. This in turn enables its
+use for various INR tasks. Empirically, we demonstrate that, contrary to
+popular belief, one can learn state-of-the-art INRs based on a DNN composed of
+only ReLU neurons. Next, by leveraging recent theoretical works which
+characterize the kinds of functions ReLU neural networks learn, we provide a
+way to quantify the regularity of the learned function. This offers a
+principled approach to selecting the hyperparameters in INR architectures. We
+substantiate our claims through experiments in signal representation, super
+resolution, and computed tomography, demonstrating the versatility and
+effectiveness of our method. The code for all experiments can be found at
+https://github.com/joeshenouda/relu-inrs.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted to ICML 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ DDGS-CT: Direction-Disentangled Gaussian Splatting for Realistic Volume
+  Rendering 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.02518v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.02518v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Zhongpai Gao, Benjamin Planche, Meng Zheng, Xiao Chen, Terrence Chen, Ziyan Wu
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Digitally reconstructed radiographs (DRRs) are simulated 2D X-ray images
+generated from 3D CT volumes, widely used in preoperative settings but limited
+in intraoperative applications due to computational bottlenecks, especially for
+accurate but heavy physics-based Monte Carlo methods. While analytical DRR
+renderers offer greater efficiency, they overlook anisotropic X-ray image
+formation phenomena, such as Compton scattering. We present a novel approach
+that marries realistic physics-inspired X-ray simulation with efficient,
+differentiable DRR generation using 3D Gaussian splatting (3DGS). Our
+direction-disentangled 3DGS (DDGS) method separates the radiosity contribution
+into isotropic and direction-dependent components, approximating complex
+anisotropic interactions without intricate runtime simulations. Additionally,
+we adapt the 3DGS initialization to account for tomography data properties,
+enhancing accuracy and efficiency. Our method outperforms state-of-the-art
+techniques in image accuracy. Furthermore, our DDGS shows promise for
+intraoperative applications and inverse problems such as pose registration,
+delivering superior registration accuracy and runtime performance compared to
+analytical DRR methods.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ V-Express: Conditional Dropout for Progressive Training of Portrait
+  Video Generation 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.02511v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.02511v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Cong Wang, Kuan Tian, Jun Zhang, Yonghang Guan, Feng Luo, Fei Shen, Zhiwei Jiang, Qing Gu, Xiao Han, Wei Yang
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  In the field of portrait video generation, the use of single images to
+generate portrait videos has become increasingly prevalent. A common approach
+involves leveraging generative models to enhance adapters for controlled
+generation. However, control signals (e.g., text, audio, reference image, pose,
+depth map, etc.) can vary in strength. Among these, weaker conditions often
+struggle to be effective due to interference from stronger conditions, posing a
+challenge in balancing these conditions. In our work on portrait video
+generation, we identified audio signals as particularly weak, often
+overshadowed by stronger signals such as facial pose and reference image.
+However, direct training with weak signals often leads to difficulties in
+convergence. To address this, we propose V-Express, a simple method that
+balances different control signals through the progressive training and the
+conditional dropout operation. Our method gradually enables effective control
+by weak conditions, thereby achieving generation capabilities that
+simultaneously take into account the facial pose, reference image, and audio.
+The experimental results demonstrate that our method can effectively generate
+portrait videos controlled by audio. Furthermore, a potential solution is
+provided for the simultaneous and effective use of conditions of varying
+strengths.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ CamCo: Camera-Controllable 3D-Consistent Image-to-Video Generation 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.02509v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.02509v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Dejia Xu, Weili Nie, Chao Liu, Sifei Liu, Jan Kautz, Zhangyang Wang, Arash Vahdat
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Recently video diffusion models have emerged as expressive generative tools
+for high-quality video content creation readily available to general users.
+However, these models often do not offer precise control over camera poses for
+video generation, limiting the expression of cinematic language and user
+control. To address this issue, we introduce CamCo, which allows fine-grained
+Camera pose Control for image-to-video generation. We equip a pre-trained
+image-to-video generator with accurately parameterized camera pose input using
+Pl\"ucker coordinates. To enhance 3D consistency in the videos produced, we
+integrate an epipolar attention module in each attention block that enforces
+epipolar constraints to the feature maps. Additionally, we fine-tune CamCo on
+real-world videos with camera poses estimated through structure-from-motion
+algorithms to better synthesize object motion. Our experiments show that CamCo
+significantly improves 3D consistency and camera control capabilities compared
+to previous models while effectively generating plausible object motion.
+Project page: https://ir1d.github.io/CamCo/
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Project page: https://ir1d.github.io/CamCo/</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Guiding a Diffusion Model with a Bad Version of Itself 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.02507v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.02507v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Tero Karras, Miika Aittala, Tuomas Kynkäänniemi, Jaakko Lehtinen, Timo Aila, Samuli Laine
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  The primary axes of interest in image-generating diffusion models are image
+quality, the amount of variation in the results, and how well the results align
+with a given condition, e.g., a class label or a text prompt. The popular
+classifier-free guidance approach uses an unconditional model to guide a
+conditional model, leading to simultaneously better prompt alignment and
+higher-quality images at the cost of reduced variation. These effects seem
+inherently entangled, and thus hard to control. We make the surprising
+observation that it is possible to obtain disentangled control over image
+quality without compromising the amount of variation by guiding generation
+using a smaller, less-trained version of the model itself rather than an
+unconditional model. This leads to significant improvements in ImageNet
+generation, setting record FIDs of 1.01 for 64x64 and 1.25 for 512x512, using
+publicly available networks. Furthermore, the method is also applicable to
+unconditional diffusion models, drastically improving their quality.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ An Open-Source Tool for Mapping War Destruction at Scale in Ukraine
+  using Sentinel-1 Time Series 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.02506v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.02506v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Olivier Dietrich, Torben Peters, Vivien Sainte Fare Garnot, Valerie Sticher, Thao Ton-That Whelan, Konrad Schindler, Jan Dirk Wegner
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Access to detailed war impact assessments is crucial for humanitarian
+organizations to effectively assist populations most affected by armed
+conflicts. However, maintaining a comprehensive understanding of the situation
+on the ground is challenging, especially in conflicts that cover vast
+territories and extend over long periods. This study presents a scalable and
+transferable method for estimating war-induced damage to buildings. We first
+train a machine learning model to output pixel-wise probability of destruction
+from Synthetic Aperture Radar (SAR) satellite image time series, leveraging
+existing, manual damage assessments as ground truth and cloud-based geospatial
+analysis tools for large-scale inference. We further post-process these
+assessments using open building footprints to obtain a final damage estimate
+per building. We introduce an accessible, open-source tool that allows users to
+adjust the confidence interval based on their specific requirements and use
+cases. Our approach enables humanitarian organizations and other actors to
+rapidly screen large geographic regions for war impacts. We provide two
+publicly accessible dashboards: a Ukraine Damage Explorer to dynamically view
+our pre-computed estimates, and a Rapid Damage Mapping Tool to easily run our
+method and produce custom maps.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ GenS: Generalizable Neural Surface Reconstruction from Multi-View Images <span class="chip">NeurIPS 2023</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.02495v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.02495v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Rui Peng, Xiaodong Gu, Luyang Tang, Shihe Shen, Fanqi Yu, Ronggang Wang
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Combining the signed distance function (SDF) and differentiable volume
+rendering has emerged as a powerful paradigm for surface reconstruction from
+multi-view images without 3D supervision. However, current methods are impeded
+by requiring long-time per-scene optimizations and cannot generalize to new
+scenes. In this paper, we present GenS, an end-to-end generalizable neural
+surface reconstruction model. Unlike coordinate-based methods that train a
+separate network for each scene, we construct a generalized multi-scale volume
+to directly encode all scenes. Compared with existing solutions, our
+representation is more powerful, which can recover high-frequency details while
+maintaining global smoothness. Meanwhile, we introduce a multi-scale
+feature-metric consistency to impose the multi-view consistency in a more
+discriminative multi-scale feature space, which is robust to the failures of
+the photometric consistency. And the learnable feature can be self-enhanced to
+continuously improve the matching accuracy and mitigate aggregation ambiguity.
+Furthermore, we design a view contrast loss to force the model to be robust to
+those regions covered by few viewpoints through distilling the geometric prior
+from dense input to sparse input. Extensive experiments on popular benchmarks
+show that our model can generalize well to new scenes and outperform existing
+state-of-the-art methods even those employing ground-truth depth supervision.
+Code is available at https://github.com/prstrive/GenS.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>NeurIPS 2023 Accepted</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Stable-Pose: Leveraging <span class="highlight-title">Transformer</span>s for Pose-Guided Text-to-Image
+  Generation 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.02485v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.02485v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Jiajun Wang, Morteza Ghahremani, Yitong Li, Björn Ommer, Christian Wachinger
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Controllable text-to-image (T2I) diffusion models have shown impressive
+performance in generating high-quality visual content through the incorporation
+of various conditions. Current methods, however, exhibit limited performance
+when guided by skeleton human poses, especially in complex pose conditions such
+as side or rear perspectives of human figures. To address this issue, we
+present Stable-Pose, a novel adapter model that introduces a coarse-to-fine
+attention masking strategy into a vision Transformer (ViT) to gain accurate
+pose guidance for T2I models. Stable-Pose is designed to adeptly handle pose
+conditions within pre-trained Stable Diffusion, providing a refined and
+efficient way of aligning pose representation during image synthesis. We
+leverage the query-key self-attention mechanism of ViTs to explore the
+interconnections among different anatomical parts in human pose skeletons.
+Masked pose images are used to smoothly refine the attention maps based on
+target pose-related features in a hierarchical manner, transitioning from
+coarse to fine levels. Additionally, our loss function is formulated to
+allocate increased emphasis to the pose region, thereby augmenting the model's
+precision in capturing intricate pose details. We assessed the performance of
+Stable-Pose across five public datasets under a wide range of indoor and
+outdoor human pose scenarios. Stable-Pose achieved an AP score of 57.1 in the
+LAION-Human dataset, marking around 13% improvement over the established
+technique ControlNet. The project link and code is available at
+https://github.com/ai-med/StablePose.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Inpainting Pathology in Lumbar Spine MRI with Latent Diffusion 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.02477v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.02477v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Colin Hansen, Simas Glinskis, Ashwin Raju, Micha Kornreich, JinHyeong Park, Jayashri Pawar, Richard Herzog, Li Zhang, Benjamin Odry
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Data driven models for automated diagnosis in radiology suffer from
+insufficient and imbalanced datasets due to low representation of pathology in
+a population and the cost of expert annotations. Datasets can be bolstered
+through data augmentation. However, even when utilizing a full suite of
+transformations during model training, typical data augmentations do not
+address variations in human anatomy. An alternative direction is to synthesize
+data using generative models, which can potentially craft datasets with
+specific attributes. While this holds promise, commonly used generative models
+such as Generative Adversarial Networks may inadvertently produce anatomically
+inaccurate features. On the other hand, diffusion models, which offer greater
+stability, tend to memorize training data, raising concerns about privacy and
+generative diversity. Alternatively, inpainting has the potential to augment
+data through directly inserting pathology in medical images. However, this
+approach introduces a new challenge: accurately merging the generated
+pathological features with the surrounding anatomical context. While inpainting
+is a well established method for addressing simple lesions, its application to
+pathologies that involve complex structural changes remains relatively
+unexplored. We propose an efficient method for inpainting pathological features
+onto healthy anatomy in MRI through voxelwise noise scheduling in a latent
+diffusion model. We evaluate the method's ability to insert disc herniation and
+central canal stenosis in lumbar spine sagittal T2 MRI, and it achieves
+superior Frechet Inception Distance compared to state-of-the-art methods.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ DL-KDD: Dual-Light Knowledge Distillation for Action Recognition in the
+  Dark 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.02468v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.02468v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Chi-Jui Chang, Oscar Tai-Yuan Chen, Vincent S. Tseng
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Human action recognition in dark videos is a challenging task for computer
+vision. Recent research focuses on applying dark enhancement methods to improve
+the visibility of the video. However, such video processing results in the loss
+of critical information in the original (un-enhanced) video. Conversely,
+traditional two-stream methods are capable of learning information from both
+original and processed videos, but it can lead to a significant increase in the
+computational cost during the inference phase in the task of video
+classification. To address these challenges, we propose a novel teacher-student
+video classification framework, named Dual-Light KnowleDge Distillation for
+Action Recognition in the Dark (DL-KDD). This framework enables the model to
+learn from both original and enhanced video without introducing additional
+computational cost during inference. Specifically, DL-KDD utilizes the strategy
+of knowledge distillation during training. The teacher model is trained with
+enhanced video, and the student model is trained with both the original video
+and the soft target generated by the teacher model. This teacher-student
+framework allows the student model to predict action using only the original
+input video during inference. In our experiments, the proposed DL-KDD framework
+outperforms state-of-the-art methods on the ARID, ARID V1.5, and Dark-48
+datasets. We achieve the best performance on each dataset and up to a 4.18%
+improvement on Dark-48, using only original video inputs, thus avoiding the use
+of two-stream framework or enhancement modules for inference. We further
+validate the effectiveness of the distillation strategy in ablative
+experiments. The results highlight the advantages of our knowledge distillation
+framework in dark human action recognition.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ An Empirical Study into Clustering of Unseen <span class="highlight-title">Dataset</span>s with
+  <span class="highlight-title">Self-Supervised</span> Encoders 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.02465v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.02465v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Scott C. Lowe, Joakim Bruslund Haurum, Sageev Oore, Thomas B. Moeslund, Graham W. Taylor
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Can pretrained models generalize to new datasets without any retraining? We
+deploy pretrained image models on datasets they were not trained for, and
+investigate whether their embeddings form meaningful clusters. Our suite of
+benchmarking experiments use encoders pretrained solely on ImageNet-1k with
+either supervised or self-supervised training techniques, deployed on image
+datasets that were not seen during training, and clustered with conventional
+clustering algorithms. This evaluation provides new insights into the
+embeddings of self-supervised models, which prioritize different features to
+supervised models. Supervised encoders typically offer more utility than SSL
+encoders within the training domain, and vice-versa far outside of it, however,
+fine-tuned encoders demonstrate the opposite trend. Clustering provides a way
+to evaluate the utility of self-supervised learned representations orthogonal
+to existing methods such as kNN. Additionally, we find the silhouette score
+when measured in a UMAP-reduced space is highly correlated with clustering
+performance, and can therefore be used as a proxy for clustering performance on
+data with no ground truth labels. Our code implementation is available at
+\url{https://github.com/scottclowe/zs-ssl-clustering/}.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Learning Image Priors through Patch-based Diffusion Models for Solving
+  Inverse Problems 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.02462v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.02462v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Jason Hu, Bowen Song, Xiaojian Xu, Liyue Shen, Jeffrey A. Fessler
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Diffusion models can learn strong image priors from underlying data
+distribution and use them to solve inverse problems, but the training process
+is computationally expensive and requires lots of data. Such bottlenecks
+prevent most existing works from being feasible for high-dimensional and
+high-resolution data such as 3D images. This paper proposes a method to learn
+an efficient data prior for the entire image by training diffusion models only
+on patches of images. Specifically, we propose a patch-based position-aware
+diffusion inverse solver, called PaDIS, where we obtain the score function of
+the whole image through scores of patches and their positional encoding and
+utilize this as the prior for solving inverse problems. First of all, we show
+that this diffusion model achieves an improved memory efficiency and data
+efficiency while still maintaining the capability to generate entire images via
+positional encoding. Additionally, the proposed PaDIS model is highly flexible
+and can be plugged in with different diffusion inverse solvers (DIS). We
+demonstrate that the proposed PaDIS approach enables solving various inverse
+problems in both natural and medical image domains, including CT
+reconstruction, deblurring, and superresolution, given only patch-based priors.
+Notably, PaDIS outperforms previous DIS methods trained on entire image priors
+in the case of limited training data, demonstrating the data efficiency of our
+proposed approach by learning patch-based prior.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ RoomTex: Texturing Compositional Indoor Scenes via Iterative Inpainting 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.02461v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.02461v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Qi Wang, Ruijie Lu, Xudong Xu, Jingbo Wang, Michael Yu Wang, Bo Dai, Gang Zeng, Dan Xu
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  The advancement of diffusion models has pushed the boundary of text-to-3D
+object generation. While it is straightforward to composite objects into a
+scene with reasonable geometry, it is nontrivial to texture such a scene
+perfectly due to style inconsistency and occlusions between objects. To tackle
+these problems, we propose a coarse-to-fine 3D scene texturing framework,
+referred to as RoomTex, to generate high-fidelity and style-consistent textures
+for untextured compositional scene meshes. In the coarse stage, RoomTex first
+unwraps the scene mesh to a panoramic depth map and leverages ControlNet to
+generate a room panorama, which is regarded as the coarse reference to ensure
+the global texture consistency. In the fine stage, based on the panoramic image
+and perspective depth maps, RoomTex will refine and texture every single object
+in the room iteratively along a series of selected camera views, until this
+object is completely painted. Moreover, we propose to maintain superior
+alignment between RGB and depth spaces via subtle edge detection methods.
+Extensive experiments show our method is capable of generating high-quality and
+diverse room textures, and more importantly, supporting interactive
+fine-grained texture control and flexible scene editing thanks to our
+inpainting-based framework and compositional mesh input. Our project page is
+available at https://qwang666.github.io/RoomTex/.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Generative Active Learning for Long-tailed Instance Segmentation <span class="chip">ICML 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.02435v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.02435v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Muzhi Zhu, Chengxiang Fan, Hao Chen, Yang Liu, Weian Mao, Xiaogang Xu, Chunhua Shen
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Recently, large-scale language-image generative models have gained widespread
+attention and many works have utilized generated data from these models to
+further enhance the performance of perception tasks. However, not all generated
+data can positively impact downstream models, and these methods do not
+thoroughly explore how to better select and utilize generated data. On the
+other hand, there is still a lack of research oriented towards active learning
+on generated data. In this paper, we explore how to perform active learning
+specifically for generated data in the long-tailed instance segmentation task.
+Subsequently, we propose BSGAL, a new algorithm that online estimates the
+contribution of the generated data based on gradient cache. BSGAL can handle
+unlimited generated data and complex downstream segmentation tasks effectively.
+Experiments show that BSGAL outperforms the baseline approach and effectually
+improves the performance of long-tailed segmentation. Our code can be found at
+https://github.com/aim-uofa/DiverGen.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted by ICML 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ CoNav: A Benchmark for Human-Centered Collaborative Navigation 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.02425v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.02425v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Changhao Li, Xinyu Sun, Peihao Chen, Jugang Fan, Zixu Wang, Yanxia Liu, Jinhui Zhu, Chuang Gan, Mingkui Tan
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Human-robot collaboration, in which the robot intelligently assists the human
+with the upcoming task, is an appealing objective. To achieve this goal, the
+agent needs to be equipped with a fundamental collaborative navigation ability,
+where the agent should reason human intention by observing human activities and
+then navigate to the human's intended destination in advance of the human.
+However, this vital ability has not been well studied in previous literature.
+To fill this gap, we propose a collaborative navigation (CoNav) benchmark. Our
+CoNav tackles the critical challenge of constructing a 3D navigation
+environment with realistic and diverse human activities. To achieve this, we
+design a novel LLM-based humanoid animation generation framework, which is
+conditioned on both text descriptions and environmental context. The generated
+humanoid trajectory obeys the environmental context and can be easily
+integrated into popular simulators. We empirically find that the existing
+navigation methods struggle in CoNav task since they neglect the perception of
+human intention. To solve this problem, we propose an intention-aware agent for
+reasoning both long-term and short-term human intention. The agent predicts
+navigation action based on the predicted intention and panoramic observation.
+The emergent agent behavior including observing humans, avoiding human
+collision, and navigation reveals the efficiency of the proposed datasets and
+agents.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ IterMask2: Iterative Unsupervised Anomaly Segmentation via Spatial and
+  Frequency Masking for Brain Lesions in MRI 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.02422v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.02422v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Ziyun Liang, Xiaoqing Guo, J. Alison Noble, Konstantinos Kamnitsas
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Unsupervised anomaly segmentation approaches to pathology segmentation train
+a model on images of healthy subjects, that they define as the 'normal' data
+distribution. At inference, they aim to segment any pathologies in new images
+as 'anomalies', as they exhibit patterns that deviate from those in 'normal'
+training data. Prevailing methods follow the 'corrupt-and-reconstruct'
+paradigm. They intentionally corrupt an input image, reconstruct it to follow
+the learned 'normal' distribution, and subsequently segment anomalies based on
+reconstruction error. Corrupting an input image, however, inevitably leads to
+suboptimal reconstruction even of normal regions, causing false positives. To
+alleviate this, we propose a novel iterative spatial mask-refining strategy
+IterMask2. We iteratively mask areas of the image, reconstruct them, and update
+the mask based on reconstruction error. This iterative process progressively
+adds information about areas that are confidently normal as per the model. The
+increasing content guides reconstruction of nearby masked areas, improving
+reconstruction of normal tissue under these areas, reducing false positives. We
+also use high-frequency image content as an auxiliary input to provide
+additional structural information for masked areas. This further improves
+reconstruction error of normal in comparison to anomalous areas, facilitating
+segmentation of the latter. We conduct experiments on several brain lesion
+datasets and demonstrate effectiveness of our method. Code is available at:
+https://github.com/ZiyunLiang/IterMasks2
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Decoupling of neural network calibration measures 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.02411v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.02411v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Dominik Werner Wolf, Prasannavenkatesh Balaji, Alexander Braun, Markus Ulrich
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  A lot of effort is currently invested in safeguarding autonomous driving
+systems, which heavily rely on deep neural networks for computer vision. We
+investigate the coupling of different neural network calibration measures with
+a special focus on the Area Under the Sparsification Error curve (AUSE) metric.
+We elaborate on the well-known inconsistency in determining optimal calibration
+using the Expected Calibration Error (ECE) and we demonstrate similar issues
+for the AUSE, the Uncertainty Calibration Score (UCS), as well as the
+Uncertainty Calibration Error (UCE). We conclude that the current methodologies
+leave a degree of freedom, which prevents a unique model calibration for the
+homologation of safety-critical functionalities. Furthermore, we propose the
+AUSE as an indirect measure for the residual uncertainty, which is irreducible
+for a fixed network architecture and is driven by the stochasticity in the
+underlying data generation process (aleatoric contribution) as well as the
+limitation in the hypothesis space (epistemic contribution).
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Submitted to the German Conference on Pattern Recognition (GCPR) 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ WE-GS: An In-the-wild Efficient 3D Gaussian Representation for
+  Unconstrained Photo Collections 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.02407v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.02407v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Yuze Wang, Junyi Wang, Yue Qi
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Novel View Synthesis (NVS) from unconstrained photo collections is
+challenging in computer graphics. Recently, 3D Gaussian Splatting (3DGS) has
+shown promise for photorealistic and real-time NVS of static scenes. Building
+on 3DGS, we propose an efficient point-based differentiable rendering framework
+for scene reconstruction from photo collections. Our key innovation is a
+residual-based spherical harmonic coefficients transfer module that adapts 3DGS
+to varying lighting conditions and photometric post-processing. This
+lightweight module can be pre-computed and ensures efficient gradient
+propagation from rendered images to 3D Gaussian attributes. Additionally, we
+observe that the appearance encoder and the transient mask predictor, the two
+most critical parts of NVS from unconstrained photo collections, can be
+mutually beneficial. We introduce a plug-and-play lightweight spatial attention
+module to simultaneously predict transient occluders and latent appearance
+representation for each image. After training and preprocessing, our method
+aligns with the standard 3DGS format and rendering pipeline, facilitating
+seamlessly integration into various 3DGS applications. Extensive experiments on
+diverse datasets show our approach outperforms existing approaches on the
+rendering quality of novel view and appearance synthesis with high converge and
+rendering speed.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Our project page is available at
+  https://yuzewang1998.github.io/we-gs.github.io/</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ GrootVL: Tree Topology is All You Need in State Space Model 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.02395v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.02395v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Yicheng Xiao, Lin Song, Shaoli Huang, Jiangshan Wang, Siyu Song, Yixiao Ge, Xiu Li, Ying Shan
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  The state space models, employing recursively propagated features,
+demonstrate strong representation capabilities comparable to Transformer models
+and superior efficiency. However, constrained by the inherent geometric
+constraints of sequences, it still falls short in modeling long-range
+dependencies. To address this issue, we propose the GrootVL network, which
+first dynamically generates a tree topology based on spatial relationships and
+input features. Then, feature propagation is performed based on this graph,
+thereby breaking the original sequence constraints to achieve stronger
+representation capabilities. Additionally, we introduce a linear complexity
+dynamic programming algorithm to enhance long-range interactions without
+increasing computational cost. GrootVL is a versatile multimodal framework that
+can be applied to both visual and textual tasks. Extensive experiments
+demonstrate that our method significantly outperforms existing structured state
+space models on image classification, object detection and segmentation.
+Besides, by fine-tuning large language models, our approach achieves consistent
+improvements in multiple textual tasks at minor training cost.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>The code is available at https://github.com/EasonXiao-888/GrootVL</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Low-Rank Adaption on <span class="highlight-title">Transformer</span>-based Oriented Object Detector for
+  Satellite Onboard Processing of Remote Sensing Images 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.02385v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.02385v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Xinyang Pu, Feng Xu
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Deep learning models in satellite onboard enable real-time interpretation of
+remote sensing images, reducing the need for data transmission to the ground
+and conserving communication resources. As satellite numbers and observation
+frequencies increase, the demand for satellite onboard real-time image
+interpretation grows, highlighting the expanding importance and development of
+this technology. However, updating the extensive parameters of models deployed
+on the satellites for spaceborne object detection model is challenging due to
+the limitations of uplink bandwidth in wireless satellite communications. To
+address this issue, this paper proposes a method based on parameter-efficient
+fine-tuning technology with low-rank adaptation (LoRA) module. It involves
+training low-rank matrix parameters and integrating them with the original
+model's weight matrix through multiplication and summation, thereby fine-tuning
+the model parameters to adapt to new data distributions with minimal weight
+updates. The proposed method combines parameter-efficient fine-tuning with full
+fine-tuning in the parameter update strategy of the oriented object detection
+algorithm architecture. This strategy enables model performance improvements
+close to full fine-tuning effects with minimal parameter updates. In addition,
+low rank approximation is conducted to pick an optimal rank value for LoRA
+matrices. Extensive experiments verify the effectiveness of the proposed
+method. By fine-tuning and updating only 12.4$\%$ of the model's total
+parameters, it is able to achieve 97$\%$ to 100$\%$ of the performance of full
+fine-tuning models. Additionally, the reduced number of trainable parameters
+accelerates model training iterations and enhances the generalization and
+robustness of the oriented object detection model. The source code is available
+at: \url{https://github.com/fudanxu/LoRA-Det}.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Learning to Edit Visual Programs with Self-Supervision 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.02383v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.02383v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        R. Kenny Jones, Renhao Zhang, Aditya Ganeshan, Daniel Ritchie
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  We design a system that learns how to edit visual programs. Our edit network
+consumes a complete input program and a visual target. From this input, we task
+our network with predicting a local edit operation that could be applied to the
+input program to improve its similarity to the target. In order to apply this
+scheme for domains that lack program annotations, we develop a self-supervised
+learning approach that integrates this edit network into a bootstrapped
+finetuning loop along with a network that predicts entire programs in one-shot.
+Our joint finetuning scheme, when coupled with an inference procedure that
+initializes a population from the one-shot model and evolves members of this
+population with the edit network, helps to infer more accurate visual programs.
+Over multiple domains, we experimentally compare our method against the
+alternative of using only the one-shot model, and find that even under equal
+search-time budgets, our editing-based paradigm provides significant
+advantages.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ EUFCC-340K: A Faceted Hierarchical <span class="highlight-title">Dataset</span> for Metadata Annotation in
+  GLAM Collections 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.02380v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.02380v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Francesc Net, Marc Folia, Pep Casals, Andrew D. Bagdanov, Lluis Gomez
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  In this paper, we address the challenges of automatic metadata annotation in
+the domain of Galleries, Libraries, Archives, and Museums (GLAMs) by
+introducing a novel dataset, EUFCC340K, collected from the Europeana portal.
+Comprising over 340,000 images, the EUFCC340K dataset is organized across
+multiple facets: Materials, Object Types, Disciplines, and Subjects, following
+a hierarchical structure based on the Art & Architecture Thesaurus (AAT). We
+developed several baseline models, incorporating multiple heads on a ConvNeXT
+backbone for multi-label image tagging on these facets, and fine-tuning a CLIP
+model with our image text pairs. Our experiments to evaluate model robustness
+and generalization capabilities in two different test scenarios demonstrate the
+utility of the dataset in improving multi-label classification tools that have
+the potential to alleviate cataloging tasks in the cultural heritage sector.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>23 pages, 13 figures</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ FedDr+: Stabilizing Dot-regression with Global Feature Distillation for
+  Federated Learning 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.02355v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.02355v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Seongyoon Kim, Minchan Jeong, Sungnyun Kim, Sungwoo Cho, Sumyeong Ahn, Se-Young Yun
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Federated Learning (FL) has emerged as a pivotal framework for the
+development of effective global models (global FL) or personalized models
+(personalized FL) across clients with heterogeneous, non-iid data distribution.
+A key challenge in FL is client drift, where data heterogeneity impedes the
+aggregation of scattered knowledge. Recent studies have tackled the client
+drift issue by identifying significant divergence in the last classifier layer.
+To mitigate this divergence, strategies such as freezing the classifier weights
+and aligning the feature extractor accordingly have proven effective. Although
+the local alignment between classifier and feature extractor has been studied
+as a crucial factor in FL, we observe that it may lead the model to
+overemphasize the observed classes within each client. Thus, our objectives are
+twofold: (1) enhancing local alignment while (2) preserving the representation
+of unseen class samples. This approach aims to effectively integrate knowledge
+from individual clients, thereby improving performance for both global and
+personalized FL. To achieve this, we introduce a novel algorithm named FedDr+,
+which empowers local model alignment using dot-regression loss. FedDr+ freezes
+the classifier as a simplex ETF to align the features and improves aggregated
+global models by employing a feature distillation mechanism to retain
+information about unseen/missing classes. Consequently, we provide empirical
+evidence demonstrating that our algorithm surpasses existing methods that use a
+frozen classifier to boost alignment across the diverse distribution.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ CADE: Cosine Annealing Differential Evolution for Spiking Neural Network 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.02349v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.02349v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Runhua Jiang, Guodong Du, Shuyang Yu, Yifei Guo, Sim Kuan Goh, Ho-Kin Tang
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Spiking neural networks (SNNs) have gained prominence for their potential in
+neuromorphic computing and energy-efficient artificial intelligence, yet
+optimizing them remains a formidable challenge for gradient-based methods due
+to their discrete, spike-based computation. This paper attempts to tackle the
+challenges by introducing Cosine Annealing Differential Evolution (CADE),
+designed to modulate the mutation factor (F) and crossover rate (CR) of
+differential evolution (DE) for the SNN model, i.e., Spiking Element Wise (SEW)
+ResNet. Extensive empirical evaluations were conducted to analyze CADE. CADE
+showed a balance in exploring and exploiting the search space, resulting in
+accelerated convergence and improved accuracy compared to existing
+gradient-based and DE-based methods. Moreover, an initialization method based
+on a transfer learning setting was developed, pretraining on a source dataset
+(i.e., CIFAR-10) and fine-tuning the target dataset (i.e., CIFAR-100), to
+improve population diversity. It was found to further enhance CADE for SNN.
+Remarkably, CADE elevates the performance of the highest accuracy SEW model by
+an additional 0.52 percentage points, underscoring its effectiveness in
+fine-tuning and enhancing SNNs. These findings emphasize the pivotal role of a
+scheduler for F and CR adjustment, especially for DE-based SNN. Source Code on
+Github: https://github.com/Tank-Jiang/CADE4SNN.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Flash Diffusion: Accelerating Any Conditional Diffusion Model for Few
+  Steps Image Generation 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.02347v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.02347v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Clement Chadebec, Onur Tasar, Eyal Benaroche, Benjamin Aubin
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  In this paper, we propose an efficient, fast, and versatile distillation
+method to accelerate the generation of pre-trained diffusion models: Flash
+Diffusion. The method reaches state-of-the-art performances in terms of FID and
+CLIP-Score for few steps image generation on the COCO2014 and COCO2017
+datasets, while requiring only several GPU hours of training and fewer
+trainable parameters than existing methods. In addition to its efficiency, the
+versatility of the method is also exposed across several tasks such as
+text-to-image, inpainting, face-swapping, super-resolution and using different
+backbones such as UNet-based denoisers (SD1.5, SDXL) or DiT (Pixart-$\alpha$),
+as well as adapters. In all cases, the method allowed to reduce drastically the
+number of sampling steps while maintaining very high-quality image generation.
+The official implementation is available at
+https://github.com/gojasper/flash-diffusion.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>16 pages + 16 pages appendices</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Progressive Confident Masking Attention Network for Audio-Visual
+  Segmentation 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.02345v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.02345v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Yuxuan Wang, Feng Dong, Jinchao Zhu
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Audio and visual signals typically occur simultaneously, and humans possess
+an innate ability to correlate and synchronize information from these two
+modalities. Recently, a challenging problem known as Audio-Visual Segmentation
+(AVS) has emerged, intending to produce segmentation maps for sounding objects
+within a scene. However, the methods proposed so far have not sufficiently
+integrated audio and visual information, and the computational costs have been
+extremely high. Additionally, the outputs of different stages have not been
+fully utilized. To facilitate this research, we introduce a novel Progressive
+Confident Masking Attention Network (PMCANet). It leverages attention
+mechanisms to uncover the intrinsic correlations between audio signals and
+visual frames. Furthermore, we design an efficient and effective
+cross-attention module to enhance semantic perception by selecting query
+tokens. This selection is determined through confidence-driven units based on
+the network's multi-stage predictive outputs. Experiments demonstrate that our
+network outperforms other AVS methods while requiring less computational
+resources.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>10 pages, 9 figures, submitted to IEEE TRANSACTIONS ON CIRCUITS AND
+  SYSTEMS FOR VIDEO TECHNOLOGY</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Cluster-Aware Similarity Diffusion for Instance Retrieval 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.02343v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.02343v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Jifei Luo, Hantao Yao, Changsheng Xu
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Diffusion-based re-ranking is a common method used for retrieving instances
+by performing similarity propagation in a nearest neighbor graph. However,
+existing techniques that construct the affinity graph based on pairwise
+instances can lead to the propagation of misinformation from outliers and other
+manifolds, resulting in inaccurate results. To overcome this issue, we propose
+a novel Cluster-Aware Similarity (CAS) diffusion for instance retrieval. The
+primary concept of CAS is to conduct similarity diffusion within local
+clusters, which can reduce the influence from other manifolds explicitly. To
+obtain a symmetrical and smooth similarity matrix, our Bidirectional Similarity
+Diffusion strategy introduces an inverse constraint term to the optimization
+objective of local cluster diffusion. Additionally, we have optimized a
+Neighbor-guided Similarity Smoothing approach to ensure similarity consistency
+among the local neighbors of each instance. Evaluations in instance retrieval
+and object re-identification validate the effectiveness of the proposed CAS,
+our code is publicly available.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Continual Unsupervised Out-of-Distribution Detection 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.02327v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.02327v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Lars Doorenbos, Raphael Sznitman, Pablo Márquez-Neila
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Deep learning models excel when the data distribution during training aligns
+with testing data. Yet, their performance diminishes when faced with
+out-of-distribution (OOD) samples, leading to great interest in the field of
+OOD detection. Current approaches typically assume that OOD samples originate
+from an unconcentrated distribution complementary to the training distribution.
+While this assumption is appropriate in the traditional unsupervised OOD
+(U-OOD) setting, it proves inadequate when considering the place of deployment
+of the underlying deep learning model. To better reflect this real-world
+scenario, we introduce the novel setting of continual U-OOD detection. To
+tackle this new setting, we propose a method that starts from a U-OOD detector,
+which is agnostic to the OOD distribution, and slowly updates during deployment
+to account for the actual OOD distribution. Our method uses a new U-OOD scoring
+function that combines the Mahalanobis distance with a nearest-neighbor
+approach. Furthermore, we design a confidence-scaled few-shot OOD detector that
+outperforms previous methods. We show our method greatly improves upon strong
+baselines from related fields.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Optimised ProPainter for Video Diminished Reality Inpainting 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.02287v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.02287v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Pengze Li, Lihao Liu, Carola-Bibiane Schönlieb, Angelica I Aviles-Rivero
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  In this paper, part of the DREAMING Challenge - Diminished Reality for
+Emerging Applications in Medicine through Inpainting, we introduce a refined
+video inpainting technique optimised from the ProPainter method to meet the
+specialised demands of medical imaging, specifically in the context of oral and
+maxillofacial surgery. Our enhanced algorithm employs the zero-shot ProPainter,
+featuring optimized parameters and pre-processing, to adeptly manage the
+complex task of inpainting surgical video sequences, without requiring any
+training process. It aims to produce temporally coherent and detail-rich
+reconstructions of occluded regions, facilitating clearer views of operative
+fields. The efficacy of our approach is evaluated using comprehensive metrics,
+positioning it as a significant advancement in the application of diminished
+reality for medical purposes.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted to ISBI 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Understanding Retrieval Robustness for Retrieval-Augmented Image
+  Captioning <span class="chip">ACL 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.02265v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.02265v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Wenyan Li, Jiaang Li, Rita Ramos, Raphael Tang, Desmond Elliott
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Recent advancements in retrieval-augmented models for image captioning
+highlight the significance of retrieving related captions for efficient,
+lightweight models with strong domain-transfer capabilities. While these models
+demonstrate the success of retrieval augmentation, retrieval models are still
+far from perfect in practice. Retrieved information can sometimes mislead the
+model generation, negatively impacting performance. In this paper, we analyze
+the robustness of the SmallCap retrieval-augmented captioning model. Our
+analysis shows that SmallCap is sensitive to tokens that appear in the majority
+of the retrieved captions, and integrated gradients attribution shows that
+those tokens are likely copied into the final caption. Given these findings, we
+propose to train the model by sampling retrieved captions from more diverse
+sets. This reduces the probability that the model learns to copy majority
+tokens and improves both in-domain and cross-domain performance effectively.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>9 pages, long paper at ACL 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Image contrast enhancement based on the Schrödinger operator spectrum 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.02264v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.02264v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Juan M. Vargas, Taous-Meriem Laleg-Kirati
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  This study proposes a novel image contrast enhancement method based on image
+projection onto the squared eigenfunctions of the two dimensional Schr\"odinger
+operator. This projection depends on a design parameter
+\texorpdfstring{\(\gamma\)}{gamma} which is proposed to control the pixel
+intensity during image reconstruction. The performance of the proposed method
+is investigated through its application to color images. The selection of
+\texorpdfstring{\(\gamma\)}{gamma} values is performed using k-means, which
+helps preserve the image spatial adjacency information. Furthermore,
+multi-objective optimization using the Non dominated Sorting Genetic Algorithm
+II (NSAG2) algorithm is proposed to select the optimal values of
+\texorpdfstring{\(\gamma\)}{gamma} and the semi-classical parameter h from the
+2DSCSA. The results demonstrate the effectiveness of the proposed method for
+enhancing image contrast while preserving the inherent characteristics of the
+original image, producing the desired enhancement with almost no artifacts.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ M3DM-NR: RGB-3D Noisy-Resistant Industrial Anomaly Detection via
+  Multimodal Denoising 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.02263v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.02263v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Chengjie Wang, Haokun Zhu, Jinlong Peng, Yue Wang, Ran Yi, Yunsheng Wu, Lizhuang Ma, Jiangning Zhang
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Existing industrial anomaly detection methods primarily concentrate on
+unsupervised learning with pristine RGB images. Yet, both RGB and 3D data are
+crucial for anomaly detection, and the datasets are seldom completely clean in
+practical scenarios. To address above challenges, this paper initially delves
+into the RGB-3D multi-modal noisy anomaly detection, proposing a novel
+noise-resistant M3DM-NR framework to leveraging strong multi-modal
+discriminative capabilities of CLIP. M3DM-NR consists of three stages: Stage-I
+introduces the Suspected References Selection module to filter a few normal
+samples from the training dataset, using the multimodal features extracted by
+the Initial Feature Extraction, and a Suspected Anomaly Map Computation module
+to generate a suspected anomaly map to focus on abnormal regions as reference.
+Stage-II uses the suspected anomaly maps of the reference samples as reference,
+and inputs image, point cloud, and text information to achieve denoising of the
+training samples through intra-modal comparison and multi-scale aggregation
+operations. Finally, Stage-III proposes the Point Feature Alignment,
+Unsupervised Feature Fusion, Noise Discriminative Coreset Selection, and
+Decision Layer Fusion modules to learn the pattern of the training dataset,
+enabling anomaly detection and segmentation while filtering out noise.
+Extensive experiments show that M3DM-NR outperforms state-of-the-art methods in
+3D-RGB multi-modal noisy anomaly detection.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ PuFace: Defending against Facial Cloaking Attacks for Facial Recognition
+  Models 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.02253v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.02253v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Jing Wen
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  The recently proposed facial cloaking attacks add invisible perturbation
+(cloaks) to facial images to protect users from being recognized by
+unauthorized facial recognition models. However, we show that the "cloaks" are
+not robust enough and can be removed from images.
+  This paper introduces PuFace, an image purification system leveraging the
+generalization ability of neural networks to diminish the impact of cloaks by
+pushing the cloaked images towards the manifold of natural (uncloaked) images
+before the training process of facial recognition models. Specifically, we
+devise a purifier that takes all the training images including both cloaked and
+natural images as input and generates the purified facial images close to the
+manifold where natural images lie. To meet the defense goal, we propose to
+train the purifier on particularly amplified cloaked images with a loss
+function that combines image loss and feature loss. Our empirical experiment
+shows PuFace can effectively defend against two state-of-the-art facial
+cloaking attacks and reduces the attack success rate from 69.84\% to 7.61\% on
+average without degrading the normal accuracy for various facial recognition
+models. Moreover, PuFace is a model-agnostic defense mechanism that can be
+applied to any facial recognition model without modifying the model structure.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ I4VGen: Image as Stepping Stone for Text-to-Video Generation 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.02230v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.02230v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Xiefan Guo, Jinlin Liu, Miaomiao Cui, Di Huang
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Text-to-video generation has lagged behind text-to-image synthesis in quality
+and diversity due to the complexity of spatio-temporal modeling and limited
+video-text datasets. This paper presents I4VGen, a training-free and
+plug-and-play video diffusion inference framework, which enhances text-to-video
+generation by leveraging robust image techniques. Specifically, following
+text-to-image-to-video, I4VGen decomposes the text-to-video generation into two
+stages: anchor image synthesis and anchor image-guided video synthesis.
+Correspondingly, a well-designed generation-selection pipeline is employed to
+achieve visually-realistic and semantically-faithful anchor image, and an
+innovative Noise-Invariant Video Score Distillation Sampling is incorporated to
+animate the image to a dynamic video, followed by a video regeneration process
+to refine the video. This inference strategy effectively mitigates the
+prevalent issue of non-zero terminal signal-to-noise ratio. Extensive
+evaluations show that I4VGen not only produces videos with higher visual
+realism and textual fidelity but also integrates seamlessly into existing
+image-to-video diffusion models, thereby improving overall video quality.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Project page: https://xiefan-guo.github.io/i4vgen</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ SMCL: Saliency Masked Contrastive Learning for Long-tailed Recognition <span class="chip">ICASSP 2023</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.02223v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.02223v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Sanglee Park, Seung-won Hwang, Jungmin So
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Real-world data often follow a long-tailed distribution with a high imbalance
+in the number of samples between classes. The problem with training from
+imbalanced data is that some background features, common to all classes, can be
+unobserved in classes with scarce samples. As a result, this background
+correlates to biased predictions into ``major" classes. In this paper, we
+propose saliency masked contrastive learning, a new method that uses saliency
+masking and contrastive learning to mitigate the problem and improve the
+generalizability of a model. Our key idea is to mask the important part of an
+image using saliency detection and use contrastive learning to move the masked
+image towards minor classes in the feature space, so that background features
+present in the masked image are no longer correlated with the original class.
+Experiment results show that our method achieves state-of-the-art level
+performance on benchmark long-tailed datasets.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>accepted at ICASSP 2023</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Why Only Text: Empowering Vision-and-Language Navigation with
+  Multi-modal <span class="highlight-title">Prompt</span>s <span class="chip">IJCAI 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.02208v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.02208v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Haodong Hong, Sen Wang, Zi Huang, Qi Wu, Jiajun Liu
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Current Vision-and-Language Navigation (VLN) tasks mainly employ textual
+instructions to guide agents. However, being inherently abstract, the same
+textual instruction can be associated with different visual signals, causing
+severe ambiguity and limiting the transfer of prior knowledge in the vision
+domain from the user to the agent. To fill this gap, we propose
+Vision-and-Language Navigation with Multi-modal Prompts (VLN-MP), a novel task
+augmenting traditional VLN by integrating both natural language and images in
+instructions. VLN-MP not only maintains backward compatibility by effectively
+handling text-only prompts but also consistently shows advantages with
+different quantities and relevance of visual prompts. Possible forms of visual
+prompts include both exact and similar object images, providing adaptability
+and versatility in diverse navigation scenarios. To evaluate VLN-MP under a
+unified framework, we implement a new benchmark that offers: (1) a
+training-free pipeline to transform textual instructions into multi-modal forms
+with landmark images; (2) diverse datasets with multi-modal instructions for
+different downstream tasks; (3) a novel module designed to process various
+image prompts for seamless integration with state-of-the-art VLN models.
+Extensive experiments on four VLN benchmarks (R2R, RxR, REVERIE, CVDN) show
+that incorporating visual prompts significantly boosts navigation performance.
+While maintaining efficiency with text-only prompts, VLN-MP enables agents to
+navigate in the pre-explore setting and outperform text-based models, showing
+its broader applicability.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>IJCAI 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Can CLIP help CLIP in learning 3D? 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.02202v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.02202v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Cristian Sbrolli, Matteo Matteucci
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  In this study, we explore an alternative approach to enhance contrastive
+text-image-3D alignment in the absence of textual descriptions for 3D objects.
+We introduce two unsupervised methods, $I2I$ and $(I2L)^2$, which leverage CLIP
+knowledge about textual and 2D data to compute the neural perceived similarity
+between two 3D samples. We employ the proposed methods to mine 3D hard
+negatives, establishing a multimodal contrastive pipeline with hard negative
+weighting via a custom loss function. We train on different configurations of
+the proposed hard negative mining approach, and we evaluate the accuracy of our
+models in 3D classification and on the cross-modal retrieval benchmark, testing
+image-to-shape and shape-to-image retrieval. Results demonstrate that our
+approach, even without explicit text alignment, achieves comparable or superior
+performance on zero-shot and standard 3D classification, while significantly
+improving both image-to-shape and shape-to-image retrieval compared to previous
+methods.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ GraVITON: Graph based garment warping with attention guided inversion
+  for Virtual-tryon 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.02184v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.02184v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Sanhita Pathak, Vinay Kaushik, Brejesh Lall
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Virtual try-on, a rapidly evolving field in computer vision, is transforming
+e-commerce by improving customer experiences through precise garment warping
+and seamless integration onto the human body. While existing methods such as
+TPS and flow address the garment warping but overlook the finer contextual
+details. In this paper, we introduce a novel graph based warping technique
+which emphasizes the value of context in garment flow. Our graph based warping
+module generates warped garment as well as a coarse person image, which is
+utilised by a simple refinement network to give a coarse virtual tryon image.
+The proposed work exploits latent diffusion model to generate the final tryon,
+treating garment transfer as an inpainting task. The diffusion model is
+conditioned with decoupled cross attention based inversion of visual and
+textual information. We introduce an occlusion aware warping constraint that
+generates dense warped garment, without any holes and occlusion. Our method,
+validated on VITON-HD and Dresscode datasets, showcases substantial
+state-of-the-art qualitative and quantitative results showing considerable
+improvement in garment warping, texture preservation, and overall realism.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>18 pages, 7 Figures and 6 Tables</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Radar Spectra-Language Model for Automotive Scene Parsing 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.02158v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.02158v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Mariia Pushkareva, Yuri Feldman, Csaba Domokos, Kilian Rambach, Dotan Di Castro
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Radar sensors are low cost, long-range, and weather-resilient. Therefore,
+they are widely used for driver assistance functions, and are expected to be
+crucial for the success of autonomous driving in the future. In many perception
+tasks only pre-processed radar point clouds are considered. In contrast, radar
+spectra are a raw form of radar measurements and contain more information than
+radar point clouds. However, radar spectra are rather difficult to interpret.
+In this work, we aim to explore the semantic information contained in spectra
+in the context of automated driving, thereby moving towards better
+interpretability of radar spectra. To this end, we create a radar
+spectra-language model, allowing us to query radar spectra measurements for the
+presence of scene elements using free text. We overcome the scarcity of radar
+spectra data by matching the embedding space of an existing vision-language
+model (VLM). Finally, we explore the benefit of the learned representation for
+scene parsing, and obtain improvements in free space segmentation and object
+detection merely by injecting the spectra embedding into a baseline model.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Analyzing the Feature Extractor Networks for Face Image Synthesis 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.02153v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.02153v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Erdi Sarıtaş, Hazım Kemal Ekenel
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Advancements like Generative Adversarial Networks have attracted the
+attention of researchers toward face image synthesis to generate ever more
+realistic images. Thereby, the need for the evaluation criteria to assess the
+realism of the generated images has become apparent. While FID utilized with
+InceptionV3 is one of the primary choices for benchmarking, concerns about
+InceptionV3's limitations for face images have emerged. This study investigates
+the behavior of diverse feature extractors -- InceptionV3, CLIP, DINOv2, and
+ArcFace -- considering a variety of metrics -- FID, KID, Precision\&Recall.
+While the FFHQ dataset is used as the target domain, as the source domains, the
+CelebA-HQ dataset and the synthetic datasets generated using StyleGAN2 and
+Projected FastGAN are used. Experiments include deep-down analysis of the
+features: $L_2$ normalization, model attention during extraction, and domain
+distributions in the feature space. We aim to give valuable insights into the
+behavior of feature extractors for evaluating face image synthesis
+methodologies. The code is publicly available at
+https://github.com/ThEnded32/AnalyzingFeatureExtractors.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted at 18th International Conference on Automatic Face and
+  Gesture Recognition (FG) on 1st SD-FGA Workshop 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ UA-Track: Uncertainty-Aware End-to-End 3D Multi-Object Tracking 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.02147v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.02147v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Lijun Zhou, Tao Tang, Pengkun Hao, Zihang He, Kalok Ho, Shuo Gu, Wenbo Hou, Zhihui Hao, Haiyang Sun, Kun Zhan, Peng Jia, Xianpeng Lang, Xiaodan Liang
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  3D multiple object tracking (MOT) plays a crucial role in autonomous driving
+perception. Recent end-to-end query-based trackers simultaneously detect and
+track objects, which have shown promising potential for the 3D MOT task.
+However, existing methods overlook the uncertainty issue, which refers to the
+lack of precise confidence about the state and location of tracked objects.
+Uncertainty arises owing to various factors during motion observation by
+cameras, especially occlusions and the small size of target objects, resulting
+in an inaccurate estimation of the object's position, label, and identity. To
+this end, we propose an Uncertainty-Aware 3D MOT framework, UA-Track, which
+tackles the uncertainty problem from multiple aspects. Specifically, we first
+introduce an Uncertainty-aware Probabilistic Decoder to capture the uncertainty
+in object prediction with probabilistic attention. Secondly, we propose an
+Uncertainty-guided Query Denoising strategy to further enhance the training
+process. We also utilize Uncertainty-reduced Query Initialization, which
+leverages predicted 2D object location and depth information to reduce query
+uncertainty. As a result, our UA-Track achieves state-of-the-art performance on
+the nuScenes benchmark, i.e., 66.3% AMOTA on the test split, surpassing the
+previous best end-to-end solution by a significant margin of 8.9% AMOTA.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Analyzing the Effect of Combined Degradations on Face Recognition 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.02142v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.02142v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Erdi Sarıtaş, Hazım Kemal Ekenel
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  A face recognition model is typically trained on large datasets of images
+that may be collected from controlled environments. This results in performance
+discrepancies when applied to real-world scenarios due to the domain gap
+between clean and in-the-wild images. Therefore, some researchers have
+investigated the robustness of these models by analyzing synthetic
+degradations. Yet, existing studies have mostly focused on single degradation
+factors, which may not fully capture the complexity of real-world degradations.
+This work addresses this problem by analyzing the impact of both single and
+combined degradations using a real-world degradation pipeline extended with
+under/over-exposure conditions. We use the LFW dataset for our experiments and
+assess the model's performance based on verification accuracy. Results reveal
+that single and combined degradations show dissimilar model behavior. The
+combined effect of degradation significantly lowers performance even if its
+single effect is negligible. This work emphasizes the importance of accounting
+for real-world complexity to assess the robustness of face recognition models
+in real-world settings. The code is publicly available at
+https://github.com/ThEnded32/AnalyzingCombinedDegradations.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted at 18th International Conference on Automatic Face and
+  Gesture Recognition (FG) on 2nd PrivAAL Workshop 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Domain Game: Disentangle Anatomical Feature for Single Domain
+  Generalized Segmentation 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.02125v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.02125v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Hao Chen, Hongrun Zhang, U Wang Chan, Rui Yin, Xiaofei Wang, Chao Li
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Single domain generalization aims to address the challenge of
+out-of-distribution generalization problem with only one source domain
+available. Feature distanglement is a classic solution to this purpose, where
+the extracted task-related feature is presumed to be resilient to domain shift.
+However, the absence of references from other domains in a single-domain
+scenario poses significant uncertainty in feature disentanglement
+(ill-posedness). In this paper, we propose a new framework, named
+\textit{Domain Game}, to perform better feature distangling for medical image
+segmentation, based on the observation that diagnostic relevant features are
+more sensitive to geometric transformations, whilist domain-specific features
+probably will remain invariant to such operations. In domain game, a set of
+randomly transformed images derived from a singular source image is
+strategically encoded into two separate feature sets to represent diagnostic
+features and domain-specific features, respectively, and we apply forces to
+pull or repel them in the feature space, accordingly. Results from cross-site
+test domain evaluation showcase approximately an ~11.8% performance boost in
+prostate segmentation and around ~10.5% in brain tumor segmentation compared to
+the second-best method.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Multi-target stain normalization for histology slides 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.02077v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.02077v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Desislav Ivanov, Carlo Alberto Barbano, Marco Grangetto
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Traditional staining normalization approaches, e.g. Macenko, typically rely
+on the choice of a single representative reference image, which may not
+adequately account for the diverse staining patterns of datasets collected in
+practical scenarios. In this study, we introduce a novel approach that
+leverages multiple reference images to enhance robustness against stain
+variation. Our method is parameter-free and can be adopted in existing
+computational pathology pipelines with no significant changes. We evaluate the
+effectiveness of our method through experiments using a deep-learning pipeline
+for automatic nuclei segmentation on colorectal images. Our results show that
+by leveraging multiple reference images, better results can be achieved when
+generalizing to external data, where the staining can widely differ from the
+training set.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ FaceCom: Towards High-fidelity 3D Facial Shape Completion via
+  Optimization and Inpainting Guidance <span class="chip">CVPR2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.02074v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.02074v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Yinglong Li, Hongyu Wu, Xiaogang Wang, Qingzhao Qin, Yijiao Zhao, Yong wang, Aimin Hao
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  We propose FaceCom, a method for 3D facial shape completion, which delivers
+high-fidelity results for incomplete facial inputs of arbitrary forms. Unlike
+end-to-end shape completion methods based on point clouds or voxels, our
+approach relies on a mesh-based generative network that is easy to optimize,
+enabling it to handle shape completion for irregular facial scans. We first
+train a shape generator on a mixed 3D facial dataset containing 2405
+identities. Based on the incomplete facial input, we fit complete faces using
+an optimization approach under image inpainting guidance. The completion
+results are refined through a post-processing step. FaceCom demonstrates the
+ability to effectively and naturally complete facial scan data with varying
+missing regions and degrees of missing areas. Our method can be used in medical
+prosthetic fabrication and the registration of deficient scanning data. Our
+experimental results demonstrate that FaceCom achieves exceptional performance
+in fitting and shape completion tasks. The code is available at
+https://github.com/dragonylee/FaceCom.git.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>accepted to CVPR2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Advancing Generalized Transfer Attack with Initialization Derived
+  Bilevel Optimization and Dynamic Sequence Truncation <span class="chip">IJCAI 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.02064v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.02064v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Yaohua Liu, Jiaxin Gao, Xuan Liu, Xianghao Jiao, Xin Fan, Risheng Liu
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Transfer attacks generate significant interest for real-world black-box
+applications by crafting transferable adversarial examples through surrogate
+models. Whereas, existing works essentially directly optimize the single-level
+objective w.r.t. the surrogate model, which always leads to poor
+interpretability of attack mechanism and limited generalization performance
+over unknown victim models. In this work, we propose the
+\textbf{B}il\textbf{E}vel \textbf{T}ransfer \textbf{A}ttac\textbf{K} (BETAK)
+framework by establishing an initialization derived bilevel optimization
+paradigm, which explicitly reformulates the nested constraint relationship
+between the Upper-Level (UL) pseudo-victim attacker and the Lower-Level (LL)
+surrogate attacker. Algorithmically, we introduce the Hyper Gradient Response
+(HGR) estimation as an effective feedback for the transferability over
+pseudo-victim attackers, and propose the Dynamic Sequence Truncation (DST)
+technique to dynamically adjust the back-propagation path for HGR and reduce
+computational overhead simultaneously. Meanwhile, we conduct detailed
+algorithmic analysis and provide convergence guarantee to support non-convexity
+of the LL surrogate attacker. Extensive evaluations demonstrate substantial
+improvement of BETAK (e.g., $\mathbf{53.41}$\% increase of attack success rates
+against IncRes-v$2_{ens}$) against different victims and defense methods in
+targeted and untargeted attack scenarios. The source code is available at
+https://github.com/callous-youth/BETAK.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted by IJCAI 2024. 10 pages</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ OpenGaussian: Towards Point-Level 3D Gaussian-based Open Vocabulary
+  Understanding 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.02058v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.02058v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Yanmin Wu, Jiarui Meng, Haijie Li, Chenming Wu, Yahao Shi, Xinhua Cheng, Chen Zhao, Haocheng Feng, Errui Ding, Jingdong Wang, Jian Zhang
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  This paper introduces OpenGaussian, a method based on 3D Gaussian Splatting
+(3DGS) capable of 3D point-level open vocabulary understanding. Our primary
+motivation stems from observing that existing 3DGS-based open vocabulary
+methods mainly focus on 2D pixel-level parsing. These methods struggle with 3D
+point-level tasks due to weak feature expressiveness and inaccurate 2D-3D
+feature associations. To ensure robust feature presentation and 3D point-level
+understanding, we first employ SAM masks without cross-frame associations to
+train instance features with 3D consistency. These features exhibit both
+intra-object consistency and inter-object distinction. Then, we propose a
+two-stage codebook to discretize these features from coarse to fine levels. At
+the coarse level, we consider the positional information of 3D points to
+achieve location-based clustering, which is then refined at the fine level.
+Finally, we introduce an instance-level 3D-2D feature association method that
+links 3D points to 2D masks, which are further associated with 2D CLIP
+features. Extensive experiments, including open vocabulary-based 3D object
+selection, 3D point cloud understanding, click-based 3D object selection, and
+ablation studies, demonstrate the effectiveness of our proposed method. Project
+page: https://3d-aigc.github.io/OpenGaussian
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>technical report, 15 pages</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Leveraging Predicate and Triplet Learning for Scene Graph Generation <span class="chip">CVPR 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.02038v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.02038v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Jiankai Li, Yunhong Wang, Xiefan Guo, Ruijie Yang, Weixin Li
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Scene Graph Generation (SGG) aims to identify entities and predict the
+relationship triplets \textit{\textless subject, predicate, object\textgreater
+} in visual scenes. Given the prevalence of large visual variations of
+subject-object pairs even in the same predicate, it can be quite challenging to
+model and refine predicate representations directly across such pairs, which is
+however a common strategy adopted by most existing SGG methods. We observe that
+visual variations within the identical triplet are relatively small and certain
+relation cues are shared in the same type of triplet, which can potentially
+facilitate the relation learning in SGG. Moreover, for the long-tail problem
+widely studied in SGG task, it is also crucial to deal with the limited types
+and quantity of triplets in tail predicates. Accordingly, in this paper, we
+propose a Dual-granularity Relation Modeling (DRM) network to leverage
+fine-grained triplet cues besides the coarse-grained predicate ones. DRM
+utilizes contexts and semantics of predicate and triplet with Dual-granularity
+Constraints, generating compact and balanced representations from two
+perspectives to facilitate relation recognition. Furthermore, a
+Dual-granularity Knowledge Transfer (DKT) strategy is introduced to transfer
+variation from head predicates/triplets to tail ones, aiming to enrich the
+pattern diversity of tail classes to alleviate the long-tail problem. Extensive
+experiments demonstrate the effectiveness of our method, which establishes new
+state-of-the-art performance on Visual Genome, Open Image, and GQA datasets.
+Our code is available at \url{https://github.com/jkli1998/DRM}
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>CVPR 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Multi-Scale Direction-Aware Network for Infrared Small Target Detection 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.02037v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.02037v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Jinmiao Zhao, Zelin Shi, Chuang Yu, Yunpeng Liu
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Infrared small target detection faces the problem that it is difficult to
+effectively separate the background and the target. Existing deep
+learning-based methods focus on appearance features and ignore high-frequency
+directional features. Therefore, we propose a multi-scale direction-aware
+network (MSDA-Net), which is the first attempt to integrate the high-frequency
+directional features of infrared small targets as domain prior knowledge into
+neural networks. Specifically, an innovative multi-directional feature
+awareness (MDFA) module is constructed, which fully utilizes the prior
+knowledge of targets and emphasizes the focus on high-frequency directional
+features. On this basis, combined with the multi-scale local relation learning
+(MLRL) module, a multi-scale direction-aware (MSDA) module is further
+constructed. The MSDA module promotes the full extraction of local relations at
+different scales and the full perception of key features in different
+directions. Meanwhile, a high-frequency direction injection (HFDI) module
+without training parameters is constructed to inject the high-frequency
+directional information of the original image into the network. This helps
+guide the network to pay attention to detailed information such as target edges
+and shapes. In addition, we propose a feature aggregation (FA) structure that
+aggregates multi-level features to solve the problem of small targets
+disappearing in deep feature maps. Furthermore, a lightweight feature alignment
+fusion (FAF) module is constructed, which can effectively alleviate the pixel
+offset existing in multi-level feature map fusion. Extensive experimental
+results show that our MSDA-Net achieves state-of-the-art (SOTA) results on the
+public NUDT-SIRST, SIRST and IRSTD-1k datasets.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Inference Attacks in Machine Learning as a Service: A Taxonomy, <span class="highlight-title">Review</span>,
+  and Promising Directions 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.02027v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.02027v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Feng Wu, Lei Cui, Shaowen Yao, Shui Yu
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  The prosperity of machine learning has also brought people's concerns about
+data privacy. Among them, inference attacks can implement privacy breaches in
+various MLaaS scenarios and model training/prediction phases. Specifically,
+inference attacks can perform privacy inference on undisclosed target training
+sets based on outputs of the target model, including but not limited to
+statistics, membership, semantics, data representation, etc. For instance,
+infer whether the target data has the characteristics of AIDS. In addition, the
+rapid development of the machine learning community in recent years, especially
+the surge of model types and application scenarios, has further stimulated the
+inference attacks' research. Thus, studying inference attacks and analyzing
+them in depth is urgent and significant. However, there is still a gap in the
+systematic discussion of inference attacks from taxonomy, global perspective,
+attack, and defense perspectives. This survey provides an in-depth and
+comprehensive inference of attacks and corresponding countermeasures in
+ML-as-a-service based on taxonomy and the latest researches. Without
+compromising researchers' intuition, we first propose the 3MP taxonomy based on
+the community research status, trying to normalize the confusing naming system
+of inference attacks. Also, we analyze the pros and cons of each type of
+inference attack, their workflow, countermeasure, and how they interact with
+other attacks. In the end, we point out several promising directions for
+researchers from a more comprehensive and novel perspective.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ MetaMixer Is All You Need 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.02021v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.02021v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Seokju Yun, Dongheon Lee, Youngmin Ro
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Transformer, composed of self-attention and Feed-Forward Network, has
+revolutionized the landscape of network design across various vision tasks. FFN
+is a versatile operator seamlessly integrated into nearly all AI models to
+effectively harness rich representations. Recent works also show that FFN
+functions like key-value memories. Thus, akin to the query-key-value mechanism
+within self-attention, FFN can be viewed as a memory network, where the input
+serves as query and the two projection weights operate as keys and values,
+respectively. We hypothesize that the importance lies in query-key-value
+framework itself rather than in self-attention. To verify this, we propose
+converting self-attention into a more FFN-like efficient token mixer with only
+convolutions while retaining query-key-value framework, namely FFNification.
+Specifically, FFNification replaces query-key and attention coefficient-value
+interactions with large kernel convolutions and adopts GELU activation function
+instead of softmax. The derived token mixer, FFNified attention, serves as
+key-value memories for detecting locally distributed spatial patterns, and
+operates in the opposite dimension to the ConvNeXt block within each
+corresponding sub-operation of the query-key-value framework. Building upon the
+above two modules, we present a family of Fast-Forward Networks. Our FFNet
+achieves remarkable performance improvements over previous state-of-the-art
+methods across a wide range of tasks. The strong and general performance of our
+proposed method validates our hypothesis and leads us to introduce MetaMixer, a
+general mixer architecture that does not specify sub-operations within the
+query-key-value framework. We show that using only simple operations like
+convolution and GELU in the MetaMixer can achieve superior performance.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Code: https://github.com/ysj9909/FFNet</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Bayesian Mesh Optimization for Graph Neural Networks to Enhance
+  Engineering Performance Prediction 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.01996v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.01996v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Jangseop Park, Namwoo Kang
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  In engineering design, surrogate models are widely employed to replace
+computationally expensive simulations by leveraging design variables and
+geometric parameters from computer-aided design (CAD) models. However, these
+models often lose critical information when simplified to lower dimensions and
+face challenges in parameter definition, especially with the complex 3D shapes
+commonly found in industrial datasets. To address these limitations, we propose
+a Bayesian graph neural network (GNN) framework for a 3D deep-learning-based
+surrogate model that predicts engineering performance by directly learning
+geometric features from CAD using mesh representation. Our framework determines
+the optimal size of mesh elements through Bayesian optimization, resulting in a
+high-accuracy surrogate model. Additionally, it effectively handles the
+irregular and complex structures of 3D CADs, which differ significantly from
+the regular and uniform pixel structures of 2D images typically used in deep
+learning. Experimental results demonstrate that the quality of the mesh
+significantly impacts the prediction accuracy of the surrogate model, with an
+optimally sized mesh achieving superior performance. We compare the performance
+of models based on various 3D representations such as voxel, point cloud, and
+graph, and evaluate the computational costs of Monte Carlo simulation and
+Bayesian optimization methods to find the optimal mesh size. We anticipate that
+our proposed framework has the potential to be applied to mesh-based
+simulations across various engineering fields, leveraging physics-based
+information commonly used in computer-aided engineering.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>17 pages, 8 figures, 3 tables</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ 3D Imaging of Complex Specular Surfaces by Fusing Polarimetric and
+  Deflectometric Information 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.01994v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.01994v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Jiazhang Wang, Oliver Cossairt, Florian Willomitzer
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Accurate and fast 3D imaging of specular surfaces still poses major
+challenges for state-of-the-art optical measurement principles. Frequently used
+methods, such as phase-measuring deflectometry (PMD) or shape-from-polarization
+(SfP), rely on strong assumptions about the measured objects, limiting their
+generalizability in broader application areas like medical imaging, industrial
+inspection, virtual reality, or cultural heritage analysis. In this paper, we
+introduce a measurement principle that utilizes a novel technique to
+effectively encode and decode the information contained in a light field
+reflected off a specular surface. We combine polarization cues from SfP with
+geometric information obtained from PMD to resolve all arising ambiguities in
+the 3D measurement. Moreover, our approach removes the unrealistic orthographic
+imaging assumption for SfP, which significantly improves the respective
+results. We showcase our new technique by demonstrating single-shot and
+multi-shot measurements on complex-shaped specular surfaces, displaying an
+evaluated accuracy of surface normals below $0.6^\circ$.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Choroidal Vessel Segmentation on Indocyanine Green Angiography Images
+  via Human-in-the-Loop Labeling 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.01993v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.01993v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Ruoyu Chen, Ziwei Zhao, Mayinuer Yusufu, Xianwen Shang, Danli Shi, Mingguang He
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Human-in-the-loop (HITL) strategy has been recently introduced into the field
+of medical image processing. Indocyanine green angiography (ICGA) stands as a
+well-established examination for visualizing choroidal vasculature and
+detecting chorioretinal diseases. However, the intricate nature of choroidal
+vascular networks makes large-scale manual segmentation of ICGA images
+challenging. Thus, the study aims to develop a high-precision choroidal vessel
+segmentation model with limited labor using HITL framework. We utilized a
+multi-source ICGA dataset, including 55 degree view and ultra-widefield ICGA
+(UWF-ICGA) images for model development. The choroidal vessel network was
+pre-segmented by a pre-trained vessel segmentation model, and then manually
+modified by two ophthalmologists. Choroidal vascular diameter, density,
+complexity, tortuosity, and branching angle were automatically quantified based
+on the segmentation. We finally conducted four cycles of HITL. One hundred and
+fifty 55 degree view ICGA images were used for the first three cycles (50
+images per cycle), and twenty UWF-ICGA images for the last cycle. The average
+time needed to manually correct a pre-segmented ICGA image per cycle reduced
+from 20 minutes to 1 minute. High segmentation accuracy has been achieved on
+both 55 degree view ICGA and UWF-ICGA images. Additionally, the
+multi-dimensional choroidal vascular parameters were significantly associated
+with various chorioretinal diseases. Our study not only demonstrated the
+feasibility of the HITL strategy in improving segmentation performance with
+reduced manual labeling, but also innovatively introduced several risk
+predictors for choroidal abnormalities.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>25 pages,4 figures</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Dealing with All-stage Missing Modality: Towards A Universal Model with
+  Robust Reconstruction and Personalization 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.01987v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.01987v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Yunpeng Zhao, Cheng Chen, Qing You Pang, Quanzheng Li, Carol Tang, Beng-Ti Ang, Yueming Jin
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Addressing missing modalities presents a critical challenge in multimodal
+learning. Current approaches focus on developing models that can handle
+modality-incomplete inputs during inference, assuming that the full set of
+modalities are available for all the data during training. This reliance on
+full-modality data for training limits the use of abundant modality-incomplete
+samples that are often encountered in practical settings. In this paper, we
+propose a robust universal model with modality reconstruction and model
+personalization, which can effectively tackle the missing modality at both
+training and testing stages. Our method leverages a multimodal masked
+autoencoder to reconstruct the missing modality and masked patches
+simultaneously, incorporating an innovative distribution approximation
+mechanism to fully utilize both modality-complete and modality-incomplete data.
+The reconstructed modalities then contributes to our designed data-model
+co-distillation scheme to guide the model learning in the presence of missing
+modalities. Moreover, we propose a CLIP-driven hyper-network to personalize
+partial model parameters, enabling the model to adapt to each distinct missing
+modality scenario. Our method has been extensively validated on two brain tumor
+segmentation benchmarks. Experimental results demonstrate the promising
+performance of our method, which consistently exceeds previous state-of-the-art
+approaches under the all-stage missing modality settings with different missing
+ratios. Code will be available.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Can Dense Connectivity Benefit Outlier Detection? An Odyssey with NAS 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.01975v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.01975v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Hao Fu, Tunhou Zhang, Hai Li, Yiran Chen
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Recent advances in Out-of-Distribution (OOD) Detection is the driving force
+behind safe and reliable deployment of Convolutional Neural Networks (CNNs) in
+real world applications. However, existing studies focus on OOD detection
+through confidence score and deep generative model-based methods, without
+considering the impact of DNN structures, especially dense connectivity in
+architecture fabrications. In addition, existing outlier detection approaches
+exhibit high variance in generalization performance, lacking stability and
+confidence in evaluating and ranking different outlier detectors. In this work,
+we propose a novel paradigm, Dense Connectivity Search of Outlier Detector
+(DCSOD), that automatically explore the dense connectivity of CNN architectures
+on near-OOD detection task using Neural Architecture Search (NAS). We introduce
+a hierarchical search space containing versatile convolution operators and
+dense connectivity, allowing a flexible exploration of CNN architectures with
+diverse connectivity patterns. To improve the quality of evaluation on OOD
+detection during search, we propose evolving distillation based on our
+multi-view feature learning explanation. Evolving distillation stabilizes
+training for OOD detection evaluation, thus improves the quality of search. We
+thoroughly examine DCSOD on CIFAR benchmarks under OOD detection protocol.
+Experimental results show that DCSOD achieve remarkable performance over widely
+used architectures and previous NAS baselines. Notably, DCSOD achieves
+state-of-the-art (SOTA) performance on CIFAR benchmark, with AUROC improvement
+of $\sim$1.0%.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ The Crystal Ball Hypothesis in diffusion models: Anticipating object
+  positions from initial noise 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.01970v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.01970v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Yuanhao Ban, Ruochen Wang, Tianyi Zhou, Boqing Gong, Cho-Jui Hsieh, Minhao Cheng
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Diffusion models have achieved remarkable success in text-to-image generation
+tasks; however, the role of initial noise has been rarely explored. In this
+study, we identify specific regions within the initial noise image, termed
+trigger patches, that play a key role for object generation in the resulting
+images. Notably, these patches are ``universal'' and can be generalized across
+various positions, seeds, and prompts. To be specific, extracting these patches
+from one noise and injecting them into another noise leads to object generation
+in targeted areas. We identify these patches by analyzing the dispersion of
+object bounding boxes across generated images, leading to the development of a
+posterior analysis technique. Furthermore, we create a dataset consisting of
+Gaussian noises labeled with bounding boxes corresponding to the objects
+appearing in the generated images and train a detector that identifies these
+patches from the initial noise. To explain the formation of these patches, we
+reveal that they are outliers in Gaussian noise, and follow distinct
+distributions through two-sample tests. Finally, we find the misalignment
+between prompts and the trigger patch patterns can result in unsuccessful image
+generations. The study proposes a reject-sampling strategy to obtain optimal
+noise, aiming to improve prompt adherence and positional diversity in image
+generation.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Exploring Real World Map Change Generalization of Prior-Informed HD Map
+  Prediction Models <span class="chip">CVPR 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.01961v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.01961v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Samuel M. Bateman, Ning Xu, H. Charles Zhao, Yael Ben Shalom, Vince Gong, Greg Long, Will Maddern
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Building and maintaining High-Definition (HD) maps represents a large barrier
+to autonomous vehicle deployment. This, along with advances in modern online
+map detection models, has sparked renewed interest in the online mapping
+problem. However, effectively predicting online maps at a high enough quality
+to enable safe, driverless deployments remains a significant challenge. Recent
+work on these models proposes training robust online mapping systems using low
+quality map priors with synthetic perturbations in an attempt to simulate
+out-of-date HD map priors. In this paper, we investigate how models trained on
+these synthetically perturbed map priors generalize to performance on
+deployment-scale, real world map changes. We present a large-scale experimental
+study to determine which synthetic perturbations are most useful in
+generalizing to real world HD map changes, evaluated using multiple years of
+real-world autonomous driving data. We show there is still a substantial
+sim2real gap between synthetic prior perturbations and observed real-world
+changes, which limits the utility of current prior-informed HD map prediction
+models.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted to CVPR 2024, Workshop on Autonomous Driving</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Enhance Image-to-Image Generation with LLaVA <span class="highlight-title">Prompt</span> and Negative <span class="highlight-title">Prompt</span> 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.01956v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.01956v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Zhicheng Ding, Panfeng Li, Qikai Yang, Siyang Li
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  This paper presents a novel approach to enhance image-to-image generation by
+leveraging the multimodal capabilities of the Large Language and Vision
+Assistant (LLaVA). We propose a framework where LLaVA analyzes input images and
+generates textual descriptions, hereinafter LLaVA-generated prompts. These
+prompts, along with the original image, are fed into the image-to-image
+generation pipeline. This enriched representation guides the generation process
+towards outputs that exhibit a stronger resemblance to the input image.
+Extensive experiments demonstrate the effectiveness of LLaVA-generated prompts
+in promoting image similarity. We observe a significant improvement in the
+visual coherence between the generated and input images compared to traditional
+methods. Future work will explore fine-tuning LLaVA prompts for increased
+control over the creative process. By providing more specific details within
+the prompts, we aim to achieve a delicate balance between faithfulness to the
+original image and artistic expression in the generated outputs.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted by 2024 5th International Conference on Information Science,
+  Parallel and Distributed Systems</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Plug-and-Play Diffusion Distillation <span class="chip">CVPR</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.01954v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.01954v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Yi-Ting Hsiao, Siavash Khodadadeh, Kevin Duarte, Wei-An Lin, Hui Qu, Mingi Kwon, Ratheesh Kalarot
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Diffusion models have shown tremendous results in image generation. However,
+due to the iterative nature of the diffusion process and its reliance on
+classifier-free guidance, inference times are slow. In this paper, we propose a
+new distillation approach for guided diffusion models in which an external
+lightweight guide model is trained while the original text-to-image model
+remains frozen. We show that our method reduces the inference computation of
+classifier-free guided latent-space diffusion models by almost half, and only
+requires 1\% trainable parameters of the base model. Furthermore, once trained,
+our guide model can be applied to various fine-tuned, domain-specific versions
+of the base diffusion model without the need for additional training: this
+"plug-and-play" functionality drastically improves inference computation while
+maintaining the visual fidelity of generated images. Empirically, we show that
+our approach is able to produce visually appealing results and achieve a
+comparable FID score to the teacher with as few as 8 to 16 steps.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)
+  2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Nutrition Estimation for Dietary Management: A <span class="highlight-title">Transformer</span> Approach with
+  Depth Sensing 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.01938v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.01938v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Zhengyi Kwan, Wei Zhang, Zhengkui Wang, Aik Beng Ng, Simon See
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Nutrition estimation is crucial for effective dietary management and overall
+health and well-being. Existing methods often struggle with sub-optimal
+accuracy and can be time-consuming. In this paper, we propose NuNet, a
+transformer-based network designed for nutrition estimation that utilizes both
+RGB and depth information from food images. We have designed and implemented a
+multi-scale encoder and decoder, along with two types of feature fusion
+modules, specialized for estimating five nutritional factors. These modules
+effectively balance the efficiency and effectiveness of feature extraction with
+flexible usage of our customized attention mechanisms and fusion strategies.
+Our experimental study shows that NuNet outperforms its variants and existing
+solutions significantly for nutrition estimation. It achieves an error rate of
+15.65%, the lowest known to us, largely due to our multi-scale architecture and
+fusion modules. This research holds practical values for dietary management
+with huge potential for transnational research and deployment and could inspire
+other applications involving multiple data types with varying degrees of
+importance.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>10 pages</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Detecting Endangered Marine Species in Autonomous Underwater Vehicle
+  Imagery Using Point Annotations and Few-Shot Learning <span class="chip">IROS 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.01932v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.01932v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Heather Doig, Oscar Pizarro, Jacquomo Monk, Stefan Williams
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  One use of Autonomous Underwater Vehicles (AUVs) is the monitoring of
+habitats associated with threatened, endangered and protected marine species,
+such as the handfish of Tasmania, Australia. Seafloor imagery collected by AUVs
+can be used to identify individuals within their broader habitat context, but
+the sheer volume of imagery collected can overwhelm efforts to locate rare or
+cryptic individuals. Machine learning models can be used to identify the
+presence of a particular species in images using a trained object detector, but
+the lack of training examples reduces detection performance, particularly for
+rare species that may only have a small number of examples in the wild. In this
+paper, inspired by recent work in few-shot learning, images and annotations of
+common marine species are exploited to enhance the ability of the detector to
+identify rare and cryptic species. Annotated images of six common marine
+species are used in two ways. Firstly, the common species are used in a
+pre-training step to allow the backbone to create rich features for marine
+species. Secondly, a copy-paste operation is used with the common species
+images to augment the training data. While annotations for more common marine
+species are available in public datasets, they are often in point format, which
+is unsuitable for training an object detector. A popular semantic segmentation
+model efficiently generates bounding box annotations for training from the
+available point annotations. Our proposed framework is applied to AUV images of
+handfish, increasing average precision by up to 48\% compared to baseline
+object detection training. This approach can be applied to other objects with
+low numbers of annotations and promises to increase the ability to actively
+monitor threatened, endangered and protected species.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>7 pages, 5 figures. Submitted to the 2024 IEEE/RSJ International
+  Conference on Intelligent Robots and Systems (IROS 2024)</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ CODE: Contrasting Self-generated Description to Combat Hallucination in
+  Large Multi-modal Models 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.01920v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.01920v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Junho Kim, Hyunjun Kim, Yeonju Kim, Yong Man Ro
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Large Multi-modal Models (LMMs) have recently demonstrated remarkable
+abilities in visual context understanding and coherent response generation.
+However, alongside these advancements, the issue of hallucinations has emerged
+as a significant challenge, producing erroneous responses that are unrelated to
+the visual contents. In this paper, we introduce a novel contrastive-based
+decoding method, COuntering DEscription Contrastive Decoding (CODE), which
+leverages self-generated descriptions as contrasting references during the
+decoding phase of LMMs to address hallucination issues. CODE utilizes the
+comprehensive descriptions from model itself as visual counterpart to correct
+and improve response alignment with actual visual content. By dynamically
+adjusting the information flow and distribution of next-token predictions in
+the LMM's vocabulary, CODE enhances the coherence and informativeness of
+generated responses. Extensive experiments demonstrate that our method
+significantly reduces hallucinations and improves cross-modal consistency
+across various benchmarks and cutting-edge LMMs. Our method provides a simple
+yet effective decoding strategy that can be integrated to existing LMM
+frameworks without additional training.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Project page: https://ivy-lvlm.github.io/CODE/</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ GOMAA-Geo: GOal Modality Agnostic Active Geo-localization 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.01917v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.01917v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Anindya Sarkar, Srikumar Sastry, Aleksis Pirinen, Chongjie Zhang, Nathan Jacobs, Yevgeniy Vorobeychik
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  We consider the task of active geo-localization (AGL) in which an agent uses
+a sequence of visual cues observed during aerial navigation to find a target
+specified through multiple possible modalities. This could emulate a UAV
+involved in a search-and-rescue operation navigating through an area, observing
+a stream of aerial images as it goes. The AGL task is associated with two
+important challenges. Firstly, an agent must deal with a goal specification in
+one of multiple modalities (e.g., through a natural language description) while
+the search cues are provided in other modalities (aerial imagery). The second
+challenge is limited localization time (e.g., limited battery life, urgency) so
+that the goal must be localized as efficiently as possible, i.e. the agent must
+effectively leverage its sequentially observed aerial views when searching for
+the goal. To address these challenges, we propose GOMAA-Geo - a goal modality
+agnostic active geo-localization agent - for zero-shot generalization between
+different goal modalities. Our approach combines cross-modality contrastive
+learning to align representations across modalities with supervised foundation
+model pretraining and reinforcement learning to obtain highly effective
+navigation and localization policies. Through extensive evaluations, we show
+that GOMAA-Geo outperforms alternative learnable approaches and that it
+generalizes across datasets - e.g., to disaster-hit areas without seeing a
+single disaster scenario during training - and goal modalities - e.g., to
+ground-level imagery or textual descriptions, despite only being trained with
+goals specified as aerial views. Code and models are publicly available at
+https://github.com/mvrl/GOMAA-Geo/tree/main.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>23 pages, 17 figures</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ FastLGS: Speeding up Language Embedded Gaussians with Feature Grid
+  Mapping 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.01916v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.01916v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Yuzhou Ji, He Zhu, Junshu Tang, Wuyi Liu, Zhizhong Zhang, Yuan Xie, Lizhuang Ma, Xin Tan
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  The semantically interactive radiance field has always been an appealing task
+for its potential to facilitate user-friendly and automated real-world 3D scene
+understanding applications. However, it is a challenging task to achieve high
+quality, efficiency and zero-shot ability at the same time with semantics in
+radiance fields. In this work, we present FastLGS, an approach that supports
+real-time open-vocabulary query within 3D Gaussian Splatting (3DGS) under high
+resolution. We propose the semantic feature grid to save multi-view CLIP
+features which are extracted based on Segment Anything Model (SAM) masks, and
+map the grids to low dimensional features for semantic field training through
+3DGS. Once trained, we can restore pixel-aligned CLIP embeddings through
+feature grids from rendered features for open-vocabulary queries. Comparisons
+with other state-of-the-art methods prove that FastLGS can achieve the first
+place performance concerning both speed and accuracy, where FastLGS is 98x
+faster than LERF and 4x faster than LangSplat. Meanwhile, experiments show that
+FastLGS is adaptive and compatible with many downstream tasks, such as 3D
+segmentation and 3D object inpainting, which can be easily applied to other 3D
+manipulation systems.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ HPE-CogVLM: New Head Pose Grounding Task Exploration on Vision Language
+  Model 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.01914v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.01914v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Yu Tian, Tianqi Shao, Tsukasa Demizu, Xuyang Wu, Hsin-Tai Wu
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Head pose estimation (HPE) task requires a sophisticated understanding of 3D
+spatial relationships and precise numerical output of yaw, pitch, and roll
+Euler angles. Previous HPE studies are mainly based on Non-large language
+models (Non-LLMs), which rely on close-up human heads cropped from the full
+image as inputs and lack robustness in real-world scenario. In this paper, we
+present a novel framework to enhance the HPE prediction task by leveraging the
+visual grounding capability of CogVLM. CogVLM is a vision language model (VLM)
+with grounding capability of predicting object bounding boxes (BBoxes), which
+enables HPE training and prediction using full image information input. To
+integrate the HPE task into the VLM, we first cop with the catastrophic
+forgetting problem in large language models (LLMs) by investigating the
+rehearsal ratio in the data rehearsal method. Then, we propose and validate a
+LoRA layer-based model merging method, which keeps the integrity of parameters,
+to enhance the HPE performance in the framework. The results show our
+HPE-CogVLM achieves a 31.5\% reduction in Mean Absolute Error for HPE
+prediction over the current Non-LLM based state-of-the-art in cross-dataset
+evaluation. Furthermore, we compare our LoRA layer-based model merging method
+with LoRA fine-tuning only and other merging methods in CogVLM. The results
+demonstrate our framework outperforms them in all HPE metrics.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ ProGEO: Generating <span class="highlight-title">Prompt</span>s through Image-Text Contrastive Learning for
+  Visual Geo-localization 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.01906v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.01906v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Chen Mao, Jingqi Hu
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Visual Geo-localization (VG) refers to the process to identify the location
+described in query images, which is widely applied in robotics field and
+computer vision tasks, such as autonomous driving, metaverse, augmented
+reality, and SLAM. In fine-grained images lacking specific text descriptions,
+directly applying pure visual methods to represent neighborhood features often
+leads to the model focusing on overly fine-grained features, unable to fully
+mine the semantic information in the images. Therefore, we propose a two-stage
+training method to enhance visual performance and use contrastive learning to
+mine challenging samples. We first leverage the multi-modal description
+capability of CLIP (Contrastive Language-Image Pretraining) to create a set of
+learnable text prompts for each geographic image feature to form vague
+descriptions. Then, by utilizing dynamic text prompts to assist the training of
+the image encoder, we enable the image encoder to learn better and more
+generalizable visual features. This strategy of applying text to purely visual
+tasks addresses the challenge of using multi-modal models for geographic
+images, which often suffer from a lack of precise descriptions, making them
+difficult to utilize widely. We validate the effectiveness of the proposed
+strategy on several large-scale visual geo-localization datasets, and our
+method achieves competitive results on multiple visual geo-localization
+datasets. Our code and model are available at
+https://github.com/Chain-Mao/ProGEO.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Follow-Your-Emoji: Fine-Controllable and Expressive Freestyle Portrait
+  Animation 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.01900v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.01900v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Yue Ma, Hongyu Liu, Hongfa Wang, Heng Pan, Yingqing He, Junkun Yuan, Ailing Zeng, Chengfei Cai, Heung-Yeung Shum, Wei Liu, Qifeng Chen
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  We present Follow-Your-Emoji, a diffusion-based framework for portrait
+animation, which animates a reference portrait with target landmark sequences.
+The main challenge of portrait animation is to preserve the identity of the
+reference portrait and transfer the target expression to this portrait while
+maintaining temporal consistency and fidelity. To address these challenges,
+Follow-Your-Emoji equipped the powerful Stable Diffusion model with two
+well-designed technologies. Specifically, we first adopt a new explicit motion
+signal, namely expression-aware landmark, to guide the animation process. We
+discover this landmark can not only ensure the accurate motion alignment
+between the reference portrait and target motion during inference but also
+increase the ability to portray exaggerated expressions (i.e., large pupil
+movements) and avoid identity leakage. Then, we propose a facial fine-grained
+loss to improve the model's ability of subtle expression perception and
+reference portrait appearance reconstruction by using both expression and
+facial masks. Accordingly, our method demonstrates significant performance in
+controlling the expression of freestyle portraits, including real humans,
+cartoons, sculptures, and even animals. By leveraging a simple and effective
+progressive generation strategy, we extend our model to stable long-term
+animation, thus increasing its potential application value. To address the lack
+of a benchmark for this field, we introduce EmojiBench, a comprehensive
+benchmark comprising diverse portrait images, driving videos, and landmarks. We
+show extensive evaluations on EmojiBench to verify the superiority of
+Follow-Your-Emoji.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Project Page: https://follow-your-emoji.github.io/</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ SVASTIN: Sparse Video Adversarial Attack via Spatio-Temporal Invertible
+  Neural Networks 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.01894v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.01894v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Yi Pan, Jun-Jie Huang, Zihan Chen, Wentao Zhao, Ziyue Wang
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Robust and imperceptible adversarial video attack is challenging due to the
+spatial and temporal characteristics of videos. The existing video adversarial
+attack methods mainly take a gradient-based approach and generate adversarial
+videos with noticeable perturbations. In this paper, we propose a novel Sparse
+Adversarial Video Attack via Spatio-Temporal Invertible Neural Networks
+(SVASTIN) to generate adversarial videos through spatio-temporal feature space
+information exchanging. It consists of a Guided Target Video Learning (GTVL)
+module to balance the perturbation budget and optimization speed and a
+Spatio-Temporal Invertible Neural Network (STIN) module to perform
+spatio-temporal feature space information exchanging between a source video and
+the target feature tensor learned by GTVL module. Extensive experiments on
+UCF-101 and Kinetics-400 demonstrate that our proposed SVASTIN can generate
+adversarial examples with higher imperceptibility than the state-of-the-art
+methods with the higher fooling rate. Code is available at
+\href{https://github.com/Brittany-Chen/SVASTIN}{https://github.com/Brittany-Chen/SVASTIN}.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Rank-based No-reference Quality Assessment for Face Swapping 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.01884v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.01884v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Xinghui Zhou, Wenbo Zhou, Tianyi Wei, Shen Chen, Taiping Yao, Shouhong Ding, Weiming Zhang, Nenghai Yu
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Face swapping has become a prominent research area in computer vision and
+image processing due to rapid technological advancements. The metric of
+measuring the quality in most face swapping methods relies on several distances
+between the manipulated images and the source image, or the target image, i.e.,
+there are suitable known reference face images. Therefore, there is still a gap
+in accurately assessing the quality of face interchange in reference-free
+scenarios. In this study, we present a novel no-reference image quality
+assessment (NR-IQA) method specifically designed for face swapping, addressing
+this issue by constructing a comprehensive large-scale dataset, implementing a
+method for ranking image quality based on multiple facial attributes, and
+incorporating a Siamese network based on interpretable qualitative comparisons.
+Our model demonstrates the state-of-the-art performance in the quality
+assessment of swapped faces, providing coarse- and fine-grained. Enhanced by
+this metric, an improved face-swapping model achieved a more advanced level
+with respect to expressions and poses. Extensive experiments confirm the
+superiority of our method over existing general no-reference image quality
+assessment metrics and the latest metric of facial image quality assessment,
+making it well suited for evaluating face swapping images in real-world
+scenarios.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>8 pages, 5 figures</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Fruit Classification System with Deep Learning and Neural Architecture
+  Search 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.01869v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.01869v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Christine Dewi, Dhananjay Thiruvady, Nayyar Zaidi
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  The fruit identification process involves analyzing and categorizing
+different types of fruits based on their visual characteristics. This activity
+can be achieved using a range of methodologies, encompassing manual
+examination, conventional computer vision methodologies, and more sophisticated
+methodologies employing machine learning and deep learning. Our study
+identified a total of 15 distinct categories of fruit, consisting of class
+Avocado, Banana, Cherry, Apple Braeburn, Apple golden 1, Apricot, Grape, Kiwi,
+Mango, Orange, Papaya, Peach, Pineapple, Pomegranate and Strawberry. Neural
+Architecture Search (NAS) is a technological advancement employed within the
+realm of deep learning and artificial intelligence, to automate conceptualizing
+and refining neural network topologies. NAS aims to identify neural network
+structures that are highly suitable for tasks, such as the detection of fruits.
+Our suggested model with 99.98% mAP increased the detection performance of the
+preceding research study that used Fruit datasets. In addition, after the
+completion of the study, a comparative analysis was carried out to assess the
+findings in conjunction with those of another research that is connected to the
+topic. When compared to the findings of earlier studies, the detector that was
+proposed exhibited higher performance in terms of both its accuracy and its
+precision.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ MoLA: Motion Generation and Editing with Latent Diffusion Enhanced by
+  Adversarial Training 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.01867v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.01867v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Kengo Uchida, Takashi Shibuya, Yuhta Takida, Naoki Murata, Shusuke Takahashi, Yuki Mitsufuji
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  In motion generation, controllability as well as generation quality and speed
+is becoming more and more important. There are various motion editing tasks,
+such as in-betweening, upper body editing, and path-following, but existing
+methods perform motion editing with a data-space diffusion model, which is slow
+in inference compared to a latent diffusion model. In this paper, we propose
+MoLA, which provides fast and high-quality motion generation and also can deal
+with multiple editing tasks in a single framework. For high-quality and fast
+generation, we employ a variational autoencoder and latent diffusion model, and
+improve the performance with adversarial training. In addition, we apply a
+training-free guided generation framework to achieve various editing tasks with
+motion control inputs. We quantitatively show the effectiveness of adversarial
+learning in text-to-motion generation, and demonstrate the applicability of our
+editing framework to multiple editing tasks in the motion domain.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>12 pages, 6 figures</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ ORACLE: Leveraging Mutual Information for Consistent Character
+  Generation with LoRAs in Diffusion Models 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.02820v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.02820v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Kiymet Akdemir, Pinar Yanardag
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Text-to-image diffusion models have recently taken center stage as pivotal
+tools in promoting visual creativity across an array of domains such as comic
+book artistry, children's literature, game development, and web design. These
+models harness the power of artificial intelligence to convert textual
+descriptions into vivid images, thereby enabling artists and creators to bring
+their imaginative concepts to life with unprecedented ease. However, one of the
+significant hurdles that persist is the challenge of maintaining consistency in
+character generation across diverse contexts. Variations in textual prompts,
+even if minor, can yield vastly different visual outputs, posing a considerable
+problem in projects that require a uniform representation of characters
+throughout. In this paper, we introduce a novel framework designed to produce
+consistent character representations from a single text prompt across diverse
+settings. Through both quantitative and qualitative analyses, we demonstrate
+that our framework outperforms existing methods in generating characters with
+consistent visual identities, underscoring its potential to transform creative
+industries. By addressing the critical challenge of character consistency, we
+not only enhance the practical utility of these models but also broaden the
+horizons for artistic and creative expression.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ LADI v2: Multi-label <span class="highlight-title">Dataset</span> and Classifiers for Low-Altitude Disaster
+  Imagery 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.02780v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.02780v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Samuel Scheele, Katherine Picchione, Jeffrey Liu
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  ML-based computer vision models are promising tools for supporting emergency
+management operations following natural disasters. Arial photographs taken from
+small manned and unmanned aircraft can be available soon after a disaster and
+provide valuable information from multiple perspectives for situational
+awareness and damage assessment applications. However, emergency managers often
+face challenges finding the most relevant photos among the tens of thousands
+that may be taken after an incident. While ML-based solutions could enable more
+effective use of aerial photographs, there is still a lack of training data for
+imagery of this type from multiple perspectives and for multiple hazard types.
+To address this, we present the LADI v2 (Low Altitude Disaster Imagery version
+2) dataset, a curated set of about 10,000 disaster images captured in the
+United States by the Civil Air Patrol (CAP) in response to federally-declared
+emergencies (2015-2023) and annotated for multi-label classification by trained
+CAP volunteers. We also provide two pretrained baseline classifiers and compare
+their performance to state-of-the-art vision-language models in multi-label
+classification. The data and code are released publicly to support the
+development of computer vision models for emergency management research and
+applications.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ MeshVPR: Citywide Visual Place Recognition Using 3D Meshes 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.02776v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.02776v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Gabriele Berton, Lorenz Junglas, Riccardo Zaccone, Thomas Pollok, Barbara Caputo, Carlo Masone
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Mesh-based scene representation offers a promising direction for simplifying
+large-scale hierarchical visual localization pipelines, combining a visual
+place recognition step based on global features (retrieval) and a visual
+localization step based on local features. While existing work demonstrates the
+viability of meshes for visual localization, the impact of using synthetic
+databases rendered from them in visual place recognition remains largely
+unexplored. In this work we investigate using dense 3D textured meshes for
+large-scale Visual Place Recognition (VPR) and identify a significant
+performance drop when using synthetic mesh-based databases compared to
+real-world images for retrieval. To address this, we propose MeshVPR, a novel
+VPR pipeline that utilizes a lightweight features alignment framework to bridge
+the gap between real-world and synthetic domains. MeshVPR leverages pre-trained
+VPR models and it is efficient and scalable for city-wide deployments. We
+introduce novel datasets with freely available 3D meshes and manually collected
+queries from Berlin, Paris, and Melbourne. Extensive evaluations demonstrate
+that MeshVPR achieves competitive performance with standard VPR pipelines,
+paving the way for mesh-based localization systems. Our contributions include
+the new task of citywide mesh-based VPR, the new benchmark datasets, MeshVPR,
+and a thorough analysis of open challenges. Data, code, and interactive
+visualizations are available at https://mesh-vpr.github.io
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Website: https://mesh-vpr.github.io/</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Diffusion-Refined VQA Annotations for Semi-Supervised Gaze Following 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.02774v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.02774v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Qiaomu Miao, Alexandros Graikos, Jingwei Zhang, Sounak Mondal, Minh Hoai, Dimitris Samaras
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Training gaze following models requires a large number of images with gaze
+target coordinates annotated by human annotators, which is a laborious and
+inherently ambiguous process. We propose the first semi-supervised method for
+gaze following by introducing two novel priors to the task. We obtain the first
+prior using a large pretrained Visual Question Answering (VQA) model, where we
+compute Grad-CAM heatmaps by `prompting' the VQA model with a gaze following
+question. These heatmaps can be noisy and not suited for use in training. The
+need to refine these noisy annotations leads us to incorporate a second prior.
+We utilize a diffusion model trained on limited human annotations and modify
+the reverse sampling process to refine the Grad-CAM heatmaps. By tuning the
+diffusion process we achieve a trade-off between the human annotation prior and
+the VQA heatmap prior, which retains the useful VQA prior information while
+exhibiting similar properties to the training data distribution. Our method
+outperforms simple pseudo-annotation generation baselines on the GazeFollow
+image dataset. More importantly, our pseudo-annotation strategy, applied to a
+widely used supervised gaze following model (VAT), reduces the annotation need
+by 50%. Our method also performs the best on the VideoAttentionTarget dataset.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Cyclic Sparse Training: Is it Enough? 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.02773v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.02773v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Advait Gadhikar, Sree Harsha Nelaturu, Rebekka Burkholz
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  The success of iterative pruning methods in achieving state-of-the-art sparse
+networks has largely been attributed to improved mask identification and an
+implicit regularization induced by pruning. We challenge this hypothesis and
+instead posit that their repeated cyclic training schedules enable improved
+optimization. To verify this, we show that pruning at initialization is
+significantly boosted by repeated cyclic training, even outperforming standard
+iterative pruning methods. The dominant mechanism how this is achieved, as we
+conjecture, can be attributed to a better exploration of the loss landscape
+leading to a lower training loss. However, at high sparsity, repeated cyclic
+training alone is not enough for competitive performance. A strong coupling
+between learnt parameter initialization and mask seems to be required. Standard
+methods obtain this coupling via expensive pruning-training iterations,
+starting from a dense network. To achieve this with sparse training instead, we
+propose SCULPT-ing, i.e., repeated cyclic training of any sparse mask followed
+by a single pruning step to couple the parameters and the mask, which is able
+to match the performance of state-of-the-art iterative pruning methods in the
+high sparsity regime at reduced computational cost.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Multi-layer Learnable Attention Mask for Multimodal Tasks 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.02761v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.02761v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Wayner Barrios, SouYoung Jin
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  While the Self-Attention mechanism in the Transformer model has proven to be
+effective in many domains, we observe that it is less effective in more diverse
+settings (e.g. multimodality) due to the varying granularity of each token and
+the high computational demands of lengthy sequences. To address the challenges,
+we introduce the Learnable Attention Mask (LAM), strategically designed to
+globally regulate attention maps and prioritize critical tokens within the
+sequence. Leveraging the Self-Attention module in a BERT-like transformer
+network, our approach adeptly captures associations between tokens. The
+extension of the LAM to a multi-layer version accommodates the varied
+information aspects embedded at each layer of the Transformer network.
+Comprehensive experimental validation on various datasets, such as MADv2,
+QVHighlights, ImageNet 1K, and MSRVTT, demonstrates the efficacy of the LAM,
+exemplifying its ability to enhance model performance while mitigating
+redundant computations. This pioneering approach presents a significant
+advancement in enhancing the understanding of complex scenarios, such as in
+movie understanding.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Dsfer-Net: A Deep Supervision and Feature Retrieval Network for
+  Bitemporal Change Detection Using Modern Hopfield Networks 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2304.01101v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2304.01101v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Shizhen Chang, Michael Kopp, Pedram Ghamisi, Bo Du
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Change detection, an essential application for high-resolution remote sensing
+images, aims to monitor and analyze changes in the land surface over time. Due
+to the rapid increase in the quantity of high-resolution remote sensing data
+and the complexity of texture features, several quantitative deep
+learning-based methods have been proposed. These methods outperform traditional
+change detection methods by extracting deep features and combining
+spatial-temporal information. However, reasonable explanations for how deep
+features improve detection performance are still lacking. In our
+investigations, we found that modern Hopfield network layers significantly
+enhance semantic understanding. In this paper, we propose a Deep Supervision
+and FEature Retrieval network (Dsfer-Net) for bitemporal change detection.
+Specifically, the highly representative deep features of bitemporal images are
+jointly extracted through a fully convolutional Siamese network. Based on the
+sequential geographical information of the bitemporal images, we designed a
+feature retrieval module to extract difference features and leverage
+discriminative information in a deeply supervised manner. Additionally, we
+observed that the deeply supervised feature retrieval module provides
+explainable evidence of the semantic understanding of the proposed network in
+its deep layers. Finally, our end-to-end network establishes a novel framework
+by aggregating retrieved features and feature pairs from different layers.
+Experiments conducted on three public datasets (LEVIR-CD, WHU-CD, and CDD)
+confirm the superiority of the proposed Dsfer-Net over other state-of-the-art
+methods.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ VideoPoet: A Large Language Model for Zero-Shot Video Generation <span class="chip">ICML 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2312.14125v4">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2312.14125v4.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Dan Kondratyuk, Lijun Yu, Xiuye Gu, José Lezama, Jonathan Huang, Grant Schindler, Rachel Hornung, Vighnesh Birodkar, Jimmy Yan, Ming-Chang Chiu, Krishna Somandepalli, Hassan Akbari, Yair Alon, Yong Cheng, Josh Dillon, Agrim Gupta, Meera Hahn, Anja Hauth, David Hendon, Alonso Martinez, David Minnen, Mikhail Sirotenko, Kihyuk Sohn, Xuan Yang, Hartwig Adam, Ming-Hsuan Yang, Irfan Essa, Huisheng Wang, David A. Ross, Bryan Seybold, Lu Jiang
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  We present VideoPoet, a language model capable of synthesizing high-quality
+video, with matching audio, from a large variety of conditioning signals.
+VideoPoet employs a decoder-only transformer architecture that processes
+multimodal inputs -- including images, videos, text, and audio. The training
+protocol follows that of Large Language Models (LLMs), consisting of two
+stages: pretraining and task-specific adaptation. During pretraining, VideoPoet
+incorporates a mixture of multimodal generative objectives within an
+autoregressive Transformer framework. The pretrained LLM serves as a foundation
+that can be adapted for a range of video generation tasks. We present empirical
+results demonstrating the model's state-of-the-art capabilities in zero-shot
+video generation, specifically highlighting VideoPoet's ability to generate
+high-fidelity motions. Project page: http://sites.research.google/videopoet/
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>To appear at ICML 2024; Project page:
+  http://sites.research.google/videopoet/</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ COMQ: A Backpropagation-Free Algorithm for Post-Training Quantization 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2403.07134v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2403.07134v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Aozhong Zhang, Zi Yang, Naigang Wang, Yingyong Qin, Jack Xin, Xin Li, Penghang Yin
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Post-training quantization (PTQ) has emerged as a practical approach to
+compress large neural networks, making them highly efficient for deployment.
+However, effectively reducing these models to their low-bit counterparts
+without compromising the original accuracy remains a key challenge. In this
+paper, we propose an innovative PTQ algorithm termed COMQ, which sequentially
+conducts coordinate-wise minimization of the layer-wise reconstruction errors.
+We consider the widely used integer quantization, where every quantized weight
+can be decomposed into a shared floating-point scalar and an integer bit-code.
+Within a fixed layer, COMQ treats all the scaling factor(s) and bit-codes as
+the variables of the reconstruction error. Every iteration improves this error
+along a single coordinate while keeping all other variables constant. COMQ is
+easy to use and requires no hyper-parameter tuning. It instead involves only
+dot products and rounding operations. We update these variables in a carefully
+designed greedy order, significantly enhancing the accuracy. COMQ achieves
+remarkable results in quantizing 4-bit Vision Transformers, with a negligible
+loss of less than 1% in Top-1 accuracy. In 4-bit INT quantization of
+convolutional neural networks, COMQ maintains near-lossless accuracy with a
+minimal drop of merely 0.3% in Top-1 accuracy.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Lay-A-Scene: Personalized 3D Object Arrangement Using Text-to-Image
+  Priors 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.00687v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.00687v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Ohad Rahamim, Hilit Segev, Idan Achituve, Yuval Atzmon, Yoni Kasten, Gal Chechik
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Generating 3D visual scenes is at the forefront of visual generative AI, but
+current 3D generation techniques struggle with generating scenes with multiple
+high-resolution objects. Here we introduce Lay-A-Scene, which solves the task
+of Open-set 3D Object Arrangement, effectively arranging unseen objects. Given
+a set of 3D objects, the task is to find a plausible arrangement of these
+objects in a scene. We address this task by leveraging pre-trained
+text-to-image models. We personalize the model and explain how to generate
+images of a scene that contains multiple predefined objects without neglecting
+any of them. Then, we describe how to infer the 3D poses and arrangement of
+objects from a 2D generated image by finding a consistent projection of objects
+onto the 2D scene. We evaluate the quality of Lay-A-Scene using 3D objects from
+Objaverse and human raters and find that it often generates coherent and
+feasible 3D object arrangements.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ AI-Face: A Million-Scale Demographically Annotated AI-Generated Face
+  <span class="highlight-title">Dataset</span> and Fairness Benchmark 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.00783v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.00783v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Li Lin,  Santosh, Xin Wang, Shu Hu
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  AI-generated faces have enriched human life, such as entertainment,
+education, and art. However, they also pose misuse risks. Therefore, detecting
+AI-generated faces becomes crucial, yet current detectors show biased
+performance across different demographic groups. Mitigating biases can be done
+by designing algorithmic fairness methods, which usually require
+demographically annotated face datasets for model training. However, no
+existing dataset comprehensively encompasses both demographic attributes and
+diverse generative methods, which hinders the development of fair detectors for
+AI-generated faces. In this work, we introduce the AI-Face dataset, the first
+million-scale demographically annotated AI-generated face image dataset,
+including real faces, faces from deepfake videos, and faces generated by
+Generative Adversarial Networks and Diffusion Models. Based on this dataset, we
+conduct the first comprehensive fairness benchmark to assess various AI face
+detectors and provide valuable insights and findings to promote the future fair
+design of AI face detectors. Our AI-Face dataset and benchmark code are
+publicly available at https://github.com/Purdue-M2/AI-Face-FairnessBench.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Motion-aware Latent Diffusion Models for Video Frame Interpolation 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2404.13534v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2404.13534v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Zhilin Huang, Yijie Yu, Ling Yang, Chujun Qin, Bing Zheng, Xiawu Zheng, Zikun Zhou, Yaowei Wang, Wenming Yang
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  With the advancement of AIGC, video frame interpolation (VFI) has become a
+crucial component in existing video generation frameworks, attracting
+widespread research interest. For the VFI task, the motion estimation between
+neighboring frames plays a crucial role in avoiding motion ambiguity. However,
+existing VFI methods always struggle to accurately predict the motion
+information between consecutive frames, and this imprecise estimation leads to
+blurred and visually incoherent interpolated frames. In this paper, we propose
+a novel diffusion framework, motion-aware latent diffusion models (MADiff),
+which is specifically designed for the VFI task. By incorporating motion priors
+between the conditional neighboring frames with the target interpolated frame
+predicted throughout the diffusion sampling procedure, MADiff progressively
+refines the intermediate outcomes, culminating in generating both visually
+smooth and realistic results. Extensive experiments conducted on benchmark
+datasets demonstrate that our method achieves state-of-the-art performance
+significantly outperforming existing approaches, especially under challenging
+scenarios involving dynamic textures with complex motion.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>17 pages, 4 figures. arXiv admin note: substantial text overlap with
+  arXiv:2303.09508 by other authors</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ SuperGaussian: Repurposing Video Models for 3D Super Resolution 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.00609v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.00609v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Yuan Shen, Duygu Ceylan, Paul Guerrero, Zexiang Xu, Niloy J. Mitra, Shenlong Wang, Anna Frühstück
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  We present a simple, modular, and generic method that upsamples coarse 3D
+models by adding geometric and appearance details. While generative 3D models
+now exist, they do not yet match the quality of their counterparts in image and
+video domains. We demonstrate that it is possible to directly repurpose
+existing (pretrained) video models for 3D super-resolution and thus sidestep
+the problem of the shortage of large repositories of high-quality 3D training
+models. We describe how to repurpose video upsampling models, which are not 3D
+consistent, and combine them with 3D consolidation to produce 3D-consistent
+results. As output, we produce high quality Gaussian Splat models, which are
+object centric and effective. Our method is category agnostic and can be easily
+incorporated into existing 3D workflows. We evaluate our proposed SuperGaussian
+on a variety of 3D inputs, which are diverse both in terms of complexity and
+representation (e.g., Gaussian Splats or NeRFs), and demonstrate that our
+simple method significantly improves the fidelity of the final 3D models. Check
+our project website for details: supergaussian.github.io
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Check our project website for details:
+  https://supergaussian.github.io</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ A-SDM: Accelerating Stable Diffusion through Model Assembly and Feature
+  Inheritance Strategies 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.00210v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.00210v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Jinchao Zhu, Yuxuan Wang, Siyuan Pan, Pengfei Wan, Di Zhang, Gao Huang
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  The Stable Diffusion Model (SDM) is a prevalent and effective model for
+text-to-image (T2I) and image-to-image (I2I) generation. Despite various
+attempts at sampler optimization, model distillation, and network
+quantification, these approaches typically maintain the original network
+architecture. The extensive parameter scale and substantial computational
+demands have limited research into adjusting the model architecture. This study
+focuses on reducing redundant computation in SDM and optimizes the model
+through both tuning and tuning-free methods. 1) For the tuning method, we
+design a model assembly strategy to reconstruct a lightweight model while
+preserving performance through distillation. Second, to mitigate performance
+loss due to pruning, we incorporate multi-expert conditional convolution
+(ME-CondConv) into compressed UNets to enhance network performance by
+increasing capacity without sacrificing speed. Third, we validate the
+effectiveness of the multi-UNet switching method for improving network speed.
+2) For the tuning-free method, we propose a feature inheritance strategy to
+accelerate inference by skipping local computations at the block, layer, or
+unit level within the network structure. We also examine multiple sampling
+modes for feature inheritance at the time-step level. Experiments demonstrate
+that both the proposed tuning and the tuning-free methods can improve the speed
+and performance of the SDM. The lightweight model reconstructed by the model
+assembly strategy increases generation speed by $22.4%$, while the feature
+inheritance strategy enhances the SDM generation speed by $40.0%$.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>19 pages, 16 figures, submitted to IEEE Transactions on Neural
+  Networks and Learning Systems</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Active Label Correction for Semantic Segmentation with Foundation Models 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2403.10820v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2403.10820v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Hoyoung Kim, Sehyun Hwang, Suha Kwak, Jungseul Ok
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Training and validating models for semantic segmentation require datasets
+with pixel-wise annotations, which are notoriously labor-intensive. Although
+useful priors such as foundation models or crowdsourced datasets are available,
+they are error-prone. We hence propose an effective framework of active label
+correction (ALC) based on a design of correction query to rectify pseudo labels
+of pixels, which in turn is more annotator-friendly than the standard one
+inquiring to classify a pixel directly according to our theoretical analysis
+and user study. Specifically, leveraging foundation models providing useful
+zero-shot predictions on pseudo labels and superpixels, our method comprises
+two key techniques: (i) an annotator-friendly design of correction query with
+the pseudo labels, and (ii) an acquisition function looking ahead label
+expansions based on the superpixels. Experimental results on PASCAL,
+Cityscapes, and Kvasir-SEG datasets demonstrate the effectiveness of our ALC
+framework, outperforming prior methods for active semantic segmentation and
+label correction. Notably, utilizing our method, we obtained a revised dataset
+of PASCAL by rectifying errors in 2.6 million pixels in PASCAL dataset.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Fine-Grained Image-Text Alignment in Medical Imaging Enables Explainable
+  Cyclic Image-Report Generation <span class="chip">ACL 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2312.08078v5">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2312.08078v5.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Wenting Chen, Linlin Shen, Jingyang Lin, Jiebo Luo, Xiang Li, Yixuan Yuan
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  To address these issues, we propose a novel Adaptive patch-word Matching
+(AdaMatch) model to correlate chest X-ray (CXR) image regions with words in
+medical reports and apply it to CXR-report generation to provide explainability
+for the generation process. AdaMatch exploits the fine-grained relation between
+adaptive patches and words to provide explanations of specific image regions
+with corresponding words. To capture the abnormal regions of varying sizes and
+positions, we introduce the Adaptive Patch extraction (AdaPatch) module to
+acquire the adaptive patches for these regions adaptively. In order to provide
+explicit explainability for CXR-report generation task, we propose an
+AdaMatch-based bidirectional large language model for Cyclic CXR-report
+generation (AdaMatch-Cyclic). It employs the AdaMatch to obtain the keywords
+for CXR images and `keypatches' for medical reports as hints to guide
+CXR-report generation. Extensive experiments on two publicly available CXR
+datasets prove the effectiveness of our method and its superior performance to
+existing methods.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted by ACL 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ 3D Gaussian Splatting with Deferred Reflection 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2404.18454v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2404.18454v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Keyang Ye, Qiming Hou, Kun Zhou
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  The advent of neural and Gaussian-based radiance field methods have achieved
+great success in the field of novel view synthesis. However, specular
+reflection remains non-trivial, as the high frequency radiance field is
+notoriously difficult to fit stably and accurately. We present a deferred
+shading method to effectively render specular reflection with Gaussian
+splatting. The key challenge comes from the environment map reflection model,
+which requires accurate surface normal while simultaneously bottlenecks normal
+estimation with discontinuous gradients. We leverage the per-pixel reflection
+gradients generated by deferred shading to bridge the optimization process of
+neighboring Gaussians, allowing nearly correct normal estimations to gradually
+propagate and eventually spread over all reflective objects. Our method
+significantly outperforms state-of-the-art techniques and concurrent work in
+synthesizing high-quality specular reflection effects, demonstrating a
+consistent improvement of peak signal-to-noise ratio (PSNR) for both synthetic
+and real-world scenes, while running at a frame rate almost identical to
+vanilla Gaussian splatting.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Looks Too Good To Be True: An Information-Theoretic Analysis of
+  Hallucinations in Generative Restoration Models 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.16475v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.16475v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Regev Cohen, Idan Kligvasser, Ehud Rivlin, Daniel Freedman
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  The pursuit of high perceptual quality in image restoration has driven the
+development of revolutionary generative models, capable of producing results
+often visually indistinguishable from real data. However, as their perceptual
+quality continues to improve, these models also exhibit a growing tendency to
+generate hallucinations - realistic-looking details that do not exist in the
+ground truth images. The presence of hallucinations introduces uncertainty
+regarding the reliability of the models' predictions, raising major concerns
+about their practical application. In this paper, we employ information-theory
+tools to investigate this phenomenon, revealing a fundamental tradeoff between
+uncertainty and perception. We rigorously analyze the relationship between
+these two factors, proving that the global minimal uncertainty in generative
+models grows in tandem with perception. In particular, we define the inherent
+uncertainty of the restoration problem and show that attaining perfect
+perceptual quality entails at least twice this uncertainty. Additionally, we
+establish a relation between mean squared-error distortion, uncertainty and
+perception, through which we prove the aforementioned uncertainly-perception
+tradeoff induces the well-known perception-distortion tradeoff. This work
+uncovers fundamental limitations of generative models in achieving both high
+perceptual quality and reliable predictions for image restoration. We
+demonstrate our theoretical findings through an analysis of single image
+super-resolution algorithms. Our work aims to raise awareness among
+practitioners about this inherent tradeoff, empowering them to make informed
+decisions and potentially prioritize safety over perceptual performance.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ ASCNet: Asymmetric Sampling Correction Network for Infrared Image
+  Destriping 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2401.15578v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2401.15578v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Shuai Yuan, Hanlin Qin, Xiang Yan, Shiqi Yang, Shuowen Yang, Naveed Akhtar
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  In a real-world infrared imaging system, effectively learning a consistent
+stripe noise removal model is essential. Most existing destriping methods
+cannot precisely reconstruct images due to cross-level semantic gaps and
+insufficient characterization of the global column features. To tackle this
+problem, we propose a novel infrared image destriping method, called Asymmetric
+Sampling Correction Network (ASCNet), that can effectively capture global
+column relationships and embed them into a U-shaped framework, providing
+comprehensive discriminative representation and seamless semantic connectivity.
+Our ASCNet consists of three core elements: Residual Haar Discrete Wavelet
+Transform (RHDWT), Pixel Shuffle (PS), and Column Non-uniformity Correction
+Module (CNCM). Specifically, RHDWT is a novel downsampler that employs
+double-branch modeling to effectively integrate stripe-directional prior
+knowledge and data-driven semantic interaction to enrich the feature
+representation. Observing the semantic patterns crosstalk of stripe noise, PS
+is introduced as an upsampler to prevent excessive apriori decoding and
+performing semantic-bias-free image reconstruction. After each sampling, CNCM
+captures the column relationships in long-range dependencies. By incorporating
+column, spatial, and self-dependence information, CNCM well establishes a
+global context to distinguish stripes from the scene's vertical structures.
+Extensive experiments on synthetic data, real data, and infrared small target
+detection tasks demonstrate that the proposed method outperforms
+state-of-the-art single-image destriping methods both visually and
+quantitatively. Our code will be made publicly available at
+https://github.com/xdFai/ASCNet.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Fingerprint Matching with Localized Deep Representation 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2311.18576v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2311.18576v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Yongjie Duan, Zhiyu Pan, Jianjiang Feng, Jie Zhou
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Compared to minutia-based fingerprint representations, fixed-length
+representations are attractive due to simple and efficient matching. However,
+fixed-length fingerprint representations are limited in accuracy when matching
+fingerprints with different visible areas, which can occur due to different
+finger poses or acquisition methods. To address this issue, we propose a
+localized deep representation of fingerprint, named LDRF. By focusing on the
+discriminative characteristics within local regions, LDRF provides a more
+robust and accurate fixed-length representation for fingerprints with variable
+visible areas. LDRF can be adapted to retain information within any valid area,
+making it highly flexible. The matching scores produced by LDRF also exhibit
+intuitive statistical characteristics, which led us to propose a matching score
+normalization technique to mitigate the uncertainty in the cases of very small
+overlapping area. With this new technique, we can maintain a high level of
+accuracy and reliability in our fingerprint matching, even as the size of the
+database grows rapidly. Our experimental results on 21 datasets containing over
+140K fingerprints of various finger poses and impression types show that LDRF
+outperforms other fixed-length representations and is robust to sensing
+technologies and impression types. Besides, the proposed matching score
+normalization effectively reduces the false match rate (FMR) in large-scale
+identification experiments comprising over 5.11 million fingerprints.
+Specifically, this technique results in a reduction of two orders of magnitude
+compared to matching without matching score normalization and five orders of
+magnitude compared to prior works.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>The paper requires major revision</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Mask-based Invisible Backdoor Attacks on Object Detection 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.09550v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.09550v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Jeongjin Shin
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Deep learning models have achieved unprecedented performance in the domain of
+object detection, resulting in breakthroughs in areas such as autonomous
+driving and security. However, deep learning models are vulnerable to backdoor
+attacks. These attacks prompt models to behave similarly to standard models
+without a trigger; however, they act maliciously upon detecting a predefined
+trigger. Despite extensive research on backdoor attacks in image
+classification, their application to object detection remains relatively
+underexplored. Given the widespread application of object detection in critical
+real-world scenarios, the sensitivity and potential impact of these
+vulnerabilities cannot be overstated. In this study, we propose an effective
+invisible backdoor attack on object detection utilizing a mask-based approach.
+Three distinct attack scenarios were explored for object detection: object
+disappearance, object misclassification, and object generation attack. Through
+extensive experiments, we comprehensively examined the effectiveness of these
+attacks and tested certain defense methods to determine effective
+countermeasures. Code will be available at
+https://github.com/jeongjin0/invisible-backdoor-object-detection
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>7 pages, 3 figures</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ ICC: Quantifying Image Caption Concreteness for Multimodal <span class="highlight-title">Dataset</span>
+  Curation <span class="chip">ACL 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2403.01306v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2403.01306v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Moran Yanuka, Morris Alper, Hadar Averbuch-Elor, Raja Giryes
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Web-scale training on paired text-image data is becoming increasingly central
+to multimodal learning, but is challenged by the highly noisy nature of
+datasets in the wild. Standard data filtering approaches succeed in removing
+mismatched text-image pairs, but permit semantically related but highly
+abstract or subjective text. These approaches lack the fine-grained ability to
+isolate the most concrete samples that provide the strongest signal for
+learning in a noisy dataset. In this work, we propose a new metric, image
+caption concreteness, that evaluates caption text without an image reference to
+measure its concreteness and relevancy for use in multimodal learning. Our
+approach leverages strong foundation models for measuring visual-semantic
+information loss in multimodal representations. We demonstrate that this
+strongly correlates with human evaluation of concreteness in both single-word
+and sentence-level texts. Moreover, we show that curation using ICC complements
+existing approaches: It succeeds in selecting the highest quality samples from
+multimodal web-scale datasets to allow for efficient training in
+resource-constrained settings.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted to ACL 2024 (Finding). For Project webpage, see
+  https://moranyanuka.github.io/icc/</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ FitDiff: Robust monocular 3D facial shape and reflectance estimation
+  using Diffusion Models 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2312.04465v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2312.04465v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Stathis Galanakis, Alexandros Lattas, Stylianos Moschoglou, Stefanos Zafeiriou
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  The remarkable progress in 3D face reconstruction has resulted in high-detail
+and photorealistic facial representations. Recently, Diffusion Models have
+revolutionized the capabilities of generative methods by surpassing the
+performance of GANs. In this work, we present FitDiff, a diffusion-based 3D
+facial avatar generative model. Leveraging diffusion principles, our model
+accurately generates relightable facial avatars, utilizing an identity
+embedding extracted from an "in-the-wild" 2D facial image. The introduced
+multi-modal diffusion model is the first to concurrently output facial
+reflectance maps (diffuse and specular albedo and normals) and shapes,
+showcasing great generalization capabilities. It is solely trained on an
+annotated subset of a public facial dataset, paired with 3D reconstructions. We
+revisit the typical 3D facial fitting approach by guiding a reverse diffusion
+process using perceptual and face recognition losses. Being the first 3D LDM
+conditioned on face recognition embeddings, FitDiff reconstructs relightable
+human avatars, that can be used as-is in common rendering engines, starting
+only from an unconstrained facial image, and achieving state-of-the-art
+performance.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Advancing Unsupervised Low-light Image Enhancement: Noise Estimation,
+  Illumination Interpolation, and Self-Regulation 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2305.10223v4">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2305.10223v4.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Xiaofeng Liu, Jiaxin Gao, Xin Fan, Risheng Liu
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Contemporary Low-Light Image Enhancement (LLIE) techniques have made notable
+advancements in preserving image details and enhancing contrast, achieving
+commendable results on specific datasets. Nevertheless, these approaches
+encounter persistent challenges in efficiently mitigating dynamic noise and
+accommodating diverse low-light scenarios. Insufficient constraints on complex
+pixel-wise mapping learning lead to overfitting to specific types of noise and
+artifacts associated with low-light conditions, reducing effectiveness in
+variable lighting scenarios. To this end, we first propose a method for
+estimating the noise level in low light images in a quick and accurate way.
+This facilitates precise denoising, prevents over-smoothing, and adapts to
+dynamic noise patterns. Subsequently, we devise a Learnable Illumination
+Interpolator (LII), which employs learnlable interpolation operations between
+the input and unit vector to satisfy general constraints between illumination
+and input. Finally, we introduce a self-regularization loss that incorporates
+intrinsic image properties and essential visual attributes to guide the output
+towards meeting human visual expectations. Comprehensive experiments validate
+the competitiveness of our proposed algorithm in both qualitative and
+quantitative assessments. Notably, our noise estimation method, with linear
+time complexity and suitable for various denoisers, significantly improves both
+denoising and enhancement performance. Benefiting from this, our approach
+achieves a 0.675dB PSNR improvement on the LOL dataset and 0.818dB on the MIT
+dataset on LLIE task, even compared to supervised methods. The source code is
+available at \href{https://doi.org/10.5281/zenodo.11463142}{this DOI
+repository} and the specific code for noise estimation can be found at
+\href{https://github.com/GoogolplexGoodenough/noise_estimate}{this separate
+GitHub link}.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Image processing, low-light image enhancement, noise estimation,
+  illumination learning</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Denoising Autoregressive Representation Learning 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2403.05196v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2403.05196v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Yazhe Li, Jorg Bornschein, Ting Chen
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  In this paper, we explore a new generative approach for learning visual
+representations. Our method, DARL, employs a decoder-only Transformer to
+predict image patches autoregressively. We find that training with Mean Squared
+Error (MSE) alone leads to strong representations. To enhance the image
+generation ability, we replace the MSE loss with the diffusion objective by
+using a denoising patch decoder. We show that the learned representation can be
+improved by using tailored noise schedules and longer training in larger
+models. Notably, the optimal schedule differs significantly from the typical
+ones used in standard image diffusion models. Overall, despite its simple
+architecture, DARL delivers performance remarkably close to state-of-the-art
+masked prediction models under the fine-tuning protocol. This marks an
+important step towards a unified model capable of both visual perception and
+generation, effectively combining the strengths of autoregressive and denoising
+diffusion models.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ SeeSR: Towards Semantics-Aware Real-World Image Super-Resolution <span class="chip">CVPR2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2311.16518v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2311.16518v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Rongyuan Wu, Tao Yang, Lingchen Sun, Zhengqiang Zhang, Shuai Li, Lei Zhang
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Owe to the powerful generative priors, the pre-trained text-to-image (T2I)
+diffusion models have become increasingly popular in solving the real-world
+image super-resolution problem. However, as a consequence of the heavy quality
+degradation of input low-resolution (LR) images, the destruction of local
+structures can lead to ambiguous image semantics. As a result, the content of
+reproduced high-resolution image may have semantic errors, deteriorating the
+super-resolution performance. To address this issue, we present a
+semantics-aware approach to better preserve the semantic fidelity of generative
+real-world image super-resolution. First, we train a degradation-aware prompt
+extractor, which can generate accurate soft and hard semantic prompts even
+under strong degradation. The hard semantic prompts refer to the image tags,
+aiming to enhance the local perception ability of the T2I model, while the soft
+semantic prompts compensate for the hard ones to provide additional
+representation information. These semantic prompts encourage the T2I model to
+generate detailed and semantically accurate results. Furthermore, during the
+inference process, we integrate the LR images into the initial sampling noise
+to mitigate the diffusion model's tendency to generate excessive random
+details. The experiments show that our method can reproduce more realistic
+image details and hold better the semantics. The source code of our method can
+be found at https://github.com/cswry/SeeSR.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted by CVPR2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Epistemic Uncertainty-Weighted Loss for Visual Bias Mitigation <span class="chip">CVPR</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2204.09389v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2204.09389v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Rebecca S Stone, Nishant Ravikumar, Andrew J Bulpitt, David C Hogg
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Deep neural networks are highly susceptible to learning biases in visual
+data. While various methods have been proposed to mitigate such bias, the
+majority require explicit knowledge of the biases present in the training data
+in order to mitigate. We argue the relevance of exploring methods which are
+completely ignorant of the presence of any bias, but are capable of identifying
+and mitigating them. Furthermore, we propose using Bayesian neural networks
+with a predictive uncertainty-weighted loss function to dynamically identify
+potential bias in individual training samples and to weight them during
+training. We find a positive correlation between samples subject to bias and
+higher epistemic uncertainties. Finally, we show the method has potential to
+mitigate visual bias on a bias benchmark dataset and on a real-world face
+detection problem, and we consider the merits and weaknesses of our approach.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Published in 2022 IEEE CVPR Workshop on Fair, Data Efficient and
+  Trusted Computer Vision</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ SSFlowNet: Semi-supervised Scene Flow Estimation On Point Clouds With
+  Pseudo Label <span class="chip">ICANN 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2312.15271v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2312.15271v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Jingze Chen, Junfeng Yao, Qiqin Lin, Rongzhou Zhou, Lei Li
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  In the domain of supervised scene flow estimation, the process of manual
+labeling is both time-intensive and financially demanding. This paper
+introduces SSFlowNet, a semi-supervised approach for scene flow estimation,
+that utilizes a blend of labeled and unlabeled data, optimizing the balance
+between the cost of labeling and the precision of model training. SSFlowNet
+stands out through its innovative use of pseudo-labels, mainly reducing the
+dependency on extensively labeled datasets while maintaining high model
+accuracy. The core of our model is its emphasis on the intricate geometric
+structures of point clouds, both locally and globally, coupled with a novel
+spatial memory feature. This feature is adept at learning the geometric
+relationships between points over sequential time frames. By identifying
+similarities between labeled and unlabeled points, SSFlowNet dynamically
+constructs a correlation matrix to evaluate scene flow dependencies at
+individual point level. Furthermore, the integration of a flow consistency
+module within SSFlowNet enhances its capability to consistently estimate flow,
+an essential aspect for analyzing dynamic scenes. Empirical results demonstrate
+that SSFlowNet surpasses existing methods in pseudo-label generation and shows
+adaptability across varying data volumes. Moreover, our semi-supervised
+training technique yields promising outcomes even with different smaller ratio
+labeled data, marking a substantial advancement in the field of scene flow
+estimation.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted by 33rd International Conference on Artificial Neural
+  Networks (ICANN 2024)</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ StrucTexTv3: An Efficient Vision-Language Model for Text-rich Image
+  Perception, Comprehension, and Beyond 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.21013v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.21013v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Pengyuan Lyu, Yulin Li, Hao Zhou, Weihong Ma, Xingyu Wan, Qunyi Xie, Liang Wu, Chengquan Zhang, Kun Yao, Errui Ding, Jingdong Wang
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Text-rich images have significant and extensive value, deeply integrated into
+various aspects of human life. Notably, both visual cues and linguistic symbols
+in text-rich images play crucial roles in information transmission but are
+accompanied by diverse challenges. Therefore, the efficient and effective
+understanding of text-rich images is a crucial litmus test for the capability
+of Vision-Language Models. We have crafted an efficient vision-language model,
+StrucTexTv3, tailored to tackle various intelligent tasks for text-rich images.
+The significant design of StrucTexTv3 is presented in the following aspects:
+Firstly, we adopt a combination of an effective multi-scale reduced visual
+transformer and a multi-granularity token sampler (MG-Sampler) as a visual
+token generator, successfully solving the challenges of high-resolution input
+and complex representation learning for text-rich images. Secondly, we enhance
+the perception and comprehension abilities of StrucTexTv3 through instruction
+learning, seamlessly integrating various text-oriented tasks into a unified
+framework. Thirdly, we have curated a comprehensive collection of high-quality
+text-rich images, abbreviated as TIM-30M, encompassing diverse scenarios like
+incidental scenes, office documents, web pages, and screenshots, thereby
+improving the robustness of our model. Our method achieved SOTA results in
+text-rich image perception tasks, and significantly improved performance in
+comprehension tasks. Among multimodal models with LLM decoder of approximately
+1.8B parameters, it stands out as a leader, which also makes the deployment of
+edge devices feasible. In summary, the StrucTexTv3 model, featuring efficient
+structural design, outstanding performance, and broad adaptability, offers
+robust support for diverse intelligent application tasks involving text-rich
+images, thus exhibiting immense potential for widespread application.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Hyperbolic Active Learning for Semantic Segmentation under Domain Shift <span class="chip">ICML 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2306.11180v5">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2306.11180v5.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Luca Franco, Paolo Mandica, Konstantinos Kallidromitis, Devin Guillory, Yu-Teng Li, Trevor Darrell, Fabio Galasso
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  We introduce a hyperbolic neural network approach to pixel-level active
+learning for semantic segmentation. Analysis of the data statistics leads to a
+novel interpretation of the hyperbolic radius as an indicator of data scarcity.
+In HALO (Hyperbolic Active Learning Optimization), for the first time, we
+propose the use of epistemic uncertainty as a data acquisition strategy,
+following the intuition of selecting data points that are the least known. The
+hyperbolic radius, complemented by the widely-adopted prediction entropy,
+effectively approximates epistemic uncertainty. We perform extensive
+experimental analysis based on two established synthetic-to-real benchmarks,
+i.e. GTAV $\rightarrow$ Cityscapes and SYNTHIA $\rightarrow$ Cityscapes.
+Additionally, we test HALO on Cityscape $\rightarrow$ ACDC for domain
+adaptation under adverse weather conditions, and we benchmark both
+convolutional and attention-based backbones. HALO sets a new state-of-the-art
+in active learning for semantic segmentation under domain shift and it is the
+first active learning approach that surpasses the performance of supervised
+domain adaptation while using only a small portion of labels (i.e., 1%).
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>ICML 2024. Project repository: https://github.com/paolomandica/HALO</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Towards Practical Single-shot Motion Synthesis <span class="chip">CVPR 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.01136v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.01136v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Konstantinos Roditakis, Spyridon Thermos, Nikolaos Zioulis
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Despite the recent advances in the so-called "cold start" generation from
+text prompts, their needs in data and computing resources, as well as the
+ambiguities around intellectual property and privacy concerns pose certain
+counterarguments for their utility. An interesting and relatively unexplored
+alternative has been the introduction of unconditional synthesis from a single
+sample, which has led to interesting generative applications. In this paper we
+focus on single-shot motion generation and more specifically on accelerating
+the training time of a Generative Adversarial Network (GAN). In particular, we
+tackle the challenge of GAN's equilibrium collapse when using mini-batch
+training by carefully annealing the weights of the loss functions that prevent
+mode collapse. Additionally, we perform statistical analysis in the generator
+and discriminator models to identify correlations between training stages and
+enable transfer learning. Our improved GAN achieves competitive quality and
+diversity on the Mixamo benchmark when compared to the original GAN
+architecture and a single-shot diffusion model, while being up to x6.8 faster
+in training time from the former and x1.75 from the latter. Finally, we
+demonstrate the ability of our improved GAN to mix and compose motion with a
+single forward pass. Project page available at
+https://moverseai.github.io/single-shot.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>CVPR 2024, AI for 3D Generation Workshop, Project page:
+  https://moverseai.github.io/single-shot</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Deep Learning for Camera Calibration and Beyond: A <span class="highlight-title">Survey</span> 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2303.10559v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2303.10559v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Kang Liao, Lang Nie, Shujuan Huang, Chunyu Lin, Jing Zhang, Yao Zhao, Moncef Gabbouj, Dacheng Tao
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Camera calibration involves estimating camera parameters to infer geometric
+features from captured sequences, which is crucial for computer vision and
+robotics. However, conventional calibration is laborious and requires dedicated
+collection. Recent efforts show that learning-based solutions have the
+potential to be used in place of the repeatability works of manual
+calibrations. Among these solutions, various learning strategies, networks,
+geometric priors, and datasets have been investigated. In this paper, we
+provide a comprehensive survey of learning-based camera calibration techniques,
+by analyzing their strengths and limitations. Our main calibration categories
+include the standard pinhole camera model, distortion camera model, cross-view
+model, and cross-sensor model, following the research trend and extended
+applications. As there is no benchmark in this community, we collect a holistic
+calibration dataset that can serve as a public platform to evaluate the
+generalization of existing methods. It comprises both synthetic and real-world
+data, with images and videos captured by different cameras in diverse scenes.
+Toward the end of this paper, we discuss the challenges and provide further
+research directions. To our knowledge, this is the first survey for the
+learning-based camera calibration (spanned 8 years). The summarized methods,
+datasets, and benchmarks are available and will be regularly updated at
+https://github.com/KangLiao929/Awesome-Deep-Camera-Calibration.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Github repository:
+  https://github.com/KangLiao929/Awesome-Deep-Camera-Calibration</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Synergistic Integration of Coordinate Network and Tensorial Feature for
+  Improving Neural Radiance Fields from Sparse Inputs <span class="chip">ICML2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.07857v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.07857v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Mingyu Kim, Jun-Seong Kim, Se-Young Yun, Jin-Hwa Kim
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  The multi-plane representation has been highlighted for its fast training and
+inference across static and dynamic neural radiance fields. This approach
+constructs relevant features via projection onto learnable grids and
+interpolating adjacent vertices. However, it has limitations in capturing
+low-frequency details and tends to overuse parameters for low-frequency
+features due to its bias toward fine details, despite its multi-resolution
+concept. This phenomenon leads to instability and inefficiency when training
+poses are sparse. In this work, we propose a method that synergistically
+integrates multi-plane representation with a coordinate-based MLP network known
+for strong bias toward low-frequency signals. The coordinate-based network is
+responsible for capturing low-frequency details, while the multi-plane
+representation focuses on capturing fine-grained details. We demonstrate that
+using residual connections between them seamlessly preserves their own inherent
+properties. Additionally, the proposed progressive training scheme accelerates
+the disentanglement of these two features. We demonstrate empirically that our
+proposed method outperforms baseline models for both static and dynamic NeRFs
+with sparse inputs, achieving comparable results with fewer parameters.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>ICML2024 ; Project page is accessible at
+  https://mingyukim87.github.io/SynergyNeRF ; Code is available at
+  https://github.com/MingyuKim87/SynergyNeRF</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Aggregated Text <span class="highlight-title">Transformer</span> for Scene Text Detection 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2211.13984v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2211.13984v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Zhao Zhou, Xiangcheng Du, Yingbin Zheng, Cheng Jin
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  This paper explores the multi-scale aggregation strategy for scene text
+detection in natural images. We present the Aggregated Text TRansformer(ATTR),
+which is designed to represent texts in scene images with a multi-scale
+self-attention mechanism. Starting from the image pyramid with multiple
+resolutions, the features are first extracted at different scales with shared
+weight and then fed into an encoder-decoder architecture of Transformer. The
+multi-scale image representations are robust and contain rich information on
+text contents of various sizes. The text Transformer aggregates these features
+to learn the interaction across different scales and improve text
+representation. The proposed method detects scene texts by representing each
+text instance as an individual binary mask, which is tolerant of curve texts
+and regions with dense instances. Extensive experiments on public scene text
+detection datasets demonstrate the effectiveness of the proposed framework.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Unsupervised Contrastive Analysis for Salient Pattern Detection using
+  Conditional Diffusion Models 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.00772v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.00772v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Cristiano Patrício, Carlo Alberto Barbano, Attilio Fiandrotti, Riccardo Renzulli, Marco Grangetto, Luis F. Teixeira, João C. Neves
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Contrastive Analysis (CA) regards the problem of identifying patterns in
+images that allow distinguishing between a background (BG) dataset (i.e.
+healthy subjects) and a target (TG) dataset (i.e. unhealthy subjects). Recent
+works on this topic rely on variational autoencoders (VAE) or contrastive
+learning strategies to learn the patterns that separate TG samples from BG
+samples in a supervised manner. However, the dependency on target (unhealthy)
+samples can be challenging in medical scenarios due to their limited
+availability. Also, the blurred reconstructions of VAEs lack utility and
+interpretability. In this work, we redefine the CA task by employing a
+self-supervised contrastive encoder to learn a latent representation encoding
+only common patterns from input images, using samples exclusively from the BG
+dataset during training, and approximating the distribution of the target
+patterns by leveraging data augmentation techniques. Subsequently, we exploit
+state-of-the-art generative methods, i.e. diffusion models, conditioned on the
+learned latent representation to produce a realistic (healthy) version of the
+input image encoding solely the common patterns. Thorough validation on a
+facial image dataset and experiments across three brain MRI datasets
+demonstrate that conditioning the generative process of state-of-the-art
+generative methods with the latent representation from our self-supervised
+contrastive encoder yields improvements in the generated image quality and in
+the accuracy of image classification. The code is available at
+https://github.com/CristianoPatricio/unsupervised-contrastive-cond-diff.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>18 pages, 11 figures</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ MagicBathyNet: A Multimodal Remote Sensing <span class="highlight-title">Dataset</span> for Bathymetry
+  Prediction and Pixel-based Classification in Shallow Waters 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.15477v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.15477v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Panagiotis Agrafiotis, Łukasz Janowski, Dimitrios Skarlatos, Begüm Demir
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Accurate, detailed, and high-frequent bathymetry, coupled with complex
+semantic content, is crucial for the undermapped shallow seabed areas facing
+intense climatological and anthropogenic pressures. Current methods exploiting
+remote sensing images to derive bathymetry or seabed classes mainly exploit
+non-open data. This lack of openly accessible benchmark archives prevents the
+wider use of deep learning methods in such applications. To address this issue,
+in this paper we present the MagicBathyNet, which is a benchmark dataset made
+up of image patches of Sentinel2, SPOT-6 and aerial imagery, bathymetry in
+raster format and annotations of seabed classes. MagicBathyNet is then
+exploited to benchmark state-of-the-art methods in learning-based bathymetry
+and pixel-based classification. Dataset, pre-trained weights, and code are
+publicly available at www.magicbathy.eu/magicbathynet.html.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>5 pages, 3 figures, 5 tables. Accepted at IEEE International
+  Geoscience and Remote Sensing Symposium (IGARSS) 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Sensitivity-Informed Augmentation for Robust Segmentation 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.01425v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.01425v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Laura Zheng, Wenjie Wei, Tony Wu, Jacob Clements, Shreelekha Revankar, Andre Harrison, Yu Shen, Ming C. Lin
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Segmentation is an integral module in many visual computing applications such
+as virtual try-on, medical imaging, autonomous driving, and agricultural
+automation. These applications often involve either widespread consumer use or
+highly variable environments, both of which can degrade the quality of visual
+sensor data, whether from a common mobile phone or an expensive satellite
+imaging camera. In addition to external noises like user difference or weather
+conditions, internal noises such as variations in camera quality or lens
+distortion can affect the performance of segmentation models during both
+development and deployment. In this work, we present an efficient, adaptable,
+and gradient-free method to enhance the robustness of learning-based
+segmentation models across training. First, we introduce a novel adaptive
+sensitivity analysis (ASA) using Kernel Inception Distance (KID) on basis
+perturbations to benchmark perturbation sensitivity of pre-trained segmentation
+models. Then, we model the sensitivity curve using the adaptive SA and sample
+perturbation hyperparameter values accordingly. Finally, we conduct adversarial
+training with the selected perturbation values and dynamically re-evaluate
+robustness during online training. Our method, implemented end-to-end with
+minimal fine-tuning required, consistently outperforms state-of-the-art data
+augmentation techniques for segmentation. It shows significant improvement in
+both clean data evaluation and real-world adverse scenario evaluation across
+various segmentation datasets used in visual computing and computer graphics
+applications.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>10 pages</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ DA-HFNet: Progressive Fine-Grained Forgery Image Detection and
+  Localization Based on Dual Attention 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.01489v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.01489v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Yang Liu, Xiaofei Li, Jun Zhang, Shengze Hu, Jun Lei
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  The increasing difficulty in accurately detecting forged images generated by
+AIGC(Artificial Intelligence Generative Content) poses many risks,
+necessitating the development of effective methods to identify and further
+locate forged areas. In this paper, to facilitate research efforts, we
+construct a DA-HFNet forged image dataset guided by text or image-assisted GAN
+and Diffusion model. Our goal is to utilize a hierarchical progressive network
+to capture forged artifacts at different scales for detection and localization.
+Specifically, it relies on a dual-attention mechanism to adaptively fuse
+multi-modal image features in depth, followed by a multi-branch interaction
+network to thoroughly interact image features at different scales and improve
+detector performance by leveraging dependencies between layers. Additionally,
+we extract more sensitive noise fingerprints to obtain more prominent forged
+artifact features in the forged areas. Extensive experiments validate the
+effectiveness of our approach, demonstrating significant performance
+improvements compared to state-of-the-art methods for forged image detection
+and localization.The code and dataset will be released in the future.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ MLIP: Efficient Multi-Perspective Language-Image <span class="highlight-title">Pretrain</span>ing with
+  Exhaustive Data Utilization <span class="chip">ICML 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.01460v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.01460v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Yu Zhang, Qi Zhang, Zixuan Gong, Yiwei Shi, Yepeng Liu, Duoqian Miao, Yang Liu, Ke Liu, Kun Yi, Wei Fan, Liang Hu, Changwei Wang
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Contrastive Language-Image Pretraining (CLIP) has achieved remarkable
+success, leading to rapid advancements in multimodal studies. However, CLIP
+faces a notable challenge in terms of inefficient data utilization. It relies
+on a single contrastive supervision for each image-text pair during
+representation learning, disregarding a substantial amount of valuable
+information that could offer richer supervision. Additionally, the retention of
+non-informative tokens leads to increased computational demands and time costs,
+particularly in CLIP's ViT image encoder. To address these issues, we propose
+Multi-Perspective Language-Image Pretraining (MLIP). In MLIP, we leverage the
+frequency transform's sensitivity to both high and low-frequency variations,
+which complements the spatial domain's sensitivity limited to low-frequency
+variations only. By incorporating frequency transforms and token-level
+alignment, we expand CILP's single supervision into multi-domain and
+multi-level supervision, enabling a more thorough exploration of informative
+image features. Additionally, we introduce a token merging method guided by
+comprehensive semantics from the frequency and spatial domains. This allows us
+to merge tokens to multi-granularity tokens with a controllable compression
+rate to accelerate CLIP. Extensive experiments validate the effectiveness of
+our design.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>ICML 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Physics Inspired Criterion for Pruning-Quantization Joint Learning 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2312.00851v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2312.00851v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Weiying Xie, Xiaoyi Fan, Xin Zhang, Yunsong Li, Jie Lei, Leyuan Fang
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Pruning-quantization joint learning always facilitates the deployment of deep
+neural networks (DNNs) on resource-constrained edge devices. However, most
+existing methods do not jointly learn a global criterion for pruning and
+quantization in an interpretable way. In this paper, we propose a novel physics
+inspired criterion for pruning-quantization joint learning (PIC-PQ), which is
+explored from an analogy we first draw between elasticity dynamics (ED) and
+model compression (MC). Specifically, derived from Hooke's law in ED, we
+establish a linear relationship between the filters' importance distribution
+and the filter property (FP) by a learnable deformation scale in the physics
+inspired criterion (PIC). Furthermore, we extend PIC with a relative shift
+variable for a global view. To ensure feasibility and flexibility, available
+maximum bitwidth and penalty factor are introduced in quantization bitwidth
+assignment. Experiments on benchmarks of image classification demonstrate that
+PIC-PQ yields a good trade-off between accuracy and bit-operations (BOPs)
+compression ratio e.g., 54.96X BOPs compression ratio in ResNet56 on CIFAR10
+with 0.10% accuracy drop and 53.24X in ResNet18 on ImageNet with 0.61% accuracy
+drop). The code will be available at https://github.com/fanxxxxyi/PIC-PQ.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ ChebMixer: Efficient Graph Representation Learning with MLP Mixer 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2403.16358v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2403.16358v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Xiaoyan Kui, Haonan Yan, Qinsong Li, Liming Chen, Beiji Zou
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Graph neural networks have achieved remarkable success in learning graph
+representations, especially graph Transformer, which has recently shown
+superior performance on various graph mining tasks. However, graph Transformer
+generally treats nodes as tokens, which results in quadratic complexity
+regarding the number of nodes during self-attention computation. The graph MLP
+Mixer addresses this challenge by using the efficient MLP Mixer technique from
+computer vision. However, the time-consuming process of extracting graph tokens
+limits its performance. In this paper, we present a novel architecture named
+ChebMixer, a newly graph MLP Mixer that uses fast Chebyshev polynomials-based
+spectral filtering to extract a sequence of tokens. Firstly, we produce
+multiscale representations of graph nodes via fast Chebyshev polynomial-based
+spectral filtering. Next, we consider each node's multiscale representations as
+a sequence of tokens and refine the node representation with an effective MLP
+Mixer. Finally, we aggregate the multiscale representations of nodes through
+Chebyshev interpolation. Owing to the powerful representation capabilities and
+fast computational properties of MLP Mixer, we can quickly extract more
+informative node representations to improve the performance of downstream
+tasks. The experimental results prove our significant improvements in a variety
+of scenarios ranging from graph node classification to medical image
+segmentation.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Language-guided Image Reflection Separation 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2402.11874v4">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2402.11874v4.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Haofeng Zhong, Yuchen Hong, Shuchen Weng, Jinxiu Liang, Boxin Shi
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  This paper studies the problem of language-guided reflection separation,
+which aims at addressing the ill-posed reflection separation problem by
+introducing language descriptions to provide layer content. We propose a
+unified framework to solve this problem, which leverages the cross-attention
+mechanism with contrastive learning strategies to construct the correspondence
+between language descriptions and image layers. A gated network design and a
+randomized training strategy are employed to tackle the recognizable layer
+ambiguity. The effectiveness of the proposed method is validated by the
+significant performance advantage over existing reflection separation methods
+on both quantitative and qualitative comparisons.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ PLA4D: Pixel-Level Alignments for Text-to-4D Gaussian Splatting 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.19957v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.19957v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Qiaowei Miao, Yawei Luo, Yi Yang
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  As text-conditioned diffusion models (DMs) achieve breakthroughs in image,
+video, and 3D generation, the research community's focus has shifted to the
+more challenging task of text-to-4D synthesis, which introduces a temporal
+dimension to generate dynamic 3D objects. In this context, we identify Score
+Distillation Sampling (SDS), a widely used technique for text-to-3D synthesis,
+as a significant hindrance to text-to-4D performance due to its Janus-faced and
+texture-unrealistic problems coupled with high computational costs. In this
+paper, we propose \textbf{P}ixel-\textbf{L}evel \textbf{A}lignments for
+Text-to-\textbf{4D} Gaussian Splatting (\textbf{PLA4D}), a novel method that
+utilizes text-to-video frames as explicit pixel alignment targets to generate
+static 3D objects and inject motion into them. Specifically, we introduce Focal
+Alignment to calibrate camera poses for rendering and GS-Mesh Contrastive
+Learning to distill geometry priors from rendered image contrasts at the pixel
+level. Additionally, we develop Motion Alignment using a deformation network to
+drive changes in Gaussians and implement Reference Refinement for smooth 4D
+object surfaces. These techniques enable 4D Gaussian Splatting to align
+geometry, texture, and motion with generated videos at the pixel level.
+Compared to previous methods, PLA4D produces synthesized outputs with better
+texture details in less time and effectively mitigates the Janus-faced problem.
+PLA4D is fully implemented using open-source models, offering an accessible,
+user-friendly, and promising direction for 4D digital content creation. Our
+project page: https://github.com/MiaoQiaowei/PLA4D.github.io.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Superhuman performance in urology board questions by an explainable
+  large language model enabled for context integration of the European
+  Association of Urology guidelines: the UroBot study 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.01428v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.01428v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Martin J. Hetz, Nicolas Carl, Sarah Haggenmüller, Christoph Wies, Maurice Stephan Michel, Frederik Wessels, Titus J. Brinker
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Large Language Models (LLMs) are revolutionizing medical Question-Answering
+(medQA) through extensive use of medical literature. However, their performance
+is often hampered by outdated training data and a lack of explainability, which
+limits clinical applicability. This study aimed to create and assess UroBot, a
+urology-specialized chatbot, by comparing it with state-of-the-art models and
+the performance of urologists on urological board questions, ensuring full
+clinician-verifiability. UroBot was developed using OpenAI's GPT-3.5, GPT-4,
+and GPT-4o models, employing retrieval-augmented generation (RAG) and the
+latest 2023 guidelines from the European Association of Urology (EAU). The
+evaluation included ten runs of 200 European Board of Urology (EBU) In-Service
+Assessment (ISA) questions, with performance assessed by the mean Rate of
+Correct Answers (RoCA). UroBot-4o achieved an average RoCA of 88.4%, surpassing
+GPT-4o by 10.8%, with a score of 77.6%. It was also clinician-verifiable and
+exhibited the highest run agreement as indicated by Fleiss' Kappa (k = 0.979).
+By comparison, the average performance of urologists on board questions, as
+reported in the literature, is 68.7%. UroBot's clinician-verifiable nature and
+superior accuracy compared to both existing models and urologists on board
+questions highlight its potential for clinical integration. The study also
+provides the necessary code and instructions for further development of UroBot.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ From Macro to Micro: Boosting micro-expression recognition via
+  <span class="highlight-title">pre-train</span>ing on macro-expression videos 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.16451v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.16451v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Hanting Li, Hongjing Niu, Feng Zhao
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Micro-expression recognition (MER) has drawn increasing attention in recent
+years due to its potential applications in intelligent medical and lie
+detection. However, the shortage of annotated data has been the major obstacle
+to further improve deep-learning based MER methods. Intuitively, utilizing
+sufficient macro-expression data to promote MER performance seems to be a
+feasible solution. However, the facial patterns of macro-expressions and
+micro-expressions are significantly different, which makes naive transfer
+learning methods difficult to deploy directly. To tacle this issue, we propose
+a generalized transfer learning paradigm, called \textbf{MA}cro-expression
+\textbf{TO} \textbf{MI}cro-expression (MA2MI). Under our paradigm, networks can
+learns the ability to represent subtle facial movement by reconstructing future
+frames. In addition, we also propose a two-branch micro-action network
+(MIACNet) to decouple facial position features and facial action features,
+which can help the network more accurately locate facial action locations.
+Extensive experiments on three popular MER benchmarks demonstrate the
+superiority of our method.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>18 pages</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ An Image Segmentation Model with Transformed Total Variation 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.00571v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.00571v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Elisha Dayag, Kevin Bui, Fredrick Park, Jack Xin
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Based on transformed $\ell_1$ regularization, transformed total variation
+(TTV) has robust image recovery that is competitive with other nonconvex total
+variation (TV) regularizers, such as TV$^p$, $0<p<1$. Inspired by its
+performance, we propose a TTV-regularized Mumford--Shah model with fuzzy
+membership function for image segmentation. To solve it, we design an
+alternating direction method of multipliers (ADMM) algorithm that utilizes the
+transformed $\ell_1$ proximal operator. Numerical experiments demonstrate that
+using TTV is more effective than classical TV and other nonconvex TV variants
+in image segmentation.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted to EUSIPCO'24</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Learning Temporally Consistent Video Depth from Video Diffusion Priors 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.01493v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.01493v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Jiahao Shao, Yuanbo Yang, Hongyu Zhou, Youmin Zhang, Yujun Shen, Matteo Poggi, Yiyi Liao
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  This work addresses the challenge of video depth estimation, which expects
+not only per-frame accuracy but, more importantly, cross-frame consistency.
+Instead of directly developing a depth estimator from scratch, we reformulate
+the prediction task into a conditional generation problem. This allows us to
+leverage the prior knowledge embedded in existing video generation models,
+thereby reducing learning difficulty and enhancing generalizability.
+Concretely, we study how to tame the public Stable Video Diffusion (SVD) to
+predict reliable depth from input videos using a mixture of image depth and
+video depth datasets. We empirically confirm that a procedural training
+strategy -- first optimizing the spatial layers of SVD and then optimizing the
+temporal layers while keeping the spatial layers frozen -- yields the best
+results in terms of both spatial accuracy and temporal consistency. We further
+examine the sliding window strategy for inference on arbitrarily long videos.
+Our observations indicate a trade-off between efficiency and performance, with
+a one-frame overlap already producing favorable results. Extensive experimental
+results demonstrate the superiority of our approach, termed ChronoDepth, over
+existing alternatives, particularly in terms of the temporal consistency of the
+estimated depth. Additionally, we highlight the benefits of more consistent
+video depth in two practical applications: depth-conditioned video generation
+and novel view synthesis. Our project page is available at
+https://jhaoshao.github.io/ChronoDepth/.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Extreme Point Supervised Instance Segmentation <span class="chip">CVPR 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20729v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20729v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Hyeonjun Lee, Sehyun Hwang, Suha Kwak
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  This paper introduces a novel approach to learning instance segmentation
+using extreme points, i.e., the topmost, leftmost, bottommost, and rightmost
+points, of each object. These points are readily available in the modern
+bounding box annotation process while offering strong clues for precise
+segmentation, and thus allows to improve performance at the same annotation
+cost with box-supervised methods. Our work considers extreme points as a part
+of the true instance mask and propagates them to identify potential foreground
+and background points, which are all together used for training a pseudo label
+generator. Then pseudo labels given by the generator are in turn used for
+supervised learning of our final model. On three public benchmarks, our method
+significantly outperforms existing box-supervised methods, further narrowing
+the gap with its fully supervised counterpart. In particular, our model
+generates high-quality masks when a target object is separated into multiple
+parts, where previous box-supervised methods often fail.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted to CVPR 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ FineDiffusion: Scaling up Diffusion Models for Fine-grained Image
+  Generation with 10,000 Classes 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2402.18331v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2402.18331v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Ziying Pan, Kun Wang, Gang Li, Feihong He, Yongxuan Lai
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  The class-conditional image generation based on diffusion models is renowned
+for generating high-quality and diverse images. However, most prior efforts
+focus on generating images for general categories, e.g., 1000 classes in
+ImageNet-1k. A more challenging task, large-scale fine-grained image
+generation, remains the boundary to explore. In this work, we present a
+parameter-efficient strategy, called FineDiffusion, to fine-tune large
+pre-trained diffusion models scaling to large-scale fine-grained image
+generation with 10,000 categories. FineDiffusion significantly accelerates
+training and reduces storage overhead by only fine-tuning tiered class
+embedder, bias terms, and normalization layers' parameters. To further improve
+the image generation quality of fine-grained categories, we propose a novel
+sampling method for fine-grained image generation, which utilizes
+superclass-conditioned guidance, specifically tailored for fine-grained
+categories, to replace the conventional classifier-free guidance sampling.
+Compared to full fine-tuning, FineDiffusion achieves a remarkable 1.56x
+training speed-up and requires storing merely 1.77% of the total model
+parameters, while achieving state-of-the-art FID of 9.776 on image generation
+of 10,000 classes. Extensive qualitative and quantitative experiments
+demonstrate the superiority of our method compared to other parameter-efficient
+fine-tuning methods. The code and more generated results are available at our
+project website: https://finediffusion.github.io/.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Improving Prototypical Visual Explanations with Reward Reweighing,
+  Reselection, and Retraining 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2307.03887v4">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2307.03887v4.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Aaron J. Li, Robin Netzorg, Zhihan Cheng, Zhuoqin Zhang, Bin Yu
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  In recent years, work has gone into developing deep interpretable methods for
+image classification that clearly attributes a model's output to specific
+features of the data. One such of these methods is the Prototypical Part
+Network (ProtoPNet), which attempts to classify images based on meaningful
+parts of the input. While this architecture is able to produce visually
+interpretable classifications, it often learns to classify based on parts of
+the image that are not semantically meaningful. To address this problem, we
+propose the Reward Reweighing, Reselecting, and Retraining (R3) post-processing
+framework, which performs three additional corrective updates to a pretrained
+ProtoPNet in an offline and efficient manner. The first two steps involve
+learning a reward model based on collected human feedback and then aligning the
+prototypes with human preferences. The final step is retraining, which realigns
+the base features and the classifier layer of the original model with the
+updated prototypes. We find that our R3 framework consistently improves both
+the interpretability and the predictive accuracy of ProtoPNet and its variants.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Transferable and Principled Efficiency for Open-Vocabulary Segmentation 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2404.07448v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2404.07448v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Jingxuan Xu, Wuyang Chen, Yao Zhao, Yunchao Wei
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Recent success of pre-trained foundation vision-language models makes
+Open-Vocabulary Segmentation (OVS) possible. Despite the promising performance,
+this approach introduces heavy computational overheads for two challenges: 1)
+large model sizes of the backbone; 2) expensive costs during the fine-tuning.
+These challenges hinder this OVS strategy from being widely applicable and
+affordable in real-world scenarios. Although traditional methods such as model
+compression and efficient fine-tuning can address these challenges, they often
+rely on heuristics. This means that their solutions cannot be easily
+transferred and necessitate re-training on different models, which comes at a
+cost. In the context of efficient OVS, we target achieving performance that is
+comparable to or even better than prior OVS works based on large
+vision-language foundation models, by utilizing smaller models that incur lower
+training costs. The core strategy is to make our efficiency principled and thus
+seamlessly transferable from one OVS framework to others without further
+customization. Comprehensive experiments on diverse OVS benchmarks demonstrate
+our superior trade-off between segmentation accuracy and computation costs over
+previous works. Our code is available on https://github.com/Xujxyang/OpenTrans
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Research on the Application of Computer Vision Based on Deep Learning in
+  Autonomous Driving Technology 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.00490v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.00490v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Jingyu Zhang, Jin Cao, Jinghao Chang, Xinjin Li, Houze Liu, Zhenglin Li
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  This research aims to explore the application of deep learning in autonomous
+driving computer vision technology and its impact on improving system
+performance. By using advanced technologies such as convolutional neural
+networks (CNN), multi-task joint learning methods, and deep reinforcement
+learning, this article analyzes in detail the application of deep learning in
+image recognition, real-time target tracking and classification, environment
+perception and decision support, and path planning and navigation. Application
+process in key areas. Research results show that the proposed system has an
+accuracy of over 98% in image recognition, target tracking and classification,
+and also demonstrates efficient performance and practicality in environmental
+perception and decision support, path planning and navigation. The conclusion
+points out that deep learning technology can significantly improve the accuracy
+and real-time response capabilities of autonomous driving systems. Although
+there are still challenges in environmental perception and decision support,
+with the advancement of technology, it is expected to achieve wider
+applications and greater capabilities in the future. potential.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ GeminiFusion: Efficient Pixel-wise Multimodal Fusion for Vision
+  <span class="highlight-title">Transformer</span> <span class="chip">ICML 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.01210v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.01210v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Ding Jia, Jianyuan Guo, Kai Han, Han Wu, Chao Zhang, Chang Xu, Xinghao Chen
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Cross-modal transformers have demonstrated superiority in various vision
+tasks by effectively integrating different modalities. This paper first
+critiques prior token exchange methods which replace less informative tokens
+with inter-modal features, and demonstrate exchange based methods underperform
+cross-attention mechanisms, while the computational demand of the latter
+inevitably restricts its use with longer sequences. To surmount the
+computational challenges, we propose GeminiFusion, a pixel-wise fusion approach
+that capitalizes on aligned cross-modal representations. GeminiFusion elegantly
+combines intra-modal and inter-modal attentions, dynamically integrating
+complementary information across modalities. We employ a layer-adaptive noise
+to adaptively control their interplay on a per-layer basis, thereby achieving a
+harmonized fusion process. Notably, GeminiFusion maintains linear complexity
+with respect to the number of input tokens, ensuring this multimodal framework
+operates with efficiency comparable to unimodal networks. Comprehensive
+evaluations across multimodal image-to-image translation, 3D object detection
+and arbitrary-modal semantic segmentation tasks, including RGB, depth, LiDAR,
+event data, etc. demonstrate the superior performance of our GeminiFusion
+against leading-edge techniques. The PyTorch code is available at
+https://github.com/JiaDingCN/GeminiFusion
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted by ICML 2024, code and models are available at
+  https://github.com/JiaDingCN/GeminiFusion</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Towards Robust Physical-world Backdoor Attacks on Lane Detection 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.05553v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.05553v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Xinwei Zhang, Aishan Liu, Tianyuan Zhang, Siyuan Liang, Xianglong Liu
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Deep learning-based lane detection (LD) plays a critical role in autonomous
+driving systems, such as adaptive cruise control. However, it is vulnerable to
+backdoor attacks. Existing backdoor attack methods on LD exhibit limited
+effectiveness in dynamic real-world scenarios, primarily because they fail to
+consider dynamic scene factors, including changes in driving perspectives
+(e.g., viewpoint transformations) and environmental conditions (e.g., weather
+or lighting changes). To tackle this issue, this paper introduces BadLANE, a
+dynamic scene adaptation backdoor attack for LD designed to withstand changes
+in real-world dynamic scene factors. To address the challenges posed by
+changing driving perspectives, we propose an amorphous trigger pattern composed
+of shapeless pixels. This trigger design allows the backdoor to be activated by
+various forms or shapes of mud spots or pollution on the road or lens, enabling
+adaptation to changes in vehicle observation viewpoints during driving. To
+mitigate the effects of environmental changes, we design a meta-learning
+framework to train meta-generators tailored to different environmental
+conditions. These generators produce meta-triggers that incorporate diverse
+environmental information, such as weather or lighting conditions, as the
+initialization of the trigger patterns for backdoor implantation, thus enabling
+adaptation to dynamic environments. Extensive experiments on various commonly
+used LD models in both digital and physical domains validate the effectiveness
+of our attacks, outperforming other baselines significantly (+25.15% on average
+in Attack Success Rate). Our codes will be available upon paper publication.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Addressing Diverging Training Costs using Local Restoration for Precise
+  Bird's Eye View Map Construction 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.01016v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.01016v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Minsu Kim, Giseop Kim, Sunwook Choi
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Recent advancements in Bird's Eye View (BEV) fusion for map construction have
+demonstrated remarkable mapping of urban environments. However, their deep and
+bulky architecture incurs substantial amounts of backpropagation memory and
+computing latency. Consequently, the problem poses an unavoidable bottleneck in
+constructing high-resolution (HR) BEV maps, as their large-sized features cause
+significant increases in costs including GPU memory consumption and computing
+latency, named diverging training costs issue. Affected by the problem, most
+existing methods adopt low-resolution (LR) BEV and struggle to estimate the
+precise locations of urban scene components like road lanes, and sidewalks. As
+the imprecision leads to risky self-driving, the diverging training costs issue
+has to be resolved. In this paper, we address the issue with our novel Trumpet
+Neural Network (TNN) mechanism. The framework utilizes LR BEV space and outputs
+an up-sampled semantic BEV map to create a memory-efficient pipeline. To this
+end, we introduce Local Restoration of BEV representation. Specifically, the
+up-sampled BEV representation has severely aliased, blocky signals, and thick
+semantic labels. Our proposed Local Restoration restores the signals and thins
+(or narrows down) the width of the labels. Our extensive experiments show that
+the TNN mechanism provides a plug-and-play memory-efficient pipeline, thereby
+enabling the effective estimation of real-sized (or precise) semantic labels
+for BEV map construction.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ OODRobustBench: a Benchmark and Large-Scale Analysis of Adversarial
+  Robustness under Distribution Shift <span class="chip">ICML2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2310.12793v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2310.12793v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Lin Li, Yifei Wang, Chawin Sitawarin, Michael Spratling
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Existing works have made great progress in improving adversarial robustness,
+but typically test their method only on data from the same distribution as the
+training data, i.e. in-distribution (ID) testing. As a result, it is unclear
+how such robustness generalizes under input distribution shifts, i.e.
+out-of-distribution (OOD) testing. This omission is concerning as such
+distribution shifts are unavoidable when methods are deployed in the wild. To
+address this issue we propose a benchmark named OODRobustBench to
+comprehensively assess OOD adversarial robustness using 23 dataset-wise shifts
+(i.e. naturalistic shifts in input distribution) and 6 threat-wise shifts
+(i.e., unforeseen adversarial threat models). OODRobustBench is used to assess
+706 robust models using 60.7K adversarial evaluations. This large-scale
+analysis shows that: 1) adversarial robustness suffers from a severe OOD
+generalization issue; 2) ID robustness correlates strongly with OOD robustness
+in a positive linear way. The latter enables the prediction of OOD robustness
+from ID robustness. We then predict and verify that existing methods are
+unlikely to achieve high OOD robustness. Novel methods are therefore required
+to achieve OOD robustness beyond our prediction. To facilitate the development
+of these methods, we investigate a wide range of techniques and identify
+several promising directions. Code and models are available at:
+https://github.com/OODRobustBench/OODRobustBench.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>ICML2024, and ICLR2024 DMLR workshop</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ A Unified Framework for Microscopy Defocus Deblur with Multi-Pyramid
+  <span class="highlight-title">Transformer</span> and Contrastive Learning <span class="chip">CVPR 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2403.02611v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2403.02611v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Yuelin Zhang, Pengyu Zheng, Wanquan Yan, Chengyu Fang, Shing Shin Cheng
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Defocus blur is a persistent problem in microscope imaging that poses harm to
+pathology interpretation and medical intervention in cell microscopy and
+microscope surgery. To address this problem, a unified framework including the
+multi-pyramid transformer (MPT) and extended frequency contrastive
+regularization (EFCR) is proposed to tackle two outstanding challenges in
+microscopy deblur: longer attention span and data deficiency. The MPT employs
+an explicit pyramid structure at each network stage that integrates the
+cross-scale window attention (CSWA), the intra-scale channel attention (ISCA),
+and the feature-enhancing feed-forward network (FEFN) to capture long-range
+cross-scale spatial interaction and global channel context. The EFCR addresses
+the data deficiency problem by exploring latent deblur signals from different
+frequency bands. It also enables deblur knowledge transfer to learn
+cross-domain information from extra data, improving deblur performance for
+labeled and unlabeled data. Extensive experiments and downstream task
+validation show the framework achieves state-of-the-art performance across
+multiple datasets. Project page: https://github.com/PieceZhang/MPT-CataBlur.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>CVPR 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ A <span class="highlight-title">Survey</span> of Mix-based Data Augmentation: Taxonomy, Methods,
+  Applications, and Explainability 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2212.10888v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2212.10888v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Chengtai Cao, Fan Zhou, Yurou Dai, Jianping Wang, Kunpeng Zhang
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Data augmentation (DA) is indispensable in modern machine learning and deep
+neural networks. The basic idea of DA is to construct new training data to
+improve the model's generalization by adding slightly disturbed versions of
+existing data or synthesizing new data. This survey comprehensively reviews a
+crucial subset of DA techniques, namely Mix-based Data Augmentation (MixDA),
+which generates novel samples by combining multiple examples. In contrast to
+traditional DA approaches that operate on single samples or entire datasets,
+MixDA stands out due to its effectiveness, simplicity, flexibility,
+computational efficiency, theoretical foundation, and broad applicability. We
+begin by introducing a novel taxonomy that categorizes MixDA into Mixup-based,
+Cutmix-based, and mixture approaches based on a hierarchical perspective of the
+data mixing operation. Subsequently, we provide an in-depth review of various
+MixDA techniques, focusing on their underlying motivations. Owing to its
+versatility, MixDA has penetrated a wide range of applications, which we also
+thoroughly investigate in this survey. Moreover, we delve into the underlying
+mechanisms of MixDA's effectiveness by examining its impact on model
+generalization and calibration while providing insights into the model's
+behavior by analyzing the inherent properties of MixDA. Finally, we
+recapitulate the critical findings and fundamental challenges of current MixDA
+studies while outlining the potential directions for future works. Different
+from previous related surveys that focus on DA approaches in specific domains
+(e.g., CV and NLP) or only review a limited subset of MixDA studies, we are the
+first to provide a systematical survey of MixDA, covering its taxonomy,
+methodology, application, and explainability. Furthermore, we provide promising
+directions for researchers interested in this exciting area.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>41 pages, 4 figures, and 5 tables</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ CMOSE: Comprehensive Multi-Modality Online Student Engagement <span class="highlight-title">Dataset</span>
+  with High-Quality Labels 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2312.09066v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2312.09066v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Chi-hsuan Wu, Shih-yang Liu, Xijie Huang, Xingbo Wang, Rong Zhang, Luca Minciullo, Wong Kai Yiu, Kenny Kwan, Kwang-Ting Cheng
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Online learning is a rapidly growing industry. However, a major doubt about
+online learning is whether students are as engaged as they are in face-to-face
+classes. An engagement recognition system can notify the instructors about the
+students condition and improve the learning experience. Current challenges in
+engagement detection involve poor label quality, extreme data imbalance, and
+intra-class variety - the variety of behaviors at a certain engagement level.
+To address these problems, we present the CMOSE dataset, which contains a large
+number of data from different engagement levels and high-quality labels
+annotated according to psychological advice. We also propose a training
+mechanism MocoRank to handle the intra-class variety and the ordinal pattern of
+different degrees of engagement classes. MocoRank outperforms prior engagement
+detection frameworks, achieving a 1.32% increase in overall accuracy and 5.05%
+improvement in average accuracy. Further, we demonstrate the effectiveness of
+multi-modality in engagement detection by combining video features with speech
+and audio features. The data transferability experiments also state that the
+proposed CMOSE dataset provides superior label quality and behavior diversity.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>11 pages</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ LanEvil: Benchmarking the Robustness of Lane Detection to Environmental
+  Illusions <span class="chip">ACM MM 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.00934v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.00934v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Tianyuan Zhang, Lu Wang, Hainan Li, Yisong Xiao, Siyuan Liang, Aishan Liu, Xianglong Liu, Dacheng Tao
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Lane detection (LD) is an essential component of autonomous driving systems,
+providing fundamental functionalities like adaptive cruise control and
+automated lane centering. Existing LD benchmarks primarily focus on evaluating
+common cases, neglecting the robustness of LD models against environmental
+illusions such as shadows and tire marks on the road. This research gap poses
+significant safety challenges since these illusions exist naturally in
+real-world traffic situations. For the first time, this paper studies the
+potential threats caused by these environmental illusions to LD and establishes
+the first comprehensive benchmark LanEvil for evaluating the robustness of LD
+against this natural corruption. We systematically design 14 prevalent yet
+critical types of environmental illusions (e.g., shadow, reflection) that cover
+a wide spectrum of real-world influencing factors in LD tasks. Based on
+real-world environments, we create 94 realistic and customizable 3D cases using
+the widely used CARLA simulator, resulting in a dataset comprising 90,292
+sampled images. Through extensive experiments, we benchmark the robustness of
+popular LD methods using LanEvil, revealing substantial performance degradation
+(-5.37% Accuracy and -10.70% F1-Score on average), with shadow effects posing
+the greatest risk (-7.39% Accuracy). Additionally, we assess the performance of
+commercial auto-driving systems OpenPilot and Apollo through collaborative
+simulations, demonstrating that proposed environmental illusions can lead to
+incorrect decisions and potential traffic accidents. To defend against
+environmental illusions, we propose the Attention Area Mixing (AAM) approach
+using hard examples, which witness significant robustness improvement (+3.76%)
+under illumination effects. We hope our paper can contribute to advancing more
+robust auto-driving systems in the future. Website: https://lanevil.github.io/.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Submitted to ACM MM 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Privacy-Preserving CNN Training with Transfer Learning: Multiclass
+  Logistic Regression 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2304.03807v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2304.03807v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        John Chiang
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  In this paper, we present a practical solution to implement
+privacy-preserving CNN training based on mere Homomorphic Encryption (HE)
+technique. To our best knowledge, this is the first attempt successfully to
+crack this nut and no work ever before has achieved this goal. Several
+techniques combine to accomplish the task:: (1) with transfer learning,
+privacy-preserving CNN training can be reduced to homomorphic neural network
+training, or even multiclass logistic regression (MLR) training; (2) via a
+faster gradient variant called $\texttt{Quadratic Gradient}$, an enhanced
+gradient method for MLR with a state-of-the-art performance in convergence
+speed is applied in this work to achieve high performance; (3) we employ the
+thought of transformation in mathematics to transform approximating Softmax
+function in the encryption domain to the approximation of the Sigmoid function.
+A new type of loss function termed $\texttt{Squared Likelihood Error}$ has been
+developed alongside to align with this change.; and (4) we use a simple but
+flexible matrix-encoding method named $\texttt{Volley Revolver}$ to manage the
+data flow in the ciphertexts, which is the key factor to complete the whole
+homomorphic CNN training. The complete, runnable C++ code to implement our work
+can be found at:
+\href{https://github.com/petitioner/HE.CNNtraining}{$\texttt{https://github.com/petitioner/HE.CNNtraining}$}.
+We select $\texttt{REGNET\_X\_400MF}$ as our pre-trained model for transfer
+learning. We use the first 128 MNIST training images as training data and the
+whole MNIST testing dataset as the testing data. The client only needs to
+upload 6 ciphertexts to the cloud and it takes $\sim 21$ mins to perform 2
+iterations on a cloud with 64 vCPUs, resulting in a precision of $21.49\%$.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>In this work, we initiated to implement privacy-persevering CNN
+  training based on mere HE techniques by presenting a faster HE-friendly
+  algorithm</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Contextualized Diffusion Models for Text-Guided Image and Video
+  Generation <span class="chip">ICLR 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2402.16627v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2402.16627v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Ling Yang, Zhilong Zhang, Zhaochen Yu, Jingwei Liu, Minkai Xu, Stefano Ermon, Bin Cui
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Conditional diffusion models have exhibited superior performance in
+high-fidelity text-guided visual generation and editing. Nevertheless,
+prevailing text-guided visual diffusion models primarily focus on incorporating
+text-visual relationships exclusively into the reverse process, often
+disregarding their relevance in the forward process. This inconsistency between
+forward and reverse processes may limit the precise conveyance of textual
+semantics in visual synthesis results. To address this issue, we propose a
+novel and general contextualized diffusion model (ContextDiff) by incorporating
+the cross-modal context encompassing interactions and alignments between text
+condition and visual sample into forward and reverse processes. We propagate
+this context to all timesteps in the two processes to adapt their trajectories,
+thereby facilitating cross-modal conditional modeling. We generalize our
+contextualized diffusion to both DDPMs and DDIMs with theoretical derivations,
+and demonstrate the effectiveness of our model in evaluations with two
+challenging tasks: text-to-image generation, and text-to-video editing. In each
+task, our ContextDiff achieves new state-of-the-art performance, significantly
+enhancing the semantic alignment between text condition and generated samples,
+as evidenced by quantitative and qualitative evaluations. Our code is available
+at https://github.com/YangLing0818/ContextDiff
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>ICLR 2024. Project: https://github.com/YangLing0818/ContextDiff</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ LVLM-Intrepret: An Interpretability Tool for Large Vision-Language
+  Models 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2404.03118v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2404.03118v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Gabriela Ben Melech Stan, Estelle Aflalo, Raanan Yehezkel Rohekar, Anahita Bhiwandiwalla, Shao-Yen Tseng, Matthew Lyle Olson, Yaniv Gurwicz, Chenfei Wu, Nan Duan, Vasudev Lal
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  In the rapidly evolving landscape of artificial intelligence, multi-modal
+large language models are emerging as a significant area of interest. These
+models, which combine various forms of data input, are becoming increasingly
+popular. However, understanding their internal mechanisms remains a complex
+task. Numerous advancements have been made in the field of explainability tools
+and mechanisms, yet there is still much to explore. In this work, we present a
+novel interactive application aimed towards understanding the internal
+mechanisms of large vision-language models. Our interface is designed to
+enhance the interpretability of the image patches, which are instrumental in
+generating an answer, and assess the efficacy of the language model in
+grounding its output in the image. With our application, a user can
+systematically investigate the model and uncover system limitations, paving the
+way for enhancements in system capabilities. Finally, we present a case study
+of how our application can aid in understanding failure mechanisms in a popular
+large multi-modal model: LLaVA.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Improving Hateful Meme Detection through Retrieval-Guided Contrastive
+  Learning <span class="chip">ACL 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2311.08110v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2311.08110v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Jingbiao Mei, Jinghong Chen, Weizhe Lin, Bill Byrne, Marcus Tomalin
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Hateful memes have emerged as a significant concern on the Internet.
+Detecting hateful memes requires the system to jointly understand the visual
+and textual modalities. Our investigation reveals that the embedding space of
+existing CLIP-based systems lacks sensitivity to subtle differences in memes
+that are vital for correct hatefulness classification. We propose constructing
+a hatefulness-aware embedding space through retrieval-guided contrastive
+training. Our approach achieves state-of-the-art performance on the
+HatefulMemes dataset with an AUROC of 87.0, outperforming much larger
+fine-tuned large multimodal models. We demonstrate a retrieval-based hateful
+memes detection system, which is capable of identifying hatefulness based on
+data unseen in training. This allows developers to update the hateful memes
+detection system by simply adding new examples without retraining, a desirable
+feature for real services in the constantly evolving landscape of hateful memes
+on the Internet.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted by ACL 2024 Main Conference. This is the camera-ready
+  version. We added more experiments to address reviewers' comments</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Will we run out of data? Limits of LLM scaling based on human-generated
+  data 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2211.04325v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2211.04325v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Pablo Villalobos, Anson Ho, Jaime Sevilla, Tamay Besiroglu, Lennart Heim, Marius Hobbhahn
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  We investigate the potential constraints on LLM scaling posed by the
+availability of public human-generated text data. We forecast the growing
+demand for training data based on current trends and estimate the total stock
+of public human text data. Our findings indicate that if current LLM
+development trends continue, models will be trained on datasets roughly equal
+in size to the available stock of public human text data between 2026 and 2032,
+or slightly earlier if models are overtrained. We explore how progress in
+language modeling can continue when human-generated text datasets cannot be
+scaled any further. We argue that synthetic data generation, transfer learning
+from data-rich domains, and data efficiency improvements might support further
+progress.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Its Not a Modality Gap: Characterizing and Addressing the Contrastive
+  Gap 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.18570v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.18570v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Abrar Fahim, Alex Murphy, Alona Fyshe
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Multi-modal contrastive models such as CLIP achieve state-of-the-art
+performance in zero-shot classification by embedding input images and texts on
+a joint representational space. Recently, a modality gap has been reported in
+two-encoder contrastive models like CLIP, meaning that the image and text
+embeddings reside in disjoint areas of the latent space. Previous studies
+suggest that this gap exists due to 1) the cone effect, 2) mismatched pairs in
+the dataset, and 3) insufficient training. We show that, even when accounting
+for all these factors, and even when using the same modality, the contrastive
+loss actually creates a gap during training. As a result, We propose that the
+modality gap is inherent to the two-encoder contrastive loss and rename it the
+contrastive gap. We present evidence that attributes this contrastive gap to
+low uniformity in CLIP space, resulting in embeddings that occupy only a small
+portion of the latent space. To close the gap, we adapt the uniformity and
+alignment properties of unimodal contrastive loss to the multi-modal setting
+and show that simply adding these terms to the CLIP loss distributes the
+embeddings more uniformly in the representational space, closing the gap. In
+our experiments, we show that the modified representational space achieves
+better performance than default CLIP loss in downstream tasks such as zero-shot
+image classification and multi-modal arithmetic.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Diffusion Models With Learned Adaptive Noise 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2312.13236v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2312.13236v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Subham Sekhar Sahoo, Aaron Gokaslan, Chris De Sa, Volodymyr Kuleshov
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Diffusion models have gained traction as powerful algorithms for synthesizing
+high-quality images. Central to these algorithms is the diffusion process, a
+set of equations which maps data to noise in a way that can significantly
+affect performance. In this paper, we explore whether the diffusion process can
+be learned from data. Our work is grounded in Bayesian inference and seeks to
+improve log-likelihood estimation by casting the learned diffusion process as
+an approximate variational posterior that yields a tighter lower bound (ELBO)
+on the likelihood. A widely held assumption is that the ELBO is invariant to
+the noise process: our work dispels this assumption and proposes multivariate
+learned adaptive noise (MULAN), a learned diffusion process that applies noise
+at different rates across an image. Specifically, our method relies on a
+multivariate noise schedule that is a function of the data to ensure that the
+ELBO is no longer invariant to the choice of the noise schedule as in previous
+works. Empirically, MULAN sets a new state-of-the-art in density estimation on
+CIFAR-10 and ImageNet and reduces the number of training steps by 50%. Code is
+available at https://github.com/s-sahoo/MuLAN
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                    </div>
+                </details>
+            </article>
+            <article>
+                <details>
+                    <Summary>
+                        Information Retrieval <span class="chip" style="font-size: 60%">22</span>
+                    </Summary>
+                    <div class="details-content">
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ XRec: Large Language Models for Explainable Recommendation 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.02377v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.02377v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Qiyao Ma, Xubin Ren, Chao Huang
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Recommender systems help users navigate information overload by providing
+personalized recommendations aligned with their preferences. Collaborative
+Filtering (CF) is a widely adopted approach, but while advanced techniques like
+graph neural networks (GNNs) and self-supervised learning (SSL) have enhanced
+CF models for better user representations, they often lack the ability to
+provide explanations for the recommended items. Explainable recommendations aim
+to address this gap by offering transparency and insights into the
+recommendation decision-making process, enhancing users' understanding. This
+work leverages the language capabilities of Large Language Models (LLMs) to
+push the boundaries of explainable recommender systems. We introduce a
+model-agnostic framework called XRec, which enables LLMs to provide
+comprehensive explanations for user behaviors in recommender systems. By
+integrating collaborative signals and designing a lightweight collaborative
+adaptor, the framework empowers LLMs to understand complex patterns in
+user-item interactions and gain a deeper understanding of user preferences. Our
+extensive experiments demonstrate the effectiveness of XRec, showcasing its
+ability to generate comprehensive and meaningful explanations that outperform
+baseline approaches in explainable recommender systems. We open-source our
+model implementation at https://github.com/HKUDS/XRec.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Large Language Models Make Sample-Efficient Recommender Systems 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.02368v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.02368v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Jianghao Lin, Xinyi Dai, Rong Shan, Bo Chen, Ruiming Tang, Yong Yu, Weinan Zhang
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Large language models (LLMs) have achieved remarkable progress in the field
+of natural language processing (NLP), demonstrating remarkable abilities in
+producing text that resembles human language for various tasks. This opens up
+new opportunities for employing them in recommender systems (RSs). In this
+paper, we specifically examine the sample efficiency of LLM-enhanced
+recommender systems, which pertains to the model's capacity to attain superior
+performance with a limited quantity of training data. Conventional
+recommendation models (CRMs) often need a large amount of training data because
+of the sparsity of features and interactions. Hence, we propose and verify our
+core viewpoint: Large Language Models Make Sample-Efficient Recommender
+Systems. We propose a simple yet effective framework (i.e., Laser) to validate
+the viewpoint from two aspects: (1) LLMs themselves are sample-efficient
+recommenders; and (2) LLMs, as feature generators and encoders, make CRMs more
+sample-efficient. Extensive experiments on two public datasets show that Laser
+requires only a small fraction of training samples to match or even surpass
+CRMs that are trained on the entire training set, demonstrating superior sample
+efficiency.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted by Frontier of Computer Science</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Description Boosting for Zero-Shot Entity and Relation Classification 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.02245v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.02245v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Gabriele Picco, Leopold Fuchs, Marcos Martínez Galindo, Alberto Purpura, Vanessa López, Hoang Thanh Lam
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Zero-shot entity and relation classification models leverage available
+external information of unseen classes -- e.g., textual descriptions -- to
+annotate input text data. Thanks to the minimum data requirement, Zero-Shot
+Learning (ZSL) methods have high value in practice, especially in applications
+where labeled data is scarce. Even though recent research in ZSL has
+demonstrated significant results, our analysis reveals that those methods are
+sensitive to provided textual descriptions of entities (or relations). Even a
+minor modification of descriptions can lead to a change in the decision
+boundary between entity (or relation) classes. In this paper, we formally
+define the problem of identifying effective descriptions for zero shot
+inference. We propose a strategy for generating variations of an initial
+description, a heuristic for ranking them and an ensemble method capable of
+boosting the predictions of zero-shot models through description enhancement.
+Empirical results on four different entity and relation classification datasets
+show that our proposed method outperform existing approaches and achieve new
+SOTA results on these datasets under the ZSL settings. The source code of the
+proposed solutions and the evaluation framework are open-sourced.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Pairwise Ranking Loss for Multi-Task Learning in Recommender Systems 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.02163v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.02163v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Furkan Durmus, Hasan Saribas, Said Aldemir, Yang Junyan, Hakan Cevikalp
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Multi-Task Learning (MTL) plays a crucial role in real-world advertising
+applications such as recommender systems, aiming to achieve robust
+representations while minimizing resource consumption. MTL endeavors to
+simultaneously optimize multiple tasks to construct a unified model serving
+diverse objectives. In online advertising systems, tasks like Click-Through
+Rate (CTR) and Conversion Rate (CVR) are often treated as MTL problems
+concurrently. However, it has been overlooked that a conversion ($y_{cvr}=1$)
+necessitates a preceding click ($y_{ctr}=1$). In other words, while certain CTR
+tasks are associated with corresponding conversions, others lack such
+associations. Moreover, the likelihood of noise is significantly higher in CTR
+tasks where conversions do not occur compared to those where they do, and
+existing methods lack the ability to differentiate between these two scenarios.
+In this study, exposure labels corresponding to conversions are regarded as
+definitive indicators, and a novel task-specific loss is introduced by
+calculating a \textbf{p}air\textbf{wise} \textbf{r}anking (PWiseR) loss between
+model predictions, manifesting as pairwise ranking loss, to encourage the model
+to rely more on them. To demonstrate the effect of the proposed loss function,
+experiments were conducted on different MTL and Single-Task Learning (STL)
+models using four distinct public MTL datasets, namely Alibaba FR, NL, US, and
+CCP, along with a proprietary industrial dataset. The results indicate that our
+proposed loss function outperforms the BCE loss function in most cases in terms
+of the AUC metric.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Robust Interaction-based Relevance Modeling for Online E-Commerce and
+  LLM-based Retrieval <span class="chip">ECML-PKDD'24</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.02135v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.02135v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Ben Chen, Huangyu Dai, Xiang Ma, Wen Jiang, Wei Ning
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Semantic relevance calculation is crucial for e-commerce search engines, as
+it ensures that the items selected closely align with customer intent.
+Inadequate attention to this aspect can detrimentally affect user experience
+and engagement. Traditional text-matching techniques are prevalent but often
+fail to capture the nuances of search intent accurately, so neural networks now
+have become a preferred solution to processing such complex text matching.
+Existing methods predominantly employ representation-based architectures, which
+strike a balance between high traffic capacity and low latency. However, they
+exhibit significant shortcomings in generalization and robustness when compared
+to interaction-based architectures. In this work, we introduce a robust
+interaction-based modeling paradigm to address these shortcomings. It
+encompasses 1) a dynamic length representation scheme for expedited inference,
+2) a professional terms recognition method to identify subjects and core
+attributes from complex sentence structures, and 3) a contrastive adversarial
+training protocol to bolster the model's robustness and matching capabilities.
+Extensive offline evaluations demonstrate the superior robustness and
+effectiveness of our approach, and online A/B testing confirms its ability to
+improve relevance in the same exposure position, resulting in more clicks and
+conversions. To the best of our knowledge, this method is the first
+interaction-based approach for large e-commerce search relevance calculation.
+Notably, we have deployed it for the entire search traffic on alibaba.com, the
+largest B2B e-commerce platform in the world.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted by ECML-PKDD'24 as Outstanding Paper. 8 pages, 2 figures, 7
+  tables</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Auto-Encoding or Auto-Regression? A Reality Check on Causality of
+  Self-Attention-Based Sequential Recommenders 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.02048v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.02048v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Yueqi Wang, Zhankui He, Zhenrui Yue, Julian McAuley, Dong Wang
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  The comparison between Auto-Encoding (AE) and Auto-Regression (AR) has become
+an increasingly important topic with recent advances in sequential
+recommendation. At the heart of this discussion lies the comparison of BERT4Rec
+and SASRec, which serve as representative AE and AR models for self-attentive
+sequential recommenders. Yet the conclusion of this debate remains uncertain
+due to: (1) the lack of fair and controlled environments for experiments and
+evaluations; and (2) the presence of numerous confounding factors w.r.t.
+feature selection, modeling choices and optimization algorithms. In this work,
+we aim to answer this question by conducting a series of controlled
+experiments. We start by tracing the AE/AR debate back to its origin through a
+systematic re-evaluation of SASRec and BERT4Rec, discovering that AR models
+generally surpass AE models in sequential recommendation. In addition, we find
+that AR models further outperforms AE models when using a customized design
+space that includes additional features, modeling approaches and optimization
+techniques. Furthermore, the performance advantage of AR models persists in the
+broader HuggingFace transformer ecosystems. Lastly, we provide potential
+explanations and insights into AE/AR performance from two key perspectives:
+low-rank approximation and inductive bias. We make our code and data available
+at https://github.com/yueqirex/ModSAR
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ ProGEO: Generating <span class="highlight-title">Prompt</span>s through Image-Text Contrastive Learning for
+  Visual Geo-localization 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.01906v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.01906v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Chen Mao, Jingqi Hu
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Visual Geo-localization (VG) refers to the process to identify the location
+described in query images, which is widely applied in robotics field and
+computer vision tasks, such as autonomous driving, metaverse, augmented
+reality, and SLAM. In fine-grained images lacking specific text descriptions,
+directly applying pure visual methods to represent neighborhood features often
+leads to the model focusing on overly fine-grained features, unable to fully
+mine the semantic information in the images. Therefore, we propose a two-stage
+training method to enhance visual performance and use contrastive learning to
+mine challenging samples. We first leverage the multi-modal description
+capability of CLIP (Contrastive Language-Image Pretraining) to create a set of
+learnable text prompts for each geographic image feature to form vague
+descriptions. Then, by utilizing dynamic text prompts to assist the training of
+the image encoder, we enable the image encoder to learn better and more
+generalizable visual features. This strategy of applying text to purely visual
+tasks addresses the challenge of using multi-modal models for geographic
+images, which often suffer from a lack of precise descriptions, making them
+difficult to utilize widely. We validate the effectiveness of the proposed
+strategy on several large-scale visual geo-localization datasets, and our
+method achieves competitive results on multiple visual geo-localization
+datasets. Our code and model are available at
+https://github.com/Chain-Mao/ProGEO.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ GRAM: Generative Retrieval Augmented Matching of Data Schemas in the
+  Context of Data Security <span class="chip">KDD 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.01876v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.01876v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Xuanqing Liu, Luyang Kong, Runhui Wang, Patrick Song, Austin Nevins, Henrik Johnson, Nimish Amlathe, Davor Golac
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Schema matching constitutes a pivotal phase in the data ingestion process for
+contemporary database systems. Its objective is to discern pairwise
+similarities between two sets of attributes, each associated with a distinct
+data table. This challenge emerges at the initial stages of data analytics,
+such as when incorporating a third-party table into existing databases to
+inform business insights. Given its significance in the realm of database
+systems, schema matching has been under investigation since the 2000s. This
+study revisits this foundational problem within the context of large language
+models. Adhering to increasingly stringent data security policies, our focus
+lies on the zero-shot and few-shot scenarios: the model should analyze only a
+minimal amount of customer data to execute the matching task, contrasting with
+the conventional approach of scrutinizing the entire data table. We emphasize
+that the zero-shot or few-shot assumption is imperative to safeguard the
+identity and privacy of customer data, even at the potential cost of accuracy.
+The capability to accurately match attributes under such stringent requirements
+distinguishes our work from previous literature in this domain.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>KDD 2024 Camera Ready; 11 pages, 8 figures</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Data-efficient Fine-tuning for LLM-based Recommendation <span class="chip">SIGIR 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2401.17197v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2401.17197v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Xinyu Lin, Wenjie Wang, Yongqi Li, Shuo Yang, Fuli Feng, Yinwei Wei, Tat-Seng Chua
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Leveraging Large Language Models (LLMs) for recommendation has recently
+garnered considerable attention, where fine-tuning plays a key role in LLMs'
+adaptation. However, the cost of fine-tuning LLMs on rapidly expanding
+recommendation data limits their practical application. To address this
+challenge, few-shot fine-tuning offers a promising approach to quickly adapt
+LLMs to new recommendation data. We propose the task of data pruning for
+efficient LLM-based recommendation, aimed at identifying representative samples
+tailored for LLMs' few-shot fine-tuning. While coreset selection is closely
+related to the proposed task, existing coreset selection methods often rely on
+suboptimal heuristic metrics or entail costly optimization on large-scale
+recommendation data.
+  To tackle these issues, we introduce two objectives for the data pruning task
+in the context of LLM-based recommendation: 1) high accuracy aims to identify
+the influential samples that can lead to high overall performance; and 2) high
+efficiency underlines the low costs of the data pruning process. To pursue the
+two objectives, we propose a novel data pruning method based on two scores,
+i.e., influence score and effort score, to efficiently identify the influential
+samples. Particularly, the influence score is introduced to accurately estimate
+the influence of sample removal on the overall performance. To achieve low
+costs of the data pruning process, we use a small-sized surrogate model to
+replace LLMs to obtain the influence score. Considering the potential gap
+between the surrogate model and LLMs, we further propose an effort score to
+prioritize some hard samples specifically for LLMs. Empirical results on three
+real-world datasets validate the effectiveness of our proposed method. In
+particular, the proposed method uses only 2% samples to surpass the full data
+fine-tuning, reducing time costs by 97%.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted by SIGIR 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Approximate Nearest Neighbor Search with Window Filters 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2402.00943v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2402.00943v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Joshua Engels, Benjamin Landrum, Shangdi Yu, Laxman Dhulipala, Julian Shun
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  We define and investigate the problem of $\textit{c-approximate window
+search}$: approximate nearest neighbor search where each point in the dataset
+has a numeric label, and the goal is to find nearest neighbors to queries
+within arbitrary label ranges. Many semantic search problems, such as image and
+document search with timestamp filters, or product search with cost filters,
+are natural examples of this problem. We propose and theoretically analyze a
+modular tree-based framework for transforming an index that solves the
+traditional c-approximate nearest neighbor problem into a data structure that
+solves window search. On standard nearest neighbor benchmark datasets equipped
+with random label values, adversarially constructed embeddings, and image
+search embeddings with real timestamps, we obtain up to a $75\times$ speedup
+over existing solutions at the same level of recall.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Code available: https://github.com/JoshEngels/RangeFilteredANN</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Evaluating Chat<span class="highlight-title">GPT</span> as a Recommender System: A Rigorous Approach 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2309.03613v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2309.03613v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Dario Di Palma, Giovanni Maria Biancofiore, Vito Walter Anelli, Fedelucio Narducci, Tommaso Di Noia, Eugenio Di Sciascio
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Large Language Models (LLMs) have recently shown impressive abilities in
+handling various natural language-related tasks. Among different LLMs, current
+studies have assessed ChatGPT's superior performance across manifold tasks,
+especially under the zero/few-shot prompting conditions. Given such successes,
+the Recommender Systems (RSs) research community have started investigating its
+potential applications within the recommendation scenario. However, although
+various methods have been proposed to integrate ChatGPT's capabilities into
+RSs, current research struggles to comprehensively evaluate such models while
+considering the peculiarities of generative models. Often, evaluations do not
+consider hallucinations, duplications, and out-of-the-closed domain
+recommendations and solely focus on accuracy metrics, neglecting the impact on
+beyond-accuracy facets. To bridge this gap, we propose a robust evaluation
+pipeline to assess ChatGPT's ability as an RS and post-process ChatGPT
+recommendations to account for these aspects. Through this pipeline, we
+investigate ChatGPT-3.5 and ChatGPT-4 performance in the recommendation task
+under the zero-shot condition employing the role-playing prompt. We analyze the
+model's functionality in three settings: the Top-N Recommendation, the
+cold-start recommendation, and the re-ranking of a list of recommendations, and
+in three domains: movies, music, and books. The experiments reveal that ChatGPT
+exhibits higher accuracy than the baselines on books domain. It also excels in
+re-ranking and cold-start scenarios while maintaining reasonable
+beyond-accuracy metrics. Furthermore, we measure the similarity between the
+ChatGPT recommendations and the other recommenders, providing insights about
+how ChatGPT could be categorized in the realm of recommender systems. The
+evaluation pipeline is publicly released for future research.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Generating Query Recommendations via LLMs 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.19749v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.19749v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Andrea Bacciu, Enrico Palumbo, Andreas Damianou, Nicola Tonellotto, Fabrizio Silvestri
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Query recommendation systems are ubiquitous in modern search engines,
+assisting users in producing effective queries to meet their information needs.
+However, these systems require a large amount of data to produce good
+recommendations, such as a large collection of documents to index and query
+logs. In particular, query logs and user data are not available in cold start
+scenarios. Query logs are expensive to collect and maintain and require complex
+and time-consuming cascading pipelines for creating, combining, and ranking
+recommendations. To address these issues, we frame the query recommendation
+problem as a generative task, proposing a novel approach called Generative
+Query Recommendation (GQR). GQR uses an LLM as its foundation and does not
+require to be trained or fine-tuned to tackle the query recommendation problem.
+We design a prompt that enables the LLM to understand the specific
+recommendation task, even using a single example. We then improved our system
+by proposing a version that exploits query logs called Retriever-Augmented GQR
+(RA-GQR). RA-GQr dynamically composes its prompt by retrieving similar queries
+from query logs. GQR approaches reuses a pre-existing neural architecture
+resulting in a simpler and more ready-to-market approach, even in a cold start
+scenario. Our proposed GQR obtains state-of-the-art performance in terms of
+NDCG@10 and clarity score against two commercial search engines and the
+previous state-of-the-art approach on the Robust04 and ClueWeb09B collections,
+improving on average the NDCG@10 performance up to ~4% on Robust04 and
+ClueWeb09B w.r.t the previous best competitor. RA-GQR further improve the
+NDCG@10 obtaining an increase of ~11%, ~6\% on Robust04 and ClueWeb09B w.r.t
+the best competitor. Furthermore, our system obtained ~59% of user preferences
+in a blind user study, proving that our method produces the most engaging
+queries.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Generating Query Recommendations via LLMs</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ DisCo: Towards Harmonious Disentanglement and Collaboration between
+  Tabular and Semantic Space for Recommendation 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.00011v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.00011v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Kounianhua Du, Jizheng Chen, Jianghao Lin, Yunjia Xi, Hangyu Wang, Xinyi Dai, Bo Chen, Ruiming Tang, Weinan Zhang
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Recommender systems play important roles in various applications such as
+e-commerce, social media, etc. Conventional recommendation methods usually
+model the collaborative signals within the tabular representation space.
+Despite the personalization modeling and the efficiency, the latent semantic
+dependencies are omitted. Methods that introduce semantics into recommendation
+then emerge, injecting knowledge from the semantic representation space where
+the general language understanding are compressed. However, existing
+semantic-enhanced recommendation methods focus on aligning the two spaces,
+during which the representations of the two spaces tend to get close while the
+unique patterns are discarded and not well explored. In this paper, we propose
+DisCo to Disentangle the unique patterns from the two representation spaces and
+Collaborate the two spaces for recommendation enhancement, where both the
+specificity and the consistency of the two spaces are captured. Concretely, we
+propose 1) a dual-side attentive network to capture the intra-domain patterns
+and the inter-domain patterns, 2) a sufficiency constraint to preserve the
+task-relevant information of each representation space and filter out the
+noise, and 3) a disentanglement constraint to avoid the model from discarding
+the unique information. These modules strike a balance between disentanglement
+and collaboration of the two representation spaces to produce informative
+pattern vectors, which could serve as extra features and be appended to
+arbitrary recommendation backbones for enhancement. Experiment results validate
+the superiority of our method against different models and the compatibility of
+DisCo over different backbones. Various ablation studies and efficiency
+analysis are also conducted to justify each model component.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ FourierKAN-GCF: Fourier Kolmogorov-Arnold Network -- An Effective and
+  Efficient Feature Transformation for Graph Collaborative Filtering 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.01034v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.01034v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Jinfeng Xu, Zheyu Chen, Jinze Li, Shuo Yang, Wei Wang, Xiping Hu, Edith C. -H. Ngai
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Graph Collaborative Filtering (GCF) has achieved state-of-the-art performance
+for recommendation tasks. However, most GCF structures simplify the feature
+transformation and nonlinear operation during message passing in the graph
+convolution network (GCN). We revisit these two components and discover that a
+part of feature transformation and nonlinear operation during message passing
+in GCN can improve the representation of GCF, but increase the difficulty of
+training.
+  In this work, we propose a simple and effective graph-based recommendation
+model called FourierKAN-GCF. Specifically, it utilizes a novel Fourier
+Kolmogorov-Arnold Network (KAN) to replace the multilayer perceptron (MLP) as a
+part of the feature transformation during message passing in GCN, which
+improves the representation power of GCF and is easy to train. We further
+employ message dropout and node dropout strategies to improve the
+representation power and robustness of the model. Extensive experiments on two
+public datasets demonstrate the superiority of FourierKAN-GCF over most
+state-of-the-art methods. The implementation code is available at
+https://github.com/Jinfeng-Xu/FKAN-GCF.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ ARL2: Aligning Retrievers for Black-box Large Language Models via
+  Self-guided Adaptive Relevance Labeling <span class="chip">ACL 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2402.13542v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2402.13542v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Lingxi Zhang, Yue Yu, Kuan Wang, Chao Zhang
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Retrieval-augmented generation enhances large language models (LLMs) by
+incorporating relevant information from external knowledge sources. This
+enables LLMs to adapt to specific domains and mitigate hallucinations in
+knowledge-intensive tasks. However, existing retrievers are often misaligned
+with LLMs due to their separate training processes and the black-box nature of
+LLMs. To address this challenge, we propose ARL2, a retriever learning
+technique that harnesses LLMs as labelers. ARL2 leverages LLMs to annotate and
+score relevant evidence, enabling learning the retriever from robust LLM
+supervision. Furthermore, ARL2 uses an adaptive self-training strategy for
+curating high-quality and diverse relevance data, which can effectively reduce
+the annotation cost. Extensive experiments demonstrate the effectiveness of
+ARL2, achieving accuracy improvements of 5.4% on NQ and 4.6% on MMLU compared
+to the state-of-the-art methods. Additionally, ARL2 exhibits robust transfer
+learning capabilities and strong zero-shot generalization abilities. Our code
+will be published at \url{https://github.com/zhanglingxi-cs/ARL2}.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>ACL 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ RAM-EHR: Retrieval Augmentation Meets Clinical Predictions on Electronic
+  Health Records <span class="chip">ACL 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2403.00815v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2403.00815v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Ran Xu, Wenqi Shi, Yue Yu, Yuchen Zhuang, Bowen Jin, May D. Wang, Joyce C. Ho, Carl Yang
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  We present RAM-EHR, a Retrieval AugMentation pipeline to improve clinical
+predictions on Electronic Health Records (EHRs). RAM-EHR first collects
+multiple knowledge sources, converts them into text format, and uses dense
+retrieval to obtain information related to medical concepts. This strategy
+addresses the difficulties associated with complex names for the concepts.
+RAM-EHR then augments the local EHR predictive model co-trained with
+consistency regularization to capture complementary information from patient
+visits and summarized knowledge. Experiments on two EHR datasets show the
+efficacy of RAM-EHR over previous knowledge-enhanced baselines (3.4% gain in
+AUROC and 7.2% gain in AUPR), emphasizing the effectiveness of the summarized
+knowledge from RAM-EHR for clinical prediction tasks. The code will be
+published at \url{https://github.com/ritaranx/RAM-EHR}.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>ACL 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ C-RAG: Certified Generation Risks for Retrieval-Augmented Language
+  Models <span class="chip">ICML 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2402.03181v4">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2402.03181v4.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Mintong Kang, Nezihe Merve Gürel, Ning Yu, Dawn Song, Bo Li
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Despite the impressive capabilities of large language models (LLMs) across
+diverse applications, they still suffer from trustworthiness issues, such as
+hallucinations and misalignments. Retrieval-augmented language models (RAG)
+have been proposed to enhance the credibility of generations by grounding
+external knowledge, but the theoretical understandings of their generation
+risks remains unexplored. In this paper, we answer: 1) whether RAG can indeed
+lead to low generation risks, 2) how to provide provable guarantees on the
+generation risks of RAG and vanilla LLMs, and 3) what sufficient conditions
+enable RAG models to reduce generation risks. We propose C-RAG, the first
+framework to certify generation risks for RAG models. Specifically, we provide
+conformal risk analysis for RAG models and certify an upper confidence bound of
+generation risks, which we refer to as conformal generation risk. We also
+provide theoretical guarantees on conformal generation risks for general
+bounded risk functions under test distribution shifts. We prove that RAG
+achieves a lower conformal generation risk than that of a single LLM when the
+quality of the retrieval model and transformer is non-trivial. Our intensive
+empirical results demonstrate the soundness and tightness of our conformal
+generation risk guarantees across four widely-used NLP datasets on four
+state-of-the-art retrieval models.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted to ICML 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ A <span class="highlight-title">Survey</span> of Generative Information Retrieval 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.01197v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.01197v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Tzu-Lin Kuo, Tzu-Wei Chiu, Tzung-Sheng Lin, Sheng-Yang Wu, Chao-Wei Huang, Yun-Nung Chen
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Generative Retrieval (GR) is an emerging paradigm in information retrieval
+that leverages generative models to directly map queries to relevant document
+identifiers (DocIDs) without the need for traditional query processing or
+document reranking. This survey provides a comprehensive overview of GR,
+highlighting key developments, indexing and retrieval strategies, and
+challenges. We discuss various document identifier strategies, including
+numerical and string-based identifiers, and explore different document
+representation methods. Our primary contribution lies in outlining future
+research directions that could profoundly impact the field: improving the
+quality of query generation, exploring learnable document identifiers,
+enhancing scalability, and integrating GR with multi-task learning frameworks.
+By examining state-of-the-art GR techniques and their applications, this survey
+aims to provide a foundational understanding of GR and inspire further
+innovations in this transformative approach to information retrieval. We also
+make the complementary materials such as paper collection publicly available at
+https://github.com/MiuLab/GenIR-Survey/
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Navigating the Future of Federated Recommendation Systems with
+  Foundation Models 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.00004v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.00004v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Zhiwei Li, Guodong Long
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  In recent years, the integration of federated learning (FL) and
+recommendation systems (RS), known as Federated Recommendation Systems (FRS),
+has attracted attention for preserving user privacy by keeping private data on
+client devices. However, FRS faces inherent limitations such as data
+heterogeneity and scarcity, due to the privacy requirements of FL and the
+typical data sparsity issues of RSs. Models like ChatGPT are empowered by the
+concept of transfer learning and self-supervised learning, so they can be
+easily applied to the downstream tasks after fine-tuning or prompting. These
+models, so-called Foundation Models (FM), fouce on understanding the human's
+intent and perform following their designed roles in the specific tasks, which
+are widely recognized for producing high-quality content in the image and
+language domains. Thus, the achievements of FMs inspire the design of FRS and
+suggest a promising research direction: integrating foundation models to
+address the above limitations. In this study, we conduct a comprehensive review
+of FRSs with FMs. Specifically, we: 1) summarise the common approaches of
+current FRSs and FMs; 2) review the challenges posed by FRSs and FMs; 3)
+discuss potential future research directions; and 4) introduce some common
+benchmarks and evaluation metrics in the FRS field. We hope that this position
+paper provides the necessary background and guidance to explore this
+interesting and emerging topic.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>20 pages, position paper</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Adaptive Convolutional Forecasting Network Based on Time Series
+  Feature-Driven 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.12038v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.12038v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Dandan Zhang, Zhiqiang Zhang, Nanguang Chen, Yun Wang
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Time series data in real-world scenarios contain a substantial amount of
+nonlinear information, which significantly interferes with the training process
+of models, leading to decreased prediction performance. Therefore, during the
+time series forecasting process, extracting the local and global time series
+patterns and understanding the potential nonlinear features among different
+time observations are highly significant. To address this challenge, we
+introduce multi-resolution convolution and deformable convolution operations.
+By enlarging the receptive field using convolution kernels with different
+dilation factors to capture temporal correlation information at different
+resolutions, and adaptively adjusting the sampling positions through additional
+offset vectors, we enhance the network's ability to capture potential nonlinear
+features among time observations. Building upon this, we propose ACNet, an
+adaptive convolutional network designed to effectively model the local and
+global temporal dependencies and the nonlinear features between observations in
+multivariate time series. Specifically, by extracting and fusing time series
+features at different resolutions, we capture both local contextual information
+and global patterns in the time series. The designed nonlinear feature adaptive
+extraction module captures the nonlinear features among different time
+observations in the time series. We evaluated the performance of ACNet across
+twelve real-world datasets. The results indicate that ACNet consistently
+achieves state-of-the-art performance in both short-term and long-term
+forecasting tasks with favorable runtime efficiency.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Generalizing Conversational Dense Retrieval via LLM-Cognition Data
+  Augmentation <span class="chip">ACL 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2402.07092v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2402.07092v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Haonan Chen, Zhicheng Dou, Kelong Mao, Jiongnan Liu, Ziliang Zhao
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Conversational search utilizes muli-turn natural language contexts to
+retrieve relevant passages. Existing conversational dense retrieval models
+mostly view a conversation as a fixed sequence of questions and responses,
+overlooking the severe data sparsity problem -- that is, users can perform a
+conversation in various ways, and these alternate conversations are unrecorded.
+Consequently, they often struggle to generalize to diverse conversations in
+real-world scenarios. In this work, we propose a framework for generalizing
+Conversational dense retrieval via LLM-cognition data Augmentation (ConvAug).
+ConvAug first generates multi-level augmented conversations to capture the
+diverse nature of conversational contexts. Inspired by human cognition, we
+devise a cognition-aware process to mitigate the generation of false positives,
+false negatives, and hallucinations. Moreover, we develop a difficulty-adaptive
+sample filter that selects challenging samples for complex conversations,
+thereby giving the model a larger learning space. A contrastive learning
+objective is then employed to train a better conversational context encoder.
+Extensive experiments conducted on four public datasets, under both normal and
+zero-shot settings, demonstrate the effectiveness, generalizability, and
+applicability of ConvAug. The code is released at
+https://github.com/haon-chen/ConvAug.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>ACL 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Its Not a Modality Gap: Characterizing and Addressing the Contrastive
+  Gap 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.18570v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.18570v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Abrar Fahim, Alex Murphy, Alona Fyshe
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Multi-modal contrastive models such as CLIP achieve state-of-the-art
+performance in zero-shot classification by embedding input images and texts on
+a joint representational space. Recently, a modality gap has been reported in
+two-encoder contrastive models like CLIP, meaning that the image and text
+embeddings reside in disjoint areas of the latent space. Previous studies
+suggest that this gap exists due to 1) the cone effect, 2) mismatched pairs in
+the dataset, and 3) insufficient training. We show that, even when accounting
+for all these factors, and even when using the same modality, the contrastive
+loss actually creates a gap during training. As a result, We propose that the
+modality gap is inherent to the two-encoder contrastive loss and rename it the
+contrastive gap. We present evidence that attributes this contrastive gap to
+low uniformity in CLIP space, resulting in embeddings that occupy only a small
+portion of the latent space. To close the gap, we adapt the uniformity and
+alignment properties of unimodal contrastive loss to the multi-modal setting
+and show that simply adding these terms to the CLIP loss distributes the
+embeddings more uniformly in the representational space, closing the gap. In
+our experiments, we show that the modified representational space achieves
+better performance than default CLIP loss in downstream tasks such as zero-shot
+image classification and multi-modal arithmetic.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                    </div>
+                </details>
+            </article>
+            <article>
+                <details>
+                    <Summary>
+                        Machine Learning <span class="chip" style="font-size: 60%">150</span>
+                    </Summary>
+                    <div class="details-content">
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Learning to grok: Emergence of in-context learning and skill composition
+  in modular arithmetic tasks 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.02550v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.02550v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Tianyu He, Darshil Doshi, Aritra Das, Andrey Gromov
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Large language models can solve tasks that were not present in the training
+set. This capability is believed to be due to in-context learning and skill
+composition. In this work, we study the emergence of in-context learning and
+skill composition in a collection of modular arithmetic tasks. Specifically, we
+consider a finite collection of linear modular functions $z = a \, x + b \, y
+\;\mathrm{mod}\; p$ labeled by the vector $(a, b) \in \mathbb{Z}_p^2$. We use
+some of these tasks for pre-training and the rest for out-of-distribution
+testing. We empirically show that a GPT-style transformer exhibits a transition
+from in-distribution to out-of-distribution generalization as the number of
+pre-training tasks increases. We find that the smallest model capable of
+out-of-distribution generalization requires two transformer blocks, while for
+deeper models, the out-of-distribution generalization phase is
+\emph{transient}, necessitating early stopping. Finally, we perform an
+interpretability study of the pre-trained models, revealing the highly
+structured representations in both phases; and discuss the learnt algorithm.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>21 pages, 19 figures</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Robust and highly scalable estimation of directional couplings from
+  time-shifted signals 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.02545v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.02545v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Luca Ambrogioni, Louis Rouillard, Demian Wassermann
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  The estimation of directed couplings between the nodes of a network from
+indirect measurements is a central methodological challenge in scientific
+fields such as neuroscience, systems biology and economics. Unfortunately, the
+problem is generally ill-posed due to the possible presence of unknown delays
+in the measurements. In this paper, we offer a solution of this problem by
+using a variational Bayes framework, where the uncertainty over the delays is
+marginalized in order to obtain conservative coupling estimates. To overcome
+the well-known overconfidence of classical variational methods, we use a
+hybrid-VI scheme where the (possibly flat or multimodal) posterior over the
+measurement parameters is estimated using a forward KL loss while the (nearly
+convex) conditional posterior over the couplings is estimated using the highly
+scalable gradient-based VI. In our ground-truth experiments, we show that the
+network provides reliable and conservative estimates of the couplings, greatly
+outperforming similar methods such as regression DCM.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ To Believe or Not to Believe Your LLM 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.02543v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.02543v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Yasin Abbasi Yadkori, Ilja Kuzborskij, András György, Csaba Szepesvári
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  We explore uncertainty quantification in large language models (LLMs), with
+the goal to identify when uncertainty in responses given a query is large. We
+simultaneously consider both epistemic and aleatoric uncertainties, where the
+former comes from the lack of knowledge about the ground truth (such as about
+facts or the language), and the latter comes from irreducible randomness (such
+as multiple possible answers). In particular, we derive an
+information-theoretic metric that allows to reliably detect when only epistemic
+uncertainty is large, in which case the output of the model is unreliable. This
+condition can be computed based solely on the output of the model obtained
+simply by some special iterative prompting based on the previous responses.
+Such quantification, for instance, allows to detect hallucinations (cases when
+epistemic uncertainty is high) in both single- and multi-answer responses. This
+is in contrast to many standard uncertainty quantification strategies (such as
+thresholding the log-likelihood of a response) where hallucinations in the
+multi-answer case cannot be detected. We conduct a series of experiments which
+demonstrate the advantage of our formulation. Further, our investigations shed
+some light on how the probabilities assigned to a given output by an LLM can be
+amplified by iterative prompting, which might be of independent interest.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Loki: Low-Rank Keys for Efficient Sparse Attention 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.02542v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.02542v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Prajwal Singhania, Siddharth Singh, Shwai He, Soheil Feizi, Abhinav Bhatele
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Inference on large language models can be expensive in terms of the compute
+and memory costs involved, especially when long sequence lengths are used. In
+particular, the self-attention mechanism used in such models contributes
+significantly to these costs, which has resulted in several recent works that
+propose sparse attention approximations for inference. In this work, we propose
+to approximate the self-attention computation by focusing on the dimensionality
+of key vectors computed in the attention block. Our analysis reveals that the
+key vectors lie in a significantly lower-dimensional space, consistently across
+several datasets and models. Exploiting this observation, we propose Loki, a
+novel sparse attention method that ranks and selects tokens in the KV-cache
+based on attention scores computed in low-dimensional space. Our evaluations
+show that Loki is able to maintain the efficacy of the models better than other
+popular approximation methods, while speeding up the attention computation due
+to reduced data movement (load/store) and compute costs.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Parrot: Multilingual Visual Instruction Tuning 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.02539v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.02539v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Hai-Long Sun, Da-Wei Zhou, Yang Li, Shiyin Lu, Chao Yi, Qing-Guo Chen, Zhao Xu, Weihua Luo, Kaifu Zhang, De-Chuan Zhan, Han-Jia Ye
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  The rapid development of Multimodal Large Language Models (MLLMs) like GPT-4V
+has marked a significant step towards artificial general intelligence. Existing
+methods mainly focus on aligning vision encoders with LLMs through supervised
+fine-tuning (SFT) to endow LLMs with multimodal abilities, making MLLMs'
+inherent ability to react to multiple languages progressively deteriorate as
+the training process evolves. We empirically find that the imbalanced SFT
+datasets, primarily composed of English-centric image-text pairs, lead to
+significantly reduced performance in non-English languages. This is due to the
+failure of aligning the vision encoder and LLM with multilingual tokens during
+the SFT process. In this paper, we introduce Parrot, a novel method that
+utilizes textual guidance to drive visual token alignment at the language
+level. Parrot makes the visual tokens condition on diverse language inputs and
+uses Mixture-of-Experts (MoE) to promote the alignment of multilingual tokens.
+Specifically, to enhance non-English visual tokens alignment, we compute the
+cross-attention using the initial visual features and textual embeddings, the
+result of which is then fed into the MoE router to select the most relevant
+experts. The selected experts subsequently convert the initial visual tokens
+into language-specific visual tokens. Moreover, considering the current lack of
+benchmarks for evaluating multilingual capabilities within the field, we
+collect and make available a Massive Multilingual Multimodal Benchmark which
+includes 6 languages, 15 categories, and 12,000 questions, named as MMMB. Our
+method not only demonstrates state-of-the-art performance on multilingual
+MMBench and MMMB, but also excels across a broad range of multimodal tasks.
+Both the source code and the training dataset of Parrot will be made publicly
+available.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ TopViewRS: Vision-Language Models as Top-View Spatial Reasoners 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.02537v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.02537v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Chengzu Li, Caiqi Zhang, Han Zhou, Nigel Collier, Anna Korhonen, Ivan Vulić
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Top-view perspective denotes a typical way in which humans read and reason
+over different types of maps, and it is vital for localization and navigation
+of humans as well as of `non-human' agents, such as the ones backed by large
+Vision-Language Models (VLMs). Nonetheless, spatial reasoning capabilities of
+modern VLMs remain unattested and underexplored. In this work, we thus study
+their capability to understand and reason over spatial relations from the top
+view. The focus on top view also enables controlled evaluations at different
+granularity of spatial reasoning; we clearly disentangle different abilities
+(e.g., recognizing particular objects versus understanding their relative
+positions). We introduce the TopViewRS (Top-View Reasoning in Space) dataset,
+consisting of 11,384 multiple-choice questions with either realistic or
+semantic top-view map as visual input. We then use it to study and evaluate
+VLMs across 4 perception and reasoning tasks with different levels of
+complexity. Evaluation of 10 representative open- and closed-source VLMs
+reveals the gap of more than 50% compared to average human performance, and it
+is even lower than the random baseline in some cases. Although additional
+experiments show that Chain-of-Thought reasoning can boost model capabilities
+by 5.82% on average, the overall performance of VLMs remains limited. Our
+findings underscore the critical need for enhanced model capability in top-view
+spatial reasoning and set a foundation for further research towards human-level
+proficiency of VLMs in real-world multimodal tasks.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>9 pages, 3 figures, 3 tables (21 pages, 4 figures, 15 tables
+  including references and appendices)</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Mitigate Position Bias in Large Language Models via Scaling a Single
+  Dimension 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.02536v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.02536v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Yijiong Yu, Huiqiang Jiang, Xufang Luo, Qianhui Wu, Chin-Yew Lin, Dongsheng Li, Yuqing Yang, Yongfeng Huang, Lili Qiu
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Large Language Models (LLMs) are increasingly applied in various real-world
+scenarios due to their excellent generalization capabilities and robust
+generative abilities. However, they exhibit position bias, also known as "lost
+in the middle", a phenomenon that is especially pronounced in long-context
+scenarios, which indicates the placement of the key information in different
+positions of a prompt can significantly affect accuracy. This paper first
+explores the micro-level manifestations of position bias, concluding that
+attention weights are a micro-level expression of position bias. It further
+identifies that, in addition to position embeddings, causal attention mask also
+contributes to position bias by creating position-specific hidden states. Based
+on these insights, we propose a method to mitigate position bias by scaling
+this positional hidden states. Experiments on the NaturalQuestions
+Multi-document QA, KV retrieval, LongBench and timeline reorder tasks, using
+various models including RoPE models, context windowextended models, and Alibi
+models, demonstrate the effectiveness and generalizability of our approach. Our
+method can improve performance by up to 15.2% by modifying just one dimension
+of hidden states. Our code is available at https://aka.ms/PositionalHidden.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Enhancing predictive imaging biomarker discovery through treatment
+  effect analysis 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.02534v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.02534v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Shuhan Xiao, Lukas Klein, Jens Petersen, Philipp Vollmuth, Paul F. Jaeger, Klaus H. Maier-Hein
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Identifying predictive biomarkers, which forecast individual treatment
+effectiveness, is crucial for personalized medicine and informs decision-making
+across diverse disciplines. These biomarkers are extracted from pre-treatment
+data, often within randomized controlled trials, and have to be distinguished
+from prognostic biomarkers, which are independent of treatment assignment. Our
+study focuses on the discovery of predictive imaging biomarkers, aiming to
+leverage pre-treatment images to unveil new causal relationships. Previous
+approaches relied on labor-intensive handcrafted or manually derived features,
+which may introduce biases. In response, we present a new task of discovering
+predictive imaging biomarkers directly from the pre-treatment images to learn
+relevant image features. We propose an evaluation protocol for this task to
+assess a model's ability to identify predictive imaging biomarkers and
+differentiate them from prognostic ones. It employs statistical testing and a
+comprehensive analysis of image feature attribution. We explore the suitability
+of deep learning models originally designed for estimating the conditional
+average treatment effect (CATE) for this task, which previously have been
+primarily assessed for the precision of CATE estimation, overlooking the
+evaluation of imaging biomarker discovery. Our proof-of-concept analysis
+demonstrates promising results in discovering and validating predictive imaging
+biomarkers from synthetic outcomes and real-world image datasets.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>19 pages, 12 figures</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ ReLUs Are Sufficient for Learning Implicit Neural Representations <span class="chip">ICML 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.02529v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.02529v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Joseph Shenouda, Yamin Zhou, Robert D. Nowak
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Motivated by the growing theoretical understanding of neural networks that
+employ the Rectified Linear Unit (ReLU) as their activation function, we
+revisit the use of ReLU activation functions for learning implicit neural
+representations (INRs). Inspired by second order B-spline wavelets, we
+incorporate a set of simple constraints to the ReLU neurons in each layer of a
+deep neural network (DNN) to remedy the spectral bias. This in turn enables its
+use for various INR tasks. Empirically, we demonstrate that, contrary to
+popular belief, one can learn state-of-the-art INRs based on a DNN composed of
+only ReLU neurons. Next, by leveraging recent theoretical works which
+characterize the kinds of functions ReLU neural networks learn, we provide a
+way to quantify the regularity of the learned function. This offers a
+principled approach to selecting the hyperparameters in INR architectures. We
+substantiate our claims through experiments in signal representation, super
+resolution, and computed tomography, demonstrating the versatility and
+effectiveness of our method. The code for all experiments can be found at
+https://github.com/joeshenouda/relu-inrs.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted to ICML 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ RoboCasa: Large-Scale Simulation of Everyday Tasks for Generalist Robots 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.02523v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.02523v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Soroush Nasiriany, Abhiram Maddukuri, Lance Zhang, Adeet Parikh, Aaron Lo, Abhishek Joshi, Ajay Mandlekar, Yuke Zhu
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Recent advancements in Artificial Intelligence (AI) have largely been
+propelled by scaling. In Robotics, scaling is hindered by the lack of access to
+massive robot datasets. We advocate using realistic physical simulation as a
+means to scale environments, tasks, and datasets for robot learning methods. We
+present RoboCasa, a large-scale simulation framework for training generalist
+robots in everyday environments. RoboCasa features realistic and diverse scenes
+focusing on kitchen environments. We provide thousands of 3D assets across over
+150 object categories and dozens of interactable furniture and appliances. We
+enrich the realism and diversity of our simulation with generative AI tools,
+such as object assets from text-to-3D models and environment textures from
+text-to-image models. We design a set of 100 tasks for systematic evaluation,
+including composite tasks generated by the guidance of large language models.
+To facilitate learning, we provide high-quality human demonstrations and
+integrate automated trajectory generation methods to substantially enlarge our
+datasets with minimal human burden. Our experiments show a clear scaling trend
+in using synthetically generated robot data for large-scale imitation learning
+and show great promise in harnessing simulation data in real-world tasks.
+Videos and open-source code are available at https://robocasa.ai/
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>RSS 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Uncertainty of Joint Neural Contextual Bandit 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.02515v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.02515v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Hongbo Guo, Zheqing Zhu
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Contextual bandit learning is increasingly favored in modern large-scale
+recommendation systems. To better utlize the contextual information and
+available user or item features, the integration of neural networks have been
+introduced to enhance contextual bandit learning and has triggered significant
+interest from both academia and industry. However, a major challenge arises
+when implementing a disjoint neural contextual bandit solution in large-scale
+recommendation systems, where each item or user may correspond to a separate
+bandit arm. The huge number of items to recommend poses a significant hurdle
+for real world production deployment. This paper focuses on a joint neural
+contextual bandit solution which serves all recommending items in one single
+model. The output consists of a predicted reward $\mu$, an uncertainty $\sigma$
+and a hyper-parameter $\alpha$ which balances exploitation and exploration,
+e.g., $\mu + \alpha \sigma$.
+  The tuning of the parameter $\alpha$ is typically heuristic and complex in
+practice due to its stochastic nature. To address this challenge, we provide
+both theoretical analysis and experimental findings regarding the uncertainty
+$\sigma$ of the joint neural contextual bandit model. Our analysis reveals that
+$\alpha$ demonstrates an approximate square root relationship with the size of
+the last hidden layer $F$ and inverse square root relationship with the amount
+of training data $N$, i.e., $\sigma \propto \sqrt{\frac{F}{N}}$. The
+experiments, conducted with real industrial data, align with the theoretical
+analysis, help understanding model behaviors and assist the hyper-parameter
+tuning during both offline training and online deployment.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Fairness-Optimized Synthetic EHR Generation for Arbitrary Downstream
+  Predictive Tasks 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.02510v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.02510v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Mirza Farhan Bin Tarek, Raphael Poulain, Rahmatollah Beheshti
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Among various aspects of ensuring the responsible design of AI tools for
+healthcare applications, addressing fairness concerns has been a key focus
+area. Specifically, given the wide spread of electronic health record (EHR)
+data and their huge potential to inform a wide range of clinical decision
+support tasks, improving fairness in this category of health AI tools is of key
+importance. While such a broad problem (that is, mitigating fairness in
+EHR-based AI models) has been tackled using various methods, task- and
+model-agnostic methods are noticeably rare. In this study, we aimed to target
+this gap by presenting a new pipeline that generates synthetic EHR data, which
+is not only consistent with (faithful to) the real EHR data but also can reduce
+the fairness concerns (defined by the end-user) in the downstream tasks, when
+combined with the real data. We demonstrate the effectiveness of our proposed
+pipeline across various downstream tasks and two different EHR datasets. Our
+proposed pipeline can add a widely applicable and complementary tool to the
+existing toolbox of methods to address fairness in health AI applications such
+as those modifying the design of a downstream model. The codebase for our
+project is available at https://github.com/healthylaife/FairSynth
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Guiding a Diffusion Model with a Bad Version of Itself 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.02507v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.02507v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Tero Karras, Miika Aittala, Tuomas Kynkäänniemi, Jaakko Lehtinen, Timo Aila, Samuli Laine
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  The primary axes of interest in image-generating diffusion models are image
+quality, the amount of variation in the results, and how well the results align
+with a given condition, e.g., a class label or a text prompt. The popular
+classifier-free guidance approach uses an unconditional model to guide a
+conditional model, leading to simultaneously better prompt alignment and
+higher-quality images at the cost of reduced variation. These effects seem
+inherently entangled, and thus hard to control. We make the surprising
+observation that it is possible to obtain disentangled control over image
+quality without compromising the amount of variation by guiding generation
+using a smaller, less-trained version of the model itself rather than an
+unconditional model. This leads to significant improvements in ImageNet
+generation, setting record FIDs of 1.01 for 64x64 and 1.25 for 512x512, using
+publicly available networks. Furthermore, the method is also applicable to
+unconditional diffusion models, drastically improving their quality.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Demystifying the Compression of Mixture-of-Experts Through a Unified
+  Framework 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.02500v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.02500v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Shwai He, Daize Dong, Liang Ding, Ang Li
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Scaling large language models has revolutionized the performance across
+diverse domains, yet the continual growth in model size poses significant
+challenges for real-world deployment. The Mixture of Experts (MoE) approach
+addresses this by dynamically selecting and activating only a subset of
+experts, significantly reducing computational costs while maintaining high
+performance. However, MoE introduces potential redundancy (e.g., parameters)
+and extra costs (e.g., communication overhead). Despite numerous compression
+techniques developed for mitigating the redundancy in dense models, the
+compression of MoE remains under-explored. We first bridge this gap with a
+cutting-edge unified framework that not only seamlessly integrates mainstream
+compression methods but also helps systematically understand MoE compression.
+This framework approaches compression from two perspectives: Expert Slimming
+which compresses individual experts and Expert Trimming which removes
+structured modules. Within this framework, we explore the optimization space
+unexplored by existing methods,and further introduce aggressive Expert Trimming
+techniques, i.e., Layer Drop and Block Drop, to eliminate redundancy at larger
+scales. Based on these insights,we present a comprehensive recipe to guide
+practitioners in compressing MoE effectively. Extensive experimental results
+demonstrate the effectiveness of the compression methods under our framework
+and the proposed recipe, achieving a 6.05x speedup and only 20.0GB memory usage
+while maintaining over 92% of performance on Mixtral-8x7B.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Dropout MPC: An Ensemble Neural MPC Approach for Systems with Learned
+  Dynamics 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.02497v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.02497v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Spyridon Syntakas, Kostas Vlachos
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Neural networks are lately more and more often being used in the context of
+data-driven control, as an approximate model of the true system dynamics. Model
+Predictive Control (MPC) adopts this practise leading to neural MPC strategies.
+This raises a question of whether the trained neural network has converged and
+generalized in a way that the learned model encapsulates an accurate
+approximation of the true dynamic model of the system, thus making it a
+reliable choice for model-based control, especially for disturbed and uncertain
+systems. To tackle that, we propose Dropout MPC, a novel sampling-based
+ensemble neural MPC algorithm that employs the Monte-Carlo dropout technique on
+the learned system model. The closed loop is based on an ensemble of predictive
+controllers, that are used simultaneously at each time-step for trajectory
+optimization. Each member of the ensemble influences the control input, based
+on a weighted voting scheme, thus by employing different realizations of the
+learned system dynamics, neural control becomes more reliable by design. An
+additional strength of the method is that it offers by design a way to estimate
+future uncertainty, leading to cautious control. While the method aims in
+general at uncertain systems with complex dynamics, where models derived from
+first principles are hard to infer, to showcase the application we utilize data
+gathered in the laboratory from a real mobile manipulator and employ the
+proposed algorithm for the navigation of the robot in simulation.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Kolmogorov-Arnold Networks for Time Series: Bridging Predictive Power
+  and Interpretability 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.02496v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.02496v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Kunpeng Xu, Lifei Chen, Shengrui Wang
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Kolmogorov-Arnold Networks (KAN) is a groundbreaking model recently proposed
+by the MIT team, representing a revolutionary approach with the potential to be
+a game-changer in the field. This innovative concept has rapidly garnered
+worldwide interest within the AI community. Inspired by the Kolmogorov-Arnold
+representation theorem, KAN utilizes spline-parametrized univariate functions
+in place of traditional linear weights, enabling them to dynamically learn
+activation patterns and significantly enhancing interpretability. In this
+paper, we explore the application of KAN to time series forecasting and propose
+two variants: T-KAN and MT-KAN. T-KAN is designed to detect concept drift
+within time series and can explain the nonlinear relationships between
+predictions and previous time steps through symbolic regression, making it
+highly interpretable in dynamically changing environments. MT-KAN, on the other
+hand, improves predictive performance by effectively uncovering and leveraging
+the complex relationships among variables in multivariate time series.
+Experiments validate the effectiveness of these approaches, demonstrating that
+T-KAN and MT-KAN significantly outperform traditional methods in time series
+forecasting tasks, not only enhancing predictive accuracy but also improving
+model interpretability. This research opens new avenues for adaptive
+forecasting models, highlighting the potential of KAN as a powerful and
+interpretable tool in predictive analytics.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Ai-Sampler: Adversarial Learning of Markov kernels with involutive maps 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.02490v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.02490v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Evgenii Egorov, Ricardo Valperga, Efstratios Gavves
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Markov chain Monte Carlo methods have become popular in statistics as
+versatile techniques to sample from complicated probability distributions. In
+this work, we propose a method to parameterize and train transition kernels of
+Markov chains to achieve efficient sampling and good mixing. This training
+procedure minimizes the total variation distance between the stationary
+distribution of the chain and the empirical distribution of the data. Our
+approach leverages involutive Metropolis-Hastings kernels constructed from
+reversible neural networks that ensure detailed balance by construction. We
+find that reversibility also implies $C_2$-equivariance of the discriminator
+function which can be used to restrict its function space.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ A Temporal Kolmogorov-Arnold <span class="highlight-title">Transformer</span> for Time Series Forecasting 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.02486v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.02486v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Remi Genet, Hugo Inzirillo
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Capturing complex temporal patterns and relationships within multivariate
+data streams is a difficult task. We propose the Temporal Kolmogorov-Arnold
+Transformer (TKAT), a novel attention-based architecture designed to address
+this task using Temporal Kolmogorov-Arnold Networks (TKANs). Inspired by the
+Temporal Fusion Transformer (TFT), TKAT emerges as a powerful encoder-decoder
+model tailored to handle tasks in which the observed part of the features is
+more important than the a priori known part. This new architecture combined the
+theoretical foundation of the Kolmogorov-Arnold representation with the power
+of transformers. TKAT aims to simplify the complex dependencies inherent in
+time series, making them more "interpretable". The use of transformer
+architecture in this framework allows us to capture long-range dependencies
+through self-attention mechanisms.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Inpainting Pathology in Lumbar Spine MRI with Latent Diffusion 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.02477v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.02477v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Colin Hansen, Simas Glinskis, Ashwin Raju, Micha Kornreich, JinHyeong Park, Jayashri Pawar, Richard Herzog, Li Zhang, Benjamin Odry
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Data driven models for automated diagnosis in radiology suffer from
+insufficient and imbalanced datasets due to low representation of pathology in
+a population and the cost of expert annotations. Datasets can be bolstered
+through data augmentation. However, even when utilizing a full suite of
+transformations during model training, typical data augmentations do not
+address variations in human anatomy. An alternative direction is to synthesize
+data using generative models, which can potentially craft datasets with
+specific attributes. While this holds promise, commonly used generative models
+such as Generative Adversarial Networks may inadvertently produce anatomically
+inaccurate features. On the other hand, diffusion models, which offer greater
+stability, tend to memorize training data, raising concerns about privacy and
+generative diversity. Alternatively, inpainting has the potential to augment
+data through directly inserting pathology in medical images. However, this
+approach introduces a new challenge: accurately merging the generated
+pathological features with the surrounding anatomical context. While inpainting
+is a well established method for addressing simple lesions, its application to
+pathologies that involve complex structural changes remains relatively
+unexplored. We propose an efficient method for inpainting pathological features
+onto healthy anatomy in MRI through voxelwise noise scheduling in a latent
+diffusion model. We evaluate the method's ability to insert disc herniation and
+central canal stenosis in lumbar spine sagittal T2 MRI, and it achieves
+superior Frechet Inception Distance compared to state-of-the-art methods.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Meta-Designing Quantum Experiments with Language Models 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.02470v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.02470v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Sören Arlt, Haonan Duan, Felix Li, Sang Michael Xie, Yuhuai Wu, Mario Krenn
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Artificial Intelligence (AI) has the potential to significantly advance
+scientific discovery by finding solutions beyond human capabilities. However,
+these super-human solutions are often unintuitive and require considerable
+effort to uncover underlying principles, if possible at all. Here, we show how
+a code-generating language model trained on synthetic data can not only find
+solutions to specific problems but can create meta-solutions, which solve an
+entire class of problems in one shot and simultaneously offer insight into the
+underlying design principles. Specifically, for the design of new quantum
+physics experiments, our sequence-to-sequence transformer architecture
+generates interpretable Python code that describes experimental blueprints for
+a whole class of quantum systems. We discover general and previously unknown
+design rules for infinitely large classes of quantum states. The ability to
+automatically generate generalized patterns in readable computer code is a
+crucial step toward machines that help discover new scientific understanding --
+one of the central aims of physics.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>10+3 pages, 5 figures</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Landscape-Aware Growing: The Power of a Little LAG 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.02469v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.02469v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Stefani Karp, Nikunj Saunshi, Sobhan Miryoosefi, Sashank J. Reddi, Sanjiv Kumar
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Recently, there has been increasing interest in efficient pretraining
+paradigms for training Transformer-based models. Several recent approaches use
+smaller models to initialize larger models in order to save computation (e.g.,
+stacking and fusion). In this work, we study the fundamental question of how to
+select the best growing strategy from a given pool of growing strategies. Prior
+works have extensively focused on loss- and/or function-preserving behavior at
+initialization or simply performance at the end of training. Instead, we
+identify that behavior at initialization can be misleading as a predictor of
+final performance and present an alternative perspective based on early
+training dynamics, which we call "landscape-aware growing (LAG)". We perform
+extensive analysis of correlation of the final performance with performance in
+the initial steps of training and find early and more accurate predictions of
+the optimal growing strategy (i.e., with only a small "lag" after
+initialization). This perspective also motivates an adaptive strategy for
+gradual stacking.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ An Empirical Study into Clustering of Unseen <span class="highlight-title">Dataset</span>s with
+  <span class="highlight-title">Self-Supervised</span> Encoders 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.02465v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.02465v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Scott C. Lowe, Joakim Bruslund Haurum, Sageev Oore, Thomas B. Moeslund, Graham W. Taylor
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Can pretrained models generalize to new datasets without any retraining? We
+deploy pretrained image models on datasets they were not trained for, and
+investigate whether their embeddings form meaningful clusters. Our suite of
+benchmarking experiments use encoders pretrained solely on ImageNet-1k with
+either supervised or self-supervised training techniques, deployed on image
+datasets that were not seen during training, and clustered with conventional
+clustering algorithms. This evaluation provides new insights into the
+embeddings of self-supervised models, which prioritize different features to
+supervised models. Supervised encoders typically offer more utility than SSL
+encoders within the training domain, and vice-versa far outside of it, however,
+fine-tuned encoders demonstrate the opposite trend. Clustering provides a way
+to evaluate the utility of self-supervised learned representations orthogonal
+to existing methods such as kNN. Additionally, we find the silhouette score
+when measured in a UMAP-reduced space is highly correlated with clustering
+performance, and can therefore be used as a proxy for clustering performance on
+data with no ground truth labels. Our code implementation is available at
+\url{https://github.com/scottclowe/zs-ssl-clustering/}.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Meta-Learners for Partially-Identified Treatment Effects Across Multiple
+  Environments <span class="chip">ICML 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.02464v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.02464v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Jonas Schweisthal, Dennis Frauen, Mihaela van der Schaar, Stefan Feuerriegel
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Estimating the conditional average treatment effect (CATE) from observational
+data is relevant for many applications such as personalized medicine. Here, we
+focus on the widespread setting where the observational data come from multiple
+environments, such as different hospitals, physicians, or countries.
+Furthermore, we allow for violations of standard causal assumptions, namely,
+overlap within the environments and unconfoundedness. To this end, we move away
+from point identification and focus on partial identification. Specifically, we
+show that current assumptions from the literature on multiple environments
+allow us to interpret the environment as an instrumental variable (IV). This
+allows us to adapt bounds from the IV literature for partial identification of
+CATE by leveraging treatment assignment mechanisms across environments. Then,
+we propose different model-agnostic learners (so-called meta-learners) to
+estimate the bounds that can be used in combination with arbitrary machine
+learning models. We further demonstrate the effectiveness of our meta-learners
+across various experiments using both simulated and real-world data. Finally,
+we discuss the applicability of our meta-learners to partial identification in
+instrumental variable settings, such as randomized controlled trials with
+non-compliance.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted at ICML 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Machine learning Hubbard parameters with equivariant neural networks 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.02457v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.02457v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Martin Uhrin, Austin Zadoks, Luca Binci, Nicola Marzari, Iurii Timrov
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Density-functional theory with extended Hubbard functionals (DFT+$U$+$V$)
+provides a robust framework to accurately describe complex materials containing
+transition-metal or rare-earth elements. It does so by mitigating
+self-interaction errors inherent to semi-local functionals which are
+particularly pronounced in systems with partially-filled $d$ and $f$ electronic
+states. However, achieving accuracy in this approach hinges upon the accurate
+determination of the on-site $U$ and inter-site $V$ Hubbard parameters. In
+practice, these are obtained either by semi-empirical tuning, requiring prior
+knowledge, or, more correctly, by using predictive but expensive
+first-principles calculations. Here, we present a machine learning model based
+on equivariant neural networks which uses atomic occupation matrices as
+descriptors, directly capturing the electronic structure, local chemical
+environment, and oxidation states of the system at hand. We target here the
+prediction of Hubbard parameters computed self-consistently with iterative
+linear-response calculations, as implemented in density-functional perturbation
+theory (DFPT), and structural relaxations. Remarkably, when trained on data
+from 11 materials spanning various crystal structures and compositions, our
+model achieves mean absolute relative errors of 3% and 5% for Hubbard $U$ and
+$V$ parameters, respectively. By circumventing computationally expensive DFT or
+DFPT self-consistent protocols, our model significantly expedites the
+prediction of Hubbard parameters with negligible computational overhead, while
+approaching the accuracy of DFPT. Moreover, owing to its robust
+transferability, the model facilitates accelerated materials discovery and
+design via high-throughput calculations, with relevance for various
+technological applications.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Offline Bayesian Aleatoric and Epistemic Uncertainty Quantification and
+  Posterior Value Optimisation in Finite-State MDPs <span class="chip">UAI 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.02456v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.02456v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Filippo Valdettaro, A. Aldo Faisal
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  We address the challenge of quantifying Bayesian uncertainty and
+incorporating it in offline use cases of finite-state Markov Decision Processes
+(MDPs) with unknown dynamics. Our approach provides a principled method to
+disentangle epistemic and aleatoric uncertainty, and a novel technique to find
+policies that optimise Bayesian posterior expected value without relying on
+strong assumptions about the MDP's posterior distribution. First, we utilise
+standard Bayesian reinforcement learning methods to capture the posterior
+uncertainty in MDP parameters based on available data. We then analytically
+compute the first two moments of the return distribution across posterior
+samples and apply the law of total variance to disentangle aleatoric and
+epistemic uncertainties. To find policies that maximise posterior expected
+value, we leverage the closed-form expression for value as a function of
+policy. This allows us to propose a stochastic gradient-based approach for
+solving the problem. We illustrate the uncertainty quantification and Bayesian
+posterior value optimisation performance of our agent in simple, interpretable
+gridworlds and validate it through ground-truth evaluations on synthetic MDPs.
+Finally, we highlight the real-world impact and computational scalability of
+our method by applying it to the AI Clinician problem, which recommends
+treatment for patients in intensive care units and has emerged as a key use
+case of finite-state MDPs with offline data. We discuss the challenges that
+arise with Bayesian modelling of larger scale MDPs while demonstrating the
+potential to apply our methods rooted in Bayesian decision theory into the real
+world. We make our code available at
+https://github.com/filippovaldettaro/finite-state-mdps .
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>19 pages, 13 figures, 40th Conference on Uncertainty in Artificial
+  Intelligence (UAI 2024)</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ A Generalized Apprenticeship Learning Framework for Modeling
+  Heterogeneous Student Pedagogical Strategies 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.02450v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.02450v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Md Mirajul Islam, Xi Yang, John Hostetter, Adittya Soukarjya Saha, Min Chi
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  A key challenge in e-learning environments like Intelligent Tutoring Systems
+(ITSs) is to induce effective pedagogical policies efficiently. While Deep
+Reinforcement Learning (DRL) often suffers from sample inefficiency and reward
+function design difficulty, Apprenticeship Learning(AL) algorithms can overcome
+them. However, most AL algorithms can not handle heterogeneity as they assume
+all demonstrations are generated with a homogeneous policy driven by a single
+reward function. Still, some AL algorithms which consider heterogeneity, often
+can not generalize to large continuous state space and only work with discrete
+states. In this paper, we propose an expectation-maximization(EM)-EDM, a
+general AL framework to induce effective pedagogical policies from given
+optimal or near-optimal demonstrations, which are assumed to be driven by
+heterogeneous reward functions. We compare the effectiveness of the policies
+induced by our proposed EM-EDM against four AL-based baselines and two policies
+induced by DRL on two different but related tasks that involve pedagogical
+action prediction. Our overall results showed that, for both tasks, EM-EDM
+outperforms the four AL baselines across all performance metrics and the two
+DRL baselines. This suggests that EM-EDM can effectively model complex student
+pedagogical decision-making processes through the ability to manage a large,
+continuous state space and adapt to handle diverse and heterogeneous reward
+functions with very few given demonstrations.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Reducing Bias in Federated Class-Incremental Learning with Hierarchical
+  Generative Prototypes 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.02447v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.02447v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Riccardo Salami, Pietro Buzzega, Matteo Mosconi, Mattia Verasani, Simone Calderara
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Federated Learning (FL) aims at unburdening the training of deep models by
+distributing computation across multiple devices (clients) while safeguarding
+data privacy. On top of that, Federated Continual Learning (FCL) also accounts
+for data distribution evolving over time, mirroring the dynamic nature of
+real-world environments. In this work, we shed light on the Incremental and
+Federated biases that naturally emerge in FCL. While the former is a known
+problem in Continual Learning, stemming from the prioritization of recently
+introduced classes, the latter (i.e., the bias towards local distributions)
+remains relatively unexplored. Our proposal constrains both biases in the last
+layer by efficiently fine-tuning a pre-trained backbone using learnable
+prompts, resulting in clients that produce less biased representations and more
+biased classifiers. Therefore, instead of solely relying on parameter
+aggregation, we also leverage generative prototypes to effectively balance the
+predictions of the global model. Our method improves on the current State Of
+The Art, providing an average increase of +7.9% in accuracy.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Coresets for Multiple $\ell_p$ Regression <span class="chip">ICML 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.02432v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.02432v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        David P. Woodruff, Taisuke Yasuda
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  A coreset of a dataset with $n$ examples and $d$ features is a weighted
+subset of examples that is sufficient for solving downstream data analytic
+tasks. Nearly optimal constructions of coresets for least squares and $\ell_p$
+linear regression with a single response are known in prior work. However, for
+multiple $\ell_p$ regression where there can be $m$ responses, there are no
+known constructions with size sublinear in $m$. In this work, we construct
+coresets of size $\tilde O(\varepsilon^{-2}d)$ for $p<2$ and $\tilde
+O(\varepsilon^{-p}d^{p/2})$ for $p>2$ independently of $m$ (i.e.,
+dimension-free) that approximate the multiple $\ell_p$ regression objective at
+every point in the domain up to $(1\pm\varepsilon)$ relative error. If we only
+need to preserve the minimizer subject to a subspace constraint, we improve
+these bounds by an $\varepsilon$ factor for all $p>1$. All of our bounds are
+nearly tight.
+  We give two application of our results. First, we settle the number of
+uniform samples needed to approximate $\ell_p$ Euclidean power means up to a
+$(1+\varepsilon)$ factor, showing that $\tilde\Theta(\varepsilon^{-2})$ samples
+for $p = 1$, $\tilde\Theta(\varepsilon^{-1})$ samples for $1 < p < 2$, and
+$\tilde\Theta(\varepsilon^{1-p})$ samples for $p>2$ is tight, answering a
+question of Cohen-Addad, Saulpic, and Schwiegelshohn. Second, we show that for
+$1<p<2$, every matrix has a subset of $\tilde O(\varepsilon^{-1}k)$ rows which
+spans a $(1+\varepsilon)$-approximately optimal $k$-dimensional subspace for
+$\ell_p$ subspace approximation, which is also nearly optimal.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>ICML 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Reweighted Solutions for Weighted Low Rank Approximation <span class="chip">ICML 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.02431v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.02431v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        David P. Woodruff, Taisuke Yasuda
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Weighted low rank approximation (WLRA) is an important yet computationally
+challenging primitive with applications ranging from statistical analysis,
+model compression, and signal processing. To cope with the NP-hardness of this
+problem, prior work considers heuristics, bicriteria, or fixed parameter
+tractable algorithms to solve this problem. In this work, we introduce a new
+relaxed solution to WLRA which outputs a matrix that is not necessarily low
+rank, but can be stored using very few parameters and gives provable
+approximation guarantees when the weight matrix has low rank. Our central idea
+is to use the weight matrix itself to reweight a low rank solution, which gives
+an extremely simple algorithm with remarkable empirical performance in
+applications to model compression and on synthetic datasets. Our algorithm also
+gives nearly optimal communication complexity bounds for a natural distributed
+problem associated with this problem, for which we show matching communication
+lower bounds. Together, our communication complexity bounds show that the rank
+of the weight matrix provably parameterizes the communication complexity of
+WLRA. We also obtain the first relative error guarantees for feature selection
+with a weighted objective.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>ICML 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Harnessing Neural Unit Dynamics for Effective and Scalable
+  Class-Incremental Learning <span class="chip">ICML 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.02428v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.02428v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Depeng Li, Tianqi Wang, Junwei Chen, Wei Dai, Zhigang Zeng
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Class-incremental learning (CIL) aims to train a model to learn new classes
+from non-stationary data streams without forgetting old ones. In this paper, we
+propose a new kind of connectionist model by tailoring neural unit dynamics
+that adapt the behavior of neural networks for CIL. In each training session,
+it introduces a supervisory mechanism to guide network expansion whose growth
+size is compactly commensurate with the intrinsic complexity of a newly
+arriving task. This constructs a near-minimal network while allowing the model
+to expand its capacity when cannot sufficiently hold new classes. At inference
+time, it automatically reactivates the required neural units to retrieve
+knowledge and leaves the remaining inactivated to prevent interference. We name
+our model AutoActivator, which is effective and scalable. To gain insights into
+the neural unit dynamics, we theoretically analyze the model's convergence
+property via a universal approximation theorem on learning sequential mappings,
+which is under-explored in the CIL community. Experiments show that our method
+achieves strong CIL performance in rehearsal-free and minimal-expansion
+settings with different backbones.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted to ICML 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Contextual Optimization under Covariate Shift: A Robust Approach by
+  Intersecting Wasserstein Balls 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.02426v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.02426v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Tianyu Wang, Ningyuan Chen, Chun Wang
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  In contextual optimization, a decision-maker observes historical samples of
+uncertain variables and associated concurrent covariates, without knowing their
+joint distribution. Given an additional covariate observation, the goal is to
+choose a decision that minimizes some operational costs. A prevalent issue here
+is covariate shift, where the marginal distribution of the new covariate
+differs from historical samples, leading to decision performance variations
+with nonparametric or parametric estimators. To address this, we propose a
+distributionally robust approach that uses an ambiguity set by the intersection
+of two Wasserstein balls, each centered on typical nonparametric or parametric
+distribution estimators. Computationally, we establish the tractable
+reformulation of this distributionally robust optimization problem.
+Statistically, we provide guarantees for our Wasserstein ball intersection
+approach under covariate shift by analyzing the measure concentration of the
+estimators. Furthermore, to reduce computational complexity, we employ a
+surrogate objective that maintains similar generalization guarantees. Through
+synthetic and empirical case studies on income prediction and portfolio
+optimization, we demonstrate the strong empirical performance of our proposed
+models.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Contextual Dynamic Pricing: Algorithms, Optimality, and Local
+  Differential Privacy Constraints 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.02424v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.02424v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Zifeng Zhao, Feiyu Jiang, Yi Yu
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  We study the contextual dynamic pricing problem where a firm sells products
+to $T$ sequentially arriving consumers that behave according to an unknown
+demand model. The firm aims to maximize its revenue, i.e. minimize its regret
+over a clairvoyant that knows the model in advance. The demand model is a
+generalized linear model (GLM), allowing for a stochastic feature vector in
+$\mathbb R^d$ that encodes product and consumer information. We first show that
+the optimal regret upper bound is of order $\sqrt{dT}$, up to a logarithmic
+factor, improving upon existing upper bounds in the literature by a $\sqrt{d}$
+factor. This sharper rate is materialised by two algorithms: a confidence
+bound-type (supCB) algorithm and an explore-then-commit (ETC) algorithm. A key
+insight of our theoretical result is an intrinsic connection between dynamic
+pricing and the contextual multi-armed bandit problem with many arms based on a
+careful discretization. We further study contextual dynamic pricing under the
+local differential privacy (LDP) constraints. In particular, we propose a
+stochastic gradient descent based ETC algorithm that achieves an optimal regret
+upper bound of order $d\sqrt{T}/\epsilon$, up to a logarithmic factor, where
+$\epsilon>0$ is the privacy parameter. The regret upper bounds with and without
+LDP constraints are accompanied by newly constructed minimax lower bounds,
+which further characterize the cost of privacy. Extensive numerical experiments
+and a real data application on online lending are conducted to illustrate the
+efficiency and practical value of the proposed algorithms in dynamic pricing.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ IterMask2: Iterative Unsupervised Anomaly Segmentation via Spatial and
+  Frequency Masking for Brain Lesions in MRI 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.02422v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.02422v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Ziyun Liang, Xiaoqing Guo, J. Alison Noble, Konstantinos Kamnitsas
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Unsupervised anomaly segmentation approaches to pathology segmentation train
+a model on images of healthy subjects, that they define as the 'normal' data
+distribution. At inference, they aim to segment any pathologies in new images
+as 'anomalies', as they exhibit patterns that deviate from those in 'normal'
+training data. Prevailing methods follow the 'corrupt-and-reconstruct'
+paradigm. They intentionally corrupt an input image, reconstruct it to follow
+the learned 'normal' distribution, and subsequently segment anomalies based on
+reconstruction error. Corrupting an input image, however, inevitably leads to
+suboptimal reconstruction even of normal regions, causing false positives. To
+alleviate this, we propose a novel iterative spatial mask-refining strategy
+IterMask2. We iteratively mask areas of the image, reconstruct them, and update
+the mask based on reconstruction error. This iterative process progressively
+adds information about areas that are confidently normal as per the model. The
+increasing content guides reconstruction of nearby masked areas, improving
+reconstruction of normal tissue under these areas, reducing false positives. We
+also use high-frequency image content as an auxiliary input to provide
+additional structural information for masked areas. This further improves
+reconstruction error of normal in comparison to anomalous areas, facilitating
+segmentation of the latter. We conduct experiments on several brain lesion
+datasets and demonstrate effectiveness of our method. Code is available at:
+https://github.com/ZiyunLiang/IterMasks2
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Representing Piecewise-Linear Functions by Functions with Minimal Arity 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.02421v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.02421v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Christoph Koutschan, Anton Ponomarchuk, Josef Schicho
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Any continuous piecewise-linear function $F\colon \mathbb{R}^{n}\to
+\mathbb{R}$ can be represented as a linear combination of $\max$ functions of
+at most $n+1$ affine-linear functions. In our previous paper [``Representing
+piecewise linear functions by functions with small arity'', AAECC, 2023], we
+showed that this upper bound of $n+1$ arguments is tight. In the present paper,
+we extend this result by establishing a correspondence between the function $F$
+and the minimal number of arguments that are needed in any such decomposition.
+We show that the tessellation of the input space $\mathbb{R}^{n}$ induced by
+the function $F$ has a direct connection to the number of arguments in the
+$\max$ functions.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Improved Modelling of Federated <span class="highlight-title">Dataset</span>s using
+  Mixtures-of-Dirichlet-Multinomials 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.02416v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.02416v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Jonathan Scott, Áine Cahill
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  In practice, training using federated learning can be orders of magnitude
+slower than standard centralized training. This severely limits the amount of
+experimentation and tuning that can be done, making it challenging to obtain
+good performance on a given task. Server-side proxy data can be used to run
+training simulations, for instance for hyperparameter tuning. This can greatly
+speed up the training pipeline by reducing the number of tuning runs to be
+performed overall on the true clients. However, it is challenging to ensure
+that these simulations accurately reflect the dynamics of the real federated
+training. In particular, the proxy data used for simulations often comes as a
+single centralized dataset without a partition into distinct clients, and
+partitioning this data in a naive way can lead to simulations that poorly
+reflect real federated training. In this paper we address the challenge of how
+to partition centralized data in a way that reflects the statistical
+heterogeneity of the true federated clients. We propose a fully federated,
+theoretically justified, algorithm that efficiently learns the distribution of
+the true clients and observe improved server-side simulations when using the
+inferred distribution to create simulated clients from the centralized data.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ GrootVL: Tree Topology is All You Need in State Space Model 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.02395v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.02395v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Yicheng Xiao, Lin Song, Shaoli Huang, Jiangshan Wang, Siyu Song, Yixiao Ge, Xiu Li, Ying Shan
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  The state space models, employing recursively propagated features,
+demonstrate strong representation capabilities comparable to Transformer models
+and superior efficiency. However, constrained by the inherent geometric
+constraints of sequences, it still falls short in modeling long-range
+dependencies. To address this issue, we propose the GrootVL network, which
+first dynamically generates a tree topology based on spatial relationships and
+input features. Then, feature propagation is performed based on this graph,
+thereby breaking the original sequence constraints to achieve stronger
+representation capabilities. Additionally, we introduce a linear complexity
+dynamic programming algorithm to enhance long-range interactions without
+increasing computational cost. GrootVL is a versatile multimodal framework that
+can be applied to both visual and textual tasks. Extensive experiments
+demonstrate that our method significantly outperforms existing structured state
+space models on image classification, object detection and segmentation.
+Besides, by fine-tuning large language models, our approach achieves consistent
+improvements in multiple textual tasks at minor training cost.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>The code is available at https://github.com/EasonXiao-888/GrootVL</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Multiple Choice Questions and Large Languages Models: A Case Study with
+  Fictional Medical Data 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.02394v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.02394v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Maxime Griot, Jean Vanderdonckt, Demet Yuksel, Coralie Hemptinne
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Large Language Models (LLMs) like ChatGPT demonstrate significant potential
+in the medical field, often evaluated using multiple-choice questions (MCQs)
+similar to those found on the USMLE. Despite their prevalence in medical
+education, MCQs have limitations that might be exacerbated when assessing LLMs.
+To evaluate the effectiveness of MCQs in assessing the performance of LLMs, we
+developed a fictional medical benchmark focused on a non-existent gland, the
+Glianorex. This approach allowed us to isolate the knowledge of the LLM from
+its test-taking abilities. We used GPT-4 to generate a comprehensive textbook
+on the Glianorex in both English and French and developed corresponding
+multiple-choice questions in both languages. We evaluated various open-source,
+proprietary, and domain-specific LLMs using these questions in a zero-shot
+setting. The models achieved average scores around 67%, with minor performance
+differences between larger and smaller models. Performance was slightly higher
+in English than in French. Fine-tuned medical models showed some improvement
+over their base versions in English but not in French. The uniformly high
+performance across models suggests that traditional MCQ-based benchmarks may
+not accurately measure LLMs' clinical knowledge and reasoning abilities,
+instead highlighting their pattern recognition skills. This study underscores
+the need for more robust evaluation methods to better assess the true
+capabilities of LLMs in medical contexts.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Learning to Edit Visual Programs with Self-Supervision 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.02383v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.02383v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        R. Kenny Jones, Renhao Zhang, Aditya Ganeshan, Daniel Ritchie
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  We design a system that learns how to edit visual programs. Our edit network
+consumes a complete input program and a visual target. From this input, we task
+our network with predicting a local edit operation that could be applied to the
+input program to improve its similarity to the target. In order to apply this
+scheme for domains that lack program annotations, we develop a self-supervised
+learning approach that integrates this edit network into a bootstrapped
+finetuning loop along with a network that predicts entire programs in one-shot.
+Our joint finetuning scheme, when coupled with an inference procedure that
+initializes a population from the one-shot model and evolves members of this
+population with the edit network, helps to infer more accurate visual programs.
+Over multiple domains, we experimentally compare our method against the
+alternative of using only the one-shot model, and find that even under equal
+search-time budgets, our editing-based paradigm provides significant
+advantages.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Finding NeMo: Localizing Neurons Responsible For Memorization in
+  Diffusion Models 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.02366v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.02366v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Dominik Hintersdorf, Lukas Struppek, Kristian Kersting, Adam Dziedzic, Franziska Boenisch
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Diffusion models (DMs) produce very detailed and high-quality images. Their
+power results from extensive training on large amounts of data, usually scraped
+from the internet without proper attribution or consent from content creators.
+Unfortunately, this practice raises privacy and intellectual property concerns,
+as DMs can memorize and later reproduce their potentially sensitive or
+copyrighted training images at inference time. Prior efforts prevent this issue
+by either changing the input to the diffusion process, thereby preventing the
+DM from generating memorized samples during inference, or removing the
+memorized data from training altogether. While those are viable solutions when
+the DM is developed and deployed in a secure and constantly monitored
+environment, they hold the risk of adversaries circumventing the safeguards and
+are not effective when the DM itself is publicly released. To solve the
+problem, we introduce NeMo, the first method to localize memorization of
+individual data samples down to the level of neurons in DMs' cross-attention
+layers. Through our experiments, we make the intriguing finding that in many
+cases, single neurons are responsible for memorizing particular training
+samples. By deactivating these memorization neurons, we can avoid the
+replication of training data at inference time, increase the diversity in the
+generated outputs, and mitigate the leakage of private and copyrighted data. In
+this way, our NeMo contributes to a more responsible deployment of DMs.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Preprint</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Temporal Graph Rewiring with Expander Graphs 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.02362v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.02362v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Katarina Petrović, Shenyang Huang, Farimah Poursafaei, Petar Veličković
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Evolving relations in real-world networks are often modelled by temporal
+graphs. Graph rewiring techniques have been utilised on Graph Neural Networks
+(GNNs) to improve expressiveness and increase model performance. In this work,
+we propose Temporal Graph Rewiring (TGR), the first approach for graph rewiring
+on temporal graphs. TGR enables communication between temporally distant nodes
+in a continuous time dynamic graph by utilising expander graph propagation to
+construct a message passing highway for message passing between distant nodes.
+Expander graphs are suitable candidates for rewiring as they help overcome the
+oversquashing problem often observed in GNNs. On the public tgbl-wiki
+benchmark, we show that TGR improves the performance of a widely used TGN model
+by a significant margin. Our code repository is accessible at
+https://anonymous.4open.science/r/TGR-254C.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>10 pages, 2 figures</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Using <span class="highlight-title">Self-supervised</span> Learning Can Improve Model Fairness 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.02361v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.02361v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Sofia Yfantidou, Dimitris Spathis, Marios Constantinides, Athena Vakali, Daniele Quercia, Fahim Kawsar
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Self-supervised learning (SSL) has become the de facto training paradigm of
+large models, where pre-training is followed by supervised fine-tuning using
+domain-specific data and labels. Despite demonstrating comparable performance
+with supervised methods, comprehensive efforts to assess SSL's impact on
+machine learning fairness (i.e., performing equally on different demographic
+breakdowns) are lacking. Hypothesizing that SSL models would learn more
+generic, hence less biased representations, this study explores the impact of
+pre-training and fine-tuning strategies on fairness. We introduce a fairness
+assessment framework for SSL, comprising five stages: defining dataset
+requirements, pre-training, fine-tuning with gradual unfreezing, assessing
+representation similarity conditioned on demographics, and establishing
+domain-specific evaluation processes. We evaluate our method's generalizability
+on three real-world human-centric datasets (i.e., MIMIC, MESA, and GLOBEM) by
+systematically comparing hundreds of SSL and fine-tuned models on various
+dimensions spanning from the intermediate representations to appropriate
+evaluation metrics. Our findings demonstrate that SSL can significantly improve
+model fairness, while maintaining performance on par with supervised
+methods-exhibiting up to a 30% increase in fairness with minimal loss in
+performance through self-supervision. We posit that such differences can be
+attributed to representation dissimilarities found between the best- and the
+worst-performing demographics across models-up to x13 greater for protected
+attributes with larger performance discrepancies between segments.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>arXiv admin note: text overlap with arXiv:2401.01640</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ The complexity of approximate (coarse) correlated equilibrium for
+  incomplete information games 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.02357v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.02357v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Binghui Peng, Aviad Rubinstein
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  We study the iteration complexity of decentralized learning of approximate
+correlated equilibria in incomplete information games.
+  On the negative side, we prove that in $\mathit{extensive}$-$\mathit{form}$
+$\mathit{games}$, assuming $\mathsf{PPAD} \not\subset
+\mathsf{TIME}(n^{\mathsf{polylog}(n)})$, any polynomial-time learning
+algorithms must take at least $2^{\log_2^{1-o(1)}(|\mathcal{I}|)}$ iterations
+to converge to the set of $\epsilon$-approximate correlated equilibrium, where
+$|\mathcal{I}|$ is the number of nodes in the game and $\epsilon > 0$ is an
+absolute constant. This nearly matches, up to the $o(1)$ term, the algorithms
+of [PR'24, DDFG'24] for learning $\epsilon$-approximate correlated equilibrium,
+and resolves an open question of Anagnostides, Kalavasis, Sandholm, and
+Zampetakis [AKSZ'24]. Our lower bound holds even for the easier solution
+concept of $\epsilon$-approximate $\mathit{coarse}$ correlated equilibrium
+  On the positive side, we give uncoupled dynamics that reach
+$\epsilon$-approximate correlated equilibria of a $\mathit{Bayesian}$
+$\mathit{game}$ in polylogarithmic iterations, without any dependence of the
+number of types. This demonstrates a separation between Bayesian games and
+extensive-form games.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Language Models Do Hard Arithmetic Tasks Easily and Hardly Do Easy
+  Arithmetic Tasks 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.02356v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.02356v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Andrew Gambardella, Yusuke Iwasawa, Yutaka Matsuo
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  The ability (and inability) of large language models (LLMs) to perform
+arithmetic tasks has been the subject of much theoretical and practical debate.
+We show that LLMs are frequently able to correctly and confidently predict the
+first digit of n-digit by m-digit multiplication tasks without using chain of
+thought reasoning, despite these tasks require compounding operations to solve.
+Simultaneously, LLMs in practice often fail to correctly or confidently predict
+the last digit of an n-digit by m-digit multiplication, a task equivalent to
+1-digit by 1-digit multiplication which can be easily learned or memorized. We
+show that the latter task can be solved more robustly when the LLM is
+conditioned on all of the correct higher-order digits, which on average
+increases the confidence of the correct last digit on 5-digit by 5-digit
+multiplication tasks using Llama 2-13B by over 230% (0.13 to 0.43) and
+Mistral-7B by 150% (0.22 to 0.55).
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>In Proceedings of the 62nd Annual Meeting of the Association for
+  Computational Linguistics (Volume 2: Short Papers)</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ FedDr+: Stabilizing Dot-regression with Global Feature Distillation for
+  Federated Learning 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.02355v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.02355v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Seongyoon Kim, Minchan Jeong, Sungnyun Kim, Sungwoo Cho, Sumyeong Ahn, Se-Young Yun
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Federated Learning (FL) has emerged as a pivotal framework for the
+development of effective global models (global FL) or personalized models
+(personalized FL) across clients with heterogeneous, non-iid data distribution.
+A key challenge in FL is client drift, where data heterogeneity impedes the
+aggregation of scattered knowledge. Recent studies have tackled the client
+drift issue by identifying significant divergence in the last classifier layer.
+To mitigate this divergence, strategies such as freezing the classifier weights
+and aligning the feature extractor accordingly have proven effective. Although
+the local alignment between classifier and feature extractor has been studied
+as a crucial factor in FL, we observe that it may lead the model to
+overemphasize the observed classes within each client. Thus, our objectives are
+twofold: (1) enhancing local alignment while (2) preserving the representation
+of unseen class samples. This approach aims to effectively integrate knowledge
+from individual clients, thereby improving performance for both global and
+personalized FL. To achieve this, we introduce a novel algorithm named FedDr+,
+which empowers local model alignment using dot-regression loss. FedDr+ freezes
+the classifier as a simplex ETF to align the features and improves aggregated
+global models by employing a feature distillation mechanism to retain
+information about unseen/missing classes. Consequently, we provide empirical
+evidence demonstrating that our algorithm surpasses existing methods that use a
+frozen classifier to boost alignment across the diverse distribution.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Label-wise Aleatoric and Epistemic Uncertainty Quantification 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.02354v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.02354v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Yusuf Sale, Paul Hofman, Timo Löhr, Lisa Wimmer, Thomas Nagler, Eyke Hüllermeier
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  We present a novel approach to uncertainty quantification in classification
+tasks based on label-wise decomposition of uncertainty measures. This
+label-wise perspective allows uncertainty to be quantified at the individual
+class level, thereby improving cost-sensitive decision-making and helping
+understand the sources of uncertainty. Furthermore, it allows to define total,
+aleatoric, and epistemic uncertainty on the basis of non-categorical measures
+such as variance, going beyond common entropy-based measures. In particular,
+variance-based measures address some of the limitations associated with
+established methods that have recently been discussed in the literature. We
+show that our proposed measures adhere to a number of desirable properties.
+Through empirical evaluation on a variety of benchmark data sets -- including
+applications in the medical domain where accurate uncertainty quantification is
+crucial -- we establish the effectiveness of label-wise uncertainty
+quantification.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Uncertainty in Artificial Intelligence. arXiv admin note: substantial
+  text overlap with arXiv:2401.00276</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ System-Aware Neural ODE Processes for Few-Shot Bayesian Optimization 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.02352v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.02352v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Jixiang Qing, Becky D Langdon, Robert M Lee, Behrang Shafei, Mark van der Wilk, Calvin Tsay, Ruth Misener
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  We consider the problem of optimizing initial conditions and timing in
+dynamical systems governed by unknown ordinary differential equations (ODEs),
+where evaluating different initial conditions is costly and there are
+constraints on observation times. To identify the optimal conditions within
+several trials, we introduce a few-shot Bayesian Optimization (BO) framework
+based on the system's prior information. At the core of our approach is the
+System-Aware Neural ODE Processes (SANODEP), an extension of Neural ODE
+Processes (NODEP) designed to meta-learn ODE systems from multiple trajectories
+using a novel context embedding block. Additionally, we propose a
+multi-scenario loss function specifically for optimization purposes. Our
+two-stage BO framework effectively incorporates search space constraints,
+enabling efficient optimization of both initial conditions and observation
+timings. We conduct extensive experiments showcasing SANODEP's potential for
+few-shot BO. We also explore SANODEP's adaptability to varying levels of prior
+information, highlighting the trade-off between prior flexibility and model
+fitting accuracy.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ AMOSL: Adaptive Modality-wise Structure Learning in Multi-view Graph
+  Neural Networks For Enhanced Unified Representation 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.02348v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.02348v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Peiyu Liang, Hongchang Gao, Xubin He
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  While Multi-view Graph Neural Networks (MVGNNs) excel at leveraging diverse
+modalities for learning object representation, existing methods assume
+identical local topology structures across modalities that overlook real-world
+discrepancies. This leads MVGNNs straggles in modality fusion and
+representations denoising. To address these issues, we propose adaptive
+modality-wise structure learning (AMoSL). AMoSL captures node correspondences
+between modalities via optimal transport, and jointly learning with graph
+embedding. To enable efficient end-to-end training, we employ an efficient
+solution for the resulting complex bilevel optimization problem. Furthermore,
+AMoSL adapts to downstream tasks through unsupervised learning on
+inter-modality distances. The effectiveness of AMoSL is demonstrated by its
+ability to train more accurate graph classifiers on six benchmark datasets.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Flash Diffusion: Accelerating Any Conditional Diffusion Model for Few
+  Steps Image Generation 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.02347v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.02347v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Clement Chadebec, Onur Tasar, Eyal Benaroche, Benjamin Aubin
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  In this paper, we propose an efficient, fast, and versatile distillation
+method to accelerate the generation of pre-trained diffusion models: Flash
+Diffusion. The method reaches state-of-the-art performances in terms of FID and
+CLIP-Score for few steps image generation on the COCO2014 and COCO2017
+datasets, while requiring only several GPU hours of training and fewer
+trainable parameters than existing methods. In addition to its efficiency, the
+versatility of the method is also exposed across several tasks such as
+text-to-image, inpainting, face-swapping, super-resolution and using different
+backbones such as UNet-based denoisers (SD1.5, SDXL) or DiT (Pixart-$\alpha$),
+as well as adapters. In all cases, the method allowed to reduce drastically the
+number of sampling steps while maintaining very high-quality image generation.
+The official implementation is available at
+https://github.com/gojasper/flash-diffusion.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>16 pages + 16 pages appendices</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Progressive Confident Masking Attention Network for Audio-Visual
+  Segmentation 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.02345v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.02345v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Yuxuan Wang, Feng Dong, Jinchao Zhu
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Audio and visual signals typically occur simultaneously, and humans possess
+an innate ability to correlate and synchronize information from these two
+modalities. Recently, a challenging problem known as Audio-Visual Segmentation
+(AVS) has emerged, intending to produce segmentation maps for sounding objects
+within a scene. However, the methods proposed so far have not sufficiently
+integrated audio and visual information, and the computational costs have been
+extremely high. Additionally, the outputs of different stages have not been
+fully utilized. To facilitate this research, we introduce a novel Progressive
+Confident Masking Attention Network (PMCANet). It leverages attention
+mechanisms to uncover the intrinsic correlations between audio signals and
+visual frames. Furthermore, we design an efficient and effective
+cross-attention module to enhance semantic perception by selecting query
+tokens. This selection is determined through confidence-driven units based on
+the network's multi-stage predictive outputs. Experiments demonstrate that our
+network outperforms other AVS methods while requiring less computational
+resources.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>10 pages, 9 figures, submitted to IEEE TRANSACTIONS ON CIRCUITS AND
+  SYSTEMS FOR VIDEO TECHNOLOGY</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Incorporating Navigation Context into Inland Vessel Trajectory
+  Prediction: A Gaussian Mixture Model and <span class="highlight-title">Transformer</span> Approach 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.02344v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.02344v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Kathrin Donandt, Dirk Söffker
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Using data sources beyond the Automatic Identification System to represent
+the context a vessel is navigating in and consequently improve situation
+awareness is still rare in machine learning approaches to vessel trajectory
+prediction (VTP). In inland shipping, where vessel movement is constrained
+within fairways, navigational context information is indispensable. In this
+contribution targeting inland VTP, Gaussian Mixture Models (GMMs) are applied,
+on a fused dataset of AIS and discharge measurements, to generate multi-modal
+distribution curves, capturing typical lateral vessel positioning in the
+fairway and dislocation speeds along the waterway. By sampling the probability
+density curves of the GMMs, feature vectors are derived which are used,
+together with spatio-temporal vessel features and fairway geometries, as input
+to a VTP transformer model. The incorporation of these distribution features of
+both the current and forthcoming navigation context improves prediction
+accuracy. The superiority of the model over a previously proposed transformer
+model for inland VTP is shown. The novelty lies in the provision of
+preprocessed, statistics-based features representing the conditioned spatial
+context, rather than relying on the model to extract relevant features for the
+VTP task from contextual data. Oversimplification of the complexity of inland
+navigation patterns by assuming a single typical route or selecting specific
+clusters prior to model application is avoided by giving the model access to
+the entire distribution information. The methodology's generalizability is
+demonstrated through the usage of data of 3 distinct river sections. It can be
+integrated into an interaction-aware prediction framework, where insights into
+the positioning of the actual vessel behavior in the overall distribution at
+the current location and discharge can enhance trajectory prediction accuracy.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>To be published in Proceedings of the 27th International Conference
+  on Information Fusion (FUSION 2024)</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Cluster-Aware Similarity Diffusion for Instance Retrieval 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.02343v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.02343v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Jifei Luo, Hantao Yao, Changsheng Xu
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Diffusion-based re-ranking is a common method used for retrieving instances
+by performing similarity propagation in a nearest neighbor graph. However,
+existing techniques that construct the affinity graph based on pairwise
+instances can lead to the propagation of misinformation from outliers and other
+manifolds, resulting in inaccurate results. To overcome this issue, we propose
+a novel Cluster-Aware Similarity (CAS) diffusion for instance retrieval. The
+primary concept of CAS is to conduct similarity diffusion within local
+clusters, which can reduce the influence from other manifolds explicitly. To
+obtain a symmetrical and smooth similarity matrix, our Bidirectional Similarity
+Diffusion strategy introduces an inverse constraint term to the optimization
+objective of local cluster diffusion. Additionally, we have optimized a
+Neighbor-guided Similarity Smoothing approach to ensure similarity consistency
+among the local neighbors of each instance. Evaluations in instance retrieval
+and object re-identification validate the effectiveness of the proposed CAS,
+our code is publicly available.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Polynomial-Augmented Neural Networks (PANNs) with Weak Orthogonality
+  Constraints for Enhanced Function and PDE Approximation 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.02336v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.02336v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Madison Cooley, Shandian Zhe, Robert M. Kirby, Varun Shankar
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  We present polynomial-augmented neural networks (PANNs), a novel machine
+learning architecture that combines deep neural networks (DNNs) with a
+polynomial approximant. PANNs combine the strengths of DNNs (flexibility and
+efficiency in higher-dimensional approximation) with those of polynomial
+approximation (rapid convergence rates for smooth functions). To aid in both
+stable training and enhanced accuracy over a variety of problems, we present
+(1) a family of orthogonality constraints that impose mutual orthogonality
+between the polynomial and the DNN within a PANN; (2) a simple basis pruning
+approach to combat the curse of dimensionality introduced by the polynomial
+component; and (3) an adaptation of a polynomial preconditioning strategy to
+both DNNs and polynomials. We test the resulting architecture for its
+polynomial reproduction properties, ability to approximate both smooth
+functions and functions of limited smoothness, and as a method for the solution
+of partial differential equations (PDEs). Through these experiments, we
+demonstrate that PANNs offer superior approximation properties to DNNs for both
+regression and the numerical solution of PDEs, while also offering enhanced
+accuracy over both polynomial and DNN-based regression (each) when regressing
+functions with limited smoothness.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Towards Neural Architecture Search for Transfer Learning in 6G Networks 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.02333v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.02333v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Adam Orucu, Farnaz Moradi, Masoumeh Ebrahimi, Andreas Johnsson
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  The future 6G network is envisioned to be AI-native, and as such, ML models
+will be pervasive in support of optimizing performance, reducing energy
+consumption, and in coping with increasing complexity and heterogeneity. A key
+challenge is automating the process of finding optimal model architectures
+satisfying stringent requirements stemming from varying tasks, dynamicity and
+available resources in the infrastructure and deployment positions. In this
+paper, we describe and review the state-of-the-art in Neural Architecture
+Search and Transfer Learning and their applicability in networking. Further, we
+identify open research challenges and set directions with a specific focus on
+three main requirements with elements unique to the future network, namely
+combining NAS and TL, multi-objective search, and tabular data. Finally, we
+outline and discuss both near-term and long-term work ahead.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Extended Mind <span class="highlight-title">Transformer</span>s 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.02332v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.02332v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Phoebe Klett, Thomas Ahle
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Pre-trained language models demonstrate general intelligence and common
+sense, but long inputs quickly become a bottleneck for memorizing information
+at inference time. We resurface a simple method, Memorizing Transformers (Wu et
+al., 2022), that gives the model access to a bank of pre-computed memories. We
+show that it is possible to fix many of the shortcomings of the original
+method, such as the need for fine-tuning, by critically assessing how
+positional encodings should be updated for the keys and values retrieved. This
+intuitive method uses the model's own key/query system to select and attend to
+the most relevant memories at each generation step, rather than using external
+embeddings. We demonstrate the importance of external information being
+retrieved in a majority of decoder layers, contrary to previous work. We open
+source a new counterfactual long-range retrieval benchmark, and show that
+Extended Mind Transformers outperform today's state of the art by 6% on
+average.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ On Affine Homotopy between Language Encoders 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.02329v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.02329v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Robin SM Chan, Reda Boumasmoud, Anej Svete, Yuxin Ren, Qipeng Guo, Zhijing Jin, Shauli Ravfogel, Mrinmaya Sachan, Bernhard Schölkopf, Mennatallah El-Assady, Ryan Cotterell
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Pre-trained language encoders -- functions that represent text as vectors --
+are an integral component of many NLP tasks. We tackle a natural question in
+language encoder analysis: What does it mean for two encoders to be similar? We
+contend that a faithful measure of similarity needs to be \emph{intrinsic},
+that is, task-independent, yet still be informative of \emph{extrinsic}
+similarity -- the performance on downstream tasks. It is common to consider two
+encoders similar if they are \emph{homotopic}, i.e., if they can be aligned
+through some transformation. In this spirit, we study the properties of
+\emph{affine} alignment of language encoders and its implications on extrinsic
+similarity. We find that while affine alignment is fundamentally an asymmetric
+notion of similarity, it is still informative of extrinsic similarity. We
+confirm this on datasets of natural language representations. Beyond providing
+useful bounds on extrinsic similarity, affine intrinsic similarity also allows
+us to begin uncovering the structure of the space of pre-trained encoders by
+defining an order over them.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>10 pages</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Continual Unsupervised Out-of-Distribution Detection 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.02327v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.02327v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Lars Doorenbos, Raphael Sznitman, Pablo Márquez-Neila
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Deep learning models excel when the data distribution during training aligns
+with testing data. Yet, their performance diminishes when faced with
+out-of-distribution (OOD) samples, leading to great interest in the field of
+OOD detection. Current approaches typically assume that OOD samples originate
+from an unconcentrated distribution complementary to the training distribution.
+While this assumption is appropriate in the traditional unsupervised OOD
+(U-OOD) setting, it proves inadequate when considering the place of deployment
+of the underlying deep learning model. To better reflect this real-world
+scenario, we introduce the novel setting of continual U-OOD detection. To
+tackle this new setting, we propose a method that starts from a U-OOD detector,
+which is agnostic to the OOD distribution, and slowly updates during deployment
+to account for the actual OOD distribution. Our method uses a new U-OOD scoring
+function that combines the Mahalanobis distance with a nearest-neighbor
+approach. Furthermore, we design a confidence-scaled few-shot OOD detector that
+outperforms previous methods. We show our method greatly improves upon strong
+baselines from related fields.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ A <span class="highlight-title">Survey</span> of <span class="highlight-title">Transformer</span> Enabled Time Series Synthesis 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.02322v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.02322v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Alexander Sommers, Logan Cummins, Sudip Mittal, Shahram Rahimi, Maria Seale, Joseph Jaboure, Thomas Arnold
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Generative AI has received much attention in the image and language domains,
+with the transformer neural network continuing to dominate the state of the
+art. Application of these models to time series generation is less explored,
+however, and is of great utility to machine learning, privacy preservation, and
+explainability research. The present survey identifies this gap at the
+intersection of the transformer, generative AI, and time series data, and
+reviews works in this sparsely populated subdomain. The reviewed works show
+great variety in approach, and have not yet converged on a conclusive answer to
+the problems the domain poses. GANs, diffusion models, state space models, and
+autoencoders were all encountered alongside or surrounding the transformers
+which originally motivated the survey. While too open a domain to offer
+conclusive insights, the works surveyed are quite suggestive, and several
+recommendations for best practice, and suggestions of valuable future work, are
+provided.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ PeFAD: A Parameter-Efficient Federated Framework for Time Series Anomaly
+  Detection <span class="chip">KDD 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.02318v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.02318v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Ronghui Xu, Hao Miao, Senzhang Wang, Philip S. Yu, Jianxin Wang
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  With the proliferation of mobile sensing techniques, huge amounts of time
+series data are generated and accumulated in various domains, fueling plenty of
+real-world applications. In this setting, time series anomaly detection is
+practically important. It endeavors to identify deviant samples from the normal
+sample distribution in time series. Existing approaches generally assume that
+all the time series is available at a central location. However, we are
+witnessing the decentralized collection of time series due to the deployment of
+various edge devices. To bridge the gap between the decentralized time series
+data and the centralized anomaly detection algorithms, we propose a
+Parameter-efficient Federated Anomaly Detection framework named PeFAD with the
+increasing privacy concerns. PeFAD for the first time employs the pre-trained
+language model (PLM) as the body of the client's local model, which can benefit
+from its cross-modality knowledge transfer capability. To reduce the
+communication overhead and local model adaptation cost, we propose a
+parameter-efficient federated training module such that clients only need to
+fine-tune small-scale parameters and transmit them to the server for update.
+PeFAD utilizes a novel anomaly-driven mask selection strategy to mitigate the
+impact of neglected anomalies during training. A knowledge distillation
+operation on a synthetic privacy-preserving dataset that is shared by all the
+clients is also proposed to address the data heterogeneity issue across
+clients. We conduct extensive evaluations on four real datasets, where PeFAD
+outperforms existing state-of-the-art baselines by up to 28.74\%.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted by SIGKDD 2024 (Research Track)</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Generative Conditional Distributions by Neural (Entropic) Optimal
+  Transport 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.02317v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.02317v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Bao Nguyen, Binh Nguyen, Hieu Trung Nguyen, Viet Anh Nguyen
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Learning conditional distributions is challenging because the desired outcome
+is not a single distribution but multiple distributions that correspond to
+multiple instances of the covariates. We introduce a novel neural entropic
+optimal transport method designed to effectively learn generative models of
+conditional distributions, particularly in scenarios characterized by limited
+sample sizes. Our method relies on the minimax training of two neural networks:
+a generative network parametrizing the inverse cumulative distribution
+functions of the conditional distributions and another network parametrizing
+the conditional Kantorovich potential. To prevent overfitting, we regularize
+the objective function by penalizing the Lipschitz constant of the network
+output. Our experiments on real-world datasets show the effectiveness of our
+algorithm compared to state-of-the-art conditional distribution learning
+techniques. Our implementation can be found at
+https://github.com/nguyenngocbaocmt02/GENTLE.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>15 pages, 8 figures</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ An Independence-promoting Loss for Music Generation with Language Models <span class="chip">ICML 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.02315v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.02315v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Jean-Marie Lemercier, Simon Rouard, Jade Copet, Yossi Adi, Alexandre Déffosez
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Music generation schemes using language modeling rely on a vocabulary of
+audio tokens, generally provided as codes in a discrete latent space learnt by
+an auto-encoder. Multi-stage quantizers are often employed to produce these
+tokens, therefore the decoding strategy used for token prediction must be
+adapted to account for multiple codebooks: either it should model the joint
+distribution over all codebooks, or fit the product of the codebook marginal
+distributions. Modelling the joint distribution requires a costly increase in
+the number of auto-regressive steps, while fitting the product of the marginals
+yields an inexact model unless the codebooks are mutually independent. In this
+work, we introduce an independence-promoting loss to regularize the
+auto-encoder used as the tokenizer in language models for music generation. The
+proposed loss is a proxy for mutual information based on the maximum mean
+discrepancy principle, applied in reproducible kernel Hilbert spaces. Our
+criterion is simple to implement and train, and it is generalizable to other
+multi-stream codecs. We show that it reduces the statistical dependence between
+codebooks during auto-encoding. This leads to an increase in the generated
+music quality when modelling the product of the marginal distributions, while
+generating audio much faster than the joint distribution model.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted to ICML 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Neural Thermodynamic Integration: Free Energies from Energy-based
+  Diffusion Models 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.02313v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.02313v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Bálint Máté, François Fleuret, Tristan Bereau
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Thermodynamic integration (TI) offers a rigorous method for estimating
+free-energy differences by integrating over a sequence of interpolating
+conformational ensembles. However, TI calculations are computationally
+expensive and typically limited to coupling a small number of degrees of
+freedom due to the need to sample numerous intermediate ensembles with
+sufficient conformational-space overlap. In this work, we propose to perform TI
+along an alchemical pathway represented by a trainable neural network, which we
+term Neural TI. Critically, we parametrize a time-dependent Hamiltonian
+interpolating between the interacting and non-interacting systems, and optimize
+its gradient using a denoising-diffusion objective. The ability of the
+resulting energy-based diffusion model to sample all intermediate ensembles,
+allows us to perform TI from a single reference calculation. We apply our
+method to Lennard-Jones fluids, where we report accurate calculations of the
+excess chemical potential, demonstrating that Neural TI is capable of coupling
+hundreds of degrees of freedom at once.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Disentangled Representation via Variational AutoEncoder for Continuous
+  Treatment Effect Estimation 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.02310v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.02310v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Ruijing Cui, Jianbin Sun, Bingyu He, Kewei Yang, Bingfeng Ge
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Continuous treatment effect estimation holds significant practical importance
+across various decision-making and assessment domains, such as healthcare and
+the military. However, current methods for estimating dose-response curves
+hinge on balancing the entire representation by treating all covariates as
+confounding variables. Although various approaches disentangle covariates into
+different factors for treatment effect estimation, they are confined to binary
+treatment settings. Moreover, observational data are often tainted with
+non-causal noise information that is imperceptible to the human. Hence, in this
+paper, we propose a novel Dose-Response curve estimator via Variational
+AutoEncoder (DRVAE) disentangled covariates representation. Our model is
+dedicated to disentangling covariates into instrumental factors, confounding
+factors, adjustment factors, and external noise factors, thereby facilitating
+the estimation of treatment effects under continuous treatment settings by
+balancing the disentangled confounding factors. Extensive results on synthetic
+and semi-synthetic datasets demonstrate that our model outperforms the current
+state-of-the-art methods.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Effects of Exponential Gaussian Distribution on (Double Sampling)
+  Randomized Smoothing <span class="chip">ICML 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.02309v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.02309v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Youwei Shu, Xi Xiao, Derui Wang, Yuxin Cao, Siji Chen, Jason Xue, Linyi Li, Bo Li
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Randomized Smoothing (RS) is currently a scalable certified defense method
+providing robustness certification against adversarial examples. Although
+significant progress has been achieved in providing defenses against $\ell_p$
+adversaries, the interaction between the smoothing distribution and the
+robustness certification still remains vague. In this work, we comprehensively
+study the effect of two families of distributions, named Exponential Standard
+Gaussian (ESG) and Exponential General Gaussian (EGG) distributions, on
+Randomized Smoothing and Double Sampling Randomized Smoothing (DSRS). We derive
+an analytic formula for ESG's certified radius, which converges to the origin
+formula of RS as the dimension $d$ increases. Additionally, we prove that EGG
+can provide tighter constant factors than DSRS in providing $\Omega(\sqrt{d})$
+lower bounds of $\ell_2$ certified radius, and thus further addresses the curse
+of dimensionality in RS. Our experiments on real-world datasets confirm our
+theoretical analysis of the ESG distributions, that they provide almost the
+same certification under different exponents $\eta$ for both RS and DSRS. In
+addition, EGG
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>ICML 2024 Poster</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Node-Level Topological Representation Learning on Point Clouds 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.02300v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.02300v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Vincent P. Grande, Michael T. Schaub
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Topological Data Analysis (TDA) allows us to extract powerful topological and
+higher-order information on the global shape of a data set or point cloud.
+Tools like Persistent Homology or the Euler Transform give a single complex
+description of the global structure of the point cloud. However, common machine
+learning applications like classification require point-level information and
+features to be available. In this paper, we bridge this gap and propose a novel
+method to extract node-level topological features from complex point clouds
+using discrete variants of concepts from algebraic topology and differential
+geometry. We verify the effectiveness of these topological point features
+(TOPF) on both synthetic and real-world data and study their robustness under
+noise.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>30 pages, 10 figures, comments welcome</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Solving Partial Differential Equations in Different Domains by Operator
+  Learning method Based on Boundary Integral Equations 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.02298v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.02298v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Bin Meng, Yutong Lu, Ying Jiang
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  This article explores operator learning models that can deduce solutions to
+partial differential equations (PDEs) on arbitrary domains without requiring
+retraining. We introduce two innovative models rooted in boundary integral
+equations (BIEs): the Boundary Integral Type Deep Operator Network
+(BI-DeepONet) and the Boundary Integral Trigonometric Deep Operator Neural
+Network (BI-TDONet), which are crafted to address PDEs across diverse domains.
+Once fully trained, these BIE-based models adeptly predict the solutions of
+PDEs in any domain without the need for additional training. BI-TDONet notably
+enhances its performance by employing the singular value decomposition (SVD) of
+bounded linear operators, allowing for the efficient distribution of input
+functions across its modules. Furthermore, to tackle the issue of function
+sampling values that do not effectively capture oscillatory and impulse signal
+characteristics, trigonometric coefficients are utilized as both inputs and
+outputs in BI-TDONet. Our numerical experiments robustly support and confirm
+the efficacy of this theoretical framework.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Learning-Rate-Free Stochastic Optimization over Riemannian Manifolds <span class="chip">ICML 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.02296v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.02296v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Daniel Dodd, Louis Sharrock, Christopher Nemeth
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  In recent years, interest in gradient-based optimization over Riemannian
+manifolds has surged. However, a significant challenge lies in the reliance on
+hyperparameters, especially the learning rate, which requires meticulous tuning
+by practitioners to ensure convergence at a suitable rate. In this work, we
+introduce innovative learning-rate-free algorithms for stochastic optimization
+over Riemannian manifolds, eliminating the need for hand-tuning and providing a
+more robust and user-friendly approach. We establish high probability
+convergence guarantees that are optimal, up to logarithmic factors, compared to
+the best-known optimally tuned rate in the deterministic setting. Our approach
+is validated through numerical experiments, demonstrating competitive
+performance against learning-rate-dependent algorithms.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>ICML 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ How to Explore with Belief: State Entropy Maximization in POMDPs 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.02295v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.02295v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Riccardo Zamboni, Duilio Cirino, Marcello Restelli, Mirco Mutti
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Recent works have studied *state entropy maximization* in reinforcement
+learning, in which the agent's objective is to learn a policy inducing high
+entropy over states visitation (Hazan et al., 2019). They typically assume full
+observability of the state of the system, so that the entropy of the
+observations is maximized. In practice, the agent may only get *partial*
+observations, e.g., a robot perceiving the state of a physical space through
+proximity sensors and cameras. A significant mismatch between the entropy over
+observations and true states of the system can arise in those settings. In this
+paper, we address the problem of entropy maximization over the *true states*
+with a decision policy conditioned on partial observations *only*. The latter
+is a generalization of POMDPs, which is intractable in general. We develop a
+memory and computationally efficient *policy gradient* method to address a
+first-order relaxation of the objective defined on *belief* states, providing
+various formal characterizations of approximation gaps, the optimization
+landscape, and the *hallucination* problem. This paper aims to generalize state
+entropy maximization to more realistic domains that meet the challenges of
+applications.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Smaller Batches, Bigger Gains? Investigating the Impact of Batch Sizes
+  on Reinforcement Learning Based Real-World Production Scheduling 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.02294v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.02294v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Arthur Müller, Felix Grumbach, Matthia Sabatelli
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Production scheduling is an essential task in manufacturing, with
+Reinforcement Learning (RL) emerging as a key solution. In a previous work, RL
+was utilized to solve an extended permutation flow shop scheduling problem
+(PFSSP) for a real-world production line with two stages, linked by a central
+buffer. The RL agent was trained to sequence equallysized product batches to
+minimize setup efforts and idle times. However, the substantial impact caused
+by varying the size of these product batches has not yet been explored. In this
+follow-up study, we investigate the effects of varying batch sizes, exploring
+both the quality of solutions and the training dynamics of the RL agent. The
+results demonstrate that it is possible to methodically identify reasonable
+boundaries for the batch size. These boundaries are determined on one side by
+the increasing sample complexity associated with smaller batch sizes, and on
+the other side by the decreasing flexibility of the agent when dealing with
+larger batch sizes. This provides the practitioner the ability to make an
+informed decision regarding the selection of an appropriate batch size.
+Moreover, we introduce and investigate two new curriculum learning strategies
+to enable the training with small batch sizes. The findings of this work offer
+the potential for application in several industrial use cases with comparable
+scheduling problems.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>This paper was accepted at the ETFA 2024 conference</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Composite Quantile Regression With XGBoost Using the Novel Arctan
+  Pinball Loss 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.02293v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.02293v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Laurens Sluijterman, Frank Kreuwel, Eric Cator, Tom Heskes
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  This paper explores the use of XGBoost for composite quantile regression.
+XGBoost is a highly popular model renowned for its flexibility, efficiency, and
+capability to deal with missing data. The optimization uses a second order
+approximation of the loss function, complicating the use of loss functions with
+a zero or vanishing second derivative. Quantile regression -- a popular
+approach to obtain conditional quantiles when point estimates alone are
+insufficient -- unfortunately uses such a loss function, the pinball loss.
+Existing workarounds are typically inefficient and can result in severe
+quantile crossings. In this paper, we present a smooth approximation of the
+pinball loss, the arctan pinball loss, that is tailored to the needs of
+XGBoost. Specifically, contrary to other smooth approximations, the arctan
+pinball loss has a relatively large second derivative, which makes it more
+suitable to use in the second order approximation. Using this loss function
+enables the simultaneous prediction of multiple quantiles, which is more
+efficient and results in far fewer quantile crossings.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>24 pages, 9 figures</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ An Axiomatic Approach to Loss Aggregation and an Adapted Aggregating
+  Algorithm 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.02292v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.02292v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Armando J. Cabrera Pacheco, Rabanus Derr, Robert C. Williamson
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Supervised learning has gone beyond the expected risk minimization framework.
+Central to most of these developments is the introduction of more general
+aggregation functions for losses incurred by the learner. In this paper, we
+turn towards online learning under expert advice. Via easily justified
+assumptions we characterize a set of reasonable loss aggregation functions as
+quasi-sums. Based upon this insight, we suggest a variant of the Aggregating
+Algorithm tailored to these more general aggregation functions. This variant
+inherits most of the nice theoretical properties of the AA, such as recovery of
+Bayes' updating and a time-independent bound on quasi-sum regret. Finally, we
+argue that generalized aggregations express the attitude of the learner towards
+losses.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>31 pages</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ A Study of Optimizations for Fine-tuning Large Language Models 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.02290v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.02290v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Arjun Singh, Nikhil Pandey, Anup Shirgaonkar, Pavan Manoj, Vijay Aski
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Fine-tuning large language models is a popular choice among users trying to
+adapt them for specific applications. However, fine-tuning these models is a
+demanding task because the user has to examine several factors, such as
+resource budget, runtime, model size and context length among others. A
+specific challenge is that fine-tuning is memory intensive, imposing
+constraints on the required hardware memory and context length of training data
+that can be handled. In this work, we share a detailed study on a variety of
+fine-tuning optimizations across different fine-tuning scenarios. In
+particular, we assess Gradient Checkpointing, Low Rank Adaptation, DeepSpeed's
+ZeRO Redundancy Optimizer and Flash Attention. With a focus on memory and
+runtime, we examine the impact of different optimization combinations on GPU
+memory usage and execution runtime during fine-tuning phase. We provide
+recommendation on best default optimization for balancing memory and runtime
+across diverse model sizes. We share effective strategies for fine-tuning very
+large models with tens or hundreds of billions of parameters and enabling large
+context lengths during fine-tuning. Furthermore, we propose the appropriate
+optimization mixtures for fine-tuning under GPU resource limitations.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>10 pages, 4 figures</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Towards Supervised Performance on Speaker Verification with
+  <span class="highlight-title">Self-Supervised</span> Learning by Leveraging Large-Scale ASR Models <span class="chip">INTERSPEECH 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.02285v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.02285v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Victor Miara, Theo Lepage, Reda Dehak
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Recent advancements in Self-Supervised Learning (SSL) have shown promising
+results in Speaker Verification (SV). However, narrowing the performance gap
+with supervised systems remains an ongoing challenge. Several studies have
+observed that speech representations from large-scale ASR models contain
+valuable speaker information. This work explores the limitations of fine-tuning
+these models for SV using an SSL contrastive objective in an end-to-end
+approach. Then, we propose a framework to learn speaker representations in an
+SSL context by fine-tuning a pre-trained WavLM with a supervised loss using
+pseudo-labels. Initial pseudo-labels are derived from an SSL DINO-based model
+and are iteratively refined by clustering the model embeddings. Our method
+achieves 0.99% EER on VoxCeleb1-O, establishing the new state-of-the-art on
+self-supervised SV. As this performance is close to our supervised baseline of
+0.94% EER, this contribution is a step towards supervised performance on SV
+with SSL.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>accepted at INTERSPEECH 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Test-Time Regret Minimization in Meta Reinforcement Learning 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.02282v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.02282v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Mirco Mutti, Aviv Tamar
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Meta reinforcement learning sets a distribution over a set of tasks on which
+the agent can train at will, then is asked to learn an optimal policy for any
+test task efficiently. In this paper, we consider a finite set of tasks modeled
+through Markov decision processes with various dynamics. We assume to have
+endured a long training phase, from which the set of tasks is perfectly
+recovered, and we focus on regret minimization against the optimal policy in
+the unknown test task. Under a separation condition that states the existence
+of a state-action pair revealing a task against another, Chen et al. (2022)
+show that $O(M^2 \log(H))$ regret can be achieved, where $M, H$ are the number
+of tasks in the set and test episodes, respectively. In our first contribution,
+we demonstrate that the latter rate is nearly optimal by developing a novel
+lower bound for test-time regret minimization under separation, showing that a
+linear dependence with $M$ is unavoidable. Then, we present a family of
+stronger yet reasonable assumptions beyond separation, which we call strong
+identifiability, enabling algorithms achieving fast rates $\log (H)$ and
+sublinear dependence with $M$ simultaneously. Our paper provides a new
+understanding of the statistical barriers of test-time regret minimization and
+when fast rates can be achieved.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ A KL-based Analysis Framework with Applications to Non-Descent
+  Optimization Methods 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.02273v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.02273v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Junwen Qiu, Bohao Ma, Xiao Li, Andre Milzarek
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  We propose a novel analysis framework for non-descent-type optimization
+methodologies in nonconvex scenarios based on the Kurdyka-Lojasiewicz property.
+Our framework allows covering a broad class of algorithms, including those
+commonly employed in stochastic and distributed optimization. Specifically, it
+enables the analysis of first-order methods that lack a sufficient descent
+property and do not require access to full (deterministic) gradient
+information. We leverage this framework to establish, for the first time,
+iterate convergence and the corresponding rates for the decentralized gradient
+method and federated averaging under mild assumptions. Furthermore, based on
+the new analysis techniques, we show the convergence of the random reshuffling
+and stochastic gradient descent method without necessitating typical a priori
+bounded iterates assumptions.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>29 pages</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Graph Neural Networks Do Not Always Oversmooth 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.02269v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.02269v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Bastian Epping, Alexandre René, Moritz Helias, Michael T. Schaub
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Graph neural networks (GNNs) have emerged as powerful tools for processing
+relational data in applications. However, GNNs suffer from the problem of
+oversmoothing, the property that the features of all nodes exponentially
+converge to the same vector over layers, prohibiting the design of deep GNNs.
+In this work we study oversmoothing in graph convolutional networks (GCNs) by
+using their Gaussian process (GP) equivalence in the limit of infinitely many
+hidden features. By generalizing methods from conventional deep neural networks
+(DNNs), we can describe the distribution of features at the output layer of
+deep GCNs in terms of a GP: as expected, we find that typical parameter choices
+from the literature lead to oversmoothing. The theory, however, allows us to
+identify a new, nonoversmoothing phase: if the initial weights of the network
+have sufficiently large variance, GCNs do not oversmooth, and node features
+remain informative even at large depth. We demonstrate the validity of this
+prediction in finite-size GCNs by training a linear classifier on their output.
+Moreover, using the linearization of the GCN GP, we generalize the concept of
+propagation depth of information from DNNs to GCNs. This propagation depth
+diverges at the transition between the oversmoothing and non-oversmoothing
+phase. We test the predictions of our approach and find good agreement with
+finite-size GCNs. Initializing GCNs near the transition to the
+non-oversmoothing phase, we obtain networks which are both deep and expressive.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Analyzing the Benefits of Prototypes for Semi-Supervised Category
+  Learning 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.02268v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.02268v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Liyi Zhang, Logan Nelson, Thomas L. Griffiths
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Categories can be represented at different levels of abstraction, from
+prototypes focused on the most typical members to remembering all observed
+exemplars of the category. These representations have been explored in the
+context of supervised learning, where stimuli are presented with known category
+labels. We examine the benefits of prototype-based representations in a
+less-studied domain: semi-supervised learning, where agents must form
+unsupervised representations of stimuli before receiving category labels. We
+study this problem in a Bayesian unsupervised learning model called a
+variational auto-encoder, and we draw on recent advances in machine learning to
+implement a prior that encourages the model to use abstract prototypes to
+represent data. We apply this approach to image datasets and show that forming
+prototypes can improve semi-supervised category learning. Additionally, we
+study the latent embeddings of the models and show that these prototypes allow
+the models to form clustered representations without supervision, contributing
+to their success in downstream categorization performance.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>7 pages, 3 figures</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Reinforcement Learning with Lookahead Information 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.02258v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.02258v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Nadav Merlis
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  We study reinforcement learning (RL) problems in which agents observe the
+reward or transition realizations at their current state before deciding which
+action to take. Such observations are available in many applications, including
+transactions, navigation and more. When the environment is known, previous work
+shows that this lookahead information can drastically increase the collected
+reward. However, outside of specific applications, existing approaches for
+interacting with unknown environments are not well-adapted to these
+observations. In this work, we close this gap and design provably-efficient
+learning algorithms able to incorporate lookahead information. To achieve this,
+we perform planning using the empirical distribution of the reward and
+transition observations, in contrast to vanilla approaches that only rely on
+estimated expectations. We prove that our algorithms achieve tight regret
+versus a baseline that also has access to lookahead information - linearly
+increasing the amount of collected reward compared to agents that cannot handle
+lookahead information.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ MidiCaps -- A large-scale MIDI <span class="highlight-title">dataset</span> with text captions 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.02255v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.02255v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Jan Melechovsky, Abhinaba Roy, Dorien Herremans
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Generative models guided by text prompts are increasingly becoming more
+popular. However, no text-to-MIDI models currently exist, mostly due to the
+lack of a captioned MIDI dataset. This work aims to enable research that
+combines LLMs with symbolic music by presenting the first large-scale MIDI
+dataset with text captions that is openly available: MidiCaps. MIDI (Musical
+Instrument Digital Interface) files are a widely used format for encoding
+musical information. Their structured format captures the nuances of musical
+composition and has practical applications by music producers, composers,
+musicologists, as well as performers. Inspired by recent advancements in
+captioning techniques applied to various domains, we present a large-scale
+curated dataset of over 168k MIDI files accompanied by textual descriptions.
+Each MIDI caption succinctly describes the musical content, encompassing tempo,
+chord progression, time signature, instruments present, genre and mood; thereby
+facilitating multi-modal exploration and analysis. The dataset contains a mix
+of various genres, styles, and complexities, offering a rich source for
+training and evaluating models for tasks such as music information retrieval,
+music understanding and cross-modal translation. We provide detailed statistics
+about the dataset and have assessed the quality of the captions in an extensive
+listening study. We anticipate that this resource will stimulate further
+research in the intersection of music and natural language processing,
+fostering advancements in both fields.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Under review</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Description Boosting for Zero-Shot Entity and Relation Classification 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.02245v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.02245v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Gabriele Picco, Leopold Fuchs, Marcos Martínez Galindo, Alberto Purpura, Vanessa López, Hoang Thanh Lam
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Zero-shot entity and relation classification models leverage available
+external information of unseen classes -- e.g., textual descriptions -- to
+annotate input text data. Thanks to the minimum data requirement, Zero-Shot
+Learning (ZSL) methods have high value in practice, especially in applications
+where labeled data is scarce. Even though recent research in ZSL has
+demonstrated significant results, our analysis reveals that those methods are
+sensitive to provided textual descriptions of entities (or relations). Even a
+minor modification of descriptions can lead to a change in the decision
+boundary between entity (or relation) classes. In this paper, we formally
+define the problem of identifying effective descriptions for zero shot
+inference. We propose a strategy for generating variations of an initial
+description, a heuristic for ranking them and an ensemble method capable of
+boosting the predictions of zero-shot models through description enhancement.
+Empirical results on four different entity and relation classification datasets
+show that our proposed method outperform existing approaches and achieve new
+SOTA results on these datasets under the ZSL settings. The source code of the
+proposed solutions and the evaluation framework are open-sourced.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ On the Limitations of Fractal Dimension as a Measure of Generalization 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.02234v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.02234v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Charlie Tan, Inés García-Redondo, Qiquan Wang, Michael M. Bronstein, Anthea Monod
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Bounding and predicting the generalization gap of overparameterized neural
+networks remains a central open problem in theoretical machine learning. Neural
+network optimization trajectories have been proposed to possess fractal
+structure, leading to bounds and generalization measures based on notions of
+fractal dimension on these trajectories. Prominently, both the Hausdorff
+dimension and the persistent homology dimension have been proposed to correlate
+with generalization gap, thus serving as a measure of generalization. This work
+performs an extended evaluation of these topological generalization measures.
+We demonstrate that fractal dimension fails to predict generalization of models
+trained from poor initializations. We further identify that the $\ell^2$ norm
+of the final parameter iterate, one of the simplest complexity measures in
+learning theory, correlates more strongly with the generalization gap than
+these notions of fractal dimension. Finally, our study reveals the intriguing
+manifestation of model-wise double descent in persistent homology-based
+generalization measures. This work lays the ground for a deeper investigation
+of the causal relationships between fractal geometry, topological data
+analysis, and neural network optimization.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>17 pages, 6 figures</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Riemannian coordinate descent algorithms on matrix manifolds 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.02225v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.02225v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Andi Han, Pratik Jawanpuria, Bamdev Mishra
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Many machine learning applications are naturally formulated as optimization
+problems on Riemannian manifolds. The main idea behind Riemannian optimization
+is to maintain the feasibility of the variables while moving along a descent
+direction on the manifold. This results in updating all the variables at every
+iteration. In this work, we provide a general framework for developing
+computationally efficient coordinate descent (CD) algorithms on matrix
+manifolds that allows updating only a few variables at every iteration while
+adhering to the manifold constraint. In particular, we propose CD algorithms
+for various manifolds such as Stiefel, Grassmann, (generalized) hyperbolic,
+symplectic, and symmetric positive (semi)definite. While the cost per iteration
+of the proposed CD algorithms is low, we further develop a more efficient
+variant via a first-order approximation of the objective function. We analyze
+their convergence and complexity, and empirically illustrate their efficacy in
+several applications.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ SMCL: Saliency Masked Contrastive Learning for Long-tailed Recognition <span class="chip">ICASSP 2023</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.02223v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.02223v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Sanglee Park, Seung-won Hwang, Jungmin So
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Real-world data often follow a long-tailed distribution with a high imbalance
+in the number of samples between classes. The problem with training from
+imbalanced data is that some background features, common to all classes, can be
+unobserved in classes with scarce samples. As a result, this background
+correlates to biased predictions into ``major" classes. In this paper, we
+propose saliency masked contrastive learning, a new method that uses saliency
+masking and contrastive learning to mitigate the problem and improve the
+generalizability of a model. Our key idea is to mask the important part of an
+image using saliency detection and use contrastive learning to move the masked
+image towards minor classes in the feature space, so that background features
+present in the masked image are no longer correlated with the original class.
+Experiment results show that our method achieves state-of-the-art level
+performance on benchmark long-tailed datasets.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>accepted at ICASSP 2023</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ SLTrain: a sparse plus low-rank approach for parameter and memory
+  efficient <span class="highlight-title">pretrain</span>ing 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.02214v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.02214v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Andi Han, Jiaxiang Li, Wei Huang, Mingyi Hong, Akiko Takeda, Pratik Jawanpuria, Bamdev Mishra
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Large language models (LLMs) have shown impressive capabilities across
+various tasks. However, training LLMs from scratch requires significant
+computational power and extensive memory capacity. Recent studies have explored
+low-rank structures on weights for efficient fine-tuning in terms of parameters
+and memory, either through low-rank adaptation or factorization. While
+effective for fine-tuning, low-rank structures are generally less suitable for
+pretraining because they restrict parameters to a low-dimensional subspace. In
+this work, we propose to parameterize the weights as a sum of low-rank and
+sparse matrices for pretraining, which we call SLTrain. The low-rank component
+is learned via matrix factorization, while for the sparse component, we employ
+a simple strategy of uniformly selecting the sparsity support at random and
+learning only the non-zero entries with the fixed support. While being simple,
+the random fixed-support sparse learning strategy significantly enhances
+pretraining when combined with low-rank learning. Our results show that SLTrain
+adds minimal extra parameters and memory costs compared to pretraining with
+low-rank parameterization, yet achieves substantially better performance, which
+is comparable to full-rank training. Remarkably, when combined with
+quantization and per-layer updates, SLTrain can reduce memory requirements by
+up to 73% when pretraining the LLaMA 7B model.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Rectifying Reinforcement Learning for Reward Matching 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.02213v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.02213v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Haoran He, Emmanuel Bengio, Qingpeng Cai, Ling Pan
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  The Generative Flow Network (GFlowNet) is a probabilistic framework in which
+an agent learns a stochastic policy and flow functions to sample objects with
+probability proportional to an unnormalized reward function. GFlowNets share a
+strong resemblance to reinforcement learning (RL), that typically aims to
+maximize reward, due to their sequential decision-making processes. Recent
+works have studied connections between GFlowNets and maximum entropy (MaxEnt)
+RL, which modifies the standard objective of RL agents by learning an
+entropy-regularized objective. However, a critical theoretical gap persists:
+despite the apparent similarities in their sequential decision-making nature, a
+direct link between GFlowNets and standard RL has yet to be discovered, while
+bridging this gap could further unlock the potential of both fields. In this
+paper, we establish a new connection between GFlowNets and policy evaluation
+for a uniform policy. Surprisingly, we find that the resulting value function
+for the uniform policy has a close relationship to the flows in GFlowNets.
+Leveraging these insights, we further propose a novel rectified policy
+evaluation (RPE) algorithm, which achieves the same reward-matching effect as
+GFlowNets, offering a new perspective. We compare RPE, MaxEnt RL, and GFlowNets
+in a number of benchmarks, and show that RPE achieves competitive results
+compared to previous approaches. This work sheds light on the previously
+unexplored connection between (non-MaxEnt) RL and GFlowNets, potentially
+opening new avenues for future research in both fields.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ The Deep Latent Space Particle Filter for Real-Time Data Assimilation
+  with Uncertainty Quantification 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.02204v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.02204v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Nikolaj T. Mücke, Sander M. Bohté, Cornelis W. Oosterlee
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  In Data Assimilation, observations are fused with simulations to obtain an
+accurate estimate of the state and parameters for a given physical system.
+Combining data with a model, however, while accurately estimating uncertainty,
+is computationally expensive and infeasible to run in real-time for complex
+systems. Here, we present a novel particle filter methodology, the Deep Latent
+Space Particle filter or D-LSPF, that uses neural network-based surrogate
+models to overcome this computational challenge. The D-LSPF enables filtering
+in the low-dimensional latent space obtained using Wasserstein AEs with
+modified vision transformer layers for dimensionality reduction and
+transformers for parameterized latent space time stepping. As we demonstrate on
+three test cases, including leak localization in multi-phase pipe flow and
+seabed identification for fully nonlinear water waves, the D-LSPF runs orders
+of magnitude faster than a high-fidelity particle filter and 3-5 times faster
+than alternative methods while being up to an order of magnitude more accurate.
+The D-LSPF thus enables real-time data assimilation with uncertainty
+quantification for physical systems.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ On the Recoverability of Causal Relations from Temporally Aggregated
+  I.I.D. Data 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.02191v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.02191v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Shunxing Fan, Mingming Gong, Kun Zhang
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  We consider the effect of temporal aggregation on instantaneous
+(non-temporal) causal discovery in general setting. This is motivated by the
+observation that the true causal time lag is often considerably shorter than
+the observational interval. This discrepancy leads to high aggregation, causing
+time-delay causality to vanish and instantaneous dependence to manifest.
+Although we expect such instantaneous dependence has consistency with the true
+causal relation in certain sense to make the discovery results meaningful, it
+remains unclear what type of consistency we need and when will such consistency
+be satisfied. We proposed functional consistency and conditional independence
+consistency in formal way correspond functional causal model-based methods and
+conditional independence-based methods respectively and provide the conditions
+under which these consistencies will hold. We show theoretically and
+experimentally that causal discovery results may be seriously distorted by
+aggregation especially in complete nonlinear case and we also find causal
+relationship still recoverable from aggregated data if we have partial
+linearity or appropriate prior. Our findings suggest community should take a
+cautious and meticulous approach when interpreting causal discovery results
+from such data and show why and when aggregation will distort the performance
+of causal discovery methods.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Fast and Scalable Multi-Kernel Encoder Classifier 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.02189v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.02189v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Cencheng Shen
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  This paper introduces a new kernel-based classifier by viewing kernel
+matrices as generalized graphs and leveraging recent progress in graph
+embedding techniques. The proposed method facilitates fast and scalable kernel
+matrix embedding, and seamlessly integrates multiple kernels to enhance the
+learning process. Our theoretical analysis offers a population-level
+characterization of this approach using random variables. Empirically, our
+method demonstrates superior running time compared to standard approaches such
+as support vector machines and two-layer neural network, while achieving
+comparable classification accuracy across various simulated and real datasets.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>12 pages main + 3 pages appendix</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ DNCs Require More Planning Steps 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.02187v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.02187v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Yara Shamshoum, Nitzan Hodos, Yuval Sieradzki, Assaf Schuster
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Many recent works use machine learning models to solve various complex
+algorithmic problems. However, these models attempt to reach a solution without
+considering the problem's required computational complexity, which can be
+detrimental to their ability to solve it correctly. In this work we investigate
+the effect of computational time and memory on generalization of implicit
+algorithmic solvers. To do so, we focus on the Differentiable Neural Computer
+(DNC), a general problem solver that also lets us reason directly about its
+usage of time and memory. In this work, we argue that the number of planning
+steps the model is allowed to take, which we call "planning budget", is a
+constraint that can cause the model to generalize poorly and hurt its ability
+to fully utilize its external memory. We evaluate our method on Graph Shortest
+Path, Convex Hull, Graph MinCut and Associative Recall, and show how the
+planning budget can drastically change the behavior of the learned algorithm,
+in terms of learned time complexity, training time, stability and
+generalization to inputs larger than those seen during training.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ On The Statistical Representation Properties Of The Perturb-Softmax And
+  The Perturb-Argmax Probability Distributions 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.02180v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.02180v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Hedda Cohen Indelman, Tamir Hazan
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  The Gumbel-Softmax probability distribution allows learning discrete tokens
+in generative learning, while the Gumbel-Argmax probability distribution is
+useful in learning discrete structures in discriminative learning. Despite the
+efforts invested in optimizing these probability models, their statistical
+properties are under-explored. In this work, we investigate their
+representation properties and determine for which families of parameters these
+probability distributions are complete, i.e., can represent any probability
+distribution, and minimal, i.e., can represent a probability distribution
+uniquely. We rely on convexity and differentiability to determine these
+statistical conditions and extend this framework to general probability models,
+such as Gaussian-Softmax and Gaussian-Argmax. We experimentally validate the
+qualities of these extensions, which enjoy a faster convergence rate. We
+conclude the analysis by identifying two sets of parameters that satisfy these
+assumptions and thus admit a complete and minimal representation. Our
+contribution is theoretical with supporting practical evaluation.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ One-Shot Federated Learning with Bayesian Pseudocoresets 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.02177v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.02177v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Tim d'Hondt, Mykola Pechenizkiy, Robert Peharz
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Optimization-based techniques for federated learning (FL) often come with
+prohibitive communication cost, as high dimensional model parameters need to be
+communicated repeatedly between server and clients. In this paper, we follow a
+Bayesian approach allowing to perform FL with one-shot communication, by
+solving the global inference problem as a product of local client posteriors.
+For models with multi-modal likelihoods, such as neural networks, a naive
+application of this scheme is hampered, since clients will capture different
+posterior modes, causing a destructive collapse of the posterior on the server
+side. Consequently, we explore approximate inference in the function-space
+representation of client posteriors, hence suffering less or not at all from
+multi-modality. We show that distributed function-space inference is tightly
+related to learning Bayesian pseudocoresets and develop a tractable Bayesian FL
+algorithm on this insight. We show that this approach achieves prediction
+performance competitive to state-of-the-art while showing a striking reduction
+in communication cost of up to two orders of magnitude. Moreover, due to its
+Bayesian nature, our method also delivers well-calibrated uncertainty
+estimates.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>10 pages</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ AROMA: Preserving Spatial Structure for Latent PDE Modeling with Local
+  Neural Fields 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.02176v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.02176v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Louis Serrano, Thomas X Wang, Etienne Le Naour, Jean-Noël Vittaut, Patrick Gallinari
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  We present AROMA (Attentive Reduced Order Model with Attention), a framework
+designed to enhance the modeling of partial differential equations (PDEs) using
+local neural fields. Our flexible encoder-decoder architecture can obtain
+smooth latent representations of spatial physical fields from a variety of data
+types, including irregular-grid inputs and point clouds. This versatility
+eliminates the need for patching and allows efficient processing of diverse
+geometries. The sequential nature of our latent representation can be
+interpreted spatially and permits the use of a conditional transformer for
+modeling the temporal dynamics of PDEs. By employing a diffusion-based
+formulation, we achieve greater stability and enable longer rollouts compared
+to conventional MSE training. AROMA's superior performance in simulating 1D and
+2D equations underscores the efficacy of our approach in capturing complex
+dynamical behaviors.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Branches: A Fast Dynamic Programming and Branch & Bound Algorithm for
+  Optimal Decision Trees 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.02175v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.02175v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Ayman Chaouki, Jesse Read, Albert Bifet
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Decision Tree Learning is a fundamental problem for Interpretable Machine
+Learning, yet it poses a formidable optimization challenge. Despite numerous
+efforts dating back to the early 1990's, practical algorithms have only
+recently emerged, primarily leveraging Dynamic Programming (DP) and Branch &
+Bound (B&B) techniques. These breakthroughs led to the development of two
+distinct approaches. Algorithms like DL8.5 and MurTree operate on the space of
+nodes (or branches), they are very fast, but do not penalise complex Decision
+Trees, i.e. they do not solve for sparsity. On the other hand, algorithms like
+OSDT and GOSDT operate on the space of Decision Trees, they solve for sparsity
+but at the detriment of speed. In this work, we introduce Branches, a novel
+algorithm that integrates the strengths of both paradigms. Leveraging DP and
+B&B, Branches achieves exceptional speed while also solving for sparsity.
+Central to its efficiency is a novel analytical bound enabling substantial
+pruning of the search space. Theoretical analysis demonstrates that Branches
+has lower complexity compared to state-of-the-art methods, a claim validated
+through extensive empirical evaluation. Our results illustrate that Branches
+not only greatly outperforms existing approaches in terms of speed and number
+of iterations, it also consistently yields optimal Decision Trees.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>This preprint is currently under review</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Can We Remove the Square-Root in Adaptive Gradient Methods? A
+  Second-Order Perspective <span class="chip">ICML 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2402.03496v4">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2402.03496v4.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Wu Lin, Felix Dangel, Runa Eschenhagen, Juhan Bae, Richard E. Turner, Alireza Makhzani
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Adaptive gradient optimizers like Adam(W) are the default training algorithms
+for many deep learning architectures, such as transformers. Their diagonal
+preconditioner is based on the gradient outer product which is incorporated
+into the parameter update via a square root. While these methods are often
+motivated as approximate second-order methods, the square root represents a
+fundamental difference. In this work, we investigate how the behavior of
+adaptive methods changes when we remove the root, i.e. strengthen their
+second-order motivation. Surprisingly, we find that such square-root-free
+adaptive methods close the generalization gap to SGD on convolutional
+architectures, while maintaining their root-based counterpart's performance on
+transformers. The second-order perspective also has practical benefits for the
+development of non-diagonal adaptive methods through the concept of
+preconditioner invariance. In contrast to root-based methods like Shampoo, the
+root-free counterparts do not require numerically unstable matrix root
+decompositions and inversions, thus work well in half precision. Our findings
+provide new insights into the development of adaptive methods and raise
+important questions regarding the currently overlooked role of adaptivity for
+their success.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>A long version of the ICML 2024 paper. Updated Sec 4 to emphasize the
+  concept of preconditioner invariance</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ State-Constrained Zero-Sum Differential Games with One-Sided Information <span class="chip">ICML 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2403.02741v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2403.02741v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Mukesh Ghimire, Lei Zhang, Zhe Xu, Yi Ren
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  We study zero-sum differential games with state constraints and one-sided
+information, where the informed player (Player 1) has a categorical payoff type
+unknown to the uninformed player (Player 2). The goal of Player 1 is to
+minimize his payoff without violating the constraints, while that of Player 2
+is to violate the state constraints if possible, or to maximize the payoff
+otherwise. One example of the game is a man-to-man matchup in football. Without
+state constraints, Cardaliaguet (2007) showed that the value of such a game
+exists and is convex to the common belief of players. Our theoretical
+contribution is an extension of this result to games with state constraints and
+the derivation of the primal and dual subdynamic principles necessary for
+computing behavioral strategies. Different from existing works that are
+concerned about the scalability of no-regret learning in games with discrete
+dynamics, our study reveals the underlying structure of strategies for belief
+manipulation resulting from information asymmetry and state constraints. This
+structure will be necessary for scalable learning on games with continuous
+actions and long time windows. We use a simplified football game to demonstrate
+the utility of this work, where we reveal player positions and belief states in
+which the attacker should (or should not) play specific random deceptive moves
+to take advantage of information asymmetry, and compute how the defender should
+respond.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted to ICML 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Momentum Particle Maximum Likelihood <span class="chip">ICML 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2312.07335v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2312.07335v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Jen Ning Lim, Juan Kuntz, Samuel Power, Adam M. Johansen
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Maximum likelihood estimation (MLE) of latent variable models is often recast
+as the minimization of a free energy functional over an extended space of
+parameters and probability distributions. This perspective was recently
+combined with insights from optimal transport to obtain novel particle-based
+algorithms for fitting latent variable models to data. Drawing inspiration from
+prior works which interpret `momentum-enriched' optimization algorithms as
+discretizations of ordinary differential equations, we propose an analogous
+dynamical-systems-inspired approach to minimizing the free energy functional.
+The result is a dynamical system that blends elements of Nesterov's Accelerated
+Gradient method, the underdamped Langevin diffusion, and particle methods.
+Under suitable assumptions, we prove that the continuous-time system minimizes
+the functional. By discretizing the system, we obtain a practical algorithm for
+MLE in latent variable models. The algorithm outperforms existing particle
+methods in numerical experiments and compares favourably with other MLE
+algorithms.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>ICML 2024 camera ready</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Comparing Graph <span class="highlight-title">Transformer</span>s via Positional Encodings <span class="chip">ICML 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2402.14202v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2402.14202v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Mitchell Black, Zhengchao Wan, Gal Mishne, Amir Nayyeri, Yusu Wang
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  The distinguishing power of graph transformers is closely tied to the choice
+of positional encoding: features used to augment the base transformer with
+information about the graph. There are two primary types of positional
+encoding: absolute positional encodings (APEs) and relative positional
+encodings (RPEs). APEs assign features to each node and are given as input to
+the transformer. RPEs instead assign a feature to each pair of nodes, e.g.,
+graph distance, and are used to augment the attention block. A priori, it is
+unclear which method is better for maximizing the power of the resulting graph
+transformer. In this paper, we aim to understand the relationship between these
+different types of positional encodings. Interestingly, we show that graph
+transformers using APEs and RPEs are equivalent in terms of distinguishing
+power. In particular, we demonstrate how to interchange APEs and RPEs while
+maintaining their distinguishing power in terms of graph transformers. Based on
+our theoretical results, we provide a study on several APEs and RPEs (including
+the resistance distance and the recently introduced stable and expressive
+positional encoding (SPE)) and compare their distinguishing power in terms of
+transformers. We believe our work will help navigate the huge number of choices
+of positional encoding and will provide guidance on the future design of
+positional encodings for graph transformers.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>accepted to ICML 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Sample Complexity of Algorithm Selection Using Neural Networks and Its
+  Applications to Branch-and-Cut 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2402.02328v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2402.02328v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Hongyu Cheng, Sammy Khalife, Barbara Fiedorowicz, Amitabh Basu
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Data-driven algorithm design is a paradigm that uses statistical and machine
+learning techniques to select from a class of algorithms for a computational
+problem an algorithm that has the best expected performance with respect to
+some (unknown) distribution on the instances of the problem. We build upon
+recent work in this line of research by considering the setup where, instead of
+selecting a single algorithm that has the best performance, we allow the
+possibility of selecting an algorithm based on the instance to be solved, using
+neural networks. In particular, given a representative sample of instances, we
+learn a neural network that maps an instance of the problem to the most
+appropriate algorithm for that instance. We formalize this idea and derive
+rigorous sample complexity bounds for this learning problem, in the spirit of
+recent work in data-driven algorithm design. We then apply this approach to the
+problem of making good decisions in the branch-and-cut framework for
+mixed-integer optimization (e.g., which cut to add?). In other words, the
+neural network will take as input a mixed-integer optimization instance and
+output a decision that will result in a small branch-and-cut tree for that
+instance. Our computational results provide evidence that our particular way of
+using neural networks for cut selection can make a significant impact in
+reducing branch-and-cut tree sizes, compared to previous data-driven
+approaches.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ COMQ: A Backpropagation-Free Algorithm for Post-Training Quantization 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2403.07134v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2403.07134v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Aozhong Zhang, Zi Yang, Naigang Wang, Yingyong Qin, Jack Xin, Xin Li, Penghang Yin
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Post-training quantization (PTQ) has emerged as a practical approach to
+compress large neural networks, making them highly efficient for deployment.
+However, effectively reducing these models to their low-bit counterparts
+without compromising the original accuracy remains a key challenge. In this
+paper, we propose an innovative PTQ algorithm termed COMQ, which sequentially
+conducts coordinate-wise minimization of the layer-wise reconstruction errors.
+We consider the widely used integer quantization, where every quantized weight
+can be decomposed into a shared floating-point scalar and an integer bit-code.
+Within a fixed layer, COMQ treats all the scaling factor(s) and bit-codes as
+the variables of the reconstruction error. Every iteration improves this error
+along a single coordinate while keeping all other variables constant. COMQ is
+easy to use and requires no hyper-parameter tuning. It instead involves only
+dot products and rounding operations. We update these variables in a carefully
+designed greedy order, significantly enhancing the accuracy. COMQ achieves
+remarkable results in quantizing 4-bit Vision Transformers, with a negligible
+loss of less than 1% in Top-1 accuracy. In 4-bit INT quantization of
+convolutional neural networks, COMQ maintains near-lossless accuracy with a
+minimal drop of merely 0.3% in Top-1 accuracy.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Fast Decision Boundary based Out-of-Distribution Detector <span class="chip">ICML 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2312.11536v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2312.11536v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Litian Liu, Yao Qin
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Efficient and effective Out-of-Distribution (OOD) detection is essential for
+the safe deployment of AI systems. Existing feature space methods, while
+effective, often incur significant computational overhead due to their reliance
+on auxiliary models built from training features. In this paper, we propose a
+computationally-efficient OOD detector without using auxiliary models while
+still leveraging the rich information embedded in the feature space.
+Specifically, we detect OOD samples based on their feature distances to
+decision boundaries. To minimize computational cost, we introduce an efficient
+closed-form estimation, analytically proven to tightly lower bound the
+distance. Based on our estimation, we discover that In-Distribution (ID)
+features tend to be further from decision boundaries than OOD features.
+Additionally, ID and OOD samples are better separated when compared at equal
+deviation levels from the mean of training features. By regularizing the
+distances to decision boundaries based on feature deviation from the mean, we
+develop a hyperparameter-free, auxiliary model-free OOD detector. Our method
+matches or surpasses the effectiveness of state-of-the-art methods in extensive
+experiments while incurring negligible overhead in inference latency. Overall,
+our approach significantly improves the efficiency-effectiveness trade-off in
+OOD detection. Code is available at: https://github.com/litianliu/fDBD-OOD.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>ICML 2024 main conference paper</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ How Smooth Is Attention? <span class="chip">ICML 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2312.14820v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2312.14820v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Valérie Castin, Pierre Ablin, Gabriel Peyré
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Self-attention and masked self-attention are at the heart of Transformers'
+outstanding success. Still, our mathematical understanding of attention, in
+particular of its Lipschitz properties - which are key when it comes to
+analyzing robustness and expressive power - is incomplete. We provide a
+detailed study of the Lipschitz constant of self-attention in several practical
+scenarios, discussing the impact of the sequence length $n$ and layer
+normalization on the local Lipschitz constant of both unmasked and masked
+self-attention. In particular, we show that for inputs of length $n$ in any
+compact set, the Lipschitz constant of self-attention is bounded by $\sqrt{n}$
+up to a constant factor and that this bound is tight for reasonable sequence
+lengths. When the sequence length $n$ is too large for the previous bound to be
+tight, which we refer to as the mean-field regime, we provide an upper bound
+and a matching lower bound which are independent of $n$. Our mean-field
+framework for masked self-attention is novel and of independent interest. Our
+experiments on pretrained and randomly initialized BERT and GPT-2 support our
+theoretical findings.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted at ICML 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Differentially Private Decentralized Learning with Random Walks <span class="chip">ICML 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2402.07471v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2402.07471v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Edwige Cyffers, Aurélien Bellet, Jalaj Upadhyay
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  The popularity of federated learning comes from the possibility of better
+scalability and the ability for participants to keep control of their data,
+improving data security and sovereignty. Unfortunately, sharing model updates
+also creates a new privacy attack surface. In this work, we characterize the
+privacy guarantees of decentralized learning with random walk algorithms, where
+a model is updated by traveling from one node to another along the edges of a
+communication graph. Using a recent variant of differential privacy tailored to
+the study of decentralized algorithms, namely Pairwise Network Differential
+Privacy, we derive closed-form expressions for the privacy loss between each
+pair of nodes where the impact of the communication topology is captured by
+graph theoretic quantities. Our results further reveal that random walk
+algorithms tends to yield better privacy guarantees than gossip algorithms for
+nodes close from each other. We supplement our theoretical results with
+empirical evaluation on synthetic and real-world graphs and datasets.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted to ICML 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ ENOT: Expectile Regularization for Fast and Accurate Training of Neural
+  Optimal Transport 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2403.03777v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2403.03777v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Nazar Buzun, Maksim Bobrin, Dmitry V. Dylov
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  We present a new approach for Neural Optimal Transport (NOT) training
+procedure, capable of accurately and efficiently estimating optimal
+transportation plan via specific regularization on dual Kantorovich potentials.
+The main bottleneck of existing NOT solvers is associated with the procedure of
+finding a near-exact approximation of the conjugate operator (i.e., the
+c-transform), which is done either by optimizing over non-convex max-min
+objectives or by the computationally intensive fine-tuning of the initial
+approximated prediction. We resolve both issues by proposing a new,
+theoretically justified loss in the form of expectile regularisation which
+enforces binding conditions on the learning process of dual potentials. Such a
+regularization provides the upper bound estimation over the distribution of
+possible conjugate potentials and makes the learning stable, completely
+eliminating the need for additional extensive fine-tuning. Proposed method,
+called Expectile-Regularised Neural Optimal Transport (ENOT), outperforms
+previous state-of-the-art approaches on the established Wasserstein-2 benchmark
+tasks by a large margin (up to a 3-fold improvement in quality and up to a
+10-fold improvement in runtime). Moreover, we showcase performance of ENOT for
+varying cost functions on different tasks such as image generation, showing
+robustness of proposed algorithm.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Temporal Difference Learning with Compressed Updates: Error-Feedback
+  meets Reinforcement Learning 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2301.00944v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2301.00944v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Aritra Mitra, George J. Pappas, Hamed Hassani
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  In large-scale distributed machine learning, recent works have studied the
+effects of compressing gradients in stochastic optimization to alleviate the
+communication bottleneck. These works have collectively revealed that
+stochastic gradient descent (SGD) is robust to structured perturbations such as
+quantization, sparsification, and delays. Perhaps surprisingly, despite the
+surge of interest in multi-agent reinforcement learning, almost nothing is
+known about the analogous question: Are common reinforcement learning (RL)
+algorithms also robust to similar perturbations? We investigate this question
+by studying a variant of the classical temporal difference (TD) learning
+algorithm with a perturbed update direction, where a general compression
+operator is used to model the perturbation. Our work makes three important
+technical contributions. First, we prove that compressed TD algorithms, coupled
+with an error-feedback mechanism used widely in optimization, exhibit the same
+non-asymptotic theoretical guarantees as their SGD counterparts. Second, we
+show that our analysis framework extends seamlessly to nonlinear stochastic
+approximation schemes that subsume Q-learning. Third, we prove that for
+multi-agent TD learning, one can achieve linear convergence speedups with
+respect to the number of agents while communicating just $\tilde{O}(1)$ bits
+per iteration. Notably, these are the first finite-time results in RL that
+account for general compression operators and error-feedback in tandem with
+linear function approximation and Markovian sampling. Our proofs hinge on the
+construction of novel Lyapunov functions that capture the dynamics of a memory
+variable introduced by error-feedback.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted to Transactions on Machine Learning Research</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Approximate Nearest Neighbor Search with Window Filters 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2402.00943v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2402.00943v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Joshua Engels, Benjamin Landrum, Shangdi Yu, Laxman Dhulipala, Julian Shun
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  We define and investigate the problem of $\textit{c-approximate window
+search}$: approximate nearest neighbor search where each point in the dataset
+has a numeric label, and the goal is to find nearest neighbors to queries
+within arbitrary label ranges. Many semantic search problems, such as image and
+document search with timestamp filters, or product search with cost filters,
+are natural examples of this problem. We propose and theoretically analyze a
+modular tree-based framework for transforming an index that solves the
+traditional c-approximate nearest neighbor problem into a data structure that
+solves window search. On standard nearest neighbor benchmark datasets equipped
+with random label values, adversarially constructed embeddings, and image
+search embeddings with real timestamps, we obtain up to a $75\times$ speedup
+over existing solutions at the same level of recall.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Code available: https://github.com/JoshEngels/RangeFilteredANN</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Bringing motion taxonomies to continuous domains via GPLVM on hyperbolic
+  manifolds <span class="chip">ICML</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2210.01672v4">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2210.01672v4.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Noémie Jaquier, Leonel Rozo, Miguel González-Duque, Viacheslav Borovitskiy, Tamim Asfour
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Human motion taxonomies serve as high-level hierarchical abstractions that
+classify how humans move and interact with their environment. They have proven
+useful to analyse grasps, manipulation skills, and whole-body support poses.
+Despite substantial efforts devoted to design their hierarchy and underlying
+categories, their use remains limited. This may be attributed to the lack of
+computational models that fill the gap between the discrete hierarchical
+structure of the taxonomy and the high-dimensional heterogeneous data
+associated to its categories. To overcome this problem, we propose to model
+taxonomy data via hyperbolic embeddings that capture the associated
+hierarchical structure. We achieve this by formulating a novel Gaussian process
+hyperbolic latent variable model that incorporates the taxonomy structure
+through graph-based priors on the latent space and distance-preserving back
+constraints. We validate our model on three different human motion taxonomies
+to learn hyperbolic embeddings that faithfully preserve the original graph
+structure. We show that our model properly encodes unseen data from existing or
+new taxonomy categories, and outperforms its Euclidean and VAE-based
+counterparts. Finally, through proof-of-concept experiments, we show that our
+model may be used to generate realistic trajectories between the learned
+embeddings.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Intl. Conference on Machine Learning (ICML), 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Nearly Minimax Optimal Regret for Multinomial Logistic Bandit 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.09831v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.09831v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Joongkyu Lee, Min-hwan Oh
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  In this paper, we study the contextual multinomial logit (MNL) bandit problem
+in which a learning agent sequentially selects an assortment based on
+contextual information, and user feedback follows an MNL choice model. There
+has been a significant discrepancy between lower and upper regret bounds,
+particularly regarding the feature dimension $d$ and the maximum assortment
+size $K$. Additionally, the variation in reward structures between these bounds
+complicates the quest for optimality. Under uniform rewards, where all items
+have the same expected reward, we establish a regret lower bound of
+$\Omega(d\sqrt{\smash[b]{T/K}})$ and propose a constant-time algorithm,
+OFU-MNL+, that achieves a matching upper bound of
+$\tilde{O}(d\sqrt{\smash[b]{T/K}})$. Under non-uniform rewards, we prove a
+lower bound of $\Omega(d\sqrt{T})$ and an upper bound of
+$\tilde{O}(d\sqrt{T})$, also achievable by OFU-MNL+. Our empirical studies
+support these theoretical findings. To the best of our knowledge, this is the
+first work in the contextual MNL bandit literature to prove minimax optimality
+-- for either uniform or non-uniform reward setting -- and to propose a
+computationally efficient algorithm that achieves this optimality up to
+logarithmic factors.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Preprint. Under review</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ ExGRG: Explicitly-Generated Relation Graph for <span class="highlight-title">Self-Supervised</span>
+  Representation Learning 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2402.06737v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2402.06737v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Mahdi Naseri, Mahdi Biparva
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Self-supervised Learning (SSL) has emerged as a powerful technique in
+pre-training deep learning models without relying on expensive annotated
+labels, instead leveraging embedded signals in unlabeled data. While SSL has
+shown remarkable success in computer vision tasks through intuitive data
+augmentation, its application to graph-structured data poses challenges due to
+the semantic-altering and counter-intuitive nature of graph augmentations.
+Addressing this limitation, this paper introduces a novel non-contrastive SSL
+approach to Explicitly Generate a compositional Relation Graph (ExGRG) instead
+of relying solely on the conventional augmentation-based implicit relation
+graph. ExGRG offers a framework for incorporating prior domain knowledge and
+online extracted information into the SSL invariance objective, drawing
+inspiration from the Laplacian Eigenmap and Expectation-Maximization (EM).
+Employing an EM perspective on SSL, our E-step involves relation graph
+generation to identify candidates to guide the SSL invariance objective, and
+M-step updates the model parameters by integrating the derived relational
+information. Extensive experimentation on diverse node classification datasets
+demonstrates the superiority of our method over state-of-the-art techniques,
+affirming ExGRG as an effective adoption of SSL for graph representation
+learning.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Semi-Supervised Learning guided by the Generalized Bayes Rule under Soft
+  Revision 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.15294v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.15294v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Stefan Dietrich, Julian Rodemann, Christoph Jansen
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  We provide a theoretical and computational investigation of the Gamma-Maximin
+method with soft revision, which was recently proposed as a robust criterion
+for pseudo-label selection (PLS) in semi-supervised learning. Opposed to
+traditional methods for PLS we use credal sets of priors ("generalized Bayes")
+to represent the epistemic modeling uncertainty. These latter are then updated
+by the Gamma-Maximin method with soft revision. We eventually select
+pseudo-labeled data that are most likely in light of the least favorable
+distribution from the so updated credal set. We formalize the task of finding
+optimal pseudo-labeled data w.r.t. the Gamma-Maximin method with soft revision
+as an optimization problem. A concrete implementation for the class of logistic
+models then allows us to compare the predictive power of the method with
+competing approaches. It is observed that the Gamma-Maximin method with soft
+revision can achieve very promising results, especially when the proportion of
+labeled data is low.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted at the 11th International Conference on Soft Methods in
+  Probability and Statistics (SMPS) 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Fair Wasserstein Coresets 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2311.05436v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2311.05436v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Zikai Xiong, Niccolò Dalmasso, Shubham Sharma, Freddy Lecue, Daniele Magazzeni, Vamsi K. Potluru, Tucker Balch, Manuela Veloso
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Data distillation and coresets have emerged as popular approaches to generate
+a smaller representative set of samples for downstream learning tasks to handle
+large-scale datasets. At the same time, machine learning is being increasingly
+applied to decision-making processes at a societal level, making it imperative
+for modelers to address inherent biases towards subgroups present in the data.
+While current approaches focus on creating fair synthetic representative
+samples by optimizing local properties relative to the original samples, their
+impact on downstream learning processes has yet to be explored. In this work,
+we present fair Wasserstein coresets (FWC), a novel coreset approach which
+generates fair synthetic representative samples along with sample-level weights
+to be used in downstream learning tasks. FWC uses an efficient majority
+minimization algorithm to minimize the Wasserstein distance between the
+original dataset and the weighted synthetic samples while enforcing demographic
+parity. We show that an unconstrained version of FWC is equivalent to Lloyd's
+algorithm for k-medians and k-means clustering. Experiments conducted on both
+synthetic and real datasets show that FWC: (i) achieves a competitive
+fairness-utility tradeoff in downstream models compared to existing approaches,
+(ii) improves downstream fairness when added to the existing training data and
+(iii) can be used to reduce biases in predictions from large language models
+(GPT-3.5 and GPT-4).
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>28 pages, 7 figures, 7 tables</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Identifying Equivalent Training Dynamics 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2302.09160v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2302.09160v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        William T. Redman, Juan M. Bello-Rivas, Maria Fonoberova, Ryan Mohr, Ioannis G. Kevrekidis, Igor Mezić
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Study of the nonlinear evolution deep neural network (DNN) parameters undergo
+during training has uncovered regimes of distinct dynamical behavior. While a
+detailed understanding of these phenomena has the potential to advance
+improvements in training efficiency and robustness, the lack of methods for
+identifying when DNN models have equivalent dynamics limits the insight that
+can be gained from prior work. Topological conjugacy, a notion from dynamical
+systems theory, provides a precise definition of dynamical equivalence,
+offering a possible route to address this need. However, topological
+conjugacies have historically been challenging to compute. By leveraging
+advances in Koopman operator theory, we develop a framework for identifying
+conjugate and non-conjugate training dynamics. To validate our approach, we
+demonstrate that it can correctly identify a known equivalence between online
+mirror descent and online gradient descent. We then utilize it to: identify
+non-conjugate training dynamics between shallow and wide fully connected neural
+networks; characterize the early phase of training dynamics in convolutional
+neural networks; uncover non-conjugate training dynamics in Transformers that
+do and do not undergo grokking. Our results, across a range of DNN
+architectures, illustrate the flexibility of our framework and highlight its
+potential for shedding new light on training dynamics.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>18 pages, 6 figures, 3 supplemental figures</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Alternative Methods to SHAP Derived from Properties of Kernels: A Note
+  on Theoretical Analysis 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.00371v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.00371v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Kazuhiro Hiraki, Shinichi Ishihara, Junnosuke Shino
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  This study first derives a general and analytical expression of AFA (Additive
+Feature Attribution) in terms of the kernel in LIME (Local Interpretable
+Model-agnostic Explanations). Then, we propose some new AFAs that have
+appropriate properties of kernels or that coincide with the LS prenucleolus in
+cooperative game theory. We also revisit existing AFAs such as SHAP (SHapley
+Additive exPlanations) and re-examine the properties of their kernels.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Inhomogeneous graph trend filtering via a l2,0 cardinality penalty 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2304.05223v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2304.05223v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Xiaoqing Huang, Andersen Ang, Kun Huang, Jie Zhang, Yijie Wang
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  We study estimation of piecewise smooth signals over a graph. We propose a
+$\ell_{2,0}$-norm penalized Graph Trend Filtering (GTF) model to estimate
+piecewise smooth graph signals that exhibit inhomogeneous levels of smoothness
+across the nodes. We prove that the proposed GTF model is simultaneously a
+k-means clustering on the signal over the nodes and a minimum graph cut on the
+edges of the graph, where the clustering and the cut share the same assignment
+matrix. We propose two methods to solve the proposed GTF model: a spectral
+decomposition method and a method based on simulated annealing. In the
+experiment on synthetic and real-world datasets, we show that the proposed GTF
+model has a better performances compared with existing approaches on the tasks
+of denoising, support recovery and semi-supervised classification. We also show
+that the proposed GTF model can be solved more efficiently than existing models
+for the dataset with a large edge set.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>13 pages, 3 figures, 4 tables</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Majority Vote for Distributed Differentially Private Sign Selection 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2209.04419v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2209.04419v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Weidong Liu, Jiyuan Tu, Xiaojun Mao, Xi Chen
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Privacy-preserving data analysis has become more prevalent in recent years.
+In this study, we propose a distributed group differentially private Majority
+Vote mechanism, for the sign selection problem in a distributed setup. To
+achieve this, we apply the iterative peeling to the stability function and use
+the exponential mechanism to recover the signs. For enhanced applicability, we
+study the private sign selection for mean estimation and linear regression
+problems, in distributed systems. Our method recovers the support and signs
+with the optimal signal-to-noise ratio as in the non-private scenario, which is
+better than contemporary works of private variable selections. Moreover, the
+sign selection consistency is justified by theoretical guarantees. Simulation
+studies are conducted to demonstrate the effectiveness of the proposed method.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>41 pages, 5 figures</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ DiarizationLM: Speaker Diarization Post-Processing with Large Language
+  Models 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2401.03506v5">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2401.03506v5.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Quan Wang, Yiling Huang, Guanlong Zhao, Evan Clark, Wei Xia, Hank Liao
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  In this paper, we introduce DiarizationLM, a framework to leverage large
+language models (LLM) to post-process the outputs from a speaker diarization
+system. Various goals can be achieved with the proposed framework, such as
+improving the readability of the diarized transcript, or reducing the word
+diarization error rate (WDER). In this framework, the outputs of the automatic
+speech recognition (ASR) and speaker diarization systems are represented as a
+compact textual format, which is included in the prompt to an optionally
+finetuned LLM. The outputs of the LLM can be used as the refined diarization
+results with the desired enhancement. As a post-processing step, this framework
+can be easily applied to any off-the-shelf ASR and speaker diarization systems
+without retraining existing components. Our experiments show that a finetuned
+PaLM 2-S model can reduce the WDER by rel. 55.5% on the Fisher telephone
+conversation dataset, and rel. 44.9% on the Callhome English dataset.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ PASOA- PArticle baSed Bayesian Optimal Adaptive design <span class="chip">ICML 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2402.07160v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2402.07160v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Jacopo Iollo, Christophe Heinkelé, Pierre Alliez, Florence Forbes
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  We propose a new procedure named PASOA, for Bayesian experimental design,
+that performs sequential design optimization by simultaneously providing
+accurate estimates of successive posterior distributions for parameter
+inference. The sequential design process is carried out via a contrastive
+estimation principle, using stochastic optimization and Sequential Monte Carlo
+(SMC) samplers to maximise the Expected Information Gain (EIG). As larger
+information gains are obtained for larger distances between successive
+posterior distributions, this EIG objective may worsen classical SMC
+performance. To handle this issue, tempering is proposed to have both a large
+information gain and an accurate SMC sampling, that we show is crucial for
+performance. This novel combination of stochastic optimization and tempered SMC
+allows to jointly handle design optimization and parameter inference. We
+provide a proof that the obtained optimal design estimators benefit from some
+consistency property. Numerical experiments confirm the potential of the
+approach, which outperforms other recent existing procedures.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>ICML 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Dynamical Survival Analysis with Controlled Latent States <span class="chip">ICML 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2401.17077v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2401.17077v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Linus Bleistein, Van-Tuan Nguyen, Adeline Fermanian, Agathe Guilloux
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  We consider the task of learning individual-specific intensities of counting
+processes from a set of static variables and irregularly sampled time series.
+We introduce a novel modelization approach in which the intensity is the
+solution to a controlled differential equation. We first design a neural
+estimator by building on neural controlled differential equations. In a second
+time, we show that our model can be linearized in the signature space under
+sufficient regularity conditions, yielding a signature-based estimator which we
+call CoxSig. We provide theoretical learning guarantees for both estimators,
+before showcasing the performance of our models on a vast array of simulated
+and real-world datasets from finance, predictive maintenance and food supply
+chain management.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>ICML 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Improving <span class="highlight-title">Transformer</span>s with Dynamically Composable Multi-Head Attention <span class="chip">ICML'24</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.08553v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.08553v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Da Xiao, Qingye Meng, Shengping Li, Xingyuan Yuan
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Multi-Head Attention (MHA) is a key component of Transformer. In MHA,
+attention heads work independently, causing problems such as low-rank
+bottleneck of attention score matrices and head redundancy. We propose
+Dynamically Composable Multi-Head Attention (DCMHA), a parameter and
+computation efficient attention architecture that tackles the shortcomings of
+MHA and increases the expressive power of the model by dynamically composing
+attention heads. At the core of DCMHA is a $\it{Compose}$ function that
+transforms the attention score and weight matrices in an input-dependent way.
+DCMHA can be used as a drop-in replacement of MHA in any transformer
+architecture to obtain the corresponding DCFormer. DCFormer significantly
+outperforms Transformer on different architectures and model scales in language
+modeling, matching the performance of models with ~1.7x-2.0x compute. For
+example, DCPythia-6.9B outperforms open source Pythia-12B on both pretraining
+perplexity and downstream task evaluation. The code and models are available at
+https://github.com/Caiyun-AI/DCFormer.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted to the 41st International Conference on Machine Learning
+  (ICML'24 oral)</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Social Choice Should Guide AI Alignment in Dealing with Diverse Human
+  Feedback 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2404.10271v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2404.10271v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Vincent Conitzer, Rachel Freedman, Jobst Heitzig, Wesley H. Holliday, Bob M. Jacobs, Nathan Lambert, Milan Mossé, Eric Pacuit, Stuart Russell, Hailey Schoelkopf, Emanuel Tewolde, William S. Zwicker
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Foundation models such as GPT-4 are fine-tuned to avoid unsafe or otherwise
+problematic behavior, such as helping to commit crimes or producing racist
+text. One approach to fine-tuning, called reinforcement learning from human
+feedback, learns from humans' expressed preferences over multiple outputs.
+Another approach is constitutional AI, in which the input from humans is a list
+of high-level principles. But how do we deal with potentially diverging input
+from humans? How can we aggregate the input into consistent data about
+"collective" preferences or otherwise use it to make collective choices about
+model behavior? In this paper, we argue that the field of social choice is well
+positioned to address these questions, and we discuss ways forward for this
+agenda, drawing on discussions in a recent workshop on Social Choice for AI
+Ethics and Safety held in Berkeley, CA, USA in December 2023.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>15 pages, 4 figures</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ On the Identifiability of Switching Dynamical Systems <span class="chip">ICML 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2305.15925v4">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2305.15925v4.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Carles Balsells-Rodas, Yixin Wang, Yingzhen Li
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  The identifiability of latent variable models has received increasing
+attention due to its relevance in interpretability and out-of-distribution
+generalisation. In this work, we study the identifiability of Switching
+Dynamical Systems, taking an initial step toward extending identifiability
+analysis to sequential latent variable models. We first prove the
+identifiability of Markov Switching Models, which commonly serve as the prior
+distribution for the continuous latent variables in Switching Dynamical
+Systems. We present identification conditions for first-order Markov dependency
+structures, whose transition distribution is parametrised via non-linear
+Gaussians. We then establish the identifiability of the latent variables and
+non-linear mappings in Switching Dynamical Systems up to affine
+transformations, by leveraging identifiability analysis techniques from
+identifiable deep latent variable models. We finally develop estimation
+algorithms for identifiable Switching Dynamical Systems. Throughout empirical
+studies, we demonstrate the practicality of identifiable Switching Dynamical
+Systems for segmenting high-dimensional time series such as videos, and
+showcase the use of identifiable Markov Switching Models for regime-dependent
+causal discovery in climate data.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>ICML 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Understanding Heterophily for Graph Neural Networks <span class="chip">ICML 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2401.09125v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2401.09125v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Junfu Wang, Yuanfang Guo, Liang Yang, Yunhong Wang
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Graphs with heterophily have been regarded as challenging scenarios for Graph
+Neural Networks (GNNs), where nodes are connected with dissimilar neighbors
+through various patterns. In this paper, we present theoretical understandings
+of the impacts of different heterophily patterns for GNNs by incorporating the
+graph convolution (GC) operations into fully connected networks via the
+proposed Heterophilous Stochastic Block Models (HSBM), a general random graph
+model that can accommodate diverse heterophily patterns. Firstly, we show that
+by applying a GC operation, the separability gains are determined by two
+factors, i.e., the Euclidean distance of the neighborhood distributions and
+$\sqrt{\mathbb{E}\left[\operatorname{deg}\right]}$, where
+$\mathbb{E}\left[\operatorname{deg}\right]$ is the averaged node degree. It
+reveals that the impact of heterophily on classification needs to be evaluated
+alongside the averaged node degree. Secondly, we show that the topological
+noise has a detrimental impact on separability, which is equivalent to
+degrading $\mathbb{E}\left[\operatorname{deg}\right]$. Finally, when applying
+multiple GC operations, we show that the separability gains are determined by
+the normalized distance of the $l$-powered neighborhood distributions. It
+indicates that the nodes still possess separability as $l$ goes to infinity in
+a wide range of regimes. Extensive experiments on both synthetic and real-world
+data verify the effectiveness of our theory.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>ICML 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ More PAC-Bayes bounds: From bounded losses, to losses with general tail
+  behaviors, to anytime validity 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2306.12214v4">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2306.12214v4.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Borja Rodríguez-Gálvez, Ragnar Thobaben, Mikael Skoglund
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  In this paper, we present new high-probability PAC-Bayes bounds for different
+types of losses. Firstly, for losses with a bounded range, we recover a
+strengthened version of Catoni's bound that holds uniformly for all parameter
+values. This leads to new fast-rate and mixed-rate bounds that are
+interpretable and tighter than previous bounds in the literature. In
+particular, the fast-rate bound is equivalent to the Seeger--Langford bound.
+Secondly, for losses with more general tail behaviors, we introduce two new
+parameter-free bounds: a PAC-Bayes Chernoff analogue when the loss' cumulative
+generating function is bounded, and a bound when the loss' second moment is
+bounded. These two bounds are obtained using a new technique based on a
+discretization of the space of possible events for the ``in probability''
+parameter optimization problem. This technique is both simpler and more general
+than previous approaches optimizing over a grid on the parameters' space.
+Finally, using a simple technique that is applicable to any existing bound, we
+extend all previous results to anytime-valid bounds.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>43 pages: ~20 of main text, ~6.5 of references, and ~17.5 of
+  appendices. Published at JMLR</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Practical Performance Guarantees for Pipelined DNN Inference 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2311.03703v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2311.03703v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Aaron Archer, Matthew Fahrbach, Kuikui Liu, Prakash Prabhu
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  We optimize pipeline parallelism for deep neural network (DNN) inference by
+partitioning model graphs into $k$ stages and minimizing the running time of
+the bottleneck stage, including communication. We give practical and effective
+algorithms for this NP-hard problem, but our emphasis is on tackling the
+practitioner's dilemma of deciding when a solution is good enough. To this end,
+we design novel mixed-integer programming (MIP) relaxations for proving lower
+bounds. Applying these methods to a diverse testbed of 369 production models,
+for $k \in \{2, 4, 8, 16, 32, 64\}$, we empirically show that these lower
+bounds are strong enough to be useful in practice. Our lower bounds are
+substantially stronger than standard combinatorial bounds. For example,
+evaluated via geometric means across a production testbed with $k = 16$
+pipeline stages, our MIP formulations raise the lower bound from 0.4598 to
+0.9452, expressed as a fraction of the best partition found. In other words,
+our improved lower bounds close the optimality gap by a factor of 9.855x.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>17 pages, 5 figures</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Trust the Model Where It Trusts Itself -- Model-Based Actor-Critic with
+  Uncertainty-Aware Rollout Adaption 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.19014v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.19014v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Bernd Frauenknecht, Artur Eisele, Devdutt Subhasish, Friedrich Solowjow, Sebastian Trimpe
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Dyna-style model-based reinforcement learning (MBRL) combines model-free
+agents with predictive transition models through model-based rollouts. This
+combination raises a critical question: 'When to trust your model?'; i.e.,
+which rollout length results in the model providing useful data? Janner et al.
+(2019) address this question by gradually increasing rollout lengths throughout
+the training. While theoretically tempting, uniform model accuracy is a fallacy
+that collapses at the latest when extrapolating. Instead, we propose asking the
+question 'Where to trust your model?'. Using inherent model uncertainty to
+consider local accuracy, we obtain the Model-Based Actor-Critic with
+Uncertainty-Aware Rollout Adaption (MACURA) algorithm. We propose an
+easy-to-tune rollout mechanism and demonstrate substantial improvements in data
+efficiency and performance compared to state-of-the-art deep MBRL methods on
+the MuJoCo benchmark.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ In value-based deep reinforcement learning, a pruned network is a good
+  network 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2402.12479v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2402.12479v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Johan Obando-Ceron, Aaron Courville, Pablo Samuel Castro
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Recent work has shown that deep reinforcement learning agents have difficulty
+in effectively using their network parameters. We leverage prior insights into
+the advantages of sparse training techniques and demonstrate that gradual
+magnitude pruning enables value-based agents to maximize parameter
+effectiveness. This results in networks that yield dramatic performance
+improvements over traditional networks, using only a small fraction of the full
+network parameters.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Dynamics Harmonic Analysis of Robotic Systems: Application in
+  Data-Driven Koopman Modelling 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2312.07457v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2312.07457v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Daniel Ordoñez-Apraez, Vladimir Kostic, Giulio Turrisi, Pietro Novelli, Carlos Mastalli, Claudio Semini, Massimiliano Pontil
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  We introduce the use of harmonic analysis to decompose the state space of
+symmetric robotic systems into orthogonal isotypic subspaces. These are
+lower-dimensional spaces that capture distinct, symmetric, and synergistic
+motions. For linear dynamics, we characterize how this decomposition leads to a
+subdivision of the dynamics into independent linear systems on each subspace, a
+property we term dynamics harmonic analysis (DHA). To exploit this property, we
+use Koopman operator theory to propose an equivariant deep-learning
+architecture that leverages the properties of DHA to learn a global linear
+model of the system dynamics. Our architecture, validated on synthetic systems
+and the dynamics of locomotion of a quadrupedal robot, exhibits enhanced
+generalization, sample efficiency, and interpretability, with fewer trainable
+parameters and computational costs.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Mixtures of Experts Unlock Parameter Scaling for Deep RL 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2402.08609v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2402.08609v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Johan Obando-Ceron, Ghada Sokar, Timon Willi, Clare Lyle, Jesse Farebrother, Jakob Foerster, Gintare Karolina Dziugaite, Doina Precup, Pablo Samuel Castro
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  The recent rapid progress in (self) supervised learning models is in large
+part predicted by empirical scaling laws: a model's performance scales
+proportionally to its size. Analogous scaling laws remain elusive for
+reinforcement learning domains, however, where increasing the parameter count
+of a model often hurts its final performance. In this paper, we demonstrate
+that incorporating Mixture-of-Expert (MoE) modules, and in particular Soft MoEs
+(Puigcerver et al., 2023), into value-based networks results in more
+parameter-scalable models, evidenced by substantial performance increases
+across a variety of training regimes and model sizes. This work thus provides
+strong empirical evidence towards developing scaling laws for reinforcement
+learning.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Vertical Federated Learning for Effectiveness, Security, Applicability:
+  A <span class="highlight-title">Survey</span> 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.17495v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.17495v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Mang Ye, Wei Shen, Bo Du, Eduard Snezhko, Vassili Kovalev, Pong C. Yuen
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Vertical Federated Learning (VFL) is a privacy-preserving distributed
+learning paradigm where different parties collaboratively learn models using
+partitioned features of shared samples, without leaking private data. Recent
+research has shown promising results addressing various challenges in VFL,
+highlighting its potential for practical applications in cross-domain
+collaboration. However, the corresponding research is scattered and lacks
+organization. To advance VFL research, this survey offers a systematic overview
+of recent developments. First, we provide a history and background
+introduction, along with a summary of the general training protocol of VFL. We
+then revisit the taxonomy in recent reviews and analyze limitations in-depth.
+For a comprehensive and structured discussion, we synthesize recent research
+from three fundamental perspectives: effectiveness, security, and
+applicability. Finally, we discuss several critical future research directions
+in VFL, which will facilitate the developments in this field. We provide a
+collection of research lists and periodically update them at
+https://github.com/shentt67/VFL_Survey.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>31 pages, 9 figures, 10 tables</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ VITS : Variational Inference Thomson Sampling for contextual bandits 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2307.10167v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2307.10167v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Pierre Clavier, Tom Huix, Alain Durmus
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  In this paper, we introduce and analyze a variant of the Thompson sampling
+(TS) algorithm for contextual bandits. At each round, traditional TS requires
+samples from the current posterior distribution, which is usually intractable.
+To circumvent this issue, approximate inference techniques can be used and
+provide samples with distribution close to the posteriors. However, current
+approximate techniques yield to either poor estimation (Laplace approximation)
+or can be computationally expensive (MCMC methods, Ensemble sampling...). In
+this paper, we propose a new algorithm, Varational Inference Thompson sampling
+VITS, based on Gaussian Variational Inference. This scheme provides powerful
+posterior approximations which are easy to sample from, and is computationally
+efficient, making it an ideal choice for TS. In addition, we show that VITS
+achieves a sub-linear regret bound of the same order in the dimension and
+number of round as traditional TS for linear contextual bandit. Finally, we
+demonstrate experimentally the effectiveness of VITS on both synthetic and real
+world datasets.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Posterior Sampling-Based Bayesian Optimization with Tighter Bayesian
+  Regret Bounds <span class="chip">ICML2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2311.03760v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2311.03760v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Shion Takeno, Yu Inatsu, Masayuki Karasuyama, Ichiro Takeuchi
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Among various acquisition functions (AFs) in Bayesian optimization (BO),
+Gaussian process upper confidence bound (GP-UCB) and Thompson sampling (TS) are
+well-known options with established theoretical properties regarding Bayesian
+cumulative regret (BCR). Recently, it has been shown that a randomized variant
+of GP-UCB achieves a tighter BCR bound compared with GP-UCB, which we call the
+tighter BCR bound for brevity. Inspired by this study, this paper first shows
+that TS achieves the tighter BCR bound. On the other hand, GP-UCB and TS often
+practically suffer from manual hyperparameter tuning and over-exploration
+issues, respectively. Therefore, we analyze yet another AF called a probability
+of improvement from the maximum of a sample path (PIMS). We show that PIMS
+achieves the tighter BCR bound and avoids the hyperparameter tuning, unlike
+GP-UCB. Furthermore, we demonstrate a wide range of experiments, focusing on
+the effectiveness of PIMS that mitigates the practical issues of GP-UCB and TS.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>28 pages, 3 figures, 2 tables, Accepted to ICML2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ In-Context Unlearning: Language Models as Few Shot Unlearners <span class="chip">ICML 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2310.07579v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2310.07579v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Martin Pawelczyk, Seth Neel, Himabindu Lakkaraju
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Machine unlearning, the study of efficiently removing the impact of specific
+training instances on a model, has garnered increased attention in recent years
+due to regulatory guidelines such as the \emph{Right to be Forgotten}.
+Achieving precise unlearning typically involves fully retraining the model and
+is computationally infeasible in case of very large models such as Large
+Language Models (LLMs). To this end, recent work has proposed several
+algorithms which approximate the removal of training data without retraining
+the model. These algorithms crucially rely on access to the model parameters in
+order to update them, an assumption that may not hold in practice due to
+computational constraints or having only query access to the LLMs. In this
+work, we propose a new class of unlearning methods for LLMs called ``In-Context
+Unlearning.'' This method unlearns instances from the model by simply providing
+specific kinds of inputs in context, without the need to update model
+parameters. To unlearn specific training instances, we present these instances
+to the LLMs at inference time along with labels that differ from their ground
+truth. Our experimental results demonstrate that in-context unlearning performs
+on par with, or in some cases outperforms other state-of-the-art methods that
+require access to model parameters, effectively removing the influence of
+specific instances on the model while preserving test accuracy.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted at ICML 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Privacy Attacks in Decentralized Learning <span class="chip">ICML 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2402.10001v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2402.10001v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Abdellah El Mrini, Edwige Cyffers, Aurélien Bellet
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Decentralized Gradient Descent (D-GD) allows a set of users to perform
+collaborative learning without sharing their data by iteratively averaging
+local model updates with their neighbors in a network graph. The absence of
+direct communication between non-neighbor nodes might lead to the belief that
+users cannot infer precise information about the data of others. In this work,
+we demonstrate the opposite, by proposing the first attack against D-GD that
+enables a user (or set of users) to reconstruct the private data of other users
+outside their immediate neighborhood. Our approach is based on a reconstruction
+attack against the gossip averaging protocol, which we then extend to handle
+the additional challenges raised by D-GD. We validate the effectiveness of our
+attack on real graphs and datasets, showing that the number of users
+compromised by a single or a handful of attackers is often surprisingly large.
+We empirically investigate some of the factors that affect the performance of
+the attack, namely the graph topology, the number of attackers, and their
+position in the graph.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>accepted to ICML 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ On Universally Optimal Algorithms for A/B Testing <span class="chip">ICML 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2308.12000v4">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2308.12000v4.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Po-An Wang, Kaito Ariu, Alexandre Proutiere
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  We study the problem of best-arm identification with fixed budget in
+stochastic multi-armed bandits with Bernoulli rewards. For the problem with two
+arms, also known as the A/B testing problem, we prove that there is no
+algorithm that (i) performs as well as the algorithm sampling each arm equally
+(referred to as the {\it uniform sampling} algorithm) in all instances, and
+that (ii) strictly outperforms uniform sampling on at least one instance. In
+short, there is no algorithm better than the uniform sampling algorithm. To
+establish this result, we first introduce the natural class of {\it consistent}
+and {\it stable} algorithms, and show that any algorithm that performs as well
+as the uniform sampling algorithm in all instances belongs to this class. The
+proof then proceeds by deriving a lower bound on the error rate satisfied by
+any consistent and stable algorithm, and by showing that the uniform sampling
+algorithm matches this lower bound. Our results provide a solution to the two
+open problems presented in \citep{qin2022open}. For the general problem with
+more than two arms, we provide a first set of results. We characterize the
+asymptotic error rate of the celebrated Successive Rejects (SR) algorithm
+\citep{audibert2010best} and show that, surprisingly, the uniform sampling
+algorithm outperforms the SR algorithm in some instances.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted at ICML 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ A Framework for Neurosymbolic Robot Action Planning using Large Language
+  Models 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2303.00438v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2303.00438v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Alessio Capitanelli, Fulvio Mastrogiovanni
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Symbolic task planning is a widely used approach to enforce robot autonomy
+due to its ease of understanding and deployment in robot architectures.
+However, techniques for symbolic task planning are difficult to scale in
+real-world, human-robot collaboration scenarios because of the poor performance
+in complex planning domains or when frequent re-planning is needed. We present
+a framework, Teriyaki, specifically aimed at bridging the gap between symbolic
+task planning and machine learning approaches. The rationale is training Large
+Language Models (LLMs), namely GPT-3, into a neurosymbolic task planner
+compatible with the Planning Domain Definition Language (PDDL), and then
+leveraging its generative capabilities to overcome a number of limitations
+inherent to symbolic task planners. Potential benefits include (i) a better
+scalability in so far as the planning domain complexity increases, since LLMs'
+response time linearly scales with the combined length of the input and the
+output, and (ii) the ability to synthesize a plan action-by-action instead of
+end-to-end, making each action available for execution as soon as it is
+generated instead of waiting for the whole plan to be available, which in turn
+enables concurrent planning and execution. Recently, significant efforts have
+been devoted by the research community to evaluate the cognitive capabilities
+of LLMs, with alternate successes. Instead, with Teriyaki we aim to provide an
+overall planning performance comparable to traditional planners in specific
+planning domains, while leveraging LLMs capabilities to build a look-ahead
+predictive planning model. Preliminary results in selected domains show that
+our method can: (i) solve 95.5% of problems in a test data set of 1,000
+samples; (ii) produce plans up to 13.5% shorter than a traditional symbolic
+planner; (iii) reduce average overall waiting times for a plan availability by
+up to 61.4%
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>36 pages, 7 figures, 2 tables. Updated according to reviewers'
+  comments. Previous title: A Framework to Generate Neurosymbolic
+  PDDL-compliant Planners</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ A Probabilistic Model behind <span class="highlight-title">Self-Supervised</span> Learning 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2402.01399v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2402.01399v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Alice Bizeul, Bernhard Schölkopf, Carl Allen
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  In self-supervised learning (SSL), representations are learned via an
+auxiliary task without annotated labels. A common task is to classify
+augmentations or different modalities of the data, which share semantic content
+(e.g. an object in an image) but differ in style (e.g. the object's location).
+Many approaches to self-supervised learning have been proposed, e.g. SimCLR,
+CLIP, and VicREG, which have recently gained much attention for their
+representations achieving downstream performance comparable to supervised
+learning. However, a theoretical understanding of self-supervised methods
+eludes. Addressing this, we present a generative latent variable model for
+self-supervised learning and show that several families of discriminative SSL,
+including contrastive methods, induce a comparable distribution over
+representations, providing a unifying theoretical framework for these methods.
+The proposed model also justifies connections drawn to mutual information and
+the use of a "projection head". Learning representations by fitting the model
+generatively (termed SimVAE) improves performance over discriminative and other
+VAE-based methods on simple image benchmarks and significantly narrows the gap
+between generative and discriminative representation learning in more complex
+settings. Importantly, as our analysis predicts, SimVAE outperforms
+self-supervised learning where style information is required, taking an
+important step toward understanding self-supervised methods and achieving
+task-agnostic representations.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ MALIBO: Meta-learning for Likelihood-free Bayesian Optimization 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2307.03565v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2307.03565v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Jiarong Pan, Stefan Falkner, Felix Berkenkamp, Joaquin Vanschoren
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Bayesian optimization (BO) is a popular method to optimize costly black-box
+functions. While traditional BO optimizes each new target task from scratch,
+meta-learning has emerged as a way to leverage knowledge from related tasks to
+optimize new tasks faster. However, existing meta-learning BO methods rely on
+surrogate models that suffer from scalability issues and are sensitive to
+observations with different scales and noise types across tasks. Moreover, they
+often overlook the uncertainty associated with task similarity. This leads to
+unreliable task adaptation when only limited observations are obtained or when
+the new tasks differ significantly from the related tasks. To address these
+limitations, we propose a novel meta-learning BO approach that bypasses the
+surrogate model and directly learns the utility of queries across tasks. Our
+method explicitly models task uncertainty and includes an auxiliary model to
+enable robust adaptation to new tasks. Extensive experiments show that our
+method demonstrates strong anytime performance and outperforms state-of-the-art
+meta-learning BO methods in various benchmarks.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Looks Too Good To Be True: An Information-Theoretic Analysis of
+  Hallucinations in Generative Restoration Models 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.16475v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.16475v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Regev Cohen, Idan Kligvasser, Ehud Rivlin, Daniel Freedman
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  The pursuit of high perceptual quality in image restoration has driven the
+development of revolutionary generative models, capable of producing results
+often visually indistinguishable from real data. However, as their perceptual
+quality continues to improve, these models also exhibit a growing tendency to
+generate hallucinations - realistic-looking details that do not exist in the
+ground truth images. The presence of hallucinations introduces uncertainty
+regarding the reliability of the models' predictions, raising major concerns
+about their practical application. In this paper, we employ information-theory
+tools to investigate this phenomenon, revealing a fundamental tradeoff between
+uncertainty and perception. We rigorously analyze the relationship between
+these two factors, proving that the global minimal uncertainty in generative
+models grows in tandem with perception. In particular, we define the inherent
+uncertainty of the restoration problem and show that attaining perfect
+perceptual quality entails at least twice this uncertainty. Additionally, we
+establish a relation between mean squared-error distortion, uncertainty and
+perception, through which we prove the aforementioned uncertainly-perception
+tradeoff induces the well-known perception-distortion tradeoff. This work
+uncovers fundamental limitations of generative models in achieving both high
+perceptual quality and reliable predictions for image restoration. We
+demonstrate our theoretical findings through an analysis of single image
+super-resolution algorithms. Our work aims to raise awareness among
+practitioners about this inherent tradeoff, empowering them to make informed
+decisions and potentially prioritize safety over perceptual performance.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Triadic-OCD: Asynchronous Online Change Detection with Provable
+  Robustness, Optimality, and Convergence <span class="chip">ICML2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.02372v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.02372v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Yancheng Huang, Kai Yang, Zelin Zhu, Leian Chen
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  The primary goal of online change detection (OCD) is to promptly identify
+changes in the data stream. OCD problem find a wide variety of applications in
+diverse areas, e.g., security detection in smart grids and intrusion detection
+in communication networks. Prior research usually assumes precise knowledge of
+the system parameters. Nevertheless, this presumption often proves unattainable
+in practical scenarios due to factors such as estimation errors, system
+updates, etc. This paper aims to take the first attempt to develop a
+triadic-OCD framework with certifiable robustness, provable optimality, and
+guaranteed convergence. In addition, the proposed triadic-OCD algorithm can be
+realized in a fully asynchronous distributed manner, easing the necessity of
+transmitting the data to a single server. This asynchronous mechanism could
+also mitigate the straggler issue that faced by traditional synchronous
+algorithm. Moreover, the non-asymptotic convergence property of Triadic-OCD is
+theoretically analyzed, and its iteration complexity to achieve an
+$\epsilon$-optimal point is derived. Extensive experiments have been conducted
+to elucidate the effectiveness of the proposed method.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted at ICML2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Exploring Precision and Recall to assess the quality and diversity of
+  LLMs <span class="chip">ACL 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2402.10693v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2402.10693v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Florian Le Bronnec, Alexandre Verine, Benjamin Negrevergne, Yann Chevaleyre, Alexandre Allauzen
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  We introduce a novel evaluation framework for Large Language Models (LLMs)
+such as \textsc{Llama-2} and \textsc{Mistral}, focusing on importing Precision
+and Recall metrics from image generation to text generation. This approach
+allows for a nuanced assessment of the quality and diversity of generated text
+without the need for aligned corpora. By conducting a comprehensive evaluation
+of state-of-the-art language models, the study reveals new insights into their
+performance on open-ended generation tasks, which are not adequately captured
+by traditional benchmarks. The findings highlight a trade-off between the
+quality and diversity of generated samples, particularly when models are
+fine-tuned on instruction dataset or with human feedback. This work extends the
+toolkit for distribution-based NLP evaluation, offering insights into the
+practical capabilities and challenges that current LLMs face in generating
+diverse and high-quality text. We release our code and data.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>21 pages, 15 figures, ACL 2024 Main</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Investigating the Impact of Model Instability on Explanations and
+  Uncertainty 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2402.13006v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2402.13006v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Sara Vera Marjanović, Isabelle Augenstein, Christina Lioma
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Explainable AI methods facilitate the understanding of model behaviour, yet,
+small, imperceptible perturbations to inputs can vastly distort explanations.
+As these explanations are typically evaluated holistically, before model
+deployment, it is difficult to assess when a particular explanation is
+trustworthy. Some studies have tried to create confidence estimators for
+explanations, but none have investigated an existing link between uncertainty
+and explanation quality. We artificially simulate epistemic uncertainty in text
+input by introducing noise at inference time. In this large-scale empirical
+study, we insert different levels of noise perturbations and measure the effect
+on the output of pre-trained language models and different uncertainty metrics.
+Realistic perturbations have minimal effect on performance and explanations,
+yet masking has a drastic effect. We find that high uncertainty doesn't
+necessarily imply low explanation plausibility; the correlation between the two
+metrics can be moderately positive when noise is exposed during the training
+process. This suggests that noise-augmented models may be better at identifying
+salient tokens when uncertain. Furthermore, when predictive and epistemic
+uncertainty measures are over-confident, the robustness of a saliency map to
+perturbation can indicate model stability issues. Integrated Gradients shows
+the overall greatest robustness to perturbation, while still showing
+model-specific patterns in performance; however, this phenomenon is limited to
+smaller Transformer-based language models.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ ICC: Quantifying Image Caption Concreteness for Multimodal <span class="highlight-title">Dataset</span>
+  Curation <span class="chip">ACL 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2403.01306v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2403.01306v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Moran Yanuka, Morris Alper, Hadar Averbuch-Elor, Raja Giryes
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Web-scale training on paired text-image data is becoming increasingly central
+to multimodal learning, but is challenged by the highly noisy nature of
+datasets in the wild. Standard data filtering approaches succeed in removing
+mismatched text-image pairs, but permit semantically related but highly
+abstract or subjective text. These approaches lack the fine-grained ability to
+isolate the most concrete samples that provide the strongest signal for
+learning in a noisy dataset. In this work, we propose a new metric, image
+caption concreteness, that evaluates caption text without an image reference to
+measure its concreteness and relevancy for use in multimodal learning. Our
+approach leverages strong foundation models for measuring visual-semantic
+information loss in multimodal representations. We demonstrate that this
+strongly correlates with human evaluation of concreteness in both single-word
+and sentence-level texts. Moreover, we show that curation using ICC complements
+existing approaches: It succeeds in selecting the highest quality samples from
+multimodal web-scale datasets to allow for efficient training in
+resource-constrained settings.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted to ACL 2024 (Finding). For Project webpage, see
+  https://moranyanuka.github.io/icc/</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ On the Semantic Latent Space of Diffusion-Based Text-to-Speech Models <span class="chip">ACL 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2402.12423v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2402.12423v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Miri Varshavsky-Hassid, Roy Hirsch, Regev Cohen, Tomer Golany, Daniel Freedman, Ehud Rivlin
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  The incorporation of Denoising Diffusion Models (DDMs) in the Text-to-Speech
+(TTS) domain is rising, providing great value in synthesizing high quality
+speech. Although they exhibit impressive audio quality, the extent of their
+semantic capabilities is unknown, and controlling their synthesized speech's
+vocal properties remains a challenge. Inspired by recent advances in image
+synthesis, we explore the latent space of frozen TTS models, which is composed
+of the latent bottleneck activations of the DDM's denoiser. We identify that
+this space contains rich semantic information, and outline several novel
+methods for finding semantic directions within it, both supervised and
+unsupervised. We then demonstrate how these enable off-the-shelf audio editing,
+without any further training, architectural changes or data requirements. We
+present evidence of the semantic and acoustic qualities of the edited audio,
+and provide supplemental samples:
+https://latent-analysis-grad-tts.github.io/speech-samples/.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted to ACL 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Denoising Autoregressive Representation Learning 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2403.05196v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2403.05196v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Yazhe Li, Jorg Bornschein, Ting Chen
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  In this paper, we explore a new generative approach for learning visual
+representations. Our method, DARL, employs a decoder-only Transformer to
+predict image patches autoregressively. We find that training with Mean Squared
+Error (MSE) alone leads to strong representations. To enhance the image
+generation ability, we replace the MSE loss with the diffusion objective by
+using a denoising patch decoder. We show that the learned representation can be
+improved by using tailored noise schedules and longer training in larger
+models. Notably, the optimal schedule differs significantly from the typical
+ones used in standard image diffusion models. Overall, despite its simple
+architecture, DARL delivers performance remarkably close to state-of-the-art
+masked prediction models under the fine-tuning protocol. This marks an
+important step towards a unified model capable of both visual perception and
+generation, effectively combining the strengths of autoregressive and denoising
+diffusion models.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ HLOB -- Information Persistence and Structure in Limit Order Books 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.18938v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.18938v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Antonio Briola, Silvia Bartolucci, Tomaso Aste
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  We introduce a novel large-scale deep learning model for Limit Order Book
+mid-price changes forecasting, and we name it `HLOB'. This architecture (i)
+exploits the information encoded by an Information Filtering Network, namely
+the Triangulated Maximally Filtered Graph, to unveil deeper and non-trivial
+dependency structures among volume levels; and (ii) guarantees deterministic
+design choices to handle the complexity of the underlying system by drawing
+inspiration from the groundbreaking class of Homological Convolutional Neural
+Networks. We test our model against 9 state-of-the-art deep learning
+alternatives on 3 real-world Limit Order Book datasets, each including 15
+stocks traded on the NASDAQ exchange, and we systematically characterize the
+scenarios where HLOB outperforms state-of-the-art architectures. Our approach
+sheds new light on the spatial distribution of information in Limit Order Books
+and on its degradation over increasing prediction horizons, narrowing the gap
+between microstructural modeling and deep learning-based forecasting in
+high-frequency financial markets.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>34 pages, 7 figures, 7 tables, 3 equations</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ LLMs cannot find reasoning errors, but can correct them given the error
+  location <span class="chip">ACL 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2311.08516v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2311.08516v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Gladys Tyen, Hassan Mansoor, Victor Cărbune, Peter Chen, Tony Mak
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  While self-correction has shown promise in improving LLM outputs in terms of
+style and quality (e.g. Chen et al., 2023b; Madaan et al., 2023), recent
+attempts to self-correct logical or reasoning errors often cause correct
+answers to become incorrect, resulting in worse performances overall (Huang et
+al., 2023). In this paper, we show that poor self-correction performance stems
+from LLMs' inability to find logical mistakes, rather than their ability to
+correct a known mistake. Firstly, we benchmark several state-of-the-art LLMs on
+their mistake-finding ability and demonstrate that they generally struggle with
+the task, even in highly objective, unambiguous cases. Secondly, we test the
+correction abilities of LLMs -- separately from mistake finding -- using a
+backtracking setup that feeds ground truth mistake location information to the
+model. We show that this boosts downstream task performance across our 5
+reasoning tasks, indicating that LLMs' correction abilities are robust.
+Finally, we show that it is possible to obtain mistake location information
+without ground truth labels or in-domain training data. We train a small
+classifier with out-of-domain data, which exhibits stronger mistake-finding
+performance than prompting a large model. We release our dataset of
+LLM-generated logical mistakes, BIG-Bench Mistake, to enable further research
+into locating LLM reasoning mistakes.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>ACL 2024 Findings</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Epistemic Uncertainty-Weighted Loss for Visual Bias Mitigation <span class="chip">CVPR</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2204.09389v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2204.09389v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Rebecca S Stone, Nishant Ravikumar, Andrew J Bulpitt, David C Hogg
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Deep neural networks are highly susceptible to learning biases in visual
+data. While various methods have been proposed to mitigate such bias, the
+majority require explicit knowledge of the biases present in the training data
+in order to mitigate. We argue the relevance of exploring methods which are
+completely ignorant of the presence of any bias, but are capable of identifying
+and mitigating them. Furthermore, we propose using Bayesian neural networks
+with a predictive uncertainty-weighted loss function to dynamically identify
+potential bias in individual training samples and to weight them during
+training. We find a positive correlation between samples subject to bias and
+higher epistemic uncertainties. Finally, we show the method has potential to
+mitigate visual bias on a bias benchmark dataset and on a real-world face
+detection problem, and we consider the merits and weaknesses of our approach.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Published in 2022 IEEE CVPR Workshop on Fair, Data Efficient and
+  Trusted Computer Vision</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Soft Partitioning of Latent Space for Semantic Channel Equalization 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20085v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20085v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Tomás Hüttebräucker, Mohamed Sana, Emilio Calvanese Strinati
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Semantic channel equalization has emerged as a solution to address language
+mismatch in multi-user semantic communications. This approach aims to align the
+latent spaces of an encoder and a decoder which were not jointly trained and it
+relies on a partition of the semantic (latent) space into atoms based on the
+the semantic meaning. In this work we explore the role of the semantic space
+partition in scenarios where the task structure involves a one-to-many mapping
+between the semantic space and the action space. In such scenarios,
+partitioning based on hard inference results results in loss of information
+which degrades the equalization performance. We propose a soft criterion to
+derive the atoms of the partition which leverages the soft decoder's output and
+offers a more comprehensive understanding of the semantic space's structure.
+Through empirical validation, we demonstrate that soft partitioning yields a
+more descriptive and regular partition of the space, consequently enhancing the
+performance of the equalization algorithm.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Latent Space Alignment for Semantic Channel Equalization <span class="chip">ICML</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.13511v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.13511v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Tomás Hüttebräucker, Mohamed Sana, Emilio Calvanese Strinati
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  We relax the constraint of a shared language between agents in a semantic and
+goal-oriented communication system to explore the effect of language mismatch
+in distributed task solving. We propose a mathematical framework, which
+provides a modelling and a measure of the semantic distortion introduced in the
+communication when agents use distinct languages. We then propose a new
+approach to semantic channel equalization with proven effectiveness through
+numerical evaluations.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted for publication at 2024 IEEE ICMLCN</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Pragmatic Goal-Oriented Communications under Semantic-Effectiveness
+  Channel Errors 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2402.16858v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2402.16858v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Tomás Hüttebräucker, Mohamed Sana, Emilio Calvanese Strinati
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  In forthcoming AI-assisted 6G networks, integrating semantic, pragmatic, and
+goal-oriented communication strategies becomes imperative. This integration
+will enable sensing, transmission, and processing of exclusively pertinent task
+data, ensuring conveyed information possesses understandable, pragmatic
+semantic significance, aligning with destination needs and goals. Without
+doubt, no communication is error free. Within this context, besides errors
+stemming from typical wireless communication dynamics, potential distortions
+between transmitter-intended and receiver-interpreted meanings can emerge due
+to limitations in semantic processing capabilities, as well as language and
+knowledge representation disparities between transmitters and receivers. The
+main contribution of this paper is two-fold. First, it proposes and details a
+novel mathematical modeling of errors stemming from language mismatches at both
+semantic and effectiveness levels. Second, it provides a novel algorithmic
+solution to counteract these types of errors which leverages optimal transport
+theory. Our numerical results show the potential of the proposed mechanism to
+compensate for language mismatches, thereby enhancing the attainability of
+reliable communication under noisy communication environments.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted for publication in 2024 IEEE Consumer Communications and
+  Networking Conference</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Improving the Validity of Decision Trees as Explanations 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2306.06777v5">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2306.06777v5.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Jiri Nemecek, Tomas Pevny, Jakub Marecek
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  In classification and forecasting with tabular data, one often utilizes
+tree-based models. Those can be competitive with deep neural networks on
+tabular data and, under some conditions, explainable. The explainability
+depends on the depth of the tree and the accuracy in each leaf of the tree. We
+point out that decision trees containing leaves with unbalanced accuracy can
+provide misleading explanations. Low-accuracy leaves give less valid
+explanations, which could be interpreted as unfairness among subgroups
+utilizing these explanations. Here, we train a shallow tree with the objective
+of minimizing the maximum misclassification error across all leaf nodes. The
+shallow tree provides a global explanation, while the overall statistical
+performance of the shallow tree can become comparable to state-of-the-art
+methods (e.g., well-tuned XGBoost) by extending the leaves with further models.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Activation Addition: Steering Language Models Without Optimization 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2308.10248v4">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2308.10248v4.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Alexander Matt Turner, Lisa Thiergart, Gavin Leech, David Udell, Juan J. Vazquez, Ulisse Mini, Monte MacDiarmid
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Reliably controlling the behavior of large language models is a pressing open
+problem. Existing methods include supervised finetuning, reinforcement learning
+from human feedback, prompt engineering and guided decoding. We instead
+investigate activation engineering: modifying activations at inference-time to
+predictably alter model behavior. We bias the forward pass with a 'steering
+vector' implicitly specified through natural language. Past work learned these
+steering vectors; our Activation Addition (ActAdd) method instead computes them
+by taking activation differences resulting from pairs of prompts. We
+demonstrate ActAdd on a range of LLMs (LLaMA-3, OPT, GPT-2, and GPT-J),
+obtaining SOTA on detoxification and negative-to-positive sentiment control.
+Our approach yields inference-time control over high-level properties of output
+like topic and sentiment while preserving performance on off-target tasks.
+ActAdd takes far less compute and implementation effort than finetuning or
+RLHF, allows users control through natural language, and its computational
+overhead (as a fraction of inference time) appears stable or improving over
+increasing model size.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                    </div>
+                </details>
+            </article>
+            <article>
+                <details>
+                    <Summary>
+                        Multimedia <span class="chip" style="font-size: 60%">10</span>
+                    </Summary>
+                    <div class="details-content">
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ CtrSVDD: A Benchmark <span class="highlight-title">Dataset</span> and Baseline Analysis for Controlled
+  Singing Voice Deepfake Detection 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.02438v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.02438v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Yongyi Zang, Jiatong Shi, You Zhang, Ryuichi Yamamoto, Jionghao Han, Yuxun Tang, Shengyuan Xu, Wenxiao Zhao, Jing Guo, Tomoki Toda, Zhiyao Duan
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Recent singing voice synthesis and conversion advancements necessitate robust
+singing voice deepfake detection (SVDD) models. Current SVDD datasets face
+challenges due to limited controllability, diversity in deepfake methods, and
+licensing restrictions. Addressing these gaps, we introduce CtrSVDD, a
+large-scale, diverse collection of bonafide and deepfake singing vocals. These
+vocals are synthesized using state-of-the-art methods from publicly accessible
+singing voice datasets. CtrSVDD includes 47.64 hours of bonafide and 260.34
+hours of deepfake singing vocals, spanning 14 deepfake methods and involving
+164 singer identities. We also present a baseline system with flexible
+front-end features, evaluated against a structured train/dev/eval split. The
+experiments show the importance of feature selection and highlight a need for
+generalization towards deepfake methods that deviate further from training
+distribution. The CtrSVDD dataset and baselines are publicly accessible.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted by Interspeech 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Progressive Confident Masking Attention Network for Audio-Visual
+  Segmentation 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.02345v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.02345v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Yuxuan Wang, Feng Dong, Jinchao Zhu
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Audio and visual signals typically occur simultaneously, and humans possess
+an innate ability to correlate and synchronize information from these two
+modalities. Recently, a challenging problem known as Audio-Visual Segmentation
+(AVS) has emerged, intending to produce segmentation maps for sounding objects
+within a scene. However, the methods proposed so far have not sufficiently
+integrated audio and visual information, and the computational costs have been
+extremely high. Additionally, the outputs of different stages have not been
+fully utilized. To facilitate this research, we introduce a novel Progressive
+Confident Masking Attention Network (PMCANet). It leverages attention
+mechanisms to uncover the intrinsic correlations between audio signals and
+visual frames. Furthermore, we design an efficient and effective
+cross-attention module to enhance semantic perception by selecting query
+tokens. This selection is determined through confidence-driven units based on
+the network's multi-stage predictive outputs. Experiments demonstrate that our
+network outperforms other AVS methods while requiring less computational
+resources.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>10 pages, 9 figures, submitted to IEEE TRANSACTIONS ON CIRCUITS AND
+  SYSTEMS FOR VIDEO TECHNOLOGY</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Towards AI-Assisted Sustainable Adaptive Video Streaming Systems:
+  Tutorial and <span class="highlight-title">Survey</span> 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.02302v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.02302v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Reza Farahani, Zoha Azimi, Christian Timmerer, Radu Prodan
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Improvements in networking technologies and the steadily increasing numbers
+of users, as well as the shift from traditional broadcasting to streaming
+content over the Internet, have made video applications (e.g., live and
+Video-on-Demand (VoD)) predominant sources of traffic. Recent advances in
+Artificial Intelligence (AI) and its widespread application in various academic
+and industrial fields have focused on designing and implementing a variety of
+video compression and content delivery techniques to improve user Quality of
+Experience (QoE). However, providing high QoE services results in more energy
+consumption and carbon footprint across the service delivery path, extending
+from the end user's device through the network and service infrastructure
+(e.g., cloud providers). Despite the importance of energy efficiency in video
+streaming, there is a lack of comprehensive surveys covering state-of-the-art
+AI techniques and their applications throughout the video streaming lifecycle.
+Existing surveys typically focus on specific parts, such as video encoding,
+delivery networks, playback, or quality assessment, without providing a
+holistic view of the entire lifecycle and its impact on energy consumption and
+QoE. Motivated by this research gap, this survey provides a comprehensive
+overview of the video streaming lifecycle, content delivery, energy and Video
+Quality Assessment (VQA) metrics and models, and AI techniques employed in
+video streaming. In addition, it conducts an in-depth state-of-the-art analysis
+focused on AI-driven approaches to enhance the energy efficiency of end-to-end
+aspects of video streaming systems (i.e., encoding, delivery network, playback,
+and VQA approaches). Finally, it discusses prospective research directions for
+developing AI-assisted energy-aware video streaming systems.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>33 pages, 7 figures, 6 Tables, Journal paper</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ MidiCaps -- A large-scale MIDI <span class="highlight-title">dataset</span> with text captions 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.02255v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.02255v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Jan Melechovsky, Abhinaba Roy, Dorien Herremans
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Generative models guided by text prompts are increasingly becoming more
+popular. However, no text-to-MIDI models currently exist, mostly due to the
+lack of a captioned MIDI dataset. This work aims to enable research that
+combines LLMs with symbolic music by presenting the first large-scale MIDI
+dataset with text captions that is openly available: MidiCaps. MIDI (Musical
+Instrument Digital Interface) files are a widely used format for encoding
+musical information. Their structured format captures the nuances of musical
+composition and has practical applications by music producers, composers,
+musicologists, as well as performers. Inspired by recent advancements in
+captioning techniques applied to various domains, we present a large-scale
+curated dataset of over 168k MIDI files accompanied by textual descriptions.
+Each MIDI caption succinctly describes the musical content, encompassing tempo,
+chord progression, time signature, instruments present, genre and mood; thereby
+facilitating multi-modal exploration and analysis. The dataset contains a mix
+of various genres, styles, and complexities, offering a rich source for
+training and evaluating models for tasks such as music information retrieval,
+music understanding and cross-modal translation. We provide detailed statistics
+about the dataset and have assessed the quality of the captions in an extensive
+listening study. We anticipate that this resource will stimulate further
+research in the intersection of music and natural language processing,
+fostering advancements in both fields.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Under review</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Towards Railways Remote Driving: Analysis of Video Streaming Latency and
+  Adaptive Rate Control 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.02062v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.02062v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Daniel Mejias, Zaloa Fernandez, Roberto Viola, Ander Aramburu, Igor Lopez, Andoni Diaz
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Remote driving aims to improve transport systems by promoting efficiency,
+sustainability, and accessibility. In the railway sector, remote driving makes
+it possible to increase flexibility, as the driver no longer has to be in the
+cab. However, this brings several challenges, as it has to provide at least the
+same level of safety obtained when the driver is in the cab. To achieve it,
+wireless networks and video streaming technologies gain importance as they
+should provide real-time track visualization and obstacle detection
+capabilities to the remote driver. Low latency camera capture, onboard media
+processing devices, and streaming protocols adapted for wireless links are the
+necessary enablers to be developed and integrated into the railway
+infrastructure. This paper compares video streaming protocols such as Real-Time
+Streaming Protocol (RTSP) and Web Real-Time Communication (WebRTC), as they are
+the main alternatives based on Real-time Transport Protocol (RTP) protocol to
+enable low latency. As latency is the main performance metric, this paper also
+provides a solution to calculate the End-to-End video streaming latency
+analytically. Finally, the paper proposes a rate control algorithm to adapt the
+video stream depending on the network capacity. The objective is to keep the
+latency as low as possible while avoiding any visual artifacts. The proposed
+solutions are tested in different setups and scenarios to prove their
+effectiveness before the planned field testing.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ M2D-CLAP: Masked Modeling Duo Meets CLAP for Learning General-purpose
+  Audio-Language Representation 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.02032v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.02032v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Daisuke Niizumi, Daiki Takeuchi, Yasunori Ohishi, Noboru Harada, Masahiro Yasuda, Shunsuke Tsubaki, Keisuke Imoto
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Contrastive language-audio pre-training (CLAP) enables zero-shot (ZS)
+inference of audio and exhibits promising performance in several classification
+tasks. However, conventional audio representations are still crucial for many
+tasks where ZS is not applicable (e.g., regression problems). Here, we explore
+a new representation, a general-purpose audio-language representation, that
+performs well in both ZS and transfer learning. To do so, we propose a new
+method, M2D-CLAP, which combines self-supervised learning Masked Modeling Duo
+(M2D) and CLAP. M2D learns an effective representation to model audio signals,
+and CLAP aligns the representation with text embedding. As a result, M2D-CLAP
+learns a versatile representation that allows for both ZS and transfer
+learning. Experiments show that M2D-CLAP performs well on linear evaluation,
+fine-tuning, and ZS classification with a GTZAN state-of-the-art of 75.17%,
+thus achieving a general-purpose audio-language representation.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>5 pages, 1 figure, 5 tables. Accepted by Interspeech 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Nutrition Estimation for Dietary Management: A <span class="highlight-title">Transformer</span> Approach with
+  Depth Sensing 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.01938v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.01938v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Zhengyi Kwan, Wei Zhang, Zhengkui Wang, Aik Beng Ng, Simon See
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Nutrition estimation is crucial for effective dietary management and overall
+health and well-being. Existing methods often struggle with sub-optimal
+accuracy and can be time-consuming. In this paper, we propose NuNet, a
+transformer-based network designed for nutrition estimation that utilizes both
+RGB and depth information from food images. We have designed and implemented a
+multi-scale encoder and decoder, along with two types of feature fusion
+modules, specialized for estimating five nutritional factors. These modules
+effectively balance the efficiency and effectiveness of feature extraction with
+flexible usage of our customized attention mechanisms and fusion strategies.
+Our experimental study shows that NuNet outperforms its variants and existing
+solutions significantly for nutrition estimation. It achieves an error rate of
+15.65%, the lowest known to us, largely due to our multi-scale architecture and
+fusion modules. This research holds practical values for dietary management
+with huge potential for transnational research and deployment and could inspire
+other applications involving multiple data types with varying degrees of
+importance.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>10 pages</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Multi-layer Learnable Attention Mask for Multimodal Tasks 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.02761v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.02761v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Wayner Barrios, SouYoung Jin
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  While the Self-Attention mechanism in the Transformer model has proven to be
+effective in many domains, we observe that it is less effective in more diverse
+settings (e.g. multimodality) due to the varying granularity of each token and
+the high computational demands of lengthy sequences. To address the challenges,
+we introduce the Learnable Attention Mask (LAM), strategically designed to
+globally regulate attention maps and prioritize critical tokens within the
+sequence. Leveraging the Self-Attention module in a BERT-like transformer
+network, our approach adeptly captures associations between tokens. The
+extension of the LAM to a multi-layer version accommodates the varied
+information aspects embedded at each layer of the Transformer network.
+Comprehensive experimental validation on various datasets, such as MADv2,
+QVHighlights, ImageNet 1K, and MSRVTT, demonstrates the efficacy of the LAM,
+exemplifying its ability to enhance model performance while mitigating
+redundant computations. This pioneering approach presents a significant
+advancement in enhancing the understanding of complex scenarios, such as in
+movie understanding.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Advancing Unsupervised Low-light Image Enhancement: Noise Estimation,
+  Illumination Interpolation, and Self-Regulation 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2305.10223v4">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2305.10223v4.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Xiaofeng Liu, Jiaxin Gao, Xin Fan, Risheng Liu
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Contemporary Low-Light Image Enhancement (LLIE) techniques have made notable
+advancements in preserving image details and enhancing contrast, achieving
+commendable results on specific datasets. Nevertheless, these approaches
+encounter persistent challenges in efficiently mitigating dynamic noise and
+accommodating diverse low-light scenarios. Insufficient constraints on complex
+pixel-wise mapping learning lead to overfitting to specific types of noise and
+artifacts associated with low-light conditions, reducing effectiveness in
+variable lighting scenarios. To this end, we first propose a method for
+estimating the noise level in low light images in a quick and accurate way.
+This facilitates precise denoising, prevents over-smoothing, and adapts to
+dynamic noise patterns. Subsequently, we devise a Learnable Illumination
+Interpolator (LII), which employs learnlable interpolation operations between
+the input and unit vector to satisfy general constraints between illumination
+and input. Finally, we introduce a self-regularization loss that incorporates
+intrinsic image properties and essential visual attributes to guide the output
+towards meeting human visual expectations. Comprehensive experiments validate
+the competitiveness of our proposed algorithm in both qualitative and
+quantitative assessments. Notably, our noise estimation method, with linear
+time complexity and suitable for various denoisers, significantly improves both
+denoising and enhancement performance. Benefiting from this, our approach
+achieves a 0.675dB PSNR improvement on the LOL dataset and 0.818dB on the MIT
+dataset on LLIE task, even compared to supervised methods. The source code is
+available at \href{https://doi.org/10.5281/zenodo.11463142}{this DOI
+repository} and the specific code for noise estimation can be found at
+\href{https://github.com/GoogolplexGoodenough/noise_estimate}{this separate
+GitHub link}.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Image processing, low-light image enhancement, noise estimation,
+  illumination learning</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Extreme Compression of Adaptive Neural Images 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.16807v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.16807v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Leo Hoshikawa, Marcos V. Conde, Takeshi Ohashi, Atsushi Irie
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Implicit Neural Representations (INRs) and Neural Fields are a novel paradigm
+for signal representation, from images and audio to 3D scenes and videos. The
+fundamental idea is to represent a signal as a continuous and differentiable
+neural network. This idea offers unprecedented benefits such as continuous
+resolution and memory efficiency, enabling new compression techniques. However,
+representing data as neural networks poses new challenges. For instance, given
+a 2D image as a neural network, how can we further compress such a neural
+image?. In this work, we present a novel analysis on compressing neural fields,
+with the focus on images. We also introduce Adaptive Neural Images (ANI), an
+efficient neural representation that enables adaptation to different inference
+or transmission requirements. Our proposed method allows to reduce the
+bits-per-pixel (bpp) of the neural image by 4x, without losing sensitive
+details or harming fidelity. We achieve this thanks to our successful
+implementation of 4-bit neural representations. Our work offers a new framework
+for developing compressed neural fields.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Technical Report. Work in progress</span>
+                                        </div>
+                                </details>
+                            </article>
+                    </div>
+                </details>
+            </article>
+    </section>
+    <section class="day-container">
+        <div class="date">
+            <time datetime="2024-06-03T00:00:00Z">2024-06-03</time>
+        </div>
+            <article>
+                <details>
+                    <Summary>
+                        Computation and Language <span class="chip" style="font-size: 60%">73</span>
+                    </Summary>
+                    <div class="details-content">
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ A <span class="highlight-title">Survey</span> on Self-Evolution of Large Language Models 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2404.14387v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2404.14387v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Zhengwei Tao, Ting-En Lin, Xiancai Chen, Hangyu Li, Yuchuan Wu, Yongbin Li, Zhi Jin, Fei Huang, Dacheng Tao, Jingren Zhou
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Large language models (LLMs) have significantly advanced in various fields
+and intelligent agent applications. However, current LLMs that learn from human
+or external model supervision are costly and may face performance ceilings as
+task complexity and diversity increase. To address this issue, self-evolution
+approaches that enable LLM to autonomously acquire, refine, and learn from
+experiences generated by the model itself are rapidly growing. This new
+training paradigm inspired by the human experiential learning process offers
+the potential to scale LLMs towards superintelligence. In this work, we present
+a comprehensive survey of self-evolution approaches in LLMs. We first propose a
+conceptual framework for self-evolution and outline the evolving process as
+iterative cycles composed of four phases: experience acquisition, experience
+refinement, updating, and evaluation. Second, we categorize the evolution
+objectives of LLMs and LLM-based agents; then, we summarize the literature and
+provide taxonomy and insights for each module. Lastly, we pinpoint existing
+challenges and propose future directions to improve self-evolution frameworks,
+equipping researchers with critical insights to fast-track the development of
+self-evolving LLMs. Our corresponding GitHub repository is available at
+https://github.com/AlibabaResearch/DAMO-ConvAI/tree/main/Awesome-Self-Evolution-of-LLM
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ PixT3: Pixel-based Table-To-Text Generation 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2311.09808v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2311.09808v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Iñigo Alonso, Eneko Agirre, Mirella Lapata
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Table-to-text generation involves generating appropriate textual descriptions
+given structured tabular data. It has attracted increasing attention in recent
+years thanks to the popularity of neural network models and the availability of
+large-scale datasets. A common feature across existing methods is their
+treatment of the input as a string, i.e., by employing linearization techniques
+that do not always preserve information in the table, are verbose, and lack
+space efficiency. We propose to rethink data-to-text generation as a visual
+recognition task, removing the need for rendering the input in a string format.
+We present PixT3, a multimodal table-to-text model that overcomes the
+challenges of linearization and input size limitations encountered by existing
+models. PixT3 is trained with a new self-supervised learning objective to
+reinforce table structure awareness and is applicable to open-ended and
+controlled generation settings. Experiments on the ToTTo and Logic2Text
+benchmarks show that PixT3 is competitive and, in some settings, superior to
+generators that operate solely on text.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Arrows of Time for Large Language Models 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2401.17505v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2401.17505v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Vassilis Papadopoulos, Jérémie Wenger, Clément Hongler
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  We study the probabilistic modeling performed by Autoregressive Large
+Language Models (LLMs) through the angle of time directionality, addressing a
+question first raised in (Shannon, 1951). For large enough models, we
+empirically find a time asymmetry in their ability to learn natural language: a
+difference in the average log-perplexity when trying to predict the next token
+versus when trying to predict the previous one. This difference is at the same
+time subtle and very consistent across various modalities (language, model
+size, training time, ...). Theoretically, this is surprising: from an
+information-theoretic point of view, there should be no such difference. We
+provide a theoretical framework to explain how such an asymmetry can appear
+from sparsity and computational complexity considerations, and outline a number
+of perspectives opened by our results.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Re-arranged and updated figures. Added experiments. 12 figures, 20
+  pages</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Hierarchical Open-Vocabulary 3D Scene Graphs for Language-Grounded Robot
+  Navigation 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2403.17846v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2403.17846v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Abdelrhman Werby, Chenguang Huang, Martin Büchner, Abhinav Valada, Wolfram Burgard
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Recent open-vocabulary robot mapping methods enrich dense geometric maps with
+pre-trained visual-language features. While these maps allow for the prediction
+of point-wise saliency maps when queried for a certain language concept,
+large-scale environments and abstract queries beyond the object level still
+pose a considerable hurdle, ultimately limiting language-grounded robotic
+navigation. In this work, we present HOV-SG, a hierarchical open-vocabulary 3D
+scene graph mapping approach for language-grounded robot navigation. Leveraging
+open-vocabulary vision foundation models, we first obtain state-of-the-art
+open-vocabulary segment-level maps in 3D and subsequently construct a 3D scene
+graph hierarchy consisting of floor, room, and object concepts, each enriched
+with open-vocabulary features. Our approach is able to represent multi-story
+buildings and allows robotic traversal of those using a cross-floor Voronoi
+graph. HOV-SG is evaluated on three distinct datasets and surpasses previous
+baselines in open-vocabulary semantic accuracy on the object, room, and floor
+level while producing a 75% reduction in representation size compared to dense
+open-vocabulary maps. In order to prove the efficacy and generalization
+capabilities of HOV-SG, we showcase successful long-horizon
+language-conditioned robot navigation within real-world multi-storage
+environments. We provide code and trial video data at http://hovsg.github.io/.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Code and video are available at http://hovsg.github.io/</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Are Language Models More Like Libraries or Like Librarians?
+  Bibliotechnism, the Novel Reference Problem, and the Attitudes of LLMs 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2401.04854v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2401.04854v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Harvey Lederman, Kyle Mahowald
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Are LLMs cultural technologies like photocopiers or printing presses, which
+transmit information but cannot create new content? A challenge for this idea,
+which we call bibliotechnism, is that LLMs generate novel text. We begin with a
+defense of bibliotechnism, showing how even novel text may inherit its meaning
+from original human-generated text. We then argue that bibliotechnism faces an
+independent challenge from examples in which LLMs generate novel reference,
+using new names to refer to new entities. Such examples could be explained if
+LLMs were not cultural technologies but had beliefs, desires, and intentions.
+According to interpretationism in the philosophy of mind, a system has such
+attitudes if and only if its behavior is well explained by the hypothesis that
+it does. Interpretationists may hold that LLMs have attitudes, and thus have a
+simple solution to the novel reference problem. We emphasize, however, that
+interpretationism is compatible with very simple creatures having attitudes and
+differs sharply from views that presuppose these attitudes require
+consciousness, sentience, or intelligence (topics about which we make no
+claims).
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ VIEScore: Towards Explainable Metrics for Conditional Image Synthesis
+  Evaluation <span class="chip">ACL2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2312.14867v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2312.14867v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Max Ku, Dongfu Jiang, Cong Wei, Xiang Yue, Wenhu Chen
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  In the rapidly advancing field of conditional image generation research,
+challenges such as limited explainability lie in effectively evaluating the
+performance and capabilities of various models. This paper introduces VIEScore,
+a Visual Instruction-guided Explainable metric for evaluating any conditional
+image generation tasks. VIEScore leverages general knowledge from Multimodal
+Large Language Models (MLLMs) as the backbone and does not require training or
+fine-tuning. We evaluate VIEScore on seven prominent tasks in conditional image
+tasks and found: (1) VIEScore (GPT4-o) achieves a high Spearman correlation of
+0.4 with human evaluations, while the human-to-human correlation is 0.45. (2)
+VIEScore (with open-source MLLM) is significantly weaker than GPT-4o and GPT-4v
+in evaluating synthetic images. (3) VIEScore achieves a correlation on par with
+human ratings in the generation tasks but struggles in editing tasks. With
+these results, we believe VIEScore shows its great potential to replace human
+judges in evaluating image synthesis tasks.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted to ACL2024 main</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ NeuSpeech: Decode Neural signal as Speech 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2403.01748v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2403.01748v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Yiqian Yang, Yiqun Duan, Qiang Zhang, Hyejeong Jo, Jinni Zhou, Won Hee Lee, Renjing Xu, Hui Xiong
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Decoding language from brain dynamics is an important open direction in the
+realm of brain-computer interface (BCI), especially considering the rapid
+growth of large language models. Compared to invasive-based signals which
+require electrode implantation surgery, non-invasive neural signals (e.g. EEG,
+MEG) have attracted increasing attention considering their safety and
+generality. However, the exploration is not adequate in three aspects: 1)
+previous methods mainly focus on EEG but none of the previous works address
+this problem on MEG with better signal quality; 2) prior works have
+predominantly used $``teacher-forcing"$ during generative decoding, which is
+impractical; 3) prior works are mostly $``BART-based"$ not fully
+auto-regressive, which performs better in other sequence tasks. In this paper,
+we explore the brain-to-text translation of MEG signals in a speech-decoding
+formation. Here we are the first to investigate a cross-attention-based
+``whisper" model for generating text directly from MEG signals without teacher
+forcing. Our model achieves impressive BLEU-1 scores of 60.30 and 52.89 without
+pretraining $\&$ teacher-forcing on two major datasets ($\textit{GWilliams}$
+and $\textit{Schoffelen}$). This paper conducts a comprehensive review to
+understand how speech decoding formation performs on the neural decoding tasks,
+including pretraining initialization, training $\&$ evaluation set splitting,
+augmentation, and scaling law. Code is available at
+https://github.com/NeuSpeech/NeuSpeech1$.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Towards Faithful and Robust LLM Specialists for Evidence-Based
+  Question-Answering 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2402.08277v5">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2402.08277v5.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Tobias Schimanski, Jingwei Ni, Mathias Kraus, Elliott Ash, Markus Leippold
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Advances towards more faithful and traceable answers of Large Language Models
+(LLMs) are crucial for various research and practical endeavors. One avenue in
+reaching this goal is basing the answers on reliable sources. However, this
+Evidence-Based QA has proven to work insufficiently with LLMs in terms of
+citing the correct sources (source quality) and truthfully representing the
+information within sources (answer attributability). In this work, we
+systematically investigate how to robustly fine-tune LLMs for better source
+quality and answer attributability. Specifically, we introduce a data
+generation pipeline with automated data quality filters, which can synthesize
+diversified high-quality training and testing data at scale. We further
+introduce four test sets to benchmark the robustness of fine-tuned specialist
+models. Extensive evaluation shows that fine-tuning on synthetic data improves
+performance on both in- and out-of-distribution. Furthermore, we show that data
+quality, which can be drastically improved by proposed quality filters, matters
+more than quantity in improving Evidence-Based QA.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Subtle Biases Need Subtler Measures: Dual Metrics for Evaluating
+  Representative and Affinity Bias in Large Language Models <span class="chip">ACL 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.14555v4">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.14555v4.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Abhishek Kumar, Sarfaroz Yunusov, Ali Emami
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Research on Large Language Models (LLMs) has often neglected subtle biases
+that, although less apparent, can significantly influence the models' outputs
+toward particular social narratives. This study addresses two such biases
+within LLMs: representative bias, which denotes a tendency of LLMs to generate
+outputs that mirror the experiences of certain identity groups, and affinity
+bias, reflecting the models' evaluative preferences for specific narratives or
+viewpoints. We introduce two novel metrics to measure these biases: the
+Representative Bias Score (RBS) and the Affinity Bias Score (ABS), and present
+the Creativity-Oriented Generation Suite (CoGS), a collection of open-ended
+tasks such as short story writing and poetry composition, designed with
+customized rubrics to detect these subtle biases. Our analysis uncovers marked
+representative biases in prominent LLMs, with a preference for identities
+associated with being white, straight, and men. Furthermore, our investigation
+of affinity bias reveals distinctive evaluative patterns within each model,
+akin to `bias fingerprints'. This trend is also seen in human evaluators,
+highlighting a complex interplay between human and machine bias perceptions.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>9 pages (excluding references), accepted to ACL 2024 Main Conference</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Picturing Ambiguity: A Visual Twist on the Winograd Schema Challenge <span class="chip">ACL 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.16277v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.16277v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Brendan Park, Madeline Janecek, Naser Ezzati-Jivan, Yifeng Li, Ali Emami
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Large Language Models (LLMs) have demonstrated remarkable success in tasks
+like the Winograd Schema Challenge (WSC), showcasing advanced textual
+common-sense reasoning. However, applying this reasoning to multimodal domains,
+where understanding text and images together is essential, remains a
+substantial challenge. To address this, we introduce WinoVis, a novel dataset
+specifically designed to probe text-to-image models on pronoun disambiguation
+within multimodal contexts. Utilizing GPT-4 for prompt generation and Diffusion
+Attentive Attribution Maps (DAAM) for heatmap analysis, we propose a novel
+evaluation framework that isolates the models' ability in pronoun
+disambiguation from other visual processing challenges. Evaluation of
+successive model versions reveals that, despite incremental advancements,
+Stable Diffusion 2.0 achieves a precision of 56.7% on WinoVis, only marginally
+surpassing random guessing. Further error analysis identifies important areas
+for future research aimed at advancing text-to-image models in their ability to
+interpret and interact with the complex visual world.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>9 pages (excluding references), accepted to ACL 2024 Main Conference</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Confidence Under the Hood: An Investigation into the
+  Confidence-Probability Alignment in Large Language Models <span class="chip">ACL 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.16282v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.16282v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Abhishek Kumar, Robert Morabito, Sanzhar Umbet, Jad Kabbara, Ali Emami
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  As the use of Large Language Models (LLMs) becomes more widespread,
+understanding their self-evaluation of confidence in generated responses
+becomes increasingly important as it is integral to the reliability of the
+output of these models. We introduce the concept of Confidence-Probability
+Alignment, that connects an LLM's internal confidence, quantified by token
+probabilities, to the confidence conveyed in the model's response when
+explicitly asked about its certainty. Using various datasets and prompting
+techniques that encourage model introspection, we probe the alignment between
+models' internal and expressed confidence. These techniques encompass using
+structured evaluation scales to rate confidence, including answer options when
+prompting, and eliciting the model's confidence level for outputs it does not
+recognize as its own. Notably, among the models analyzed, OpenAI's GPT-4 showed
+the strongest confidence-probability alignment, with an average Spearman's
+$\hat{\rho}$ of 0.42, across a wide range of tasks. Our work contributes to the
+ongoing efforts to facilitate risk assessment in the application of LLMs and to
+further our understanding of model trustworthiness.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>9 pages (excluding references), accepted to ACL 2024 Main Conference</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ ValiTex -- a unified validation framework for computational text-based
+  measures of social constructs 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2307.02863v5">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2307.02863v5.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Lukas Birkenmaier, Claudia Wagner, Clemens Lechner
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Guidance on how to validate computational text-based measures of social
+constructs is fragmented. While researchers generally acknowledge the
+importance of validating text-based measures, they often lack a shared
+vocabulary and a unified framework to do so. This paper introduces ValiText, a
+new validation framework designed to assist scholars in validly measuring
+social constructs in textual data. The framework is built on a conceptual
+foundation of validity in the social sciences, strengthened by an empirical
+review of validation practices in the social sciences and consultations with
+experts. Ultimately, ValiText prescribes researchers to demonstrate three types
+of validation evidence: substantive evidence (outlining the theoretical
+underpinning of the measure), structural evidence (examining the properties of
+the text model and its output) and external evidence (testing for how the
+measure relates to independent information). The framework is further
+supplemented by a checklist of validation steps, offering practical guidance in
+the form of documentation sheets that guide researchers in the validation
+process.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ XAI4LLM. Let Machine Learning Models and LLMs Collaborate for Enhanced
+  In-Context Learning in Healthcare 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.06270v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.06270v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Fatemeh Nazary, Yashar Deldjoo, Tommaso Di Noia, Eugenio di Sciascio
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  The integration of Large Language Models (LLMs) into healthcare diagnostics
+offers a promising avenue for clinical decision-making. This study outlines the
+development of a novel method for zero-shot/few-shot in-context learning (ICL)
+by integrating medical domain knowledge using a multi-layered structured
+prompt. We also explore the efficacy of two communication styles between the
+user and LLMs: the Numerical Conversational (NC) style, which processes data
+incrementally, and the Natural Language Single-Turn (NL-ST) style, which
+employs long narrative prompts.
+  Our study systematically evaluates the diagnostic accuracy and risk factors,
+including gender bias and false negative rates, using a dataset of 920 patient
+records in various few-shot scenarios. Results indicate that traditional
+clinical machine learning (ML) models generally outperform LLMs in zero-shot
+and few-shot settings. However, the performance gap narrows significantly when
+employing few-shot examples alongside effective explainable AI (XAI) methods as
+sources of domain knowledge. Moreover, with sufficient time and an increased
+number of examples, the conversational style (NC) nearly matches the
+performance of ML models. Most notably, LLMs demonstrate comparable or superior
+cost-sensitive accuracy relative to ML models.
+  This research confirms that, with appropriate domain knowledge and tailored
+communication strategies, LLMs can significantly enhance diagnostic processes.
+The findings highlight the importance of optimizing the number of training
+examples and communication styles to improve accuracy and reduce biases in LLM
+applications.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Here's a Free Lunch: Sanitizing Backdoored Models with Model Merge <span class="chip">ACL2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2402.19334v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2402.19334v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Ansh Arora, Xuanli He, Maximilian Mozes, Srinibas Swain, Mark Dras, Qiongkai Xu
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  The democratization of pre-trained language models through open-source
+initiatives has rapidly advanced innovation and expanded access to cutting-edge
+technologies. However, this openness also brings significant security risks,
+including backdoor attacks, where hidden malicious behaviors are triggered by
+specific inputs, compromising natural language processing (NLP) system
+integrity and reliability. This paper suggests that merging a backdoored model
+with other homogeneous models can significantly remediate backdoor
+vulnerabilities even if such models are not entirely secure. In our
+experiments, we verify our hypothesis on various models (BERT-Base,
+RoBERTa-Large, Llama2-7B, and Mistral-7B) and datasets (SST-2, OLID, AG News,
+and QNLI). Compared to multiple advanced defensive approaches, our method
+offers an effective and efficient inference-stage defense against backdoor
+attacks on classification and instruction-tuned tasks without additional
+resources or specific knowledge. Our approach consistently outperforms recent
+advanced baselines, leading to an average of about 75% reduction in the attack
+success rate. Since model merging has been an established approach for
+improving model performance, the extra advantage it provides regarding defense
+can be seen as a cost-free bonus.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>accepted to ACL2024 (Findings)</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Significance of Chain of Thought in Gender Bias Mitigation for
+  English-Dravidian Machine Translation 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.19701v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.19701v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Lavanya Prahallad, Radhika Mamidi
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Gender bias in machine translation (MT) sys- tems poses a significant
+challenge to achieving accurate and inclusive translations. This paper examines
+gender bias in machine translation systems for languages such as Telugu and
+Kan- nada from the Dravidian family, analyzing how gender inflections affect
+translation accuracy and neutrality using Google Translate and Chat- GPT. It
+finds that while plural forms can reduce bias, individual-centric sentences
+often main- tain the bias due to historical stereotypes. The study evaluates
+the Chain of Thought process- ing, noting significant bias mitigation from 80%
+to 4% in Telugu and from 40% to 0% in Kan- nada. It also compares Telugu and
+Kannada translations, emphasizing the need for language specific strategies to
+address these challenges and suggesting directions for future research to
+enhance fairness in both data preparation and prompts during inference.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>6 pages</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Pediatrics<span class="highlight-title">GPT</span>: Large Language Models as Chinese Medical Assistants for
+  Pediatric Applications 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.19266v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.19266v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Dingkang Yang, Jinjie Wei, Dongling Xiao, Shunli Wang, Tong Wu, Gang Li, Mingcheng Li, Shuaibing Wang, Jiawei Chen, Yue Jiang, Qingyao Xu, Ke Li, Peng Zhai, Lihua Zhang
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Developing intelligent pediatric consultation systems offers promising
+prospects for improving diagnostic efficiency, especially in China, where
+healthcare resources are scarce. Despite recent advances in Large Language
+Models (LLMs) for Chinese medicine, their performance is sub-optimal in
+pediatric applications due to inadequate instruction data and vulnerable
+training procedures. To address the above issues, this paper builds PedCorpus,
+a high-quality dataset of over 300,000 multi-task instructions from pediatric
+textbooks, guidelines, and knowledge graph resources to fulfil diverse
+diagnostic demands. Upon well-designed PedCorpus, we propose PediatricsGPT, the
+first Chinese pediatric LLM assistant built on a systematic and robust training
+pipeline. In the continuous pre-training phase, we introduce a hybrid
+instruction pre-training mechanism to mitigate the internal-injected knowledge
+inconsistency of LLMs for medical domain adaptation. Immediately, the
+full-parameter Supervised Fine-Tuning (SFT) is utilized to incorporate the
+general medical knowledge schema into the models. After that, we devise a
+direct following preference optimization to enhance the generation of
+pediatrician-like humanistic responses. In the parameter-efficient secondary
+SFT phase, a mixture of universal-specific experts strategy is presented to
+resolve the competency conflict between medical generalist and pediatric
+expertise mastery. Extensive results based on the metrics, GPT-4, and doctor
+evaluations on distinct doctor downstream tasks show that PediatricsGPT
+consistently outperforms previous Chinese medical LLMs. Our model and dataset
+will be open-source for community development.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>A Technical Report on a Chinese Medical Large Language Model</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Large Language Models are Zero-Shot Next Location Predictors 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20962v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20962v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Ciro Beneduce, Bruno Lepri, Massimiliano Luca
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Predicting the locations an individual will visit in the future is crucial
+for solving many societal issues like disease diffusion and reduction of
+pollution among many others. The models designed to tackle next-location
+prediction, however, require a significant amount of individual-level
+information to be trained effectively. Such data may be scarce or even
+unavailable in some geographic regions or peculiar scenarios (e.g., cold-start
+in recommendation systems). Moreover, the design of a next-location predictor
+able to generalize or geographically transfer knowledge is still an open
+research challenge. Recent advances in natural language processing have led to
+a rapid diffusion of Large Language Models (LLMs) which have shown good
+generalization and reasoning capabilities. These insights, coupled with the
+recent findings that LLMs are rich in geographical knowledge, allowed us to
+believe that these models can act as zero-shot next-location predictors. This
+paper evaluates the capabilities of many popular LLMs in this role,
+specifically Llama, GPT-3.5 and Mistral 7B. After designing a proper prompt, we
+tested the models on three real-world mobility datasets. The results show that
+LLMs can obtain accuracies up to 32.4%, a significant relative improvement of
+over 600% when compared to sophisticated DL models specifically designed for
+human mobility. Moreover, we show that other LLMs are unable to perform the
+task properly. To prevent positively biased results, we also propose a
+framework inspired by other studies to test data contamination. Finally, we
+explored the possibility of using LLMs as text-based explainers for
+next-location prediction showing that can effectively provide an explanation
+for their decision. Notably, 7B models provide more generic, but still
+reliable, explanations compared to larger counterparts. Code:
+github.com/ssai-trento/LLM-zero-shot-NL
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Human vs. Machine: Behavioral Differences Between Expert Humans and
+  Language Models in Wargame Simulations 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2403.03407v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2403.03407v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Max Lamparth, Anthony Corso, Jacob Ganz, Oriana Skylar Mastro, Jacquelyn Schneider, Harold Trinkunas
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  To some, the advent of artificial intelligence (AI) promises better
+decision-making and increased military effectiveness while reducing the
+influence of human error and emotions. However, there is still debate about how
+AI systems, especially large language models (LLMs), behave compared to humans
+in high-stakes military decision-making scenarios with the potential for
+increased risks towards escalation and unnecessary conflicts. To test this
+potential and scrutinize the use of LLMs for such purposes, we use a new
+wargame experiment with 107 national security experts designed to look at
+crisis escalation in a fictional US-China scenario and compare human players to
+LLM-simulated responses in separate simulations. Wargames have a long history
+in the development of military strategy and the response of nations to threats
+or attacks. Here, we show a considerable high-level agreement in the LLM and
+human responses and significant quantitative and qualitative differences in
+individual actions and strategic tendencies. These differences depend on
+intrinsic biases in LLMs regarding the appropriate level of violence following
+strategic instructions, the choice of LLM, and whether the LLMs are tasked to
+decide for a team of players directly or first to simulate dialog between
+players. When simulating the dialog, the discussions lack quality and maintain
+a farcical harmony. The LLM simulations cannot account for human player
+characteristics, showing no significant difference even for extreme traits,
+such as "pacifist" or "aggressive sociopath". Our results motivate policymakers
+to be cautious before granting autonomy or following AI-based strategy
+recommendations.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Updated with new plot and more details</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Boosting Language Models Reasoning with Chain-of-Knowledge <span class="highlight-title">Prompt</span>ing <span class="chip">ACL 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2306.06427v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2306.06427v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Jianing Wang, Qiushi Sun, Xiang Li, Ming Gao
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Recently, Chain-of-Thought (CoT) prompting has delivered success on complex
+reasoning tasks, which aims at designing a simple prompt like ``Let's think
+step by step'' or multiple in-context exemplars with well-designed rationales
+to elicit Large Language Models (LLMs) to generate intermediate reasoning
+steps. However, the generated rationales often come with mistakes, making
+unfactual and unfaithful reasoning chains. To mitigate this brittleness, we
+propose a novel Chain-of-Knowledge (CoK) prompting, where we aim at eliciting
+LLMs to generate explicit pieces of knowledge evidence in the form of structure
+triple. This is inspired by our human behaviors, i.e., we can draw a mind map
+or knowledge map as the reasoning evidence in the brain before answering a
+complex question. Benefiting from CoK, we additionally introduce a
+F^2-Verification method to estimate the reliability of the reasoning chains in
+terms of factuality and faithfulness. For the unreliable response, the wrong
+evidence can be indicated to prompt the LLM to rethink. Extensive experiments
+demonstrate that our method can further improve the performance of commonsense,
+factual, symbolic, and arithmetic reasoning tasks.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>ACL 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Decoding Compressed Trust: Scrutinizing the Trustworthiness of Efficient
+  LLMs Under Compression <span class="chip">ICML'24</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2403.15447v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2403.15447v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Junyuan Hong, Jinhao Duan, Chenhui Zhang, Zhangheng Li, Chulin Xie, Kelsey Lieberman, James Diffenderfer, Brian Bartoldson, Ajay Jaiswal, Kaidi Xu, Bhavya Kailkhura, Dan Hendrycks, Dawn Song, Zhangyang Wang, Bo Li
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Compressing high-capability Large Language Models (LLMs) has emerged as a
+favored strategy for resource-efficient inferences. While state-of-the-art
+(SoTA) compression methods boast impressive advancements in preserving benign
+task performance, the potential risks of compression in terms of safety and
+trustworthiness have been largely neglected. This study conducts the first,
+thorough evaluation of three (3) leading LLMs using five (5) SoTA compression
+techniques across eight (8) trustworthiness dimensions. Our experiments
+highlight the intricate interplay between compression and trustworthiness,
+revealing some interesting patterns. We find that quantization is currently a
+more effective approach than pruning in achieving efficiency and
+trustworthiness simultaneously. For instance, a 4-bit quantized model retains
+the trustworthiness of its original counterpart, but model pruning
+significantly degrades trustworthiness, even at 50% sparsity. Moreover,
+employing quantization within a moderate bit range could unexpectedly improve
+certain trustworthiness dimensions such as ethics and fairness. Conversely,
+extreme quantization to very low bit levels (3 bits) tends to reduce
+trustworthiness significantly. This increased risk cannot be uncovered by
+looking at benign performance alone, in turn, mandating comprehensive
+trustworthiness evaluation in practice. These findings culminate in practical
+recommendations for simultaneously achieving high utility, efficiency, and
+trustworthiness in LLMs. Code and models are available at
+https://decoding-comp-trust.github.io.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted to ICML'24</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Aligner: Efficient Alignment by Learning to Correct 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2402.02416v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2402.02416v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Jiaming Ji, Boyuan Chen, Hantao Lou, Donghai Hong, Borong Zhang, Xuehai Pan, Juntao Dai, Tianyi Qiu, Yaodong Yang
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  With the rapid development of large language models (LLMs) and ever-evolving
+practical requirements, finding an efficient and effective alignment method has
+never been more critical. However, the tension between the complexity of
+current alignment methods and the need for rapid iteration in deployment
+scenarios necessitates the development of a model-agnostic alignment approach
+that can operate under these constraints. In this paper, we introduce Aligner,
+a novel and simple alignment paradigm that learns the correctional residuals
+between preferred and dispreferred answers using a small model. Designed as a
+model-agnostic, plug-and-play module, Aligner can be directly applied to
+various open-source and API-based models with only one-off training, making it
+suitable for rapid iteration. Notably, Aligner can be applied to any powerful,
+large-scale upstream models. Moreover, it can even iteratively bootstrap the
+upstream models using corrected responses as synthetic human preference data,
+breaking through the model's performance ceiling. Our experiments demonstrate
+performance improvements by deploying the same Aligner model across 11
+different LLMs, evaluated on the 3H dimensions (helpfulness, harmlessness, and
+honesty). Specifically, Aligner-7B has achieved an average improvement of
+68.9\% in helpfulness and 23.8\% in harmlessness across the tested LLMs while
+also effectively reducing hallucination. In the Alpaca-Eval leaderboard,
+stacking Aligner-2B on GPT-4 Turbo improved its LC Win Rate from 55.0\% to
+58.3\%, surpassing GPT-4 Omni's 57.5\% Win Rate (community report).
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>29 pages</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Agent Smith: A Single Image Can Jailbreak One Million Multimodal LLM
+  Agents Exponentially Fast <span class="chip">ICML 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2402.08567v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2402.08567v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Xiangming Gu, Xiaosen Zheng, Tianyu Pang, Chao Du, Qian Liu, Ye Wang, Jing Jiang, Min Lin
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  A multimodal large language model (MLLM) agent can receive instructions,
+capture images, retrieve histories from memory, and decide which tools to use.
+Nonetheless, red-teaming efforts have revealed that adversarial images/prompts
+can jailbreak an MLLM and cause unaligned behaviors. In this work, we report an
+even more severe safety issue in multi-agent environments, referred to as
+infectious jailbreak. It entails the adversary simply jailbreaking a single
+agent, and without any further intervention from the adversary, (almost) all
+agents will become infected exponentially fast and exhibit harmful behaviors.
+To validate the feasibility of infectious jailbreak, we simulate multi-agent
+environments containing up to one million LLaVA-1.5 agents, and employ
+randomized pair-wise chat as a proof-of-concept instantiation for multi-agent
+interaction. Our results show that feeding an (infectious) adversarial image
+into the memory of any randomly chosen agent is sufficient to achieve
+infectious jailbreak. Finally, we derive a simple principle for determining
+whether a defense mechanism can provably restrain the spread of infectious
+jailbreak, but how to design a practical defense that meets this principle
+remains an open question to investigate. Our project page is available at
+https://sail-sg.github.io/Agent-Smith/.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>ICML 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Comparing Inferential Strategies of Humans and Large Language Models in
+  Deductive Reasoning <span class="chip">ACL 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2402.14856v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2402.14856v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Philipp Mondorf, Barbara Plank
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Deductive reasoning plays a pivotal role in the formulation of sound and
+cohesive arguments. It allows individuals to draw conclusions that logically
+follow, given the truth value of the information provided. Recent progress in
+the domain of large language models (LLMs) has showcased their capability in
+executing deductive reasoning tasks. Nonetheless, a significant portion of
+research primarily assesses the accuracy of LLMs in solving such tasks, often
+overlooking a deeper analysis of their reasoning behavior. In this study, we
+draw upon principles from cognitive psychology to examine inferential
+strategies employed by LLMs, through a detailed evaluation of their responses
+to propositional logic problems. Our findings indicate that LLMs display
+reasoning patterns akin to those observed in humans, including strategies like
+$\textit{supposition following}$ or $\textit{chain construction}$. Moreover,
+our research demonstrates that the architecture and scale of the model
+significantly affect its preferred method of reasoning, with more advanced
+models tending to adopt strategies more frequently than less sophisticated
+ones. Importantly, we assert that a model's accuracy, that is the correctness
+of its final conclusion, does not necessarily reflect the validity of its
+reasoning process. This distinction underscores the necessity for more nuanced
+evaluation procedures in the field.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>ACL 2024 main, 31 pages, 19 figures</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ One-Shot Learning as Instruction Data Prospector for Large Language
+  Models <span class="chip">ACL 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2312.10302v4">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2312.10302v4.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Yunshui Li, Binyuan Hui, Xiaobo Xia, Jiaxi Yang, Min Yang, Lei Zhang, Shuzheng Si, Ling-Hao Chen, Junhao Liu, Tongliang Liu, Fei Huang, Yongbin Li
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Contemporary practices in instruction tuning often hinge on enlarging data
+scaling without a clear strategy for ensuring data quality, inadvertently
+introducing noise that may compromise model performance. To address this
+challenge, we introduce \textsc{Nuggets}, a novel and efficient methodology
+that leverages one-shot learning to discern and select high-quality instruction
+data from extensive datasets. \textsc{Nuggets} assesses the potential of
+individual instruction examples to act as effective one-shot learning
+instances, thereby identifying those that can significantly improve performance
+across diverse tasks. \textsc{Nuggets} utilizes a scoring system based on the
+impact of candidate examples on the perplexity of a diverse anchor set,
+facilitating the selection of the most advantageous data for instruction
+tuning. Through comprehensive evaluations on two benchmarks, including MT-Bench
+and Alpaca-Eval, we show that instruction tuning with the top 1\% of examples
+curated by \textsc{Nuggets} substantially outperforms conventional methods
+employing the entire dataset.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>ACL 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ TextBind: Multi-turn Interleaved Multimodal Instruction-following in the
+  Wild <span class="chip">ACL 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2309.08637v5">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2309.08637v5.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Huayang Li, Siheng Li, Deng Cai, Longyue Wang, Lemao Liu, Taro Watanabe, Yujiu Yang, Shuming Shi
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Large language models with instruction-following abilities have
+revolutionized the field of artificial intelligence. These models show
+exceptional generalizability to tackle various real-world tasks through their
+natural language interfaces. However, their performance heavily relies on
+high-quality exemplar data, which is often difficult to obtain. This challenge
+is further exacerbated when it comes to multimodal instruction following. We
+introduce TextBind, an almost annotation-free framework for empowering larger
+language models with the multi-turn interleaved multimodal
+instruction-following capabilities. Our approach requires only image-caption
+pairs and generates multi-turn multimodal instruction-response conversations
+from a language model. To accommodate interleaved image-text inputs and
+outputs, we devise MIM, a language model-centric architecture that seamlessly
+integrates image encoder and decoder models. We release our dataset, model, and
+demo to foster future research in the area of multimodal instruction following.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Findings of ACL 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ LangBridge: Multilingual Reasoning Without Multilingual Supervision <span class="chip">ACL 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2401.10695v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2401.10695v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Dongkeun Yoon, Joel Jang, Sungdong Kim, Seungone Kim, Sheikh Shafayat, Minjoon Seo
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  We introduce LangBridge, a zero-shot approach to adapt language models for
+multilingual reasoning tasks without multilingual supervision. LangBridge
+operates by bridging two models, each specialized in different aspects: (1) one
+specialized in understanding multiple languages (e.g., mT5 encoder) and (2) one
+specialized in reasoning (e.g., MetaMath). LangBridge connects the two models
+by introducing minimal trainable parameters between them. Despite utilizing
+only English data for training, LangBridge considerably enhances the
+performance of language models on low-resource languages across mathematical
+reasoning, code completion, logical reasoning, and commonsense reasoning. Our
+analysis suggests that the efficacy of LangBridge stems from the
+language-agnostic characteristics of multilingual representations. We publicly
+release our code and models.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>ACL 2024 Main</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Graph-enhanced Large Language Models in Asynchronous Plan Reasoning <span class="chip">ICML-2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2402.02805v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2402.02805v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Fangru Lin, Emanuele La Malfa, Valentin Hofmann, Elle Michelle Yang, Anthony Cohn, Janet B. Pierrehumbert
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Planning is a fundamental property of human intelligence. Reasoning about
+asynchronous plans is challenging since it requires sequential and parallel
+planning to optimize time costs. Can large language models (LLMs) succeed at
+this task? Here, we present the first large-scale study investigating this
+question. We find that a representative set of closed and open-source LLMs,
+including GPT-4 and LLaMA-2, behave poorly when not supplied with illustrations
+about the task-solving process in our benchmark AsyncHow. We propose a novel
+technique called Plan Like a Graph (PLaG) that combines graphs with natural
+language prompts and achieves state-of-the-art results. We show that although
+PLaG can boost model performance, LLMs still suffer from drastic degradation
+when task complexity increases, highlighting the limits of utilizing LLMs for
+simulating digital devices. We see our study as an exciting step towards using
+LLMs as efficient autonomous agents. Our code and data are available at
+https://github.com/fangru-lin/graph-llm-asynchow-plan.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted at ICML-2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ ZeroNLG: Aligning and Autoencoding Domains for Zero-Shot Multimodal and
+  Multilingual Natural Language Generation 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2303.06458v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2303.06458v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Bang Yang, Fenglin Liu, Yuexian Zou, Xian Wu, Yaowei Wang, David A. Clifton
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Natural Language Generation (NLG) accepts input data in the form of images,
+videos, or text and generates corresponding natural language text as output.
+Existing NLG methods mainly adopt a supervised approach and rely heavily on
+coupled data-to-text pairs. However, for many targeted scenarios and for
+non-English languages, sufficient quantities of labeled data are often not
+available. To relax the dependency on labeled data of downstream tasks, we
+propose an intuitive and effective zero-shot learning framework, ZeroNLG, which
+can deal with multiple NLG tasks, including image-to-text (image captioning),
+video-to-text (video captioning), and text-to-text (neural machine
+translation), across English, Chinese, German, and French within a unified
+framework. ZeroNLG does not require any labeled downstream pairs for training.
+During training, ZeroNLG (i) projects different domains (across modalities and
+languages) to corresponding coordinates in a shared common latent space; (ii)
+bridges different domains by aligning their corresponding coordinates in this
+space; and (iii) builds an unsupervised multilingual auto-encoder to learn to
+generate text by reconstructing the input text given its coordinate in shared
+latent space. Consequently, during inference, based on the data-to-text
+pipeline, ZeroNLG can generate target sentences across different languages
+given the coordinate of input data in the common space. Within this unified
+framework, given visual (imaging or video) data as input, ZeroNLG can perform
+zero-shot visual captioning; given textual sentences as input, ZeroNLG can
+perform zero-shot machine translation. We present the results of extensive
+experiments on twelve NLG tasks, showing that, without using any labeled
+downstream pairs for training, ZeroNLG generates high-quality and believable
+outputs and significantly outperforms existing zero-shot methods.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted by TPAMI (Our code and data are available at
+  https://github.com/yangbang18/ZeroNLG)</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ OpenRLHF: An Easy-to-use, Scalable and High-performance RLHF Framework 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.11143v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.11143v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Jian Hu, Xibin Wu, Weixun Wang,  Xianyu, Dehao Zhang, Yu Cao
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  As large language models (LLMs) continue to grow by scaling laws,
+reinforcement learning from human feedback (RLHF) has gained significant
+attention due to its outstanding performance. However, unlike pretraining or
+fine-tuning a single model, scaling reinforcement learning from human feedback
+(RLHF) for training large language models poses coordination challenges across
+four models. We present OpenRLHF, an open-source framework enabling efficient
+RLHF scaling. Unlike existing RLHF frameworks that co-locate four models on the
+same GPUs, OpenRLHF re-designs scheduling for the models beyond 70B parameters
+using Ray, vLLM, and DeepSpeed, leveraging improved resource utilization and
+diverse training approaches. Integrating seamlessly with Hugging Face, OpenRLHF
+provides an out-of-the-box solution with optimized algorithms and launch
+scripts, which ensures user-friendliness. OpenRLHF implements RLHF, DPO,
+rejection sampling, and other alignment techniques. Empowering state-of-the-art
+LLM development, OpenRLHF's code is available at
+https://github.com/OpenLLMAI/OpenRLHF.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Fundamental Limitations of Alignment in Large Language Models 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2304.11082v6">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2304.11082v6.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Yotam Wolf, Noam Wies, Oshri Avnery, Yoav Levine, Amnon Shashua
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  An important aspect in developing language models that interact with humans
+is aligning their behavior to be useful and unharmful for their human users.
+This is usually achieved by tuning the model in a way that enhances desired
+behaviors and inhibits undesired ones, a process referred to as alignment. In
+this paper, we propose a theoretical approach called Behavior Expectation
+Bounds (BEB) which allows us to formally investigate several inherent
+characteristics and limitations of alignment in large language models.
+Importantly, we prove that within the limits of this framework, for any
+behavior that has a finite probability of being exhibited by the model, there
+exist prompts that can trigger the model into outputting this behavior, with
+probability that increases with the length of the prompt. This implies that any
+alignment process that attenuates an undesired behavior but does not remove it
+altogether, is not safe against adversarial prompting attacks. Furthermore, our
+framework hints at the mechanism by which leading alignment approaches such as
+reinforcement learning from human feedback make the LLM prone to being prompted
+into the undesired behaviors. This theoretical result is being experimentally
+demonstrated in large scale by the so called contemporary "chatGPT jailbreaks",
+where adversarial users trick the LLM into breaking its alignment guardrails by
+triggering it into acting as a malicious persona. Our results expose
+fundamental limitations in alignment of LLMs and bring to the forefront the
+need to devise reliable mechanisms for ensuring AI safety.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Graph Language Models <span class="chip">ACL 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2401.07105v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2401.07105v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Moritz Plenz, Anette Frank
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  While Language Models (LMs) are the workhorses of NLP, their interplay with
+structured knowledge graphs (KGs) is still actively researched. Current methods
+for encoding such graphs typically either (i) linearize them for embedding with
+LMs -- which underutilize structural information, or (ii) use Graph Neural
+Networks (GNNs) to preserve the graph structure -- but GNNs cannot represent
+text features as well as pretrained LMs. In our work we introduce a novel LM
+type, the Graph Language Model (GLM), that integrates the strengths of both
+approaches and mitigates their weaknesses. The GLM parameters are initialized
+from a pretrained LM to enhance understanding of individual graph concepts and
+triplets. Simultaneously, we design the GLM's architecture to incorporate graph
+biases, thereby promoting effective knowledge distribution within the graph.
+This enables GLMs to process graphs, texts, and interleaved inputs of both.
+Empirical evaluations on relation classification tasks show that GLM embeddings
+surpass both LM- and GNN-based baselines in supervised and zero-shot setting,
+demonstrating their versatility.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted at ACL 2024. 9 pages, 10 figures, 9 tables</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Benchmarking and Improving Compositional Generalization of Multi-aspect
+  Controllable Text Generation <span class="chip">ACL 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2404.04232v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2404.04232v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Tianqi Zhong, Zhaoyi Li, Quan Wang, Linqi Song, Ying Wei, Defu Lian, Zhendong Mao
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Compositional generalization, representing the model's ability to generate
+text with new attribute combinations obtained by recombining single attributes
+from the training data, is a crucial property for multi-aspect controllable
+text generation (MCTG) methods. Nonetheless, a comprehensive compositional
+generalization evaluation benchmark of MCTG is still lacking. We propose
+CompMCTG, a benchmark encompassing diverse multi-aspect labeled datasets and a
+crafted three-dimensional evaluation protocol, to holistically evaluate the
+compositional generalization of MCTG approaches. We observe that existing MCTG
+works generally confront a noticeable performance drop in compositional
+testing. To mitigate this issue, we introduce Meta-MCTG, a training framework
+incorporating meta-learning, where we enable models to learn how to generalize
+by simulating compositional generalization scenarios in the training phase. We
+demonstrate the effectiveness of Meta-MCTG through achieving obvious
+improvement (by at most 3.64%) for compositional testing performance in 94.4%
+cases.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted to ACL 2024 (Main); 32 pages</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Can LLMs Separate Instructions From Data? And What Do We Even Mean By
+  That? 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2403.06833v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2403.06833v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Egor Zverev, Sahar Abdelnabi, Soroush Tabesh, Mario Fritz, Christoph H. Lampert
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Instruction-tuned Large Language Models (LLMs) show impressive results in
+numerous practical applications, but they lack essential safety features that
+are common in other areas of computer science, particularly an explicit
+separation of instructions and data. This makes them vulnerable to
+manipulations such as indirect prompt injections and generally unsuitable for
+safety-critical tasks. Surprisingly, there is currently no established
+definition or benchmark to quantify this phenomenon. In this work, we close
+this gap by introducing a formal measure for instruction-data separation and an
+empirical variant that is calculable from a model's outputs. We also present a
+new dataset, SEP, that allows estimating the measure for real-world models. Our
+results on various LLMs show that the problem of instruction-data separation is
+real: all models fail to achieve high separation, and canonical mitigation
+techniques, such as prompt engineering and fine-tuning, either fail to
+substantially improve separation or reduce model utility. The source code and
+SEP dataset are openly accessible at
+https://github.com/egozverev/Shold-It-Be-Executed-Or-Processed.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>GitHub:
+  https://github.com/egozverev/Shold-It-Be-Executed-Or-Processed. 10 pages main
+  text, 30 pages in total</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Revisiting Code Similarity Evaluation with Abstract Syntax Tree Edit
+  Distance <span class="chip">ACL 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2404.08817v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2404.08817v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Yewei Song, Cedric Lothritz, Daniel Tang, Tegawendé F. Bissyandé, Jacques Klein
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  This paper revisits recent code similarity evaluation metrics, particularly
+focusing on the application of Abstract Syntax Tree (AST) editing distance in
+diverse programming languages. In particular, we explore the usefulness of
+these metrics and compare them to traditional sequence similarity metrics. Our
+experiments showcase the effectiveness of AST editing distance in capturing
+intricate code structures, revealing a high correlation with established
+metrics. Furthermore, we explore the strengths and weaknesses of AST editing
+distance and prompt-based GPT similarity scores in comparison to BLEU score,
+execution match, and Jaccard Similarity. We propose, optimize, and publish an
+adaptable metric that demonstrates effectiveness across all tested languages,
+representing an enhanced version of Tree Similarity of Edit Distance (TSED).
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>ACL 2024 Main</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Adversarial Preference Optimization: Enhancing Your Alignment via RM-LLM
+  Game <span class="chip">ACL2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2311.08045v4">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2311.08045v4.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Pengyu Cheng, Yifan Yang, Jian Li, Yong Dai, Tianhao Hu, Peixin Cao, Nan Du, Xiaolong Li
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Human preference alignment is essential to improve the interaction quality of
+large language models (LLMs). Existing alignment methods depend on manually
+annotated preference data to guide the LLM optimization directions. However,
+continuously updating LLMs for alignment raises a distribution gap between
+model-generated samples and human-annotated responses, hindering training
+effectiveness. To mitigate this issue, previous methods require additional
+preference annotation on newly generated samples to adapt to the shifted
+distribution, which consumes a large amount of annotation resources. Targeting
+more efficient human preference optimization, we propose an Adversarial
+Preference Optimization (APO) framework, in which the LLM and the reward model
+update alternatively via a min-max game. Through adversarial training, the
+reward model can adapt to the shifted generation distribution of the LLM
+without any additional annotation. With comprehensive experiments, we find the
+proposed adversarial training framework further enhances existing alignment
+baselines in terms of LLM helpfulness and harmlessness. The code is at
+https://github.com/Linear95/APO.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted by ACL2024 findings</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Planning, Creation, Usage: Benchmarking LLMs for Comprehensive Tool
+  Utilization in Real-World Complex Scenarios <span class="chip">ACL2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2401.17167v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2401.17167v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Shijue Huang, Wanjun Zhong, Jianqiao Lu, Qi Zhu, Jiahui Gao, Weiwen Liu, Yutai Hou, Xingshan Zeng, Yasheng Wang, Lifeng Shang, Xin Jiang, Ruifeng Xu, Qun Liu
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  The recent trend of using Large Language Models (LLMs) as tool agents in
+real-world applications underscores the necessity for comprehensive evaluations
+of their capabilities, particularly in complex scenarios involving planning,
+creating, and using tools. However, existing benchmarks typically focus on
+simple synthesized queries that do not reflect real-world complexity, thereby
+offering limited perspectives in evaluating tool utilization. To address this
+issue, we present UltraTool, a novel benchmark designed to improve and evaluate
+LLMs' ability in tool utilization within real-world scenarios. UltraTool
+focuses on the entire process of using tools - from planning and creating to
+applying them in complex tasks. It emphasizes real-world complexities,
+demanding accurate, multi-step planning for effective problem-solving. A key
+feature of UltraTool is its independent evaluation of planning with natural
+language, which happens before tool usage and simplifies the task solving by
+mapping out the intermediate steps. Thus, unlike previous work, it eliminates
+the restriction of pre-defined toolset. Through extensive experiments on
+various LLMs, we offer novel insights into the evaluation of capabilities of
+LLMs in tool utilization, thereby contributing a fresh perspective to this
+rapidly evolving field. The benchmark is publicly available at
+https://github.com/JoeYing1019/UltraTool.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted by ACL2024 Findings</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ PRE: A Peer <span class="highlight-title">Review</span> Based Large Language Model Evaluator 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2401.15641v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2401.15641v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Zhumin Chu, Qingyao Ai, Yiteng Tu, Haitao Li, Yiqun Liu
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  The impressive performance of large language models (LLMs) has attracted
+considerable attention from the academic and industrial communities. Besides
+how to construct and train LLMs, how to effectively evaluate and compare the
+capacity of LLMs has also been well recognized as an important yet difficult
+problem. Existing paradigms rely on either human annotators or model-based
+evaluators to evaluate the performance of LLMs on different tasks. However,
+these paradigms often suffer from high cost, low generalizability, and
+inherited biases in practice, which make them incapable of supporting the
+sustainable development of LLMs in long term. In order to address these issues,
+inspired by the peer review systems widely used in academic publication
+process, we propose a novel framework that can automatically evaluate LLMs
+through a peer-review process. Specifically, for the evaluation of a specific
+task, we first construct a small qualification exam to select "reviewers" from
+a couple of powerful LLMs. Then, to actually evaluate the "submissions" written
+by different candidate LLMs, i.e., the evaluatees, we use the reviewer LLMs to
+rate or compare the submissions. The final ranking of evaluatee LLMs is
+generated based on the results provided by all reviewers. We conducted
+extensive experiments on text summarization tasks with eleven LLMs including
+GPT-4. The results demonstrate the existence of biasness when evaluating using
+a single LLM. Also, our PRE model outperforms all the baselines, illustrating
+the effectiveness of the peer review mechanism.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>11 pages</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ OpenToM: A Comprehensive Benchmark for Evaluating Theory-of-Mind
+  Reasoning Capabilities of Large Language Models <span class="chip">ACL 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2402.06044v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2402.06044v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Hainiu Xu, Runcong Zhao, Lixing Zhu, Jinhua Du, Yulan He
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Neural Theory-of-Mind (N-ToM), machine's ability to understand and keep track
+of the mental states of others, is pivotal in developing socially intelligent
+agents. However, prevalent N-ToM benchmarks have several shortcomings,
+including the presence of ambiguous and artificial narratives, absence of
+personality traits and preferences, a lack of questions addressing characters'
+psychological mental states, and limited diversity in the questions posed. In
+response to these issues, we construct OpenToM, a new benchmark for assessing
+N-ToM with (1) longer and clearer narrative stories, (2) characters with
+explicit personality traits, (3) actions that are triggered by character
+intentions, and (4) questions designed to challenge LLMs' capabilities of
+modeling characters' mental states of both the physical and psychological
+world. Using OpenToM, we reveal that state-of-the-art LLMs thrive at modeling
+certain aspects of mental states in the physical world but fall short when
+tracking characters' mental states in the psychological world.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>ACL 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Balancing Speciality and Versatility: a Coarse to Fine Framework for
+  Supervised Fine-tuning Large Language Model <span class="chip">ACL 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2404.10306v4">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2404.10306v4.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Hengyuan Zhang, Yanru Wu, Dawei Li, Sak Yang, Rui Zhao, Yong Jiang, Fei Tan
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Aligned Large Language Models (LLMs) showcase remarkable versatility, capable
+of handling diverse real-world tasks. Meanwhile, aligned LLMs are also expected
+to exhibit speciality, excelling in specific applications. However, fine-tuning
+with extra data, a common practice to gain speciality, often leads to
+catastrophic forgetting (CF) of previously acquired versatility, hindering the
+model's performance across diverse tasks. In response to this challenge, we
+propose CoFiTune, a coarse to fine framework in an attempt to strike the
+balance between speciality and versatility. At the coarse-grained level, an
+empirical tree-search algorithm is utilized to pinpoint and update specific
+modules that are crucial for speciality, while keeping other parameters frozen;
+at the fine-grained level, a soft-masking mechanism regulates the update to the
+LLMs, mitigating the CF issue without harming speciality. In an overall
+evaluation of both speciality and versatility, CoFiTune consistently
+outperforms baseline methods across diverse tasks and model scales. Compared to
+the full-parameter SFT, CoFiTune leads to about 14% versatility improvement and
+marginal speciality loss on a 13B model. Lastly, based on further analysis, we
+provide a speculative insight into the information forwarding process in LLMs,
+which helps explain the effectiveness of the proposed method. The code is
+available at https://github.com/rattlesnakey/CoFiTune.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>43 pages, 10 figures, accepted by ACL 2024 Findings</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ SyntaxShap: Syntax-aware Explainability Method for Text Generation <span class="chip">ACL 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2402.09259v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2402.09259v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Kenza Amara, Rita Sevastjanova, Mennatallah El-Assady
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  To harness the power of large language models in safety-critical domains, we
+need to ensure the explainability of their predictions. However, despite the
+significant attention to model interpretability, there remains an unexplored
+domain in explaining sequence-to-sequence tasks using methods tailored for
+textual data. This paper introduces SyntaxShap, a local, model-agnostic
+explainability method for text generation that takes into consideration the
+syntax in the text data. The presented work extends Shapley values to account
+for parsing-based syntactic dependencies. Taking a game theoric approach,
+SyntaxShap only considers coalitions constraint by the dependency tree. We
+adopt a model-based evaluation to compare SyntaxShap and its weighted form to
+state-of-the-art explainability methods adapted to text generation tasks, using
+diverse metrics including faithfulness, coherency, and semantic alignment of
+the explanations to the model. We show that our syntax-aware method produces
+explanations that help build more faithful and coherent explanations for
+predictions by autoregressive models. Confronted with the misalignment of human
+and AI model reasoning, this paper also highlights the need for cautious
+evaluation strategies in explainable AI.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted to ACL 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Representation Surgery: Theory and Practice of Affine Steering <span class="chip">ICML 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2402.09631v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2402.09631v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Shashwat Singh, Shauli Ravfogel, Jonathan Herzig, Roee Aharoni, Ryan Cotterell, Ponnurangam Kumaraguru
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Language models often exhibit undesirable behavior, e.g., generating toxic or
+gender-biased text. In the case of neural language models, an encoding of the
+undesirable behavior is often present in the model's representations. Thus, one
+natural (and common) approach to prevent the model from exhibiting undesirable
+behavior is to steer the model's representations in a manner that reduces the
+probability of it generating undesirable text. This paper investigates the
+formal and empirical properties of steering functions, i.e., transformation of
+the neural language model's representations that alter its behavior. First, we
+derive two optimal, in the least-squares sense, affine steering functions under
+different constraints. Our theory provides justification for existing
+approaches and offers a novel, improved steering approach. Second, we offer a
+series of experiments that demonstrate the empirical effectiveness of the
+methods in mitigating bias and reducing toxic generation.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted in ICML 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Split and Rephrase with Large Language Models 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2312.11075v4">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2312.11075v4.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        David Ponce, Thierry Etchegoyhen, Jesús Calleja Pérez, Harritxu Gete
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  The Split and Rephrase (SPRP) task, which consists in splitting complex
+sentences into a sequence of shorter grammatical sentences, while preserving
+the original meaning, can facilitate the processing of complex texts for humans
+and machines alike. It is also a valuable testbed to evaluate natural language
+processing models, as it requires modelling complex grammatical aspects. In
+this work, we evaluate large language models on the task, showing that they can
+provide large improvements over the state of the art on the main metrics,
+although still lagging in terms of splitting compliance. Results from two human
+evaluations further support the conclusions drawn from automated metric
+results. We provide a comprehensive study that includes prompting variants,
+domain shift, fine-tuned pretrained language models of varying parameter size
+and training data volumes, contrasted with both zero-shot and few-shot
+approaches on instruction-tuned language models. Although the latter were
+markedly outperformed by fine-tuned models, they may constitute a reasonable
+off-the-shelf alternative. Our results provide a fine-grained analysis of the
+potential and limitations of large language models for SPRP, with significant
+improvements achievable using relatively small amounts of training data and
+model parameters overall, and remaining limitations for all models on the task.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Linear-time Minimum Bayes Risk Decoding with Reference Aggregation <span class="chip">ACL 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2402.04251v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2402.04251v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Jannis Vamvas, Rico Sennrich
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Minimum Bayes Risk (MBR) decoding is a text generation technique that has
+been shown to improve the quality of machine translations, but is expensive,
+even if a sampling-based approximation is used. Besides requiring a large
+number of sampled sequences, it requires the pairwise calculation of a utility
+metric, which has quadratic complexity. In this paper, we propose to
+approximate pairwise metric scores with scores calculated against aggregated
+reference representations. This changes the complexity of utility estimation
+from $O(n^2)$ to $O(n)$, while empirically preserving most of the quality gains
+of MBR decoding. We release our source code at https://github.com/ZurichNLP/mbr
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>ACL 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ MiniCPM: Unveiling the Potential of Small Language Models with Scalable
+  Training Strategies 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2404.06395v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2404.06395v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Shengding Hu, Yuge Tu, Xu Han, Chaoqun He, Ganqu Cui, Xiang Long, Zhi Zheng, Yewei Fang, Yuxiang Huang, Weilin Zhao, Xinrong Zhang, Zheng Leng Thai, Kaihuo Zhang, Chongyi Wang, Yuan Yao, Chenyang Zhao, Jie Zhou, Jie Cai, Zhongwu Zhai, Ning Ding, Chao Jia, Guoyang Zeng, Dahai Li, Zhiyuan Liu, Maosong Sun
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  The burgeoning interest in developing Large Language Models (LLMs) with up to
+trillion parameters has been met with concerns regarding resource efficiency
+and practical expense, particularly given the immense cost of experimentation.
+This scenario underscores the importance of exploring the potential of Small
+Language Models (SLMs) as a resource-efficient alternative. In this context, we
+introduce MiniCPM, specifically the 1.2B and 2.4B non-embedding parameter
+variants, not only excel in their respective categories but also demonstrate
+capabilities on par with 7B-13B LLMs. While focusing on SLMs, our approach
+exhibits scalability in both model and data dimensions for future LLM research.
+Regarding model scaling, we employ extensive model wind tunnel experiments for
+stable and optimal scaling. For data scaling, we introduce a
+Warmup-Stable-Decay (WSD) learning rate scheduler (LRS), conducive to
+continuous training and domain adaptation. We present an in-depth analysis of
+the intriguing training dynamics that occurred in the WSD LRS. With WSD LRS, we
+are now able to efficiently study data-model scaling law without extensive
+retraining experiments on both axes of model and data, from which we derive the
+much higher compute optimal data-model ratio than Chinchilla Optimal.
+Additionally, we introduce MiniCPM family, including MiniCPM-DPO, MiniCPM-MoE
+and MiniCPM-128K, whose excellent performance further cementing MiniCPM's
+foundation in diverse SLM applications. MiniCPM models are available publicly
+at https://github.com/OpenBMB/MiniCPM .
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>revise according to peer review</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ PsyEval: A Suite of Mental Health Related Tasks for Evaluating Large
+  Language Models 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2311.09189v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2311.09189v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Haoan Jin, Siyuan Chen, Dilawaier Dilixiati, Yewei Jiang, Mengyue Wu, Kenny Q. Zhu
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Evaluating Large Language Models (LLMs) in the mental health domain poses
+distinct challenged from other domains, given the subtle and highly subjective
+nature of symptoms that exhibit significant variability among individuals. This
+paper presents PsyEval, the first comprehensive suite of mental health-related
+tasks for evaluating LLMs. PsyEval encompasses five sub-tasks that evaluate
+three critical dimensions of mental health. This comprehensive framework is
+designed to thoroughly assess the unique challenges and intricacies of mental
+health-related tasks, making PsyEval a highly specialized and valuable tool for
+evaluating LLM performance in this domain. We evaluate twelve advanced LLMs
+using PsyEval. Experiment results not only demonstrate significant room for
+improvement in current LLMs concerning mental health but also unveil potential
+directions for future model optimization.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Genshin: General Shield for Natural Language Processing with Large
+  Language Models 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.18741v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.18741v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Xiao Peng, Tao Liu, Ying Wang
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Large language models (LLMs) like ChatGPT, Gemini, or LLaMA have been
+trending recently, demonstrating considerable advancement and generalizability
+power in countless domains. However, LLMs create an even bigger black box
+exacerbating opacity, with interpretability limited to few approaches. The
+uncertainty and opacity embedded in LLMs' nature restrict their application in
+high-stakes domains like financial fraud, phishing, etc. Current approaches
+mainly rely on traditional textual classification with posterior interpretable
+algorithms, suffering from attackers who may create versatile adversarial
+samples to break the system's defense, forcing users to make trade-offs between
+efficiency and robustness. To address this issue, we propose a novel cascading
+framework called Genshin (General Shield for Natural Language Processing with
+Large Language Models), utilizing LLMs as defensive one-time plug-ins. Unlike
+most applications of LLMs that try to transform text into something new or
+structural, Genshin uses LLMs to recover text to its original state. Genshin
+aims to combine the generalizability of the LLM, the discrimination of the
+median model, and the interpretability of the simple model. Our experiments on
+the task of sentimental analysis and spam detection have shown fatal flaws of
+the current median models and exhilarating results on LLMs' recovery ability,
+demonstrating that Genshin is both effective and efficient. In our ablation
+study, we unearth several intriguing observations. Utilizing the LLM defender,
+a tool derived from the 4th paradigm, we have reproduced BERT's 15% optimal
+mask rate results in the 3rd paradigm of NLP. Additionally, when employing the
+LLM as a potential adversarial tool, attackers are capable of executing
+effective attacks that are nearly semantically lossless.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Unsupervised Mutual Learning of Dialogue Discourse Parsing and Topic
+  Segmentation 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.19799v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.19799v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Jiahui Xu, Feng Jiang, Anningzhe Gao, Haizhou Li
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  The advancement of large language models (LLMs) has propelled the development
+of dialogue systems. Unlike the popular ChatGPT-like assistant model, which
+only satisfies the user's preferences, task-oriented dialogue systems have also
+faced new requirements and challenges in the broader business field. They are
+expected to provide correct responses at each dialogue turn, at the same time,
+achieve the overall goal defined by the task. By understanding rhetorical
+structures and topic structures via topic segmentation and discourse parsing, a
+dialogue system may do a better planning to achieve both objectives. However,
+while both structures belong to discourse structure in linguistics, rhetorical
+structure and topic structure are mostly modeled separately or with one
+assisting the other in the prior work. The interaction between these two
+structures has not been considered for joint modeling and mutual learning.
+Furthermore, unsupervised learning techniques to achieve the above are not well
+explored. To fill this gap, we propose an unsupervised mutual learning
+framework of two structures leveraging the global and local connections between
+them. We extend the topic modeling between non-adjacent discourse units to
+ensure global structural relevance with rhetorical structures. We also
+incorporate rhetorical structures into the topic structure through a graph
+neural network model to ensure local coherence consistency. Finally, we utilize
+the similarity between the two fused structures for mutual learning. The
+experimental results demonstrate that our methods outperform all strong
+baselines on two dialogue rhetorical datasets (STAC and Molweni), as well as
+dialogue topic datasets (Doc2Dial and TIAGE). We provide our code at
+https://github.com/Jeff-Sue/URT.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Video-LaVIT: Unified Video-Language <span class="highlight-title">Pre-train</span>ing with Decoupled
+  Visual-Motional Tokenization 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2402.03161v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2402.03161v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Yang Jin, Zhicheng Sun, Kun Xu, Kun Xu, Liwei Chen, Hao Jiang, Quzhe Huang, Chengru Song, Yuliang Liu, Di Zhang, Yang Song, Kun Gai, Yadong Mu
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  In light of recent advances in multimodal Large Language Models (LLMs), there
+is increasing attention to scaling them from image-text data to more
+informative real-world videos. Compared to static images, video poses unique
+challenges for effective large-scale pre-training due to the modeling of its
+spatiotemporal dynamics. In this paper, we address such limitations in
+video-language pre-training with an efficient video decomposition that
+represents each video as keyframes and temporal motions. These are then adapted
+to an LLM using well-designed tokenizers that discretize visual and temporal
+information as a few tokens, thus enabling unified generative pre-training of
+videos, images, and text. At inference, the generated tokens from the LLM are
+carefully recovered to the original continuous pixel space to create various
+video content. Our proposed framework is both capable of comprehending and
+generating image and video content, as demonstrated by its competitive
+performance across 13 multimodal benchmarks in image and video understanding
+and generation. Our code and models are available at
+https://video-lavit.github.io.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Hierarchical Tree-structured Knowledge Graph For Academic Insight <span class="highlight-title">Survey</span> 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2402.04854v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2402.04854v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Jinghong Li, Huy Phan, Wen Gu, Koichi Ota, Shinobu Hasegawa
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Research surveys have always posed a challenge for beginner researchers who
+lack of research training. These researchers struggle to understand the
+directions within their research topic, and the discovery of new research
+findings within a short time. One way to provide intuitive assistance to
+beginner researchers is by offering relevant knowledge graphs(KG) and
+recommending related academic papers. However, existing navigation knowledge
+graphs primarily rely on keywords in the research field and often fail to
+present the logical hierarchy among multiple related papers clearly. Moreover,
+most recommendation systems for academic papers simply rely on high text
+similarity, which can leave researchers confused as to why a particular article
+is being recommended. They may lack of grasp important information about the
+insight connection between "Issue resolved" and "Issue finding" that they hope
+to obtain. To address these issues, this study aims to support research insight
+surveys for beginner researchers by establishing a hierarchical tree-structured
+knowledge graph that reflects the inheritance insight of research topics and
+the relevance insight among the academic papers.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>This paper will be submitted to 'The 18TH International Conference on
+  INnovations in Intelligent SysTems and Applications (INISTA 2024)'</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ UP4LS: User Profile Constructed by Multiple Attributes for Enhancing
+  Linguistic Steganalysis 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2311.01775v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2311.01775v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Yihao Wang, Ruiqi Song, Lingxiao Li, Yifan Tang, Ru Zhang, Jianyi Liu
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Linguistic steganalysis (LS) tasks aim to detect whether a text contains
+secret information. Existing LS methods focus on the deep-learning model design
+and they achieve excellent results in ideal data. However, they overlook the
+unique user characteristics, leading to weak performance in social networks.
+And a few stegos here that further complicate detection. We propose the UP4LS,
+a framework with the User Profile for enhancing LS in realistic scenarios.
+Three kinds of user attributes like writing habits are explored to build the
+profile. For each attribute, the specific feature extraction module is
+designed. The extracted features are mapped to high-dimensional user features
+via the deep-learning model of the method to be improved. The content feature
+is extracted by the language model. Then user and content features are
+integrated. Existing methods can improve LS results by adding the UP4LS
+framework without changing their deep-learning models. Experiments show that
+UP4LS can significantly enhance the performance of LS-task baselines in
+realistic scenarios, with the overall Acc increased by 25%, F1 increased by
+51%, and SOTA results. The improvement is especially pronounced in fewer
+stegos. Additionally, UP4LS also sets the stage for the related-task SOTA
+methods to efficient LS.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>15 pages, 7 figures, 14 tables</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ KS-Lottery: Finding Certified Lottery Tickets for Multilingual Language
+  Models 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2402.02801v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2402.02801v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Fei Yuan, Chang Ma, Shuai Yuan, Qiushi Sun, Lei Li
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  The lottery ticket hypothesis posits the existence of ``winning tickets''
+within a randomly initialized neural network. Do winning tickets exist for LLMs
+in fine-tuning scenarios? How can we find such winning tickets? In this paper,
+we propose KS-Lottery, a method to identify a small subset of LLM parameters
+highly effective in multilingual fine-tuning. Our key idea is to use
+Kolmogorov-Smirnov Test to analyze the distribution shift of parameters before
+and after fine-tuning. We further theoretically prove that KS-Lottery can find
+the certified winning tickets in the embedding layer, fine-tuning on the found
+parameters is guaranteed to perform as well as full fine-tuning. Comparing
+KS-Lottery with other parameter-efficient tuning algorithms on translation
+tasks, the experimental results show that KS-Lottery finds a much smaller set
+of parameters for fine-tuning while achieving the comparable performance as
+full fine-tuning LLM. Surprisingly, we find that fine-tuning 18 tokens'
+embedding of LLaMA suffices to reach the fine-tuning translation
+performance~\footnote{https://github.com/CONE-MT/KS-Lottery.}.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Cleaner <span class="highlight-title">Pretrain</span>ing Corpus Curation with Neural Web Scraping 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2402.14652v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2402.14652v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Zhipeng Xu, Zhenghao Liu, Yukun Yan, Zhiyuan Liu, Ge Yu, Chenyan Xiong
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  The web contains large-scale, diverse, and abundant information to satisfy
+the information-seeking needs of humans. Through meticulous data collection,
+preprocessing, and curation, webpages can be used as a fundamental data
+resource for language model pretraining. However, when confronted with the
+progressively revolutionized and intricate nature of webpages,
+rule-based/feature-based web scrapers are becoming increasingly inadequate.
+This paper presents a simple, fast, and effective Neural web Scraper
+(NeuScraper) to help extract primary and clean text contents from webpages.
+Experimental results show that NeuScraper surpasses the baseline scrapers by
+achieving more than a 20% improvement, demonstrating its potential in
+extracting higher-quality data to facilitate the language model pretraining.
+All of the code is available at https://github.com/OpenMatch/NeuScraper.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ DoRA: Weight-Decomposed Low-Rank Adaptation 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2402.09353v5">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2402.09353v5.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Shih-Yang Liu, Chien-Yi Wang, Hongxu Yin, Pavlo Molchanov, Yu-Chiang Frank Wang, Kwang-Ting Cheng, Min-Hung Chen
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Among the widely used parameter-efficient fine-tuning (PEFT) methods, LoRA
+and its variants have gained considerable popularity because of avoiding
+additional inference costs. However, there still often exists an accuracy gap
+between these methods and full fine-tuning (FT). In this work, we first
+introduce a novel weight decomposition analysis to investigate the inherent
+differences between FT and LoRA. Aiming to resemble the learning capacity of FT
+from the findings, we propose Weight-Decomposed Low-Rank Adaptation (DoRA).
+DoRA decomposes the pre-trained weight into two components, magnitude and
+direction, for fine-tuning, specifically employing LoRA for directional updates
+to efficiently minimize the number of trainable parameters. By employing \ours,
+we enhance both the learning capacity and training stability of LoRA while
+avoiding any additional inference overhead. \ours~consistently outperforms LoRA
+on fine-tuning LLaMA, LLaVA, and VL-BART on various downstream tasks, such as
+commonsense reasoning, visual instruction tuning, and image/video-text
+understanding. Code is available at https://github.com/NVlabs/DoRA.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Code available at https://github.com/NVlabs/DoRA</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ RoSA: Accurate Parameter-Efficient Fine-Tuning via Robust Adaptation 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2401.04679v7">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2401.04679v7.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Mahdi Nikdan, Soroush Tabesh, Elvir Crnčević, Dan Alistarh
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  We investigate parameter-efficient fine-tuning (PEFT) methods that can
+provide good accuracy under limited computational and memory budgets in the
+context of large language models (LLMs). We present a new PEFT method called
+Robust Adaptation (RoSA) inspired by robust principal component analysis that
+jointly trains $\textit{low-rank}$ and $\textit{highly-sparse}$ components on
+top of a set of fixed pretrained weights to efficiently approximate the
+performance of a full-fine-tuning (FFT) solution. Across a series of
+challenging generative tasks such as grade-school math and SQL query
+generation, which require fine-tuning for good performance, we show that RoSA
+outperforms LoRA, pure sparse fine-tuning, and alternative hybrid methods at
+the same parameter budget, and can even recover the performance of FFT on some
+tasks. We provide system support for RoSA to complement the training algorithm,
+specifically in the form of sparse GPU kernels which enable memory- and
+computationally-efficient training, and show that it is also compatible with
+low-precision base weights, resulting in the first joint representation
+combining quantization, low-rank and sparse approximations. Our code is
+available at https://github.com/IST-DASLab/RoSA.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ On <span class="highlight-title">Prompt</span>-Driven Safeguarding for Large Language Models <span class="chip">ICML 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2401.18018v4">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2401.18018v4.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Chujie Zheng, Fan Yin, Hao Zhou, Fandong Meng, Jie Zhou, Kai-Wei Chang, Minlie Huang, Nanyun Peng
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Prepending model inputs with safety prompts is a common practice for
+safeguarding large language models (LLMs) against queries with harmful intents.
+However, the underlying working mechanisms of safety prompts have not been
+unraveled yet, restricting the possibility of automatically optimizing them to
+improve LLM safety. In this work, we investigate how LLMs' behavior (i.e.,
+complying with or refusing user queries) is affected by safety prompts from the
+perspective of model representation. We find that in the representation space,
+the input queries are typically moved by safety prompts in a "higher-refusal"
+direction, in which models become more prone to refusing to provide assistance,
+even when the queries are harmless. On the other hand, LLMs are naturally
+capable of distinguishing harmful and harmless queries without safety prompts.
+Inspired by these findings, we propose a method for safety prompt optimization,
+namely DRO (Directed Representation Optimization). Treating a safety prompt as
+continuous, trainable embeddings, DRO learns to move the queries'
+representations along or opposite the refusal direction, depending on their
+harmfulness. Experiments with eight LLMs on out-of-domain and jailbreak
+benchmarks demonstrate that DRO remarkably improves the safeguarding
+performance of human-crafted safety prompts, without compromising the models'
+general performance.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>ICML 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Rewriting the Code: A Simple Method for Large Language Model Augmented
+  Code Search <span class="chip">ACL 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2401.04514v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2401.04514v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Haochen Li, Xin Zhou, Zhiqi Shen
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  In code search, the Generation-Augmented Retrieval (GAR) framework, which
+generates exemplar code snippets to augment queries, has emerged as a promising
+strategy to address the principal challenge of modality misalignment between
+code snippets and natural language queries, particularly with the demonstrated
+code generation capabilities of Large Language Models (LLMs). Nevertheless, our
+preliminary investigations indicate that the improvements conferred by such an
+LLM-augmented framework are somewhat constrained. This limitation could
+potentially be ascribed to the fact that the generated codes, albeit
+functionally accurate, frequently display a pronounced stylistic deviation from
+the ground truth code in the codebase. In this paper, we extend the
+foundational GAR framework and propose a simple yet effective method that
+additionally Rewrites the Code (ReCo) within the codebase for style
+normalization. Experimental results demonstrate that ReCo significantly boosts
+retrieval accuracy across sparse (up to 35.7%), zero-shot dense (up to 27.6%),
+and fine-tuned dense (up to 23.6%) retrieval settings in diverse search
+scenarios. To further elucidate the advantages of ReCo and stimulate research
+in code style normalization, we introduce Code Style Similarity, the first
+metric tailored to quantify stylistic similarities in code. Notably, our
+empirical findings reveal the inadequacy of existing metrics in capturing
+stylistic nuances. The source code and data are available at
+\url{https://github.com/Alex-HaochenLi/ReCo}.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted to ACL 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Balanced Data Sampling for Language Model Training with Clustering <span class="chip">ACL 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2402.14526v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2402.14526v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Yunfan Shao, Linyang Li, Zhaoye Fei, Hang Yan, Dahua Lin, Xipeng Qiu
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Data plays a fundamental role in the training of Large Language Models
+(LLMs). While attention has been paid to the collection and composition of
+datasets, determining the data sampling strategy in training remains an open
+question. Most LLMs are trained with a simple strategy, random sampling.
+However, this sampling strategy ignores the unbalanced nature of training data
+distribution, which can be sub-optimal. In this paper, we propose ClusterClip
+Sampling to balance the text distribution of training data for better model
+training. Specifically, ClusterClip Sampling utilizes data clustering to
+reflect the data distribution of the training set and balances the common
+samples and rare samples during training based on the cluster results. A
+repetition clip operation is introduced to mitigate the overfitting issue led
+by samples from certain clusters. Extensive experiments validate the
+effectiveness of ClusterClip Sampling, which outperforms random sampling and
+other cluster-based sampling variants under various training datasets and large
+language models.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>ACL 2024 (findings), Code is released at
+  https://github.com/choosewhatulike/cluster-clip</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Feature-Adaptive and Data-Scalable In-Context Learning <span class="chip">ACL 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.10738v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.10738v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Jiahao Li, Quan Wang, Licheng Zhang, Guoqing Jin, Zhendong Mao
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  In-context learning (ICL), which promotes inference with several
+demonstrations, has become a widespread paradigm to stimulate LLM capabilities
+for downstream tasks. Due to context length constraints, it cannot be further
+improved in spite of more training data, and general features directly from
+LLMs in ICL are not adaptive to the specific downstream task. In this paper, we
+propose a feature-adaptive and data-scalable in-context learning framework
+(FADS-ICL), which can leverage task-adaptive features to promote inference on
+the downstream task, with the supervision of beyond-context samples.
+Specifically, it first extracts general features of beyond-context samples via
+the LLM with ICL input form one by one, and introduces a task-specific
+modulator to perform feature refinement and prediction after fitting a specific
+downstream task. We conduct extensive experiments on FADS-ICL under varying
+data settings (4$\sim$128 shots) and LLM scale (0.8$\sim$70B) settings.
+Experimental results show that FADS-ICL consistently outperforms previous
+state-of-the-art methods by a significant margin under all settings, verifying
+the effectiveness and superiority of FADS-ICL. For example, under the 1.5B and
+32 shots setting, FADS-ICL can achieve \textbf{+14.3} average accuracy from
+feature adaptation over vanilla ICL on 10 datasets, with \textbf{+6.2} average
+accuracy over the previous state-of-the-art method, and the performance can
+further improve with increasing training data. Code and data are publicly
+available at \url{https://github.com/jiahaozhenbang/FADS-ICL}.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted at ACL 2024 main conference</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Unraveling and Mitigating Retriever Inconsistencies in
+  Retrieval-Augmented Large Language Models <span class="chip">ACL 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20680v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20680v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Mingda Li, Xinyu Li, Yifan Chen, Wenfeng Xuan, Weinan Zhang
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Although Retrieval-Augmented Large Language Models (RALMs) demonstrate their
+superiority in terms of factuality, they do not consistently outperform the
+original retrieval-free Language Models (LMs). Our experiments reveal that this
+example-level performance inconsistency exists not only between
+retrieval-augmented and retrieval-free LM but also among different retrievers.
+To understand this phenomenon, we investigate the degeneration behavior of
+RALMs and theoretically decompose it into four categories. Further analysis
+based on our decomposition reveals that the innate difference in knowledge
+sources and the unpredictable degeneration of the reader model contribute most
+to the inconsistency. Drawing from our analysis, we introduce Ensemble of
+Retrievers (EoR), a trainable framework that can adaptively retrieve from
+different knowledge sources and effectively decrease unpredictable reader
+errors. Our experiments on Open Domain Question Answering show that EoR
+substantially improves performance over the RALM with a single retriever by
+considerably reducing inconsistent behaviors.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>ACL 2024 (findings)</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ How Vocabulary Sharing Facilitates Multilingualism in LLaMA? <span class="chip">ACL-2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2311.09071v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2311.09071v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Fei Yuan, Shuai Yuan, Zhiyong Wu, Lei Li
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Large Language Models (LLMs), often show strong performance on English tasks,
+while exhibiting limitations on other languages. What is an LLM's multilingual
+capability when it is trained only on certain languages? The underlying
+mechanism remains unclear. This study endeavors to examine the multilingual
+capability of LLMs from the vocabulary sharing perspective by conducting an
+exhaustive analysis across 101 languages. Through the investigation of the
+performance gap before and after embedding fine-tuning, we discovered four
+distinct quadrants. By delving into each quadrant we provide actionable and
+efficient guidelines for tuning these languages. Extensive experiments reveal
+that existing LLMs possess multilingual capabilities that surpass our
+expectations, and we can significantly improve the multilingual performance of
+LLMs based on these attributes of each
+quadrant~\footnote{\url{https://github.com/CONE-MT/Vocabulary-Sharing-Facilitates-Multilingualism}.}.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>ACL-2024 Findings</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ KIEval: A Knowledge-grounded Interactive Evaluation Framework for Large
+  Language Models <span class="chip">ACL 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2402.15043v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2402.15043v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Zhuohao Yu, Chang Gao, Wenjin Yao, Yidong Wang, Wei Ye, Jindong Wang, Xing Xie, Yue Zhang, Shikun Zhang
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Automatic evaluation methods for large language models (LLMs) are hindered by
+data contamination, leading to inflated assessments of their effectiveness.
+Existing strategies, which aim to detect contaminated texts, focus on
+quantifying contamination status instead of accurately gauging model
+performance. In this paper, we introduce KIEval, a Knowledge-grounded
+Interactive Evaluation framework, which incorporates an LLM-powered
+"interactor" role for the first time to accomplish a dynamic
+contamination-resilient evaluation. Starting with a question in a conventional
+LLM benchmark involving domain-specific knowledge, KIEval utilizes dynamically
+generated, multi-round, and knowledge-focused dialogues to determine whether a
+model's response is merely a recall of benchmark answers or demonstrates a deep
+comprehension to apply knowledge in more complex conversations. Extensive
+experiments on seven leading LLMs across five datasets validate KIEval's
+effectiveness and generalization. We also reveal that data contamination brings
+no contribution or even negative effect to models' real-world applicability and
+understanding, and existing contamination detection methods for LLMs can only
+identify contamination in pre-training but not during supervised fine-tuning.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted to ACL 2024 (main conference); 19 pages, 5 figures, 19
+  tables, code is available at: https://github.com/zhuohaoyu/KIEval</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ CoMat: Aligning Text-to-Image Diffusion Model with Image-to-Text Concept
+  Matching 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2404.03653v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2404.03653v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Dongzhi Jiang, Guanglu Song, Xiaoshi Wu, Renrui Zhang, Dazhong Shen, Zhuofan Zong, Yu Liu, Hongsheng Li
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Diffusion models have demonstrated great success in the field of
+text-to-image generation. However, alleviating the misalignment between the
+text prompts and images is still challenging. The root reason behind the
+misalignment has not been extensively investigated. We observe that the
+misalignment is caused by inadequate token attention activation. We further
+attribute this phenomenon to the diffusion model's insufficient condition
+utilization, which is caused by its training paradigm. To address the issue, we
+propose CoMat, an end-to-end diffusion model fine-tuning strategy with an
+image-to-text concept matching mechanism. We leverage an image captioning model
+to measure image-to-text alignment and guide the diffusion model to revisit
+ignored tokens. A novel attribute concentration module is also proposed to
+address the attribute binding problem. Without any image or human preference
+data, we use only 20K text prompts to fine-tune SDXL to obtain CoMat-SDXL.
+Extensive experiments show that CoMat-SDXL significantly outperforms the
+baseline model SDXL in two text-to-image alignment benchmarks and achieves
+start-of-the-art performance.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Project Page: https://caraj7.github.io/comat</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Model Editing by Standard Fine-Tuning <span class="chip">ACL 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2402.11078v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2402.11078v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Govind Gangadhar, Karl Stratos
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Standard fine-tuning is considered not as effective as specialized methods
+for model editing due to its comparatively poor performance. However, it is
+simple, agnostic to the architectural details of the model being edited, and
+able to leverage advances in standard training techniques with no additional
+work (e.g., black-box PEFT for computational efficiency), making it an
+appealing choice for a model editor. In this work, we show that standard
+fine-tuning alone can yield competitive model editing performance with two
+minor modifications. First, we optimize the conditional likelihood rather than
+the full likelihood. Second, in addition to the typical practice of training on
+randomly paraphrased edit prompts to encourage generalization, we also train on
+random or similar unedited facts to encourage locality. Our experiments on the
+ZsRE and CounterFact datasets demonstrate that these simple modifications allow
+standard fine-tuning to match or outperform highly specialized editors in terms
+of edit score.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Findings of ACL 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Can Large Multimodal Models Uncover Deep Semantics Behind Images? 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2402.11281v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2402.11281v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Yixin Yang, Zheng Li, Qingxiu Dong, Heming Xia, Zhifang Sui
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Understanding the deep semantics of images is essential in the era dominated
+by social media. However, current research works primarily on the superficial
+description of images, revealing a notable deficiency in the systematic
+investigation of the inherent deep semantics. In this work, we introduce
+DEEPEVAL, a comprehensive benchmark to assess Large Multimodal Models' (LMMs)
+capacities of visual deep semantics. DEEPEVAL includes human-annotated dataset
+and three progressive subtasks: fine-grained description selection, in-depth
+title matching, and deep semantics understanding. Utilizing DEEPEVAL, we
+evaluate 9 open-source LMMs and GPT-4V(ision). Our evaluation demonstrates a
+substantial gap between the deep semantic comprehension capabilities of
+existing LMMs and humans. For example, GPT-4V is 30% behind humans in
+understanding deep semantics, even though it achieves human-comparable
+performance in image description. Further analysis reveals that LMM performance
+on DEEPEVAL varies according to the specific facets of deep semantics explored,
+indicating the fundamental challenges remaining in developing LMMs.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Do <span class="highlight-title">pretrain</span>ed <span class="highlight-title">Transformer</span>s Learn In-Context by Gradient Descent? 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2310.08540v5">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2310.08540v5.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Lingfeng Shen, Aayush Mishra, Daniel Khashabi
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  The emergence of In-Context Learning (ICL) in LLMs remains a remarkable
+phenomenon that is partially understood. To explain ICL, recent studies have
+created theoretical connections to Gradient Descent (GD). We ask, do such
+connections hold up in actual pre-trained language models? We highlight the
+limiting assumptions in prior works that make their setup considerably
+different from the practical setup in which language models are trained. For
+example, their experimental verification uses \emph{ICL objective} (training
+models explicitly for ICL), which differs from the emergent ICL in the wild.
+Furthermore, the theoretical hand-constructed weights used in these studies
+have properties that don't match those of real LLMs. We also look for evidence
+in real models. We observe that ICL and GD have different sensitivity to the
+order in which they observe demonstrations. Finally, we probe and compare the
+ICL vs. GD hypothesis in a natural setting. We conduct comprehensive empirical
+analyses on language models pre-trained on natural data (LLaMa-7B). Our
+comparisons of three performance metrics highlight the inconsistent behavior of
+ICL and GD as a function of various factors such as datasets, models, and the
+number of demonstrations. We observe that ICL and GD modify the output
+distribution of language models differently. These results indicate that
+\emph{the equivalence between ICL and GD remains an open hypothesis} and calls
+for further studies.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Spectral <span class="highlight-title">Prompt</span> Tuning:Unveiling Unseen Classes for Zero-Shot Semantic
+  Segmentation <span class="chip">AAAI2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2312.12754v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2312.12754v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Wenhao Xu, Rongtao Xu, Changwei Wang, Shibiao Xu, Li Guo, Man Zhang, Xiaopeng Zhang
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Recently, CLIP has found practical utility in the domain of pixel-level
+zero-shot segmentation tasks. The present landscape features two-stage
+methodologies beset by issues such as intricate pipelines and elevated
+computational costs. While current one-stage approaches alleviate these
+concerns and incorporate Visual Prompt Training (VPT) to uphold CLIP's
+generalization capacity, they still fall short in fully harnessing CLIP's
+potential for pixel-level unseen class demarcation and precise pixel
+predictions. To further stimulate CLIP's zero-shot dense prediction capability,
+we propose SPT-SEG, a one-stage approach that improves CLIP's adaptability from
+image to pixel. Specifically, we initially introduce Spectral Prompt Tuning
+(SPT), incorporating spectral prompts into the CLIP visual encoder's shallow
+layers to capture structural intricacies of images, thereby enhancing
+comprehension of unseen classes. Subsequently, we introduce the Spectral Guided
+Decoder (SGD), utilizing both high and low-frequency information to steer the
+network's spatial focus towards more prominent classification features,
+enabling precise pixel-level prediction outcomes. Through extensive experiments
+on two public datasets, we demonstrate the superiority of our method over
+state-of-the-art approaches, performing well across all classes and
+particularly excelling in handling unseen classes. Code is available
+at:https://github.com/clearxu/SPT.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>AAAI2024 Accepted</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ CLEAN-EVAL: Clean Evaluation on Contaminated Large Language Models <span class="chip">NAACL2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2311.09154v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2311.09154v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Wenhong Zhu, Hongkun Hao, Zhiwei He, Yunze Song, Yumeng Zhang, Hanxu Hu, Yiran Wei, Rui Wang, Hongyuan Lu
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  We are currently in an era of fierce competition among various large language
+models (LLMs) continuously pushing the boundaries of benchmark performance.
+However, genuinely assessing the capabilities of these LLMs has become a
+challenging and critical issue due to potential data contamination, and it
+wastes dozens of time and effort for researchers and engineers to download and
+try those contaminated models. To save our precious time, we propose a novel
+and useful method, Clean-Eval, which mitigates the issue of data contamination
+and evaluates the LLMs in a cleaner manner. Clean-Eval employs an LLM to
+paraphrase and back-translate the contaminated data into a candidate set,
+generating expressions with the same meaning but in different surface forms. A
+semantic detector is then used to filter the generated low-quality samples to
+narrow down this candidate set. The best candidate is finally selected from
+this set based on the BLEURT score. According to human assessment, this best
+candidate is semantically similar to the original contamination data but
+expressed differently. All candidates can form a new benchmark to evaluate the
+model. Our experiments illustrate that Clean-Eval substantially restores the
+actual evaluation results on contaminated LLMs under both few-shot learning and
+fine-tuning scenarios.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>NAACL2024(findings)</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Improving Open-Ended Text Generation via Adaptive Decoding <span class="chip">ICML2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2402.18223v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2402.18223v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Wenhong Zhu, Hongkun Hao, Zhiwei He, Yiming Ai, Rui Wang
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Current language models decode text token by token according to probabilistic
+distribution, and determining the appropriate candidates for the next token is
+crucial to ensure generation quality. This study introduces adaptive decoding,
+a mechanism that dynamically empowers language models to ascertain a sensible
+candidate set during generation. Specifically, we introduce an entropy-based
+metric called confidence and conceptualize determining the optimal candidate
+set as a confidence-increasing process. The rationality of including a token in
+the candidate set is assessed by leveraging the increment of confidence.
+Experimental results reveal that our method balances diversity and coherence
+well. The human evaluation shows that our method can generate human-preferred
+text. Additionally, our method can potentially improve the reasoning ability of
+language models.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>ICML2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Reasons to Reject? Aligning Language Models with Judgments <span class="chip">ACL 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2312.14591v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2312.14591v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Weiwen Xu, Deng Cai, Zhisong Zhang, Wai Lam, Shuming Shi
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  As humans, we consistently interact with our peers and receive feedback in
+the form of natural language. This language feedback allows us to maintain
+appropriate behavior, and rectify potential errors. The question arises
+naturally: can we use language feedback to align large language models (LLMs)?
+In contrast to previous research that aligns LLMs with scalar rewards, we
+present the first systematic exploration of alignment through the lens of
+language feedback (i.e., judgment). We start with an in-depth investigation of
+potential methods that can be adapted for aligning LLMs with judgments,
+revealing that these methods cannot fully capitalize on judgments. To
+facilitate more effective utilization of judgments, we propose a novel
+framework, Contrastive Unlikelihood Training (CUT), that allows for
+fine-grained inappropriate content detection and correction based on judgments.
+Our results show that, with merely 1317 off-the-shelf judgment data, CUT
+(LLaMA2-13b) can beat the 175B DaVinci003 and surpass the best baseline by
+50.84 points on AlpacaEval. CUT (LLaMA2-chat-13b) can also align LLMs in an
+iterative fashion using up-to-date model-specific judgments, improving
+performance from 81.09 to 91.68 points on AlpacaEval. Further analysis suggests
+that judgments hold greater potential than rewards in LLM alignment.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted at ACL 2024 Findings. Our source codes and models are
+  publicly available at https://github.com/wwxu21/CUT</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ DFA-RAG: Conversational Semantic Router for Large Language Model with
+  Definite Finite Automaton <span class="chip">ICML 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2402.04411v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2402.04411v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Yiyou Sun, Junjie Hu, Wei Cheng, Haifeng Chen
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  This paper introduces the retrieval-augmented large language model with
+Definite Finite Automaton (DFA-RAG), a novel framework designed to enhance the
+capabilities of conversational agents using large language models (LLMs).
+Traditional LLMs face challenges in generating regulated and compliant
+responses in special scenarios with predetermined response guidelines, like
+emotional support and customer service. Our framework addresses these
+challenges by embedding a Definite Finite Automaton (DFA), learned from
+training dialogues, within the LLM. This structured approach acts as a semantic
+router which enables the LLM to adhere to a deterministic response pathway. The
+routing is achieved by the retrieval-augmentation generation (RAG) strategy,
+which carefully selects dialogue examples aligned with the current
+conversational context. The advantages of DFA-RAG include an interpretable
+structure through human-readable DFA, context-aware retrieval for responses in
+conversations, and plug-and-play compatibility with existing LLMs. Extensive
+benchmarks validate DFA-RAG's effectiveness, indicating its potential as a
+valuable contribution to the conversational agent.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted to ICML 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Contrastive Preference Optimization: Pushing the Boundaries of LLM
+  Performance in Machine Translation <span class="chip">ICML 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2401.08417v4">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2401.08417v4.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Haoran Xu, Amr Sharaf, Yunmo Chen, Weiting Tan, Lingfeng Shen, Benjamin Van Durme, Kenton Murray, Young Jin Kim
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Moderate-sized large language models (LLMs) -- those with 7B or 13B
+parameters -- exhibit promising machine translation (MT) performance. However,
+even the top-performing 13B LLM-based translation models, like ALMA, does not
+match the performance of state-of-the-art conventional encoder-decoder
+translation models or larger-scale LLMs such as GPT-4. In this study, we bridge
+this performance gap. We first assess the shortcomings of supervised
+fine-tuning for LLMs in the MT task, emphasizing the quality issues present in
+the reference data, despite being human-generated. Then, in contrast to SFT
+which mimics reference translations, we introduce Contrastive Preference
+Optimization (CPO), a novel approach that trains models to avoid generating
+adequate but not perfect translations. Applying CPO to ALMA models with only
+22K parallel sentences and 12M parameters yields significant improvements. The
+resulting model, called ALMA-R, can match or exceed the performance of the WMT
+competition winners and GPT-4 on WMT'21, WMT'22 and WMT'23 test datasets.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted at ICML 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ SOUL: Unlocking the Power of Second-Order Optimization for LLM
+  Unlearning 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2404.18239v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2404.18239v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Jinghan Jia, Yihua Zhang, Yimeng Zhang, Jiancheng Liu, Bharat Runwal, James Diffenderfer, Bhavya Kailkhura, Sijia Liu
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Large Language Models (LLMs) have highlighted the necessity of effective
+unlearning mechanisms to comply with data regulations and ethical AI practices.
+LLM unlearning aims at removing undesired data influences and associated model
+capabilities without compromising utility out of the scope of unlearning. While
+interest in studying LLM unlearning is growing,the impact of the optimizer
+choice for LLM unlearning remains under-explored. In this work, we shed light
+on the significance of optimizer selection in LLM unlearning for the first
+time, establishing a clear connection between {second-order optimization} and
+influence unlearning (a classical approach using influence functions to update
+the model for data influence removal). This insight propels us to develop a
+second-order unlearning framework, termed SOUL, built upon the second-order
+clipped stochastic optimization (Sophia)-based LLM training method. SOUL
+extends the static, one-shot model update using influence unlearning to a
+dynamic, iterative unlearning process. Our extensive experiments show that SOUL
+consistently outperforms conventional first-order methods across various
+unlearning tasks, models, and metrics, suggesting the promise of second-order
+optimization in providing a scalable and easily implementable solution for LLM
+unlearning.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ II-MMR: Identifying and Improving Multi-modal Multi-hop Reasoning in
+  Visual Question Answering <span class="chip">ACL 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2402.11058v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2402.11058v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Jihyung Kil, Farideh Tavazoee, Dongyeop Kang, Joo-Kyung Kim
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Visual Question Answering (VQA) often involves diverse reasoning scenarios
+across Vision and Language (V&L). Most prior VQA studies, however, have merely
+focused on assessing the model's overall accuracy without evaluating it on
+different reasoning cases. Furthermore, some recent works observe that
+conventional Chain-of-Thought (CoT) prompting fails to generate effective
+reasoning for VQA, especially for complex scenarios requiring multi-hop
+reasoning. In this paper, we propose II-MMR, a novel idea to identify and
+improve multi-modal multi-hop reasoning in VQA. In specific, II-MMR takes a VQA
+question with an image and finds a reasoning path to reach its answer using two
+novel language promptings: (i) answer prediction-guided CoT prompt, or (ii)
+knowledge triplet-guided prompt. II-MMR then analyzes this path to identify
+different reasoning cases in current VQA benchmarks by estimating how many hops
+and what types (i.e., visual or beyond-visual) of reasoning are required to
+answer the question. On popular benchmarks including GQA and A-OKVQA, II-MMR
+observes that most of their VQA questions are easy to answer, simply demanding
+"single-hop" reasoning, whereas only a few questions require "multi-hop"
+reasoning. Moreover, while the recent V&L model struggles with such complex
+multi-hop reasoning questions even using the traditional CoT method, II-MMR
+shows its effectiveness across all reasoning cases in both zero-shot and
+fine-tuning settings.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted to ACL 2024 Findings</span>
+                                        </div>
+                                </details>
+                            </article>
+                    </div>
+                </details>
+            </article>
+            <article>
+                <details>
+                    <Summary>
+                        Computer Vision and Pattern Recognition <span class="chip" style="font-size: 60%">74</span>
+                    </Summary>
+                    <div class="details-content">
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ L-MAGIC: Language Model Assisted Generation of Images with Coherence <span class="chip">CVPR 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.01843v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.01843v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Zhipeng Cai, Matthias Mueller, Reiner Birkl, Diana Wofk, Shao-Yen Tseng, JunDa Cheng, Gabriela Ben-Melech Stan, Vasudev Lal, Michael Paulitsch
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  In the current era of generative AI breakthroughs, generating panoramic
+scenes from a single input image remains a key challenge. Most existing methods
+use diffusion-based iterative or simultaneous multi-view inpainting. However,
+the lack of global scene layout priors leads to subpar outputs with duplicated
+objects (e.g., multiple beds in a bedroom) or requires time-consuming human
+text inputs for each view. We propose L-MAGIC, a novel method leveraging large
+language models for guidance while diffusing multiple coherent views of 360
+degree panoramic scenes. L-MAGIC harnesses pre-trained diffusion and language
+models without fine-tuning, ensuring zero-shot performance. The output quality
+is further enhanced by super-resolution and multi-view fusion techniques.
+Extensive experiments demonstrate that the resulting panoramic scenes feature
+better scene layouts and perspective view rendering quality compared to related
+works, with >70% preference in human evaluations. Combined with conditional
+diffusion models, L-MAGIC can accept various input modalities, including but
+not limited to text, depth maps, sketches, and colored scripts. Applying depth
+estimation further enables 3D point cloud generation and dynamic scene
+exploration with fluid camera motion. Code is available at
+https://github.com/IntelLabs/MMPano. The video presentation is available at
+https://youtu.be/XDMNEzH4-Ec?list=PLG9Zyvu7iBa0-a7ccNLO8LjcVRAoMn57s.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>accepted to CVPR 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Boosting Vision-Language Models with Transduction 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.01837v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.01837v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Maxime Zanella, Benoît Gérin, Ismail Ben Ayed
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Transduction is a powerful paradigm that leverages the structure of unlabeled
+data to boost predictive accuracy. We present TransCLIP, a novel and
+computationally efficient transductive approach designed for Vision-Language
+Models (VLMs). TransCLIP is applicable as a plug-and-play module on top of
+popular inductive zero- and few-shot models, consistently improving their
+performances. Our new objective function can be viewed as a regularized
+maximum-likelihood estimation, constrained by a KL divergence penalty that
+integrates the text-encoder knowledge and guides the transductive learning
+process. We further derive an iterative Block Majorize-Minimize (BMM) procedure
+for optimizing our objective, with guaranteed convergence and decoupled
+sample-assignment updates, yielding computationally efficient transduction for
+large-scale datasets. We report comprehensive evaluations, comparisons, and
+ablation studies that demonstrate: (i) Transduction can greatly enhance the
+generalization capabilities of inductive pretrained zero- and few-shot VLMs;
+(ii) TransCLIP substantially outperforms standard transductive few-shot
+learning methods relying solely on vision features, notably due to the KL-based
+language constraint.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ FacAID: A <span class="highlight-title">Transformer</span> Model for Neuro-Symbolic Facade Reconstruction 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.01829v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.01829v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Aleksander Płocharski, Jan Swidzinski, Joanna Porter-Sobieraj, Przemyslaw Musialski
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  We introduce a neuro-symbolic transformer-based model that converts flat,
+segmented facade structures into procedural definitions using a custom-designed
+split grammar. To facilitate this, we first develop a semi-complex split
+grammar tailored for architectural facades and then generate a dataset
+comprising of facades alongside their corresponding procedural representations.
+This dataset is used to train our transformer model to convert segmented, flat
+facades into the procedural language of our grammar. During inference, the
+model applies this learned transformation to new facade segmentations,
+providing a procedural representation that users can adjust to generate varied
+facade designs. This method not only automates the conversion of static facade
+images into dynamic, editable procedural formats but also enhances the design
+flexibility, allowing for easy modifications and variations by architects and
+designers. Our approach sets a new standard in facade design by combining the
+precision of procedural generation with the adaptability of neuro-symbolic
+learning.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>11 pages, 10 figures, preprint</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ GIFT: Generative Interpretable Fine-Tuning 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2312.00700v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2312.00700v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Chinmay Savadikar, Xi Song, Tianfu Wu
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  We present Generative Interpretable Fine-Tuning (GIFT) for
+parameter-efficient fine-tuning of pretrained Transformer backbones, which can
+be formulated as a simple factorized matrix multiplication in the parameter
+space or equivalently in the activation space, and thus embraces built-in
+interpretability. For a pretrained layer with weights $\omega\in
+\mathbb{R}^{d_{out}\times d_{in}}$, our proposed GIFT learns the fine-tuned
+weights $\hat{\omega}$ directly from $\omega$ as $\hat{\omega}=\omega \cdot
+(\mathbb{I}+\phi_{d_{in}\times r}\cdot \psi_{r\times d_{in}})$ where
+$\mathbb{I}$ is an identity matrix. $\Theta=(\phi, \psi)$ are the learnable
+parameters of the two linear layers of GIFT with $r$ being a hyper-parameter.
+$\Theta$ is shared by all the layers selected for fine-tuning, resulting in
+significantly fewer trainable parameters compared to Low-Rank Adaptation
+(LoRA). We perform comprehensive evaluations on natural language tasks
+(commonsense reasoning and sequence classification) and computer vision tasks
+(visual fine-grained classification). We obtain the best accuracy and parameter
+efficiency among baselines both on the Commonsense170k reasoning benchmark
+using LLaMA-1 (7B) and Llama-2 (7B)/-3 (8B) and on the FGVC and VTAB visual
+recognition benchmarks using ImageNet-21k pretrained Vision Transformer
+(ViT-B/16). Notably, we obtain 5.9% absolute increase in average accuracy with
+53.8 times reduction of parameters on Commonsense170k using Llama-3 (8B)
+compared to LoRA. We obtain performance comparable to LoRA on the GLUE
+benchmark but with significantly fewer parameters using RoBERTa-Base/Large. We
+show the output of the first linear layer (i.e., $\omega\cdot \phi$) is
+surprisingly interpretable, which can play the role of a token-clustering head
+as a by-product to localize meaningful objects/parts in images for computer
+vision tasks. Our code is publicly available.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Project page and code: https://savadikarc.github.io/gift</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Mitigating Motion Blur in Neural Radiance Fields with Events and Frames <span class="chip">CVPR</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2403.19780v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2403.19780v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Marco Cannici, Davide Scaramuzza
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Neural Radiance Fields (NeRFs) have shown great potential in novel view
+synthesis. However, they struggle to render sharp images when the data used for
+training is affected by motion blur. On the other hand, event cameras excel in
+dynamic scenes as they measure brightness changes with microsecond resolution
+and are thus only marginally affected by blur. Recent methods attempt to
+enhance NeRF reconstructions under camera motion by fusing frames and events.
+However, they face challenges in recovering accurate color content or constrain
+the NeRF to a set of predefined camera poses, harming reconstruction quality in
+challenging conditions. This paper proposes a novel formulation addressing
+these issues by leveraging both model- and learning-based modules. We
+explicitly model the blur formation process, exploiting the event double
+integral as an additional model-based prior. Additionally, we model the
+event-pixel response using an end-to-end learnable response function, allowing
+our method to adapt to non-idealities in the real event-camera sensor. We show,
+on synthetic and real data, that the proposed approach outperforms existing
+deblur NeRFs that use only frames as well as those that combine frames and
+events by +6.13dB and +2.48dB, respectively.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>IEEE Conference on Computer Vision and Pattern Recognition (CVPR),
+  2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ MIM-Refiner: A Contrastive Learning Boost from Intermediate <span class="highlight-title">Pre-Train</span>ed
+  Representations 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2402.10093v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2402.10093v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Benedikt Alkin, Lukas Miklautz, Sepp Hochreiter, Johannes Brandstetter
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  We introduce MIM (Masked Image Modeling)-Refiner, a contrastive learning
+boost for pre-trained MIM models. MIM-Refiner is motivated by the insight that
+strong representations within MIM models generally reside in intermediate
+layers. Accordingly, MIM-Refiner leverages multiple contrastive heads that are
+connected to different intermediate layers. In each head, a modified nearest
+neighbor objective constructs semantic clusters that capture semantic
+information which improves performance on downstream tasks, including
+off-the-shelf and fine-tuning settings.
+  The refinement process is short and simple - yet highly effective. Within a
+few epochs, we refine the features of MIM models from subpar to
+state-of-the-art, off-the-shelf features. Refining a ViT-H, pre-trained with
+data2vec 2.0 on ImageNet-1K, sets a new state-of-the-art in linear probing
+(84.7%) and low-shot classification among models that are pre-trained on
+ImageNet-1K. At ImageNet-1K 1-shot classification, MIM-Refiner advances the
+state-of-the-art to 64.2%, outperforming larger models that were trained on up
+to 2000 times more data such as DINOv2-g, OpenCLIP-G and MAWS-6.5B.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ ReShader: View-Dependent Highlights for Single Image View-Synthesis <span class="chip">SIGGRAPH</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2309.10689v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2309.10689v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Avinash Paliwal, Brandon Nguyen, Andrii Tsarov, Nima Khademi Kalantari
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  In recent years, novel view synthesis from a single image has seen
+significant progress thanks to the rapid advancements in 3D scene
+representation and image inpainting techniques. While the current approaches
+are able to synthesize geometrically consistent novel views, they often do not
+handle the view-dependent effects properly. Specifically, the highlights in
+their synthesized images usually appear to be glued to the surfaces, making the
+novel views unrealistic. To address this major problem, we make a key
+observation that the process of synthesizing novel views requires changing the
+shading of the pixels based on the novel camera, and moving them to appropriate
+locations. Therefore, we propose to split the view synthesis process into two
+independent tasks of pixel reshading and relocation. During the reshading
+process, we take the single image as the input and adjust its shading based on
+the novel camera. This reshaded image is then used as the input to an existing
+view synthesis method to relocate the pixels and produce the final novel view
+image. We propose to use a neural network to perform reshading and generate a
+large set of synthetic input-reshaded pairs to train our network. We
+demonstrate that our approach produces plausible novel view images with
+realistic moving highlights on a variety of real world scenes.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>SIGGRAPH Asia 2023. Project page at
+  https://people.engr.tamu.edu/nimak/Papers/SIGAsia2023_Reshader/index.html and
+  video at https://www.youtube.com/watch?v=XW-tl48D3Ok</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ A Stochastic-Geometrical Framework for Object Pose Estimation based on
+  Mixture Models Avoiding the Correspondence Problem 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2311.18107v5">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2311.18107v5.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Wolfgang Hoegele
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Background: Pose estimation of rigid objects is a practical challenge in
+optical metrology and computer vision. This paper presents a novel
+stochastic-geometrical modeling framework for object pose estimation based on
+observing multiple feature points.
+  Methods: This framework utilizes mixture models for feature point densities
+in object space and for interpreting real measurements. Advantages are the
+avoidance to resolve individual feature correspondences and to incorporate
+correct stochastic dependencies in multi-view applications. First, the general
+modeling framework is presented, second, a general algorithm for pose
+estimation is derived, and third, two example models (camera and lateration
+setup) are presented.
+  Results: Numerical experiments show the effectiveness of this modeling and
+general algorithm by presenting four simulation scenarios for three observation
+systems, including the dependence on measurement resolution, object
+deformations and measurement noise. Probabilistic modeling utilizing mixture
+models shows the potential for accurate and robust pose estimations while
+avoiding the correspondence problem.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Addressing Diverging Training Costs using Local Restoration for Precise
+  Bird's Eye View Map Construction 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.01016v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.01016v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Minsu Kim, Giseop Kim, Sunwook Choi
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Recent advancements in Bird's Eye View (BEV) fusion for map construction have
+demonstrated remarkable mapping of urban environments. However, their deep and
+bulky architecture incurs substantial amounts of backpropagation memory and
+computing latency. Consequently, the problem poses an unavoidable bottleneck in
+constructing high-resolution (HR) BEV maps, as their large-sized features cause
+significant increases in costs including GPU memory consumption and computing
+latency, named diverging training costs issue. Affected by the problem, most
+existing methods adopt low-resolution (LR) BEV and struggle to estimate the
+precise locations of urban scene components like road lanes, and sidewalks. As
+the imprecision leads to risky self-driving, the diverging training costs issue
+has to be resolved. In this paper, we address the issue with our novel Trumpet
+Neural Network (TNN) mechanism. The framework utilizes LR BEV space and outputs
+an up-sampled semantic BEV map to create a memory-efficient pipeline. To this
+end, we introduce Local Restoration of BEV representation. Specifically, the
+up-sampled BEV representation has severely aliased, blocky signals, and thick
+semantic labels. Our proposed Local Restoration restores the signals and thins
+(or narrows down) the width of the labels. Our extensive experiments show that
+the TNN mechanism provides a plug-and-play memory-efficient pipeline, thereby
+enabling the effective estimation of real-sized (or precise) semantic labels
+for BEV map construction.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ U-DiTs: Downsample Tokens in U-Shaped Diffusion <span class="highlight-title">Transformer</span>s 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.02730v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.02730v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Yuchuan Tian, Zhijun Tu, Hanting Chen, Jie Hu, Chao Xu, Yunhe Wang
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Diffusion Transformers (DiTs) introduce the transformer architecture to
+diffusion tasks for latent-space image generation. With an isotropic
+architecture that chains a series of transformer blocks, DiTs demonstrate
+competitive performance and good scalability; but meanwhile, the abandonment of
+U-Net by DiTs and their following improvements is worth rethinking. To this
+end, we conduct a simple toy experiment by comparing a U-Net architectured DiT
+with an isotropic one. It turns out that the U-Net architecture only gain a
+slight advantage amid the U-Net inductive bias, indicating potential
+redundancies within the U-Net-style DiT. Inspired by the discovery that U-Net
+backbone features are low-frequency-dominated, we perform token downsampling on
+the query-key-value tuple for self-attention that bring further improvements
+despite a considerable amount of reduction in computation. Based on
+self-attention with downsampled tokens, we propose a series of U-shaped DiTs
+(U-DiTs) in the paper and conduct extensive experiments to demonstrate the
+extraordinary performance of U-DiT models. The proposed U-DiT could outperform
+DiT-XL/2 with only 1/6 of its computation cost. Codes are available at
+https://github.com/YuchuanTian/U-DiT.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>12 pages, 5 figures</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Hierarchical Open-Vocabulary 3D Scene Graphs for Language-Grounded Robot
+  Navigation 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2403.17846v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2403.17846v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Abdelrhman Werby, Chenguang Huang, Martin Büchner, Abhinav Valada, Wolfram Burgard
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Recent open-vocabulary robot mapping methods enrich dense geometric maps with
+pre-trained visual-language features. While these maps allow for the prediction
+of point-wise saliency maps when queried for a certain language concept,
+large-scale environments and abstract queries beyond the object level still
+pose a considerable hurdle, ultimately limiting language-grounded robotic
+navigation. In this work, we present HOV-SG, a hierarchical open-vocabulary 3D
+scene graph mapping approach for language-grounded robot navigation. Leveraging
+open-vocabulary vision foundation models, we first obtain state-of-the-art
+open-vocabulary segment-level maps in 3D and subsequently construct a 3D scene
+graph hierarchy consisting of floor, room, and object concepts, each enriched
+with open-vocabulary features. Our approach is able to represent multi-story
+buildings and allows robotic traversal of those using a cross-floor Voronoi
+graph. HOV-SG is evaluated on three distinct datasets and surpasses previous
+baselines in open-vocabulary semantic accuracy on the object, room, and floor
+level while producing a 75% reduction in representation size compared to dense
+open-vocabulary maps. In order to prove the efficacy and generalization
+capabilities of HOV-SG, we showcase successful long-horizon
+language-conditioned robot navigation within real-world multi-storage
+environments. We provide code and trial video data at http://hovsg.github.io/.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Code and video are available at http://hovsg.github.io/</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ VIEScore: Towards Explainable Metrics for Conditional Image Synthesis
+  Evaluation <span class="chip">ACL2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2312.14867v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2312.14867v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Max Ku, Dongfu Jiang, Cong Wei, Xiang Yue, Wenhu Chen
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  In the rapidly advancing field of conditional image generation research,
+challenges such as limited explainability lie in effectively evaluating the
+performance and capabilities of various models. This paper introduces VIEScore,
+a Visual Instruction-guided Explainable metric for evaluating any conditional
+image generation tasks. VIEScore leverages general knowledge from Multimodal
+Large Language Models (MLLMs) as the backbone and does not require training or
+fine-tuning. We evaluate VIEScore on seven prominent tasks in conditional image
+tasks and found: (1) VIEScore (GPT4-o) achieves a high Spearman correlation of
+0.4 with human evaluations, while the human-to-human correlation is 0.45. (2)
+VIEScore (with open-source MLLM) is significantly weaker than GPT-4o and GPT-4v
+in evaluating synthetic images. (3) VIEScore achieves a correlation on par with
+human ratings in the generation tasks but struggles in editing tasks. With
+these results, we believe VIEScore shows its great potential to replace human
+judges in evaluating image synthesis tasks.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted to ACL2024 main</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Picturing Ambiguity: A Visual Twist on the Winograd Schema Challenge <span class="chip">ACL 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.16277v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.16277v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Brendan Park, Madeline Janecek, Naser Ezzati-Jivan, Yifeng Li, Ali Emami
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Large Language Models (LLMs) have demonstrated remarkable success in tasks
+like the Winograd Schema Challenge (WSC), showcasing advanced textual
+common-sense reasoning. However, applying this reasoning to multimodal domains,
+where understanding text and images together is essential, remains a
+substantial challenge. To address this, we introduce WinoVis, a novel dataset
+specifically designed to probe text-to-image models on pronoun disambiguation
+within multimodal contexts. Utilizing GPT-4 for prompt generation and Diffusion
+Attentive Attribution Maps (DAAM) for heatmap analysis, we propose a novel
+evaluation framework that isolates the models' ability in pronoun
+disambiguation from other visual processing challenges. Evaluation of
+successive model versions reveals that, despite incremental advancements,
+Stable Diffusion 2.0 achieves a precision of 56.7% on WinoVis, only marginally
+surpassing random guessing. Further error analysis identifies important areas
+for future research aimed at advancing text-to-image models in their ability to
+interpret and interact with the complex visual world.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>9 pages (excluding references), accepted to ACL 2024 Main Conference</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ NU-Class Net: A Novel Approach for Video Quality Enhancement 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2401.01163v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2401.01163v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Parham Zilouchian Moghaddam, Mehdi Modarressi, Mohammad Amin Sadeghi
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Video content has experienced a surge in popularity, asserting its dominance
+over internet traffic and Internet of Things (IoT) networks. Video compression
+has long been regarded as the primary means of efficiently managing the
+substantial multimedia traffic generated by video-capturing devices.
+Nevertheless, video compression algorithms entail significant computational
+demands in order to achieve substantial compression ratios. This complexity
+presents a formidable challenge when implementing efficient video coding
+standards in resource-constrained embedded systems, such as IoT edge node
+cameras. To tackle this challenge, this paper introduces NU-Class Net, an
+innovative deep-learning model designed to mitigate compression artifacts
+stemming from lossy compression codecs. This enhancement significantly elevates
+the perceptible quality of low-bit-rate videos. By employing the NU-Class Net,
+the video encoder within the video-capturing node can reduce output quality,
+thereby generating low-bit-rate videos and effectively curtailing both
+computation and bandwidth requirements at the edge. On the decoder side, which
+is typically less encumbered by resource limitations, NU-Class Net is applied
+after the video decoder to compensate for artifacts and approximate the quality
+of the original video. Experimental results affirm the efficacy of the proposed
+model in enhancing the perceptible quality of videos, especially those streamed
+at low bit rates.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Domain Transfer Through Image-to-Image Translation for Uncertainty-Aware
+  Prostate Cancer Classification 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2307.00479v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2307.00479v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Meng Zhou, Amoon Jamzad, Jason Izard, Alexandre Menard, Robert Siemens, Parvin Mousavi
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Prostate Cancer (PCa) is a prevalent disease among men, and multi-parametric
+MRIs offer a non-invasive method for its detection. While MRI-based deep
+learning solutions have shown promise in supporting PCa diagnosis, acquiring
+sufficient training data, particularly in local clinics remains challenging.
+One potential solution is to take advantage of publicly available datasets to
+pre-train deep models and fine-tune them on the local data, but multi-source
+MRIs can pose challenges due to cross-domain distribution differences. These
+limitations hinder the adoption of explainable and reliable deep-learning
+solutions in local clinics for PCa diagnosis. In this work, we present a novel
+approach for unpaired image-to-image translation of prostate multi-parametric
+MRIs and an uncertainty-aware training approach for classifying clinically
+significant PCa, to be applied in data-constrained settings such as local and
+small clinics. Our approach involves a novel pipeline for translating unpaired
+3.0T multi-parametric prostate MRIs to 1.5T, thereby augmenting the available
+training data. Additionally, we introduce an evidential deep learning approach
+to estimate model uncertainty and employ dataset filtering techniques during
+training. Furthermore, we propose a simple, yet efficient Evidential Focal
+Loss, combining focal loss with evidential uncertainty, to train our model
+effectively. Our experiments demonstrate that the proposed method significantly
+improves the Area Under ROC Curve (AUC) by over 20% compared to the previous
+work. Our code is available at https://github.com/med-i-lab/DT_UE_PCa
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Preprint. In Submission</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ EvGGS: A Collaborative Learning Framework for Event-based Generalizable
+  Gaussian Splatting 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.14959v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.14959v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Jiaxu Wang, Junhao He, Ziyi Zhang, Mingyuan Sun, Jingkai Sun, Renjing Xu
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Event cameras offer promising advantages such as high dynamic range and low
+latency, making them well-suited for challenging lighting conditions and
+fast-moving scenarios. However, reconstructing 3D scenes from raw event streams
+is difficult because event data is sparse and does not carry absolute color
+information. To release its potential in 3D reconstruction, we propose the
+first event-based generalizable 3D reconstruction framework, called EvGGS,
+which reconstructs scenes as 3D Gaussians from only event input in a
+feedforward manner and can generalize to unseen cases without any retraining.
+This framework includes a depth estimation module, an intensity reconstruction
+module, and a Gaussian regression module. These submodules connect in a
+cascading manner, and we collaboratively train them with a designed joint loss
+to make them mutually promote. To facilitate related studies, we build a novel
+event-based 3D dataset with various material objects and calibrated labels of
+grayscale images, depth maps, camera poses, and silhouettes. Experiments show
+models that have jointly trained significantly outperform those trained
+individually. Our approach performs better than all baselines in reconstruction
+quality, and depth/intensity predictions with satisfactory rendering speed.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ A Pixel Is Worth More Than One 3D Gaussians in Single-View 3D
+  Reconstruction 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20310v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20310v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Jianghao Shen, Nan Xue, Tianfu Wu
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Learning 3D scene representation from a single-view image is a long-standing
+fundamental problem in computer vision, with the inherent ambiguity in
+predicting contents unseen from the input view. Built on the recently proposed
+3D Gaussian Splatting (3DGS), the Splatter Image method has made promising
+progress on fast single-image novel view synthesis via learning a single 3D
+Gaussian for each pixel based on the U-Net feature map of an input image.
+However, it has limited expressive power to represent occluded components that
+are not observable in the input view. To address this problem, this paper
+presents a Hierarchical Splatter Image method in which a pixel is worth more
+than one 3D Gaussians. Specifically, each pixel is represented by a parent 3D
+Gaussian and a small number of child 3D Gaussians. Parent 3D Gaussians are
+learned as done in the vanilla Splatter Image. Child 3D Gaussians are learned
+via a lightweight Multi-Layer Perceptron (MLP) which takes as input the
+projected image features of a parent 3D Gaussian and the embedding of a target
+camera view. Both parent and child 3D Gaussians are learned end-to-end in a
+stage-wise way. The joint condition of input image features from eyes of the
+parent Gaussians and the target camera position facilitates learning to
+allocate child Gaussians to ``see the unseen'', recovering the occluded details
+that are often missed by parent Gaussians.
+  In experiments, the proposed method is tested on the ShapeNet-SRN and CO3D
+datasets with state-of-the-art performance obtained, especially showing
+promising capabilities of reconstructing occluded contents in the input view.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>preprint, under review</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Cross-view Masked Diffusion <span class="highlight-title">Transformer</span>s for Person Image Synthesis <span class="chip">ICML 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2402.01516v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2402.01516v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Trung X. Pham, Zhang Kang, Chang D. Yoo
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  We present X-MDPT ($\underline{Cross}$-view $\underline{M}$asked
+$\underline{D}$iffusion $\underline{P}$rediction $\underline{T}$ransformers), a
+novel diffusion model designed for pose-guided human image generation. X-MDPT
+distinguishes itself by employing masked diffusion transformers that operate on
+latent patches, a departure from the commonly-used Unet structures in existing
+works. The model comprises three key modules: 1) a denoising diffusion
+Transformer, 2) an aggregation network that consolidates conditions into a
+single vector for the diffusion process, and 3) a mask cross-prediction module
+that enhances representation learning with semantic information from the
+reference image. X-MDPT demonstrates scalability, improving FID, SSIM, and
+LPIPS with larger models. Despite its simple design, our model outperforms
+state-of-the-art approaches on the DeepFashion dataset while exhibiting
+efficiency in terms of training parameters, training time, and inference speed.
+Our compact 33MB model achieves an FID of 7.42, surpassing a prior Unet latent
+diffusion approach (FID 8.07) using only $11\times$ fewer parameters. Our best
+model surpasses the pixel-based diffusion with $\frac{2}{3}$ of the parameters
+and achieves $5.43 \times$ faster inference. The code is available at
+https://github.com/trungpx/xmdpt.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>ICML 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Iterative Motion Editing with Natural Language 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2312.11538v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2312.11538v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Purvi Goel, Kuan-Chieh Wang, C. Karen Liu, Kayvon Fatahalian
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Text-to-motion diffusion models can generate realistic animations from text
+prompts, but do not support fine-grained motion editing controls. In this
+paper, we present a method for using natural language to iteratively specify
+local edits to existing character animations, a task that is common in most
+computer animation workflows. Our key idea is to represent a space of motion
+edits using a set of kinematic motion editing operators (MEOs) whose effects on
+the source motion is well-aligned with user expectations. We provide an
+algorithm that leverages pre-existing language models to translate textual
+descriptions of motion edits into source code for programs that define and
+execute sequences of MEOs on a source animation. We execute MEOs by first
+translating them into keyframe constraints, and then use diffusion-based motion
+models to generate output motions that respect these constraints. Through a
+user study and quantitative evaluation, we demonstrate that our system can
+perform motion edits that respect the animator's editing intent, remain
+faithful to the original animation (it edits the original animation, but does
+not dramatically change it), and yield realistic character animation results.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Interpreting and Improving Diffusion Models from an Optimization
+  Perspective <span class="chip">ICML 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2306.04848v4">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2306.04848v4.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Frank Permenter, Chenyang Yuan
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Denoising is intuitively related to projection. Indeed, under the manifold
+hypothesis, adding random noise is approximately equivalent to orthogonal
+perturbation. Hence, learning to denoise is approximately learning to project.
+In this paper, we use this observation to interpret denoising diffusion models
+as approximate gradient descent applied to the Euclidean distance function. We
+then provide straight-forward convergence analysis of the DDIM sampler under
+simple assumptions on the projection error of the denoiser. Finally, we propose
+a new gradient-estimation sampler, generalizing DDIM using insights from our
+theoretical results. In as few as 5-10 function evaluations, our sampler
+achieves state-of-the-art FID scores on pretrained CIFAR-10 and CelebA models
+and can generate high quality samples on latent diffusion models.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>24 pages, 9 figures, 4 tables. To appear in ICML 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Agent Smith: A Single Image Can Jailbreak One Million Multimodal LLM
+  Agents Exponentially Fast <span class="chip">ICML 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2402.08567v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2402.08567v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Xiangming Gu, Xiaosen Zheng, Tianyu Pang, Chao Du, Qian Liu, Ye Wang, Jing Jiang, Min Lin
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  A multimodal large language model (MLLM) agent can receive instructions,
+capture images, retrieve histories from memory, and decide which tools to use.
+Nonetheless, red-teaming efforts have revealed that adversarial images/prompts
+can jailbreak an MLLM and cause unaligned behaviors. In this work, we report an
+even more severe safety issue in multi-agent environments, referred to as
+infectious jailbreak. It entails the adversary simply jailbreaking a single
+agent, and without any further intervention from the adversary, (almost) all
+agents will become infected exponentially fast and exhibit harmful behaviors.
+To validate the feasibility of infectious jailbreak, we simulate multi-agent
+environments containing up to one million LLaVA-1.5 agents, and employ
+randomized pair-wise chat as a proof-of-concept instantiation for multi-agent
+interaction. Our results show that feeding an (infectious) adversarial image
+into the memory of any randomly chosen agent is sufficient to achieve
+infectious jailbreak. Finally, we derive a simple principle for determining
+whether a defense mechanism can provably restrain the spread of infectious
+jailbreak, but how to design a practical defense that meets this principle
+remains an open question to investigate. Our project page is available at
+https://sail-sg.github.io/Agent-Smith/.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>ICML 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Connecting the Dots: Collaborative Fine-tuning for Black-Box
+  Vision-Language Models <span class="chip">ICML 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2402.04050v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2402.04050v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Zhengbo Wang, Jian Liang, Ran He, Zilei Wang, Tieniu Tan
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  With the emergence of pretrained vision-language models (VLMs), considerable
+efforts have been devoted to fine-tuning them for downstream tasks. Despite the
+progress made in designing efficient fine-tuning methods, such methods require
+access to the model's parameters, which can be challenging as model owners
+often opt to provide their models as a black box to safeguard model ownership.
+This paper proposes a \textbf{C}ollabo\textbf{ra}tive
+\textbf{F}ine-\textbf{T}uning (\textbf{CraFT}) approach for fine-tuning
+black-box VLMs to downstream tasks, where one only has access to the input
+prompts and the output predictions of the model. CraFT comprises two modules, a
+prompt generation module for learning text prompts and a prediction refinement
+module for enhancing output predictions in residual style. Additionally, we
+introduce an auxiliary prediction-consistent loss to promote consistent
+optimization across these modules. These modules are optimized by a novel
+collaborative training algorithm. Extensive experiments on few-shot
+classification over 15 datasets demonstrate the superiority of CraFT. The
+results show that CraFT achieves a decent gain of about 12\% with 16-shot
+datasets and only 8,000 queries. Moreover, CraFT trains faster and uses only
+about 1/80 of the memory footprint for deployment, while sacrificing only
+1.62\% compared to the white-box method. Our code is publicly available at
+https://github.com/mrflogs/CraFT .
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted by ICML 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Patch-Wise <span class="highlight-title">Self-Supervised</span> Visual Representation Learning: A
+  Fine-Grained Approach 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2310.18651v5">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2310.18651v5.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Ali Javidani, Mohammad Amin Sadeghi, Babak Nadjar Araabi
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Self-supervised visual representation learning traditionally focuses on
+image-level instance discrimination. Our study introduces an innovative,
+fine-grained dimension by integrating patch-level discrimination into these
+methodologies. This integration allows for the simultaneous analysis of local
+and global visual features, thereby enriching the quality of the learned
+representations. Initially, the original images undergo spatial augmentation.
+Subsequently, we employ a distinctive photometric patch-level augmentation,
+where each patch is individually augmented, independent from other patches
+within the same view. This approach generates a diverse training dataset with
+distinct color variations in each segment. The augmented images are then
+processed through a self-distillation learning framework, utilizing the Vision
+Transformer (ViT) as its backbone. The proposed method minimizes the
+representation distances across both image and patch levels to capture details
+from macro to micro perspectives. To this end, we present a simple yet
+effective patch-matching algorithm to find the corresponding patches across the
+augmented views. Thanks to the efficient structure of the patch-matching
+algorithm, our method reduces computational complexity compared to similar
+approaches. Consequently, we achieve an advanced understanding of the model
+without adding significant computational requirements. We have extensively
+pretrained our method on datasets of varied scales, such as Cifar10,
+ImageNet-100, and ImageNet-1K. It demonstrates superior performance over
+state-of-the-art self-supervised representation learning methods in image
+classification and downstream tasks, such as copy detection and image
+retrieval. The implementation of our method is accessible on GitHub.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>15 pages</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ ZeroNLG: Aligning and Autoencoding Domains for Zero-Shot Multimodal and
+  Multilingual Natural Language Generation 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2303.06458v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2303.06458v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Bang Yang, Fenglin Liu, Yuexian Zou, Xian Wu, Yaowei Wang, David A. Clifton
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Natural Language Generation (NLG) accepts input data in the form of images,
+videos, or text and generates corresponding natural language text as output.
+Existing NLG methods mainly adopt a supervised approach and rely heavily on
+coupled data-to-text pairs. However, for many targeted scenarios and for
+non-English languages, sufficient quantities of labeled data are often not
+available. To relax the dependency on labeled data of downstream tasks, we
+propose an intuitive and effective zero-shot learning framework, ZeroNLG, which
+can deal with multiple NLG tasks, including image-to-text (image captioning),
+video-to-text (video captioning), and text-to-text (neural machine
+translation), across English, Chinese, German, and French within a unified
+framework. ZeroNLG does not require any labeled downstream pairs for training.
+During training, ZeroNLG (i) projects different domains (across modalities and
+languages) to corresponding coordinates in a shared common latent space; (ii)
+bridges different domains by aligning their corresponding coordinates in this
+space; and (iii) builds an unsupervised multilingual auto-encoder to learn to
+generate text by reconstructing the input text given its coordinate in shared
+latent space. Consequently, during inference, based on the data-to-text
+pipeline, ZeroNLG can generate target sentences across different languages
+given the coordinate of input data in the common space. Within this unified
+framework, given visual (imaging or video) data as input, ZeroNLG can perform
+zero-shot visual captioning; given textual sentences as input, ZeroNLG can
+perform zero-shot machine translation. We present the results of extensive
+experiments on twelve NLG tasks, showing that, without using any labeled
+downstream pairs for training, ZeroNLG generates high-quality and believable
+outputs and significantly outperforms existing zero-shot methods.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted by TPAMI (Our code and data are available at
+  https://github.com/yangbang18/ZeroNLG)</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ FissionFusion: Fast Geometric Generation and Hierarchical Souping for
+  Medical Image Analysis 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2403.13341v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2403.13341v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Santosh Sanjeev, Nuren Zhaksylyk, Ibrahim Almakky, Anees Ur Rehman Hashmi, Mohammad Areeb Qazi, Mohammad Yaqub
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  The scarcity of well-annotated medical datasets requires leveraging transfer
+learning from broader datasets like ImageNet or pre-trained models like CLIP.
+Model soups averages multiple fine-tuned models aiming to improve performance
+on In-Domain (ID) tasks and enhance robustness against Out-of-Distribution
+(OOD) datasets. However, applying these methods to the medical imaging domain
+faces challenges and results in suboptimal performance. This is primarily due
+to differences in error surface characteristics that stem from data
+complexities such as heterogeneity, domain shift, class imbalance, and
+distributional shifts between training and testing phases. To address this
+issue, we propose a hierarchical merging approach that involves local and
+global aggregation of models at various levels based on models' hyperparameter
+configurations. Furthermore, to alleviate the need for training a large number
+of models in the hyperparameter search, we introduce a computationally
+efficient method using a cyclical learning rate scheduler to produce multiple
+models for aggregation in the weight space. Our method demonstrates significant
+improvements over the model souping approach across multiple datasets (around
+6% gain in HAM10000 and CheXpert datasets) while maintaining low computational
+costs for model generation and selection. Moreover, we achieve better results
+on OOD datasets than model soups. The code is available at
+https://github.com/BioMedIA-MBZUAI/FissionFusion.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Weak Augmentation Guided Relational <span class="highlight-title">Self-Supervised</span> Learning <span class="chip">NeurIPS 2021</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2203.08717v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2203.08717v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Mingkai Zheng, Shan You, Fei Wang, Chen Qian, Changshui Zhang, Xiaogang Wang, Chang Xu
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Self-supervised Learning (SSL) including the mainstream contrastive learning
+has achieved great success in learning visual representations without data
+annotations. However, most methods mainly focus on the instance level
+information (\ie, the different augmented images of the same instance should
+have the same feature or cluster into the same class), but there is a lack of
+attention on the relationships between different instances. In this paper, we
+introduce a novel SSL paradigm, which we term as relational self-supervised
+learning (ReSSL) framework that learns representations by modeling the
+relationship between different instances. Specifically, our proposed method
+employs sharpened distribution of pairwise similarities among different
+instances as \textit{relation} metric, which is thus utilized to match the
+feature embeddings of different augmentations. To boost the performance, we
+argue that weak augmentations matter to represent a more reliable relation, and
+leverage momentum strategy for practical efficiency. The designed asymmetric
+predictor head and an InfoNCE warm-up strategy enhance the robustness to
+hyper-parameters and benefit the resulting performance. Experimental results
+show that our proposed ReSSL substantially outperforms the state-of-the-art
+methods across different network architectures, including various lightweight
+networks (\eg, EfficientNet and MobileNet).
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Extended version of NeurIPS 2021 paper. arXiv admin note: substantial
+  text overlap with arXiv:2107.09282</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ SpeechAct: Towards Generating Whole-body Motion from Speech 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2311.17425v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2311.17425v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Jinsong Zhang, Minjie Zhu, Yuxiang Zhang, Yebin Liu, Kun Li
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  This paper addresses the problem of generating whole-body motion from speech.
+Despite great successes, prior methods still struggle to produce reasonable and
+diverse whole-body motions from speech. This is due to their reliance on
+suboptimal representations and a lack of strategies for generating diverse
+results. To address these challenges, we present a novel hybrid point
+representation to achieve accurate and continuous motion generation, e.g.,
+avoiding foot skating, and this representation can be transformed into an
+easy-to-use representation, i.e., SMPL-X body mesh, for many applications. To
+generate whole-body motion from speech, for facial motion, closely tied to the
+audio signal, we introduce an encoder-decoder architecture to achieve
+deterministic outcomes. However, for the body and hands, which have weaker
+connections to the audio signal, we aim to generate diverse yet reasonable
+motions. To boost diversity in motion generation, we propose a contrastive
+motion learning method to encourage the model to produce more distinctive
+representations. Specifically, we design a robust VQ-VAE to learn a quantized
+motion codebook using our hybrid representation. Then, we regress the motion
+representation from the audio signal by a translation model employing our
+contrastive motion learning method. Experimental results validate the superior
+performance and the correctness of our model. The project page is available for
+research purposes at http://cic.tju.edu.cn/faculty/likun/projects/SpeechAct.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>the manuscript should be revised</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Multistep Consistency Models 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2403.06807v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2403.06807v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Jonathan Heek, Emiel Hoogeboom, Tim Salimans
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Diffusion models are relatively easy to train but require many steps to
+generate samples. Consistency models are far more difficult to train, but
+generate samples in a single step.
+  In this paper we propose Multistep Consistency Models: A unification between
+Consistency Models (Song et al., 2023) and TRACT (Berthelot et al., 2023) that
+can interpolate between a consistency model and a diffusion model: a trade-off
+between sampling speed and sampling quality. Specifically, a 1-step consistency
+model is a conventional consistency model whereas a $\infty$-step consistency
+model is a diffusion model.
+  Multistep Consistency Models work really well in practice. By increasing the
+sample budget from a single step to 2-8 steps, we can train models more easily
+that generate higher quality samples, while retaining much of the sampling
+speed benefits. Notable results are 1.4 FID on Imagenet 64 in 8 step and 2.1
+FID on Imagenet128 in 8 steps with consistency distillation, using simple
+losses without adversarial training. We also show that our method scales to a
+text-to-image diffusion model, generating samples that are close to the quality
+of the original model.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ DP-IQA: Utilizing Diffusion Prior for Blind Image Quality Assessment in
+  the Wild 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.19996v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.19996v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Honghao Fu, Yufei Wang, Wenhan Yang, Bihan Wen
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Image quality assessment (IQA) plays a critical role in selecting
+high-quality images and guiding compression and enhancement methods in a series
+of applications. The blind IQA, which assesses the quality of in-the-wild
+images containing complex authentic distortions without reference images, poses
+greater challenges. Existing methods are limited to modeling a uniform
+distribution with local patches and are bothered by the gap between low and
+high-level visions (caused by widely adopted pre-trained classification
+networks). In this paper, we propose a novel IQA method called diffusion
+priors-based IQA (DP-IQA), which leverages the prior knowledge from the
+pre-trained diffusion model with its excellent powers to bridge semantic gaps
+in the perception of the visual quality of images. Specifically, we use
+pre-trained stable diffusion as the backbone, extract multi-level features from
+the denoising U-Net during the upsampling process at a specified timestep, and
+decode them to estimate the image quality score. The text and image adapters
+are adopted to mitigate the domain gap for downstream tasks and correct the
+information loss caused by the variational autoencoder bottleneck. Finally, we
+distill the knowledge in the above model into a CNN-based student model,
+significantly reducing the parameter to enhance applicability, with the student
+model performing similarly or even better than the teacher model surprisingly.
+Experimental results demonstrate that our DP-IQA achieves state-of-the-art
+results on various in-the-wild datasets with better generalization capability,
+which shows the superiority of our method in global modeling and utilizing the
+hierarchical feature clues of diffusion for evaluating image quality.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Efficient Masked Autoencoders with Self-Consistency 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2302.14431v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2302.14431v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Zhaowen Li, Yousong Zhu, Zhiyang Chen, Wei Li, Chaoyang Zhao, Rui Zhao, Ming Tang, Jinqiao Wang
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Inspired by the masked language modeling (MLM) in natural language processing
+tasks, the masked image modeling (MIM) has been recognized as a strong
+self-supervised pre-training method in computer vision. However, the high
+random mask ratio of MIM results in two serious problems: 1) the inadequate
+data utilization of images within each iteration brings prolonged pre-training,
+and 2) the high inconsistency of predictions results in unreliable generations,
+$i.e.$, the prediction of the identical patch may be inconsistent in different
+mask rounds, leading to divergent semantics in the ultimately generated
+outcomes. To tackle these problems, we propose the efficient masked
+autoencoders with self-consistency (EMAE) to improve the pre-training
+efficiency and increase the consistency of MIM. In particular, we present a
+parallel mask strategy that divides the image into K non-overlapping parts,
+each of which is generated by a random mask with the same mask ratio. Then the
+MIM task is conducted parallelly on all parts in an iteration and the model
+minimizes the loss between the predictions and the masked patches. Besides, we
+design the self-consistency learning to further maintain the consistency of
+predictions of overlapping masked patches among parts. Overall, our method is
+able to exploit the data more efficiently and obtains reliable representations.
+Experiments on ImageNet show that EMAE achieves the best performance on
+ViT-Large with only 13% of MAE pre-training time using NVIDIA A100 GPUs. After
+pre-training on diverse datasets, EMAE consistently obtains state-of-the-art
+transfer ability on a variety of downstream tasks, such as image
+classification, object detection, and semantic segmentation.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accept by IEEE Transactions on Pattern Analysis and Machine
+  Intelligence (TPAMI)</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ DeCoF: Generated Video Detection via Frame Consistency: The First
+  Benchmark <span class="highlight-title">Dataset</span> 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2402.02085v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2402.02085v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Long Ma, Jiajia Zhang, Hongping Deng, Ningyu Zhang, Qinglang Guo, Haiyang Yu, Yong Liao, Pengyuan Zhou
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  The escalating quality of video generated by advanced video generation
+methods results in new security challenges, while there have been few relevant
+research efforts: 1) There is no open-source dataset for generated video
+detection, 2) No generated video detection method has been proposed so far. To
+this end, we propose an open-source dataset and a detection method for
+generated video for the first time. First, we propose a scalable dataset
+consisting of 964 prompts, covering various forgery targets, scenes, behaviors,
+and actions, as well as various generation models with different architectures
+and generation methods, including the most popular commercial models like
+OpenAI's Sora and Google's Veo. Second, we found via probing experiments that
+spatial artifact-based detectors lack generalizability. Hence, we propose a
+simple yet effective \textbf{de}tection model based on \textbf{f}rame
+\textbf{co}nsistency (\textbf{DeCoF}), which focuses on temporal artifacts by
+eliminating the impact of spatial artifacts during feature learning. Extensive
+experiments demonstrate the efficacy of DeCoF in detecting videos generated by
+unseen video generation models and confirm its powerful generalizability across
+several commercially proprietary models. Our code and dataset will be released
+at \url{https://anonymous.4open.science/r/DeCoF-8394}.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Automatic Cranial Defect Reconstruction with <span class="highlight-title">Self-Supervised</span> Deep
+  Deformable Masked Autoencoders 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2404.13106v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2404.13106v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Marek Wodzinski, Daria Hemmerling, Mateusz Daniol
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Thousands of people suffer from cranial injuries every year. They require
+personalized implants that need to be designed and manufactured before the
+reconstruction surgery. The manual design is expensive and time-consuming
+leading to searching for algorithms whose goal is to automatize the process.
+The problem can be formulated as volumetric shape completion and solved by deep
+neural networks dedicated to supervised image segmentation. However, such an
+approach requires annotating the ground-truth defects which is costly and
+time-consuming. Usually, the process is replaced with synthetic defect
+generation. However, even the synthetic ground-truth generation is
+time-consuming and limits the data heterogeneity, thus the deep models'
+generalizability. In our work, we propose an alternative and simple approach to
+use a self-supervised masked autoencoder to solve the problem. This approach by
+design increases the heterogeneity of the training set and can be seen as a
+form of data augmentation. We compare the proposed method with several
+state-of-the-art deep neural networks and show both the quantitative and
+qualitative improvement on the SkullBreak and SkullFix datasets. The proposed
+method can be used to efficiently reconstruct the cranial defects in real time.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Rethinking Efficient and Effective Point-based Networks for Event Camera
+  Classification and Regression: EventMamba 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.06116v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.06116v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Hongwei Ren, Yue Zhou, Jiadong Zhu, Haotian Fu, Yulong Huang, Xiaopeng Lin, Yuetong Fang, Fei Ma, Hao Yu, Bojun Cheng
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Event cameras, drawing inspiration from biological systems, efficiently
+detect changes in ambient light with low latency and high dynamic range while
+consuming minimal power. The most current approach to processing event data
+often involves converting it into frame-based representations, which is
+well-established in traditional vision. However, this approach neglects the
+sparsity of event data, loses fine-grained temporal information during the
+transformation process, and increases the computational burden, making it
+ineffective for characterizing event camera properties. In contrast, Point
+Cloud is a popular representation for 3D processing and is better suited to
+match the sparse and asynchronous nature of the event camera. Nevertheless,
+despite the theoretical compatibility of point-based methods with event
+cameras, the results show a performance gap that is not yet satisfactory
+compared to frame-based methods. In order to bridge the performance gap, we
+propose EventMamba, an efficient and effective Point Cloud framework that
+achieves competitive results even compared to the state-of-the-art (SOTA)
+frame-based method in both classification and regression tasks. This notable
+accomplishment is facilitated by our rethinking of the distinction between
+Event Cloud and Point Cloud, emphasizing effective temporal information
+extraction through optimized network structures. Specifically, EventMamba
+leverages temporal aggregation and State Space Model (SSM) based Mamba boasting
+enhanced temporal information extraction capabilities. Through a hierarchical
+structure, EventMamba is adept at abstracting local and global spatial features
+and implicit and explicit temporal features. By adhering to the lightweight
+design principle, EventMamba delivers impressive results with minimal
+computational resource utilization, demonstrating its efficiency and
+effectiveness.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Extension Journal of TTPOINT and PEPNet</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ PLUG: Revisiting Amodal Segmentation with Foundation Model and
+  Hierarchical Focus 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.16094v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.16094v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Zhaochen Liu, Limeng Qiao, Xiangxiang Chu, Tingting Jiang
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Aiming to predict the complete shapes of partially occluded objects, amodal
+segmentation is an important step towards visual intelligence. With crucial
+significance, practical prior knowledge derives from sufficient training, while
+limited amodal annotations pose challenges to achieve better performance. To
+tackle this problem, utilizing the mighty priors accumulated in the foundation
+model, we propose the first SAM-based amodal segmentation approach, PLUG.
+Methodologically, a novel framework with hierarchical focus is presented to
+better adapt the task characteristics and unleash the potential capabilities of
+SAM. In the region level, due to the association and division in visible and
+occluded areas, inmodal and amodal regions are assigned as the focuses of
+distinct branches to avoid mutual disturbance. In the point level, we introduce
+the concept of uncertainty to explicitly assist the model in identifying and
+focusing on ambiguous points. Guided by the uncertainty map, a
+computation-economic point loss is applied to improve the accuracy of predicted
+boundaries. Experiments are conducted on several prominent datasets, and the
+results show that our proposed method outperforms existing methods with large
+margins. Even with fewer total parameters, our method still exhibits remarkable
+advantages.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Portrait4D: Learning One-Shot 4D Head Avatar Synthesis using Synthetic
+  Data <span class="chip">CVPR24</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2311.18729v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2311.18729v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Yu Deng, Duomin Wang, Xiaohang Ren, Xingyu Chen, Baoyuan Wang
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Existing one-shot 4D head synthesis methods usually learn from monocular
+videos with the aid of 3DMM reconstruction, yet the latter is evenly
+challenging which restricts them from reasonable 4D head synthesis. We present
+a method to learn one-shot 4D head synthesis via large-scale synthetic data.
+The key is to first learn a part-wise 4D generative model from monocular images
+via adversarial learning, to synthesize multi-view images of diverse identities
+and full motions as training data; then leverage a transformer-based animatable
+triplane reconstructor to learn 4D head reconstruction using the synthetic
+data. A novel learning strategy is enforced to enhance the generalizability to
+real images by disentangling the learning process of 3D reconstruction and
+reenactment. Experiments demonstrate our superiority over the prior art.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>CVPR24 camera ready version. Project page:
+  https://yudeng.github.io/Portrait4D/</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Scalable Wasserstein Gradient Flow for Generative Modeling through
+  Unbalanced Optimal Transport 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2402.05443v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2402.05443v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Jaemoo Choi, Jaewoong Choi, Myungjoo Kang
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Wasserstein Gradient Flow (WGF) describes the gradient dynamics of
+probability density within the Wasserstein space. WGF provides a promising
+approach for conducting optimization over the probability distributions.
+Numerically approximating the continuous WGF requires the time discretization
+method. The most well-known method for this is the JKO scheme. In this regard,
+previous WGF models employ the JKO scheme and parametrize transport map for
+each JKO step. However, this approach results in quadratic training complexity
+$O(K^2)$ with the number of JKO step $K$. This severely limits the scalability
+of WGF models. In this paper, we introduce a scalable WGF-based generative
+model, called Semi-dual JKO (S-JKO). Our model is based on the semi-dual form
+of the JKO step, derived from the equivalence between the JKO step and the
+Unbalanced Optimal Transport. Our approach reduces the training complexity to
+$O(K)$. We demonstrate that our model significantly outperforms existing
+WGF-based generative models, achieving FID scores of 2.62 on CIFAR-10 and 5.46
+on CelebA-HQ-256, which are comparable to state-of-the-art image generative
+models.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>22 pages, 11 figures</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Video-LaVIT: Unified Video-Language <span class="highlight-title">Pre-train</span>ing with Decoupled
+  Visual-Motional Tokenization 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2402.03161v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2402.03161v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Yang Jin, Zhicheng Sun, Kun Xu, Kun Xu, Liwei Chen, Hao Jiang, Quzhe Huang, Chengru Song, Yuliang Liu, Di Zhang, Yang Song, Kun Gai, Yadong Mu
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  In light of recent advances in multimodal Large Language Models (LLMs), there
+is increasing attention to scaling them from image-text data to more
+informative real-world videos. Compared to static images, video poses unique
+challenges for effective large-scale pre-training due to the modeling of its
+spatiotemporal dynamics. In this paper, we address such limitations in
+video-language pre-training with an efficient video decomposition that
+represents each video as keyframes and temporal motions. These are then adapted
+to an LLM using well-designed tokenizers that discretize visual and temporal
+information as a few tokens, thus enabling unified generative pre-training of
+videos, images, and text. At inference, the generated tokens from the LLM are
+carefully recovered to the original continuous pixel space to create various
+video content. Our proposed framework is both capable of comprehending and
+generating image and video content, as demonstrated by its competitive
+performance across 13 multimodal benchmarks in image and video understanding
+and generation. Our code and models are available at
+https://video-lavit.github.io.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Stochastic Conditional Diffusion Models for Robust Semantic Image
+  Synthesis <span class="chip">ICML 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2402.16506v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2402.16506v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Juyeon Ko, Inho Kong, Dogyun Park, Hyunwoo J. Kim
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Semantic image synthesis (SIS) is a task to generate realistic images
+corresponding to semantic maps (labels). However, in real-world applications,
+SIS often encounters noisy user inputs. To address this, we propose Stochastic
+Conditional Diffusion Model (SCDM), which is a robust conditional diffusion
+model that features novel forward and generation processes tailored for SIS
+with noisy labels. It enhances robustness by stochastically perturbing the
+semantic label maps through Label Diffusion, which diffuses the labels with
+discrete diffusion. Through the diffusion of labels, the noisy and clean
+semantic maps become similar as the timestep increases, eventually becoming
+identical at $t=T$. This facilitates the generation of an image close to a
+clean image, enabling robust generation. Furthermore, we propose a class-wise
+noise schedule to differentially diffuse the labels depending on the class. We
+demonstrate that the proposed method generates high-quality samples through
+extensive experiments and analyses on benchmark datasets, including a novel
+experimental setup simulating human errors during real-world applications. Code
+is available at https://github.com/mlvlab/SCDM.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>ICML 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Edit As You Wish: Video Caption Editing with Multi-grained User Control 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2305.08389v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2305.08389v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Linli Yao, Yuanmeng Zhang, Ziheng Wang, Xinglin Hou, Tiezheng Ge, Yuning Jiang, Xu Sun, Qin Jin
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Automatically narrating videos in natural language complying with user
+requests, i.e. Controllable Video Captioning task, can help people manage
+massive videos with desired intentions. However, existing works suffer from two
+shortcomings: 1) the control signal is single-grained which can not satisfy
+diverse user intentions; 2) the video description is generated in a single
+round which can not be further edited to meet dynamic needs. In this paper, we
+propose a novel \textbf{V}ideo \textbf{C}aption \textbf{E}diting \textbf{(VCE)}
+task to automatically revise an existing video description guided by
+multi-grained user requests. Inspired by human writing-revision habits, we
+design the user command as a pivotal triplet \{\textit{operation, position,
+attribute}\} to cover diverse user needs from coarse-grained to fine-grained.
+To facilitate the VCE task, we \textit{automatically} construct an open-domain
+benchmark dataset named VATEX-EDIT and \textit{manually} collect an e-commerce
+dataset called EMMAD-EDIT. We further propose a specialized small-scale model
+(i.e., OPA) compared with two generalist Large Multi-modal Models to perform an
+exhaustive analysis of the novel task. For evaluation, we adopt comprehensive
+metrics considering caption fluency, command-caption consistency, and
+video-caption alignment. Experiments reveal the task challenges of fine-grained
+multi-modal semantics understanding and processing. Our datasets, codes, and
+evaluation tools are ready to be open-sourced.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Boosting Continual Learning of Vision-Language Models via
+  Mixture-of-Experts Adapters <span class="chip">CVPR2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2403.11549v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2403.11549v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Jiazuo Yu, Yunzhi Zhuge, Lu Zhang, Ping Hu, Dong Wang, Huchuan Lu, You He
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Continual learning can empower vision-language models to continuously acquire
+new knowledge, without the need for access to the entire historical dataset.
+However, mitigating the performance degradation in large-scale models is
+non-trivial due to (i) parameter shifts throughout lifelong learning and (ii)
+significant computational burdens associated with full-model tuning. In this
+work, we present a parameter-efficient continual learning framework to
+alleviate long-term forgetting in incremental learning with vision-language
+models. Our approach involves the dynamic expansion of a pre-trained CLIP
+model, through the integration of Mixture-of-Experts (MoE) adapters in response
+to new tasks. To preserve the zero-shot recognition capability of
+vision-language models, we further introduce a Distribution Discriminative
+Auto-Selector (DDAS) that automatically routes in-distribution and
+out-of-distribution inputs to the MoE Adapter and the original CLIP,
+respectively. Through extensive experiments across various settings, our
+proposed method consistently outperforms previous state-of-the-art approaches
+while concurrently reducing parameter training burdens by 60%. Our code locates
+at https://github.com/JiazuoYu/MoE-Adapters4CL
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>This work is accepted by CVPR2024. More modifications may be
+  performed</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ SPAFormer: Sequential 3D Part Assembly with <span class="highlight-title">Transformer</span>s <span class="chip">SP</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2403.05874v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2403.05874v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Boshen Xu, Sipeng Zheng, Qin Jin
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  We introduce SPAFormer, an innovative model designed to overcome the
+combinatorial explosion challenge in the 3D Part Assembly (3D-PA) task. This
+task requires accurate prediction of each part's pose and shape in sequential
+steps, and as the number of parts increases, the possible assembly combinations
+increase exponentially, leading to a combinatorial explosion that severely
+hinders the efficacy of 3D-PA. SPAFormer addresses this problem by leveraging
+weak constraints from assembly sequences, effectively reducing the solution
+space's complexity. Since assembly part sequences convey construction rules
+similar to sentences being structured through words, our model explores both
+parallel and autoregressive generation. It further enhances assembly through
+knowledge enhancement strategies that utilize the attributes of parts and their
+sequence information, enabling it to capture the inherent assembly pattern and
+relationships among sequentially ordered parts. We also construct a more
+challenging benchmark named PartNet-Assembly covering 21 varied categories to
+more comprehensively validate the effectiveness of SPAFormer. Extensive
+experiments demonstrate the superior generalization capabilities of SPAFormer,
+particularly with multi-tasking and in scenarios requiring long-horizon
+assembly. Codes and model weights will be released at
+https://github.com/xuboshen/SPAFormer.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Code: https://github.com/xuboshen/SPAFormer</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ EgoNCE++: Do Egocentric Video-Language Models Really Understand
+  Hand-Object Interactions? 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.17719v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.17719v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Boshen Xu, Ziheng Wang, Yang Du, Zhinan Song, Sipeng Zheng, Qin Jin
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Egocentric video-language pretraining is a crucial paradigm to advance the
+learning of egocentric hand-object interactions (EgoHOI). Despite the great
+success on existing testbeds, these benchmarks focus more on closed-set visual
+concepts or limited scenarios. Due to the occurrence of diverse EgoHOIs in the
+real world, we propose an open-vocabulary benchmark named EgoHOIBench to reveal
+the diminished performance of current egocentric video-language models (EgoVLM)
+on fined-grained concepts, indicating that these models still lack a full
+spectrum of egocentric understanding. We attribute this performance gap to
+insufficient fine-grained supervision and strong bias towards understanding
+objects rather than temporal dynamics in current methods. To tackle these
+issues, we introduce a novel asymmetric contrastive objective for EgoHOI named
+EgoNCE++. For video-to-text loss, we enhance text supervision through the
+generation of negative captions by leveraging the in-context learning of large
+language models to perform HOI-related word substitution. For text-to-video
+loss, we propose an object-centric positive video sampling strategy that
+aggregates video representations by the same nouns. Our extensive experiments
+demonstrate that EgoNCE++ significantly boosts open-vocabulary HOI recognition,
+multi-instance retrieval, and action recognition tasks across various
+egocentric models, with improvements of up to +26.55%. Our code is available at
+https://github.com/xuboshen/EgoNCEpp.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Code: https://github.com/xuboshen/EgoNCEpp</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ DoRA: Weight-Decomposed Low-Rank Adaptation 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2402.09353v5">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2402.09353v5.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Shih-Yang Liu, Chien-Yi Wang, Hongxu Yin, Pavlo Molchanov, Yu-Chiang Frank Wang, Kwang-Ting Cheng, Min-Hung Chen
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Among the widely used parameter-efficient fine-tuning (PEFT) methods, LoRA
+and its variants have gained considerable popularity because of avoiding
+additional inference costs. However, there still often exists an accuracy gap
+between these methods and full fine-tuning (FT). In this work, we first
+introduce a novel weight decomposition analysis to investigate the inherent
+differences between FT and LoRA. Aiming to resemble the learning capacity of FT
+from the findings, we propose Weight-Decomposed Low-Rank Adaptation (DoRA).
+DoRA decomposes the pre-trained weight into two components, magnitude and
+direction, for fine-tuning, specifically employing LoRA for directional updates
+to efficiently minimize the number of trainable parameters. By employing \ours,
+we enhance both the learning capacity and training stability of LoRA while
+avoiding any additional inference overhead. \ours~consistently outperforms LoRA
+on fine-tuning LLaMA, LLaVA, and VL-BART on various downstream tasks, such as
+commonsense reasoning, visual instruction tuning, and image/video-text
+understanding. Code is available at https://github.com/NVlabs/DoRA.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Code available at https://github.com/NVlabs/DoRA</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ The Socface Project: Large-Scale Collection, Processing, and Analysis of
+  a Century of French Censuses 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2404.18706v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2404.18706v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Mélodie Boillet, Solène Tarride, Manon Blanco, Valentin Rigal, Yoann Schneider, Bastien Abadie, Lionel Kesztenbaum, Christopher Kermorvant
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  This paper presents a complete processing workflow for extracting information
+from French census lists from 1836 to 1936. These lists contain information
+about individuals living in France and their households. We aim at extracting
+all the information contained in these tables using automatic handwritten table
+recognition. At the end of the Socface project, in which our work is taking
+place, the extracted information will be redistributed to the departmental
+archives, and the nominative lists will be freely available to the public,
+allowing anyone to browse hundreds of millions of records. The extracted data
+will be used by demographers to analyze social change over time, significantly
+improving our understanding of French economic and social structures. For this
+project, we developed a complete processing workflow: large-scale data
+collection from French departmental archives, collaborative annotation of
+documents, training of handwritten table text and structure recognition models,
+and mass processing of millions of images. We present the tools we have
+developed to easily collect and process millions of pages. We also show that it
+is possible to process such a wide variety of tables with a single table
+recognition model that uses the image of the entire page to recognize
+information about individuals, categorize them and automatically group them
+into households. The entire process has been successfully used to process the
+documents of a departmental archive, representing more than 450,000 images.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Spurious Feature Eraser: Stabilizing Test-Time Adaptation for
+  Vision-Language Foundation Model 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2403.00376v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2403.00376v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Huan Ma, Yan Zhu, Changqing Zhang, Peilin Zhao, Baoyuan Wu, Long-Kai Huang, Qinghua Hu, Bingzhe Wu
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Vision-language foundation models have exhibited remarkable success across a
+multitude of downstream tasks due to their scalability on extensive image-text
+paired data. However, these models also display significant limitations when
+applied to downstream tasks, such as fine-grained image classification, as a
+result of ``decision shortcuts'' that hinder their generalization capabilities.
+In this work, we find that the CLIP model possesses a rich set of features,
+encompassing both \textit{desired invariant causal features} and
+\textit{undesired decision shortcuts}. Moreover, the underperformance of CLIP
+on downstream tasks originates from its inability to effectively utilize
+pre-trained features in accordance with specific task requirements. To address
+this challenge, we propose a simple yet effective method, Spurious Feature
+Eraser (SEraser), to alleviate the decision shortcuts by erasing the spurious
+features. Specifically, we introduce a test-time prompt tuning paradigm that
+optimizes a learnable prompt, thereby compelling the model to exploit invariant
+features while disregarding decision shortcuts during the inference phase. The
+proposed method effectively alleviates excessive dependence on potentially
+misleading spurious information. We conduct comparative analysis of the
+proposed method against various approaches which validates the significant
+superiority.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Scaling White-Box <span class="highlight-title">Transformer</span>s for Vision 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20299v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20299v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Jinrui Yang, Xianhang Li, Druv Pai, Yuyin Zhou, Yi Ma, Yaodong Yu, Cihang Xie
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  CRATE, a white-box transformer architecture designed to learn compressed and
+sparse representations, offers an intriguing alternative to standard vision
+transformers (ViTs) due to its inherent mathematical interpretability. Despite
+extensive investigations into the scaling behaviors of language and vision
+transformers, the scalability of CRATE remains an open question which this
+paper aims to address. Specifically, we propose CRATE-$\alpha$, featuring
+strategic yet minimal modifications to the sparse coding block in the CRATE
+architecture design, and a light training recipe designed to improve the
+scalability of CRATE. Through extensive experiments, we demonstrate that
+CRATE-$\alpha$ can effectively scale with larger model sizes and datasets. For
+example, our CRATE-$\alpha$-B substantially outperforms the prior best CRATE-B
+model accuracy on ImageNet classification by 3.7%, achieving an accuracy of
+83.2%. Meanwhile, when scaling further, our CRATE-$\alpha$-L obtains an
+ImageNet classification accuracy of 85.1%. More notably, these model
+performance improvements are achieved while preserving, and potentially even
+enhancing the interpretability of learned CRATE models, as we demonstrate
+through showing that the learned token representations of increasingly larger
+trained CRATE-$\alpha$ models yield increasingly higher-quality unsupervised
+object segmentation of images. The project page is
+https://rayjryang.github.io/CRATE-alpha/.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>project page: https://rayjryang.github.io/CRATE-alpha/</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ CoMat: Aligning Text-to-Image Diffusion Model with Image-to-Text Concept
+  Matching 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2404.03653v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2404.03653v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Dongzhi Jiang, Guanglu Song, Xiaoshi Wu, Renrui Zhang, Dazhong Shen, Zhuofan Zong, Yu Liu, Hongsheng Li
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Diffusion models have demonstrated great success in the field of
+text-to-image generation. However, alleviating the misalignment between the
+text prompts and images is still challenging. The root reason behind the
+misalignment has not been extensively investigated. We observe that the
+misalignment is caused by inadequate token attention activation. We further
+attribute this phenomenon to the diffusion model's insufficient condition
+utilization, which is caused by its training paradigm. To address the issue, we
+propose CoMat, an end-to-end diffusion model fine-tuning strategy with an
+image-to-text concept matching mechanism. We leverage an image captioning model
+to measure image-to-text alignment and guide the diffusion model to revisit
+ignored tokens. A novel attribute concentration module is also proposed to
+address the attribute binding problem. Without any image or human preference
+data, we use only 20K text prompts to fine-tune SDXL to obtain CoMat-SDXL.
+Extensive experiments show that CoMat-SDXL significantly outperforms the
+baseline model SDXL in two text-to-image alignment benchmarks and achieves
+start-of-the-art performance.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Project Page: https://caraj7.github.io/comat</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Stratified Avatar Generation from Sparse Observations <span class="chip">CVPR 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20786v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20786v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Han Feng, Wenchao Ma, Quankai Gao, Xianwei Zheng, Nan Xue, Huijuan Xu
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Estimating 3D full-body avatars from AR/VR devices is essential for creating
+immersive experiences in AR/VR applications. This task is challenging due to
+the limited input from Head Mounted Devices, which capture only sparse
+observations from the head and hands. Predicting the full-body avatars,
+particularly the lower body, from these sparse observations presents
+significant difficulties. In this paper, we are inspired by the inherent
+property of the kinematic tree defined in the Skinned Multi-Person Linear
+(SMPL) model, where the upper body and lower body share only one common
+ancestor node, bringing the potential of decoupled reconstruction. We propose a
+stratified approach to decouple the conventional full-body avatar
+reconstruction pipeline into two stages, with the reconstruction of the upper
+body first and a subsequent reconstruction of the lower body conditioned on the
+previous stage. To implement this straightforward idea, we leverage the latent
+diffusion model as a powerful probabilistic generator, and train it to follow
+the latent distribution of decoupled motions explored by a VQ-VAE
+encoder-decoder model. Extensive experiments on AMASS mocap dataset demonstrate
+our state-of-the-art performance in the reconstruction of full-body motions.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted by CVPR 2024 (Oral)</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Exploring Multi-Timestep Multi-Stage Diffusion Features for
+  Hyperspectral Image Classification 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2306.08964v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2306.08964v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Jingyi Zhou, Jiamu Sheng, Jiayuan Fan, Peng Ye, Tong He, Bin Wang, Tao Chen
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  The effectiveness of spectral-spatial feature learning is crucial for the
+hyperspectral image (HSI) classification task. Diffusion models, as a new class
+of groundbreaking generative models, have the ability to learn both contextual
+semantics and textual details from the distinct timestep dimension, enabling
+the modeling of complex spectral-spatial relations in HSIs. However, existing
+diffusion-based HSI classification methods only utilize manually selected
+single-timestep single-stage features, limiting the full exploration and
+exploitation of rich contextual semantics and textual information hidden in the
+diffusion model. To address this issue, we propose a novel diffusion-based
+feature learning framework that explores Multi-Timestep Multi-Stage Diffusion
+features for HSI classification for the first time, called MTMSD. Specifically,
+the diffusion model is first pretrained with unlabeled HSI patches to mine the
+connotation of unlabeled data, and then is used to extract the multi-timestep
+multi-stage diffusion features. To effectively and efficiently leverage
+multi-timestep multi-stage features,two strategies are further developed. One
+strategy is class & timestep-oriented multi-stage feature purification module
+with the inter-class and inter-timestep prior for reducing the redundancy of
+multi-stage features and alleviating memory constraints. The other one is
+selective timestep feature fusion module with the guidance of global features
+to adaptively select different timestep features for integrating texture and
+semantics. Both strategies facilitate the generality and adaptability of the
+MTMSD framework for diverse patterns of different HSI data. Extensive
+experiments are conducted on four public HSI datasets, and the results
+demonstrate that our method outperforms state-of-the-art methods for HSI
+classification, especially on the challenging Houston 2018 dataset.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Benchmarking Micro-action Recognition: <span class="highlight-title">Dataset</span>, Methods, and
+  Applications 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2403.05234v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2403.05234v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Dan Guo, Kun Li, Bin Hu, Yan Zhang, Meng Wang
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Micro-action is an imperceptible non-verbal behaviour characterised by
+low-intensity movement. It offers insights into the feelings and intentions of
+individuals and is important for human-oriented applications such as emotion
+recognition and psychological assessment. However, the identification,
+differentiation, and understanding of micro-actions pose challenges due to the
+imperceptible and inaccessible nature of these subtle human behaviors in
+everyday life. In this study, we innovatively collect a new micro-action
+dataset designated as Micro-action-52 (MA-52), and propose a benchmark named
+micro-action network (MANet) for micro-action recognition (MAR) task. Uniquely,
+MA-52 provides the whole-body perspective including gestures, upper- and
+lower-limb movements, attempting to reveal comprehensive micro-action cues. In
+detail, MA-52 contains 52 micro-action categories along with seven body part
+labels, and encompasses a full array of realistic and natural micro-actions,
+accounting for 205 participants and 22,422 video instances collated from the
+psychological interviews. Based on the proposed dataset, we assess MANet and
+other nine prevalent action recognition methods. MANet incorporates squeeze-and
+excitation (SE) and temporal shift module (TSM) into the ResNet architecture
+for modeling the spatiotemporal characteristics of micro-actions. Then a
+joint-embedding loss is designed for semantic matching between video and action
+labels; the loss is used to better distinguish between visually similar yet
+distinct micro-action categories. The extended application in emotion
+recognition has demonstrated one of the important values of our proposed
+dataset and method. In the future, further exploration of human behaviour,
+emotion, and psychological assessment will be conducted in depth. The dataset
+and source code are released at https://github.com/VUT-HFUT/Micro-Action.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted by IEEE Transactions on Circuits and Systems for Video
+  Technology</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ S4Fusion: Saliency-aware Selective State Space Model for Infrared
+  Visible Image Fusion 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20881v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20881v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Haolong Ma, Hui Li, Chunyang Cheng, Gaoang Wang, Xiaoning Song, Xiaojun Wu
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  As one of the tasks in Image Fusion, Infrared and Visible Image Fusion aims
+to integrate complementary information captured by sensors of different
+modalities into a single image. The Selective State Space Model (SSSM), known
+for its ability to capture long-range dependencies, has demonstrated its
+potential in the field of computer vision. However, in image fusion, current
+methods underestimate the potential of SSSM in capturing the global spatial
+information of both modalities. This limitation prevents the simultaneous
+consideration of the global spatial information from both modalities during
+interaction, leading to a lack of comprehensive perception of salient targets.
+Consequently, the fusion results tend to bias towards one modality instead of
+adaptively preserving salient targets. To address this issue, we propose the
+Saliency-aware Selective State Space Fusion Model (S4Fusion). In our S4Fusion,
+the designed Cross-Modal Spatial Awareness Module (CMSA) can simultaneously
+focus on global spatial information from both modalities while facilitating
+their interaction, thereby comprehensively capturing complementary information.
+Additionally, S4Fusion leverages a pre-trained network to perceive uncertainty
+in the fused images. By minimizing this uncertainty, S4Fusion adaptively
+highlights salient targets from both images. Extensive experiments demonstrate
+that our approach produces high-quality images and enhances performance in
+downstream tasks.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Characteristic Guidance: Non-linear Correction for Diffusion Model at
+  Large Guidance Scale 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2312.07586v5">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2312.07586v5.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Candi Zheng, Yuan Lan
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Popular guidance for denoising diffusion probabilistic model (DDPM) linearly
+combines distinct conditional models together to provide enhanced control over
+samples. However, this approach overlooks nonlinear effects that become
+significant when guidance scale is large. To address this issue, we propose
+characteristic guidance, a guidance method that provides first-principle
+non-linear correction for classifier-free guidance. Such correction forces the
+guided DDPMs to respect the Fokker-Planck (FP) equation of diffusion process,
+in a way that is training-free and compatible with existing sampling methods.
+Experiments show that characteristic guidance enhances semantic characteristics
+of prompts and mitigate irregularities in image generation, proving effective
+in diverse applications ranging from simulating magnet phase transitions to
+latent space sampling.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>8 pages, 7 figures</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ TempCompass: Do Video LLMs Really Understand Videos? 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2403.00476v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2403.00476v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Yuanxin Liu, Shicheng Li, Yi Liu, Yuxiang Wang, Shuhuai Ren, Lei Li, Sishuo Chen, Xu Sun, Lu Hou
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Recently, there is a surge in interest surrounding video large language
+models (Video LLMs). However, existing benchmarks fail to provide a
+comprehensive feedback on the temporal perception ability of Video LLMs. On the
+one hand, most of them are unable to distinguish between different temporal
+aspects (e.g., speed, direction) and thus cannot reflect the nuanced
+performance on these specific aspects. On the other hand, they are limited in
+the diversity of task formats (e.g., only multi-choice QA), which hinders the
+understanding of how temporal perception performance may vary across different
+types of tasks. Motivated by these two problems, we propose the
+\textbf{TempCompass} benchmark, which introduces a diversity of temporal
+aspects and task formats. To collect high-quality test data, we devise two
+novel strategies: (1) In video collection, we construct conflicting videos that
+share the same static content but differ in a specific temporal aspect, which
+prevents Video LLMs from leveraging single-frame bias or language priors. (2)
+To collect the task instructions, we propose a paradigm where humans first
+annotate meta-information for a video and then an LLM generates the
+instruction. We also design an LLM-based approach to automatically and
+accurately evaluate the responses from Video LLMs. Based on TempCompass, we
+comprehensively evaluate 8 state-of-the-art (SOTA) Video LLMs and 3 Image LLMs,
+and reveal the discerning fact that these models exhibit notably poor temporal
+perception ability. Our data will be available at
+https://github.com/llyx97/TempCompass.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ The Effectiveness of a Simplified Model Structure for Crowd Counting 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2404.07847v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2404.07847v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Lei Chen, Xinghang Gao, Fei Chao, Chih Min Lin, Xingen Gao, Hongyi Zhang, Juqiang Lin
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  In the field of crowd counting research, many recent deep learning based
+methods have demonstrated robust capabilities for accurately estimating crowd
+sizes. However, the enhancement in their performance often arises from an
+increase in the complexity of the model structure. This paper discusses how to
+construct high-performance crowd counting models using only simple structures.
+We proposes the Fuss-Free Network (FFNet) that is characterized by its simple
+and efficieny structure, consisting of only a backbone network and a
+multi-scale feature fusion structure. The multi-scale feature fusion structure
+is a simple structure consisting of three branches, each only equipped with a
+focus transition module, and combines the features from these branches through
+the concatenation operation. Our proposed crowd counting model is trained and
+evaluated on four widely used public datasets, and it achieves accuracy that is
+comparable to that of existing complex models. Furthermore, we conduct a
+comprehensive evaluation by replacing the existing backbones of various models
+such as FFNet and CCTrans with different networks, including MobileNet-v3,
+ConvNeXt-Tiny, and Swin-Transformer-Small. The experimental results further
+indicate that excellent crowd counting performance can be achieved with the
+simplied structure proposed by us.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Continual Learning: Forget-free Winning Subnetworks for Video
+  Representations 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2312.11973v4">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2312.11973v4.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Haeyong Kang, Jaehong Yoon, Sung Ju Hwang, Chang D. Yoo
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Inspired by the Lottery Ticket Hypothesis (LTH), which highlights the
+existence of efficient subnetworks within larger, dense networks, a
+high-performing Winning Subnetwork (WSN) in terms of task performance under
+appropriate sparsity conditions is considered for various continual learning
+tasks. It leverages pre-existing weights from dense networks to achieve
+efficient learning in Task Incremental Learning (TIL) and Task-agnostic
+Incremental Learning (TaIL) scenarios. In Few-Shot Class Incremental Learning
+(FSCIL), a variation of WSN referred to as the Soft subnetwork (SoftNet) is
+designed to prevent overfitting when the data samples are scarce. Furthermore,
+the sparse reuse of WSN weights is considered for Video Incremental Learning
+(VIL). The use of Fourier Subneural Operator (FSO) within WSN is considered. It
+enables compact encoding of videos and identifies reusable subnetworks across
+varying bandwidths. We have integrated FSO into different architectural
+frameworks for continual learning, including VIL, TIL, and FSCIL. Our
+comprehensive experiments demonstrate FSO's effectiveness, significantly
+improving task performance at various convolutional representational levels.
+Specifically, FSO enhances higher-layer performance in TIL and FSCIL and
+lower-layer performance in VIL.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>arXiv admin note: substantial text overlap with arXiv:2303.14962,
+  arXiv:2306.11305</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Spectral <span class="highlight-title">Prompt</span> Tuning:Unveiling Unseen Classes for Zero-Shot Semantic
+  Segmentation <span class="chip">AAAI2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2312.12754v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2312.12754v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Wenhao Xu, Rongtao Xu, Changwei Wang, Shibiao Xu, Li Guo, Man Zhang, Xiaopeng Zhang
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Recently, CLIP has found practical utility in the domain of pixel-level
+zero-shot segmentation tasks. The present landscape features two-stage
+methodologies beset by issues such as intricate pipelines and elevated
+computational costs. While current one-stage approaches alleviate these
+concerns and incorporate Visual Prompt Training (VPT) to uphold CLIP's
+generalization capacity, they still fall short in fully harnessing CLIP's
+potential for pixel-level unseen class demarcation and precise pixel
+predictions. To further stimulate CLIP's zero-shot dense prediction capability,
+we propose SPT-SEG, a one-stage approach that improves CLIP's adaptability from
+image to pixel. Specifically, we initially introduce Spectral Prompt Tuning
+(SPT), incorporating spectral prompts into the CLIP visual encoder's shallow
+layers to capture structural intricacies of images, thereby enhancing
+comprehension of unseen classes. Subsequently, we introduce the Spectral Guided
+Decoder (SGD), utilizing both high and low-frequency information to steer the
+network's spatial focus towards more prominent classification features,
+enabling precise pixel-level prediction outcomes. Through extensive experiments
+on two public datasets, we demonstrate the superiority of our method over
+state-of-the-art approaches, performing well across all classes and
+particularly excelling in handling unseen classes. Code is available
+at:https://github.com/clearxu/SPT.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>AAAI2024 Accepted</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Towards Calibrated Deep Clustering Network 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2403.02998v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2403.02998v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Yuheng Jia, Jianhong Cheng, Hui Liu, Junhui Hou
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Deep clustering has exhibited remarkable performance; however, the
+over-confidence problem, i.e., the estimated confidence for a sample belonging
+to a particular cluster greatly exceeds its actual prediction accuracy, has
+been overlooked in prior research. To tackle this critical issue, we pioneer
+the development of a calibrated deep clustering framework. Specifically, we
+propose a novel dual-head (calibration head and clustering head) deep
+clustering model that can effectively calibrate the estimated confidence and
+the actual accuracy. The calibration head adjusts the overconfident predictions
+of the clustering head, generating prediction confidence that match the model
+learning status. Then, the clustering head dynamically select reliable
+high-confidence samples estimated by the calibration head for pseudo-label
+self-training. Additionally, we introduce an effective network initialization
+strategy that enhances both training speed and network robustness. The
+effectiveness of the proposed calibration approach and initialization strategy
+are both endorsed with solid theoretical guarantees. Extensive experiments
+demonstrate the proposed calibrated deep clustering model not only surpasses
+state-of-the-art deep clustering methods by 10 times in terms of expected
+calibration error but also significantly outperforms them in terms of
+clustering accuracy.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ FIFO-Diffusion: Generating Infinite Videos from Text without Training 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.11473v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.11473v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Jihwan Kim, Junoh Kang, Jinyoung Choi, Bohyung Han
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  We propose a novel inference technique based on a pretrained diffusion model
+for text-conditional video generation. Our approach, called FIFO-Diffusion, is
+conceptually capable of generating infinitely long videos without additional
+training. This is achieved by iteratively performing diagonal denoising, which
+concurrently processes a series of consecutive frames with increasing noise
+levels in a queue; our method dequeues a fully denoised frame at the head while
+enqueuing a new random noise frame at the tail. However, diagonal denoising is
+a double-edged sword as the frames near the tail can take advantage of cleaner
+ones by forward reference but such a strategy induces the discrepancy between
+training and inference. Hence, we introduce latent partitioning to reduce the
+training-inference gap and lookahead denoising to leverage the benefit of
+forward referencing. Practically, FIFO-Diffusion consumes a constant amount of
+memory regardless of the target video length given a baseline model, while
+well-suited for parallel inference on multiple GPUs. We have demonstrated the
+promising results and effectiveness of the proposed methods on existing
+text-to-video generation baselines. Generated video samples and source codes
+are available at our project page.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Project Page: https://jjihwan.github.io/projects/FIFO-Diffusion</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ StrucTexTv3: An Efficient Vision-Language Model for Text-rich Image
+  Perception, Comprehension, and Beyond 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.21013v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.21013v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Pengyuan Lyu, Yulin Li, Hao Zhou, Weihong Ma, Xingyu Wan, Qunyi Xie, Liang Wu, Chengquan Zhang, Kun Yao, Errui Ding, Jingdong Wang
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Text-rich images have significant and extensive value, deeply integrated into
+various aspects of human life. Notably, both visual cues and linguistic symbols
+in text-rich images play crucial roles in information transmission but are
+accompanied by diverse challenges. Therefore, the efficient and effective
+understanding of text-rich images is a crucial litmus test for the capability
+of Vision-Language Models. We have crafted an efficient vision-language model,
+StrucTexTv3, tailored to tackle various intelligent tasks for text-rich images.
+The significant design of StrucTexTv3 is presented in the following aspects:
+Firstly, we adopt a combination of an effective multi-scale reduced visual
+transformer and a multi-granularity token sampler (MG-Sampler) as a visual
+token generator, successfully solving the challenges of high-resolution input
+and complex representation learning for text-rich images. Secondly, we enhance
+the perception and comprehension abilities of StrucTexTv3 through instruction
+learning, seamlessly integrating various text-oriented tasks into a unified
+framework. Thirdly, we have curated a comprehensive collection of high-quality
+text-rich images, abbreviated as TIM-30M, encompassing diverse scenarios like
+incidental scenes, office documents, web pages, and screenshots, thereby
+improving the robustness of our model. Our method achieved SOTA results in
+text-rich image perception tasks, and significantly improved performance in
+comprehension tasks. Among multimodal models with LLM decoder of approximately
+1.8B parameters, it stands out as a leader, which also makes the deployment of
+edge devices feasible. In summary, the StrucTexTv3 model, featuring efficient
+structural design, outstanding performance, and broad adaptability, offers
+robust support for diverse intelligent application tasks involving text-rich
+images, thus exhibiting immense potential for widespread application.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ EGTR: Extracting Graph from <span class="highlight-title">Transformer</span> for Scene Graph Generation <span class="chip">CVPR 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2404.02072v4">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2404.02072v4.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Jinbae Im, JeongYeon Nam, Nokyung Park, Hyungmin Lee, Seunghyun Park
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Scene Graph Generation (SGG) is a challenging task of detecting objects and
+predicting relationships between objects. After DETR was developed, one-stage
+SGG models based on a one-stage object detector have been actively studied.
+However, complex modeling is used to predict the relationship between objects,
+and the inherent relationship between object queries learned in the multi-head
+self-attention of the object detector has been neglected. We propose a
+lightweight one-stage SGG model that extracts the relation graph from the
+various relationships learned in the multi-head self-attention layers of the
+DETR decoder. By fully utilizing the self-attention by-products, the relation
+graph can be extracted effectively with a shallow relation extraction head.
+Considering the dependency of the relation extraction task on the object
+detection task, we propose a novel relation smoothing technique that adjusts
+the relation label adaptively according to the quality of the detected objects.
+By the relation smoothing, the model is trained according to the continuous
+curriculum that focuses on object detection task at the beginning of training
+and performs multi-task learning as the object detection performance gradually
+improves. Furthermore, we propose a connectivity prediction task that predicts
+whether a relation exists between object pairs as an auxiliary task of the
+relation extraction. We demonstrate the effectiveness and efficiency of our
+method for the Visual Genome and Open Image V6 datasets. Our code is publicly
+available at https://github.com/naver-ai/egtr.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>CVPR 2024 (Best paper award candidate)</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ BioFusionNet: Deep Learning-Based Survival Risk Stratification in ER+
+  Breast Cancer Through Multifeature and Multimodal Data Fusion 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2402.10717v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2402.10717v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Raktim Kumar Mondol, Ewan K. A. Millar, Arcot Sowmya, Erik Meijering
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Breast cancer is a significant health concern affecting millions of women
+worldwide. Accurate survival risk stratification plays a crucial role in
+guiding personalised treatment decisions and improving patient outcomes. Here
+we present BioFusionNet, a deep learning framework that fuses image-derived
+features with genetic and clinical data to obtain a holistic profile and
+achieve survival risk stratification of ER+ breast cancer patients. We employ
+multiple self-supervised feature extractors (DINO and MoCoV3) pretrained on
+histopathological patches to capture detailed image features. These features
+are then fused by a variational autoencoder and fed to a self-attention network
+generating patient-level features. A co-dual-cross-attention mechanism combines
+the histopathological features with genetic data, enabling the model to capture
+the interplay between them. Additionally, clinical data is incorporated using a
+feed-forward network, further enhancing predictive performance and achieving
+comprehensive multimodal feature integration. Furthermore, we introduce a
+weighted Cox loss function, specifically designed to handle imbalanced survival
+data, which is a common challenge. Our model achieves a mean concordance index
+of 0.77 and a time-dependent area under the curve of 0.84, outperforming
+state-of-the-art methods. It predicts risk (high versus low) with prognostic
+significance for overall survival in univariate analysis (HR=2.99, 95% CI:
+1.88--4.78, p<0.005), and maintains independent significance in multivariate
+analysis incorporating standard clinicopathological variables (HR=2.91, 95\%
+CI: 1.80--4.68, p<0.005).
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Keywords: Multimodal Fusion, Breast Cancer, Whole Slide Images, Deep
+  Neural Network, Survival Prediction</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ E$^{2}$GAN: Efficient Training of Efficient GANs for Image-to-Image
+  Translation <span class="chip">ICML 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2401.06127v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2401.06127v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Yifan Gong, Zheng Zhan, Qing Jin, Yanyu Li, Yerlan Idelbayev, Xian Liu, Andrey Zharkov, Kfir Aberman, Sergey Tulyakov, Yanzhi Wang, Jian Ren
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  One highly promising direction for enabling flexible real-time on-device
+image editing is utilizing data distillation by leveraging large-scale
+text-to-image diffusion models to generate paired datasets used for training
+generative adversarial networks (GANs). This approach notably alleviates the
+stringent requirements typically imposed by high-end commercial GPUs for
+performing image editing with diffusion models. However, unlike text-to-image
+diffusion models, each distilled GAN is specialized for a specific image
+editing task, necessitating costly training efforts to obtain models for
+various concepts. In this work, we introduce and address a novel research
+direction: can the process of distilling GANs from diffusion models be made
+significantly more efficient? To achieve this goal, we propose a series of
+innovative techniques. First, we construct a base GAN model with generalized
+features, adaptable to different concepts through fine-tuning, eliminating the
+need for training from scratch. Second, we identify crucial layers within the
+base GAN model and employ Low-Rank Adaptation (LoRA) with a simple yet
+effective rank search process, rather than fine-tuning the entire base model.
+Third, we investigate the minimal amount of data necessary for fine-tuning,
+further reducing the overall training time. Extensive experiments show that we
+can efficiently empower GANs with the ability to perform real-time high-quality
+image editing on mobile devices with remarkably reduced training and storage
+costs for each concept.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>ICML 2024. Project Page: https://yifanfanfanfan.github.io/e2gan/</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ PhyRecon: Physically Plausible Neural Scene Reconstruction 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2404.16666v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2404.16666v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Junfeng Ni, Yixin Chen, Bohan Jing, Nan Jiang, Bin Wang, Bo Dai, Puhao Li, Yixin Zhu, Song-Chun Zhu, Siyuan Huang
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Neural implicit representations have gained popularity in multi-view 3D
+reconstruction. However, most previous work struggles to yield physically
+plausible results, limiting their utility in domains requiring rigorous
+physical accuracy, such as embodied AI and robotics. This lack of plausibility
+stems from the absence of physics modeling in existing methods and their
+inability to recover intricate geometrical structures. In this paper, we
+introduce PhyRecon, the first approach to leverage both differentiable
+rendering and differentiable physics simulation to learn implicit surface
+representations. PhyRecon features a novel differentiable particle-based
+physical simulator built on neural implicit representations. Central to this
+design is an efficient transformation between SDF-based implicit
+representations and explicit surface points via our proposed Surface Points
+Marching Cubes (SP-MC), enabling differentiable learning with both rendering
+and physical losses. Additionally, PhyRecon models both rendering and physical
+uncertainty to identify and compensate for inconsistent and inaccurate
+monocular geometric priors. This physical uncertainty further facilitates a
+novel physics-guided pixel sampling to enhance the learning of slender
+structures. By integrating these techniques, our model supports differentiable
+joint modeling of appearance, geometry, and physics. Extensive experiments
+demonstrate that PhyRecon significantly outperforms all state-of-the-art
+methods. Our results also exhibit superior physical stability in physical
+simulators, with at least a 40% improvement across all datasets, paving the way
+for future physics-based applications.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>project page: https://phyrecon.github.io/</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ HemoSet: The First Blood Segmentation <span class="highlight-title">Dataset</span> for Automation of
+  Hemostasis Management 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2403.16286v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2403.16286v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Albert J. Miao, Shan Lin, Jingpei Lu, Florian Richter, Benjamin Ostrander, Emily K. Funk, Ryan K. Orosco, Michael C. Yip
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Hemorrhaging occurs in surgeries of all types, forcing surgeons to quickly
+adapt to the visual interference that results from blood rapidly filling the
+surgical field. Introducing automation into the crucial surgical task of
+hemostasis management would offload mental and physical tasks from the surgeon
+and surgical assistants while simultaneously increasing the efficiency and
+safety of the operation. The first step in automation of hemostasis management
+is detection of blood in the surgical field. To propel the development of blood
+detection algorithms in surgeries, we present HemoSet, the first blood
+segmentation dataset based on bleeding during a live animal robotic surgery.
+Our dataset features vessel hemorrhage scenarios where turbulent flow leads to
+abnormal pooling geometries in surgical fields. These pools are formed in
+conditions endemic to surgical procedures -- uneven heterogeneous tissue, under
+glossy lighting conditions and rapid tool movement. We benchmark several
+state-of-the-art segmentation models and provide insight into the difficulties
+specific to blood detection. We intend for HemoSet to spur development of
+autonomous blood suction tools by providing a platform for training and
+refining blood segmentation models, addressing the precision needed for such
+robotics.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Brain Imaging-to-Graph Generation using Adversarial Hierarchical
+  Diffusion Models for MCI Causality Analysis 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2305.10754v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2305.10754v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Qiankun Zuo, Hao Tian, Chi-Man Pun, Hongfei Wang, Yudong Zhang, Jin Hong
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Effective connectivity can describe the causal patterns among brain regions.
+These patterns have the potential to reveal the pathological mechanism and
+promote early diagnosis and effective drug development for cognitive disease.
+However, the current methods utilize software toolkits to extract empirical
+features from brain imaging to estimate effective connectivity. These methods
+heavily rely on manual parameter settings and may result in large errors during
+effective connectivity estimation. In this paper, a novel brain
+imaging-to-graph generation (BIGG) framework is proposed to map functional
+magnetic resonance imaging (fMRI) into effective connectivity for mild
+cognitive impairment (MCI) analysis. To be specific, the proposed BIGG
+framework is based on the diffusion denoising probabilistic models (DDPM),
+where each denoising step is modeled as a generative adversarial network (GAN)
+to progressively translate the noise and conditional fMRI to effective
+connectivity. The hierarchical transformers in the generator are designed to
+estimate the noise at multiple scales. Each scale concentrates on both spatial
+and temporal information between brain regions, enabling good quality in noise
+removal and better inference of causal relations. Meanwhile, the
+transformer-based discriminator constrains the generator to further capture
+global and local patterns for improving high-quality and diversity generation.
+By introducing the diffusive factor, the denoising inference with a large
+sampling step size is more efficient and can maintain high-quality results for
+effective connectivity generation. Evaluations of the ADNI dataset demonstrate
+the feasibility and efficacy of the proposed model. The proposed model not only
+achieves superior prediction performance compared with other competing methods
+but also predicts MCI-related causal connections that are consistent with
+clinical studies.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>10 pages, 12 figures</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ II-MMR: Identifying and Improving Multi-modal Multi-hop Reasoning in
+  Visual Question Answering <span class="chip">ACL 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2402.11058v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2402.11058v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Jihyung Kil, Farideh Tavazoee, Dongyeop Kang, Joo-Kyung Kim
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Visual Question Answering (VQA) often involves diverse reasoning scenarios
+across Vision and Language (V&L). Most prior VQA studies, however, have merely
+focused on assessing the model's overall accuracy without evaluating it on
+different reasoning cases. Furthermore, some recent works observe that
+conventional Chain-of-Thought (CoT) prompting fails to generate effective
+reasoning for VQA, especially for complex scenarios requiring multi-hop
+reasoning. In this paper, we propose II-MMR, a novel idea to identify and
+improve multi-modal multi-hop reasoning in VQA. In specific, II-MMR takes a VQA
+question with an image and finds a reasoning path to reach its answer using two
+novel language promptings: (i) answer prediction-guided CoT prompt, or (ii)
+knowledge triplet-guided prompt. II-MMR then analyzes this path to identify
+different reasoning cases in current VQA benchmarks by estimating how many hops
+and what types (i.e., visual or beyond-visual) of reasoning are required to
+answer the question. On popular benchmarks including GQA and A-OKVQA, II-MMR
+observes that most of their VQA questions are easy to answer, simply demanding
+"single-hop" reasoning, whereas only a few questions require "multi-hop"
+reasoning. Moreover, while the recent V&L model struggles with such complex
+multi-hop reasoning questions even using the traditional CoT method, II-MMR
+shows its effectiveness across all reasoning cases in both zero-shot and
+fine-tuning settings.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted to ACL 2024 Findings</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ REBUS: A Robust Evaluation Benchmark of Understanding Symbols 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2401.05604v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2401.05604v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Andrew Gritsevskiy, Arjun Panickssery, Aaron Kirtland, Derik Kauffman, Hans Gundlach, Irina Gritsevskaya, Joe Cavanagh, Jonathan Chiang, Lydia La Roux, Michelle Hung
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  We propose a new benchmark evaluating the performance of multimodal large
+language models on rebus puzzles. The dataset covers 333 original examples of
+image-based wordplay, cluing 13 categories such as movies, composers, major
+cities, and food. To achieve good performance on the benchmark of identifying
+the clued word or phrase, models must combine image recognition and string
+manipulation with hypothesis testing, multi-step reasoning, and an
+understanding of human cognition, making for a complex, multimodal evaluation
+of capabilities. We find that GPT-4o significantly outperforms all other
+models, followed by proprietary models outperforming all other evaluated
+models. However, even the best model has a final accuracy of only 42\%, which
+goes down to just 7\% on hard puzzles, highlighting the need for substantial
+improvements in reasoning. Further, models rarely understand all parts of a
+puzzle, and are almost always incapable of retroactively explaining the correct
+answer. Our benchmark can therefore be used to identify major shortcomings in
+the knowledge and reasoning of multimodal large language models.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>20 pages, 5 figures. For code, see http://github.com/cvndsh/rebus</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Compositional Generative Modeling: A Single Model is Not All You Need <span class="chip">ICML 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2402.01103v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2402.01103v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Yilun Du, Leslie Kaelbling
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Large monolithic generative models trained on massive amounts of data have
+become an increasingly dominant approach in AI research. In this paper, we
+argue that we should instead construct large generative systems by composing
+smaller generative models together. We show how such a compositional generative
+approach enables us to learn distributions in a more data-efficient manner,
+enabling generalization to parts of the data distribution unseen at training
+time. We further show how this enables us to program and construct new
+generative models for tasks completely unseen at training. Finally, we show
+that in many cases, we can discover separate compositional components from
+data.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>ICML 2024 (Position Track)</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ BaboonLand <span class="highlight-title">Dataset</span>: Tracking Primates in the Wild and Automating
+  Behaviour Recognition from Drone Videos 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.17698v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.17698v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Isla Duporge, Maksim Kholiavchenko, Roi Harel, Scott Wolf, Dan Rubenstein, Meg Crofoot, Tanya Berger-Wolf, Stephen Lee, Julie Barreau, Jenna Kline, Michelle Ramirez, Charles Stewart
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Using drones to track multiple individuals simultaneously in their natural
+environment is a powerful approach for better understanding group primate
+behavior. Previous studies have demonstrated that it is possible to automate
+the classification of primate behavior from video data, but these studies have
+been carried out in captivity or from ground-based cameras. To understand group
+behavior and the self-organization of a collective, the whole troop needs to be
+seen at a scale where behavior can be seen in relation to the natural
+environment in which ecological decisions are made. This study presents a novel
+dataset from drone videos for baboon detection, tracking, and behavior
+recognition. The baboon detection dataset was created by manually annotating
+all baboons in drone videos with bounding boxes. A tiling method was
+subsequently applied to create a pyramid of images at various scales from the
+original 5.3K resolution images, resulting in approximately 30K images used for
+baboon detection. The tracking dataset is derived from the detection dataset,
+where all bounding boxes are assigned the same ID throughout the video. This
+process resulted in half an hour of very dense tracking data. The behavior
+recognition dataset was generated by converting tracks into mini-scenes, a
+video subregion centered on each animal; each mini-scene was manually annotated
+with 12 distinct behavior types, resulting in over 20 hours of data. Benchmark
+results show mean average precision (mAP) of 92.62\% for the YOLOv8-X detection
+model, multiple object tracking precision (MOTA) of 63.81\% for the BotSort
+tracking algorithm, and micro top-1 accuracy of 63.97\% for the X3D behavior
+recognition model. Using deep learning to classify wildlife behavior from drone
+footage facilitates non-invasive insight into the collective behavior of an
+entire group.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Dataset will be published shortly</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Adaptive Hierarchical Certification for Segmentation using Randomized
+  Smoothing 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2402.08400v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2402.08400v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Alaa Anani, Tobias Lorenz, Bernt Schiele, Mario Fritz
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Certification for machine learning is proving that no adversarial sample can
+evade a model within a range under certain conditions, a necessity for
+safety-critical domains. Common certification methods for segmentation use a
+flat set of fine-grained classes, leading to high abstain rates due to model
+uncertainty across many classes. We propose a novel, more practical setting,
+which certifies pixels within a multi-level hierarchy, and adaptively relaxes
+the certification to a coarser level for unstable components classic methods
+would abstain from, effectively lowering the abstain rate whilst providing more
+certified semantically meaningful information. We mathematically formulate the
+problem setup, introduce an adaptive hierarchical certification algorithm and
+prove the correctness of its guarantees. Since certified accuracy does not take
+the loss of information into account for coarser classes, we introduce the
+Certified Information Gain ($\mathrm{CIG}$) metric, which is proportional to
+the class granularity level. Our extensive experiments on the datasets
+Cityscapes, PASCAL-Context, ACDC and COCO-Stuff demonstrate that our adaptive
+algorithm achieves a higher $\mathrm{CIG}$ and lower abstain rate compared to
+the current state-of-the-art certification method. Our code can be found here:
+https://github.com/AlaaAnani/adaptive-certify.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Dreaming of Electrical Waves: Generative Modeling of Cardiac Excitation
+  Waves using Diffusion Models 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2312.14830v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2312.14830v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Tanish Baranwal, Jan Lebert, Jan Christoph
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Electrical waves in the heart form rotating spiral or scroll waves during
+life-threatening arrhythmias such as atrial or ventricular fibrillation. The
+wave dynamics are typically modeled using coupled partial differential
+equations, which describe reaction-diffusion dynamics in excitable media. More
+recently, data-driven generative modeling has emerged as an alternative to
+generate spatio-temporal patterns in physical and biological systems. Here, we
+explore denoising diffusion probabilistic models for the generative modeling of
+electrical wave patterns in cardiac tissue. We trained diffusion models with
+simulated electrical wave patterns to be able to generate such wave patterns in
+unconditional and conditional generation tasks. For instance, we explored the
+diffusion-based i) parameter-specific generation, ii) evolution and iii)
+inpainting of spiral wave dynamics, including reconstructing three-dimensional
+scroll wave dynamics from superficial two-dimensional measurements. Further, we
+generated arbitrarily shaped bi-ventricular geometries and simultaneously
+initiated scroll wave patterns inside these geometries using diffusion. We
+characterized and compared the diffusion-generated solutions to solutions
+obtained with corresponding biophysical models and found that diffusion models
+learn to replicate spiral and scroll waves dynamics so well that they could be
+used for data-driven modeling of excitation waves in cardiac tissue. For
+instance, an ensemble of diffusion-generated spiral wave dynamics exhibits
+similar self-termination statistics as the corresponding ensemble simulated
+with a biophysical model. However, we also found that diffusion models {produce
+artifacts if training data is lacking, e.g. during self-termination,} and
+`hallucinate' wave patterns when insufficiently constrained.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Ground-based image deconvolution with Swin <span class="highlight-title">Transformer</span> UNet 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.07842v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.07842v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Utsav Akhaury, Pascale Jablonka, Jean-Luc Starck, Frédéric Courbin
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  As ground-based all-sky astronomical surveys will gather millions of images
+in the coming years, a critical requirement emerges for the development of fast
+deconvolution algorithms capable of efficiently improving the spatial
+resolution of these images. By successfully recovering clean and
+high-resolution images from these surveys, the objective is to deepen the
+understanding of galaxy formation and evolution through accurate photometric
+measurements. We introduce a two-step deconvolution framework using a Swin
+Transformer architecture. Our study reveals that the deep learning-based
+solution introduces a bias, constraining the scope of scientific analysis. To
+address this limitation, we propose a novel third step relying on the active
+coefficients in the sparsity wavelet framework. We conducted a performance
+comparison between our deep learning-based method and Firedec, a classical
+deconvolution algorithm, based on an analysis of a subset of the EDisCS cluster
+samples. We demonstrate the advantage of our method in terms of resolution
+recovery, generalisation to different noise properties, and computational
+efficiency. The analysis of this cluster sample not only allowed us to assess
+the efficiency of our method, but it also enabled us to quantify the number of
+clumps within these galaxies in relation to their disc colour. This robust
+technique that we propose holds promise for identifying structures in the
+distant universe through ground-based images.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>11 pages, 14 figures</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ VQPy: An Object-Oriented Approach to Modern Video Analytics 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2311.01623v4">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2311.01623v4.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Shan Yu, Zhenting Zhu, Yu Chen, Hanchen Xu, Pengzhan Zhao, Yang Wang, Arthi Padmanabhan, Hugo Latapie, Harry Xu
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Video analytics is widely used in contemporary systems and services. At the
+forefront of video analytics are video queries that users develop to find
+objects of particular interest. Building upon the insight that video objects
+(e.g., human, animals, cars, etc.), the center of video analytics, are similar
+in spirit to objects modeled by traditional object-oriented languages, we
+propose to develop an object-oriented approach to video analytics. This
+approach, named VQPy, consists of a frontend$\unicode{x2015}$a Python variant
+with constructs that make it easy for users to express video objects and their
+interactions$\unicode{x2015}$as well as an extensible backend that can
+automatically construct and optimize pipelines based on video objects. We have
+implemented and open-sourced VQPy, which has been productized in Cisco as part
+of its DeepVision framework.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>MLSys'24</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Physically Compatible 3D Object Modeling from a Single Image 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20510v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20510v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Minghao Guo, Bohan Wang, Pingchuan Ma, Tianyuan Zhang, Crystal Elaine Owens, Chuang Gan, Joshua B. Tenenbaum, Kaiming He, Wojciech Matusik
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  We present a computational framework that transforms single images into 3D
+physical objects. The visual geometry of a physical object in an image is
+determined by three orthogonal attributes: mechanical properties, external
+forces, and rest-shape geometry. Existing single-view 3D reconstruction methods
+often overlook this underlying composition, presuming rigidity or neglecting
+external forces. Consequently, the reconstructed objects fail to withstand
+real-world physical forces, resulting in instability or undesirable deformation
+-- diverging from their intended designs as depicted in the image. Our
+optimization framework addresses this by embedding physical compatibility into
+the reconstruction process. We explicitly decompose the three physical
+attributes and link them through static equilibrium, which serves as a hard
+constraint, ensuring that the optimized physical shapes exhibit desired
+physical behaviors. Evaluations on a dataset collected from Objaverse
+demonstrate that our framework consistently enhances the physical realism of 3D
+models over existing methods. The utility of our framework extends to practical
+applications in dynamic simulations and 3D printing, where adherence to
+physical compatibility is paramount.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                    </div>
+                </details>
+            </article>
+            <article>
+                <details>
+                    <Summary>
+                        Information Retrieval <span class="chip" style="font-size: 60%">16</span>
+                    </Summary>
+                    <div class="details-content">
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Session Context Embedding for Intent Understanding in Product Search <span class="chip">SIGIR 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.01702v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.01702v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Navid Mehrdad, Vishal Rathi, Sravanthi Rajanala
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  It is often noted that single query-item pair relevance training in search
+does not capture the customer intent. User intent can be better deduced from a
+series of engagements (Clicks, ATCs, Orders) in a given search session. We
+propose a novel method for vectorizing session context for capturing and
+utilizing context in retrieval and rerank. In the runtime, session embedding is
+an alternative to query embedding, saved and updated after each request in the
+session, it can be used for retrieval and ranking. We outline session
+embedding's solution to session-based intent understanding and its
+architecture, the background to this line of thought in search and
+recommendation, detail the methodologies implemented, and finally present the
+results of an implementation of session embedding for query product type
+classification. We demonstrate improvements over strategies ignoring session
+context in the runtime for user intent understanding.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>5 pages, 1 Figure, 5 Tables, SIGIR 2024, LLM for Individuals, Groups,
+  and Society</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Privacy in LLM-based Recommendation: Recent Advances and Future
+  Directions 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.01363v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.01363v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Sichun Luo, Wei Shao, Yuxuan Yao, Jian Xu, Mingyang Liu, Qintong Li, Bowei He, Maolin Wang, Guanzhi Deng, Hanxu Hou, Xinyi Zhang, Linqi Song
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Nowadays, large language models (LLMs) have been integrated with conventional
+recommendation models to improve recommendation performance. However, while
+most of the existing works have focused on improving the model performance, the
+privacy issue has only received comparatively less attention. In this paper, we
+review recent advancements in privacy within LLM-based recommendation,
+categorizing them into privacy attacks and protection mechanisms. Additionally,
+we highlight several challenges and propose future directions for the community
+to address these critical problems.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Large Language Models as Recommender Systems: A Study of Popularity Bias <span class="chip">SIGIR24</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.01285v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.01285v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Jan Malte Lichtenberg, Alexander Buchholz, Pola Schwöbel
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  The issue of popularity bias -- where popular items are disproportionately
+recommended, overshadowing less popular but potentially relevant items --
+remains a significant challenge in recommender systems. Recent advancements
+have seen the integration of general-purpose Large Language Models (LLMs) into
+the architecture of such systems. This integration raises concerns that it
+might exacerbate popularity bias, given that the LLM's training data is likely
+dominated by popular items. However, it simultaneously presents a novel
+opportunity to address the bias via prompt tuning. Our study explores this
+dichotomy, examining whether LLMs contribute to or can alleviate popularity
+bias in recommender systems. We introduce a principled way to measure
+popularity bias by discussing existing metrics and proposing a novel metric
+that fulfills a series of desiderata. Based on our new metric, we compare a
+simple LLM-based recommender to traditional recommender systems on a movie
+recommendation task. We find that the LLM recommender exhibits less popularity
+bias, even without any explicit mitigation.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted at Gen-IR@SIGIR24 workshop</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Demo: Soccer Information Retrieval via Natural Queries using SoccerRAG 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.01280v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.01280v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Aleksander Theo Strand, Sushant Gautam, Cise Midoglu, Pål Halvorsen
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  The rapid evolution of digital sports media necessitates sophisticated
+information retrieval systems that can efficiently parse extensive multimodal
+datasets. This paper demonstrates SoccerRAG, an innovative framework designed
+to harness the power of Retrieval Augmented Generation (RAG) and Large Language
+Models (LLMs) to extract soccer-related information through natural language
+queries. By leveraging a multimodal dataset, SoccerRAG supports dynamic
+querying and automatic data validation, enhancing user interaction and
+accessibility to sports archives. We present a novel interactive user interface
+(UI) based on the Chainlit framework which wraps around the core functionality,
+and enable users to interact with the SoccerRAG framework in a chatbot-like
+visual manner.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>accepted to CBMI 2024 as a demonstration;
+  https://github.com/simula/soccer-rag</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ SoccerRAG: Multimodal Soccer Information Retrieval via Natural Queries 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.01273v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.01273v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Aleksander Theo Strand, Sushant Gautam, Cise Midoglu, Pål Halvorsen
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  The rapid evolution of digital sports media necessitates sophisticated
+information retrieval systems that can efficiently parse extensive multimodal
+datasets. This paper introduces SoccerRAG, an innovative framework designed to
+harness the power of Retrieval Augmented Generation (RAG) and Large Language
+Models (LLMs) to extract soccer-related information through natural language
+queries. By leveraging a multimodal dataset, SoccerRAG supports dynamic
+querying and automatic data validation, enhancing user interaction and
+accessibility to sports archives. Our evaluations indicate that SoccerRAG
+effectively handles complex queries, offering significant improvements over
+traditional retrieval systems in terms of accuracy and user engagement. The
+results underscore the potential of using RAG and LLMs in sports analytics,
+paving the way for future advancements in the accessibility and real-time
+processing of sports data.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>accepted to CBMI 2024 as a regular paper;
+  https://github.com/simula/soccer-rag</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Multi-word Term Embeddings Improve Lexical Product Retrieval 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.01233v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.01233v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Viktor Shcherbakov, Fedor Krasnov
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Product search is uniquely different from search for documents, Internet
+resources or vacancies, therefore it requires the development of specialized
+search systems. The present work describes the H1 embdedding model, designed
+for an offline term indexing of product descriptions at e-commerce platforms.
+The model is compared to other state-of-the-art (SoTA) embedding models within
+a framework of hybrid product search system that incorporates the advantages of
+lexical methods for product retrieval and semantic embedding-based methods. We
+propose an approach to building semantically rich term vocabularies for search
+indexes. Compared to other production semantic models, H1 paired with the
+proposed approach stands out due to its ability to process multi-word product
+terms as one token. As an example, for search queries "new balance shoes",
+"gloria jeans kids wear" brand entity will be represented as one token - "new
+balance", "gloria jeans". This results in an increased precision of the system
+without affecting the recall. The hybrid search system with proposed model
+scores mAP@12 = 56.1% and R@1k = 86.6% on the WANDS public dataset, beating
+other SoTA analogues.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>10 pages, 4 figures</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Poisoning Attacks and Defenses in Recommender Systems: A <span class="highlight-title">Survey</span> 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.01022v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.01022v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Zongwei Wang, Junliang Yu, Min Gao, Guanhua Ye, Shazia Sadiq, Hongzhi Yin
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Modern recommender systems (RS) have profoundly enhanced user experience
+across digital platforms, yet they face significant threats from poisoning
+attacks. These attacks, aimed at manipulating recommendation outputs for
+unethical gains, exploit vulnerabilities in RS through injecting malicious data
+or intervening model training. This survey presents a unique perspective by
+examining these threats through the lens of an attacker, offering fresh
+insights into their mechanics and impacts. Concretely, we detail a systematic
+pipeline that encompasses four stages of a poisoning attack: setting attack
+goals, assessing attacker capabilities, analyzing victim architecture, and
+implementing poisoning strategies. The pipeline not only aligns with various
+attack tactics but also serves as a comprehensive taxonomy to pinpoint focuses
+of distinct poisoning attacks. Correspondingly, we further classify defensive
+strategies into two main categories: poisoning data filtering and robust
+training from the defender's perspective. Finally, we highlight existing
+limitations and suggest innovative directions for further exploration in this
+field.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>22 pages, 8 figures</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Cold-start Recommendation by Personalized Embedding Region Elicitation <span class="chip">UAI 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.00973v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.00973v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Hieu Trung Nguyen, Duy Nguyen, Khoa Doan, Viet Anh Nguyen
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Rating elicitation is a success element for recommender systems to perform
+well at cold-starting, in which the systems need to recommend items to a newly
+arrived user with no prior knowledge about the user's preference. Existing
+elicitation methods employ a fixed set of items to learn the user's preference
+and then infer the users' preferences on the remaining items. Using a fixed
+seed set can limit the performance of the recommendation system since the seed
+set is unlikely optimal for all new users with potentially diverse preferences.
+This paper addresses this challenge using a 2-phase, personalized elicitation
+scheme. First, the elicitation scheme asks users to rate a small set of popular
+items in a ``burn-in'' phase. Second, it sequentially asks the user to rate
+adaptive items to refine the preference and the user's representation.
+Throughout the process, the system represents the user's embedding value not by
+a point estimate but by a region estimate. The value of information obtained by
+asking the user's rating on an item is quantified by the distance from the
+region center embedding space that contains with high confidence the true
+embedding value of the user. Finally, the recommendations are successively
+generated by considering the preference region of the user. We show that each
+subproblem in the elicitation scheme can be efficiently implemented. Further,
+we empirically demonstrate the effectiveness of the proposed method against
+existing rating-elicitation methods on several prominent datasets.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted at UAI 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Unveil the Duality of Retrieval-Augmented Generation: Theoretical
+  Analysis and Practical Solution 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.00944v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.00944v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Shicheng Xu, Liang Pang, Huawei Shen, Xueqi Cheng
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Retrieval-augmented generation (RAG) utilizes retrieved texts to enhance
+large language models (LLMs). However, studies show that RAG is not
+consistently effective and can even mislead LLMs due to noisy or incorrect
+retrieved texts. This suggests that RAG possesses a duality including both
+benefit and detriment. Although many existing methods attempt to address this
+issue, they lack a theoretical explanation for the duality in RAG. The benefit
+and detriment within this duality remain a black box that cannot be quantified
+or compared in an explainable manner. This paper takes the first step in
+theoretically giving the essential explanation of benefit and detriment in RAG
+by: (1) decoupling and formalizing them from RAG prediction, (2) approximating
+the gap between their values by representation similarity and (3) establishing
+the trade-off mechanism between them, to make them explainable, quantifiable,
+and comparable. We demonstrate that the distribution difference between
+retrieved texts and LLMs' knowledge acts as double-edged sword, bringing both
+benefit and detriment. We also prove that the actual effect of RAG can be
+predicted at token level. Based on our theory, we propose a practical novel
+method, X-RAG, which achieves collaborative generation between pure LLM and RAG
+at token level to preserve benefit and avoid detriment. Experiments in
+real-world tasks based on LLMs including OPT, LLaMA-2, and Mistral show the
+effectiveness of our method and support our theoretical results.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>23 pages</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ BadRAG: Identifying Vulnerabilities in Retrieval Augmented Generation of
+  Large Language Models 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.00083v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.00083v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Jiaqi Xue, Mengxin Zheng, Yebowen Hu, Fei Liu, Xun Chen, Qian Lou
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Large Language Models (LLMs) are constrained by outdated information and a
+tendency to generate incorrect data, commonly referred to as "hallucinations."
+Retrieval-Augmented Generation (RAG) addresses these limitations by combining
+the strengths of retrieval-based methods and generative models. This approach
+involves retrieving relevant information from a large, up-to-date dataset and
+using it to enhance the generation process, leading to more accurate and
+contextually appropriate responses. Despite its benefits, RAG introduces a new
+attack surface for LLMs, particularly because RAG databases are often sourced
+from public data, such as the web. In this paper, we propose \TrojRAG{} to
+identify the vulnerabilities and attacks on retrieval parts (RAG database) and
+their indirect attacks on generative parts (LLMs). Specifically, we identify
+that poisoning several customized content passages could achieve a retrieval
+backdoor, where the retrieval works well for clean queries but always returns
+customized poisoned adversarial queries. Triggers and poisoned passages can be
+highly customized to implement various attacks. For example, a trigger could be
+a semantic group like "The Republican Party, Donald Trump, etc." Adversarial
+passages can be tailored to different contents, not only linked to the triggers
+but also used to indirectly attack generative LLMs without modifying them.
+These attacks can include denial-of-service attacks on RAG and semantic
+steering attacks on LLM generations conditioned by the triggers. Our
+experiments demonstrate that by just poisoning 10 adversarial passages can
+induce 98.2\% success rate to retrieve the adversarial passages. Then, these
+passages can increase the reject ratio of RAG-based GPT-4 from 0.01\% to 74.6\%
+or increase the rate of negative responses from 0.22\% to 72\% for targeted
+queries.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Learning Partially Aligned Item Representation for Cross-Domain
+  Sequential Recommendation 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.12473v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.12473v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Mingjia Yin, Hao Wang, Wei Guo, Yong Liu, Zhi Li, Sirui Zhao, Defu Lian, Enhong Chen
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Cross-domain sequential recommendation (CDSR) aims to uncover and transfer
+users' sequential preferences across multiple recommendation domains. While
+significant endeavors have been made, they primarily concentrated on developing
+advanced transfer modules and aligning user representations using
+self-supervised learning techniques. However, the problem of aligning item
+representations has received limited attention, and misaligned item
+representations can potentially lead to sub-optimal sequential modeling and
+user representation alignment. To this end, we propose a model-agnostic
+framework called \textbf{C}ross-domain item representation \textbf{A}lignment
+for \textbf{C}ross-\textbf{D}omain \textbf{S}equential \textbf{R}ecommendation
+(\textbf{CA-CDSR}), which achieves sequence-aware generation and adaptively
+partial alignment for item representations. Specifically, we first develop a
+sequence-aware feature augmentation strategy, which captures both collaborative
+and sequential item correlations, thus facilitating holistic item
+representation generation. Next, we conduct an empirical study to investigate
+the partial representation alignment problem from a spectrum perspective. It
+motivates us to devise an adaptive spectrum filter, achieving partial alignment
+adaptively. Furthermore, the aligned item representations can be fed into
+different sequential encoders to obtain user representations. The entire
+framework is optimized in a multi-task learning paradigm with an annealing
+strategy. Extensive experiments have demonstrated that CA-CDSR can surpass
+state-of-the-art baselines by a significant margin and can effectively align
+items in representation spaces to enhance performance.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ <span class="highlight-title">Dataset</span> Regeneration for Sequential Recommendation 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.17795v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.17795v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Mingjia Yin, Hao Wang, Wei Guo, Yong Liu, Suojuan Zhang, Sirui Zhao, Defu Lian, Enhong Chen
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  The sequential recommender (SR) system is a crucial component of modern
+recommender systems, as it aims to capture the evolving preferences of users.
+Significant efforts have been made to enhance the capabilities of SR systems.
+These methods typically follow the model-centric paradigm, which involves
+developing effective models based on fixed datasets. However, this approach
+often overlooks potential quality issues and flaws inherent in the data. Driven
+by the potential of data-centric AI, we propose a novel data-centric paradigm
+for developing an ideal training dataset using a model-agnostic dataset
+regeneration framework called DR4SR. This framework enables the regeneration of
+a dataset with exceptional cross-architecture generalizability. Additionally,
+we introduce the DR4SR+ framework, which incorporates a model-aware dataset
+personalizer to tailor the regenerated dataset specifically for a target model.
+To demonstrate the effectiveness of the data-centric paradigm, we integrate our
+framework with various model-centric methods and observe significant
+performance improvements across four widely adopted datasets. Furthermore, we
+conduct in-depth analyses to explore the potential of the data-centric paradigm
+and provide valuable insights. The code can be found at
+https://anonymous.4open.science/r/KDD2024-86EA
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ PRE: A Peer <span class="highlight-title">Review</span> Based Large Language Model Evaluator 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2401.15641v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2401.15641v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Zhumin Chu, Qingyao Ai, Yiteng Tu, Haitao Li, Yiqun Liu
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  The impressive performance of large language models (LLMs) has attracted
+considerable attention from the academic and industrial communities. Besides
+how to construct and train LLMs, how to effectively evaluate and compare the
+capacity of LLMs has also been well recognized as an important yet difficult
+problem. Existing paradigms rely on either human annotators or model-based
+evaluators to evaluate the performance of LLMs on different tasks. However,
+these paradigms often suffer from high cost, low generalizability, and
+inherited biases in practice, which make them incapable of supporting the
+sustainable development of LLMs in long term. In order to address these issues,
+inspired by the peer review systems widely used in academic publication
+process, we propose a novel framework that can automatically evaluate LLMs
+through a peer-review process. Specifically, for the evaluation of a specific
+task, we first construct a small qualification exam to select "reviewers" from
+a couple of powerful LLMs. Then, to actually evaluate the "submissions" written
+by different candidate LLMs, i.e., the evaluatees, we use the reviewer LLMs to
+rate or compare the submissions. The final ranking of evaluatee LLMs is
+generated based on the results provided by all reviewers. We conducted
+extensive experiments on text summarization tasks with eleven LLMs including
+GPT-4. The results demonstrate the existence of biasness when evaluating using
+a single LLM. Also, our PRE model outperforms all the baselines, illustrating
+the effectiveness of the peer review mechanism.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>11 pages</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Rewriting the Code: A Simple Method for Large Language Model Augmented
+  Code Search <span class="chip">ACL 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2401.04514v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2401.04514v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Haochen Li, Xin Zhou, Zhiqi Shen
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  In code search, the Generation-Augmented Retrieval (GAR) framework, which
+generates exemplar code snippets to augment queries, has emerged as a promising
+strategy to address the principal challenge of modality misalignment between
+code snippets and natural language queries, particularly with the demonstrated
+code generation capabilities of Large Language Models (LLMs). Nevertheless, our
+preliminary investigations indicate that the improvements conferred by such an
+LLM-augmented framework are somewhat constrained. This limitation could
+potentially be ascribed to the fact that the generated codes, albeit
+functionally accurate, frequently display a pronounced stylistic deviation from
+the ground truth code in the codebase. In this paper, we extend the
+foundational GAR framework and propose a simple yet effective method that
+additionally Rewrites the Code (ReCo) within the codebase for style
+normalization. Experimental results demonstrate that ReCo significantly boosts
+retrieval accuracy across sparse (up to 35.7%), zero-shot dense (up to 27.6%),
+and fine-tuned dense (up to 23.6%) retrieval settings in diverse search
+scenarios. To further elucidate the advantages of ReCo and stimulate research
+in code style normalization, we introduce Code Style Similarity, the first
+metric tailored to quantify stylistic similarities in code. Notably, our
+empirical findings reveal the inadequacy of existing metrics in capturing
+stylistic nuances. The source code and data are available at
+\url{https://github.com/Alex-HaochenLi/ReCo}.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted to ACL 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Category-Oriented Representation Learning for Image to Multi-Modal
+  Retrieval 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2305.03972v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2305.03972v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Zida Cheng, Chen Ju, Xu Chen, Zhonghua Zhai, Shuai Xiao, Xiaoyi Zeng, Weilin Huang, Junchi Yan
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  The rise of multi-modal search requests from users has highlighted the
+importance of multi-modal retrieval (i.e. image-to-text or text-to-image
+retrieval), yet the more complex task of image-to-multi-modal retrieval,
+crucial for many industry applications, remains under-explored. To address this
+gap and promote further research, we introduce and define the concept of
+Image-to-Multi-Modal Retrieval (IMMR), a process designed to retrieve rich
+multi-modal (i.e. image and text) documents based on image queries. We focus on
+representation learning for IMMR and analyze three key challenges for it: 1)
+skewed data and noisy label in real-world industrial data, 2) the
+information-inequality between image and text modality of documents when
+learning representations, 3) effective and efficient training in large-scale
+industrial contexts. To tackle the above challenges, we propose a novel
+framework named organizing categories and learning by classification for
+retrieval (OCLEAR). It consists of three components: 1) a novel
+category-oriented data governance scheme coupled with a large-scale
+classification-based learning paradigm, which handles the skewed and noisy data
+from a data perspective. 2) model architecture specially designed for
+multi-modal learning, where information-inequality between image and text
+modality of documents is considered for modality fusion. 3) a hybrid parallel
+training approach for tackling large-scale training in industrial scenario. The
+proposed framework achieves SOTA performance on public datasets and has been
+deployed in a real-world industrial e-commence system, leading to significant
+business growth. Code will be made publicly available.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ MAWSEO: Adversarial Wiki Search Poisoning for Illicit Online Promotion 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2304.11300v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2304.11300v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Zilong Lin, Zhengyi Li, Xiaojing Liao, XiaoFeng Wang, Xiaozhong Liu
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  As a prominent instance of vandalism edits, Wiki search poisoning for illicit
+promotion is a cybercrime in which the adversary aims at editing Wiki articles
+to promote illicit businesses through Wiki search results of relevant queries.
+In this paper, we report a study that, for the first time, shows that such
+stealthy blackhat SEO on Wiki can be automated. Our technique, called MAWSEO,
+employs adversarial revisions to achieve real-world cybercriminal objectives,
+including rank boosting, vandalism detection evasion, topic relevancy, semantic
+consistency, user awareness (but not alarming) of promotional content, etc. Our
+evaluation and user study demonstrate that MAWSEO is capable of effectively and
+efficiently generating adversarial vandalism edits, which can bypass
+state-of-the-art built-in Wiki vandalism detectors, and also get promotional
+content through to Wiki users without triggering their alarms. In addition, we
+investigated potential defense, including coherence based detection and
+adversarial training of vandalism detection, against our attack in the Wiki
+ecosystem.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted at the 45th IEEE Symposium on Security and Privacy (IEEE S&P
+  2024)</span>
+                                        </div>
+                                </details>
+                            </article>
+                    </div>
+                </details>
+            </article>
+            <article>
+                <details>
+                    <Summary>
+                        Machine Learning <span class="chip" style="font-size: 60%">139</span>
+                    </Summary>
+                    <div class="details-content">
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ GIFT: Generative Interpretable Fine-Tuning 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2312.00700v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2312.00700v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Chinmay Savadikar, Xi Song, Tianfu Wu
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  We present Generative Interpretable Fine-Tuning (GIFT) for
+parameter-efficient fine-tuning of pretrained Transformer backbones, which can
+be formulated as a simple factorized matrix multiplication in the parameter
+space or equivalently in the activation space, and thus embraces built-in
+interpretability. For a pretrained layer with weights $\omega\in
+\mathbb{R}^{d_{out}\times d_{in}}$, our proposed GIFT learns the fine-tuned
+weights $\hat{\omega}$ directly from $\omega$ as $\hat{\omega}=\omega \cdot
+(\mathbb{I}+\phi_{d_{in}\times r}\cdot \psi_{r\times d_{in}})$ where
+$\mathbb{I}$ is an identity matrix. $\Theta=(\phi, \psi)$ are the learnable
+parameters of the two linear layers of GIFT with $r$ being a hyper-parameter.
+$\Theta$ is shared by all the layers selected for fine-tuning, resulting in
+significantly fewer trainable parameters compared to Low-Rank Adaptation
+(LoRA). We perform comprehensive evaluations on natural language tasks
+(commonsense reasoning and sequence classification) and computer vision tasks
+(visual fine-grained classification). We obtain the best accuracy and parameter
+efficiency among baselines both on the Commonsense170k reasoning benchmark
+using LLaMA-1 (7B) and Llama-2 (7B)/-3 (8B) and on the FGVC and VTAB visual
+recognition benchmarks using ImageNet-21k pretrained Vision Transformer
+(ViT-B/16). Notably, we obtain 5.9% absolute increase in average accuracy with
+53.8 times reduction of parameters on Commonsense170k using Llama-3 (8B)
+compared to LoRA. We obtain performance comparable to LoRA on the GLUE
+benchmark but with significantly fewer parameters using RoBERTa-Base/Large. We
+show the output of the first linear layer (i.e., $\omega\cdot \phi$) is
+surprisingly interpretable, which can play the role of a token-clustering head
+as a by-product to localize meaningful objects/parts in images for computer
+vision tasks. Our code is publicly available.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Project page and code: https://savadikarc.github.io/gift</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Tiny Time Mixers (TTMs): Fast <span class="highlight-title">Pre-train</span>ed Models for Enhanced
+  Zero/Few-Shot Forecasting of Multivariate Time Series 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2401.03955v6">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2401.03955v6.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Vijay Ekambaram, Arindam Jati, Pankaj Dayama, Sumanta Mukherjee, Nam H. Nguyen, Wesley M. Gifford, Chandra Reddy, Jayant Kalagnanam
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Large pre-trained models excel in zero/few-shot learning for language and
+vision tasks but face challenges in multivariate time series (TS) forecasting
+due to diverse data characteristics. Consequently, recent research efforts have
+focused on developing pre-trained TS forecasting models. These models, whether
+built from scratch or adapted from large language models (LLMs), excel in
+zero/few-shot forecasting tasks. However, they are limited by slow performance,
+high computational demands, and neglect of cross-channel and exogenous
+correlations. To address this, we introduce Tiny Time Mixers (TTM), a compact
+model (starting from 1M parameters) with effective transfer learning
+capabilities, trained exclusively on public TS datasets. TTM, based on the
+light-weight TSMixer architecture, incorporates innovations like adaptive
+patching, diverse resolution sampling, and resolution prefix tuning to handle
+pre-training on varied dataset resolutions with minimal model capacity.
+Additionally, it employs multi-level modeling to capture channel correlations
+and infuse exogenous signals during fine-tuning. TTM outperforms existing
+popular benchmarks in zero/few-shot forecasting by (4-40\%), while reducing
+computational requirements significantly. Moreover, TTMs are lightweight and
+can be executed even on CPU-only machines, enhancing usability and fostering
+wider adoption in resource-constrained environments. Model weights for our
+initial variant (TTM-Q) are available at
+https://huggingface.co/ibm-granite/granite-timeseries-ttm-v1. Model weights for
+more sophisticated variants (TTM-B, TTM-E, and TTM-A) will be shared soon. The
+source code for TTM can be accessed at
+https://github.com/ibm-granite/granite-tsfm/tree/main/tsfm_public/models/tinytimemixer.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ MIM-Refiner: A Contrastive Learning Boost from Intermediate <span class="highlight-title">Pre-Train</span>ed
+  Representations 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2402.10093v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2402.10093v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Benedikt Alkin, Lukas Miklautz, Sepp Hochreiter, Johannes Brandstetter
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  We introduce MIM (Masked Image Modeling)-Refiner, a contrastive learning
+boost for pre-trained MIM models. MIM-Refiner is motivated by the insight that
+strong representations within MIM models generally reside in intermediate
+layers. Accordingly, MIM-Refiner leverages multiple contrastive heads that are
+connected to different intermediate layers. In each head, a modified nearest
+neighbor objective constructs semantic clusters that capture semantic
+information which improves performance on downstream tasks, including
+off-the-shelf and fine-tuning settings.
+  The refinement process is short and simple - yet highly effective. Within a
+few epochs, we refine the features of MIM models from subpar to
+state-of-the-art, off-the-shelf features. Refining a ViT-H, pre-trained with
+data2vec 2.0 on ImageNet-1K, sets a new state-of-the-art in linear probing
+(84.7%) and low-shot classification among models that are pre-trained on
+ImageNet-1K. At ImageNet-1K 1-shot classification, MIM-Refiner advances the
+state-of-the-art to 64.2%, outperforming larger models that were trained on up
+to 2000 times more data such as DINOv2-g, OpenCLIP-G and MAWS-6.5B.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Loss Symmetry and Noise Equilibrium of Stochastic Gradient Descent 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2402.07193v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2402.07193v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Liu Ziyin, Mingze Wang, Hongchao Li, Lei Wu
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Symmetries exist abundantly in the loss function of neural networks. We
+characterize the learning dynamics of stochastic gradient descent (SGD) when
+exponential symmetries, a broad subclass of continuous symmetries, exist in the
+loss function. We establish that when gradient noises do not balance, SGD has
+the tendency to move the model parameters toward a point where noises from
+different directions are balanced. Here, a special type of fixed point in the
+constant directions of the loss function emerges as a candidate for solutions
+for SGD. As the main theoretical result, we prove that every parameter $\theta$
+connects without loss function barrier to a unique noise-balanced fixed point
+$\theta^*$. The theory implies that the balancing of gradient noise can serve
+as a novel alternative mechanism for relevant phenomena such as progressive
+sharpening and flattening and can be applied to understand common practical
+problems such as representation normalization, matrix factorization, warmup,
+and formation of latent representations.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>preprint</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ DITTO: Diffusion Inference-Time T-Optimization for Music Generation <span class="chip">ICML 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2401.12179v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2401.12179v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Zachary Novack, Julian McAuley, Taylor Berg-Kirkpatrick, Nicholas J. Bryan
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  We propose Diffusion Inference-Time T-Optimization (DITTO), a general-purpose
+frame-work for controlling pre-trained text-to-music diffusion models at
+inference-time via optimizing initial noise latents. Our method can be used to
+optimize through any differentiable feature matching loss to achieve a target
+(stylized) output and leverages gradient checkpointing for memory efficiency.
+We demonstrate a surprisingly wide-range of applications for music generation
+including inpainting, outpainting, and looping as well as intensity, melody,
+and musical structure control - all without ever fine-tuning the underlying
+model. When we compare our approach against related training, guidance, and
+optimization-based methods, we find DITTO achieves state-of-the-art performance
+on nearly all tasks, including outperforming comparable approaches on
+controllability, audio quality, and computational efficiency, thus opening the
+door for high-quality, flexible, training-free control of diffusion models.
+Sound examples can be found at https://DITTO-Music.github.io/web/.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Oral at ICML 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Arrows of Time for Large Language Models 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2401.17505v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2401.17505v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Vassilis Papadopoulos, Jérémie Wenger, Clément Hongler
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  We study the probabilistic modeling performed by Autoregressive Large
+Language Models (LLMs) through the angle of time directionality, addressing a
+question first raised in (Shannon, 1951). For large enough models, we
+empirically find a time asymmetry in their ability to learn natural language: a
+difference in the average log-perplexity when trying to predict the next token
+versus when trying to predict the previous one. This difference is at the same
+time subtle and very consistent across various modalities (language, model
+size, training time, ...). Theoretically, this is surprising: from an
+information-theoretic point of view, there should be no such difference. We
+provide a theoretical framework to explain how such an asymmetry can appear
+from sparsity and computational complexity considerations, and outline a number
+of perspectives opened by our results.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Re-arranged and updated figures. Added experiments. 12 figures, 20
+  pages</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ The Topology and Geometry of Neural Representations 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2309.11028v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2309.11028v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Baihan Lin, Nikolaus Kriegeskorte
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  A central question for neuroscience is how to characterize brain
+representations of perceptual and cognitive content. An ideal characterization
+should distinguish different functional regions with robustness to noise and
+idiosyncrasies of individual brains that do not correspond to computational
+differences. Previous studies have characterized brain representations by their
+representational geometry, which is defined by the representational
+dissimilarity matrix (RDM), a summary statistic that abstracts from the roles
+of individual neurons (or responses channels) and characterizes the
+discriminability of stimuli. Here we explore a further step of abstraction:
+from the geometry to the topology of brain representations. We propose
+topological representational similarity analysis (tRSA), an extension of
+representational similarity analysis (RSA) that uses a family of
+geo-topological summary statistics that generalizes the RDM to characterize the
+topology while de-emphasizing the geometry. We evaluate this new family of
+statistics in terms of the sensitivity and specificity for model selection
+using both simulations and fMRI data. In the simulations, the ground truth is a
+data-generating layer representation in a neural network model and the models
+are the same and other layers in different model instances (trained from
+different random seeds). In fMRI, the ground truth is a visual area and the
+models are the same and other areas measured in different subjects. Results
+show that topology-sensitive characterizations of population codes are robust
+to noise and interindividual variability and maintain excellent sensitivity to
+the unique representational signatures of different neural network layers and
+brain regions. These methods enable researchers to calibrate comparisons among
+representations in brains and models to be sensitive to the geometry, the
+topology, or a combination of both.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>codes: https://github.com/doerlbh/TopologicalRSA</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Hierarchical Open-Vocabulary 3D Scene Graphs for Language-Grounded Robot
+  Navigation 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2403.17846v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2403.17846v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Abdelrhman Werby, Chenguang Huang, Martin Büchner, Abhinav Valada, Wolfram Burgard
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Recent open-vocabulary robot mapping methods enrich dense geometric maps with
+pre-trained visual-language features. While these maps allow for the prediction
+of point-wise saliency maps when queried for a certain language concept,
+large-scale environments and abstract queries beyond the object level still
+pose a considerable hurdle, ultimately limiting language-grounded robotic
+navigation. In this work, we present HOV-SG, a hierarchical open-vocabulary 3D
+scene graph mapping approach for language-grounded robot navigation. Leveraging
+open-vocabulary vision foundation models, we first obtain state-of-the-art
+open-vocabulary segment-level maps in 3D and subsequently construct a 3D scene
+graph hierarchy consisting of floor, room, and object concepts, each enriched
+with open-vocabulary features. Our approach is able to represent multi-story
+buildings and allows robotic traversal of those using a cross-floor Voronoi
+graph. HOV-SG is evaluated on three distinct datasets and surpasses previous
+baselines in open-vocabulary semantic accuracy on the object, room, and floor
+level while producing a 75% reduction in representation size compared to dense
+open-vocabulary maps. In order to prove the efficacy and generalization
+capabilities of HOV-SG, we showcase successful long-horizon
+language-conditioned robot navigation within real-world multi-storage
+environments. We provide code and trial video data at http://hovsg.github.io/.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Code and video are available at http://hovsg.github.io/</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Clover: Closed-Loop Verifiable Code Generation 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2310.17807v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2310.17807v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Chuyue Sun, Ying Sheng, Oded Padon, Clark Barrett
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  The use of large language models for code generation is a rapidly growing
+trend in software development. However, without effective methods for ensuring
+the correctness of generated code, this trend could lead to any number of
+undesirable outcomes. In this paper, we lay out a vision for addressing this
+challenge: the Clover paradigm, short for Closed-Loop Verifiable Code
+Generation, which reduces correctness checking to the more accessible problem
+of consistency checking. At the core of Clover lies a checker that performs
+consistency checks among code, docstrings, and formal annotations. The checker
+is implemented using a novel integration of formal verification tools and large
+language models. We provide a theoretical analysis to support our thesis that
+Clover should be effective at consistency checking. We also empirically
+investigate its feasibility on a hand-designed dataset (CloverBench) featuring
+annotated Dafny programs at a textbook level of difficulty. Experimental
+results show that for this dataset, (i) LLMs are reasonably successful at
+automatically generating formal specifications; and (ii) our consistency
+checker achieves a promising acceptance rate (up to 87%) for correct instances
+while maintaining zero tolerance for incorrect ones (no false positives).
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Towards Faithful and Robust LLM Specialists for Evidence-Based
+  Question-Answering 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2402.08277v5">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2402.08277v5.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Tobias Schimanski, Jingwei Ni, Mathias Kraus, Elliott Ash, Markus Leippold
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Advances towards more faithful and traceable answers of Large Language Models
+(LLMs) are crucial for various research and practical endeavors. One avenue in
+reaching this goal is basing the answers on reliable sources. However, this
+Evidence-Based QA has proven to work insufficiently with LLMs in terms of
+citing the correct sources (source quality) and truthfully representing the
+information within sources (answer attributability). In this work, we
+systematically investigate how to robustly fine-tune LLMs for better source
+quality and answer attributability. Specifically, we introduce a data
+generation pipeline with automated data quality filters, which can synthesize
+diversified high-quality training and testing data at scale. We further
+introduce four test sets to benchmark the robustness of fine-tuned specialist
+models. Extensive evaluation shows that fine-tuning on synthetic data improves
+performance on both in- and out-of-distribution. Furthermore, we show that data
+quality, which can be drastically improved by proposed quality filters, matters
+more than quantity in improving Evidence-Based QA.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ The Model Openness Framework: Promoting Completeness and Openness for
+  Reproducibility, Transparency, and Usability in Artificial Intelligence 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2403.13784v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2403.13784v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Matt White, Ibrahim Haddad, Cailean Osborne, Xiao-Yang Liu Yanglet, Ahmed Abdelmonsef, Sachin Varghese
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Generative AI (GAI) offers unprecedented opportunities for research and
+innovation, but its commercialization has raised concerns about transparency,
+reproducibility, and safety. Many open GAI models lack the necessary components
+for full understanding and reproducibility, and some use restrictive licenses
+whilst claiming to be ``open-source''. To address these concerns, we propose
+the Model Openness Framework (MOF), a ranked classification system that rates
+machine learning models based on their completeness and openness, following
+principles of open science, open source, open data, and open access. The MOF
+requires specific components of the model development lifecycle to be included
+and released under appropriate open licenses. This framework aims to prevent
+misrepresentation of models claiming to be open, guide researchers and
+developers in providing all model components under permissive licenses, and
+help individuals and organizations identify models that can be safely adopted
+without restrictions. By promoting transparency and reproducibility, the MOF
+combats ``openwashing'' practices and establishes completeness and openness as
+primary criteria alongside the core tenets of responsible AI. Wide adoption of
+the MOF will foster a more open AI ecosystem, benefiting research, innovation,
+and adoption of state-of-the-art models.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>22 pages</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Subtle Biases Need Subtler Measures: Dual Metrics for Evaluating
+  Representative and Affinity Bias in Large Language Models <span class="chip">ACL 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.14555v4">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.14555v4.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Abhishek Kumar, Sarfaroz Yunusov, Ali Emami
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Research on Large Language Models (LLMs) has often neglected subtle biases
+that, although less apparent, can significantly influence the models' outputs
+toward particular social narratives. This study addresses two such biases
+within LLMs: representative bias, which denotes a tendency of LLMs to generate
+outputs that mirror the experiences of certain identity groups, and affinity
+bias, reflecting the models' evaluative preferences for specific narratives or
+viewpoints. We introduce two novel metrics to measure these biases: the
+Representative Bias Score (RBS) and the Affinity Bias Score (ABS), and present
+the Creativity-Oriented Generation Suite (CoGS), a collection of open-ended
+tasks such as short story writing and poetry composition, designed with
+customized rubrics to detect these subtle biases. Our analysis uncovers marked
+representative biases in prominent LLMs, with a preference for identities
+associated with being white, straight, and men. Furthermore, our investigation
+of affinity bias reveals distinctive evaluative patterns within each model,
+akin to `bias fingerprints'. This trend is also seen in human evaluators,
+highlighting a complex interplay between human and machine bias perceptions.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>9 pages (excluding references), accepted to ACL 2024 Main Conference</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Picturing Ambiguity: A Visual Twist on the Winograd Schema Challenge <span class="chip">ACL 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.16277v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.16277v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Brendan Park, Madeline Janecek, Naser Ezzati-Jivan, Yifeng Li, Ali Emami
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Large Language Models (LLMs) have demonstrated remarkable success in tasks
+like the Winograd Schema Challenge (WSC), showcasing advanced textual
+common-sense reasoning. However, applying this reasoning to multimodal domains,
+where understanding text and images together is essential, remains a
+substantial challenge. To address this, we introduce WinoVis, a novel dataset
+specifically designed to probe text-to-image models on pronoun disambiguation
+within multimodal contexts. Utilizing GPT-4 for prompt generation and Diffusion
+Attentive Attribution Maps (DAAM) for heatmap analysis, we propose a novel
+evaluation framework that isolates the models' ability in pronoun
+disambiguation from other visual processing challenges. Evaluation of
+successive model versions reveals that, despite incremental advancements,
+Stable Diffusion 2.0 achieves a precision of 56.7% on WinoVis, only marginally
+surpassing random guessing. Further error analysis identifies important areas
+for future research aimed at advancing text-to-image models in their ability to
+interpret and interact with the complex visual world.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>9 pages (excluding references), accepted to ACL 2024 Main Conference</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Confidence Under the Hood: An Investigation into the
+  Confidence-Probability Alignment in Large Language Models <span class="chip">ACL 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.16282v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.16282v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Abhishek Kumar, Robert Morabito, Sanzhar Umbet, Jad Kabbara, Ali Emami
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  As the use of Large Language Models (LLMs) becomes more widespread,
+understanding their self-evaluation of confidence in generated responses
+becomes increasingly important as it is integral to the reliability of the
+output of these models. We introduce the concept of Confidence-Probability
+Alignment, that connects an LLM's internal confidence, quantified by token
+probabilities, to the confidence conveyed in the model's response when
+explicitly asked about its certainty. Using various datasets and prompting
+techniques that encourage model introspection, we probe the alignment between
+models' internal and expressed confidence. These techniques encompass using
+structured evaluation scales to rate confidence, including answer options when
+prompting, and eliciting the model's confidence level for outputs it does not
+recognize as its own. Notably, among the models analyzed, OpenAI's GPT-4 showed
+the strongest confidence-probability alignment, with an average Spearman's
+$\hat{\rho}$ of 0.42, across a wide range of tasks. Our work contributes to the
+ongoing efforts to facilitate risk assessment in the application of LLMs and to
+further our understanding of model trustworthiness.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>9 pages (excluding references), accepted to ACL 2024 Main Conference</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Feature Attribution with Necessity and Sufficiency via Dual-stage
+  Perturbation Test for Causal Explanation <span class="chip">ICML2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2402.08845v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2402.08845v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Xuexin Chen, Ruichu Cai, Zhengting Huang, Yuxuan Zhu, Julien Horwood, Zhifeng Hao, Zijian Li, Jose Miguel Hernandez-Lobato
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  We investigate the problem of explainability for machine learning models,
+focusing on Feature Attribution Methods (FAMs) that evaluate feature importance
+through perturbation tests. Despite their utility, FAMs struggle to distinguish
+the contributions of different features, when their prediction changes are
+similar after perturbation. To enhance FAMs' discriminative power, we introduce
+Feature Attribution with Necessity and Sufficiency (FANS), which find a
+neighborhood of the input such that perturbing samples within this neighborhood
+have a high Probability of being Necessity and Sufficiency (PNS) cause for the
+change in predictions, and use this PNS as the importance of the feature.
+Specifically, FANS compute this PNS via a heuristic strategy for estimating the
+neighborhood and a perturbation test involving two stages (factual and
+interventional) for counterfactual reasoning. To generate counterfactual
+samples, we use a resampling-based approach on the observed samples to
+approximate the required conditional distribution. We demonstrate that FANS
+outperforms existing attribution methods on six benchmarks. Please refer to the
+source code via \url{https://github.com/DMIRLAB-Group/FANS}.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted in the Proceedings of the 41st International Conference on
+  Machine Learning (ICML2024)</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ XAI4LLM. Let Machine Learning Models and LLMs Collaborate for Enhanced
+  In-Context Learning in Healthcare 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.06270v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.06270v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Fatemeh Nazary, Yashar Deldjoo, Tommaso Di Noia, Eugenio di Sciascio
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  The integration of Large Language Models (LLMs) into healthcare diagnostics
+offers a promising avenue for clinical decision-making. This study outlines the
+development of a novel method for zero-shot/few-shot in-context learning (ICL)
+by integrating medical domain knowledge using a multi-layered structured
+prompt. We also explore the efficacy of two communication styles between the
+user and LLMs: the Numerical Conversational (NC) style, which processes data
+incrementally, and the Natural Language Single-Turn (NL-ST) style, which
+employs long narrative prompts.
+  Our study systematically evaluates the diagnostic accuracy and risk factors,
+including gender bias and false negative rates, using a dataset of 920 patient
+records in various few-shot scenarios. Results indicate that traditional
+clinical machine learning (ML) models generally outperform LLMs in zero-shot
+and few-shot settings. However, the performance gap narrows significantly when
+employing few-shot examples alongside effective explainable AI (XAI) methods as
+sources of domain knowledge. Moreover, with sufficient time and an increased
+number of examples, the conversational style (NC) nearly matches the
+performance of ML models. Most notably, LLMs demonstrate comparable or superior
+cost-sensitive accuracy relative to ML models.
+  This research confirms that, with appropriate domain knowledge and tailored
+communication strategies, LLMs can significantly enhance diagnostic processes.
+The findings highlight the importance of optimizing the number of training
+examples and communication styles to improve accuracy and reduce biases in LLM
+applications.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ How Flawed Is ECE? An Analysis via Logit Smoothing 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2402.10046v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2402.10046v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Muthu Chidambaram, Holden Lee, Colin McSwiggen, Semon Rezchikov
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Informally, a model is calibrated if its predictions are correct with a
+probability that matches the confidence of the prediction. By far the most
+common method in the literature for measuring calibration is the expected
+calibration error (ECE). Recent work, however, has pointed out drawbacks of
+ECE, such as the fact that it is discontinuous in the space of predictors. In
+this work, we ask: how fundamental are these issues, and what are their impacts
+on existing results? Towards this end, we completely characterize the
+discontinuities of ECE with respect to general probability measures on Polish
+spaces. We then use the nature of these discontinuities to motivate a novel
+continuous, easily estimated miscalibration metric, which we term
+Logit-Smoothed ECE (LS-ECE). By comparing the ECE and LS-ECE of pre-trained
+image classification models, we show in initial experiments that binned ECE
+closely tracks LS-ECE, indicating that the theoretical pathologies of ECE may
+be avoidable in practice.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>23 pages, 6 figures</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Convolutional L2LFlows: Generating Accurate Showers in Highly Granular
+  Calorimeters Using Convolutional Normalizing Flows 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20407v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20407v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Thorsten Buss, Frank Gaede, Gregor Kasieczka, Claudius Krause, David Shih
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  In the quest to build generative surrogate models as computationally
+efficient alternatives to rule-based simulations, the quality of the generated
+samples remains a crucial frontier. So far, normalizing flows have been among
+the models with the best fidelity. However, as the latent space in such models
+is required to have the same dimensionality as the data space, scaling up
+normalizing flows to high dimensional datasets is not straightforward. The
+prior L2LFlows approach successfully used a series of separate normalizing
+flows and sequence of conditioning steps to circumvent this problem. In this
+work, we extend L2LFlows to simulate showers with a 9-times larger profile in
+the lateral direction. To achieve this, we introduce convolutional layers and
+U-Net-type connections, move from masked autoregressive flows to coupling
+layers, and demonstrate the successful modelling of showers in the ILD
+Electromagnetic Calorimeter as well as Dataset 3 from the public CaloChallenge
+dataset.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Rotational Equilibrium: How Weight Decay Balances Learning Across Neural
+  Networks <span class="chip">ICML 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2305.17212v4">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2305.17212v4.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Atli Kosson, Bettina Messmer, Martin Jaggi
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  This study investigates how weight decay affects the update behavior of
+individual neurons in deep neural networks through a combination of applied
+analysis and experimentation. Weight decay can cause the expected magnitude and
+angular updates of a neuron's weight vector to converge to a steady state we
+call rotational equilibrium. These states can be highly homogeneous,
+effectively balancing the average rotation -- a proxy for the effective
+learning rate -- across different layers and neurons. Our work analyzes these
+dynamics across optimizers like Adam, Lion, and SGD with momentum, offering a
+new simple perspective on training that elucidates the efficacy of widely used
+but poorly understood methods in deep learning. We demonstrate how balanced
+rotation plays a key role in the effectiveness of normalization like Weight
+Standardization, as well as that of AdamW over Adam with L2-regularization.
+Finally, we show that explicitly controlling the rotation provides the benefits
+of weight decay while substantially reducing the need for learning rate warmup.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted to ICML 2024; Code available at https://github.com/epfml/REQ</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Quantum Theory and Application of Contextual Optimal Transport <span class="chip">ICML 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2402.14991v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2402.14991v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Nicola Mariella, Albert Akhriev, Francesco Tacchino, Christa Zoufal, Juan Carlos Gonzalez-Espitia, Benedek Harsanyi, Eugene Koskin, Ivano Tavernelli, Stefan Woerner, Marianna Rapsomaniki, Sergiy Zhuk, Jannis Born
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Optimal Transport (OT) has fueled machine learning (ML) across many domains.
+When paired data measurements $(\boldsymbol{\mu}, \boldsymbol{\nu})$ are
+coupled to covariates, a challenging conditional distribution learning setting
+arises. Existing approaches for learning a $\textit{global}$ transport map
+parameterized through a potentially unseen context utilize Neural OT and
+largely rely on Brenier's theorem. Here, we propose a first-of-its-kind quantum
+computing formulation for amortized optimization of contextualized
+transportation plans. We exploit a direct link between doubly stochastic
+matrices and unitary operators thus unravelling a natural connection between OT
+and quantum computation. We verify our method (QontOT) on synthetic and real
+data by predicting variations in cell type distributions conditioned on drug
+dosage. Importantly we conduct a 24-qubit hardware experiment on a task
+challenging for classical computers and report a performance that cannot be
+matched with our classical neural OT approach. In sum, this is a first step
+toward learning to predict contextualized transportation plans through quantum
+computing.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>ICML 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Efficient Inverse Design Optimization through Multi-fidelity
+  Simulations, Machine Learning, and Search Space Reduction Strategies 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2312.03654v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2312.03654v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Luka Grbcic, Juliane Müller, Wibe Albert de Jong
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  This paper introduces a methodology designed to augment the inverse design
+optimization process in scenarios constrained by limited compute, through the
+strategic synergy of multi-fidelity evaluations, machine learning models, and
+optimization algorithms. The proposed methodology is analyzed on two distinct
+engineering inverse design problems: airfoil inverse design and the scalar
+field reconstruction problem. It leverages a machine learning model trained
+with low-fidelity simulation data, in each optimization cycle, thereby
+proficiently predicting a target variable and discerning whether a
+high-fidelity simulation is necessitated, which notably conserves computational
+resources. Additionally, the machine learning model is strategically deployed
+prior to optimization to compress the design space boundaries, thereby further
+accelerating convergence toward the optimal solution. The methodology has been
+employed to enhance two optimization algorithms, namely Differential Evolution
+and Particle Swarm Optimization. Comparative analyses illustrate performance
+improvements across both algorithms. Notably, this method is adaptable across
+any inverse design application, facilitating a synergy between a representative
+low-fidelity ML model, and high-fidelity simulation, and can be seamlessly
+applied across any variety of population-based optimization algorithms.}
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Efficient and Generalizable Certified Unlearning: A Hessian-free
+  Recollection Approach 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2404.01712v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2404.01712v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Xinbao Qiao, Meng Zhang, Ming Tang, Ermin Wei
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Machine unlearning strives to uphold the data owners' right to be forgotten
+by enabling models to selectively forget specific data. Recent advances suggest
+precomputing and storing statistics extracted from second-order information and
+implementing unlearning through Newton-style updates. However, the theoretical
+analysis of these works often depends on restrictive assumptions of convexity
+and smoothness, and those mentioned operations on Hessian matrix are extremely
+costly. As a result, applying these works to high-dimensional models becomes
+challenging. In this paper, we propose an efficient Hessian-free certified
+unlearning. We propose to maintain a statistical vector for each data, computed
+through affine stochastic recursion approximation of the difference between
+retrained and learned models. Our analysis does not involve inverting Hessian
+and thus can be extended to non-convex non-smooth objectives. Under same
+assumptions, we demonstrate advancements of proposed method beyond the
+state-of-the-art theoretical studies, in terms of generalization, unlearning
+guarantee, deletion capacity, and computation/storage complexity, and we show
+that the unlearned model of our proposed approach is close to or same as the
+retrained model. Based on the strategy of recollecting statistics for
+forgetting data, we develop an algorithm that achieves near-instantaneous
+unlearning as it only requires a vector addition operation. Experiments
+demonstrate that the proposed scheme surpasses existing results by orders of
+magnitude in terms of time/storage costs, while also enhancing accuracy.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>31 pages, 10 figures</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Understanding Domain-Size Generalization in Markov Logic Networks <span class="chip">ECML 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2403.15933v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2403.15933v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Florian Chen, Felix Weitkämper, Sagar Malhotra
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  We study the generalization behavior of Markov Logic Networks (MLNs) across
+relational structures of different sizes. Multiple works have noticed that MLNs
+learned on a given domain generalize poorly across domains of different sizes.
+This behavior emerges from a lack of internal consistency within an MLN when
+used across different domain sizes. In this paper, we quantify this
+inconsistency and bound it in terms of the variance of the MLN parameters. The
+parameter variance also bounds the KL divergence between an MLN's marginal
+distributions taken from different domain sizes. We use these bounds to show
+that maximizing the data log-likelihood while simultaneously minimizing the
+parameter variance corresponds to two natural notions of generalization across
+domain sizes. Our theoretical results apply to Exponential Random Graphs and
+other Markov network based relational models. Finally, we observe that
+solutions known to decrease the variance of the MLN parameters, like
+regularization and Domain-Size Aware MLNs, increase the internal consistency of
+the MLNs. We empirically verify our results on four different datasets, with
+different methods to control parameter variance, showing that controlling
+parameter variance leads to better generalization.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>To Appear in Proceedings of ECML 2024-Research Track</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Model-Based RL for Mean-Field Games is not Statistically Harder than
+  Single-Agent RL <span class="chip">ICML 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2402.05724v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2402.05724v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Jiawei Huang, Niao He, Andreas Krause
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  We study the sample complexity of reinforcement learning (RL) in Mean-Field
+Games (MFGs) with model-based function approximation that requires strategic
+exploration to find a Nash Equilibrium policy. We introduce the Partial
+Model-Based Eluder Dimension (P-MBED), a more effective notion to characterize
+the model class complexity. Notably, P-MBED measures the complexity of the
+single-agent model class converted from the given mean-field model class, and
+potentially, can be exponentially lower than the MBED proposed by
+\citet{huang2023statistical}. We contribute a model elimination algorithm
+featuring a novel exploration strategy and establish sample complexity results
+polynomial w.r.t.~P-MBED. Crucially, our results reveal that, under the basic
+realizability and Lipschitz continuity assumptions, \emph{learning Nash
+Equilibrium in MFGs is no more statistically challenging than solving a
+logarithmic number of single-agent RL problems}. We further extend our results
+to Multi-Type MFGs, generalizing from conventional MFGs and involving multiple
+types of agents. This extension implies statistical tractability of a broader
+class of Markov Games through the efficacy of mean-field approximation.
+Finally, inspired by our theoretical algorithm, we present a heuristic approach
+with improved computational efficiency and empirically demonstrate its
+effectiveness.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>ICML 2024; 55 Pages</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ 1-Lipschitz Neural Networks are more expressive with N-Activations 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2311.06103v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2311.06103v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Bernd Prach, Christoph H. Lampert
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  A crucial property for achieving secure, trustworthy and interpretable deep
+learning systems is their robustness: small changes to a system's inputs should
+not result in large changes to its outputs. Mathematically, this means one
+strives for networks with a small Lipschitz constant. Several recent works have
+focused on how to construct such Lipschitz networks, typically by imposing
+constraints on the weight matrices. In this work, we study an orthogonal
+aspect, namely the role of the activation function. We show that commonly used
+activation functions, such as MaxMin, as well as all piece-wise linear ones
+with two segments unnecessarily restrict the class of representable functions,
+even in the simplest one-dimensional setting. We furthermore introduce the new
+N-activation function that is provably more expressive than currently popular
+activation functions. We provide code at
+https://github.com/berndprach/NActivation.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Functional Bilevel Optimization for Machine Learning 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2403.20233v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2403.20233v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Ieva Petrulionyte, Julien Mairal, Michael Arbel
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  In this paper, we introduce a new functional point of view on bilevel
+optimization problems for machine learning, where the inner objective is
+minimized over a function space. These types of problems are most often solved
+by using methods developed in the parametric setting, where the inner objective
+is strongly convex with respect to the parameters of the prediction function.
+The functional point of view does not rely on this assumption and notably
+allows using over-parameterized neural networks as the inner prediction
+function. We propose scalable and efficient algorithms for the functional
+bilevel optimization problem and illustrate the benefits of our approach on
+instrumental regression and reinforcement learning tasks.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ CF-OPT: Counterfactual Explanations for Structured Prediction 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.18293v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.18293v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Germain Vivier-Ardisson, Alexandre Forel, Axel Parmentier, Thibaut Vidal
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Optimization layers in deep neural networks have enjoyed a growing popularity
+in structured learning, improving the state of the art on a variety of
+applications. Yet, these pipelines lack interpretability since they are made of
+two opaque layers: a highly non-linear prediction model, such as a deep neural
+network, and an optimization layer, which is typically a complex black-box
+solver. Our goal is to improve the transparency of such methods by providing
+counterfactual explanations. We build upon variational autoencoders a
+principled way of obtaining counterfactuals: working in the latent space leads
+to a natural notion of plausibility of explanations. We finally introduce a
+variant of the classic loss for VAE training that improves their performance in
+our specific structured context. These provide the foundations of CF-OPT, a
+first-order optimization algorithm that can find counterfactual explanations
+for a broad class of structured learning architectures. Our numerical results
+show that both close and plausible explanations can be obtained for problems
+from the recent literature.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Planning with a Learned Policy Basis to Optimally Solve Complex Tasks 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2403.15301v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2403.15301v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Guillermo Infante, David Kuric, Anders Jonsson, Vicenç Gómez, Herke van Hoof
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Conventional reinforcement learning (RL) methods can successfully solve a
+wide range of sequential decision problems. However, learning policies that can
+generalize predictably across multiple tasks in a setting with non-Markovian
+reward specifications is a challenging problem. We propose to use successor
+features to learn a policy basis so that each (sub)policy in it solves a
+well-defined subproblem. In a task described by a finite state automaton (FSA)
+that involves the same set of subproblems, the combination of these
+(sub)policies can then be used to generate an optimal solution without
+additional learning. In contrast to other methods that combine (sub)policies
+via planning, our method asymptotically attains global optimality, even in
+stochastic environments.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Knockout: A simple way to handle missing inputs 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20448v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20448v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Minh Nguyen, Batuhan K. Karaman, Heejong Kim, Alan Q. Wang, Fengbei Liu, Mert R. Sabuncu
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Deep learning models can extract predictive and actionable information from
+complex inputs. The richer the inputs, the better these models usually perform.
+However, models that leverage rich inputs (e.g., multi-modality) can be
+difficult to deploy widely, because some inputs may be missing at inference.
+Current popular solutions to this problem include marginalization, imputation,
+and training multiple models. Marginalization can obtain calibrated predictions
+but it is computationally costly and therefore only feasible for low
+dimensional inputs. Imputation may result in inaccurate predictions because it
+employs point estimates for missing variables and does not work well for high
+dimensional inputs (e.g., images). Training multiple models whereby each model
+takes different subsets of inputs can work well but requires knowing missing
+input patterns in advance. Furthermore, training and retaining multiple models
+can be costly. We propose an efficient way to learn both the conditional
+distribution using full inputs and the marginal distributions. Our method,
+Knockout, randomly replaces input features with appropriate placeholder values
+during training. We provide a theoretical justification of Knockout and show
+that it can be viewed as an implicit marginalization strategy. We evaluate
+Knockout in a wide range of simulations and real-world datasets and show that
+it can offer strong empirical performance.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Aligner: Efficient Alignment by Learning to Correct 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2402.02416v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2402.02416v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Jiaming Ji, Boyuan Chen, Hantao Lou, Donghai Hong, Borong Zhang, Xuehai Pan, Juntao Dai, Tianyi Qiu, Yaodong Yang
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  With the rapid development of large language models (LLMs) and ever-evolving
+practical requirements, finding an efficient and effective alignment method has
+never been more critical. However, the tension between the complexity of
+current alignment methods and the need for rapid iteration in deployment
+scenarios necessitates the development of a model-agnostic alignment approach
+that can operate under these constraints. In this paper, we introduce Aligner,
+a novel and simple alignment paradigm that learns the correctional residuals
+between preferred and dispreferred answers using a small model. Designed as a
+model-agnostic, plug-and-play module, Aligner can be directly applied to
+various open-source and API-based models with only one-off training, making it
+suitable for rapid iteration. Notably, Aligner can be applied to any powerful,
+large-scale upstream models. Moreover, it can even iteratively bootstrap the
+upstream models using corrected responses as synthetic human preference data,
+breaking through the model's performance ceiling. Our experiments demonstrate
+performance improvements by deploying the same Aligner model across 11
+different LLMs, evaluated on the 3H dimensions (helpfulness, harmlessness, and
+honesty). Specifically, Aligner-7B has achieved an average improvement of
+68.9\% in helpfulness and 23.8\% in harmlessness across the tested LLMs while
+also effectively reducing hallucination. In the Alpaca-Eval leaderboard,
+stacking Aligner-2B on GPT-4 Turbo improved its LC Win Rate from 55.0\% to
+58.3\%, surpassing GPT-4 Omni's 57.5\% Win Rate (community report).
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>29 pages</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ PAC-Bayesian Generalization Bounds for Knowledge Graph Representation
+  Learning <span class="chip">ICML 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.06418v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.06418v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Jaejun Lee, Minsung Hwang, Joyce Jiyoung Whang
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  While a number of knowledge graph representation learning (KGRL) methods have
+been proposed over the past decade, very few theoretical analyses have been
+conducted on them. In this paper, we present the first PAC-Bayesian
+generalization bounds for KGRL methods. To analyze a broad class of KGRL
+models, we propose a generic framework named ReED (Relation-aware
+Encoder-Decoder), which consists of a relation-aware message passing encoder
+and a triplet classification decoder. Our ReED framework can express at least
+15 different existing KGRL models, including not only graph neural
+network-based models such as R-GCN and CompGCN but also shallow-architecture
+models such as RotatE and ANALOGY. Our generalization bounds for the ReED
+framework provide theoretical grounds for the commonly used tricks in KGRL,
+e.g., parameter-sharing and weight normalization schemes, and guide desirable
+design choices for practical KGRL methods. We empirically show that the
+critical factors in our generalization bounds can explain actual generalization
+errors on three real-world knowledge graphs.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>32 pages, 3 figures, 4 tables, The 41st International Conference on
+  Machine Learning (ICML 2024)</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Generalization Bounds for Heavy-Tailed SDEs through the Fractional
+  Fokker-Planck Equation 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2402.07723v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2402.07723v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Benjamin Dupuis, Umut Şimşekli
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Understanding the generalization properties of heavy-tailed stochastic
+optimization algorithms has attracted increasing attention over the past years.
+While illuminating interesting aspects of stochastic optimizers by using
+heavy-tailed stochastic differential equations as proxies, prior works either
+provided expected generalization bounds, or introduced non-computable
+information theoretic terms. Addressing these drawbacks, in this work, we prove
+high-probability generalization bounds for heavy-tailed SDEs which do not
+contain any nontrivial information theoretic terms. To achieve this goal, we
+develop new proof techniques based on estimating the entropy flows associated
+with the so-called fractional Fokker-Planck equation (a partial differential
+equation that governs the evolution of the distribution of the corresponding
+heavy-tailed SDE). In addition to obtaining high-probability bounds, we show
+that our bounds have a better dependence on the dimension of parameters as
+compared to prior art. Our results further identify a phase transition
+phenomenon, which suggests that heavy tails can be either beneficial or harmful
+depending on the problem structure. We support our theory with experiments
+conducted in a variety of settings.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Graph External Attention Enhanced <span class="highlight-title">Transformer</span> <span class="chip">ICML 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.21061v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.21061v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Jianqing Liang, Min Chen, Jiye Liang
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  The Transformer architecture has recently gained considerable attention in
+the field of graph representation learning, as it naturally overcomes several
+limitations of Graph Neural Networks (GNNs) with customized attention
+mechanisms or positional and structural encodings. Despite making some
+progress, existing works tend to overlook external information of graphs,
+specifically the correlation between graphs. Intuitively, graphs with similar
+structures should have similar representations. Therefore, we propose Graph
+External Attention (GEA) -- a novel attention mechanism that leverages multiple
+external node/edge key-value units to capture inter-graph correlations
+implicitly. On this basis, we design an effective architecture called Graph
+External Attention Enhanced Transformer (GEAET), which integrates local
+structure and global interaction information for more comprehensive graph
+representations. Extensive experiments on benchmark datasets demonstrate that
+GEAET achieves state-of-the-art empirical performance. The source code is
+available for reproducibility at: https://github.com/icm1018/GEAET.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>In Proceedings of ICML 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Interpreting and Improving Diffusion Models from an Optimization
+  Perspective <span class="chip">ICML 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2306.04848v4">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2306.04848v4.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Frank Permenter, Chenyang Yuan
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Denoising is intuitively related to projection. Indeed, under the manifold
+hypothesis, adding random noise is approximately equivalent to orthogonal
+perturbation. Hence, learning to denoise is approximately learning to project.
+In this paper, we use this observation to interpret denoising diffusion models
+as approximate gradient descent applied to the Euclidean distance function. We
+then provide straight-forward convergence analysis of the DDIM sampler under
+simple assumptions on the projection error of the denoiser. Finally, we propose
+a new gradient-estimation sampler, generalizing DDIM using insights from our
+theoretical results. In as few as 5-10 function evaluations, our sampler
+achieves state-of-the-art FID scores on pretrained CIFAR-10 and CelebA models
+and can generate high quality samples on latent diffusion models.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>24 pages, 9 figures, 4 tables. To appear in ICML 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Agent Smith: A Single Image Can Jailbreak One Million Multimodal LLM
+  Agents Exponentially Fast <span class="chip">ICML 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2402.08567v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2402.08567v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Xiangming Gu, Xiaosen Zheng, Tianyu Pang, Chao Du, Qian Liu, Ye Wang, Jing Jiang, Min Lin
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  A multimodal large language model (MLLM) agent can receive instructions,
+capture images, retrieve histories from memory, and decide which tools to use.
+Nonetheless, red-teaming efforts have revealed that adversarial images/prompts
+can jailbreak an MLLM and cause unaligned behaviors. In this work, we report an
+even more severe safety issue in multi-agent environments, referred to as
+infectious jailbreak. It entails the adversary simply jailbreaking a single
+agent, and without any further intervention from the adversary, (almost) all
+agents will become infected exponentially fast and exhibit harmful behaviors.
+To validate the feasibility of infectious jailbreak, we simulate multi-agent
+environments containing up to one million LLaVA-1.5 agents, and employ
+randomized pair-wise chat as a proof-of-concept instantiation for multi-agent
+interaction. Our results show that feeding an (infectious) adversarial image
+into the memory of any randomly chosen agent is sufficient to achieve
+infectious jailbreak. Finally, we derive a simple principle for determining
+whether a defense mechanism can provably restrain the spread of infectious
+jailbreak, but how to design a practical defense that meets this principle
+remains an open question to investigate. Our project page is available at
+https://sail-sg.github.io/Agent-Smith/.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>ICML 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Craftax: A Lightning-Fast Benchmark for Open-Ended Reinforcement
+  Learning 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2402.16801v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2402.16801v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Michael Matthews, Michael Beukman, Benjamin Ellis, Mikayel Samvelyan, Matthew Jackson, Samuel Coward, Jakob Foerster
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Benchmarks play a crucial role in the development and analysis of
+reinforcement learning (RL) algorithms. We identify that existing benchmarks
+used for research into open-ended learning fall into one of two categories.
+Either they are too slow for meaningful research to be performed without
+enormous computational resources, like Crafter, NetHack and Minecraft, or they
+are not complex enough to pose a significant challenge, like Minigrid and
+Procgen. To remedy this, we first present Craftax-Classic: a ground-up rewrite
+of Crafter in JAX that runs up to 250x faster than the Python-native original.
+A run of PPO using 1 billion environment interactions finishes in under an hour
+using only a single GPU and averages 90% of the optimal reward. To provide a
+more compelling challenge we present the main Craftax benchmark, a significant
+extension of the Crafter mechanics with elements inspired from NetHack. Solving
+Craftax requires deep exploration, long term planning and memory, as well as
+continual adaptation to novel situations as more of the world is discovered. We
+show that existing methods including global and episodic exploration, as well
+as unsupervised environment design fail to make material progress on the
+benchmark. We believe that Craftax can for the first time allow researchers to
+experiment in a complex, open-ended environment with limited computational
+resources.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Balanced Data, Imbalanced Spectra: Unveiling Class Disparities with
+  Spectral Imbalance 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2402.11742v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2402.11742v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Chiraag Kaushik, Ran Liu, Chi-Heng Lin, Amrit Khera, Matthew Y Jin, Wenrui Ma, Vidya Muthukumar, Eva L Dyer
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Classification models are expected to perform equally well for different
+classes, yet in practice, there are often large gaps in their performance. This
+issue of class bias is widely studied in cases of datasets with sample
+imbalance, but is relatively overlooked in balanced datasets. In this work, we
+introduce the concept of spectral imbalance in features as a potential source
+for class disparities and study the connections between spectral imbalance and
+class bias in both theory and practice. To build the connection between
+spectral imbalance and class gap, we develop a theoretical framework for
+studying class disparities and derive exact expressions for the per-class error
+in a high-dimensional mixture model setting. We then study this phenomenon in
+11 different state-of-the-art pretrained encoders and show how our proposed
+framework can be used to compare the quality of encoders, as well as evaluate
+and combine data augmentation strategies to mitigate the issue. Our work sheds
+light on the class-dependent effects of learning, and provides new insights
+into how state-of-the-art pretrained features may have unknown biases that can
+be diagnosed through their spectra.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>25 pages, 9 figures</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ PowerGraph: A power grid benchmark <span class="highlight-title">dataset</span> for graph neural networks 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2402.02827v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2402.02827v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Anna Varbella, Kenza Amara, Blazhe Gjorgiev, Mennatallah El-Assady, Giovanni Sansavini
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Power grids are critical infrastructures of paramount importance to modern
+society and, therefore, engineered to operate under diverse conditions and
+failures. The ongoing energy transition poses new challenges for the
+decision-makers and system operators. Therefore, we must develop grid analysis
+algorithms to ensure reliable operations. These key tools include power flow
+analysis and system security analysis, both needed for effective operational
+and strategic planning. The literature review shows a growing trend of machine
+learning (ML) models that perform these analyses effectively. In particular,
+Graph Neural Networks (GNNs) stand out in such applications because of the
+graph-based structure of power grids. However, there is a lack of publicly
+available graph datasets for training and benchmarking ML models in electrical
+power grid applications. First, we present PowerGraph, which comprises
+GNN-tailored datasets for i) power flows, ii) optimal power flows, and iii)
+cascading failure analyses of power grids. Second, we provide ground-truth
+explanations for the cascading failure analysis. Finally, we perform a complete
+benchmarking of GNN methods for node-level and graph-level tasks and
+explainability. Overall, PowerGraph is a multifaceted GNN dataset for diverse
+tasks that includes power flow and fault scenarios with real-world
+explanations, providing a valuable resource for developing improved GNN models
+for node-level, graph-level tasks and explainability methods in power system
+modeling. The dataset is available at
+https://figshare.com/articles/dataset/PowerGraph/22820534 and the code at
+https://github.com/PowerGraph-Datasets.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>21 pages, 8 figures, conference paper</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Uplift Modeling Under Limited Supervision 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2403.19289v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2403.19289v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        George Panagopoulos, Daniele Malitesta, Fragkiskos D. Malliaros, Jun Pang
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Estimating causal effects in e-commerce tends to involve costly treatment
+assignments which can be impractical in large-scale settings. Leveraging
+machine learning to predict such treatment effects without actual intervention
+is a standard practice to diminish the risk. However, existing methods for
+treatment effect prediction tend to rely on training sets of substantial size,
+which are built from real experiments and are thus inherently risky to create.
+In this work we propose a graph neural network to diminish the required
+training set size, relying on graphs that are common in e-commerce data.
+Specifically, we view the problem as node regression with a restricted number
+of labeled instances, develop a two-model neural architecture akin to previous
+causal effect estimators, and test varying message-passing layers for encoding.
+Furthermore, as an extra step, we combine the model with an acquisition
+function to guide the creation of the training set in settings with extremely
+low experimental budget. The framework is flexible since each step can be used
+separately with other models or treatment policies. The experiments on real
+large-scale networks indicate a clear advantage of our methodology over the
+state of the art, which in many cases performs close to random, underlining the
+need for models that can generalize with limited supervision to reduce
+experimental risks.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Machine Learning with Confidential Computing: A Systematization of
+  Knowledge 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2208.10134v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2208.10134v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Fan Mo, Zahra Tarkhani, Hamed Haddadi
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Privacy and security challenges in Machine Learning (ML) have become
+increasingly severe, along with ML's pervasive development and the recent
+demonstration of large attack surfaces. As a mature system-oriented approach,
+Confidential Computing has been utilized in both academia and industry to
+mitigate privacy and security issues in various ML scenarios. In this paper,
+the conjunction between ML and Confidential Computing is investigated. We
+systematize the prior work on Confidential Computing-assisted ML techniques
+that provide i) confidentiality guarantees and ii) integrity assurances, and
+discuss their advanced features and drawbacks. Key challenges are further
+identified, and we provide dedicated analyses of the limitations in existing
+Trusted Execution Environment (TEE) systems for ML use cases. Finally,
+prospective works are discussed, including grounded privacy definitions for
+closed-loop protection, partitioned executions of efficient ML, dedicated
+TEE-assisted designs for ML, TEE-aware ML, and ML full pipeline guarantees. By
+providing these potential solutions in our systematization of knowledge, we aim
+to build the bridge to help achieve a much stronger TEE-enabled ML for privacy
+guarantees without introducing computation and system costs.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Survey paper, 37 pages, accepted to ACM Computing Surveys</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Constrained Exploration via Reflected Replica Exchange Stochastic
+  Gradient Langevin Dynamics 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.07839v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.07839v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Haoyang Zheng, Hengrong Du, Qi Feng, Wei Deng, Guang Lin
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Replica exchange stochastic gradient Langevin dynamics (reSGLD) is an
+effective sampler for non-convex learning in large-scale datasets. However, the
+simulation may encounter stagnation issues when the high-temperature chain
+delves too deeply into the distribution tails. To tackle this issue, we propose
+reflected reSGLD (r2SGLD): an algorithm tailored for constrained non-convex
+exploration by utilizing reflection steps within a bounded domain.
+Theoretically, we observe that reducing the diameter of the domain enhances
+mixing rates, exhibiting a $\textit{quadratic}$ behavior. Empirically, we test
+its performance through extensive experiments, including identifying dynamical
+systems with physical constraints, simulations of constrained multi-modal
+distributions, and image classification tasks. The theoretical and empirical
+findings highlight the crucial role of constrained exploration in improving the
+simulation efficiency.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>28 pages, 13 figures</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Targeted Reduction of Causal Models 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2311.18639v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2311.18639v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Armin Kekić, Bernhard Schölkopf, Michel Besserve
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Why does a phenomenon occur? Addressing this question is central to most
+scientific inquiries and often relies on simulations of scientific models. As
+models become more intricate, deciphering the causes behind phenomena in
+high-dimensional spaces of interconnected variables becomes increasingly
+challenging. Causal Representation Learning (CRL) offers a promising avenue to
+uncover interpretable causal patterns within these simulations through an
+interventional lens. However, developing general CRL frameworks suitable for
+practical applications remains an open challenge. We introduce Targeted Causal
+Reduction (TCR), a method for condensing complex intervenable models into a
+concise set of causal factors that explain a specific target phenomenon. We
+propose an information theoretic objective to learn TCR from interventional
+data of simulations, establish identifiability for continuous variables under
+shift interventions and present a practical algorithm for learning TCRs. Its
+ability to generate interpretable high-level explanations from complex models
+is demonstrated on toy and mechanical systems, illustrating its potential to
+assist scientists in the study of complex phenomena in a broad range of
+disciplines.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Predictive Coding beyond Correlations <span class="chip">ICML</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2306.15479v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2306.15479v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Tommaso Salvatori, Luca Pinchetti, Amine M'Charrak, Beren Millidge, Thomas Lukasiewicz
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Recently, there has been extensive research on the capabilities of
+biologically plausible algorithms. In this work, we show how one of such
+algorithms, called predictive coding, is able to perform causal inference
+tasks. First, we show how a simple change in the inference process of
+predictive coding enables to compute interventions without the need to mutilate
+or redefine a causal graph. Then, we explore applications in cases where the
+graph is unknown, and has to be inferred from observational data. Empirically,
+we show how such findings can be used to improve the performance of predictive
+coding in image classification tasks, and conclude that such models are able to
+perform simple end-to-end causal inference tasks.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>44 Pages, 24 Figures. Changed title and abstract, following the ICML
+  accepted version</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Robotic Imitation of Human Actions 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2401.08381v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2401.08381v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Josua Spisak, Matthias Kerzel, Stefan Wermter
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Imitation can allow us to quickly gain an understanding of a new task.
+Through a demonstration, we can gain direct knowledge about which actions need
+to be performed and which goals they have. In this paper, we introduce a new
+approach to imitation learning that tackles the challenges of a robot imitating
+a human, such as the change in perspective and body schema. Our approach can
+use a single human demonstration to abstract information about the demonstrated
+task, and use that information to generalise and replicate it. We facilitate
+this ability by a new integration of two state-of-the-art methods: a diffusion
+action segmentation model to abstract temporal information from the
+demonstration and an open vocabulary object detector for spatial information.
+Furthermore, we refine the abstracted information and use symbolic reasoning to
+create an action plan utilising inverse kinematics, to allow the robot to
+imitate the demonstrated action.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted at the ICDL 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Quantum Generative Diffusion Model: A Fully Quantum-Mechanical Model for
+  Generating Quantum State Ensemble 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2401.07039v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2401.07039v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Chuangtao Chen, Qinglin Zhao, MengChu Zhou, Zhimin He, Zhili Sun, Haozhen Situ
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Classical diffusion models have shown superior generative results and have
+been applied to many problems. Exploring these models in the quantum domain can
+advance the field of quantum generative learning. In this paper, we introduce
+the Quantum Generative Diffusion Model (QGDM), a simple and elegant quantum
+counterpart of classical diffusion models.
+  The core idea of QGDM is that any target quantum state can be transformed
+into a completely mixed state, which has the highest entropy and maximum
+uncertainty about the system, through a non-unitary forward process.
+Subsequently, a trainable backward process can be used to recover the target
+state from the completely mixed state. The design requirements for QGDM's
+backward process include ensuring non-unitarity while maintaining a low number
+of parameters. To achieve this, we introduce partial trace operations in the
+backward process to enforce non-unitary. Additionally, we control the number of
+trainable parameters by using a parameter-sharing strategy and incorporating
+temporal information as an input in the backward process. Furthermore, we
+introduce a resource-efficient version of QGDM, which reduces the number of
+auxiliary qubits while preserving impressive generative capabilities.
+  Our proposed models exhibit better convergence performance than Quantum
+Generative Adversarial Networks (QGANs) because our models optimize a convex
+distance function using gradient descent. Comparative results with QGANs
+demonstrate the effectiveness of our models in generating both pure and mixed
+quantum states. Notably, our models achieve 53.03% higher fidelity in
+mixed-state generation tasks compared to QGANs. These results highlight the
+potential of the proposed models to tackle challenging quantum generation
+tasks.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Comments are welcome</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Conservative Prediction via Data-Driven Confidence Minimization 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2306.04974v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2306.04974v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Caroline Choi, Fahim Tajwar, Yoonho Lee, Huaxiu Yao, Ananya Kumar, Chelsea Finn
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  In safety-critical applications of machine learning, it is often desirable
+for a model to be conservative, abstaining from making predictions on unknown
+inputs which are not well-represented in the training data. However, detecting
+unknown examples is challenging, as it is impossible to anticipate all
+potential inputs at test time. To address this, prior work (Hendrycks et al.,
+2018) minimizes model confidence on an auxiliary outlier dataset carefully
+curated to be disjoint from the training distribution. We theoretically analyze
+the choice of auxiliary dataset for confidence minimization, revealing two
+actionable insights: (1) if the auxiliary set contains unknown examples similar
+to those seen at test time, confidence minimization leads to provable detection
+of unknown test examples, and (2) if the first condition is satisfied, it is
+unnecessary to filter out known examples for out-of-distribution (OOD)
+detection. Motivated by these guidelines, we propose the Data-Driven Confidence
+Minimization (DCM) framework, which minimizes confidence on an uncertainty
+dataset. We apply DCM to two problem settings in which conservative prediction
+is paramount -- selective classification and OOD detection -- and provide a
+realistic way to gather uncertainty data for each setting. In our experiments,
+DCM consistently outperforms existing selective classification approaches on 4
+datasets when tested on unseen distributions and outperforms state-of-the-art
+OOD detection methods on 12 ID-OOD dataset pairs, reducing FPR (at TPR $95\%$)
+by $6.3\%$ and $58.1\%$ on CIFAR-10 and CIFAR-100 compared to Outlier Exposure.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Transactions on Machine Learning Research (TMLR), 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Connecting the Dots: Collaborative Fine-tuning for Black-Box
+  Vision-Language Models <span class="chip">ICML 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2402.04050v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2402.04050v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Zhengbo Wang, Jian Liang, Ran He, Zilei Wang, Tieniu Tan
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  With the emergence of pretrained vision-language models (VLMs), considerable
+efforts have been devoted to fine-tuning them for downstream tasks. Despite the
+progress made in designing efficient fine-tuning methods, such methods require
+access to the model's parameters, which can be challenging as model owners
+often opt to provide their models as a black box to safeguard model ownership.
+This paper proposes a \textbf{C}ollabo\textbf{ra}tive
+\textbf{F}ine-\textbf{T}uning (\textbf{CraFT}) approach for fine-tuning
+black-box VLMs to downstream tasks, where one only has access to the input
+prompts and the output predictions of the model. CraFT comprises two modules, a
+prompt generation module for learning text prompts and a prediction refinement
+module for enhancing output predictions in residual style. Additionally, we
+introduce an auxiliary prediction-consistent loss to promote consistent
+optimization across these modules. These modules are optimized by a novel
+collaborative training algorithm. Extensive experiments on few-shot
+classification over 15 datasets demonstrate the superiority of CraFT. The
+results show that CraFT achieves a decent gain of about 12\% with 16-shot
+datasets and only 8,000 queries. Moreover, CraFT trains faster and uses only
+about 1/80 of the memory footprint for deployment, while sacrificing only
+1.62\% compared to the white-box method. Our code is publicly available at
+https://github.com/mrflogs/CraFT .
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted by ICML 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Partial Search in a Frozen Network is Enough to Find a Strong Lottery
+  Ticket 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2402.14029v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2402.14029v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Hikari Otsuka, Daiki Chijiwa, Ángel López García-Arias, Yasuyuki Okoshi, Kazushi Kawamura, Thiem Van Chu, Daichi Fujiki, Susumu Takeuchi, Masato Motomura
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Randomly initialized dense networks contain subnetworks that achieve high
+accuracy without weight learning -- strong lottery tickets (SLTs). Recently,
+Gadhikar et al. (2023) demonstrated that SLTs can also be found within a
+randomly pruned source network, thus reducing the SLT search space. However,
+this limits the search to SLTs that are even sparser than the source, leading
+to worse accuracy due to unintentionally high sparsity. This paper proposes a
+method that reduces the SLT search space by an arbitrary ratio independent of
+the desired SLT sparsity. A random subset of the initial weights is excluded
+from the search space by freezing it -- i.e., by either permanently pruning
+them or locking them as a fixed part of the SLT. In addition to reducing search
+space, the proposed random freezing can also provide the benefit of reducing
+the model size for inference. Furthermore, experimental results show that the
+proposed method finds SLTs with better accuracy-to-model size trade-off than
+the SLTs obtained from dense or randomly pruned source networks. In particular,
+the SLTs found in Frozen ResNets on image classification using ImageNet
+significantly improve the accuracy-to-search space and accuracy-to-model size
+trade-offs over SLTs within dense (non-freezing) or sparse (non-locking) random
+networks.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>v2: Updates include additional experiments and revisions of some
+  experiments</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Sequential Neural Score Estimation: Likelihood-Free Inference with
+  Conditional Score Based Diffusion Models <span class="chip">ICML 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2210.04872v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2210.04872v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Louis Sharrock, Jack Simons, Song Liu, Mark Beaumont
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  We introduce Sequential Neural Posterior Score Estimation (SNPSE), a
+score-based method for Bayesian inference in simulator-based models. Our
+method, inspired by the remarkable success of score-based methods in generative
+modelling, leverages conditional score-based diffusion models to generate
+samples from the posterior distribution of interest. The model is trained using
+an objective function which directly estimates the score of the posterior. We
+embed the model into a sequential training procedure, which guides simulations
+using the current approximation of the posterior at the observation of
+interest, thereby reducing the simulation cost. We also introduce several
+alternative sequential approaches, and discuss their relative merits. We then
+validate our method, as well as its amortised, non-sequential, variant on
+several numerical examples, demonstrating comparable or superior performance to
+existing state-of-the-art methods such as Sequential Neural Posterior
+Estimation (SNPE).
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>ICML 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Graph-enhanced Large Language Models in Asynchronous Plan Reasoning <span class="chip">ICML-2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2402.02805v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2402.02805v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Fangru Lin, Emanuele La Malfa, Valentin Hofmann, Elle Michelle Yang, Anthony Cohn, Janet B. Pierrehumbert
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Planning is a fundamental property of human intelligence. Reasoning about
+asynchronous plans is challenging since it requires sequential and parallel
+planning to optimize time costs. Can large language models (LLMs) succeed at
+this task? Here, we present the first large-scale study investigating this
+question. We find that a representative set of closed and open-source LLMs,
+including GPT-4 and LLaMA-2, behave poorly when not supplied with illustrations
+about the task-solving process in our benchmark AsyncHow. We propose a novel
+technique called Plan Like a Graph (PLaG) that combines graphs with natural
+language prompts and achieves state-of-the-art results. We show that although
+PLaG can boost model performance, LLMs still suffer from drastic degradation
+when task complexity increases, highlighting the limits of utilizing LLMs for
+simulating digital devices. We see our study as an exciting step towards using
+LLMs as efficient autonomous agents. Our code and data are available at
+https://github.com/fangru-lin/graph-llm-asynchow-plan.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted at ICML-2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Accelerating Graph Neural Networks via Edge Pruning for Power Allocation
+  in Wireless Networks 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2305.12639v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2305.12639v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Lili Chen, Jingge Zhu, Jamie Evans
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Graph Neural Networks (GNNs) have recently emerged as a promising approach to
+tackling power allocation problems in wireless networks. Since unpaired
+transmitters and receivers are often spatially distant, the distance-based
+threshold is proposed to reduce the computation time by excluding or including
+the channel state information in GNNs. In this paper, we are the first to
+introduce a neighbour-based threshold approach to GNNs to reduce the time
+complexity. Furthermore, we conduct a comprehensive analysis of both
+distance-based and neighbour-based thresholds and provide recommendations for
+selecting the appropriate value in different communication channel scenarios.
+We design the corresponding neighbour-based Graph Neural Networks (N-GNN) with
+the aim of allocating transmit powers to maximise the network throughput. Our
+results show that our proposed N-GNN offer significant advantages in terms of
+reducing time complexity while preserving strong performance and generalisation
+capacity. Besides, we show that by choosing a suitable threshold, the time
+complexity is reduced from O(|V|^2) to O(|V|), where |V| is the total number of
+transceiver pairs.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Published in 2023 IEEE Global Communications Conference Workshops (GC
+  Workshops)</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Unlock the Power of Algorithm Features: A Generalization Analysis for
+  Algorithm Selection 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.11349v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.11349v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Xingyu Wu, Yan Zhong, Jibin Wu, Yuxiao Huang, Sheng-hao Wu, Kay Chen Tan
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  In the algorithm selection research, the discussion surrounding algorithm
+features has been significantly overshadowed by the emphasis on problem
+features. Although a few empirical studies have yielded evidence regarding the
+effectiveness of algorithm features, the potential benefits of incorporating
+algorithm features into algorithm selection models and their suitability for
+different scenarios remain unclear. In this paper, we address this gap by
+proposing the first provable guarantee for algorithm selection based on
+algorithm features, taking a generalization perspective. We analyze the
+benefits and costs associated with algorithm features and investigate how the
+generalization error is affected by different factors. Specifically, we examine
+adaptive and predefined algorithm features under transductive and inductive
+learning paradigms, respectively, and derive upper bounds for the
+generalization error based on their model's Rademacher complexity. Our
+theoretical findings not only provide tight upper bounds, but also offer
+analytical insights into the impact of various factors, such as the training
+scale of problem instances and candidate algorithms, model parameters, feature
+values, and distributional differences between the training and test data.
+Notably, we demonstrate how models will benefit from algorithm features in
+complex scenarios involving many algorithms, and proves the positive
+correlation between generalization error bound and $\chi^2$-divergence of
+distributions.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ iMove: Exploring Bio-impedance Sensing for Fitness Activity Recognition 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2402.09445v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2402.09445v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Mengxi Liu, Vitor Fortes Rey, Yu Zhang, Lala Shakti Swarup Ray, Bo Zhou, Paul Lukowicz
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Automatic and precise fitness activity recognition can be beneficial in
+aspects from promoting a healthy lifestyle to personalized preventative
+healthcare. While IMUs are currently the prominent fitness tracking modality,
+through iMove, we show bio-impedence can help improve IMU-based fitness
+tracking through sensor fusion and contrastive learning.To evaluate our
+methods, we conducted an experiment including six upper body fitness activities
+performed by ten subjects over five days to collect synchronized data from
+bio-impedance across two wrists and IMU on the left wrist.The contrastive
+learning framework uses the two modalities to train a better IMU-only
+classification model, where bio-impedance is only required at the training
+phase, by which the average Macro F1 score with the input of a single IMU was
+improved by 3.22 \% reaching 84.71 \% compared to the 81.49 \% of the IMU
+baseline model. We have also shown how bio-impedance can improve human activity
+recognition (HAR) directly through sensor fusion, reaching an average Macro F1
+score of 89.57 \% (two modalities required for both training and inference)
+even if Bio-impedance alone has an average macro F1 score of 75.36 \%, which is
+outperformed by IMU alone. In addition, similar results were obtained in an
+extended study on lower body fitness activity classification, demonstrating the
+generalisability of our approach.Our findings underscore the potential of
+sensor fusion and contrastive learning as valuable tools for advancing fitness
+activity recognition, with bio-impedance playing a pivotal role in augmenting
+the capabilities of IMU-based systems.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted by percom2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ OpenRLHF: An Easy-to-use, Scalable and High-performance RLHF Framework 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.11143v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.11143v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Jian Hu, Xibin Wu, Weixun Wang,  Xianyu, Dehao Zhang, Yu Cao
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  As large language models (LLMs) continue to grow by scaling laws,
+reinforcement learning from human feedback (RLHF) has gained significant
+attention due to its outstanding performance. However, unlike pretraining or
+fine-tuning a single model, scaling reinforcement learning from human feedback
+(RLHF) for training large language models poses coordination challenges across
+four models. We present OpenRLHF, an open-source framework enabling efficient
+RLHF scaling. Unlike existing RLHF frameworks that co-locate four models on the
+same GPUs, OpenRLHF re-designs scheduling for the models beyond 70B parameters
+using Ray, vLLM, and DeepSpeed, leveraging improved resource utilization and
+diverse training approaches. Integrating seamlessly with Hugging Face, OpenRLHF
+provides an out-of-the-box solution with optimized algorithms and launch
+scripts, which ensures user-friendliness. OpenRLHF implements RLHF, DPO,
+rejection sampling, and other alignment techniques. Empowering state-of-the-art
+LLM development, OpenRLHF's code is available at
+https://github.com/OpenLLMAI/OpenRLHF.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Graph Language Models <span class="chip">ACL 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2401.07105v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2401.07105v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Moritz Plenz, Anette Frank
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  While Language Models (LMs) are the workhorses of NLP, their interplay with
+structured knowledge graphs (KGs) is still actively researched. Current methods
+for encoding such graphs typically either (i) linearize them for embedding with
+LMs -- which underutilize structural information, or (ii) use Graph Neural
+Networks (GNNs) to preserve the graph structure -- but GNNs cannot represent
+text features as well as pretrained LMs. In our work we introduce a novel LM
+type, the Graph Language Model (GLM), that integrates the strengths of both
+approaches and mitigates their weaknesses. The GLM parameters are initialized
+from a pretrained LM to enhance understanding of individual graph concepts and
+triplets. Simultaneously, we design the GLM's architecture to incorporate graph
+biases, thereby promoting effective knowledge distribution within the graph.
+This enables GLMs to process graphs, texts, and interleaved inputs of both.
+Empirical evaluations on relation classification tasks show that GLM embeddings
+surpass both LM- and GNN-based baselines in supervised and zero-shot setting,
+demonstrating their versatility.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted at ACL 2024. 9 pages, 10 figures, 9 tables</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Particle identification with machine learning from incomplete data in
+  the ALICE experiment 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2403.17436v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2403.17436v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Maja Karwowska, Łukasz Graczykowski, Kamil Deja, Miłosz Kasak, Małgorzata Janik
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  The ALICE experiment at the LHC measures properties of the strongly
+interacting matter formed in ultrarelativistic heavy-ion collisions. Such
+studies require accurate particle identification (PID). ALICE provides PID
+information via several detectors for particles with momentum from about 100
+MeV/c up to 20 GeV/c. Traditionally, particles are selected with rectangular
+cuts. A much better performance can be achieved with machine learning (ML)
+methods. Our solution uses multiple neural networks (NN) serving as binary
+classifiers. Moreover, we extended our particle classifier with Feature Set
+Embedding and attention in order to train on data with incomplete samples. We
+also present the integration of the ML project with the ALICE analysis
+software, and we discuss domain adaptation, the ML technique needed to transfer
+the knowledge between simulated and real experimental data.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Proceedings of 3rd Artificial Intelligence for the Electron Ion
+  Collider workshop -- AI4EIC2023, 28.11-1.12.2023. Accepted in JINST</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ ProtoGate: Prototype-based Neural Networks with Global-to-local Feature
+  Selection for Tabular Biomedical Data <span class="chip">ICML2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2306.12330v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2306.12330v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Xiangjian Jiang, Andrei Margeloiu, Nikola Simidjievski, Mateja Jamnik
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Tabular biomedical data poses challenges in machine learning because it is
+often high-dimensional and typically low-sample-size (HDLSS). Previous research
+has attempted to address these challenges via local feature selection, but
+existing approaches often fail to achieve optimal performance due to their
+limitation in identifying globally important features and their susceptibility
+to the co-adaptation problem. In this paper, we propose ProtoGate, a
+prototype-based neural model for feature selection on HDLSS data. ProtoGate
+first selects instance-wise features via adaptively balancing global and local
+feature selection. Furthermore, ProtoGate employs a non-parametric
+prototype-based prediction mechanism to tackle the co-adaptation problem,
+ensuring the feature selection results and predictions are consistent with
+underlying data clusters. We conduct comprehensive experiments to evaluate the
+performance and interpretability of ProtoGate on synthetic and real-world
+datasets. The results show that ProtoGate generally outperforms
+state-of-the-art methods in prediction accuracy by a clear margin while
+providing high-fidelity feature selection and explainable predictions. Code is
+available at https://github.com/SilenceX12138/ProtoGate.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted by the Forty-first International Conference on Machine
+  Learning (ICML2024)</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Weak Augmentation Guided Relational <span class="highlight-title">Self-Supervised</span> Learning <span class="chip">NeurIPS 2021</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2203.08717v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2203.08717v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Mingkai Zheng, Shan You, Fei Wang, Chen Qian, Changshui Zhang, Xiaogang Wang, Chang Xu
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Self-supervised Learning (SSL) including the mainstream contrastive learning
+has achieved great success in learning visual representations without data
+annotations. However, most methods mainly focus on the instance level
+information (\ie, the different augmented images of the same instance should
+have the same feature or cluster into the same class), but there is a lack of
+attention on the relationships between different instances. In this paper, we
+introduce a novel SSL paradigm, which we term as relational self-supervised
+learning (ReSSL) framework that learns representations by modeling the
+relationship between different instances. Specifically, our proposed method
+employs sharpened distribution of pairwise similarities among different
+instances as \textit{relation} metric, which is thus utilized to match the
+feature embeddings of different augmentations. To boost the performance, we
+argue that weak augmentations matter to represent a more reliable relation, and
+leverage momentum strategy for practical efficiency. The designed asymmetric
+predictor head and an InfoNCE warm-up strategy enhance the robustness to
+hyper-parameters and benefit the resulting performance. Experimental results
+show that our proposed ReSSL substantially outperforms the state-of-the-art
+methods across different network architectures, including various lightweight
+networks (\eg, EfficientNet and MobileNet).
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Extended version of NeurIPS 2021 paper. arXiv admin note: substantial
+  text overlap with arXiv:2107.09282</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Can LLMs Separate Instructions From Data? And What Do We Even Mean By
+  That? 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2403.06833v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2403.06833v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Egor Zverev, Sahar Abdelnabi, Soroush Tabesh, Mario Fritz, Christoph H. Lampert
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Instruction-tuned Large Language Models (LLMs) show impressive results in
+numerous practical applications, but they lack essential safety features that
+are common in other areas of computer science, particularly an explicit
+separation of instructions and data. This makes them vulnerable to
+manipulations such as indirect prompt injections and generally unsuitable for
+safety-critical tasks. Surprisingly, there is currently no established
+definition or benchmark to quantify this phenomenon. In this work, we close
+this gap by introducing a formal measure for instruction-data separation and an
+empirical variant that is calculable from a model's outputs. We also present a
+new dataset, SEP, that allows estimating the measure for real-world models. Our
+results on various LLMs show that the problem of instruction-data separation is
+real: all models fail to achieve high separation, and canonical mitigation
+techniques, such as prompt engineering and fine-tuning, either fail to
+substantially improve separation or reduce model utility. The source code and
+SEP dataset are openly accessible at
+https://github.com/egozverev/Shold-It-Be-Executed-Or-Processed.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>GitHub:
+  https://github.com/egozverev/Shold-It-Be-Executed-Or-Processed. 10 pages main
+  text, 30 pages in total</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ FAdam: Adam is a natural gradient optimizer using diagonal empirical
+  Fisher information 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.12807v5">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.12807v5.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Dongseong Hwang
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  This paper establishes a mathematical foundation for the Adam optimizer,
+elucidating its connection to natural gradient descent through Riemannian and
+information geometry. We rigorously analyze the diagonal empirical Fisher
+information matrix (FIM) in Adam, clarifying all detailed approximations and
+advocating for the use of log probability functions as loss, which should be
+based on discrete distributions, due to the limitations of empirical FIM. Our
+analysis uncovers flaws in the original Adam algorithm, leading to proposed
+corrections such as enhanced momentum calculations, adjusted bias corrections,
+adaptive epsilon, and gradient clipping. We refine the weight decay term based
+on our theoretical framework. Our modified algorithm, Fisher Adam (FAdam),
+demonstrates superior performance across diverse domains including LLM, ASR,
+and VQ-VAE, achieving state-of-the-art results in ASR.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>21 pages, 4 figures, 6 tables</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Overcoming Saturation in Density Ratio Estimation by Iterated
+  Regularization 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2402.13891v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2402.13891v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Lukas Gruber, Markus Holzleitner, Johannes Lehner, Sepp Hochreiter, Werner Zellinger
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Estimating the ratio of two probability densities from finitely many samples,
+is a central task in machine learning and statistics. In this work, we show
+that a large class of kernel methods for density ratio estimation suffers from
+error saturation, which prevents algorithms from achieving fast error
+convergence rates on highly regular learning problems. To resolve saturation,
+we introduce iterated regularization in density ratio estimation to achieve
+fast error rates. Our methods outperform its non-iteratively regularized
+versions on benchmarks for density ratio estimation as well as on large-scale
+evaluations for importance-weighted ensembling of deep unsupervised domain
+adaptation models.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Adversarial Preference Optimization: Enhancing Your Alignment via RM-LLM
+  Game <span class="chip">ACL2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2311.08045v4">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2311.08045v4.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Pengyu Cheng, Yifan Yang, Jian Li, Yong Dai, Tianhao Hu, Peixin Cao, Nan Du, Xiaolong Li
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Human preference alignment is essential to improve the interaction quality of
+large language models (LLMs). Existing alignment methods depend on manually
+annotated preference data to guide the LLM optimization directions. However,
+continuously updating LLMs for alignment raises a distribution gap between
+model-generated samples and human-annotated responses, hindering training
+effectiveness. To mitigate this issue, previous methods require additional
+preference annotation on newly generated samples to adapt to the shifted
+distribution, which consumes a large amount of annotation resources. Targeting
+more efficient human preference optimization, we propose an Adversarial
+Preference Optimization (APO) framework, in which the LLM and the reward model
+update alternatively via a min-max game. Through adversarial training, the
+reward model can adapt to the shifted generation distribution of the LLM
+without any additional annotation. With comprehensive experiments, we find the
+proposed adversarial training framework further enhances existing alignment
+baselines in terms of LLM helpfulness and harmlessness. The code is at
+https://github.com/Linear95/APO.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted by ACL2024 findings</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Multistep Consistency Models 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2403.06807v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2403.06807v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Jonathan Heek, Emiel Hoogeboom, Tim Salimans
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Diffusion models are relatively easy to train but require many steps to
+generate samples. Consistency models are far more difficult to train, but
+generate samples in a single step.
+  In this paper we propose Multistep Consistency Models: A unification between
+Consistency Models (Song et al., 2023) and TRACT (Berthelot et al., 2023) that
+can interpolate between a consistency model and a diffusion model: a trade-off
+between sampling speed and sampling quality. Specifically, a 1-step consistency
+model is a conventional consistency model whereas a $\infty$-step consistency
+model is a diffusion model.
+  Multistep Consistency Models work really well in practice. By increasing the
+sample budget from a single step to 2-8 steps, we can train models more easily
+that generate higher quality samples, while retaining much of the sampling
+speed benefits. Notable results are 1.4 FID on Imagenet 64 in 8 step and 2.1
+FID on Imagenet128 in 8 steps with consistency distillation, using simple
+losses without adversarial training. We also show that our method scales to a
+text-to-image diffusion model, generating samples that are close to the quality
+of the original model.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Deep Learning Calabi-Yau four folds with hybrid and recurrent neural
+  network architectures 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.17406v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.17406v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        H. L. Dao
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  In this work, we report the results of applying deep learning based on hybrid
+convolutional-recurrent and purely recurrent neural network architectures to
+the dataset of almost one million complete intersection Calabi-Yau four-folds
+(CICY4) to machine-learn their four Hodge numbers $h^{1,1}, h^{2,1}, h^{3,1},
+h^{2,2}$. In particular, we explored and experimented with twelve different
+neural network models, nine of which are convolutional-recurrent (CNN-RNN)
+hybrids with the RNN unit being either GRU (Gated Recurrent Unit) or Long Short
+Term Memory (LSTM). The remaining four models are purely recurrent neural
+networks based on LSTM. In terms of the $h^{1,1}, h^{2,1}, h^{3,1}, h^{2,2}$
+prediction accuracies, at 72% training ratio, our best performing individual
+model is CNN-LSTM-400, a hybrid CNN-LSTM with the LSTM hidden size of 400,
+which obtained 99.74%, 98.07%, 95.19%, 81.01%, our second best performing
+individual model is LSTM-448, an LSTM-based model with the hidden size of 448,
+which obtained 99.74%, 97.51%, 94.24%, and 78.63%. These results were improved
+by forming ensembles of the top two, three or even four models. Our best
+ensemble, consisting of the top four models, achieved the accuracies of 99.84%,
+98.71%, 96.26%, 85.03%. At 80% training ratio, the top two performing models
+LSTM-448 and LSTM-424 are both LSTM-based with the hidden sizes of 448 and 424.
+Compared with the 72% training ratio, there is a significant improvement of
+accuracies, which reached 99.85%, 98.66%, 96.26%, 84.77% for the best
+individual model and 99.90%, 99.03%, 97.97%, 87.34% for the best ensemble.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>v2: new (improved) results added, references added, typos corrected</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Federated Learning under Partially Class-Disjoint Data via Manifold
+  Reshaping 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.18983v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.18983v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Ziqing Fan, Jiangchao Yao, Ruipeng Zhang, Lingjuan Lyu, Ya Zhang, Yanfeng Wang
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Statistical heterogeneity severely limits the performance of federated
+learning (FL), motivating several explorations e.g., FedProx, MOON and FedDyn,
+to alleviate this problem. Despite effectiveness, their considered scenario
+generally requires samples from almost all classes during the local training of
+each client, although some covariate shifts may exist among clients. In fact,
+the natural case of partially class-disjoint data (PCDD), where each client
+contributes a few classes (instead of all classes) of samples, is practical yet
+underexplored. Specifically, the unique collapse and invasion characteristics
+of PCDD can induce the biased optimization direction in local training, which
+prevents the efficiency of federated learning. To address this dilemma, we
+propose a manifold reshaping approach called FedMR to calibrate the feature
+space of local training. Our FedMR adds two interplaying losses to the vanilla
+federated learning: one is intra-class loss to decorrelate feature dimensions
+for anti-collapse; and the other one is inter-class loss to guarantee the
+proper margin among categories in the feature expansion. We conduct extensive
+experiments on a range of datasets to demonstrate that our FedMR achieves much
+higher accuracy and better communication efficiency. Source code is available
+at: https://github.com/MediaBrain-SJTU/FedMR.git.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ CacheBlend: Fast Large Language Model Serving for RAG with Cached
+  Knowledge Fusion 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.16444v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.16444v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Jiayi Yao, Hanchen Li, Yuhan Liu, Siddhant Ray, Yihua Cheng, Qizheng Zhang, Kuntai Du, Shan Lu, Junchen Jiang
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Large language models (LLMs) often incorporate multiple text chunks in their
+inputs to provide the necessary contexts. To speed up the prefill of the long
+LLM inputs, one can pre-compute the KV cache of a text and re-use the KV cache
+when the context is reused as the prefix of another LLM input. However, the
+reused text chunks are not always the input prefix, and when they are not,
+their precomputed KV caches cannot be directly used since they ignore the
+text's cross-attention with the preceding text in the LLM input. Thus, the
+benefits of reusing KV caches remain largely unrealized.
+  This paper tackles just one question: when an LLM input contains multiple
+text chunks, how to quickly combine their precomputed KV caches in order to
+achieve the same generation quality as the expensive full prefill (i.e.,
+without reusing KV cache)? We present CacheBlend, a scheme that reuses the
+pre-computed KV caches, regardless prefix or not, and selectively recomputes
+the KV values of a small subset of tokens to partially update each reused KV
+cache. In the meantime,the small extra delay for recomputing some tokens can be
+pipelined with the retrieval of KV caches within the same job,allowing
+CacheBlend to store KV caches in slower devices with more storage capacity
+while retrieving them without increasing the inference delay. By comparing
+CacheBlend with the state-of-the-art KV cache reusing schemes on three
+open-source LLMs of various sizes and four popular benchmark datasets of
+different tasks, we show that CacheBlend reduces time-to-first-token (TTFT) by
+2.2-3.3X and increases the inference throughput by 2.8-5X, compared with full
+KV recompute, without compromising generation quality or incurring more storage
+cost.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Flood and Echo Net: Algorithmically Aligned GNNs that Generalize 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2310.06970v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2310.06970v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Joël Mathys, Florian Grötschla, Kalyan Varma Nadimpalli, Roger Wattenhofer
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Most Graph Neural Networks follow the standard message-passing framework
+where, in each step, all nodes simultaneously communicate with each other. We
+want to challenge this paradigm by aligning the computation more closely to the
+execution of distributed algorithms and propose the Flood and Echo Net. A
+single round of a Flood and Echo Net consists of an origin node and a flooding
+phase followed by an echo phase. First, during the flooding, messages are sent
+from the origin and propagated outwards throughout the entire graph. Then,
+during the echo, the message flow reverses and messages are sent back towards
+the origin. As nodes are only sparsely activated upon receiving a message, this
+leads to a wave-like activation pattern that traverses the graph. Through these
+sparse but parallel activations, the Net becomes more expressive than
+traditional MPNNs which are limited by the 1-WL test and also is provably more
+efficient in terms of message complexity. Moreover, the mechanism's inherent
+ability to generalize across graphs of varying sizes positions it as a
+practical architecture for the task of algorithmic learning. We test the Flood
+and Echo Net on a variety of synthetic tasks and the SALSA-CLRS benchmark and
+find that the algorithmic alignment of the execution improves generalization to
+larger graph sizes.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>9 pages</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Functional Programming Paradigm of Python for Scientific Computation
+  Pipeline Integration 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.16956v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.16956v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Chen Zhang, Lecheng Jia, Wei Zhang, Ning Wen
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  The advent of modern data processing has led to an increasing tendency
+towards interdisciplinarity, which frequently involves the importation of
+different technical approaches. Consequently, there is an urgent need for a
+unified data control system to facilitate the integration of varying libraries.
+This integration is of profound significance in accelerating prototype
+verification, optimising algorithm performance and minimising maintenance
+costs. This paper presents a novel functional programming (FP) paradigm based
+on the Python architecture and associated suites in programming practice,
+designed for the integration of pipelines of different data mapping operations.
+In particular, the solution is intended for the integration of scientific
+computation flows, which affords a robust yet flexible solution for the
+aforementioned challenges.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>16 pages</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Estimating the normal-inverse-Wishart distribution 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.16088v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.16088v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Jonathan So
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  The normal-inverse-Wishart (NIW) distribution is commonly used as a prior
+distribution for the mean and covariance parameters of a multivariate normal
+distribution. The family of NIW distributions is also a minimal exponential
+family. In this short note we describe a convergent procedure for converting
+from mean parameters to natural parameters in the NIW family, or --
+equivalently -- for performing maximum likelihood estimation of the natural
+parameters given observed sufficient statistics. This is needed, for example,
+when using a NIW base family in expectation propagation.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Graph-based Forecasting with Missing Data through Spatiotemporal
+  Downsampling <span class="chip">ICML 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2402.10634v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2402.10634v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Ivan Marisca, Cesare Alippi, Filippo Maria Bianchi
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Given a set of synchronous time series, each associated with a sensor-point
+in space and characterized by inter-series relationships, the problem of
+spatiotemporal forecasting consists of predicting future observations for each
+point. Spatiotemporal graph neural networks achieve striking results by
+representing the relationships across time series as a graph. Nonetheless, most
+existing methods rely on the often unrealistic assumption that inputs are
+always available and fail to capture hidden spatiotemporal dynamics when part
+of the data is missing. In this work, we tackle this problem through
+hierarchical spatiotemporal downsampling. The input time series are
+progressively coarsened over time and space, obtaining a pool of
+representations that capture heterogeneous temporal and spatial dynamics.
+Conditioned on observations and missing data patterns, such representations are
+combined by an interpretable attention mechanism to generate the forecasts. Our
+approach outperforms state-of-the-art methods on synthetic and real-world
+benchmarks under different missing data distributions, particularly in the
+presence of contiguous blocks of missing values.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted at ICML 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Representation Surgery: Theory and Practice of Affine Steering <span class="chip">ICML 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2402.09631v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2402.09631v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Shashwat Singh, Shauli Ravfogel, Jonathan Herzig, Roee Aharoni, Ryan Cotterell, Ponnurangam Kumaraguru
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Language models often exhibit undesirable behavior, e.g., generating toxic or
+gender-biased text. In the case of neural language models, an encoding of the
+undesirable behavior is often present in the model's representations. Thus, one
+natural (and common) approach to prevent the model from exhibiting undesirable
+behavior is to steer the model's representations in a manner that reduces the
+probability of it generating undesirable text. This paper investigates the
+formal and empirical properties of steering functions, i.e., transformation of
+the neural language model's representations that alter its behavior. First, we
+derive two optimal, in the least-squares sense, affine steering functions under
+different constraints. Our theory provides justification for existing
+approaches and offers a novel, improved steering approach. Second, we offer a
+series of experiments that demonstrate the empirical effectiveness of the
+methods in mitigating bias and reducing toxic generation.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted in ICML 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Examining properness in the external validation of survival models with
+  squared and logarithmic losses 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2212.05260v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2212.05260v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Raphael Sonabend, John Zobolas, Philipp Kopper, Lukas Burk, Andreas Bender
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Scoring rules promote rational and honest decision-making, which is becoming
+increasingly important for automated procedures in `auto-ML'. In this paper we
+survey common squared and logarithmic scoring rules for survival analysis and
+determine which losses are proper and improper. We prove that commonly utilised
+squared and logarithmic scoring rules that are claimed to be proper are in fact
+improper, such as the Integrated Survival Brier Score (ISBS). We further prove
+that under a strict set of assumptions a class of scoring rules is strictly
+proper for, what we term, `approximate' survival losses. Despite the difference
+in properness, experiments in simulated and real-world datasets show there is
+no major difference between improper and proper versions of the widely-used
+ISBS, ensuring that we can reasonably trust previous experiments utilizing the
+original score for evaluation purposes. We still advocate for the use of proper
+scoring rules, as even minor differences between losses can have important
+implications in automated processes such as model tuning. We hope our findings
+encourage further research into the properties of survival measures so that
+robust and honest evaluation of survival models can be achieved.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Robust and Conjugate Gaussian Process Regression 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2311.00463v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2311.00463v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Matias Altamirano, François-Xavier Briol, Jeremias Knoblauch
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  To enable closed form conditioning, a common assumption in Gaussian process
+(GP) regression is independent and identically distributed Gaussian observation
+noise. This strong and simplistic assumption is often violated in practice,
+which leads to unreliable inferences and uncertainty quantification.
+Unfortunately, existing methods for robustifying GPs break closed-form
+conditioning, which makes them less attractive to practitioners and
+significantly more computationally expensive. In this paper, we demonstrate how
+to perform provably robust and conjugate Gaussian process (RCGP) regression at
+virtually no additional cost using generalised Bayesian inference. RCGP is
+particularly versatile as it enables exact conjugate closed form updates in all
+settings where standard GPs admit them. To demonstrate its strong empirical
+performance, we deploy RCGP for problems ranging from Bayesian optimisation to
+sparse variational Gaussian processes.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ SPEAR:Exact Gradient Inversion of Batches in Federated Learning 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2403.03945v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2403.03945v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Dimitar I. Dimitrov, Maximilian Baader, Mark Niklas Müller, Martin Vechev
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Federated learning is a framework for collaborative machine learning where
+clients only share gradient updates and not their private data with a server.
+However, it was recently shown that gradient inversion attacks can reconstruct
+this data from the shared gradients. In the important honest-but-curious
+setting, existing attacks enable exact reconstruction only for a batch size of
+$b=1$, with larger batches permitting only approximate reconstruction. In this
+work, we propose SPEAR, the first algorithm reconstructing whole batches with
+$b >1$ exactly. SPEAR combines insights into the explicit low-rank structure of
+gradients with a sampling-based algorithm. Crucially, we leverage ReLU-induced
+gradient sparsity to precisely filter out large numbers of incorrect samples,
+making a final reconstruction step tractable. We provide an efficient GPU
+implementation for fully connected networks and show that it recovers
+high-dimensional ImageNet inputs in batches of up to $b \lesssim 25$ exactly
+while scaling to large networks. Finally, we show theoretically that much
+larger batches can be reconstructed with high probability given exponential
+time.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ A <span class="highlight-title">survey</span> on multi-player bandits 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2211.16275v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2211.16275v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Etienne Boursier, Vianney Perchet
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Due mostly to its application to cognitive radio networks, multiplayer
+bandits gained a lot of interest in the last decade. A considerable progress
+has been made on its theoretical aspect. However, the current algorithms are
+far from applicable and many obstacles remain between these theoretical results
+and a possible implementation of multiplayer bandits algorithms in real
+cognitive radio networks. This survey contextualizes and organizes the rich
+multiplayer bandits literature. In light of the existing works, some clear
+directions for future research appear. We believe that a further study of these
+different directions might lead to theoretical algorithms adapted to real-world
+situations.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>final version, accepted at JMLR</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Do we need rebalancing strategies? A theoretical and empirical study
+  around SMOTE and its variants 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2402.03819v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2402.03819v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Abdoulaye Sakho, Emmanuel Malherbe, Erwan Scornet
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Synthetic Minority Oversampling Technique (SMOTE) is a common rebalancing
+strategy for handling imbalanced tabular data sets. However, few works analyze
+SMOTE theoretically. In this paper, we prove that SMOTE (with default
+parameter) simply copies the original minority samples asymptotically. We also
+prove that SMOTE exhibits boundary artifacts, thus justifying existing SMOTE
+variants. Then we introduce two new SMOTE-related strategies, and compare them
+with state-of-the-art rebalancing procedures. Surprisingly, for most data sets,
+we observe that applying no rebalancing strategy is competitive in terms of
+predictive performances, with tuned random forests. For highly imbalanced data
+sets, our new method, named Multivariate Gaussian SMOTE, is competitive.
+Besides, our analysis sheds some lights on the behavior of common rebalancing
+strategies, when used in conjunction with random forests.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Quality-Diversity Actor-Critic: Learning High-Performing and Diverse
+  Behaviors via Value and Successor Features Critics <span class="chip">ICML 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2403.09930v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2403.09930v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Luca Grillotti, Maxence Faldor, Borja G. León, Antoine Cully
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  A key aspect of intelligence is the ability to demonstrate a broad spectrum
+of behaviors for adapting to unexpected situations. Over the past decade,
+advancements in deep reinforcement learning have led to groundbreaking
+achievements to solve complex continuous control tasks. However, most
+approaches return only one solution specialized for a specific problem. We
+introduce Quality-Diversity Actor-Critic (QDAC), an off-policy actor-critic
+deep reinforcement learning algorithm that leverages a value function critic
+and a successor features critic to learn high-performing and diverse behaviors.
+In this framework, the actor optimizes an objective that seamlessly unifies
+both critics using constrained optimization to (1) maximize return, while (2)
+executing diverse skills. Compared with other Quality-Diversity methods, QDAC
+achieves significantly higher performance and more diverse behaviors on six
+challenging continuous control locomotion tasks. We also demonstrate that we
+can harness the learned skills to adapt better than other baselines to five
+perturbed environments. Finally, qualitative analyses showcase a range of
+remarkable behaviors: adaptive-intelligent-robotics.github.io/QDAC.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>The first two authors contributed equally to this work. Accepted at
+  ICML 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ The Role of Learning Algorithms in Collective Action <span class="chip">ICML</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.06582v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.06582v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Omri Ben-Dov, Jake Fawkes, Samira Samadi, Amartya Sanyal
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Collective action in machine learning is the study of the control that a
+coordinated group can have over machine learning algorithms. While previous
+research has concentrated on assessing the impact of collectives against
+Bayes~(sub)-optimal classifiers, this perspective is limited in that it does
+not account for the choice of learning algorithm. Classifiers seldom behave
+like Bayes classifiers and are influenced by the choice of learning algorithms
+along with their inherent biases. In this work, we initiate the study of how
+the choice of the learning algorithm plays a role in the success of a
+collective in practical settings. Specifically, we focus on distributionally
+robust optimization (DRO), popular for improving a worst group error, and on
+the ubiquitous stochastic gradient descent (SGD), due to its inductive bias for
+"simpler" functions. Our empirical results, supported by a theoretical
+foundation, show that the effective size and success of the collective are
+highly dependent on properties of the learning algorithm. This highlights the
+necessity of taking the learning algorithm into account when studying the
+impact of collective action in machine learning.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted at the International Conference in Machine Learning (ICML),
+  2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ VREM-FL: Mobility-Aware Computation-Scheduling Co-Design for Vehicular
+  Federated Learning 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2311.18741v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2311.18741v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Luca Ballotta, Nicolò Dal Fabbro, Giovanni Perin, Luca Schenato, Michele Rossi, Giuseppe Piro
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Assisted and autonomous driving are rapidly gaining momentum and will soon
+become a reality. Artificial intelligence and machine learning are regarded as
+key enablers thanks to the massive amount of data that smart vehicles will
+collect from onboard sensors. Federated learning is one of the most promising
+techniques for training global machine learning models while preserving data
+privacy of vehicles and optimizing communications resource usage. In this
+article, we propose vehicular radio environment map federated learning
+(VREM-FL), a computation-scheduling co-design for vehicular federated learning
+that combines mobility of vehicles with 5G radio environment maps. VREM-FL
+jointly optimizes learning performance of the global model and wisely allocates
+communication and computation resources. This is achieved by orchestrating
+local computations at the vehicles in conjunction with transmission of their
+local models in an adaptive and predictive fashion, by exploiting radio channel
+maps. The proposed algorithm can be tuned to trade training time for radio
+resource usage. Experimental results demonstrate that VREM-FL outperforms
+literature benchmarks for both a linear regression model (learning time reduced
+by 28%) and a deep neural network for semantic image segmentation (doubling the
+number of model updates within the same time window).
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>This work has been submitted to IEEE for possible publication.
+  Copyright may be transferred without notice, after which this version may no
+  longer be accessible</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Learning to optimize with convergence guarantees using nonlinear system
+  theory 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2403.09389v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2403.09389v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Andrea Martin, Luca Furieri
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  The increasing reliance on numerical methods for controlling dynamical
+systems and training machine learning models underscores the need to devise
+algorithms that dependably and efficiently navigate complex optimization
+landscapes. Classical gradient descent methods offer strong theoretical
+guarantees for convex problems; however, they demand meticulous hyperparameter
+tuning for non-convex ones. The emerging paradigm of learning to optimize (L2O)
+automates the discovery of algorithms with optimized performance leveraging
+learning models and data - yet, it lacks a theoretical framework to analyze
+convergence of the learned algorithms. In this paper, we fill this gap by
+harnessing nonlinear system theory. Specifically, we propose an unconstrained
+parametrization of all convergent algorithms for smooth non-convex objective
+functions. Notably, our framework is directly compatible with automatic
+differentiation tools, ensuring convergence by design while learning to
+optimize.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Published in the IEEE Control Systems Letters</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ A finite operator learning technique for mapping the elastic properties
+  of microstructures to their mechanical deformations 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2404.00074v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2404.00074v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Shahed Rezaei, Reza Najian Asl, Shirko Faroughi, Mahdi Asgharzadeh, Ali Harandi, Rasoul Najafi Koopas, Gottfried Laschet, Stefanie Reese, Markus Apel
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  To obtain fast solutions for governing physical equations in solid mechanics,
+we introduce a method that integrates the core ideas of the finite element
+method with physics-informed neural networks and concept of neural operators.
+This approach generalizes and enhances each method, learning the parametric
+solution for mechanical problems without relying on data from other resources
+(e.g. other numerical solvers). We propose directly utilizing the available
+discretized weak form in finite element packages to construct the loss
+functions algebraically, thereby demonstrating the ability to find solutions
+even in the presence of sharp discontinuities. Our focus is on micromechanics
+as an example, where knowledge of deformation and stress fields for a given
+heterogeneous microstructure is crucial for further design applications. The
+primary parameter under investigation is the Young's modulus distribution
+within the heterogeneous solid system. Our investigations reveal that
+physics-based training yields higher accuracy compared to purely data-driven
+approaches for unseen microstructures. Additionally, we offer two methods to
+directly improve the process of obtaining high-resolution solutions, avoiding
+the need to use basic interpolation techniques. First is based on an
+autoencoder approach to enhance the efficiency for calculation on high
+resolution grid point. Next, Fourier-based parametrization is utilized to
+address complex 2D and 3D problems in micromechanics. The latter idea aims to
+represent complex microstructures efficiently using Fourier coefficients.
+Comparisons with other well-known operator learning algorithms, further
+emphasize the advantages of the newly proposed method.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ MiniCPM: Unveiling the Potential of Small Language Models with Scalable
+  Training Strategies 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2404.06395v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2404.06395v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Shengding Hu, Yuge Tu, Xu Han, Chaoqun He, Ganqu Cui, Xiang Long, Zhi Zheng, Yewei Fang, Yuxiang Huang, Weilin Zhao, Xinrong Zhang, Zheng Leng Thai, Kaihuo Zhang, Chongyi Wang, Yuan Yao, Chenyang Zhao, Jie Zhou, Jie Cai, Zhongwu Zhai, Ning Ding, Chao Jia, Guoyang Zeng, Dahai Li, Zhiyuan Liu, Maosong Sun
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  The burgeoning interest in developing Large Language Models (LLMs) with up to
+trillion parameters has been met with concerns regarding resource efficiency
+and practical expense, particularly given the immense cost of experimentation.
+This scenario underscores the importance of exploring the potential of Small
+Language Models (SLMs) as a resource-efficient alternative. In this context, we
+introduce MiniCPM, specifically the 1.2B and 2.4B non-embedding parameter
+variants, not only excel in their respective categories but also demonstrate
+capabilities on par with 7B-13B LLMs. While focusing on SLMs, our approach
+exhibits scalability in both model and data dimensions for future LLM research.
+Regarding model scaling, we employ extensive model wind tunnel experiments for
+stable and optimal scaling. For data scaling, we introduce a
+Warmup-Stable-Decay (WSD) learning rate scheduler (LRS), conducive to
+continuous training and domain adaptation. We present an in-depth analysis of
+the intriguing training dynamics that occurred in the WSD LRS. With WSD LRS, we
+are now able to efficiently study data-model scaling law without extensive
+retraining experiments on both axes of model and data, from which we derive the
+much higher compute optimal data-model ratio than Chinchilla Optimal.
+Additionally, we introduce MiniCPM family, including MiniCPM-DPO, MiniCPM-MoE
+and MiniCPM-128K, whose excellent performance further cementing MiniCPM's
+foundation in diverse SLM applications. MiniCPM models are available publicly
+at https://github.com/OpenBMB/MiniCPM .
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>revise according to peer review</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Deep Optimal Transport for Domain Adaptation on SPD Manifolds 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2201.05745v4">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2201.05745v4.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Ce Ju, Cuntai Guan
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  The machine learning community has shown increasing interest in addressing
+the domain adaptation problem on symmetric positive definite (SPD) manifolds.
+This interest is primarily driven by the complexities of neuroimaging data
+generated from brain signals, which often exhibit shifts in data distribution
+across recording sessions. These neuroimaging data, represented by signal
+covariance matrices, possess the mathematical properties of symmetry and
+positive definiteness. However, applying conventional domain adaptation methods
+is challenging because these mathematical properties can be disrupted when
+operating on covariance matrices. In this study, we introduce a novel geometric
+deep learning-based approach utilizing optimal transport on SPD manifolds to
+manage discrepancies in both marginal and conditional distributions between the
+source and target domains. We evaluate the effectiveness of this approach in
+three cross-session brain-computer interface scenarios and provide visualized
+results for further insights. The GitHub repository of this study can be
+accessed at
+https://github.com/GeometricBCI/Deep-Optimal-Transport-for-Domain-Adaptation-on-SPD-Manifolds.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>This work has been submitted to the IEEE for possible publication.
+  Copyright may be transferred without notice, after which this version may no
+  longer be accessible</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ ACEGEN: Reinforcement learning of generative chemical agents for drug
+  discovery 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.04657v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.04657v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Albert Bou, Morgan Thomas, Sebastian Dittert, Carles Navarro Ramírez, Maciej Majewski, Ye Wang, Shivam Patel, Gary Tresadern, Mazen Ahmad, Vincent Moens, Woody Sherman, Simone Sciabola, Gianni De Fabritiis
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  In recent years, reinforcement learning (RL) has emerged as a valuable tool
+in drug design, offering the potential to propose and optimize molecules with
+desired properties. However, striking a balance between capabilities,
+flexibility, reliability, and efficiency remains challenging due to the
+complexity of advanced RL algorithms and the significant reliance on
+specialized code. In this work, we introduce ACEGEN, a comprehensive and
+streamlined toolkit tailored for generative drug design, built using TorchRL, a
+modern RL library that offers thoroughly tested reusable components. We
+validate ACEGEN by benchmarking against other published generative modeling
+algorithms and show comparable or improved performance. We also show examples
+of ACEGEN applied in multiple drug discovery case studies. ACEGEN is accessible
+at \url{https://github.com/acellera/acegen-open} and available for use under
+the MIT license.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ A Practical Approach to Novel Class Discovery in Tabular Data 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2311.05440v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2311.05440v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Colin Troisemaine, Alexandre Reiffers-Masson, Stéphane Gosselin, Vincent Lemaire, Sandrine Vaton
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  The problem of Novel Class Discovery (NCD) consists in extracting knowledge
+from a labeled set of known classes to accurately partition an unlabeled set of
+novel classes. While NCD has recently received a lot of attention from the
+community, it is often solved on computer vision problems and under unrealistic
+conditions. In particular, the number of novel classes is usually assumed to be
+known in advance, and their labels are sometimes used to tune hyperparameters.
+Methods that rely on these assumptions are not applicable in real-world
+scenarios. In this work, we focus on solving NCD in tabular data when no prior
+knowledge of the novel classes is available. To this end, we propose to tune
+the hyperparameters of NCD methods by adapting the $k$-fold cross-validation
+process and hiding some of the known classes in each fold. Since we have found
+that methods with too many hyperparameters are likely to overfit these hidden
+classes, we define a simple deep NCD model. This method is composed of only the
+essential elements necessary for the NCD problem and performs impressively well
+under realistic conditions. Furthermore, we find that the latent space of this
+method can be used to reliably estimate the number of novel classes.
+Additionally, we adapt two unsupervised clustering algorithms ($k$-means and
+Spectral Clustering) to leverage the knowledge of the known classes. Extensive
+experiments are conducted on 7 tabular datasets and demonstrate the
+effectiveness of the proposed method and hyperparameter tuning process, and
+show that the NCD problem can be solved without relying on knowledge from the
+novel classes.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>30 pages, including 7 pages of annexes</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Fusion-PSRO: Nash Policy Fusion for Policy Space Response Oracles 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.21027v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.21027v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Jiesong Lian, Yucong Huang, Mingzhi Wang, Chengdong Ma, Yixue Hao, Ying Wen, Yaodong Yang
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  A popular approach for solving zero-sum games is to maintain populations of
+policies to approximate the Nash Equilibrium (NE). Previous studies have shown
+that Policy Space Response Oracle (PSRO) algorithm is an effective multi-agent
+reinforcement learning framework for solving such games. However, repeatedly
+training new policies from scratch to approximate Best Response (BR) to
+opponents' mixed policies at each iteration is both inefficient and costly.
+While some PSRO variants initialize a new policy by inheriting from past BR
+policies, this approach limits the exploration of new policies, especially
+against challenging opponents. To address this issue, we propose Fusion-PSRO,
+which employs policy fusion to initialize policies for better approximation to
+BR. By selecting high-quality base policies from meta-NE, policy fusion fuses
+the base policies into a new policy through model averaging. This approach
+allows the initialized policies to incorporate multiple expert policies, making
+it easier to handle difficult opponents compared to inheriting from past BR
+policies or initializing from scratch. Moreover, our method only modifies the
+policy initialization phase, allowing its application to nearly all PSRO
+variants without additional training overhead. Our experiments on
+non-transitive matrix games, Leduc Poker, and the more complex Liars Dice
+demonstrate that Fusion-PSRO enhances the performance of nearly all PSRO
+variants, achieving lower exploitability.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>20 pages, 5 figures</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Bayesian Exploration Networks <span class="chip">ICML 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2308.13049v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2308.13049v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Mattie Fellows, Brandon Kaplowitz, Christian Schroeder de Witt, Shimon Whiteson
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Bayesian reinforcement learning (RL) offers a principled and elegant approach
+for sequential decision making under uncertainty. Most notably, Bayesian agents
+do not face an exploration/exploitation dilemma, a major pathology of
+frequentist methods. However theoretical understanding of model-free approaches
+is lacking. In this paper, we introduce a novel Bayesian model-free formulation
+and the first analysis showing that model-free approaches can yield
+Bayes-optimal policies. We show all existing model-free approaches make
+approximations that yield policies that can be arbitrarily Bayes-suboptimal. As
+a first step towards model-free Bayes optimality, we introduce the Bayesian
+exploration network (BEN) which uses normalising flows to model both the
+aleatoric uncertainty (via density estimation) and epistemic uncertainty (via
+variational inference) in the Bellman operator. In the limit of complete
+optimisation, BEN learns true Bayes-optimal policies, but like in variational
+expectation-maximisation, partial optimisation renders our approach tractable.
+Empirical results demonstrate that BEN can learn true Bayes-optimal policies in
+tasks where existing model-free approaches fail.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>ICML 2024 Version Update</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ On the Expressivity of Persistent Homology in Graph Learning 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2302.09826v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2302.09826v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Rubén Ballester, Bastian Rieck
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Persistent homology, a technique from computational topology, has recently
+shown strong empirical performance in the context of graph classification.
+Being able to capture long range graph properties via higher-order topological
+features, such as cycles of arbitrary length, in combination with multi-scale
+topological descriptors, has improved predictive performance for data sets with
+prominent topological structures, such as molecules. At the same time, the
+theoretical properties of persistent homology have not been formally assessed
+in this context. This paper intends to bridge the gap between computational
+topology and graph machine learning by providing a brief introduction to
+persistent homology in the context of graphs, as well as a theoretical
+discussion and empirical analysis of its expressivity for graph learning tasks.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Towards Efficient Replay in Federated Incremental Learning 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2403.05890v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2403.05890v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Yichen Li, Qunwei Li, Haozhao Wang, Ruixuan Li, Wenliang Zhong, Guannan Zhang
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  In Federated Learning (FL), the data in each client is typically assumed
+fixed or static. However, data often comes in an incremental manner in
+real-world applications, where the data domain may increase dynamically. In
+this work, we study catastrophic forgetting with data heterogeneity in
+Federated Incremental Learning (FIL) scenarios where edge clients may lack
+enough storage space to retain full data. We propose to employ a simple,
+generic framework for FIL named Re-Fed, which can coordinate each client to
+cache important samples for replay. More specifically, when a new task arrives,
+each client first caches selected previous samples based on their global and
+local importance. Then, the client trains the local model with both the cached
+samples and the samples from the new task. Theoretically, we analyze the
+ability of Re-Fed to discover important samples for replay thus alleviating the
+catastrophic forgetting problem. Moreover, we empirically show that Re-Fed
+achieves competitive performance compared to state-of-the-art methods.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Scalable Wasserstein Gradient Flow for Generative Modeling through
+  Unbalanced Optimal Transport 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2402.05443v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2402.05443v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Jaemoo Choi, Jaewoong Choi, Myungjoo Kang
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Wasserstein Gradient Flow (WGF) describes the gradient dynamics of
+probability density within the Wasserstein space. WGF provides a promising
+approach for conducting optimization over the probability distributions.
+Numerically approximating the continuous WGF requires the time discretization
+method. The most well-known method for this is the JKO scheme. In this regard,
+previous WGF models employ the JKO scheme and parametrize transport map for
+each JKO step. However, this approach results in quadratic training complexity
+$O(K^2)$ with the number of JKO step $K$. This severely limits the scalability
+of WGF models. In this paper, we introduce a scalable WGF-based generative
+model, called Semi-dual JKO (S-JKO). Our model is based on the semi-dual form
+of the JKO step, derived from the equivalence between the JKO step and the
+Unbalanced Optimal Transport. Our approach reduces the training complexity to
+$O(K)$. We demonstrate that our model significantly outperforms existing
+WGF-based generative models, achieving FID scores of 2.62 on CIFAR-10 and 5.46
+on CelebA-HQ-256, which are comparable to state-of-the-art image generative
+models.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>22 pages, 11 figures</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ What Is Fairness? On the Role of Protected Attributes and Fictitious
+  Worlds 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2205.09622v5">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2205.09622v5.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Ludwig Bothmann, Kristina Peters, Bernd Bischl
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  A growing body of literature in fairness-aware machine learning (fairML) aims
+to mitigate machine learning (ML)-related unfairness in automated
+decision-making (ADM) by defining metrics that measure fairness of an ML model
+and by proposing methods to ensure that trained ML models achieve low scores on
+these metrics. However, the underlying concept of fairness, i.e., the question
+of what fairness is, is rarely discussed, leaving a significant gap between
+centuries of philosophical discussion and the recent adoption of the concept in
+the ML community. In this work, we try to bridge this gap by formalizing a
+consistent concept of fairness and by translating the philosophical
+considerations into a formal framework for the training and evaluation of ML
+models in ADM systems. We argue that fairness problems can arise even without
+the presence of protected attributes (PAs), and point out that fairness and
+predictive performance are not irreconcilable opposites, but that the latter is
+necessary to achieve the former. Furthermore, we argue why and how causal
+considerations are necessary when assessing fairness in the presence of PAs by
+proposing a fictitious, normatively desired (FiND) world in which PAs have no
+causal effects. In practice, this FiND world must be approximated by a warped
+world in which the causal effects of the PAs are removed from the real-world
+data. Finally, we achieve greater linguistic clarity in the discussion of
+fairML. We outline algorithms for practical applications and present
+illustrative experiments on COMPAS data.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ NUBO: A Transparent Python Package for Bayesian Optimization 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2305.06709v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2305.06709v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Mike Diessner, Kevin J. Wilson, Richard D. Whalley
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  NUBO, short for Newcastle University Bayesian Optimization, is a Bayesian
+optimization framework for optimizing expensive-to-evaluate black-box
+functions, such as physical experiments and computer simulators. Bayesian
+optimization is a cost-efficient optimization strategy that uses surrogate
+modeling via Gaussian processes to represent an objective function and
+acquisition functions to guide the selection of candidate points to approximate
+the global optimum of the objective function. NUBO focuses on transparency and
+user experience to make Bayesian optimization accessible to researchers from
+all disciplines. Clean and understandable code, precise references, and
+thorough documentation ensure transparency, while a modular and flexible
+design, easy-to-write syntax, and careful selection of Bayesian optimization
+algorithms ensure a good user experience. NUBO allows users to tailor Bayesian
+optimization to their problem by writing a custom optimization loop using the
+provided building blocks. It supports sequential single-point, parallel
+multi-point, and asynchronous optimization of bounded, constrained, and mixed
+(discrete and continuous) parameter input spaces. Only algorithms and methods
+extensively tested and validated to perform well are included in NUBO. This
+ensures that the package remains compact and does not overwhelm the user with
+an unnecessarily large number of options. The package is written in Python but
+does not require expert knowledge of Python to optimize simulators and
+experiments. NUBO is distributed as open-source software under the BSD 3-Clause
+license.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted for publication by the Journal of Statistical Software</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Cardinality Estimation over Knowledge Graphs with Embeddings and Graph
+  Neural Networks 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2303.01140v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2303.01140v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Tim Schwabe, Maribel Acosta
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Cardinality Estimation over Knowledge Graphs (KG) is crucial for query
+optimization, yet remains a challenging task due to the semi-structured nature
+and complex correlations of typical Knowledge Graphs. In this work, we propose
+GNCE, a novel approach that leverages knowledge graph embeddings and Graph
+Neural Networks (GNN) to accurately predict the cardinality of conjunctive
+queries. GNCE first creates semantically meaningful embeddings for all entities
+in the KG, which are then integrated into the given query, which is processed
+by a GNN to estimate the cardinality of the query. We evaluate GNCE on several
+KGs in terms of q-Error and demonstrate that it outperforms state-of-the-art
+approaches based on sampling, summaries, and (machine) learning in terms of
+estimation accuracy while also having lower execution time and less parameters.
+Additionally, we show that GNCE can inductively generalise to unseen entities,
+making it suitable for use in dynamic query processing scenarios. Our proposed
+approach has the potential to significantly improve query optimization and
+related applications that rely on accurate cardinality estimates of conjunctive
+queries.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Hierarchical Tree-structured Knowledge Graph For Academic Insight <span class="highlight-title">Survey</span> 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2402.04854v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2402.04854v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Jinghong Li, Huy Phan, Wen Gu, Koichi Ota, Shinobu Hasegawa
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Research surveys have always posed a challenge for beginner researchers who
+lack of research training. These researchers struggle to understand the
+directions within their research topic, and the discovery of new research
+findings within a short time. One way to provide intuitive assistance to
+beginner researchers is by offering relevant knowledge graphs(KG) and
+recommending related academic papers. However, existing navigation knowledge
+graphs primarily rely on keywords in the research field and often fail to
+present the logical hierarchy among multiple related papers clearly. Moreover,
+most recommendation systems for academic papers simply rely on high text
+similarity, which can leave researchers confused as to why a particular article
+is being recommended. They may lack of grasp important information about the
+insight connection between "Issue resolved" and "Issue finding" that they hope
+to obtain. To address these issues, this study aims to support research insight
+surveys for beginner researchers by establishing a hierarchical tree-structured
+knowledge graph that reflects the inheritance insight of research topics and
+the relevance insight among the academic papers.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>This paper will be submitted to 'The 18TH International Conference on
+  INnovations in Intelligent SysTems and Applications (INISTA 2024)'</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ KS-Lottery: Finding Certified Lottery Tickets for Multilingual Language
+  Models 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2402.02801v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2402.02801v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Fei Yuan, Chang Ma, Shuai Yuan, Qiushi Sun, Lei Li
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  The lottery ticket hypothesis posits the existence of ``winning tickets''
+within a randomly initialized neural network. Do winning tickets exist for LLMs
+in fine-tuning scenarios? How can we find such winning tickets? In this paper,
+we propose KS-Lottery, a method to identify a small subset of LLM parameters
+highly effective in multilingual fine-tuning. Our key idea is to use
+Kolmogorov-Smirnov Test to analyze the distribution shift of parameters before
+and after fine-tuning. We further theoretically prove that KS-Lottery can find
+the certified winning tickets in the embedding layer, fine-tuning on the found
+parameters is guaranteed to perform as well as full fine-tuning. Comparing
+KS-Lottery with other parameter-efficient tuning algorithms on translation
+tasks, the experimental results show that KS-Lottery finds a much smaller set
+of parameters for fine-tuning while achieving the comparable performance as
+full fine-tuning LLM. Surprisingly, we find that fine-tuning 18 tokens'
+embedding of LLaMA suffices to reach the fine-tuning translation
+performance~\footnote{https://github.com/CONE-MT/KS-Lottery.}.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ SAMformer: Unlocking the Potential of <span class="highlight-title">Transformer</span>s in Time Series
+  Forecasting with Sharpness-Aware Minimization and Channel-Wise Attention <span class="chip">ICML 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2402.10198v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2402.10198v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Romain Ilbert, Ambroise Odonnat, Vasilii Feofanov, Aladin Virmaux, Giuseppe Paolo, Themis Palpanas, Ievgen Redko
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Transformer-based architectures achieved breakthrough performance in natural
+language processing and computer vision, yet they remain inferior to simpler
+linear baselines in multivariate long-term forecasting. To better understand
+this phenomenon, we start by studying a toy linear forecasting problem for
+which we show that transformers are incapable of converging to their true
+solution despite their high expressive power. We further identify the attention
+of transformers as being responsible for this low generalization capacity.
+Building upon this insight, we propose a shallow lightweight transformer model
+that successfully escapes bad local minima when optimized with sharpness-aware
+optimization. We empirically demonstrate that this result extends to all
+commonly used real-world multivariate time series datasets. In particular,
+SAMformer surpasses current state-of-the-art methods and is on par with the
+biggest foundation model MOIRAI while having significantly fewer parameters.
+The code is available at https://github.com/romilbert/samformer.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted as an Oral at ICML 2024, Vienna. The first two authors
+  contributed equally</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ SparseTSF: Modeling Long-term Time Series Forecasting with 1k Parameters 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.00946v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.00946v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Shengsheng Lin, Weiwei Lin, Wentai Wu, Haojun Chen, Junjie Yang
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  This paper introduces SparseTSF, a novel, extremely lightweight model for
+Long-term Time Series Forecasting (LTSF), designed to address the challenges of
+modeling complex temporal dependencies over extended horizons with minimal
+computational resources. At the heart of SparseTSF lies the Cross-Period Sparse
+Forecasting technique, which simplifies the forecasting task by decoupling the
+periodicity and trend in time series data. This technique involves downsampling
+the original sequences to focus on cross-period trend prediction, effectively
+extracting periodic features while minimizing the model's complexity and
+parameter count. Based on this technique, the SparseTSF model uses fewer than
+*1k* parameters to achieve competitive or superior performance compared to
+state-of-the-art models. Furthermore, SparseTSF showcases remarkable
+generalization capabilities, making it well-suited for scenarios with limited
+computational resources, small samples, or low-quality data. The code is
+publicly available at this repository: https://github.com/lss-1138/SparseTSF.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Spurious Feature Eraser: Stabilizing Test-Time Adaptation for
+  Vision-Language Foundation Model 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2403.00376v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2403.00376v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Huan Ma, Yan Zhu, Changqing Zhang, Peilin Zhao, Baoyuan Wu, Long-Kai Huang, Qinghua Hu, Bingzhe Wu
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Vision-language foundation models have exhibited remarkable success across a
+multitude of downstream tasks due to their scalability on extensive image-text
+paired data. However, these models also display significant limitations when
+applied to downstream tasks, such as fine-grained image classification, as a
+result of ``decision shortcuts'' that hinder their generalization capabilities.
+In this work, we find that the CLIP model possesses a rich set of features,
+encompassing both \textit{desired invariant causal features} and
+\textit{undesired decision shortcuts}. Moreover, the underperformance of CLIP
+on downstream tasks originates from its inability to effectively utilize
+pre-trained features in accordance with specific task requirements. To address
+this challenge, we propose a simple yet effective method, Spurious Feature
+Eraser (SEraser), to alleviate the decision shortcuts by erasing the spurious
+features. Specifically, we introduce a test-time prompt tuning paradigm that
+optimizes a learnable prompt, thereby compelling the model to exploit invariant
+features while disregarding decision shortcuts during the inference phase. The
+proposed method effectively alleviates excessive dependence on potentially
+misleading spurious information. We conduct comparative analysis of the
+proposed method against various approaches which validates the significant
+superiority.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Accented Text-to-Speech Synthesis with a Conditional Variational
+  Autoencoder 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2211.03316v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2211.03316v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Jan Melechovsky, Ambuj Mehrish, Berrak Sisman, Dorien Herremans
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Accent plays a significant role in speech communication, influencing one's
+capability to understand as well as conveying a person's identity. This paper
+introduces a novel and efficient framework for accented Text-to-Speech (TTS)
+synthesis based on a Conditional Variational Autoencoder. It has the ability to
+synthesize a selected speaker's voice, which is converted to any desired target
+accent. Our thorough experiments validate the effectiveness of the proposed
+framework using both objective and subjective evaluations. The results also
+show remarkable performance in terms of the ability to manipulate accents in
+the synthesized speech and provide a promising avenue for future accented TTS
+research.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>preprint submitted to a conference, under review</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ RoSA: Accurate Parameter-Efficient Fine-Tuning via Robust Adaptation 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2401.04679v7">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2401.04679v7.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Mahdi Nikdan, Soroush Tabesh, Elvir Crnčević, Dan Alistarh
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  We investigate parameter-efficient fine-tuning (PEFT) methods that can
+provide good accuracy under limited computational and memory budgets in the
+context of large language models (LLMs). We present a new PEFT method called
+Robust Adaptation (RoSA) inspired by robust principal component analysis that
+jointly trains $\textit{low-rank}$ and $\textit{highly-sparse}$ components on
+top of a set of fixed pretrained weights to efficiently approximate the
+performance of a full-fine-tuning (FFT) solution. Across a series of
+challenging generative tasks such as grade-school math and SQL query
+generation, which require fine-tuning for good performance, we show that RoSA
+outperforms LoRA, pure sparse fine-tuning, and alternative hybrid methods at
+the same parameter budget, and can even recover the performance of FFT on some
+tasks. We provide system support for RoSA to complement the training algorithm,
+specifically in the form of sparse GPU kernels which enable memory- and
+computationally-efficient training, and show that it is also compatible with
+low-precision base weights, resulting in the first joint representation
+combining quantization, low-rank and sparse approximations. Our code is
+available at https://github.com/IST-DASLab/RoSA.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ On <span class="highlight-title">Prompt</span>-Driven Safeguarding for Large Language Models <span class="chip">ICML 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2401.18018v4">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2401.18018v4.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Chujie Zheng, Fan Yin, Hao Zhou, Fandong Meng, Jie Zhou, Kai-Wei Chang, Minlie Huang, Nanyun Peng
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Prepending model inputs with safety prompts is a common practice for
+safeguarding large language models (LLMs) against queries with harmful intents.
+However, the underlying working mechanisms of safety prompts have not been
+unraveled yet, restricting the possibility of automatically optimizing them to
+improve LLM safety. In this work, we investigate how LLMs' behavior (i.e.,
+complying with or refusing user queries) is affected by safety prompts from the
+perspective of model representation. We find that in the representation space,
+the input queries are typically moved by safety prompts in a "higher-refusal"
+direction, in which models become more prone to refusing to provide assistance,
+even when the queries are harmless. On the other hand, LLMs are naturally
+capable of distinguishing harmful and harmless queries without safety prompts.
+Inspired by these findings, we propose a method for safety prompt optimization,
+namely DRO (Directed Representation Optimization). Treating a safety prompt as
+continuous, trainable embeddings, DRO learns to move the queries'
+representations along or opposite the refusal direction, depending on their
+harmfulness. Experiments with eight LLMs on out-of-domain and jailbreak
+benchmarks demonstrate that DRO remarkably improves the safeguarding
+performance of human-crafted safety prompts, without compromising the models'
+general performance.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>ICML 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Rewriting the Code: A Simple Method for Large Language Model Augmented
+  Code Search <span class="chip">ACL 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2401.04514v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2401.04514v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Haochen Li, Xin Zhou, Zhiqi Shen
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  In code search, the Generation-Augmented Retrieval (GAR) framework, which
+generates exemplar code snippets to augment queries, has emerged as a promising
+strategy to address the principal challenge of modality misalignment between
+code snippets and natural language queries, particularly with the demonstrated
+code generation capabilities of Large Language Models (LLMs). Nevertheless, our
+preliminary investigations indicate that the improvements conferred by such an
+LLM-augmented framework are somewhat constrained. This limitation could
+potentially be ascribed to the fact that the generated codes, albeit
+functionally accurate, frequently display a pronounced stylistic deviation from
+the ground truth code in the codebase. In this paper, we extend the
+foundational GAR framework and propose a simple yet effective method that
+additionally Rewrites the Code (ReCo) within the codebase for style
+normalization. Experimental results demonstrate that ReCo significantly boosts
+retrieval accuracy across sparse (up to 35.7%), zero-shot dense (up to 27.6%),
+and fine-tuned dense (up to 23.6%) retrieval settings in diverse search
+scenarios. To further elucidate the advantages of ReCo and stimulate research
+in code style normalization, we introduce Code Style Similarity, the first
+metric tailored to quantify stylistic similarities in code. Notably, our
+empirical findings reveal the inadequacy of existing metrics in capturing
+stylistic nuances. The source code and data are available at
+\url{https://github.com/Alex-HaochenLi/ReCo}.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted to ACL 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Sample Efficient Reinforcement Learning with Partial Dynamics Knowledge <span class="chip">AAAI</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2312.12558v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2312.12558v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Meshal Alharbi, Mardavij Roozbehani, Munther Dahleh
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  The problem of sample complexity of online reinforcement learning is often
+studied in the literature without taking into account any partial knowledge
+about the system dynamics that could potentially accelerate the learning
+process. In this paper, we study the sample complexity of online Q-learning
+methods when some prior knowledge about the dynamics is available or can be
+learned efficiently. We focus on systems that evolve according to an additive
+disturbance model of the form $S_{h+1} = f(S_h, A_h) + W_h$, where $f$
+represents the underlying system dynamics, and $W_h$ are unknown disturbances
+independent of states and actions. In the setting of finite episodic Markov
+decision processes with $S$ states, $A$ actions, and episode length $H$, we
+present an optimistic Q-learning algorithm that achieves
+$\tilde{\mathcal{O}}(\text{Poly}(H)\sqrt{T})$ regret under perfect knowledge of
+$f$, where $T$ is the total number of interactions with the system. This is in
+contrast to the typical $\tilde{\mathcal{O}}(\text{Poly}(H)\sqrt{SAT})$ regret
+for existing Q-learning methods. Further, if only a noisy estimate $\hat{f}$ of
+$f$ is available, our method can learn an approximately optimal policy in a
+number of samples that is independent of the cardinalities of state and action
+spaces. The sub-optimality gap depends on the approximation error $\hat{f}-f$,
+as well as the Lipschitz constant of the corresponding optimal value function.
+Our approach does not require modeling of the transition probabilities and
+enjoys the same memory complexity as model-free methods.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Published in the 38th Annual AAAI Conference on Artificial
+  Intelligence</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Decomposable Submodular Maximization in Federated Setting 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2402.00138v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2402.00138v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Akbar Rafiey
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Submodular functions, as well as the sub-class of decomposable submodular
+functions, and their optimization appear in a wide range of applications in
+machine learning, recommendation systems, and welfare maximization. However,
+optimization of decomposable submodular functions with millions of component
+functions is computationally prohibitive. Furthermore, the component functions
+may be private (they might represent user preference function, for example) and
+cannot be widely shared. To address these issues, we propose a {\em federated
+optimization} setting for decomposable submodular optimization. In this
+setting, clients have their own preference functions, and a weighted sum of
+these preferences needs to be maximized. We implement the popular {\em
+continuous greedy} algorithm in this setting where clients take parallel small
+local steps towards the local solution and then the local changes are
+aggregated at a central server. To address the large number of clients, the
+aggregation is performed only on a subsampled set. Further, the aggregation is
+performed only intermittently between stretches of parallel local steps, which
+reduces communication cost significantly. We show that our federated algorithm
+is guaranteed to provide a good approximate solution, even in the presence of
+above cost-cutting measures. Finally, we show how the federated setting can be
+incorporated in solving fundamental discrete submodular optimization problems
+such as Maximum Coverage and Facility Location.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ KIEval: A Knowledge-grounded Interactive Evaluation Framework for Large
+  Language Models <span class="chip">ACL 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2402.15043v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2402.15043v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Zhuohao Yu, Chang Gao, Wenjin Yao, Yidong Wang, Wei Ye, Jindong Wang, Xing Xie, Yue Zhang, Shikun Zhang
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Automatic evaluation methods for large language models (LLMs) are hindered by
+data contamination, leading to inflated assessments of their effectiveness.
+Existing strategies, which aim to detect contaminated texts, focus on
+quantifying contamination status instead of accurately gauging model
+performance. In this paper, we introduce KIEval, a Knowledge-grounded
+Interactive Evaluation framework, which incorporates an LLM-powered
+"interactor" role for the first time to accomplish a dynamic
+contamination-resilient evaluation. Starting with a question in a conventional
+LLM benchmark involving domain-specific knowledge, KIEval utilizes dynamically
+generated, multi-round, and knowledge-focused dialogues to determine whether a
+model's response is merely a recall of benchmark answers or demonstrates a deep
+comprehension to apply knowledge in more complex conversations. Extensive
+experiments on seven leading LLMs across five datasets validate KIEval's
+effectiveness and generalization. We also reveal that data contamination brings
+no contribution or even negative effect to models' real-world applicability and
+understanding, and existing contamination detection methods for LLMs can only
+identify contamination in pre-training but not during supervised fine-tuning.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted to ACL 2024 (main conference); 19 pages, 5 figures, 19
+  tables, code is available at: https://github.com/zhuohaoyu/KIEval</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Model Editing by Standard Fine-Tuning <span class="chip">ACL 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2402.11078v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2402.11078v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Govind Gangadhar, Karl Stratos
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Standard fine-tuning is considered not as effective as specialized methods
+for model editing due to its comparatively poor performance. However, it is
+simple, agnostic to the architectural details of the model being edited, and
+able to leverage advances in standard training techniques with no additional
+work (e.g., black-box PEFT for computational efficiency), making it an
+appealing choice for a model editor. In this work, we show that standard
+fine-tuning alone can yield competitive model editing performance with two
+minor modifications. First, we optimize the conditional likelihood rather than
+the full likelihood. Second, in addition to the typical practice of training on
+randomly paraphrased edit prompts to encourage generalization, we also train on
+random or similar unedited facts to encourage locality. Our experiments on the
+ZsRE and CounterFact datasets demonstrate that these simple modifications allow
+standard fine-tuning to match or outperform highly specialized editors in terms
+of edit score.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Findings of ACL 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Data Contamination Calibration for Black-box LLMs 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.11930v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.11930v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Wentao Ye, Jiaqi Hu, Liyao Li, Haobo Wang, Gang Chen, Junbo Zhao
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  The rapid advancements of Large Language Models (LLMs) tightly associate with
+the expansion of the training data size. However, the unchecked
+ultra-large-scale training sets introduce a series of potential risks like data
+contamination, i.e. the benchmark data is used for training. In this work, we
+propose a holistic method named Polarized Augment Calibration (PAC) along with
+a new to-be-released dataset to detect the contaminated data and diminish the
+contamination effect. PAC extends the popular MIA (Membership Inference Attack)
+-- from machine learning community -- by forming a more global target at
+detecting training data to Clarify invisible training data. As a pioneering
+work, PAC is very much plug-and-play that can be integrated with most (if not
+all) current white- and black-box LLMs. By extensive experiments, PAC
+outperforms existing methods by at least 4.5%, towards data contamination
+detection on more 4 dataset formats, with more than 10 base LLMs. Besides, our
+application in real-world scenarios highlights the prominent presence of
+contamination and related issues.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Improving out-of-distribution generalization in graphs via hierarchical
+  semantic environments <span class="chip">CVPR 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2403.01773v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2403.01773v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Yinhua Piao, Sangseon Lee, Yijingxiu Lu, Sun Kim
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Out-of-distribution (OOD) generalization in the graph domain is challenging
+due to complex distribution shifts and a lack of environmental contexts. Recent
+methods attempt to enhance graph OOD generalization by generating flat
+environments. However, such flat environments come with inherent limitations to
+capture more complex data distributions. Considering the DrugOOD dataset, which
+contains diverse training environments (e.g., scaffold, size, etc.), flat
+contexts cannot sufficiently address its high heterogeneity. Thus, a new
+challenge is posed to generate more semantically enriched environments to
+enhance graph invariant learning for handling distribution shifts. In this
+paper, we propose a novel approach to generate hierarchical semantic
+environments for each graph. Firstly, given an input graph, we explicitly
+extract variant subgraphs from the input graph to generate proxy predictions on
+local environments. Then, stochastic attention mechanisms are employed to
+re-extract the subgraphs for regenerating global environments in a hierarchical
+manner. In addition, we introduce a new learning objective that guides our
+model to learn the diversity of environments within the same hierarchy while
+maintaining consistency across different hierarchies. This approach enables our
+model to consider the relationships between environments and facilitates robust
+graph invariant learning. Extensive experiments on real-world graph data have
+demonstrated the effectiveness of our framework. Particularly, in the
+challenging dataset DrugOOD, our method achieves up to 1.29% and 2.83%
+improvement over the best baselines on IC50 and EC50 prediction tasks,
+respectively.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>CVPR 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Do <span class="highlight-title">pretrain</span>ed <span class="highlight-title">Transformer</span>s Learn In-Context by Gradient Descent? 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2310.08540v5">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2310.08540v5.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Lingfeng Shen, Aayush Mishra, Daniel Khashabi
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  The emergence of In-Context Learning (ICL) in LLMs remains a remarkable
+phenomenon that is partially understood. To explain ICL, recent studies have
+created theoretical connections to Gradient Descent (GD). We ask, do such
+connections hold up in actual pre-trained language models? We highlight the
+limiting assumptions in prior works that make their setup considerably
+different from the practical setup in which language models are trained. For
+example, their experimental verification uses \emph{ICL objective} (training
+models explicitly for ICL), which differs from the emergent ICL in the wild.
+Furthermore, the theoretical hand-constructed weights used in these studies
+have properties that don't match those of real LLMs. We also look for evidence
+in real models. We observe that ICL and GD have different sensitivity to the
+order in which they observe demonstrations. Finally, we probe and compare the
+ICL vs. GD hypothesis in a natural setting. We conduct comprehensive empirical
+analyses on language models pre-trained on natural data (LLaMa-7B). Our
+comparisons of three performance metrics highlight the inconsistent behavior of
+ICL and GD as a function of various factors such as datasets, models, and the
+number of demonstrations. We observe that ICL and GD modify the output
+distribution of language models differently. These results indicate that
+\emph{the equivalence between ICL and GD remains an open hypothesis} and calls
+for further studies.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Characteristic Guidance: Non-linear Correction for Diffusion Model at
+  Large Guidance Scale 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2312.07586v5">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2312.07586v5.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Candi Zheng, Yuan Lan
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Popular guidance for denoising diffusion probabilistic model (DDPM) linearly
+combines distinct conditional models together to provide enhanced control over
+samples. However, this approach overlooks nonlinear effects that become
+significant when guidance scale is large. To address this issue, we propose
+characteristic guidance, a guidance method that provides first-principle
+non-linear correction for classifier-free guidance. Such correction forces the
+guided DDPMs to respect the Fokker-Planck (FP) equation of diffusion process,
+in a way that is training-free and compatible with existing sampling methods.
+Experiments show that characteristic guidance enhances semantic characteristics
+of prompts and mitigate irregularities in image generation, proving effective
+in diverse applications ranging from simulating magnet phase transitions to
+latent space sampling.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>8 pages, 7 figures</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ MC-GTA: Metric-Constrained Model-Based Clustering using Goodness-of-fit
+  Tests with Autocorrelations <span class="chip">ICML-2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.18395v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.18395v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Zhangyu Wang, Gengchen Mai, Krzysztof Janowicz, Ni Lao
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  A wide range of (multivariate) temporal (1D) and spatial (2D) data analysis
+tasks, such as grouping vehicle sensor trajectories, can be formulated as
+clustering with given metric constraints. Existing metric-constrained
+clustering algorithms overlook the rich correlation between feature similarity
+and metric distance, i.e., metric autocorrelation. The model-based variations
+of these clustering algorithms (e.g. TICC and STICC) achieve SOTA performance,
+yet suffer from computational instability and complexity by using a
+metric-constrained Expectation-Maximization procedure. In order to address
+these two problems, we propose a novel clustering algorithm, MC-GTA
+(Model-based Clustering via Goodness-of-fit Tests with Autocorrelations). Its
+objective is only composed of pairwise weighted sums of feature similarity
+terms (square Wasserstein-2 distance) and metric autocorrelation terms (a novel
+multivariate generalization of classic semivariogram). We show that MC-GTA is
+effectively minimizing the total hinge loss for intra-cluster observation pairs
+not passing goodness-of-fit tests, i.e., statistically not originating from the
+same distribution. Experiments on 1D/2D synthetic and real-world datasets
+demonstrate that MC-GTA successfully incorporates metric autocorrelation. It
+outperforms strong baselines by large margins (up to 14.3% in ARI and 32.1% in
+NMI) with faster and stabler optimization (>10x speedup).
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>ICML-2024 Proceedings</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Continual Learning: Forget-free Winning Subnetworks for Video
+  Representations 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2312.11973v4">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2312.11973v4.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Haeyong Kang, Jaehong Yoon, Sung Ju Hwang, Chang D. Yoo
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Inspired by the Lottery Ticket Hypothesis (LTH), which highlights the
+existence of efficient subnetworks within larger, dense networks, a
+high-performing Winning Subnetwork (WSN) in terms of task performance under
+appropriate sparsity conditions is considered for various continual learning
+tasks. It leverages pre-existing weights from dense networks to achieve
+efficient learning in Task Incremental Learning (TIL) and Task-agnostic
+Incremental Learning (TaIL) scenarios. In Few-Shot Class Incremental Learning
+(FSCIL), a variation of WSN referred to as the Soft subnetwork (SoftNet) is
+designed to prevent overfitting when the data samples are scarce. Furthermore,
+the sparse reuse of WSN weights is considered for Video Incremental Learning
+(VIL). The use of Fourier Subneural Operator (FSO) within WSN is considered. It
+enables compact encoding of videos and identifies reusable subnetworks across
+varying bandwidths. We have integrated FSO into different architectural
+frameworks for continual learning, including VIL, TIL, and FSCIL. Our
+comprehensive experiments demonstrate FSO's effectiveness, significantly
+improving task performance at various convolutional representational levels.
+Specifically, FSO enhances higher-layer performance in TIL and FSCIL and
+lower-layer performance in VIL.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>arXiv admin note: substantial text overlap with arXiv:2303.14962,
+  arXiv:2306.11305</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ LLark: A Multimodal Instruction-Following Language Model for Music <span class="chip">ICML</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2310.07160v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2310.07160v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Josh Gardner, Simon Durand, Daniel Stoller, Rachel M. Bittner
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Music has a unique and complex structure which is challenging for both expert
+humans and existing AI systems to understand, and presents unique challenges
+relative to other forms of audio. We present LLark, an instruction-tuned
+multimodal model for \emph{music} understanding. We detail our process for
+dataset creation, which involves augmenting the annotations of diverse
+open-source music datasets and converting them to a unified instruction-tuning
+format. We propose a multimodal architecture for LLark, integrating a
+pretrained generative model for music with a pretrained language model. In
+evaluations on three types of tasks (music understanding, captioning,
+reasoning), we show that LLark matches or outperforms existing baselines in
+music understanding, and that humans show a high degree of agreement with its
+responses in captioning and reasoning tasks. LLark is trained entirely from
+open-source music data and models, and we make our training code available
+along with the release of this paper. Additional results and audio examples are
+at https://bit.ly/llark, and our source code is available at
+https://github.com/spotify-research/llark .
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>ICML camera-ready version</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Conformalized Survival Distributions: A Generic Post-Process to Increase
+  Calibration <span class="chip">ICML 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.07374v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.07374v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Shi-ang Qi, Yakun Yu, Russell Greiner
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Discrimination and calibration represent two important properties of survival
+analysis, with the former assessing the model's ability to accurately rank
+subjects and the latter evaluating the alignment of predicted outcomes with
+actual events. With their distinct nature, it is hard for survival models to
+simultaneously optimize both of them especially as many previous results found
+improving calibration tends to diminish discrimination performance. This paper
+introduces a novel approach utilizing conformal regression that can improve a
+model's calibration without degrading discrimination. We provide theoretical
+guarantees for the above claim, and rigorously validate the efficiency of our
+approach across 11 real-world datasets, showcasing its practical applicability
+and robustness in diverse scenarios.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted to ICML 2024; 37 pages, 19 figures</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ The Wolf Within: Covert Injection of Malice into MLLM Societies via an
+  MLLM Operative <span class="chip">CVPR 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2402.14859v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2402.14859v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Zhen Tan, Chengshuai Zhao, Raha Moraffah, Yifan Li, Yu Kong, Tianlong Chen, Huan Liu
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Due to their unprecedented ability to process and respond to various types of
+data, Multimodal Large Language Models (MLLMs) are constantly defining the new
+boundary of Artificial General Intelligence (AGI). As these advanced generative
+models increasingly form collaborative networks for complex tasks, the
+integrity and security of these systems are crucial. Our paper, ``The Wolf
+Within'', explores a novel vulnerability in MLLM societies - the indirect
+propagation of malicious content. Unlike direct harmful output generation for
+MLLMs, our research demonstrates how a single MLLM agent can be subtly
+influenced to generate prompts that, in turn, induce other MLLM agents in the
+society to output malicious content. Our findings reveal that, an MLLM agent,
+when manipulated to produce specific prompts or instructions, can effectively
+``infect'' other agents within a society of MLLMs. This infection leads to the
+generation and circulation of harmful outputs, such as dangerous instructions
+or misinformation, across the society. We also show the transferability of
+these indirectly generated prompts, highlighting their possibility in
+propagating malice through inter-agent communication. This research provides a
+critical insight into a new dimension of threat posed by MLLMs, where a single
+agent can act as a catalyst for widespread malevolent influence. Our work
+underscores the urgent need for developing robust mechanisms to detect and
+mitigate such covert manipulations within MLLM societies, ensuring their safe
+and ethical utilization in societal applications.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted to workshop on ReGenAI@CVPR 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Exploring and Exploiting the Asymmetric Valley of Deep Neural Networks 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.12489v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.12489v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Xin-Chun Li, Jin-Lin Tang, Bo Zhang, Lan Li, De-Chuan Zhan
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Exploring the loss landscape offers insights into the inherent principles of
+deep neural networks (DNNs). Recent work suggests an additional asymmetry of
+the valley beyond the flat and sharp ones, yet without thoroughly examining its
+causes or implications. Our study methodically explores the factors affecting
+the symmetry of DNN valleys, encompassing (1) the dataset, network
+architecture, initialization, and hyperparameters that influence the
+convergence point; and (2) the magnitude and direction of the noise for 1D
+visualization. Our major observation shows that the {\it degree of sign
+consistency} between the noise and the convergence point is a critical
+indicator of valley symmetry. Theoretical insights from the aspects of ReLU
+activation and softmax function could explain the interesting phenomenon. Our
+discovery propels novel understanding and applications in the scenario of Model
+Fusion: (1) the efficacy of interpolating separate models significantly
+correlates with their sign consistency ratio, and (2) imposing sign alignment
+during federated learning emerges as an innovative approach for model parameter
+alignment.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Tackling the Unlimited Staleness in Federated Learning with Intertwined
+  Data and Device Heterogeneities 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2309.13536v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2309.13536v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Haoming Wang, Wei Gao
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  The efficiency of Federated Learning (FL) is often affected by both data and
+device heterogeneities. Data heterogeneity is defined as the heterogeneity of
+data distributions on different clients. Device heterogeneity is defined as the
+clients' variant latencies in uploading their local model updates due to
+heterogeneous conditions of local hardware resources, and causes the problem of
+staleness when being addressed by asynchronous FL. Traditional schemes of
+tackling the impact of staleness consider data and device heterogeneities as
+two separate and independent aspects in FL, but this assumption is unrealistic
+in many practical FL scenarios where data and device heterogeneities are
+intertwined. In these cases, traditional schemes of weighted aggregation in FL
+have been proved to be ineffective, and a better approach is to convert a stale
+model update into a non-stale one. In this paper, we present a new FL framework
+that leverages the gradient inversion technique for such conversion, hence
+efficiently tackling unlimited staleness in clients' model updates. Our basic
+idea is to use gradient inversion to get estimations of clients' local training
+data from their uploaded stale model updates, and use these estimations to
+compute non-stale client model updates. In this way, we address the problem of
+possible data quality drop when using gradient inversion, while still
+preserving the clients' local data privacy. We compared our approach with the
+existing FL strategies on mainstream datasets and models, and experiment
+results demonstrate that when tackling unlimited staleness, our approach can
+significantly improve the trained model accuracy by up to 20% and speed up the
+FL training progress by up to 35%.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>15 pages</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Stochastic Optimal Control for Diffusion Bridges in Function Spaces 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20630v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20630v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Byoungwoo Park, Jungwon Choi, Sungbin Lim, Juho Lee
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Recent advancements in diffusion models and diffusion bridges primarily focus
+on finite-dimensional spaces, yet many real-world problems necessitate
+operations in infinite-dimensional function spaces for more natural and
+interpretable formulations. In this paper, we present a theory of stochastic
+optimal control (SOC) tailored to infinite-dimensional spaces, aiming to extend
+diffusion-based algorithms to function spaces. Specifically, we demonstrate how
+Doob's $h$-transform, the fundamental tool for constructing diffusion bridges,
+can be derived from the SOC perspective and expanded to infinite dimensions.
+This expansion presents a challenge, as infinite-dimensional spaces typically
+lack closed-form densities. Leveraging our theory, we establish that solving
+the optimal control problem with a specific objective function choice is
+equivalent to learning diffusion-based generative models. We propose two
+applications: (1) learning bridges between two infinite-dimensional
+distributions and (2) generative models for sampling from an
+infinite-dimensional distribution. Our approach proves effective for diverse
+problems involving continuous function space representations, such as
+resolution-free images, time-series data, and probability density functions.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ On the Inherent Privacy Properties of Discrete Denoising Diffusion
+  Models 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2310.15524v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2310.15524v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Rongzhe Wei, Eleonora Kreačić, Haoyu Wang, Haoteng Yin, Eli Chien, Vamsi K. Potluru, Pan Li
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Privacy concerns have led to a surge in the creation of synthetic datasets,
+with diffusion models emerging as a promising avenue. Although prior studies
+have performed empirical evaluations on these models, there has been a gap in
+providing a mathematical characterization of their privacy-preserving
+capabilities. To address this, we present the pioneering theoretical
+exploration of the privacy preservation inherent in discrete diffusion models
+(DDMs) for discrete dataset generation. Focusing on per-instance differential
+privacy (pDP), our framework elucidates the potential privacy leakage for each
+data point in a given training dataset, offering insights into how the privacy
+loss of each point correlates with the dataset's distribution. Our bounds also
+show that training with $s$-sized data points leads to a surge in privacy
+leakage from $(\epsilon, O(\frac{1}{s^2\epsilon}))$-pDP to $(\epsilon,
+O(\frac{1}{s\epsilon}))$-pDP of the DDM during the transition from the pure
+noise to the synthetic clean data phase, and a faster decay in diffusion
+coefficients amplifies the privacy guarantee. Finally, we empirically verify
+our theoretical findings on both synthetic and real-world datasets.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>58 pages</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Proof-of-Learning with Incentive Security 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2404.09005v4">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2404.09005v4.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Zishuo Zhao, Zhixuan Fang, Xuechao Wang, Xi Chen, Yuan Zhou
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Most concurrent blockchain systems rely heavily on the Proof-of-Work (PoW) or
+Proof-of-Stake (PoS) mechanisms for decentralized consensus and security
+assurance. However, the substantial energy expenditure stemming from
+computationally intensive yet meaningless tasks has raised considerable
+concerns surrounding traditional PoW approaches, The PoS mechanism, while free
+of energy consumption, is subject to security and economic issues. Addressing
+these issues, the paradigm of Proof-of-Useful-Work (PoUW) seeks to employ
+challenges of practical significance as PoW, thereby imbuing energy consumption
+with tangible value. While previous efforts in Proof of Learning (PoL) explored
+the utilization of deep learning model training SGD tasks as PoUW challenges,
+recent research has revealed its vulnerabilities to adversarial attacks and the
+theoretical hardness in crafting a byzantine-secure PoL mechanism. In this
+paper, we introduce the concept of incentive-security that incentivizes
+rational provers to behave honestly for their best interest, bypassing the
+existing hardness to design a PoL mechanism with computational efficiency, a
+provable incentive-security guarantee and controllable difficulty.
+Particularly, our work is secure against two attacks to the recent work of Jia
+et al. [2021], and also improves the computational overhead from $\Theta(1)$ to
+$O(\frac{\log E}{E})$. Furthermore, while most recent research assumes trusted
+problem providers and verifiers, our design also guarantees frontend
+incentive-security even when problem providers are untrusted, and verifier
+incentive-security that bypasses the Verifier's Dilemma. By incorporating ML
+training into blockchain consensus mechanisms with provable guarantees, our
+research not only proposes an eco-friendly solution to blockchain systems, but
+also provides a proposal for a completely decentralized computing power market
+in the new AI age.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>16 pages</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ A New Robust Partial $p$-Wasserstein-Based Metric for Comparing
+  Distributions 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.03664v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.03664v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Sharath Raghvendra, Pouyan Shirzadian, Kaiyi Zhang
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  The $2$-Wasserstein distance is sensitive to minor geometric differences
+between distributions, making it a very powerful dissimilarity metric. However,
+due to this sensitivity, a small outlier mass can also cause a significant
+increase in the $2$-Wasserstein distance between two similar distributions.
+Similarly, sampling discrepancy can cause the empirical $2$-Wasserstein
+distance on $n$ samples in $\mathbb{R}^2$ to converge to the true distance at a
+rate of $n^{-1/4}$, which is significantly slower than the rate of $n^{-1/2}$
+for $1$-Wasserstein distance. We introduce a new family of distances
+parameterized by $k \ge 0$, called $k$-RPW that is based on computing the
+partial $2$-Wasserstein distance. We show that (1) $k$-RPW satisfies the metric
+properties, (2) $k$-RPW is robust to small outlier mass while retaining the
+sensitivity of $2$-Wasserstein distance to minor geometric differences, and (3)
+when $k$ is a constant, $k$-RPW distance between empirical distributions on $n$
+samples in $\mathbb{R}^2$ converges to the true distance at a rate of
+$n^{-1/3}$, which is faster than the convergence rate of $n^{-1/4}$ for the
+$2$-Wasserstein distance. Using the partial $p$-Wasserstein distance, we extend
+our distance to any $p \in [1,\infty]$. By setting parameters $k$ or $p$
+appropriately, we can reduce our distance to the total variation,
+$p$-Wasserstein, and the L\'evy-Prokhorov distances. Experiments show that our
+distance function achieves higher accuracy in comparison to the
+$1$-Wasserstein, $2$-Wasserstein, and TV distances for image retrieval tasks on
+noisy real-world data sets.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Symbolic Music Generation with Non-Differentiable Rule Guided Diffusion <span class="chip">ICML 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2402.14285v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2402.14285v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Yujia Huang, Adishree Ghatare, Yuanzhe Liu, Ziniu Hu, Qinsheng Zhang, Chandramouli S Sastry, Siddharth Gururani, Sageev Oore, Yisong Yue
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  We study the problem of symbolic music generation (e.g., generating piano
+rolls), with a technical focus on non-differentiable rule guidance. Musical
+rules are often expressed in symbolic form on note characteristics, such as
+note density or chord progression, many of which are non-differentiable which
+pose a challenge when using them for guided diffusion. We propose \oursfull
+(\ours), a novel guidance method that only requires forward evaluation of rule
+functions that can work with pre-trained diffusion models in a plug-and-play
+way, thus achieving training-free guidance for non-differentiable rules for the
+first time. Additionally, we introduce a latent diffusion architecture for
+symbolic music generation with high time resolution, which can be composed with
+SCG in a plug-and-play fashion. Compared to standard strong baselines in
+symbolic music generation, this framework demonstrates marked advancements in
+music quality and rule-based controllability, outperforming current
+state-of-the-art generators in a variety of settings. For detailed
+demonstrations, code and model checkpoints, please visit our project website:
+https://scg-rule-guided-music.github.io/.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>ICML 2024 (Oral)</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ CLAQ: Pushing the Limits of Low-Bit Post-Training Quantization for LLMs 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.17233v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.17233v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Haoyu Wang, Bei Liu, Hang Shao, Bo Xiao, Ke Zeng, Guanglu Wan, Yanmin Qian
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Parameter quantization for Large Language Models (LLMs) has attracted
+increasing attentions recently in reducing memory costs and improving
+computational efficiency. Early approaches have been widely adopted. However,
+the existing methods suffer from poor performance in low-bit (such as 2 to 3
+bits) scenarios. In this paper, we present a novel and effective Column-Level
+Adaptive weight Quantization (CLAQ) framework by introducing three different
+types of adaptive strategies for LLM quantization. Firstly, a K-Means
+clustering based algorithm is proposed that allows dynamic generation of
+quantization centroids for each column of a parameter matrix. Secondly, we
+design an outlier-guided adaptive precision search strategy which can
+dynamically assign varying bit-widths to different columns. Finally, a dynamic
+outlier reservation scheme is developed to retain some parameters in their
+original float point precision, in trade off of boosted model performance.
+Experiments on various mainstream open source LLMs including LLaMA-1, LLaMA-2
+and Yi demonstrate that our methods achieve the state-of-the-art results across
+different bit settings, especially in extremely low-bit scenarios. Code is
+available at https://github.com/fayuge/CLAQ.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Representing Molecules as Random Walks Over Interpretable Grammars 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2403.08147v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2403.08147v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Michael Sun, Minghao Guo, Weize Yuan, Veronika Thost, Crystal Elaine Owens, Aristotle Franklin Grosz, Sharvaa Selvan, Katelyn Zhou, Hassan Mohiuddin, Benjamin J Pedretti, Zachary P Smith, Jie Chen, Wojciech Matusik
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Recent research in molecular discovery has primarily been devoted to small,
+drug-like molecules, leaving many similarly important applications in material
+design without adequate technology. These applications often rely on more
+complex molecular structures with fewer examples that are carefully designed
+using known substructures. We propose a data-efficient and interpretable model
+for representing and reasoning over such molecules in terms of graph grammars
+that explicitly describe the hierarchical design space featuring motifs to be
+the design basis. We present a novel representation in the form of random walks
+over the design space, which facilitates both molecule generation and property
+prediction. We demonstrate clear advantages over existing methods in terms of
+performance, efficiency, and synthesizability of predicted molecules, and we
+provide detailed insights into the method's chemical interpretability.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Achieving $\tilde{O}(1/ε)$ Sample Complexity for Constrained
+  Markov Decision Process 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2402.16324v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2402.16324v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Jiashuo Jiang, Yinyu Ye
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  We consider the reinforcement learning problem for the constrained Markov
+decision process (CMDP), which plays a central role in satisfying safety or
+resource constraints in sequential learning and decision-making. In this
+problem, we are given finite resources and a MDP with unknown transition
+probabilities. At each stage, we take an action, collecting a reward and
+consuming some resources, all assumed to be unknown and need to be learned over
+time. In this work, we take the first step towards deriving optimal
+problem-dependent guarantees for the CMDP problems. We derive a logarithmic
+regret bound, which translates into a
+$O(\frac{1}{\Delta\cdot\eps}\cdot\log^2(1/\eps))$ sample complexity bound, with
+$\Delta$ being a problem-dependent parameter, yet independent of $\eps$. Our
+sample complexity bound improves upon the state-of-art $O(1/\eps^2)$ sample
+complexity for CMDP problems established in the previous literature, in terms
+of the dependency on $\eps$. To achieve this advance, we develop a new
+framework for analyzing CMDP problems. To be specific, our algorithm operates
+in the primal space and we resolve the primal LP for the CMDP problem at each
+period in an online manner, with \textit{adaptive} remaining resource
+capacities. The key elements of our algorithm are: i) a characterization of the
+instance hardness via LP basis, ii) an eliminating procedure that identifies
+one optimal basis of the primal LP, and; iii) a resolving procedure that is
+adaptive to the remaining resources and sticks to the characterized optimal
+basis.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Variational Schrödinger Diffusion Models <span class="chip">ICML 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.04795v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.04795v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Wei Deng, Weijian Luo, Yixin Tan, Marin Biloš, Yu Chen, Yuriy Nevmyvaka, Ricky T. Q. Chen
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Schr\"odinger bridge (SB) has emerged as the go-to method for optimizing
+transportation plans in diffusion models. However, SB requires estimating the
+intractable forward score functions, inevitably resulting in the costly
+implicit training loss based on simulated trajectories. To improve the
+scalability while preserving efficient transportation plans, we leverage
+variational inference to linearize the forward score functions (variational
+scores) of SB and restore simulation-free properties in training backward
+scores. We propose the variational Schr\"odinger diffusion model (VSDM), where
+the forward process is a multivariate diffusion and the variational scores are
+adaptively optimized for efficient transport. Theoretically, we use stochastic
+approximation to prove the convergence of the variational scores and show the
+convergence of the adaptively generated samples based on the optimal
+variational scores. Empirically, we test the algorithm in simulated examples
+and observe that VSDM is efficient in generations of anisotropic shapes and
+yields straighter sample trajectories compared to the single-variate diffusion.
+We also verify the scalability of the algorithm in real-world data and achieve
+competitive unconditional generation performance in CIFAR10 and conditional
+generation in time series modeling. Notably, VSDM no longer depends on warm-up
+initializations and has become tuning-friendly in training large-scale
+experiments.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>ICML 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ KTO: Model Alignment as Prospect Theoretic Optimization <span class="chip">ICML 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2402.01306v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2402.01306v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Kawin Ethayarajh, Winnie Xu, Niklas Muennighoff, Dan Jurafsky, Douwe Kiela
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Kahneman & Tversky's $\textit{prospect theory}$ tells us that humans perceive
+random variables in a biased but well-defined manner (1992); for example,
+humans are famously loss-averse. We show that objectives for aligning LLMs with
+human feedback implicitly incorporate many of these biases -- the success of
+these objectives (e.g., DPO) over cross-entropy minimization can partly be
+ascribed to them belonging to a family of loss functions that we call
+$\textit{human-aware losses}$ (HALOs). However, the utility functions these
+methods attribute to humans still differ from those in the prospect theory
+literature. Using a Kahneman-Tversky model of human utility, we propose a HALO
+that directly maximizes the utility of generations instead of maximizing the
+log-likelihood of preferences, as current methods do. We call this approach
+KTO, and it matches or exceeds the performance of preference-based methods at
+scales from 1B to 30B, despite only learning from a binary signal of whether an
+output is desirable. More broadly, our work suggests that there is no one HALO
+that is universally superior; the best loss depends on the inductive biases
+most appropriate for a given setting, an oft-overlooked consideration.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>ICML 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Policy Dispersion in Non-Markovian Environment 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2302.14509v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2302.14509v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Bohao Qu, Xiaofeng Cao, Jielong Yang, Hechang Chen, Chang Yi, Ivor W. Tsang, Yew-Soon Ong
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Markov Decision Process (MDP) presents a mathematical framework to formulate
+the learning processes of agents in reinforcement learning. MDP is limited by
+the Markovian assumption that a reward only depends on the immediate state and
+action. However, a reward sometimes depends on the history of states and
+actions, which may result in the decision process in a non-Markovian
+environment. In such environments, agents receive rewards via
+temporally-extended behaviors sparsely, and the learned policies may be
+similar. This leads the agents acquired with similar policies generally overfit
+to the given task and can not quickly adapt to perturbations of environments.
+To resolve this problem, this paper tries to learn the diverse policies from
+the history of state-action pairs under a non-Markovian environment, in which a
+policy dispersion scheme is designed for seeking diverse policy representation.
+Specifically, we first adopt a transformer-based method to learn policy
+embeddings. Then, we stack the policy embeddings to construct a dispersion
+matrix to induce a set of diverse policies. Finally, we prove that if the
+dispersion matrix is positive definite, the dispersed embeddings can
+effectively enlarge the disagreements across policies, yielding a diverse
+expression for the original policy embedding distribution. Experimental results
+show that this dispersion scheme can obtain more expressive diverse policies,
+which then derive more robust performance than recent learning baselines under
+various learning environments.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>In further research, we found that the core content of the paper
+  requires significant modification and that the entire paper needs to be
+  restructured. To enhance the scientific quality and contributions of the
+  paper, we have decided to resubmit it after completing the necessary
+  revisions and improvements</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ EGTR: Extracting Graph from <span class="highlight-title">Transformer</span> for Scene Graph Generation <span class="chip">CVPR 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2404.02072v4">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2404.02072v4.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Jinbae Im, JeongYeon Nam, Nokyung Park, Hyungmin Lee, Seunghyun Park
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Scene Graph Generation (SGG) is a challenging task of detecting objects and
+predicting relationships between objects. After DETR was developed, one-stage
+SGG models based on a one-stage object detector have been actively studied.
+However, complex modeling is used to predict the relationship between objects,
+and the inherent relationship between object queries learned in the multi-head
+self-attention of the object detector has been neglected. We propose a
+lightweight one-stage SGG model that extracts the relation graph from the
+various relationships learned in the multi-head self-attention layers of the
+DETR decoder. By fully utilizing the self-attention by-products, the relation
+graph can be extracted effectively with a shallow relation extraction head.
+Considering the dependency of the relation extraction task on the object
+detection task, we propose a novel relation smoothing technique that adjusts
+the relation label adaptively according to the quality of the detected objects.
+By the relation smoothing, the model is trained according to the continuous
+curriculum that focuses on object detection task at the beginning of training
+and performs multi-task learning as the object detection performance gradually
+improves. Furthermore, we propose a connectivity prediction task that predicts
+whether a relation exists between object pairs as an auxiliary task of the
+relation extraction. We demonstrate the effectiveness and efficiency of our
+method for the Visual Genome and Open Image V6 datasets. Our code is publicly
+available at https://github.com/naver-ai/egtr.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>CVPR 2024 (Best paper award candidate)</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ E$^{2}$GAN: Efficient Training of Efficient GANs for Image-to-Image
+  Translation <span class="chip">ICML 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2401.06127v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2401.06127v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Yifan Gong, Zheng Zhan, Qing Jin, Yanyu Li, Yerlan Idelbayev, Xian Liu, Andrey Zharkov, Kfir Aberman, Sergey Tulyakov, Yanzhi Wang, Jian Ren
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  One highly promising direction for enabling flexible real-time on-device
+image editing is utilizing data distillation by leveraging large-scale
+text-to-image diffusion models to generate paired datasets used for training
+generative adversarial networks (GANs). This approach notably alleviates the
+stringent requirements typically imposed by high-end commercial GPUs for
+performing image editing with diffusion models. However, unlike text-to-image
+diffusion models, each distilled GAN is specialized for a specific image
+editing task, necessitating costly training efforts to obtain models for
+various concepts. In this work, we introduce and address a novel research
+direction: can the process of distilling GANs from diffusion models be made
+significantly more efficient? To achieve this goal, we propose a series of
+innovative techniques. First, we construct a base GAN model with generalized
+features, adaptable to different concepts through fine-tuning, eliminating the
+need for training from scratch. Second, we identify crucial layers within the
+base GAN model and employ Low-Rank Adaptation (LoRA) with a simple yet
+effective rank search process, rather than fine-tuning the entire base model.
+Third, we investigate the minimal amount of data necessary for fine-tuning,
+further reducing the overall training time. Extensive experiments show that we
+can efficiently empower GANs with the ability to perform real-time high-quality
+image editing on mobile devices with remarkably reduced training and storage
+costs for each concept.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>ICML 2024. Project Page: https://yifanfanfanfan.github.io/e2gan/</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ GraphAny: A Foundation Model for Node Classification on Any Graph 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20445v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20445v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Jianan Zhao, Hesham Mostafa, Mikhail Galkin, Michael Bronstein, Zhaocheng Zhu, Jian Tang
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Foundation models that can perform inference on any new task without
+requiring specific training have revolutionized machine learning in vision and
+language applications. However, applications involving graph-structured data
+remain a tough nut for foundation models, due to challenges in the unique
+feature- and label spaces associated with each graph. Traditional graph ML
+models such as graph neural networks (GNNs) trained on graphs cannot perform
+inference on a new graph with feature and label spaces different from the
+training ones. Furthermore, existing models learn functions specific to the
+training graph and cannot generalize to new graphs. In this work, we tackle
+these two challenges with a new foundational architecture for inductive node
+classification named GraphAny. GraphAny models inference on a new graph as an
+analytical solution to a LinearGNN, thereby solving the first challenge. To
+solve the second challenge, we learn attention scores for each node to fuse the
+predictions of multiple LinearGNNs. Specifically, the attention module is
+carefully parameterized as a function of the entropy-normalized
+distance-features between multiple LinearGNNs predictions to ensure
+generalization to new graphs. Empirically, GraphAny trained on the Wisconsin
+dataset with only 120 labeled nodes can effectively generalize to 30 new graphs
+with an average accuracy of 67.26\% in an inductive manner, surpassing GCN and
+GAT trained in the supervised regime, as well as other inductive baselines.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Preprint. Work in progress</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Consistency of semi-supervised learning, stochastic tug-of-war games,
+  and the p-Laplacian 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2401.07463v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2401.07463v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Jeff Calder, Nadejda Drenska
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  In this paper we give a broad overview of the intersection of partial
+differential equations (PDEs) and graph-based semi-supervised learning. The
+overview is focused on a large body of recent work on PDE continuum limits of
+graph-based learning, which have been used to prove well-posedness of
+semi-supervised learning algorithms in the large data limit. We highlight some
+interesting research directions revolving around consistency of graph-based
+semi-supervised learning, and present some new results on the consistency of
+$p$-Laplacian semi-supervised learning using the stochastic tug-of-war game
+interpretation of the $p$-Laplacian. We also present the results of some
+numerical experiments that illustrate our results and suggest directions for
+future work.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ DFA-RAG: Conversational Semantic Router for Large Language Model with
+  Definite Finite Automaton <span class="chip">ICML 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2402.04411v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2402.04411v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Yiyou Sun, Junjie Hu, Wei Cheng, Haifeng Chen
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  This paper introduces the retrieval-augmented large language model with
+Definite Finite Automaton (DFA-RAG), a novel framework designed to enhance the
+capabilities of conversational agents using large language models (LLMs).
+Traditional LLMs face challenges in generating regulated and compliant
+responses in special scenarios with predetermined response guidelines, like
+emotional support and customer service. Our framework addresses these
+challenges by embedding a Definite Finite Automaton (DFA), learned from
+training dialogues, within the LLM. This structured approach acts as a semantic
+router which enables the LLM to adhere to a deterministic response pathway. The
+routing is achieved by the retrieval-augmentation generation (RAG) strategy,
+which carefully selects dialogue examples aligned with the current
+conversational context. The advantages of DFA-RAG include an interpretable
+structure through human-readable DFA, context-aware retrieval for responses in
+conversations, and plug-and-play compatibility with existing LLMs. Extensive
+benchmarks validate DFA-RAG's effectiveness, indicating its potential as a
+valuable contribution to the conversational agent.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted to ICML 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Understanding Sample Generation Strategies for Learning Heuristic
+  Functions in Classical Planning 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2211.13316v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2211.13316v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        R. V. Bettker, P. P. Minini, A. G. Pereira, M. Ritt
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  We study the problem of learning good heuristic functions for classical
+planning tasks with neural networks based on samples represented by states with
+their cost-to-goal estimates. The heuristic function is learned for a state
+space and goal condition with the number of samples limited to a fraction of
+the size of the state space, and must generalize well for all states of the
+state space with the same goal condition. Our main goal is to better understand
+the influence of sample generation strategies on the performance of a greedy
+best-first heuristic search (GBFS) guided by a learned heuristic function. In a
+set of controlled experiments, we find that two main factors determine the
+quality of the learned heuristic: the algorithm used to generate the sample set
+and how close the sample estimates to the perfect cost-to-goal are. These two
+factors are dependent: having perfect cost-to-goal estimates is insufficient if
+the samples are not well distributed across the state space. We also study
+other effects, such as adding samples with high-value estimates. Based on our
+findings, we propose practical strategies to improve the quality of learned
+heuristics: three strategies that aim to generate more representative states
+and two strategies that improve the cost-to-goal estimates. Our practical
+strategies result in a learned heuristic that, when guiding a GBFS algorithm,
+increases by more than 30% the mean coverage compared to a baseline learned
+heuristic.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>29 pages</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ SOUL: Unlocking the Power of Second-Order Optimization for LLM
+  Unlearning 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2404.18239v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2404.18239v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Jinghan Jia, Yihua Zhang, Yimeng Zhang, Jiancheng Liu, Bharat Runwal, James Diffenderfer, Bhavya Kailkhura, Sijia Liu
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Large Language Models (LLMs) have highlighted the necessity of effective
+unlearning mechanisms to comply with data regulations and ethical AI practices.
+LLM unlearning aims at removing undesired data influences and associated model
+capabilities without compromising utility out of the scope of unlearning. While
+interest in studying LLM unlearning is growing,the impact of the optimizer
+choice for LLM unlearning remains under-explored. In this work, we shed light
+on the significance of optimizer selection in LLM unlearning for the first
+time, establishing a clear connection between {second-order optimization} and
+influence unlearning (a classical approach using influence functions to update
+the model for data influence removal). This insight propels us to develop a
+second-order unlearning framework, termed SOUL, built upon the second-order
+clipped stochastic optimization (Sophia)-based LLM training method. SOUL
+extends the static, one-shot model update using influence unlearning to a
+dynamic, iterative unlearning process. Our extensive experiments show that SOUL
+consistently outperforms conventional first-order methods across various
+unlearning tasks, models, and metrics, suggesting the promise of second-order
+optimization in providing a scalable and easily implementable solution for LLM
+unlearning.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Position: Considerations for Differentially Private Learning with
+  Large-Scale Public <span class="highlight-title">Pretrain</span>ing <span class="chip">ICML 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2212.06470v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2212.06470v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Florian Tramèr, Gautam Kamath, Nicholas Carlini
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  The performance of differentially private machine learning can be boosted
+significantly by leveraging the transfer learning capabilities of non-private
+models pretrained on large public datasets. We critically review this approach.
+  We primarily question whether the use of large Web-scraped datasets should be
+viewed as differential-privacy-preserving. We caution that publicizing these
+models pretrained on Web data as "private" could lead to harm and erode the
+public's trust in differential privacy as a meaningful definition of privacy.
+  Beyond the privacy considerations of using public data, we further question
+the utility of this paradigm. We scrutinize whether existing machine learning
+benchmarks are appropriate for measuring the ability of pretrained models to
+generalize to sensitive domains, which may be poorly represented in public Web
+data. Finally, we notice that pretraining has been especially impactful for the
+largest available models -- models sufficiently large to prohibit end users
+running them on their own devices. Thus, deploying such models today could be a
+net loss for privacy, as it would require (private) data to be outsourced to a
+more compute-powerful third party.
+  We conclude by discussing potential paths forward for the field of private
+learning, as public pretraining becomes more popular and powerful.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>ICML 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Provably Stable Feature Rankings with SHAP and LIME 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2401.15800v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2401.15800v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Jeremy Goldwasser, Giles Hooker
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Feature attributions are ubiquitous tools for understanding the predictions
+of machine learning models. However, the calculation of popular methods for
+scoring input variables such as SHAP and LIME suffers from high instability due
+to random sampling. Leveraging ideas from multiple hypothesis testing, we
+devise attribution methods that ensure the most important features are ranked
+correctly with high probability. Given SHAP estimates from KernelSHAP or
+Shapley Sampling, we demonstrate how to retrospectively verify the number of
+stable rankings. Further, we introduce efficient sampling algorithms for SHAP
+and LIME that guarantee the $K$ highest-ranked features have the proper
+ordering. Finally, we show how to adapt these local feature attribution methods
+for the global importance setting.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Communication-Efficient Gradient Descent-Accent Methods for Distributed
+  Variational Inequalities: Unified Analysis and Local Updates <span class="chip">ICLR 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2306.05100v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2306.05100v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Siqi Zhang, Sayantan Choudhury, Sebastian U Stich, Nicolas Loizou
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Distributed and federated learning algorithms and techniques associated
+primarily with minimization problems. However, with the increase of minimax
+optimization and variational inequality problems in machine learning, the
+necessity of designing efficient distributed/federated learning approaches for
+these problems is becoming more apparent. In this paper, we provide a unified
+convergence analysis of communication-efficient local training methods for
+distributed variational inequality problems (VIPs). Our approach is based on a
+general key assumption on the stochastic estimates that allows us to propose
+and analyze several novel local training algorithms under a single framework
+for solving a class of structured non-monotone VIPs. We present the first local
+gradient descent-accent algorithms with provable improved communication
+complexity for solving distributed variational inequalities on heterogeneous
+data. The general algorithmic framework recovers state-of-the-art algorithms
+and their sharp convergence guarantees when the setting is specialized to
+minimization or minimax optimization problems. Finally, we demonstrate the
+strong performance of the proposed algorithms compared to state-of-the-art
+methods when solving federated minimax optimization problems.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>ICLR 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ A SARS-CoV-2 Interaction <span class="highlight-title">Dataset</span> and VHH Sequence Corpus for Antibody
+  Language Models 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.18749v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.18749v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Hirofumi Tsuruta, Hiroyuki Yamazaki, Ryota Maeda, Ryotaro Tamura, Akihiro Imura
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Antibodies are crucial proteins produced by the immune system to eliminate
+harmful foreign substances and have become pivotal therapeutic agents for
+treating human diseases. To accelerate the discovery of antibody therapeutics,
+there is growing interest in constructing language models using antibody
+sequences. However, the applicability of pre-trained language models for
+antibody discovery has not been thoroughly evaluated due to the scarcity of
+labeled datasets. To overcome these limitations, we introduce AVIDa-SARS-CoV-2,
+a dataset featuring the antigen-variable domain of heavy chain of heavy chain
+antibody (VHH) interactions obtained from two alpacas immunized with severe
+acute respiratory syndrome coronavirus 2 (SARS-CoV-2) spike proteins.
+AVIDa-SARS-CoV-2 includes binary labels indicating the binding or non-binding
+of diverse VHH sequences to 12 SARS-CoV-2 mutants, such as the Delta and
+Omicron variants. Furthermore, we release VHHCorpus-2M, a pre-training dataset
+for antibody language models, containing over two million VHH sequences. We
+report benchmark results for predicting SARS-CoV-2-VHH binding using VHHBERT
+pre-trained on VHHCorpus-2M and existing general protein and antibody-specific
+pre-trained language models. These results confirm that AVIDa-SARS-CoV-2
+provides valuable benchmarks for evaluating the representation capabilities of
+antibody language models for binding prediction, thereby facilitating the
+development of AI-driven antibody discovery. The datasets are available at
+https://datasets.cognanous.com.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                    </div>
+                </details>
+            </article>
+            <article>
+                <details>
+                    <Summary>
+                        Multimedia <span class="chip" style="font-size: 60%">11</span>
+                    </Summary>
+                    <div class="details-content">
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ SAM as the Guide: Mastering Pseudo-Label Refinement in Semi-Supervised
+  Referring Expression Segmentation <span class="chip">ICML2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.01451v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.01451v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Danni Yang, Jiayi Ji, Yiwei Ma, Tianyu Guo, Haowei Wang, Xiaoshuai Sun, Rongrong Ji
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  In this paper, we introduce SemiRES, a semi-supervised framework that
+effectively leverages a combination of labeled and unlabeled data to perform
+RES. A significant hurdle in applying semi-supervised techniques to RES is the
+prevalence of noisy pseudo-labels, particularly at the boundaries of objects.
+SemiRES incorporates the Segment Anything Model (SAM), renowned for its precise
+boundary demarcation, to improve the accuracy of these pseudo-labels. Within
+SemiRES, we offer two alternative matching strategies: IoU-based Optimal
+Matching (IOM) and Composite Parts Integration (CPI). These strategies are
+designed to extract the most accurate masks from SAM's output, thus guiding the
+training of the student model with enhanced precision. In instances where a
+precise mask cannot be matched from the available candidates, we develop the
+Pixel-Wise Adjustment (PWA) strategy, guiding the student model's training
+directly by the pseudo-labels. Extensive experiments on three RES
+benchmarks--RefCOCO, RefCOCO+, and G-Ref reveal its superior performance
+compared to fully supervised methods. Remarkably, with only 1% labeled data,
+our SemiRES outperforms the supervised baseline by a large margin, e.g. +18.64%
+gains on RefCOCO val set. The project code is available at
+\url{https://github.com/nini0919/SemiRES}.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted by ICML2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Sequence-to-Sequence Multi-Modal Speech In-Painting 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.01321v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.01321v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Mahsa Kadkhodaei Elyaderani, Shahram Shirani
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Speech in-painting is the task of regenerating missing audio contents using
+reliable context information. Despite various recent studies in multi-modal
+perception of audio in-painting, there is still a need for an effective
+infusion of visual and auditory information in speech in-painting. In this
+paper, we introduce a novel sequence-to-sequence model that leverages the
+visual information to in-paint audio signals via an encoder-decoder
+architecture. The encoder plays the role of a lip-reader for facial recordings
+and the decoder takes both encoder outputs as well as the distorted audio
+spectrograms to restore the original speech. Our model outperforms an
+audio-only speech in-painting model and has comparable results with a recent
+multi-modal speech in-painter in terms of speech quality and intelligibility
+metrics for distortions of 300 ms to 1500 ms duration, which proves the
+effectiveness of the introduced multi-modality in speech in-painting.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Demo: Soccer Information Retrieval via Natural Queries using SoccerRAG 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.01280v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.01280v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Aleksander Theo Strand, Sushant Gautam, Cise Midoglu, Pål Halvorsen
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  The rapid evolution of digital sports media necessitates sophisticated
+information retrieval systems that can efficiently parse extensive multimodal
+datasets. This paper demonstrates SoccerRAG, an innovative framework designed
+to harness the power of Retrieval Augmented Generation (RAG) and Large Language
+Models (LLMs) to extract soccer-related information through natural language
+queries. By leveraging a multimodal dataset, SoccerRAG supports dynamic
+querying and automatic data validation, enhancing user interaction and
+accessibility to sports archives. We present a novel interactive user interface
+(UI) based on the Chainlit framework which wraps around the core functionality,
+and enable users to interact with the SoccerRAG framework in a chatbot-like
+visual manner.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>accepted to CBMI 2024 as a demonstration;
+  https://github.com/simula/soccer-rag</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ SoccerRAG: Multimodal Soccer Information Retrieval via Natural Queries 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.01273v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.01273v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Aleksander Theo Strand, Sushant Gautam, Cise Midoglu, Pål Halvorsen
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  The rapid evolution of digital sports media necessitates sophisticated
+information retrieval systems that can efficiently parse extensive multimodal
+datasets. This paper introduces SoccerRAG, an innovative framework designed to
+harness the power of Retrieval Augmented Generation (RAG) and Large Language
+Models (LLMs) to extract soccer-related information through natural language
+queries. By leveraging a multimodal dataset, SoccerRAG supports dynamic
+querying and automatic data validation, enhancing user interaction and
+accessibility to sports archives. Our evaluations indicate that SoccerRAG
+effectively handles complex queries, offering significant improvements over
+traditional retrieval systems in terms of accuracy and user engagement. The
+results underscore the potential of using RAG and LLMs in sports analytics,
+paving the way for future advancements in the accessibility and real-time
+processing of sports data.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>accepted to CBMI 2024 as a regular paper;
+  https://github.com/simula/soccer-rag</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Generalized Jersey Number Recognition Using Multi-task Learning With
+  Orientation-guided Weight Refinement 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.01033v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.01033v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Yung-Hui Lin, Yu-Wen Chang, Huang-Chia Shih, Takahiro Ogawa
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Jersey number recognition (JNR) has always been an important task in sports
+analytics. Improving recognition accuracy remains an ongoing challenge because
+images are subject to blurring, occlusion, deformity, and low resolution.
+Recent research has addressed these problems using number localization and
+optical character recognition. Some approaches apply player identification
+schemes to image sequences, ignoring the impact of human body rotation angles
+on jersey digit identification. Accurately predicting the number of jersey
+digits by using a multi-task scheme to recognize each individual digit enables
+more robust results. Based on the above considerations, this paper proposes a
+multi-task learning method called the angle-digit refine scheme (ADRS), which
+combines human body orientation angles and digit number clues to recognize
+athletic jersey numbers. Based on our experimental results, our approach
+increases inference information, significantly improving prediction accuracy.
+Compared to state-of-the-art methods, which can only handle a single type of
+sport, the proposed method produces a more diverse and practical JNR
+application. The incorporation of diverse types of team sports such as soccer,
+football, basketball, volleyball, and baseball into our dataset contributes
+greatly to generalized JNR in sports analytics. Our accuracy achieves 64.07% on
+Top-1 and 89.97% on Top-2, with corresponding F1 scores of 67.46% and 90.64%,
+respectively.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>10 pages, 6 figures, 5 tables</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Advancing Weakly-Supervised Audio-Visual Video Parsing via Segment-wise
+  Pseudo Labeling 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.00919v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.00919v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Jinxing Zhou, Dan Guo, Yiran Zhong, Meng Wang
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  The Audio-Visual Video Parsing task aims to identify and temporally localize
+the events that occur in either or both the audio and visual streams of audible
+videos. It often performs in a weakly-supervised manner, where only video event
+labels are provided, \ie, the modalities and the timestamps of the labels are
+unknown. Due to the lack of densely annotated labels, recent work attempts to
+leverage pseudo labels to enrich the supervision. A commonly used strategy is
+to generate pseudo labels by categorizing the known video event labels for each
+modality. However, the labels are still confined to the video level, and the
+temporal boundaries of events remain unlabeled. In this paper, we propose a new
+pseudo label generation strategy that can explicitly assign labels to each
+video segment by utilizing prior knowledge learned from the open world.
+Specifically, we exploit the large-scale pretrained models, namely CLIP and
+CLAP, to estimate the events in each video segment and generate segment-level
+visual and audio pseudo labels, respectively. We then propose a new loss
+function to exploit these pseudo labels by taking into account their
+category-richness and segment-richness. A label denoising strategy is also
+adopted to further improve the visual pseudo labels by flipping them whenever
+abnormally large forward losses occur. We perform extensive experiments on the
+LLP dataset and demonstrate the effectiveness of each proposed design and we
+achieve state-of-the-art video parsing performance on all types of event
+parsing, \ie, audio event, visual event, and audio-visual event. We also
+examine the proposed pseudo label generation strategy on a relevant
+weakly-supervised audio-visual event localization task and the experimental
+results again verify the benefits and generalization of our method.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>IJCV 2024 Accepted. arXiv admin note: substantial text overlap with
+  arXiv:2303.02344</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ VIEScore: Towards Explainable Metrics for Conditional Image Synthesis
+  Evaluation <span class="chip">ACL2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2312.14867v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2312.14867v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Max Ku, Dongfu Jiang, Cong Wei, Xiang Yue, Wenhu Chen
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  In the rapidly advancing field of conditional image generation research,
+challenges such as limited explainability lie in effectively evaluating the
+performance and capabilities of various models. This paper introduces VIEScore,
+a Visual Instruction-guided Explainable metric for evaluating any conditional
+image generation tasks. VIEScore leverages general knowledge from Multimodal
+Large Language Models (MLLMs) as the backbone and does not require training or
+fine-tuning. We evaluate VIEScore on seven prominent tasks in conditional image
+tasks and found: (1) VIEScore (GPT4-o) achieves a high Spearman correlation of
+0.4 with human evaluations, while the human-to-human correlation is 0.45. (2)
+VIEScore (with open-source MLLM) is significantly weaker than GPT-4o and GPT-4v
+in evaluating synthetic images. (3) VIEScore achieves a correlation on par with
+human ratings in the generation tasks but struggles in editing tasks. With
+these results, we believe VIEScore shows its great potential to replace human
+judges in evaluating image synthesis tasks.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted to ACL2024 main</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ NU-Class Net: A Novel Approach for Video Quality Enhancement 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2401.01163v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2401.01163v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Parham Zilouchian Moghaddam, Mehdi Modarressi, Mohammad Amin Sadeghi
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Video content has experienced a surge in popularity, asserting its dominance
+over internet traffic and Internet of Things (IoT) networks. Video compression
+has long been regarded as the primary means of efficiently managing the
+substantial multimedia traffic generated by video-capturing devices.
+Nevertheless, video compression algorithms entail significant computational
+demands in order to achieve substantial compression ratios. This complexity
+presents a formidable challenge when implementing efficient video coding
+standards in resource-constrained embedded systems, such as IoT edge node
+cameras. To tackle this challenge, this paper introduces NU-Class Net, an
+innovative deep-learning model designed to mitigate compression artifacts
+stemming from lossy compression codecs. This enhancement significantly elevates
+the perceptible quality of low-bit-rate videos. By employing the NU-Class Net,
+the video encoder within the video-capturing node can reduce output quality,
+thereby generating low-bit-rate videos and effectively curtailing both
+computation and bandwidth requirements at the edge. On the decoder side, which
+is typically less encumbered by resource limitations, NU-Class Net is applied
+after the video decoder to compensate for artifacts and approximate the quality
+of the original video. Experimental results affirm the efficacy of the proposed
+model in enhancing the perceptible quality of videos, especially those streamed
+at low bit rates.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Exploring the Robustness of Decision-Level Through Adversarial Attacks
+  on LLM-Based Embodied Models 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.19802v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.19802v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Shuyuan Liu, Jiawei Chen, Shouwei Ruan, Hang Su, Zhaoxia Yin
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Embodied intelligence empowers agents with a profound sense of perception,
+enabling them to respond in a manner closely aligned with real-world
+situations. Large Language Models (LLMs) delve into language instructions with
+depth, serving a crucial role in generating plans for intricate tasks. Thus,
+LLM-based embodied models further enhance the agent's capacity to comprehend
+and process information. However, this amalgamation also ushers in new
+challenges in the pursuit of heightened intelligence. Specifically, attackers
+can manipulate LLMs to produce irrelevant or even malicious outputs by altering
+their prompts. Confronted with this challenge, we observe a notable absence of
+multi-modal datasets essential for comprehensively evaluating the robustness of
+LLM-based embodied models. Consequently, we construct the Embodied Intelligent
+Robot Attack Dataset (EIRAD), tailored specifically for robustness evaluation.
+Additionally, two attack strategies are devised, including untargeted attacks
+and targeted attacks, to effectively simulate a range of diverse attack
+scenarios. At the same time, during the attack process, to more accurately
+ascertain whether our method is successful in attacking the LLM-based embodied
+model, we devise a new attack success evaluation method utilizing the BLIP2
+model. Recognizing the time and cost-intensive nature of the GCG algorithm in
+attacks, we devise a scheme for prompt suffix initialization based on various
+target tasks, thus expediting the convergence process. Experimental results
+demonstrate that our method exhibits a superior attack success rate when
+targeting LLM-based embodied models, indicating a lower level of decision-level
+robustness in these models.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Edit As You Wish: Video Caption Editing with Multi-grained User Control 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2305.08389v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2305.08389v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Linli Yao, Yuanmeng Zhang, Ziheng Wang, Xinglin Hou, Tiezheng Ge, Yuning Jiang, Xu Sun, Qin Jin
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Automatically narrating videos in natural language complying with user
+requests, i.e. Controllable Video Captioning task, can help people manage
+massive videos with desired intentions. However, existing works suffer from two
+shortcomings: 1) the control signal is single-grained which can not satisfy
+diverse user intentions; 2) the video description is generated in a single
+round which can not be further edited to meet dynamic needs. In this paper, we
+propose a novel \textbf{V}ideo \textbf{C}aption \textbf{E}diting \textbf{(VCE)}
+task to automatically revise an existing video description guided by
+multi-grained user requests. Inspired by human writing-revision habits, we
+design the user command as a pivotal triplet \{\textit{operation, position,
+attribute}\} to cover diverse user needs from coarse-grained to fine-grained.
+To facilitate the VCE task, we \textit{automatically} construct an open-domain
+benchmark dataset named VATEX-EDIT and \textit{manually} collect an e-commerce
+dataset called EMMAD-EDIT. We further propose a specialized small-scale model
+(i.e., OPA) compared with two generalist Large Multi-modal Models to perform an
+exhaustive analysis of the novel task. For evaluation, we adopt comprehensive
+metrics considering caption fluency, command-caption consistency, and
+video-caption alignment. Experiments reveal the task challenges of fine-grained
+multi-modal semantics understanding and processing. Our datasets, codes, and
+evaluation tools are ready to be open-sourced.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Heracles: A Hybrid SSM-<span class="highlight-title">Transformer</span> Model for High-Resolution Image and
+  Time-Series Analysis 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2403.18063v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2403.18063v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Badri N. Patro, Suhas Ranganath, Vinay P. Namboodiri, Vijay S. Agneeswaran
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Transformers have revolutionized image modeling tasks with adaptations like
+DeIT, Swin, SVT, Biformer, STVit, and FDVIT. However, these models often face
+challenges with inductive bias and high quadratic complexity, making them less
+efficient for high-resolution images. State space models (SSMs) such as Mamba,
+V-Mamba, ViM, and SiMBA offer an alternative to handle high resolution images
+in computer vision tasks. These SSMs encounter two major issues. First, they
+become unstable when scaled to large network sizes. Second, although they
+efficiently capture global information in images, they inherently struggle with
+handling local information. To address these challenges, we introduce Heracles,
+a novel SSM that integrates a local SSM, a global SSM, and an attention-based
+token interaction module. Heracles leverages a Hartely kernel-based state space
+model for global image information, a localized convolutional network for local
+details, and attention mechanisms in deeper layers for token interactions. Our
+extensive experiments demonstrate that Heracles-C-small achieves
+state-of-the-art performance on the ImageNet dataset with 84.5\% top-1
+accuracy. Heracles-C-Large and Heracles-C-Huge further improve accuracy to
+85.9\% and 86.4\%, respectively. Additionally, Heracles excels in transfer
+learning tasks on datasets such as CIFAR-10, CIFAR-100, Oxford Flowers, and
+Stanford Cars, and in instance segmentation on the MSCOCO dataset. Heracles
+also proves its versatility by achieving state-of-the-art results on seven
+time-series datasets, showcasing its ability to generalize across domains with
+spectral data, capturing both local and global information. The project page is
+available at this link.\url{https://github.com/badripatro/heracles}
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                    </div>
+                </details>
+            </article>
+    </section>
+    <section class="day-container">
+        <div class="date">
+            <time datetime="2024-06-02T00:00:00Z">2024-06-02</time>
+        </div>
+            <article>
+                <details>
+                    <Summary>
+                        Computation and Language <span class="chip" style="font-size: 60%">43</span>
+                    </Summary>
+                    <div class="details-content">
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ LLM-based Hierarchical Concept Decomposition for Interpretable
+  Fine-Grained Image Classification 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.18672v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.18672v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Renyi Qu, Mark Yatskar
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  (Renyi Qu's Master's Thesis) Recent advancements in interpretable models for
+vision-language tasks have achieved competitive performance; however, their
+interpretability often suffers due to the reliance on unstructured text outputs
+from large language models (LLMs). This introduces randomness and compromises
+both transparency and reliability, which are essential for addressing safety
+issues in AI systems. We introduce \texttt{Hi-CoDe} (Hierarchical Concept
+Decomposition), a novel framework designed to enhance model interpretability
+through structured concept analysis. Our approach consists of two main
+components: (1) We use GPT-4 to decompose an input image into a structured
+hierarchy of visual concepts, thereby forming a visual concept tree. (2) We
+then employ an ensemble of simple linear classifiers that operate on
+concept-specific features derived from CLIP to perform classification. Our
+approach not only aligns with the performance of state-of-the-art models but
+also advances transparency by providing clear insights into the decision-making
+process and highlighting the importance of various concepts. This allows for a
+detailed analysis of potential failure modes and improves model compactness,
+therefore setting a new benchmark in interpretability without compromising the
+accuracy.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Physics of Language Models: Part 1, Learning Hierarchical Language
+  Structures 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2305.13673v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2305.13673v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Zeyuan Allen-Zhu, Yuanzhi Li
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Transformer-based language models are effective but complex, and
+understanding their inner workings is a significant challenge. Previous
+research has primarily explored how these models handle simple tasks like name
+copying or selection, and we extend this by investigating how these models
+grasp complex, recursive language structures defined by context-free grammars
+(CFGs). We introduce a family of synthetic CFGs that produce hierarchical
+rules, capable of generating lengthy sentences (e.g., hundreds of tokens) that
+are locally ambiguous and require dynamic programming to parse. Despite this
+complexity, we demonstrate that generative models like GPT can accurately learn
+this CFG language and generate sentences based on it. We explore the model's
+internals, revealing that its hidden states precisely capture the structure of
+CFGs, and its attention patterns resemble the information passing in a dynamic
+programming algorithm.
+  This paper also presents several corollaries, including showing why
+positional embedding is inferior to relative attention or rotary embedding;
+demonstrating that encoder-based models (e.g., BERT, deBERTa) cannot learn very
+deeply nested CFGs as effectively as generative models (e.g., GPT); and
+highlighting the necessity of adding structural and syntactic errors to the
+pretraining data to make the model more robust to corrupted language prefixes.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>V2+V3 polishes writing; V3 includes Figures 6 and 10 for better
+  illustrations of our results</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Reinforcement of Explainability of Chat<span class="highlight-title">GPT</span> <span class="highlight-title">Prompt</span>s by Embedding Breast
+  Cancer Self-Screening Rules into AI Responses 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2404.14454v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2404.14454v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Yousef Khan, Ahmed Abdeen Hamed
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Addressing the global challenge of breast cancer, this research explores the
+fusion of generative AI, focusing on ChatGPT 3.5 turbo model, and the
+intricacies of breast cancer risk assessment. The research aims to evaluate
+ChatGPT's reasoning capabilities, emphasizing its potential to process rules
+and provide explanations for screening recommendations. The study seeks to
+bridge the technology gap between intelligent machines and clinicians by
+demonstrating ChatGPT's unique proficiency in natural language reasoning. The
+methodology employs a supervised prompt-engineering approach to enforce
+detailed explanations for ChatGPT's recommendations. Synthetic use cases,
+generated algorithmically, serve as the testing ground for the encoded rules,
+evaluating the model's processing prowess. Findings highlight ChatGPT's
+promising capacity in processing rules comparable to Expert System Shells, with
+a focus on natural language reasoning. The research introduces the concept of
+reinforcement explainability, showcasing its potential in elucidating outcomes
+and facilitating user-friendly interfaces for breast cancer risk assessment.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>9 pages, 5 figures, 3 algorithms, 1 table, submitted to the IEEE
+  MedAI'24 Conference</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ ProLex: A Benchmark for Language Proficiency-oriented Lexical
+  Substitution <span class="chip">ACL 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2401.11356v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2401.11356v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Xuanming Zhang, Zixun Chen, Zhou Yu
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Lexical Substitution discovers appropriate substitutes for a given target
+word in a context sentence. However, the task fails to consider substitutes
+that are of equal or higher proficiency than the target, an aspect that could
+be beneficial for language learners looking to improve their writing. To bridge
+this gap, we propose a new task, language proficiency-oriented lexical
+substitution. We also introduce ProLex, a novel benchmark designed to assess
+systems' ability to generate not only appropriate substitutes but also
+substitutes that demonstrate better language proficiency. Besides the
+benchmark, we propose models that can automatically perform the new task. We
+show that our best model, a Llama2-13B model fine-tuned with task-specific
+synthetic data, outperforms ChatGPT by an average of 3.2% in F-score and
+achieves comparable results with GPT-4 on ProLex.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>In ACL 2024 Findings, 19 pages, 4 figures, 14 tables</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Cross-lingual Cross-temporal Summarization: <span class="highlight-title">Dataset</span>, Models, Evaluation 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2306.12916v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2306.12916v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Ran Zhang, Jihed Ouni, Steffen Eger
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  While summarization has been extensively researched in natural language
+processing (NLP), cross-lingual cross-temporal summarization (CLCTS) is a
+largely unexplored area that has the potential to improve cross-cultural
+accessibility and understanding. This paper comprehensively addresses the CLCTS
+task, including dataset creation, modeling, and evaluation. We (1) build the
+first CLCTS corpus with 328 instances for hDe-En (extended version with 455
+instances) and 289 for hEn-De (extended version with 501 instances), leveraging
+historical fiction texts and Wikipedia summaries in English and German; (2)
+examine the effectiveness of popular transformer end-to-end models with
+different intermediate finetuning tasks; (3) explore the potential of GPT-3.5
+as a summarizer; (4) report evaluations from humans, GPT-4, and several recent
+automatic evaluation metrics. Our results indicate that intermediate task
+finetuned end-to-end models generate bad to moderate quality summaries while
+GPT-3.5, as a zero-shot summarizer, provides moderate to good quality outputs.
+GPT-3.5 also seems very adept at normalizing historical text. To assess data
+contamination in GPT-3.5, we design an adversarial attack scheme in which we
+find that GPT-3.5 performs slightly worse for unseen source documents compared
+to seen documents. Moreover, it sometimes hallucinates when the source
+sentences are inverted against its prior knowledge with a summarization
+accuracy of 0.67 for plot omission, 0.71 for entity swap, and 0.53 for plot
+negation. Overall, our regression results of model performances suggest that
+longer, older, and more complex source texts (all of which are more
+characteristic for historical language variants) are harder to summarize for
+all models, indicating the difficulty of the CLCTS task.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Computational Linguistics. Submitted manuscript.
+  https://direct.mit.edu/coli/article/doi/10.1162/coli_a_00519/121095/Cross-lingual-Cross-temporal-Summarization-Dataset</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ LLM-SR: Scientific Equation Discovery via Programming with Large
+  Language Models 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2404.18400v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2404.18400v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Parshin Shojaee, Kazem Meidani, Shashank Gupta, Amir Barati Farimani, Chandan K Reddy
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Mathematical equations have been unreasonably effective in describing complex
+natural phenomena across various scientific disciplines. However, discovering
+such insightful equations from data presents significant challenges due to the
+necessity of navigating extremely high-dimensional combinatorial and nonlinear
+hypothesis spaces. Traditional methods of equation discovery, commonly known as
+symbolic regression, largely focus on extracting equations from data alone,
+often neglecting the rich domain-specific prior knowledge that scientists
+typically depend on. To bridge this gap, we introduce LLM-SR, a novel approach
+that leverages the extensive scientific knowledge and robust code generation
+capabilities of Large Language Models (LLMs) to discover scientific equations
+from data in an efficient manner. Specifically, LLM-SR treats equations as
+programs with mathematical operators and combines LLMs' scientific priors with
+evolutionary search over equation programs. The LLM iteratively proposes new
+equation skeleton hypotheses, drawing from its physical understanding, which
+are then optimized against data to estimate skeleton parameters. We demonstrate
+LLM-SR's effectiveness across three diverse scientific domains, where it
+discovers physically accurate equations that provide significantly better fits
+to in-domain and out-of-domain data compared to the well-established symbolic
+regression baselines. Incorporating scientific prior knowledge also enables
+LLM-SR to search the equation space more efficiently than baselines. Code is
+available at: https://github.com/deep-symbolic-mathematics/LLM-SR
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ The Expressive Capacity of State Space Models: A Formal Language
+  Perspective 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.17394v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.17394v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Yash Sarrof, Yana Veitsman, Michael Hahn
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Recently, recurrent models based on linear state space models (SSMs) have
+shown promising performance in language modeling (LM), competititve with
+transformers. However, there is little understanding of the in-principle
+abilities of such models, which could provide useful guidance to the search for
+better LM architectures. We present a comprehensive theoretical study of the
+capacity of such SSMs as it compares to that of transformers and traditional
+RNNs. We find that SSMs and transformers have overlapping but distinct
+strengths. In star-free state tracking, SSMs implement straightforward and
+exact solutions to problems that transformers struggle to represent exactly.
+They can also model bounded hierarchical structure with optimal memory even
+without simulating a stack. On the other hand, we identify a design choice in
+current SSMs that limits their expressive power. We discuss implications for
+SSM and LM research, and verify results empirically on a recent SSM, Mamba.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ A Monte Carlo Language Model Pipeline for Zero-Shot Sociopolitical Event
+  Extraction <span class="chip">NeurIPS 2023</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2305.15051v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2305.15051v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Erica Cai, Brendan O'Connor
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Current social science efforts automatically populate event databases of "who
+did what to whom?" tuples, by applying event extraction (EE) to text such as
+news. The event databases are used to analyze sociopolitical dynamics between
+actor pairs (dyads) in, e.g., international relations. While most EE methods
+heavily rely on rules or supervised learning, \emph{zero-shot} event extraction
+could potentially allow researchers to flexibly specify arbitrary event classes
+for new research questions. Unfortunately, we find that current zero-shot EE
+methods, as well as a naive zero-shot approach of simple generative language
+model (LM) prompting, perform poorly for dyadic event extraction; most suffer
+from word sense ambiguity, modality sensitivity, and computational
+inefficiency. We address these challenges with a new fine-grained, multi-stage
+instruction-following generative LM pipeline, proposing a Monte Carlo approach
+to deal with, and even take advantage of, nondeterminism of generative outputs.
+Our pipeline includes explicit stages of linguistic analysis (synonym
+generation, contextual disambiguation, argument realization, event modality),
+\textit{improving control and interpretability} compared to purely neural
+methods. This method outperforms other zero-shot EE approaches, and outperforms
+naive applications of generative LMs by at least 17 F1 percent points. The
+pipeline's filtering mechanism greatly improves computational efficiency,
+allowing it to perform as few as 12% of queries that a previous zero-shot
+method uses. Finally, we demonstrate our pipeline's application to dyadic
+international relations analysis.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted at NeurIPS 2023 Workshop on Instruction Tuning and
+  Instruction Following; oral presentation at New England Natural Language
+  Processing, 2023; 17 pages of text including references and appendix</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Multimodal Multi-loss Fusion Network for Sentiment Analysis 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2308.00264v4">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2308.00264v4.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Zehui Wu, Ziwei Gong, Jaywon Koo, Julia Hirschberg
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  This paper investigates the optimal selection and fusion of feature encoders
+across multiple modalities and combines these in one neural network to improve
+sentiment detection. We compare different fusion methods and examine the impact
+of multi-loss training within the multi-modality fusion network, identifying
+surprisingly important findings relating to subnet performance. We have also
+found that integrating context significantly enhances model performance. Our
+best model achieves state-of-the-art performance for three datasets (CMU-MOSI,
+CMU-MOSEI and CH-SIMS). These results suggest a roadmap toward an optimized
+feature selection and fusion approach for enhancing sentiment detection in
+neural networks.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>First two authors contributed equally to the paper</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Skin-in-the-Game: Decision Making via Multi-Stakeholder Alignment in
+  LLMs <span class="chip">ACL 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.12933v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.12933v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Bilgehan Sel, Priya Shanmugasundaram, Mohammad Kachuee, Kun Zhou, Ruoxi Jia, Ming Jin
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Large Language Models (LLMs) have shown remarkable capabilities in tasks such
+as summarization, arithmetic reasoning, and question answering. However, they
+encounter significant challenges in the domain of moral reasoning and ethical
+decision-making, especially in complex scenarios with multiple stakeholders.
+This paper introduces the Skin-in-the-Game (SKIG) framework, aimed at enhancing
+moral reasoning in LLMs by exploring decisions' consequences from multiple
+stakeholder perspectives. Central to SKIG's mechanism is simulating
+accountability for actions, which, alongside empathy exercises and risk
+assessment, is pivotal to its effectiveness. We validate SKIG's performance
+across various moral reasoning benchmarks with proprietary and opensource LLMs,
+and investigate its crucial components through extensive ablation analyses.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>ACL 2024, long paper</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ MIDGARD: Self-Consistency Using Minimum Description Length for
+  Structured Commonsense Reasoning <span class="chip">ACL 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.05189v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.05189v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Inderjeet Nair, Lu Wang
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  We study the task of conducting structured reasoning as generating a
+reasoning graph from natural language input using large language models (LLMs).
+Previous approaches have explored various prompting schemes, yet they suffer
+from error propagation due to the autoregressive nature and single-pass-based
+decoding, which lack error correction capability. Additionally, relying solely
+on a single sample may result in the omission of true nodes and edges. To
+counter this, we draw inspiration from self-consistency (SC), which involves
+sampling a diverse set of reasoning chains and taking the majority vote as the
+final answer. To tackle the substantial challenge of applying SC on generated
+graphs, we propose MIDGARD (MInimum Description length Guided Aggregation of
+Reasoning in Directed acyclic graph) that leverages Minimum Description Length
+(MDL)-based formulation to identify consistent properties among the different
+graph samples generated by an LLM. This formulation helps reject properties
+that appear in only a few samples, which are likely to be erroneous, while
+enabling the inclusion of missing elements without compromising precision. Our
+method demonstrates superior performance than comparisons across various
+structured reasoning tasks, including argument structure extraction,
+explanation graph generation, inferring dependency relations among actions for
+everyday tasks, and semantic graph generation from natural texts.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted at ACL 2024(main)</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ AFaCTA: Assisting the Annotation of Factual Claim Detection with
+  Reliable LLM Annotators <span class="chip">ACL2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2402.11073v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2402.11073v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Jingwei Ni, Minjing Shi, Dominik Stammbach, Mrinmaya Sachan, Elliott Ash, Markus Leippold
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  With the rise of generative AI, automated fact-checking methods to combat
+misinformation are becoming more and more important. However, factual claim
+detection, the first step in a fact-checking pipeline, suffers from two key
+issues that limit its scalability and generalizability: (1) inconsistency in
+definitions of the task and what a claim is, and (2) the high cost of manual
+annotation. To address (1), we review the definitions in related work and
+propose a unifying definition of factual claims that focuses on verifiability.
+To address (2), we introduce AFaCTA (Automatic Factual Claim deTection
+Annotator), a novel framework that assists in the annotation of factual claims
+with the help of large language models (LLMs). AFaCTA calibrates its annotation
+confidence with consistency along three predefined reasoning paths. Extensive
+evaluation and experiments in the domain of political speech reveal that AFaCTA
+can efficiently assist experts in annotating factual claims and training
+high-quality classifiers, and can work with or without expert supervision. Our
+analyses also result in PoliClaim, a comprehensive claim detection dataset
+spanning diverse political topics.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>ACL2024 Main Conference</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ XSTEM: An exemplar-based stemming algorithm 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2205.04355v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2205.04355v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Kirk Baker
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Stemming is the process of reducing related words to a standard form by
+removing affixes from them. Existing algorithms vary with respect to their
+complexity, configurability, handling of unknown words, and ability to avoid
+under- and over-stemming. This paper presents a fast, simple, configurable,
+high-precision, high-recall stemming algorithm that combines the simplicity and
+performance of word-based lookup tables with the strong generalizability of
+rule-based methods to avert problems with out-of-vocabulary words.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Let's Fuse Step by Step: A Generative Fusion Decoding Algorithm with
+  LLMs for Multi-modal Text Recognition 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.14259v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.14259v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Chan-Jan Hsu, Yi-Chang Chen, Feng-Ting Liao, Pei-Chen Ho, Yu-Hsiang Wang, Po-Chun Hsu, Da-shan Shiu
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  We introduce "Generative Fusion Decoding" (GFD), a novel shallow fusion
+framework, utilized to integrate Large Language Models (LLMs) into multi-modal
+text recognition systems such as automatic speech recognition (ASR) and optical
+character recognition (OCR). We derive the formulas necessary to enable GFD to
+operate across mismatched token spaces of different models by mapping text
+token space to byte token space, enabling seamless fusion during the decoding
+process. The framework is plug-and-play, compatible with various
+auto-regressive models, and does not require re-training for feature alignment,
+thus overcoming limitations of previous fusion techniques. We highlight three
+main advantages of GFD: First, by simplifying the complexity of aligning
+different model sample spaces, GFD allows LLMs to correct errors in tandem with
+the recognition model, reducing computation latencies. Second, the in-context
+learning ability of LLMs is fully capitalized by GFD, increasing robustness in
+long-form speech recognition and instruction aware speech recognition. Third,
+GFD enables fusing recognition models deficient in Chinese text recognition
+with LLMs extensively trained on Chinese. Our evaluation demonstrates that GFD
+significantly improves performance in ASR and OCR tasks, with ASR reaching
+state-of-the-art in the NTUML2021 benchmark. GFD provides a significant step
+forward in model integration, offering a unified solution that could be widely
+applicable to leveraging existing pre-trained models through step by step
+fusion.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Token-level Direct Preference Optimization 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2404.11999v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2404.11999v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Yongcheng Zeng, Guoqing Liu, Weiyu Ma, Ning Yang, Haifeng Zhang, Jun Wang
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Fine-tuning pre-trained Large Language Models (LLMs) is essential to align
+them with human values and intentions. This process often utilizes methods like
+pairwise comparisons and KL divergence against a reference LLM, focusing on the
+evaluation of full answers generated by the models. However, the generation of
+these responses occurs in a token level, following a sequential,
+auto-regressive fashion. In this paper, we introduce Token-level Direct
+Preference Optimization (TDPO), a novel approach to align LLMs with human
+preferences by optimizing policy at the token level. Unlike previous methods,
+which face challenges in divergence efficiency, TDPO incorporates forward KL
+divergence constraints for each token, improving alignment and diversity.
+Utilizing the Bradley-Terry model for a token-based reward system, TDPO
+enhances the regulation of KL divergence, while preserving simplicity without
+the need for explicit reward modeling. Experimental results across various text
+tasks demonstrate TDPO's superior performance in balancing alignment with
+generation diversity. Notably, fine-tuning with TDPO strikes a better balance
+than DPO in the controlled sentiment generation and single-turn dialogue
+datasets, and significantly improves the quality of generated responses
+compared to both DPO and PPO-based RLHF methods. Our code is open-sourced at
+https://github.com/Vance0124/Token-level-Direct-Preference-Optimization.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Algorithm of Thoughts: Enhancing Exploration of Ideas in Large Language
+  Models <span class="chip">ICML 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2308.10379v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2308.10379v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Bilgehan Sel, Ahmad Al-Tawaha, Vanshaj Khattar, Ruoxi Jia, Ming Jin
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Current literature, aiming to surpass the "Chain-of-Thought" approach, often
+resorts to external modi operandi involving halting, modifying, and then
+resuming the generation process to boost Large Language Models' (LLMs)
+reasoning capacities. Due to their myopic perspective, they escalate the number
+of query requests, leading to increased costs, memory, and computational
+overheads. Addressing this, we propose the Algorithm of Thoughts -- a novel
+strategy that propels LLMs through algorithmic reasoning pathways. By employing
+algorithmic examples fully in-context, this overarching view of the whole
+process exploits the innate recurrence dynamics of LLMs, expanding their idea
+exploration with merely one or a few queries. Our technique outperforms earlier
+single-query methods and even more recent multi-query strategies that employ an
+extensive tree search algorithms while using significantly fewer tokens.
+Intriguingly, our results suggest that instructing an LLM using an algorithm
+can lead to performance surpassing that of the algorithm itself, hinting at
+LLM's inherent ability to weave its intuition into optimized searches. We probe
+into the underpinnings of our method's efficacy and its nuances in application.
+The code and related content can be found in:
+https://algorithm-of-thoughts.github.io.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>ICML 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Alirector: Alignment-Enhanced Chinese Grammatical Error Corrector <span class="chip">ACL 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2402.04601v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2402.04601v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Haihui Yang, Xiaojun Quan
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Chinese grammatical error correction (CGEC) faces serious overcorrection
+challenges when employing autoregressive generative models such as
+sequence-to-sequence (Seq2Seq) models and decoder-only large language models
+(LLMs). While previous methods aim to address overcorrection in Seq2Seq models,
+they are difficult to adapt to decoder-only LLMs. In this paper, we propose an
+alignment-enhanced corrector for the overcorrection problem that applies to
+both Seq2Seq models and decoder-only LLMs. Our method first trains a correction
+model to generate an initial correction of the source sentence. Then, we
+combine the source sentence with the initial correction and feed it through an
+alignment model for another round of correction, aiming to enforce the
+alignment model to focus on potential overcorrection. Moreover, to enhance the
+model's ability to identify nuances, we further explore the reverse alignment
+of the source sentence and the initial correction. Finally, we transfer the
+alignment knowledge from two alignment models to the correction model,
+instructing it on how to avoid overcorrection. Experimental results on three
+CGEC datasets demonstrate the effectiveness of our approach in alleviating
+overcorrection and improving overall performance. Our code has been made
+publicly available.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted to Findings of ACL 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Multimodal ArXiv: A <span class="highlight-title">Dataset</span> for Improving Scientific Comprehension of
+  Large Vision-Language Models <span class="chip">ACL
+  2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2403.00231v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2403.00231v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Lei Li, Yuqi Wang, Runxin Xu, Peiyi Wang, Xiachong Feng, Lingpeng Kong, Qi Liu
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Large vision-language models (LVLMs) excel across diverse tasks involving
+concrete images from natural scenes. However, their ability to interpret
+abstract figures, such as geometry shapes and scientific plots, remains limited
+due to a scarcity of training datasets in scientific domains. To fill this gap,
+we introduce Multimodal ArXiv, consisting of ArXivCap and ArXivQA, for
+enhancing LVLMs scientific comprehension. ArXivCap is a figure-caption dataset
+comprising 6.4M images and 3.9M captions, sourced from 572K ArXiv papers
+spanning various scientific domains. Drawing from ArXivCap, we introduce
+ArXivQA, a question-answering dataset generated by prompting GPT-4V based on
+scientific figures. ArXivQA greatly enhances open-sourced LVLMs' mathematical
+reasoning capabilities, achieving a 10.4\% absolute accuracy gain on a
+multimodal mathematical reasoning benchmark. Furthermore, employing ArXivCap,
+we devise four vision-to-text tasks for benchmarking LVLMs. Evaluation results
+with state-of-the-art LVLMs underscore their struggle with the nuanced
+semantics of academic figures, while domain-specific training yields
+substantial performance gains. Our error analysis uncovers misinterpretations
+of visual context, recognition errors, and the production of overly simplified
+captions by current LVLMs, shedding light on future improvements.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Project page: https://mm-arxiv.github.io, Camera Ready Version of ACL
+  2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Cephalo: Multi-Modal Vision-Language Models for Bio-Inspired Materials
+  Analysis and Design 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.19076v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.19076v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Markus J. Buehler
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  We present Cephalo, a series of multimodal vision large language models
+(V-LLMs) designed for materials science applications, integrating visual and
+linguistic data for enhanced understanding and interaction within human-AI and
+multi-agent AI frameworks. A key innovation of Cephalo is its advanced dataset
+generation method, which employs a sophisticated algorithm to accurately detect
+and separate images and their corresponding textual descriptions from PDF
+documents, such as scientific papers. The method includes a careful refinement
+of image-text pairs through integrated vision and language processing, ensuring
+high-quality, contextually relevant, and well reasoned training data. Cephalo
+is trained on integrated image and text data extracted from thousands of
+scientific papers and science-focused Wikipedia pages demonstrates can
+interpret complex visual scenes, generate precise language descriptions, and
+answer queries about images effectively. The combination of a vision encoder
+with an autoregressive transformer supports complex natural language
+understanding in an integrated model, which can be coupled with other
+generative methods to create an image-to-text-to-image or image-to-text-to-3D
+pipeline. To explore the development of larger models from smaller ones, we
+report both mixture-of-expert methods and model merging. These hybrid
+approaches allow us to leverage the domain-specific expertise and general
+conversational capabilities to harness the strengths of multiple models. We
+examine the models in diverse use cases that incorporate biological materials,
+fracture and engineering analysis, protein biophysics, and bio-inspired design
+based on insect behavior. Generative applications include bio-inspired designs,
+including pollen-inspired architected materials, as well as the synthesis of
+bio-inspired material microstructures from a photograph of a solar eclipse.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Hardware-Aware Parallel <span class="highlight-title">Prompt</span> Decoding for Memory-Efficient
+  Acceleration of LLM Inference 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.18628v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.18628v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Hao Mark Chen, Wayne Luk, Ka Fai Cedric Yiu, Rui Li, Konstantin Mishchenko, Stylianos I. Venieris, Hongxiang Fan
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  The auto-regressive decoding of Large Language Models (LLMs) results in
+significant overheads in their hardware performance. While recent research has
+investigated various speculative decoding techniques for multi-token
+generation, these efforts have primarily focused on improving processing speed
+such as throughput. Crucially, they often neglect other metrics essential for
+real-life deployments, such as memory consumption and training cost. To
+overcome these limitations, we propose a novel parallel prompt decoding that
+requires only $0.0002$% trainable parameters, enabling efficient training on a
+single A100-40GB GPU in just 16 hours. Inspired by the human natural language
+generation process, $PPD$ approximates outputs generated at future timesteps in
+parallel by using multiple prompt tokens. This approach partially recovers the
+missing conditional dependency information necessary for multi-token
+generation, resulting in up to a 28% higher acceptance rate for long-range
+predictions. Furthermore, we present a hardware-aware dynamic sparse tree
+technique that adaptively optimizes this decoding scheme to fully leverage the
+computational capacities on different GPUs. Through extensive experiments
+across LLMs ranging from MobileLlama to Vicuna-13B on a wide range of
+benchmarks, our approach demonstrates up to 2.49$\times$ speedup and maintains
+a minimal runtime memory overhead of just $0.0004$%. More importantly, our
+parallel prompt decoding can serve as an orthogonal optimization for
+synergistic integration with existing speculative decoding, showing up to
+$1.22\times$ further speed improvement. Our code is available at
+https://github.com/hmarkc/parallel-prompt-decoding.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>The code for this implementation is available at
+  https://github.com/hmarkc/parallel-prompt-decoding</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ When Only Time Will Tell: Interpreting How <span class="highlight-title">Transformer</span>s Process Local
+  Ambiguities Through the Lens of Restart-Incrementality <span class="chip">ACL 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2402.13113v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2402.13113v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Brielen Madureira, Patrick Kahardipraja, David Schlangen
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Incremental models that process sentences one token at a time will sometimes
+encounter points where more than one interpretation is possible. Causal models
+are forced to output one interpretation and continue, whereas models that can
+revise may edit their previous output as the ambiguity is resolved. In this
+work, we look at how restart-incremental Transformers build and update internal
+states, in an effort to shed light on what processes cause revisions not viable
+in autoregressive models. We propose an interpretable way to analyse the
+incremental states, showing that their sequential structure encodes information
+on the garden path effect and its resolution. Our method brings insights on
+various bidirectional encoders for contextualised meaning representation and
+dependency parsing, contributing to show their advantage over causal models
+when it comes to revisions.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted to ACL 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Network Formation and Dynamics Among Multi-LLMs 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2402.10659v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2402.10659v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Marios Papachristou, Yuan Yuan
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Social networks shape opinions, behaviors, and information dissemination in
+human societies. As large language models (LLMs) increasingly integrate into
+social and professional environments, understanding their behavior within the
+context of social interactions and networks becomes essential. Our study
+analyzes LLMs' network formation behavior to examine whether the dynamics of
+multiple LLMs are similar to or different from human social dynamics. We
+observe that LLMs exhibit key social network principles, including preferential
+attachment, triadic closure, homophily, community structure, and the
+small-world phenomenon, when asked about their preferences in network
+formation. We also investigate LLMs' decision-making based on real-world
+networks, revealing that triadic closure and homophily have a stronger
+influence than preferential attachment and that LLMs perform well in network
+formation predictions. Overall, our study opens up new possibilities for using
+LLMs in network science research and helps develop socially aware LLMs by
+shedding light on their social interaction behaviors and exploring their
+impacts on social dynamics.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ FLuRKA: Fast and accurate unified Low-Rank & Kernel Attention 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2306.15799v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2306.15799v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Ahan Gupta, Hao Guo, Yueming Yuan, Yanqi Zhou, Charith Mendis
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Many efficient $\textit{approximate}$ self-attention techniques have become
+prevalent since the inception of the transformer architecture. Two popular
+classes of these techniques are low-rank and kernel methods. Each of these
+methods has its strengths. We observe these strengths synergistically
+complement each other and exploit them to fuse low-rank and kernel methods,
+producing a new class of transformers: FLuRKA ($\textbf{F}$ast
+$\textbf{L}$ow-$\textbf{R}$ank & $\textbf{K}$ernel$ \textbf{A}$ttention).
+FLuRKA are highly $\textit{training-efficient}$ with faster model speeds
+$\textit{and}$ similar model qualities compared to constituent low-rank and
+kernel methods. We theoretically and empirically evaluate the speed and quality
+of FLuRKA. Our model speed analysis posits a variety of parameter
+configurations where FLuRKA exhibit speedups over low-rank and kernel
+approximations and our model quality analysis bounds the error of FLuRKA with
+respect to full-attention. Empirically, we instantiate three FLuRKA variants
+which experience speedups of up to 3.3x and 1.7x over low-rank and kernel
+methods respectively. This translates to speedups of up to 20x over models with
+flash-attention. Across a diverse set of tasks spanning language modeling,
+language understanding, long sequence modeling, machine translation, and image
+classification, FLuRKA achieve comparable accuracy with underlying low-rank and
+kernel approximations, occasionally surpassing both.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>21 pages, 5 figures</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ NewsBench: A Systematic Evaluation Framework for Assessing Editorial
+  Capabilities of Large Language Models in Chinese Journalism <span class="chip">ACL 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2403.00862v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2403.00862v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Miao Li, Ming-Bin Chen, Bo Tang, Shengbin Hou, Pengyu Wang, Haiying Deng, Zhiyu Li, Feiyu Xiong, Keming Mao, Peng Cheng, Yi Luo
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  We present NewsBench, a novel evaluation framework to systematically assess
+the capabilities of Large Language Models (LLMs) for editorial capabilities in
+Chinese journalism. Our constructed benchmark dataset is focused on four facets
+of writing proficiency and six facets of safety adherence, and it comprises
+manually and carefully designed 1,267 test samples in the types of multiple
+choice questions and short answer questions for five editorial tasks in 24 news
+domains. To measure performances, we propose different GPT-4 based automatic
+evaluation protocols to assess LLM generations for short answer questions in
+terms of writing proficiency and safety adherence, and both are validated by
+the high correlations with human evaluations. Based on the systematic
+evaluation framework, we conduct a comprehensive analysis of ten popular LLMs
+which can handle Chinese. The experimental results highlight GPT-4 and ERNIE
+Bot as top performers, yet reveal a relative deficiency in journalistic safety
+adherence in creative writing tasks. Our findings also underscore the need for
+enhanced ethical guidance in machine-generated journalistic content, marking a
+step forward in aligning LLMs with journalistic standards and safety
+considerations.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Long paper, ACL 2024 Main</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ CoCo-Agent: A Comprehensive Cognitive MLLM Agent for Smartphone GUI
+  Automation <span class="chip">ACL'2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2402.11941v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2402.11941v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Xinbei Ma, Zhuosheng Zhang, Hai Zhao
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Multimodal large language models (MLLMs) have shown remarkable potential as
+human-like autonomous language agents to interact with real-world environments,
+especially for graphical user interface (GUI) automation. However, those GUI
+agents require comprehensive cognition ability including exhaustive perception
+and reliable action response. We propose a Comprehensive Cognitive LLM Agent,
+CoCo-Agent, with two novel approaches, comprehensive environment perception
+(CEP) and conditional action prediction (CAP), to systematically improve the
+GUI automation performance. First, CEP facilitates the GUI perception through
+different aspects and granularity, including screenshots and complementary
+detailed layouts for the visual channel and historical actions for the textual
+channel. Second, CAP decomposes the action prediction into sub-problems: action
+type prediction and action target conditioned on the action type. With our
+technical design, our agent achieves new state-of-the-art performance on AITW
+and META-GUI benchmarks, showing promising abilities in realistic scenarios.
+Code is available at https://github.com/xbmxb/CoCo-Agent.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>ACL'2024 Findings</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ An Iterative Associative Memory Model for Empathetic Response Generation 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2402.17959v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2402.17959v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Zhou Yang, Zhaochun Ren, Yufeng Wang, Chao Chen, Haizhou Sun, Xiaofei Zhu, Xiangwen Liao
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Empathetic response generation aims to comprehend the cognitive and emotional
+states in dialogue utterances and generate proper responses. Psychological
+theories posit that comprehending emotional and cognitive states necessitates
+iteratively capturing and understanding associated words across dialogue
+utterances. However, existing approaches regard dialogue utterances as either a
+long sequence or independent utterances for comprehension, which are prone to
+overlook the associated words between them. To address this issue, we propose
+an Iterative Associative Memory Model (IAMM) for empathetic response
+generation. Specifically, we employ a novel second-order interaction attention
+mechanism to iteratively capture vital associated words between dialogue
+utterances and situations, dialogue history, and a memory module (for storing
+associated words), thereby accurately and nuancedly comprehending the
+utterances. We conduct experiments on the Empathetic-Dialogue dataset. Both
+automatic and human evaluations validate the efficacy of the model. Variant
+experiments on LLMs also demonstrate that attending to associated words
+improves empathetic comprehension and expression.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>12 pages, 4 figures</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Acquiring Clean Language Models from Backdoor Poisoned <span class="highlight-title">Dataset</span>s by
+  Downscaling Frequency Space <span class="chip">ACL 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2402.12026v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2402.12026v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Zongru Wu, Zhuosheng Zhang, Pengzhou Cheng, Gongshen Liu
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Despite the notable success of language models (LMs) in various natural
+language processing (NLP) tasks, the reliability of LMs is susceptible to
+backdoor attacks. Prior research attempts to mitigate backdoor learning while
+training the LMs on the poisoned dataset, yet struggles against complex
+backdoor attacks in real-world scenarios. In this paper, we investigate the
+learning mechanisms of backdoor LMs in the frequency space by Fourier analysis.
+Our findings indicate that the backdoor mapping presented on the poisoned
+datasets exhibits a more discernible inclination towards lower frequency
+compared to clean mapping, resulting in the faster convergence of backdoor
+mapping. To alleviate this dilemma, we propose Multi-Scale Low-Rank Adaptation
+(MuScleLoRA), which deploys multiple radial scalings in the frequency space
+with low-rank adaptation to the target model and further aligns the gradients
+when updating parameters. Through downscaling in the frequency space,
+MuScleLoRA encourages the model to prioritize the learning of relatively
+high-frequency clean mapping, consequently mitigating backdoor learning.
+Experimental results demonstrate that MuScleLoRA outperforms baselines
+significantly. Notably, MuScleLoRA reduces the average success rate of diverse
+backdoor attacks to below 15\% across multiple datasets and generalizes to
+various backbone LMs, including BERT, RoBERTa, GPT2-XL, and Llama2. The codes
+are publicly available at https://github.com/ZrW00/MuScleLoRA.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted at ACL 2024 (Long Paper. Main Conference)</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Discrete <span class="highlight-title">Prompt</span> Compression with Reinforcement Learning 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2308.08758v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2308.08758v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Hoyoun Jung, Kyung-Joong Kim
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Compressed prompts aid instruction-tuned language models (LMs) in overcoming
+context window limitations and reducing computational costs. Existing methods,
+which primarily based on training embeddings, face various challenges
+associated with interpretability, the fixed number of embedding tokens,
+reusability across different LMs, and inapplicability when interacting with
+black-box APIs. This study proposes prompt compression with reinforcement
+learning (PCRL), which is a discrete prompt compression method that addresses
+these issues. The proposed PCRL method utilizes a computationally efficient
+policy network that edits prompts directly. The training approach employed in
+the proposed PCRLs can be applied flexibly to various types of LMs, including
+both decoder-only and encoder-decoder architecture and it can be trained
+without gradient access to the LMs or labeled data. The proposed PCRL achieves
+an average reduction of 24.6% in terms of the token count across various
+instruction prompts while maintaining sufficient performance. In addition, we
+demonstrate that the learned policy can be transferred to larger LMs, and
+through a comprehensive analysis, we explore the token importance within the
+prompts. Our code is accessible at
+https://github.com/nenomigami/PromptCompressor.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Towards Building Multilingual Language Model for Medicine 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2402.13963v4">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2402.13963v4.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Pengcheng Qiu, Chaoyi Wu, Xiaoman Zhang, Weixiong Lin, Haicheng Wang, Ya Zhang, Yanfeng Wang, Weidi Xie
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  The development of open-source, multilingual medical language models can
+benefit a wide, linguistically diverse audience from different regions. To
+promote this domain, we present contributions from the following: First, we
+construct a multilingual medical corpus, containing approximately 25.5B tokens
+encompassing 6 main languages, termed as MMedC, enabling auto-regressive domain
+adaptation for general LLMs; Second, to monitor the development of multilingual
+medical LLMs, we propose a multilingual medical multi-choice question-answering
+benchmark with rationale, termed as MMedBench; Third, we have assessed a number
+of open-source large language models (LLMs) on our benchmark, along with those
+further auto-regressive trained on MMedC. Our final model, MMed-Llama 3, with
+only 8B parameters, achieves superior performance compared to all other
+open-source models on both MMedBench and English benchmarks, even rivaling
+GPT-4. In conclusion, in this work, we present a large-scale corpus, a
+benchmark and a series of models to support the development of multilingual
+medical LLMs.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Investigating Multi-Hop Factual Shortcuts in Knowledge Editing of Large
+  Language Models <span class="chip">ACL 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2402.11900v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2402.11900v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Tianjie Ju, Yijin Chen, Xinwei Yuan, Zhuosheng Zhang, Wei Du, Yubin Zheng, Gongshen Liu
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Recent work has showcased the powerful capability of large language models
+(LLMs) in recalling knowledge and reasoning. However, the reliability of LLMs
+in combining these two capabilities into reasoning through multi-hop facts has
+not been widely explored. This paper systematically investigates the
+possibilities for LLMs to utilize shortcuts based on direct connections between
+the initial and terminal entities of multi-hop knowledge. We first explore the
+existence of factual shortcuts through Knowledge Neurons, revealing that: (i)
+the strength of factual shortcuts is highly correlated with the frequency of
+co-occurrence of initial and terminal entities in the pre-training corpora;
+(ii) few-shot prompting leverage more shortcuts in answering multi-hop
+questions compared to chain-of-thought prompting. Then, we analyze the risks
+posed by factual shortcuts from the perspective of multi-hop knowledge editing.
+Analysis shows that approximately 20% of the failures are attributed to
+shortcuts, and the initial and terminal entities in these failure instances
+usually have higher co-occurrences in the pre-training corpus. Finally, we
+propose erasing shortcut neurons to mitigate the associated risks and find that
+this approach significantly reduces failures in multiple-hop knowledge editing
+caused by shortcuts.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted at ACL 2024 (Long Paper. Main Conference)</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Advancing Parameter Efficiency in Fine-tuning via Representation Editing 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2402.15179v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2402.15179v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Muling Wu, Wenhao Liu, Xiaohua Wang, Tianlong Li, Changze Lv, Zixuan Ling, Jianhao Zhu, Cenyuan Zhang, Xiaoqing Zheng, Xuanjing Huang
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Parameter Efficient Fine-Tuning (PEFT) techniques have drawn significant
+attention due to their ability to yield competitive results while updating only
+a small portion of the adjustable parameters. However, existing PEFT methods
+pose challenges in hyperparameter selection, such as choosing the rank for LoRA
+or Adapter, or specifying the length of soft prompts. To address these
+challenges, we propose a novel fine-tuning approach for neural models, named
+Representation EDiting (RED), which modifies the representations generated at
+some layers through the application of scaling and biasing operations. While
+existing PEFT methods still demonstrate over-parameterization that could
+potentially undermine the generalization ability acquired from pre-training,
+RED can substantially reduce the number of trainable parameters by a factor of
+25, 700 compared to full parameter fine-tuning and by a factor of 32 relative
+to LoRA. Remarkably, RED achieves results comparable or superior to both full
+parameter fine-tuning and other PEFT methods. Extensive experiments across
+various model architectures and scales, including RoBERTa, GPT-2, T5, and
+LLaMA-2, have demonstrated the effectiveness and efficiency of RED1, thereby
+positioning it as a promising PEFT strategy for large-scale neural models.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ InversionView: A General-Purpose Method for Reading Information from
+  Neural Activations 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.17653v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.17653v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Xinting Huang, Madhur Panwar, Navin Goyal, Michael Hahn
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  The inner workings of neural networks can be better understood if we can
+fully decipher the information encoded in neural activations. In this paper, we
+argue that this information is embodied by the subset of inputs that give rise
+to similar activations. Computing such subsets is nontrivial as the input space
+is exponentially large. We propose InversionView, which allows us to
+practically inspect this subset by sampling from a trained decoder model
+conditioned on activations. This helps uncover the information content of
+activation vectors, and facilitates understanding of the algorithms implemented
+by transformer models. We present three case studies where we investigate
+models ranging from small transformers to GPT-2. In these studies, we
+demonstrate the characteristics of our method, show the distinctive advantages
+it offers, and provide causally verified circuits.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Timeline-based Sentence Decomposition with In-Context Learning for
+  Temporal Fact Extraction <span class="chip">ACL2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.10288v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.10288v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Jianhao Chen, Haoyuan Ouyang, Junyang Ren, Wentao Ding, Wei Hu, Yuzhong Qu
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Facts extraction is pivotal for constructing knowledge graphs. Recently, the
+increasing demand for temporal facts in downstream tasks has led to the
+emergence of the task of temporal fact extraction. In this paper, we
+specifically address the extraction of temporal facts from natural language
+text. Previous studies fail to handle the challenge of establishing
+time-to-fact correspondences in complex sentences. To overcome this hurdle, we
+propose a timeline-based sentence decomposition strategy using large language
+models (LLMs) with in-context learning, ensuring a fine-grained understanding
+of the timeline associated with various facts. In addition, we evaluate the
+performance of LLMs for direct temporal fact extraction and get unsatisfactory
+results. To this end, we introduce TSDRE, a method that incorporates the
+decomposition capabilities of LLMs into the traditional fine-tuning of smaller
+pre-trained language models (PLMs). To support the evaluation, we construct
+ComplexTRED, a complex temporal fact extraction dataset. Our experiments show
+that TSDRE achieves state-of-the-art results on both HyperRED-Temporal and
+ComplexTRED datasets.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted to ACL2024 main conference</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ LLaMP: Large Language Model Made Powerful for High-fidelity Materials
+  Knowledge Retrieval and Distillation 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2401.17244v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2401.17244v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Yuan Chiang, Elvis Hsieh, Chia-Hong Chou, Janosh Riebesell
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Reducing hallucination of Large Language Models (LLMs) is imperative for use
+in the sciences, where reliability and reproducibility are crucial. However,
+LLMs inherently lack long-term memory, making it a nontrivial, ad hoc, and
+inevitably biased task to fine-tune them on domain-specific literature and
+data. Here we introduce LLaMP, a multimodal retrieval-augmented generation
+(RAG) framework of hierarchical reasoning-and-acting (ReAct) agents that can
+dynamically and recursively interact with computational and experimental data
+on Materials Project (MP) and run atomistic simulations via high-throughput
+workflow interface. Without fine-tuning, LLaMP demonstrates strong tool usage
+ability to comprehend and integrate various modalities of materials science
+concepts, fetch relevant data stores on the fly, process higher-order data
+(such as crystal structure and elastic tensor), and streamline complex tasks in
+computational materials and chemistry. We propose a simple metric combining
+uncertainty and confidence estimates to evaluate the self-consistency of
+responses by LLaMP and vanilla LLMs. Our benchmark shows that LLaMP effectively
+mitigates the intrinsic bias in LLMs, counteracting the errors on bulk moduli,
+electronic bandgaps, and formation energies that seem to derive from mixed data
+sources. We also demonstrate LLaMP's capability to edit crystal structures and
+run annealing molecular dynamics simulations using pre-trained machine-learning
+force fields. The framework offers an intuitive and nearly hallucination-free
+approach to exploring and scaling materials informatics, and establishes a
+pathway for knowledge distillation and fine-tuning other language models. Code
+and live demo are available at https://github.com/chiang-yuan/llamp
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>31 pages, 5 figures</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ MAP-Neo: Highly Capable and Transparent Bilingual Large Language Model
+  Series 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.19327v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.19327v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Ge Zhang, Scott Qu, Jiaheng Liu, Chenchen Zhang, Chenghua Lin, Chou Leuang Yu, Danny Pan, Esther Cheng, Jie Liu, Qunshu Lin, Raven Yuan, Tuney Zheng, Wei Pang, Xinrun Du, Yiming Liang, Yinghao Ma, Yizhi Li, Ziyang Ma, Bill Lin, Emmanouil Benetos, Huan Yang, Junting Zhou, Kaijing Ma, Minghao Liu, Morry Niu, Noah Wang, Quehry Que, Ruibo Liu, Sine Liu, Shawn Guo, Soren Gao, Wangchunshu Zhou, Xinyue Zhang, Yizhi Zhou, Yubo Wang, Yuelin Bai, Yuhan Zhang, Yuxiang Zhang, Zenith Wang, Zhenzhu Yang, Zijian Zhao, Jiajun Zhang, Wanli Ouyang, Wenhao Huang, Wenhu Chen
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Large Language Models (LLMs) have made great strides in recent years to
+achieve unprecedented performance across different tasks. However, due to
+commercial interest, the most competitive models like GPT, Gemini, and Claude
+have been gated behind proprietary interfaces without disclosing the training
+details. Recently, many institutions have open-sourced several strong LLMs like
+LLaMA-3, comparable to existing closed-source LLMs. However, only the model's
+weights are provided with most details (e.g., intermediate checkpoints,
+pre-training corpus, and training code, etc.) being undisclosed. To improve the
+transparency of LLMs, the research community has formed to open-source truly
+open LLMs (e.g., Pythia, Amber, OLMo), where more details (e.g., pre-training
+corpus and training code) are being provided. These models have greatly
+advanced the scientific study of these large models including their strengths,
+weaknesses, biases and risks. However, we observe that the existing truly open
+LLMs on reasoning, knowledge, and coding tasks are still inferior to existing
+state-of-the-art LLMs with similar model sizes. To this end, we open-source
+MAP-Neo, a highly capable and transparent bilingual language model with 7B
+parameters trained from scratch on 4.5T high-quality tokens. Our MAP-Neo is the
+first fully open-sourced bilingual LLM with comparable performance compared to
+existing state-of-the-art LLMs. Moreover, we open-source all details to
+reproduce our MAP-Neo, where the cleaned pre-training corpus, data cleaning
+pipeline, checkpoints, and well-optimized training/evaluation framework are
+provided. Finally, we hope our MAP-Neo will enhance and strengthen the open
+research community and inspire more innovations and creativities to facilitate
+the further improvements of LLMs.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>https://map-neo.github.io/</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Increasing Trust in Language Models through the Reuse of Verified
+  Circuits 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2402.02619v5">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2402.02619v5.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Philip Quirke, Clement Neo, Fazl Barez
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Language Models (LMs) are increasingly used for a wide range of prediction
+tasks, but their training can often neglect rare edge cases, reducing their
+reliability. Here, we define a stringent standard of trustworthiness whereby
+the task algorithm and circuit implementation must be verified, accounting for
+edge cases, with no known failure modes. We show that a transformer model can
+be trained to meet this standard if built using mathematically and logically
+specified frameworks. In this paper, we fully verify a model for n-digit
+integer addition. To exhibit the reusability of verified modules, we insert the
+trained integer addition model into an untrained model and train the combined
+model to perform both addition and subtraction. We find extensive reuse of the
+addition circuits for both tasks, easing verification of the more complex
+subtractor model. We discuss how inserting verified task modules into LMs can
+leverage model reuse to improve verifiability and trustworthiness of language
+models built using them. The reuse of verified circuits reduces the effort to
+verify more complex composite models which we believe to be a significant step
+towards safety of language models.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>8 pages, 6 figures</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ WorDepth: Variational Language Prior for Monocular Depth Estimation 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2404.03635v4">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2404.03635v4.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Ziyao Zeng, Daniel Wang, Fengyu Yang, Hyoungseob Park, Yangchao Wu, Stefano Soatto, Byung-Woo Hong, Dong Lao, Alex Wong
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Three-dimensional (3D) reconstruction from a single image is an ill-posed
+problem with inherent ambiguities, i.e. scale. Predicting a 3D scene from text
+description(s) is similarly ill-posed, i.e. spatial arrangements of objects
+described. We investigate the question of whether two inherently ambiguous
+modalities can be used in conjunction to produce metric-scaled reconstructions.
+To test this, we focus on monocular depth estimation, the problem of predicting
+a dense depth map from a single image, but with an additional text caption
+describing the scene. To this end, we begin by encoding the text caption as a
+mean and standard deviation; using a variational framework, we learn the
+distribution of the plausible metric reconstructions of 3D scenes corresponding
+to the text captions as a prior. To "select" a specific reconstruction or depth
+map, we encode the given image through a conditional sampler that samples from
+the latent space of the variational text encoder, which is then decoded to the
+output depth map. Our approach is trained alternatingly between the text and
+image branches: in one optimization step, we predict the mean and standard
+deviation from the text description and sample from a standard Gaussian, and in
+the other, we sample using a (image) conditional sampler. Once trained, we
+directly predict depth from the encoded text using the conditional sampler. We
+demonstrate our approach on indoor (NYUv2) and outdoor (KITTI) scenarios, where
+we show that language can consistently improve performance in both.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ The Political Preferences of LLMs 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2402.01789v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2402.01789v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        David Rozado
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  I report here a comprehensive analysis about the political preferences
+embedded in Large Language Models (LLMs). Namely, I administer 11 political
+orientation tests, designed to identify the political preferences of the test
+taker, to 24 state-of-the-art conversational LLMs, both closed and open source.
+When probed with questions/statements with political connotations, most
+conversational LLMs tend to generate responses that are diagnosed by most
+political test instruments as manifesting preferences for left-of-center
+viewpoints. This does not appear to be the case for five additional base (i.e.
+foundation) models upon which LLMs optimized for conversation with humans are
+built. However, the weak performance of the base models at coherently answering
+the tests' questions makes this subset of results inconclusive. Finally, I
+demonstrate that LLMs can be steered towards specific locations in the
+political spectrum through Supervised Fine-Tuning (SFT) with only modest
+amounts of politically aligned data, suggesting SFT's potential to embed
+political orientation in LLMs. With LLMs beginning to partially displace
+traditional information sources like search engines and Wikipedia, the societal
+implications of political biases embedded in LLMs are substantial.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Machine Mindset: An MBTI Exploration of Large Language Models 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2312.12999v4">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2312.12999v4.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Jiaxi Cui, Liuzhenghao Lv, Jing Wen, Rongsheng Wang, Jing Tang, YongHong Tian, Li Yuan
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  We present a novel approach for integrating Myers-Briggs Type Indicator
+(MBTI) personality traits into large language models (LLMs), addressing the
+challenges of personality consistency in personalized AI. Our method, "Machine
+Mindset," involves a two-phase fine-tuning and Direct Preference Optimization
+(DPO) to embed MBTI traits into LLMs. This approach ensures that models
+internalize these traits, offering a stable and consistent personality profile.
+We demonstrate the effectiveness of our models across various domains, showing
+alignment between model performance and their respective MBTI traits. The paper
+highlights significant contributions in the development of personality datasets
+and a new training methodology for personality integration in LLMs, enhancing
+the potential for personalized AI applications. We also open-sourced our model
+and part of the data at \url{https://github.com/PKU-YuanGroup/Machine-Mindset}.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Meta-Reasoning: Semantics-Symbol Deconstruction for Large Language
+  Models <span class="chip">ACL 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2306.17820v4">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2306.17820v4.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Yiming Wang, Zhuosheng Zhang, Pei Zhang, Baosong Yang, Rui Wang
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Neural-symbolic methods have demonstrated efficiency in enhancing the
+reasoning abilities of large language models (LLMs). However, existing methods
+mainly rely on syntactically mapping natural languages to complete formal
+languages like Python and SQL. Those methods require that reasoning tasks be
+convertible into programs, which cater to the computer execution mindset and
+deviate from human reasoning habits. To broaden symbolic methods' applicability
+and adaptability in the real world, we propose the Meta-Reasoning from a
+linguistic perspective. This method empowers LLMs to deconstruct
+reasoning-independent semantic information into generic symbolic
+representations, thereby efficiently capturing more generalized reasoning
+knowledge. We conduct extensive experiments on more than ten datasets
+encompassing conventional reasoning tasks like arithmetic, symbolic, and
+logical reasoning, and the more complex interactive reasoning tasks like
+theory-of-mind reasoning. Experimental results demonstrate that Meta-Reasoning
+significantly enhances in-context reasoning accuracy, learning efficiency,
+out-of-domain generalization, and output stability compared to the
+Chain-of-Thought technique. Code and data are publicly available at
+\url{https://github.com/Alsace08/Meta-Reasoning}.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted by ACL 2024 Findings</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ PET-SQL: A <span class="highlight-title">Prompt</span>-Enhanced Two-Round Refinement of Text-to-SQL with
+  Cross-consistency 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2403.09732v4">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2403.09732v4.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Zhishuai Li, Xiang Wang, Jingjing Zhao, Sun Yang, Guoqing Du, Xiaoru Hu, Bin Zhang, Yuxiao Ye, Ziyue Li, Rui Zhao, Hangyu Mao
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Recent advancements in Text-to-SQL (Text2SQL) emphasize stimulating the large
+language models (LLM) on in-context learning, achieving significant results.
+Nevertheless, they face challenges when dealing with verbose database
+information and complex user intentions. This paper presents a two-stage
+framework to enhance the performance of current LLM-based natural language to
+SQL systems. We first introduce a novel prompt representation, called
+reference-enhanced representation, which includes schema information and
+randomly sampled cell values from tables to instruct LLMs in generating SQL
+queries. Then, in the first stage, question-SQL pairs are retrieved as few-shot
+demonstrations, prompting the LLM to generate a preliminary SQL (PreSQL). After
+that, the mentioned entities in PreSQL are parsed to conduct schema linking,
+which can significantly compact the useful information. In the second stage,
+with the linked schema, we simplify the prompt's schema information and
+instruct the LLM to produce the final SQL. Finally, as the post-refinement
+module, we propose using cross-consistency across different LLMs rather than
+self-consistency within a particular LLM. Our methods achieve new SOTA results
+on the Spider benchmark, with an execution accuracy of 87.6%.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ M$^3$AV: A Multimodal, Multigenre, and Multipurpose Audio-Visual
+  Academic Lecture <span class="highlight-title">Dataset</span> <span class="chip">ACL 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2403.14168v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2403.14168v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Zhe Chen, Heyang Liu, Wenyi Yu, Guangzhi Sun, Hongcheng Liu, Ji Wu, Chao Zhang, Yu Wang, Yanfeng Wang
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Publishing open-source academic video recordings is an emergent and prevalent
+approach to sharing knowledge online. Such videos carry rich multimodal
+information including speech, the facial and body movements of the speakers, as
+well as the texts and pictures in the slides and possibly even the papers.
+Although multiple academic video datasets have been constructed and released,
+few of them support both multimodal content recognition and understanding
+tasks, which is partially due to the lack of high-quality human annotations. In
+this paper, we propose a novel multimodal, multigenre, and multipurpose
+audio-visual academic lecture dataset (M$^3$AV), which has almost 367 hours of
+videos from five sources covering computer science, mathematics, and medical
+and biology topics. With high-quality human annotations of the slide text and
+spoken words, in particular high-valued name entities, the dataset can be used
+for multiple audio-visual recognition and understanding tasks. Evaluations
+performed on contextual speech recognition, speech synthesis, and slide and
+script generation tasks demonstrate that the diversity of M$^3$AV makes it a
+challenging dataset.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>ACL 2024 Main Conference. Project website:
+  https://jack-zc8.github.io/M3AV-dataset-page</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ MEMoE: Enhancing Model Editing with Mixture of Experts Adaptors 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.19086v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.19086v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Renzhi Wang, Piji Li
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Model editing aims to efficiently alter the behavior of Large Language Models
+(LLMs) within a desired scope, while ensuring no adverse impact on other
+inputs. Recent years have witnessed various model editing methods been
+proposed. However, these methods either exhibit poor overall performance or
+struggle to strike a balance between generalization and locality. We propose
+MEMoE, a model editing adapter utilizing a Mixture of Experts (MoE)
+architecture with a knowledge anchor routing strategy. MEMoE updates knowledge
+using a bypass MoE structure, keeping the original parameters unchanged to
+preserve the general ability of LLMs. And, the knowledge anchor routing ensures
+that inputs requiring similar knowledge are routed to the same expert, thereby
+enhancing the generalization of the updated knowledge. Experimental results
+show the superiority of our approach over both batch editing and sequential
+batch editing tasks, exhibiting exceptional overall performance alongside
+outstanding balance between generalization and locality. Our code will be
+available.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                    </div>
+                </details>
+            </article>
+            <article>
+                <details>
+                    <Summary>
+                        Computer Vision and Pattern Recognition <span class="chip" style="font-size: 60%">46</span>
+                    </Summary>
+                    <div class="details-content">
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ LLM-based Hierarchical Concept Decomposition for Interpretable
+  Fine-Grained Image Classification 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.18672v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.18672v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Renyi Qu, Mark Yatskar
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  (Renyi Qu's Master's Thesis) Recent advancements in interpretable models for
+vision-language tasks have achieved competitive performance; however, their
+interpretability often suffers due to the reliance on unstructured text outputs
+from large language models (LLMs). This introduces randomness and compromises
+both transparency and reliability, which are essential for addressing safety
+issues in AI systems. We introduce \texttt{Hi-CoDe} (Hierarchical Concept
+Decomposition), a novel framework designed to enhance model interpretability
+through structured concept analysis. Our approach consists of two main
+components: (1) We use GPT-4 to decompose an input image into a structured
+hierarchy of visual concepts, thereby forming a visual concept tree. (2) We
+then employ an ensemble of simple linear classifiers that operate on
+concept-specific features derived from CLIP to perform classification. Our
+approach not only aligns with the performance of state-of-the-art models but
+also advances transparency by providing clear insights into the decision-making
+process and highlighting the importance of various concepts. This allows for a
+detailed analysis of potential failure modes and improves model compactness,
+therefore setting a new benchmark in interpretability without compromising the
+accuracy.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Slot Abstractors: Toward Scalable Abstract Visual Reasoning 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2403.03458v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2403.03458v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Shanka Subhra Mondal, Jonathan D. Cohen, Taylor W. Webb
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Abstract visual reasoning is a characteristically human ability, allowing the
+identification of relational patterns that are abstracted away from object
+features, and the systematic generalization of those patterns to unseen
+problems. Recent work has demonstrated strong systematic generalization in
+visual reasoning tasks involving multi-object inputs, through the integration
+of slot-based methods used for extracting object-centric representations
+coupled with strong inductive biases for relational abstraction. However, this
+approach was limited to problems containing a single rule, and was not scalable
+to visual reasoning problems containing a large number of objects. Other recent
+work proposed Abstractors, an extension of Transformers that incorporates
+strong relational inductive biases, thereby inheriting the Transformer's
+scalability and multi-head architecture, but it has yet to be demonstrated how
+this approach might be applied to multi-object visual inputs. Here we combine
+the strengths of the above approaches and propose Slot Abstractors, an approach
+to abstract visual reasoning that can be scaled to problems involving a large
+number of objects and multiple relations among them. The approach displays
+state-of-the-art performance across four abstract visual reasoning tasks, as
+well as an abstract reasoning task involving real-world images.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>18 pages, 9 figures</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Sp2360: Sparse-view 360 Scene Reconstruction using Cascaded 2D Diffusion
+  Priors 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.16517v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.16517v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Soumava Paul, Christopher Wewer, Bernt Schiele, Jan Eric Lenssen
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  We aim to tackle sparse-view reconstruction of a 360 3D scene using priors
+from latent diffusion models (LDM). The sparse-view setting is ill-posed and
+underconstrained, especially for scenes where the camera rotates 360 degrees
+around a point, as no visual information is available beyond some frontal views
+focused on the central object(s) of interest. In this work, we show that
+pretrained 2D diffusion models can strongly improve the reconstruction of a
+scene with low-cost fine-tuning. Specifically, we present SparseSplat360
+(Sp2360), a method that employs a cascade of in-painting and artifact removal
+models to fill in missing details and clean novel views. Due to superior
+training and rendering speeds, we use an explicit scene representation in the
+form of 3D Gaussians over NeRF-based implicit representations. We propose an
+iterative update strategy to fuse generated pseudo novel views with existing 3D
+Gaussians fitted to the initial sparse inputs. As a result, we obtain a
+multi-view consistent scene representation with details coherent with the
+observed inputs. Our evaluation on the challenging Mip-NeRF360 dataset shows
+that our proposed 2D to 3D distillation algorithm considerably improves the
+performance of a regularized version of 3DGS adapted to a sparse-view setting
+and outperforms existing sparse-view reconstruction methods in 360 scene
+reconstruction. Qualitatively, our method generates entire 360 scenes from as
+few as 9 input views, with a high degree of foreground and background detail.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>18 pages, 11 figures, 4 tables</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Improving Prototypical Part Networks with Reward Reweighing,
+  Reselection, and Retraining 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2307.03887v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2307.03887v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Aaron J. Li, Robin Netzorg, Zhihan Cheng, Zhuoqin Zhang, Bin Yu
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  In recent years, work has gone into developing deep interpretable methods for
+image classification that clearly attributes a model's output to specific
+features of the data. One such of these methods is the Prototypical Part
+Network (ProtoPNet), which attempts to classify images based on meaningful
+parts of the input. While this architecture is able to produce visually
+interpretable classifications, it often learns to classify based on parts of
+the image that are not semantically meaningful. To address this problem, we
+propose the Reward Reweighing, Reselecting, and Retraining (R3) post-processing
+framework, which performs three additional corrective updates to a pretrained
+ProtoPNet in an offline and efficient manner. The first two steps involve
+learning a reward model based on collected human feedback and then aligning the
+prototypes with human preferences. The final step is retraining, which realigns
+the base features and the classifier layer of the original model with the
+updated prototypes. We find that our R3 framework consistently improves both
+the interpretability and the predictive accuracy of ProtoPNet and its variants.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Position: Do Not Explain Vision Models Without Context <span class="chip">ICML</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2404.18316v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2404.18316v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Paulina Tomaszewska, Przemysław Biecek
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Does the stethoscope in the picture make the adjacent person a doctor or a
+patient? This, of course, depends on the contextual relationship of the two
+objects. If it's obvious, why don't explanation methods for vision models use
+contextual information? In this paper, we (1) review the most popular methods
+of explaining computer vision models by pointing out that they do not take into
+account context information, (2) show examples of failures of popular XAI
+methods, (3) provide examples of real-world use cases where spatial context
+plays a significant role, (4) propose new research directions that may lead to
+better use of context information in explaining computer vision models, (5)
+argue that a change in approach to explanations is needed from 'where' to
+'how'.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted at International Conference on Machine Learning (ICML) 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Quantile-based Maximum Likelihood Training for Outlier Detection <span class="chip">AAAI 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2310.06085v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2310.06085v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Masoud Taghikhah, Nishant Kumar, Siniša Šegvić, Abouzar Eslami, Stefan Gumhold
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Discriminative learning effectively predicts true object class for image
+classification. However, it often results in false positives for outliers,
+posing critical concerns in applications like autonomous driving and video
+surveillance systems. Previous attempts to address this challenge involved
+training image classifiers through contrastive learning using actual outlier
+data or synthesizing outliers for self-supervised learning. Furthermore,
+unsupervised generative modeling of inliers in pixel space has shown limited
+success for outlier detection. In this work, we introduce a quantile-based
+maximum likelihood objective for learning the inlier distribution to improve
+the outlier separation during inference. Our approach fits a normalizing flow
+to pre-trained discriminative features and detects the outliers according to
+the evaluated log-likelihood. The experimental evaluation demonstrates the
+effectiveness of our method as it surpasses the performance of the
+state-of-the-art unsupervised methods for outlier detection. The results are
+also competitive compared with a recent self-supervised approach for outlier
+detection. Our work allows to reduce dependency on well-sampled negative
+training data, which is especially important for domains like medical
+diagnostics or remote sensing.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Camera Ready Version. Accepted at AAAI 2024. Code available at
+  https://github.com/taghikhah/QuantOD</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ DiG-IN: Diffusion Guidance for Investigating Networks -- Uncovering
+  Classifier Differences, Neuron Visualisations, and Visual Counterfactual
+  Explanations <span class="chip">CVPR 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2311.17833v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2311.17833v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Maximilian Augustin, Yannic Neuhaus, Matthias Hein
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  While deep learning has led to huge progress in complex image classification
+tasks like ImageNet, unexpected failure modes, e.g. via spurious features, call
+into question how reliably these classifiers work in the wild. Furthermore, for
+safety-critical tasks the black-box nature of their decisions is problematic,
+and explanations or at least methods which make decisions plausible are needed
+urgently. In this paper, we address these problems by generating images that
+optimize a classifier-derived objective using a framework for guided image
+generation. We analyze the decisions of image classifiers by visual
+counterfactual explanations (VCEs), detection of systematic mistakes by
+analyzing images where classifiers maximally disagree, and visualization of
+neurons and spurious features. In this way, we validate existing observations,
+e.g. the shape bias of adversarially robust models, as well as novel failure
+modes, e.g. systematic errors of zero-shot CLIP classifiers. Moreover, our VCEs
+outperform previous work while being more versatile.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>CVPR 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Solar Panel Segmentation :<span class="highlight-title">Self-Supervised</span> Learning Solutions for
+  Imperfect <span class="highlight-title">Dataset</span>s <span class="chip">ICLR</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2402.12843v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2402.12843v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Sankarshanaa Sagaram, Krish Didwania, Laven Srivastava, Aditya Kasliwal, Pallavi Kailas, Ujjwal Verma
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  The increasing adoption of solar energy necessitates advanced methodologies
+for monitoring and maintenance to ensure optimal performance of solar panel
+installations. A critical component in this context is the accurate
+segmentation of solar panels from aerial or satellite imagery, which is
+essential for identifying operational issues and assessing efficiency. This
+paper addresses the significant challenges in panel segmentation, particularly
+the scarcity of annotated data and the labour-intensive nature of manual
+annotation for supervised learning. We explore and apply Self-Supervised
+Learning (SSL) to solve these challenges. We demonstrate that SSL significantly
+enhances model generalization under various conditions and reduces dependency
+on manually annotated data, paving the way for robust and adaptable solar panel
+segmentation solutions.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Published at ICLR Tiny Paper 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ CoLLaVO: Crayon Large Language and Vision mOdel <span class="chip">ACL 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2402.11248v4">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2402.11248v4.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Byung-Kwan Lee, Beomchan Park, Chae Won Kim, Yong Man Ro
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  The remarkable success of Large Language Models (LLMs) and instruction tuning
+drives the evolution of Vision Language Models (VLMs) towards a versatile
+general-purpose model. Yet, it remains unexplored whether current VLMs
+genuinely possess quality object-level image understanding capabilities
+determined from 'what objects are in the image?' or 'which object corresponds
+to a specified bounding box?'. Our findings reveal that the image understanding
+capabilities of current VLMs are strongly correlated with their zero-shot
+performance on vision language (VL) tasks. This suggests that prioritizing
+basic image understanding is crucial for VLMs to excel at VL tasks. To enhance
+object-level image understanding, we propose Crayon Large Language and Vision
+mOdel (CoLLaVO), which incorporates instruction tuning with Crayon Prompt as a
+new visual prompt tuning scheme based on panoptic color maps. Furthermore, we
+present a learning strategy of Dual QLoRA to preserve object-level image
+understanding without forgetting it during visual instruction tuning, thereby
+achieving a significant leap in numerous VL benchmarks in a zero-shot setting.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>ACL 2024 Findings. Code available:
+  https://github.com/ByungKwanLee/CoLLaVO</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Solving the bongard-logo problem by modeling a probabilistic model 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2403.03173v7">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2403.03173v7.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Ruizhuo Song, Beiming Yuan
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Abstract reasoning problems pose challenges to the perception and cognition
+abilities of AI algorithms, demanding deeper pattern recognition and inductive
+reasoning beyond mere identification of explicit image features. In this study,
+we introduce PMoC, a probabilistic model tailored for the Bongard-Logo problem,
+achieving high reasoning accuracy through the construction of an conditional
+probabilistic model. Additionally, we have designed the Pose-Transformer, an
+enhanced Transformer-Encoder specifically crafted for complex abstract
+reasoning tasks, including Bongard-Logo, RAVEN, I-RAVEN, and PGM. Inspired by
+the pose matrix in capsule networks, Pose-Transformer strengthens the focus on
+positional relationships between local features when processing image data.
+When combined with PMoC, it can further enhance reasoning accuracy. Our
+Pose-Transformer effectively addresses reasoning difficulties associated with
+changes in the position of abstract entities, outperforming previous models on
+RAVEN's OIG, D3$\times$3 subsets, and the PGM dataset. Finally, considering the
+deployment difficulties arising from the large number of Pose-Transformer
+parameters, this paper presents a lightweight version, Straw-Pose-Transformer,
+which maintains performance while significantly reducing the parameter count.
+This study contributes to enhancing AI capabilities in abstract reasoning and
+cognitive pattern recognition.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>14 pages, 11 figures, 4 tables</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ BC-MRI-SEG: A Breast Cancer MRI Tumor Segmentation Benchmark 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2404.13756v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2404.13756v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Anthony Bilic, Chen Chen
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Binary breast cancer tumor segmentation with Magnetic Resonance Imaging (MRI)
+data is typically trained and evaluated on private medical data, which makes
+comparing deep learning approaches difficult. We propose a benchmark
+(BC-MRI-SEG) for binary breast cancer tumor segmentation based on publicly
+available MRI datasets. The benchmark consists of four datasets in total, where
+two datasets are used for supervised training and evaluation, and two are used
+for zero-shot evaluation. Additionally we compare state-of-the-art (SOTA)
+approaches on our benchmark and provide an exhaustive list of available public
+breast cancer MRI datasets. The source code has been made available at
+https://irulenot.github.io/BC_MRI_SEG_Benchmark.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Comparative Analysis of Hyperspectral Image Reconstruction Using Deep
+  Learning for Agricultural and Biological Applications 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.13331v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.13331v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Md. Toukir Ahmed, Arthur Villordon, Mohammed Kamruzzaman
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Hyperspectral imaging (HSI) has become a key technology for non-invasive
+quality evaluation in various fields, offering detailed insights through
+spatial and spectral data. Despite its efficacy, the complexity and high cost
+of HSI systems have hindered their widespread adoption. This study addressed
+these challenges by exploring deep learning-based hyperspectral image
+reconstruction from RGB (Red, Green, Blue) images, particularly for
+agricultural products. Specifically, different hyperspectral reconstruction
+algorithms, such as Hyperspectral Convolutional Neural Network - Dense
+(HSCNN-D), High-Resolution Network (HRNET), and Multi-Scale Transformer Plus
+Plus (MST++), were compared to assess the dry matter content of sweet potatoes.
+Among the tested reconstruction methods, HRNET demonstrated superior
+performance, achieving the lowest mean relative absolute error (MRAE) of 0.07,
+root mean square error (RMSE) of 0.03, and the highest peak signal-to-noise
+ratio (PSNR) of 32.28 decibels (dB). Some key features were selected using the
+genetic algorithm (GA), and their importance was interpreted using explainable
+artificial intelligence (XAI). Partial least squares regression (PLSR) models
+were developed using the RGB, reconstructed, and ground truth (GT) data. The
+visual and spectra quality of these reconstructed methods was compared with GT
+data, and predicted maps were generated. The results revealed the prospect of
+deep learning-based hyperspectral image reconstruction as a cost-effective and
+efficient quality assessment tool for agricultural and biological applications.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Under review</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ MV-MR: multi-views and multi-representations for <span class="highlight-title">self-supervised</span>
+  learning and knowledge distillation 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2303.12130v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2303.12130v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Vitaliy Kinakh, Mariia Drozdova, Slava Voloshynovskiy
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  We present a new method of self-supervised learning and knowledge
+distillation based on the multi-views and multi-representations (MV-MR). The
+MV-MR is based on the maximization of dependence between learnable embeddings
+from augmented and non-augmented views, jointly with the maximization of
+dependence between learnable embeddings from augmented view and multiple
+non-learnable representations from non-augmented view. We show that the
+proposed method can be used for efficient self-supervised classification and
+model-agnostic knowledge distillation. Unlike other self-supervised techniques,
+our approach does not use any contrastive learning, clustering, or stop
+gradients. MV-MR is a generic framework allowing the incorporation of
+constraints on the learnable embeddings via the usage of image
+multi-representations as regularizers. Along this line, knowledge distillation
+is considered a particular case of such a regularization. MV-MR provides the
+state-of-the-art performance on the STL10 and ImageNet-1K datasets among
+non-contrastive and clustering-free methods. We show that a lower complexity
+ResNet50 model pretrained using proposed knowledge distillation based on the
+CLIP ViT model achieves state-of-the-art performance on STL10 linear
+evaluation. The code is available at: https://github.com/vkinakh/mv-mr
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Enhancing Super-Resolution Networks through Realistic Thick-Slice CT
+  Simulation 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2307.10182v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2307.10182v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Zeyu Tang, Xiaodan Xing, Guang Yang
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Deep learning-based Generative Models have the potential to convert
+low-resolution CT images into high-resolution counterparts without long
+acquisition times and increased radiation exposure in thin-slice CT imaging.
+However, procuring appropriate training data for these Super-Resolution (SR)
+models is challenging. Previous SR research has simulated thick-slice CT images
+from thin-slice CT images to create training pairs. However, these methods
+either rely on simplistic interpolation techniques that lack realism or
+sinogram reconstruction, which require the release of raw data and complex
+reconstruction algorithms. Thus, we introduce a simple yet realistic method to
+generate thick CT images from thin-slice CT images, facilitating the creation
+of training pairs for SR algorithms. The training pairs produced by our method
+closely resemble real data distributions (PSNR=49.74 vs. 40.66, p$<$0.05). A
+multivariate Cox regression analysis involving thick slice CT images with lung
+fibrosis revealed that only the radiomics features extracted using our method
+demonstrated a significant correlation with mortality (HR=1.19 and HR=1.14,
+p$<$0.005). This paper represents the first to identify and address the
+challenge of generating appropriate paired training data for Deep
+Learning-based CT SR models, which enhances the efficacy and applicability of
+SR models in real-world scenarios.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>11 pages, 4 figures</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Multimodal ArXiv: A <span class="highlight-title">Dataset</span> for Improving Scientific Comprehension of
+  Large Vision-Language Models <span class="chip">ACL
+  2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2403.00231v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2403.00231v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Lei Li, Yuqi Wang, Runxin Xu, Peiyi Wang, Xiachong Feng, Lingpeng Kong, Qi Liu
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Large vision-language models (LVLMs) excel across diverse tasks involving
+concrete images from natural scenes. However, their ability to interpret
+abstract figures, such as geometry shapes and scientific plots, remains limited
+due to a scarcity of training datasets in scientific domains. To fill this gap,
+we introduce Multimodal ArXiv, consisting of ArXivCap and ArXivQA, for
+enhancing LVLMs scientific comprehension. ArXivCap is a figure-caption dataset
+comprising 6.4M images and 3.9M captions, sourced from 572K ArXiv papers
+spanning various scientific domains. Drawing from ArXivCap, we introduce
+ArXivQA, a question-answering dataset generated by prompting GPT-4V based on
+scientific figures. ArXivQA greatly enhances open-sourced LVLMs' mathematical
+reasoning capabilities, achieving a 10.4\% absolute accuracy gain on a
+multimodal mathematical reasoning benchmark. Furthermore, employing ArXivCap,
+we devise four vision-to-text tasks for benchmarking LVLMs. Evaluation results
+with state-of-the-art LVLMs underscore their struggle with the nuanced
+semantics of academic figures, while domain-specific training yields
+substantial performance gains. Our error analysis uncovers misinterpretations
+of visual context, recognition errors, and the production of overly simplified
+captions by current LVLMs, shedding light on future improvements.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Project page: https://mm-arxiv.github.io, Camera Ready Version of ACL
+  2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Cephalo: Multi-Modal Vision-Language Models for Bio-Inspired Materials
+  Analysis and Design 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.19076v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.19076v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Markus J. Buehler
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  We present Cephalo, a series of multimodal vision large language models
+(V-LLMs) designed for materials science applications, integrating visual and
+linguistic data for enhanced understanding and interaction within human-AI and
+multi-agent AI frameworks. A key innovation of Cephalo is its advanced dataset
+generation method, which employs a sophisticated algorithm to accurately detect
+and separate images and their corresponding textual descriptions from PDF
+documents, such as scientific papers. The method includes a careful refinement
+of image-text pairs through integrated vision and language processing, ensuring
+high-quality, contextually relevant, and well reasoned training data. Cephalo
+is trained on integrated image and text data extracted from thousands of
+scientific papers and science-focused Wikipedia pages demonstrates can
+interpret complex visual scenes, generate precise language descriptions, and
+answer queries about images effectively. The combination of a vision encoder
+with an autoregressive transformer supports complex natural language
+understanding in an integrated model, which can be coupled with other
+generative methods to create an image-to-text-to-image or image-to-text-to-3D
+pipeline. To explore the development of larger models from smaller ones, we
+report both mixture-of-expert methods and model merging. These hybrid
+approaches allow us to leverage the domain-specific expertise and general
+conversational capabilities to harness the strengths of multiple models. We
+examine the models in diverse use cases that incorporate biological materials,
+fracture and engineering analysis, protein biophysics, and bio-inspired design
+based on insect behavior. Generative applications include bio-inspired designs,
+including pollen-inspired architected materials, as well as the synthesis of
+bio-inspired material microstructures from a photograph of a solar eclipse.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ StereoDiffusion: Training-Free Stereo Image Generation Using Latent
+  Diffusion Models <span class="chip">CVPR 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2403.04965v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2403.04965v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Lezhong Wang, Jeppe Revall Frisvad, Mark Bo Jensen, Siavash Arjomand Bigdeli
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  The demand for stereo images increases as manufacturers launch more XR
+devices. To meet this demand, we introduce StereoDiffusion, a method that,
+unlike traditional inpainting pipelines, is trainning free, remarkably
+straightforward to use, and it seamlessly integrates into the original Stable
+Diffusion model. Our method modifies the latent variable to provide an
+end-to-end, lightweight capability for fast generation of stereo image pairs,
+without the need for fine-tuning model weights or any post-processing of
+images. Using the original input to generate a left image and estimate a
+disparity map for it, we generate the latent vector for the right image through
+Stereo Pixel Shift operations, complemented by Symmetric Pixel Shift Masking
+Denoise and Self-Attention Layers Modification methods to align the right-side
+image with the left-side image. Moreover, our proposed method maintains a high
+standard of image quality throughout the stereo generation process, achieving
+state-of-the-art scores in various quantitative evaluations.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Updated to CVPR 2024 GCV accepted version</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ NeRF On-the-go: Exploiting Uncertainty for Distractor-free NeRFs in the
+  Wild <span class="chip">CVPR 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.18715v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.18715v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Weining Ren, Zihan Zhu, Boyang Sun, Jiaqi Chen, Marc Pollefeys, Songyou Peng
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Neural Radiance Fields (NeRFs) have shown remarkable success in synthesizing
+photorealistic views from multi-view images of static scenes, but face
+challenges in dynamic, real-world environments with distractors like moving
+objects, shadows, and lighting changes. Existing methods manage controlled
+environments and low occlusion ratios but fall short in render quality,
+especially under high occlusion scenarios. In this paper, we introduce NeRF
+On-the-go, a simple yet effective approach that enables the robust synthesis of
+novel views in complex, in-the-wild scenes from only casually captured image
+sequences. Delving into uncertainty, our method not only efficiently eliminates
+distractors, even when they are predominant in captures, but also achieves a
+notably faster convergence speed. Through comprehensive experiments on various
+scenes, our method demonstrates a significant improvement over state-of-the-art
+techniques. This advancement opens new avenues for NeRF in diverse and dynamic
+real-world applications.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>CVPR 2024, first two authors contributed equally. Project Page:
+  https://rwn17.github.io/nerf-on-the-go/</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ LP-OVOD: Open-Vocabulary Object Detection by Linear Probing 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2310.17109v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2310.17109v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Chau Pham, Truong Vu, Khoi Nguyen
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  This paper addresses the challenging problem of open-vocabulary object
+detection (OVOD) where an object detector must identify both seen and unseen
+classes in test images without labeled examples of the unseen classes in
+training. A typical approach for OVOD is to use joint text-image embeddings of
+CLIP to assign box proposals to their closest text label. However, this method
+has a critical issue: many low-quality boxes, such as over- and
+under-covered-object boxes, have the same similarity score as high-quality
+boxes since CLIP is not trained on exact object location information. To
+address this issue, we propose a novel method, LP-OVOD, that discards
+low-quality boxes by training a sigmoid linear classifier on pseudo labels
+retrieved from the top relevant region proposals to the novel text.
+Experimental results on COCO affirm the superior performance of our approach
+over the state of the art, achieving $\textbf{40.5}$ in $\text{AP}_{novel}$
+using ResNet50 as the backbone and without external datasets or knowing novel
+classes during training. Our code will be available at
+https://github.com/VinAIResearch/LP-OVOD.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ <span class="highlight-title">GPT</span>4SGG: Synthesizing Scene Graphs from Holistic and Region-specific
+  Narratives 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2312.04314v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2312.04314v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Zuyao Chen, Jinlin Wu, Zhen Lei, Zhaoxiang Zhang, Changwen Chen
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Training Scene Graph Generation (SGG) models with natural language captions
+has become increasingly popular due to the abundant, cost-effective, and
+open-world generalization supervision signals that natural language offers.
+However, such unstructured caption data and its processing pose significant
+challenges in learning accurate and comprehensive scene graphs. The challenges
+can be summarized as three aspects: 1) traditional scene graph parsers based on
+linguistic representation often fail to extract meaningful relationship
+triplets from caption data. 2) grounding unlocalized objects of parsed triplets
+will meet ambiguity issues in visual-language alignment. 3) caption data
+typically are sparse and exhibit bias to partial observations of image content.
+Aiming to address these problems, we propose a divide-and-conquer strategy with
+a novel framework named \textit{GPT4SGG}, to obtain more accurate and
+comprehensive scene graph signals. This framework decomposes a complex scene
+into a bunch of simple regions, resulting in a set of region-specific
+narratives. With these region-specific narratives (partial observations) and a
+holistic narrative (global observation) for an image, a large language model
+(LLM) performs the relationship reasoning to synthesize an accurate and
+comprehensive scene graph. Experimental results demonstrate \textit{GPT4SGG}
+significantly improves the performance of SGG models trained on image-caption
+data, in which the ambiguity issue and long-tail bias have been well-handled
+with more accurate and comprehensive scene graphs.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ LatentMan: Generating Consistent Animated Characters using Image
+  Diffusion Models <span class="chip">CVPR</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2312.07133v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2312.07133v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Abdelrahman Eldesokey, Peter Wonka
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  We propose a zero-shot approach for generating consistent videos of animated
+characters based on Text-to-Image (T2I) diffusion models. Existing
+Text-to-Video (T2V) methods are expensive to train and require large-scale
+video datasets to produce diverse characters and motions. At the same time,
+their zero-shot alternatives fail to produce temporally consistent videos with
+continuous motion. We strive to bridge this gap, and we introduce LatentMan,
+which leverages existing text-based motion diffusion models to generate diverse
+continuous motions to guide the T2I model. To boost the temporal consistency,
+we introduce the Spatial Latent Alignment module that exploits cross-frame
+dense correspondences that we compute to align the latents of the video frames.
+Furthermore, we propose Pixel-Wise Guidance to steer the diffusion process in a
+direction that minimizes visual discrepancies between frames. Our proposed
+approach outperforms existing zero-shot T2V approaches in generating videos of
+animated characters in terms of pixel-wise consistency and user preference.
+Project page https://abdo-eldesokey.github.io/latentman/.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>CVPRW 2024. Project page: https://abdo-eldesokey.github.io/latentman/</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ ODFormer: Semantic Fundus Image Segmentation Using <span class="highlight-title">Transformer</span> for Optic
+  Nerve Head Detection 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.09552v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.09552v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Jiayi Wang, Yi-An Mao, Xiaoyu Ma, Sicen Guo, Yuting Shao, Xiao Lv, Wenting Han, Mark Christopher, Linda M. Zangwill, Yanlong Bi, Rui Fan
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Optic nerve head (ONH) detection has been a crucial area of study in
+ophthalmology for years. However, the significant discrepancy between fundus
+image datasets, each generated using a single type of fundus camera, poses
+challenges to the generalizability of ONH detection approaches developed based
+on semantic segmentation networks. Despite the numerous recent advancements in
+general-purpose semantic segmentation methods using convolutional neural
+networks (CNNs) and Transformers, there is currently a lack of benchmarks for
+these state-of-the-art (SoTA) networks specifically trained for ONH detection.
+Therefore, in this article, we make contributions from three key aspects:
+network design, the publication of a dataset, and the establishment of a
+comprehensive benchmark. Our newly developed ONH detection network, referred to
+as ODFormer, is based upon the Swin Transformer architecture and incorporates
+two novel components: a multi-scale context aggregator and a lightweight
+bidirectional feature recalibrator. Our published large-scale dataset, known as
+TongjiU-DROD, provides multi-resolution fundus images for each participant,
+captured using two distinct types of cameras. Our established benchmark
+involves three datasets: DRIONS-DB, DRISHTI-GS1, and TongjiU-DROD, created by
+researchers from different countries and containing fundus images captured from
+participants of diverse races and ages. Extensive experimental results
+demonstrate that our proposed ODFormer outperforms other state-of-the-art
+(SoTA) networks in terms of performance and generalizability. Our dataset and
+source code are publicly available at mias.group/ODFormer.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ DiagSet: a <span class="highlight-title">dataset</span> for prostate cancer histopathological image
+  classification 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2105.04014v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2105.04014v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Michał Koziarski, Bogusław Cyganek, Przemysław Niedziela, Bogusław Olborski, Zbigniew Antosz, Marcin Żydak, Bogdan Kwolek, Paweł Wąsowicz, Andrzej Bukała, Jakub Swadźba, Piotr Sitkowski
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Cancer diseases constitute one of the most significant societal challenges.
+In this paper, we introduce a novel histopathological dataset for prostate
+cancer detection. The proposed dataset, consisting of over 2.6 million tissue
+patches extracted from 430 fully annotated scans, 4675 scans with assigned
+binary diagnoses, and 46 scans with diagnoses independently provided by a group
+of histopathologists can be found at
+https://github.com/michalkoziarski/DiagSet. Furthermore, we propose a machine
+learning framework for detection of cancerous tissue regions and prediction of
+scan-level diagnosis, utilizing thresholding to abstain from the decision in
+uncertain cases. The proposed approach, composed of ensembles of deep neural
+networks operating on the histopathological scans at different scales, achieves
+94.6% accuracy in patch-level recognition and is compared in a scan-level
+diagnosis with 9 human histopathologists showing high statistical agreement.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Improving Diffusion Models for Inverse Problems Using Optimal Posterior
+  Covariance 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2402.02149v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2402.02149v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Xinyu Peng, Ziyang Zheng, Wenrui Dai, Nuoqian Xiao, Chenglin Li, Junni Zou, Hongkai Xiong
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Recent diffusion models provide a promising zero-shot solution to noisy
+linear inverse problems without retraining for specific inverse problems. In
+this paper, we reveal that recent methods can be uniformly interpreted as
+employing a Gaussian approximation with hand-crafted isotropic covariance for
+the intractable denoising posterior to approximate the conditional posterior
+mean. Inspired by this finding, we propose to improve recent methods by using
+more principled covariance determined by maximum likelihood estimation. To
+achieve posterior covariance optimization without retraining, we provide
+general plug-and-play solutions based on two approaches specifically designed
+for leveraging pre-trained models with and without reverse covariance. We
+further propose a scalable method for learning posterior covariance prediction
+based on representation with orthonormal basis. Experimental results
+demonstrate that the proposed methods significantly enhance reconstruction
+performance without requiring hyperparameter tuning.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Towards a Better Evaluation of Out-of-Domain Generalization 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.19703v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.19703v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Duhun Hwang, Suhyun Kang, Moonjung Eo, Jimyeong Kim, Wonjong Rhee
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  The objective of Domain Generalization (DG) is to devise algorithms and
+models capable of achieving high performance on previously unseen test
+distributions. In the pursuit of this objective, average measure has been
+employed as the prevalent measure for evaluating models and comparing
+algorithms in the existing DG studies. Despite its significance, a
+comprehensive exploration of the average measure has been lacking and its
+suitability in approximating the true domain generalization performance has
+been questionable. In this study, we carefully investigate the limitations
+inherent in the average measure and propose worst+gap measure as a robust
+alternative. We establish theoretical grounds of the proposed measure by
+deriving two theorems starting from two different assumptions. We conduct
+extensive experimental investigations to compare the proposed worst+gap measure
+with the conventional average measure. Given the indispensable need to access
+the true DG performance for studying measures, we modify five existing datasets
+to come up with SR-CMNIST, C-Cats&Dogs, L-CIFAR10, PACS-corrupted, and
+VLCS-corrupted datasets. The experiment results unveil an inferior performance
+of the average measure in approximating the true DG performance and confirm the
+robustness of the theoretically supported worst+gap measure.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ MOFA-Video: Controllable Image Animation via Generative Motion Field
+  Adaptions in Frozen Image-to-Video Diffusion Model 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20222v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20222v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Muyao Niu, Xiaodong Cun, Xintao Wang, Yong Zhang, Ying Shan, Yinqiang Zheng
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  We present MOFA-Video, an advanced controllable image animation method that
+generates video from the given image using various additional controllable
+signals (such as human landmarks reference, manual trajectories, and another
+even provided video) or their combinations. This is different from previous
+methods which only can work on a specific motion domain or show weak control
+abilities with diffusion prior. To achieve our goal, we design several
+domain-aware motion field adapters (\ie, MOFA-Adapters) to control the
+generated motions in the video generation pipeline. For MOFA-Adapters, we
+consider the temporal motion consistency of the video and generate the dense
+motion flow from the given sparse control conditions first, and then, the
+multi-scale features of the given image are wrapped as a guided feature for
+stable video diffusion generation. We naively train two motion adapters for the
+manual trajectories and the human landmarks individually since they both
+contain sparse information about the control. After training, the MOFA-Adapters
+in different domains can also work together for more controllable video
+generation. Project Page: https://myniuuu.github.io/MOFA_Video/
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Project Page: https://myniuuu.github.io/MOFA_Video/ ; Codes:
+  https://github.com/MyNiuuu/MOFA-Video</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ iVideo<span class="highlight-title">GPT</span>: Interactive Video<span class="highlight-title">GPT</span>s are Scalable World Models 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.15223v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.15223v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Jialong Wu, Shaofeng Yin, Ningya Feng, Xu He, Dong Li, Jianye Hao, Mingsheng Long
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  World models empower model-based agents to interactively explore, reason, and
+plan within imagined environments for real-world decision-making. However, the
+high demand for interactivity poses challenges in harnessing recent
+advancements in video generative models for developing world models at scale.
+This work introduces Interactive VideoGPT (iVideoGPT), a scalable
+autoregressive transformer framework that integrates multimodal signals--visual
+observations, actions, and rewards--into a sequence of tokens, facilitating an
+interactive experience of agents via next-token prediction. iVideoGPT features
+a novel compressive tokenization technique that efficiently discretizes
+high-dimensional visual observations. Leveraging its scalable architecture, we
+are able to pre-train iVideoGPT on millions of human and robotic manipulation
+trajectories, establishing a versatile foundation that is adaptable to serve as
+interactive world models for a wide range of downstream tasks. These include
+action-conditioned video prediction, visual planning, and model-based
+reinforcement learning, where iVideoGPT achieves competitive performance
+compared with state-of-the-art methods. Our work advances the development of
+interactive general world models, bridging the gap between generative video
+models and practical model-based reinforcement learning applications.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Project website: https://thuml.github.io/iVideoGPT</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ GenTron: Diffusion <span class="highlight-title">Transformer</span>s for Image and Video Generation <span class="chip">CVPR2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2312.04557v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2312.04557v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Shoufa Chen, Mengmeng Xu, Jiawei Ren, Yuren Cong, Sen He, Yanping Xie, Animesh Sinha, Ping Luo, Tao Xiang, Juan-Manuel Perez-Rua
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  In this study, we explore Transformer-based diffusion models for image and
+video generation. Despite the dominance of Transformer architectures in various
+fields due to their flexibility and scalability, the visual generative domain
+primarily utilizes CNN-based U-Net architectures, particularly in
+diffusion-based models. We introduce GenTron, a family of Generative models
+employing Transformer-based diffusion, to address this gap. Our initial step
+was to adapt Diffusion Transformers (DiTs) from class to text conditioning, a
+process involving thorough empirical exploration of the conditioning mechanism.
+We then scale GenTron from approximately 900M to over 3B parameters, observing
+significant improvements in visual quality. Furthermore, we extend GenTron to
+text-to-video generation, incorporating novel motion-free guidance to enhance
+video quality. In human evaluations against SDXL, GenTron achieves a 51.1% win
+rate in visual quality (with a 19.8% draw rate), and a 42.3% win rate in text
+alignment (with a 42.9% draw rate). GenTron also excels in the T2I-CompBench,
+underscoring its strengths in compositional generation. We believe this work
+will provide meaningful insights and serve as a valuable reference for future
+research.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>CVPR2024 Camera Ready. Website:
+  https://www.shoufachen.com/gentron_website/</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Is Ego Status All You Need for Open-Loop End-to-End Autonomous Driving? 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2312.03031v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2312.03031v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Zhiqi Li, Zhiding Yu, Shiyi Lan, Jiahan Li, Jan Kautz, Tong Lu, Jose M. Alvarez
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  End-to-end autonomous driving recently emerged as a promising research
+direction to target autonomy from a full-stack perspective. Along this line,
+many of the latest works follow an open-loop evaluation setting on nuScenes to
+study the planning behavior. In this paper, we delve deeper into the problem by
+conducting thorough analyses and demystifying more devils in the details. We
+initially observed that the nuScenes dataset, characterized by relatively
+simple driving scenarios, leads to an under-utilization of perception
+information in end-to-end models incorporating ego status, such as the ego
+vehicle's velocity. These models tend to rely predominantly on the ego
+vehicle's status for future path planning. Beyond the limitations of the
+dataset, we also note that current metrics do not comprehensively assess the
+planning quality, leading to potentially biased conclusions drawn from existing
+benchmarks. To address this issue, we introduce a new metric to evaluate
+whether the predicted trajectories adhere to the road. We further propose a
+simple baseline able to achieve competitive results without relying on
+perception annotations. Given the current limitations on the benchmark and
+metrics, we suggest the community reassess relevant prevailing research and be
+cautious whether the continued pursuit of state-of-the-art would yield
+convincing and universal conclusions. Code and models are available at
+\url{https://github.com/NVlabs/BEV-Planner}
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accept to cvpr 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Towards Task Sampler Learning for Meta-Learning 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2307.08924v4">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2307.08924v4.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Jingyao Wang, Wenwen Qiang, Xingzhe Su, Changwen Zheng, Fuchun Sun, Hui Xiong
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Meta-learning aims to learn general knowledge with diverse training tasks
+conducted from limited data, and then transfer it to new tasks. It is commonly
+believed that increasing task diversity will enhance the generalization ability
+of meta-learning models. However, this paper challenges this view through
+empirical and theoretical analysis. We obtain three conclusions: (i) there is
+no universal task sampling strategy that can guarantee the optimal performance
+of meta-learning models; (ii) over-constraining task diversity may incur the
+risk of under-fitting or over-fitting during training; and (iii) the
+generalization performance of meta-learning models are affected by task
+diversity, task entropy, and task difficulty. Based on this insight, we design
+a novel task sampler, called Adaptive Sampler (ASr). ASr is a plug-and-play
+module that can be integrated into any meta-learning framework. It dynamically
+adjusts task weights according to task diversity, task entropy, and task
+difficulty, thereby obtaining the optimal probability distribution for
+meta-training tasks. Finally, we conduct experiments on a series of benchmark
+datasets across various scenarios, and the results demonstrate that ASr has
+clear advantages.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>accepted by IJCV</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Unsupervised Domain Adaptation for Low-dose CT Reconstruction via
+  Bayesian Uncertainty Alignment 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2302.13251v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2302.13251v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Kecheng Chen, Jie Liu, Renjie Wan, Victor Ho-Fun Lee, Varut Vardhanabhuti, Hong Yan, Haoliang Li
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Low-dose computed tomography (LDCT) image reconstruction techniques can
+reduce patient radiation exposure while maintaining acceptable imaging quality.
+Deep learning is widely used in this problem, but the performance of testing
+data (a.k.a. target domain) is often degraded in clinical scenarios due to the
+variations that were not encountered in training data (a.k.a. source domain).
+Unsupervised domain adaptation (UDA) of LDCT reconstruction has been proposed
+to solve this problem through distribution alignment. However, existing UDA
+methods fail to explore the usage of uncertainty quantification, which is
+crucial for reliable intelligent medical systems in clinical scenarios with
+unexpected variations. Moreover, existing direct alignment for different
+patients would lead to content mismatch issues. To address these issues, we
+propose to leverage a probabilistic reconstruction framework to conduct a joint
+discrepancy minimization between source and target domains in both the latent
+and image spaces. In the latent space, we devise a Bayesian uncertainty
+alignment to reduce the epistemic gap between the two domains. This approach
+reduces the uncertainty level of target domain data, making it more likely to
+render well-reconstructed results on target domains. In the image space, we
+propose a sharpness-aware distribution alignment to achieve a match of
+second-order information, which can ensure that the reconstructed images from
+the target domain have similar sharpness to normal-dose CT images from the
+source domain. Experimental results on two simulated datasets and one clinical
+low-dose imaging dataset show that our proposed method outperforms other
+methods in quantitative and visualized performance.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted by IEEE Transactions on Neural Networks and Learning Systems</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ MBQuant: A Novel Multi-Branch Topology Method for Arbitrary Bit-width
+  Network Quantization 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2305.08117v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2305.08117v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Yunshan Zhong, Yuyao Zhou, Fei Chao, Rongrong Ji
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Arbitrary bit-width network quantization has received significant attention
+due to its high adaptability to various bit-width requirements during runtime.
+However, in this paper, we investigate existing methods and observe a
+significant accumulation of quantization errors caused by switching weight and
+activations bit-widths, leading to limited performance. To address this issue,
+we propose MBQuant, a novel method that utilizes a multi-branch topology for
+arbitrary bit-width quantization. MBQuant duplicates the network body into
+multiple independent branches, where the weights of each branch are quantized
+to a fixed 2-bit and the activations remain in the input bit-width. The
+computation of a desired bit-width is completed by selecting an appropriate
+number of branches that satisfy the original computational constraint. By
+fixing the weight bit-width, this approach substantially reduces quantization
+errors caused by switching weight bit-widths. Additionally, we introduce an
+amortization branch selection strategy to distribute quantization errors caused
+by switching activation bit-widths among branches to improve performance.
+Finally, we adopt an in-place distillation strategy that facilitates guidance
+between branches to further enhance MBQuant's performance. Extensive
+experiments demonstrate that MBQuant achieves significant performance gains
+compared to existing arbitrary bit-width quantization methods. Code is at
+https://github.com/zysxmu/MultiQuant.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Language-guided Image Reflection Separation 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2402.11874v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2402.11874v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Haofeng Zhong, Yuchen Hong, Shuchen Weng, Jinxiu Liang, Boxin Shi
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  This paper studies the problem of language-guided reflection separation,
+which aims at addressing the ill-posed reflection separation problem by
+introducing language descriptions to provide layer content. We propose a
+unified framework to solve this problem, which leverages the cross-attention
+mechanism with contrastive learning strategies to construct the correspondence
+between language descriptions and image layers. A gated network design and a
+randomized training strategy are employed to tackle the recognizable layer
+ambiguity. The effectiveness of the proposed method is validated by the
+significant performance advantage over existing reflection separation methods
+on both quantitative and qualitative comparisons.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Skeleton-in-Context: Unified Skeleton Sequence Modeling with In-Context
+  Learning 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2312.03703v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2312.03703v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Xinshun Wang, Zhongbin Fang, Xia Li, Xiangtai Li, Mengyuan Liu
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  In-context learning provides a new perspective for multi-task modeling for
+vision and NLP. Under this setting, the model can perceive tasks from prompts
+and accomplish them without any extra task-specific head predictions or model
+fine-tuning. However, Skeleton sequence modeling via in-context learning
+remains unexplored. Directly applying existing in-context models from other
+areas onto skeleton sequences fails due to the inter-frame and cross-task pose
+similarity that makes it outstandingly hard to perceive the task correctly from
+a subtle context. To address this challenge, we propose Skeleton-in-Context
+(SiC), an effective framework for in-context skeleton sequence modeling. Our
+SiC is able to handle multiple skeleton-based tasks simultaneously after a
+single training process and accomplish each task from context according to the
+given prompt. It can further generalize to new, unseen tasks according to
+customized prompts. To facilitate context perception, we additionally propose a
+task-unified prompt, which adaptively learns tasks of different natures, such
+as partial joint-level generation, sequence-level prediction, or 2D-to-3D
+motion prediction. We conduct extensive experiments to evaluate the
+effectiveness of our SiC on multiple tasks, including motion prediction, pose
+estimation, joint completion, and future pose estimation. We also evaluate its
+generalization capability on unseen tasks such as motion-in-between. These
+experiments show that our model achieves state-of-the-art multi-task
+performance and even outperforms single-task methods on certain tasks.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Project page: https://github.com/fanglaosi/Skeleton-in-Context</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ MedNeXt: <span class="highlight-title">Transformer</span>-driven Scaling of ConvNets for Medical Image
+  Segmentation <span class="chip">MICCAI 2023</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2303.09975v5">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2303.09975v5.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Saikat Roy, Gregor Koehler, Constantin Ulrich, Michael Baumgartner, Jens Petersen, Fabian Isensee, Paul F. Jaeger, Klaus Maier-Hein
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  There has been exploding interest in embracing Transformer-based
+architectures for medical image segmentation. However, the lack of large-scale
+annotated medical datasets make achieving performances equivalent to those in
+natural images challenging. Convolutional networks, in contrast, have higher
+inductive biases and consequently, are easily trainable to high performance.
+Recently, the ConvNeXt architecture attempted to modernize the standard ConvNet
+by mirroring Transformer blocks. In this work, we improve upon this to design a
+modernized and scalable convolutional architecture customized to challenges of
+data-scarce medical settings. We introduce MedNeXt, a Transformer-inspired
+large kernel segmentation network which introduces - 1) A fully ConvNeXt 3D
+Encoder-Decoder Network for medical image segmentation, 2) Residual ConvNeXt up
+and downsampling blocks to preserve semantic richness across scales, 3) A novel
+technique to iteratively increase kernel sizes by upsampling small kernel
+networks, to prevent performance saturation on limited medical data, 4)
+Compound scaling at multiple levels (depth, width, kernel size) of MedNeXt.
+This leads to state-of-the-art performance on 4 tasks on CT and MRI modalities
+and varying dataset sizes, representing a modernized deep architecture for
+medical image segmentation. Our code is made publicly available at:
+https://github.com/MIC-DKFZ/MedNeXt.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted at MICCAI 2023</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Momentor: Advancing Video Large Language Model with Fine-Grained
+  Temporal Reasoning <span class="chip">ICML 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2402.11435v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2402.11435v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Long Qian, Juncheng Li, Yu Wu, Yaobo Ye, Hao Fei, Tat-Seng Chua, Yueting Zhuang, Siliang Tang
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Large Language Models (LLMs) demonstrate remarkable proficiency in
+comprehending and handling text-based tasks. Many efforts are being made to
+transfer these attributes to video modality, which are termed Video-LLMs.
+However, existing Video-LLMs can only capture the coarse-grained semantics and
+are unable to effectively handle tasks related to comprehension or localization
+of specific video segments. In light of these challenges, we propose Momentor,
+a Video-LLM capable of accomplishing fine-grained temporal understanding tasks.
+To support the training of Momentor, we design an automatic data generation
+engine to construct Moment-10M, a large-scale video instruction dataset with
+segment-level instruction data. We train Momentor on Moment-10M, enabling it to
+perform segment-level reasoning and localization. Zero-shot evaluations on
+several tasks demonstrate that Momentor excels in fine-grained temporally
+grounded comprehension and localization.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted by ICML 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ PatchScaler: An Efficient Patch-Independent Diffusion Model for
+  Super-Resolution 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.17158v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.17158v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Yong Liu, Hang Dong, Jinshan Pan, Qingji Dong, Kai Chen, Rongxiang Zhang, Xing Mei, Lean Fu, Fei Wang
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Diffusion models significantly improve the quality of super-resolved images
+with their impressive content generation capabilities. However, the huge
+computational costs limit the applications of these methods.Recent efforts have
+explored reasonable inference acceleration to reduce the number of sampling
+steps, but the computational cost remains high as each step is performed on the
+entire image.This paper introduces PatchScaler, a patch-independent
+diffusion-based single image super-resolution (SR) method, designed to enhance
+the efficiency of the inference process.The proposed method is motivated by the
+observation that not all the image patches within an image need the same
+sampling steps for reconstructing high-resolution images.Based on this
+observation, we thus develop a Patch-adaptive Group Sampling (PGS) to divide
+feature patches into different groups according to the patch-level
+reconstruction difficulty and dynamically assign an appropriate sampling
+configuration for each group so that the inference speed can be better
+accelerated.In addition, to improve the denoising ability at each step of the
+sampling, we develop a texture prompt to guide the estimations of the diffusion
+model by retrieving high-quality texture priors from a patch-independent
+reference texture memory.Experiments show that our PatchScaler achieves
+favorable performance in both quantitative and qualitative evaluations with
+fast inference speed.Our code and model are available at
+\url{https://github.com/yongliuy/PatchScaler}.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Attribute Based Interpretable Evaluation Metrics for Generative Models <span class="chip">ICML2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2310.17261v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2310.17261v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Dongkyun Kim, Mingi Kwon, Youngjung Uh
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  When the training dataset comprises a 1:1 proportion of dogs to cats, a
+generative model that produces 1:1 dogs and cats better resembles the training
+species distribution than another model with 3:1 dogs and cats. Can we capture
+this phenomenon using existing metrics? Unfortunately, we cannot, because these
+metrics do not provide any interpretability beyond "diversity". In this
+context, we propose a new evaluation protocol that measures the divergence of a
+set of generated images from the training set regarding the distribution of
+attribute strengths as follows. Single-attribute Divergence (SaD) measures the
+divergence regarding PDFs of a single attribute. Paired-attribute Divergence
+(PaD) measures the divergence regarding joint PDFs of a pair of attributes.
+They provide which attributes the models struggle. For measuring the attribute
+strengths of an image, we propose Heterogeneous CLIPScore (HCS) which measures
+the cosine similarity between image and text vectors with heterogeneous initial
+points. With SaD and PaD, we reveal the following about existing generative
+models. ProjectedGAN generates implausible attribute relationships such as a
+baby with a beard even though it has competitive scores of existing metrics.
+Diffusion models struggle to capture diverse colors in the datasets. The larger
+sampling timesteps of latent diffusion model generate the more minor objects
+including earrings and necklaces. Stable Diffusion v1.5 better captures the
+attributes than v2.1. Our metrics lay a foundation for explainable evaluations
+of generative models.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted by ICML2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ WorDepth: Variational Language Prior for Monocular Depth Estimation 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2404.03635v4">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2404.03635v4.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Ziyao Zeng, Daniel Wang, Fengyu Yang, Hyoungseob Park, Yangchao Wu, Stefano Soatto, Byung-Woo Hong, Dong Lao, Alex Wong
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Three-dimensional (3D) reconstruction from a single image is an ill-posed
+problem with inherent ambiguities, i.e. scale. Predicting a 3D scene from text
+description(s) is similarly ill-posed, i.e. spatial arrangements of objects
+described. We investigate the question of whether two inherently ambiguous
+modalities can be used in conjunction to produce metric-scaled reconstructions.
+To test this, we focus on monocular depth estimation, the problem of predicting
+a dense depth map from a single image, but with an additional text caption
+describing the scene. To this end, we begin by encoding the text caption as a
+mean and standard deviation; using a variational framework, we learn the
+distribution of the plausible metric reconstructions of 3D scenes corresponding
+to the text captions as a prior. To "select" a specific reconstruction or depth
+map, we encode the given image through a conditional sampler that samples from
+the latent space of the variational text encoder, which is then decoded to the
+output depth map. Our approach is trained alternatingly between the text and
+image branches: in one optimization step, we predict the mean and standard
+deviation from the text description and sample from a standard Gaussian, and in
+the other, we sample using a (image) conditional sampler. Once trained, we
+directly predict depth from the encoded text using the conditional sampler. We
+demonstrate our approach on indoor (NYUv2) and outdoor (KITTI) scenarios, where
+we show that language can consistently improve performance in both.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Towards Safe and Reliable Autonomous Driving: Dynamic Occupancy Set
+  Prediction 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2402.19385v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2402.19385v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Wenbo Shao, Jiahui Xu, Wenhao Yu, Jun Li, Hong Wang
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  In the rapidly evolving field of autonomous driving, reliable prediction is
+pivotal for vehicular safety. However, trajectory predictions often deviate
+from actual paths, particularly in complex and challenging environments,
+leading to significant errors. To address this issue, our study introduces a
+novel method for Dynamic Occupancy Set (DOS) prediction, it effectively
+combines advanced trajectory prediction networks with a DOS prediction module,
+overcoming the shortcomings of existing models. It provides a comprehensive and
+adaptable framework for predicting the potential occupancy sets of traffic
+participants. The innovative contributions of this study include the
+development of a novel DOS prediction model specifically tailored for
+navigating complex scenarios, the introduction of precise DOS mathematical
+representations, and the formulation of optimized loss functions that
+collectively advance the safety and efficiency of autonomous systems. Through
+rigorous validation, our method demonstrates marked improvements over
+traditional models, establishing a new benchmark for safety and operational
+efficiency in intelligent transportation systems.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted by IEEE IV 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Large-scale DSM registration via motion averaging 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.19442v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.19442v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Ningli Xu, Rongjun Qin
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Generating wide-area digital surface models (DSMs) requires registering a
+large number of individual, and partially overlapped DSMs. This presents a
+challenging problem for a typical registration algorithm, since when a large
+number of observations from these multiple DSMs are considered, it may easily
+cause memory overflow. Sequential registration algorithms, although can
+significantly reduce the computation, are especially vulnerable for small
+overlapped pairs, leading to a large error accumulation. In this work, we
+propose a novel solution that builds the DSM registration task as a motion
+averaging problem: pair-wise DSMs are registered to build a scene graph, with
+edges representing relative poses between DSMs. Specifically, based on the grid
+structure of the large DSM, the pair-wise registration is performed using a
+novel nearest neighbor search method. We show that the scene graph can be
+optimized via an extremely fast motion average algorithm with O(N) complexity
+(N refers to the number of images). Evaluation of high-resolution
+satellite-derived DSM demonstrates significant improvement in computation and
+accuracy.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>9 Figures</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Beyond Sole Strength: Customized Ensembles for Generalized
+  Vision-Language Models <span class="chip">ICML 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2311.17091v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2311.17091v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Zhihe Lu, Jiawang Bai, Xin Li, Zeyu Xiao, Xinchao Wang
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Fine-tuning pre-trained vision-language models (VLMs), e.g., CLIP, for the
+open-world generalization has gained increasing popularity due to its practical
+value. However, performance advancements are limited when relying solely on
+intricate algorithmic designs for a single model, even one exhibiting strong
+performance, e.g., CLIP-ViT-B/16. This paper, for the first time, explores the
+collaborative potential of leveraging much weaker VLMs to enhance the
+generalization of a robust single model. The affirmative findings motivate us
+to address the generalization problem from a novel perspective, i.e., ensemble
+of pre-trained VLMs. We introduce three customized ensemble strategies, each
+tailored to one specific scenario. Firstly, we introduce the zero-shot
+ensemble, automatically adjusting the logits of different models based on their
+confidence when only pre-trained VLMs are available. Furthermore, for scenarios
+with extra few-shot samples, we propose the training-free and tuning ensemble,
+offering flexibility based on the availability of computing resources. The
+proposed ensemble strategies are evaluated on zero-shot, base-to-new, and
+cross-dataset generalization, achieving new state-of-the-art performance.
+Notably, this work represents an initial stride toward enhancing the
+generalization performance of VLMs via ensemble. The code is available at
+https://github.com/zhiheLu/Ensemble_VLM.git.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted on ICML 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ X-Ray: A Sequential 3D Representation For Generation 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2404.14329v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2404.14329v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Tao Hu, Wenhang Ge, Yuyang Zhao, Gim Hee Lee
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  We introduce X-Ray, a novel 3D sequential representation inspired by the
+penetrability of x-ray scans. X-Ray transforms a 3D object into a series of
+surface frames at different layers, making it suitable for generating 3D models
+from images. Our method utilizes ray casting from the camera center to capture
+geometric and textured details, including depth, normal, and color, across all
+intersected surfaces. This process efficiently condenses the whole 3D object
+into a multi-frame video format, motivating the utilize of a network
+architecture similar to those in video diffusion models. This design ensures an
+efficient 3D representation by focusing solely on surface information. Also, we
+propose a two-stage pipeline to generate 3D objects from X-Ray Diffusion Model
+and Upsampler. We demonstrate the practicality and adaptability of our X-Ray
+representation by synthesizing the complete visible and hidden surfaces of a 3D
+object from a single input image. Experimental results reveal the
+state-of-the-art superiority of our representation in enhancing the accuracy of
+3D generation, paving the way for new 3D representation research and practical
+applications.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Robust Emotion Recognition in Context Debiasing <span class="chip">CVPR 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2403.05963v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2403.05963v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Dingkang Yang, Kun Yang, Mingcheng Li, Shunli Wang, Shuaibing Wang, Lihua Zhang
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Context-aware emotion recognition (CAER) has recently boosted the practical
+applications of affective computing techniques in unconstrained environments.
+Mainstream CAER methods invariably extract ensemble representations from
+diverse contexts and subject-centred characteristics to perceive the target
+person's emotional state. Despite advancements, the biggest challenge remains
+due to context bias interference. The harmful bias forces the models to rely on
+spurious correlations between background contexts and emotion labels in
+likelihood estimation, causing severe performance bottlenecks and confounding
+valuable context priors. In this paper, we propose a counterfactual emotion
+inference (CLEF) framework to address the above issue. Specifically, we first
+formulate a generalized causal graph to decouple the causal relationships among
+the variables in CAER. Following the causal graph, CLEF introduces a
+non-invasive context branch to capture the adverse direct effect caused by the
+context bias. During the inference, we eliminate the direct context effect from
+the total causal effect by comparing factual and counterfactual outcomes,
+resulting in bias mitigation and robust prediction. As a model-agnostic
+framework, CLEF can be readily integrated into existing methods, bringing
+consistent performance gains.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted by CVPR 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ GLaMM: Pixel Grounding Large Multimodal Model <span class="chip">CVPR 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2311.03356v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2311.03356v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Hanoona Rasheed, Muhammad Maaz, Sahal Shaji Mullappilly, Abdelrahman Shaker, Salman Khan, Hisham Cholakkal, Rao M. Anwer, Erix Xing, Ming-Hsuan Yang, Fahad S. Khan
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Large Multimodal Models (LMMs) extend Large Language Models to the vision
+domain. Initial LMMs used holistic images and text prompts to generate
+ungrounded textual responses. Recently, region-level LMMs have been used to
+generate visually grounded responses. However, they are limited to only
+referring to a single object category at a time, require users to specify the
+regions, or cannot offer dense pixel-wise object grounding. In this work, we
+present Grounding LMM (GLaMM), the first model that can generate natural
+language responses seamlessly intertwined with corresponding object
+segmentation masks. GLaMM not only grounds objects appearing in the
+conversations but is flexible enough to accept both textual and optional visual
+prompts (region of interest) as input. This empowers users to interact with the
+model at various levels of granularity, both in textual and visual domains. Due
+to the lack of standard benchmarks for the novel setting of visually Grounded
+Conversation Generation (GCG), we introduce a comprehensive evaluation protocol
+with our curated grounded conversations. Our proposed GCG task requires densely
+grounded concepts in natural scenes at a large-scale. To this end, we propose a
+densely annotated Grounding-anything Dataset (GranD) using our proposed
+automated annotation pipeline that encompasses 7.5M unique concepts grounded in
+a total of 810M regions available with segmentation masks. Besides GCG, GLaMM
+also performs effectively on several downstream tasks, e.g., referring
+expression segmentation, image and region-level captioning and vision-language
+conversations.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>CVPR 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ DynaSeg: A Deep Dynamic Fusion Method for Unsupervised Image
+  Segmentation Incorporating Feature Similarity and Spatial Continuity 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.05477v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.05477v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Boujemaa Guermazi, Naimul Khan
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Our work tackles the fundamental challenge of image segmentation in computer
+vision, which is crucial for diverse applications. While supervised methods
+demonstrate proficiency, their reliance on extensive pixel-level annotations
+limits scalability. We introduce DynaSeg, an innovative unsupervised image
+segmentation approach that overcomes the challenge of balancing feature
+similarity and spatial continuity without relying on extensive hyperparameter
+tuning. Unlike traditional methods, DynaSeg employs a dynamic weighting scheme
+that automates parameter tuning, adapts flexibly to image characteristics, and
+facilitates easy integration with other segmentation networks. By incorporating
+a Silhouette Score Phase, DynaSeg prevents undersegmentation failures where the
+number of predicted clusters might converge to one. DynaSeg uses CNN-based and
+pre-trained ResNet feature extraction, making it computationally efficient and
+more straightforward than other complex models. Experimental results showcase
+state-of-the-art performance, achieving a 12.2% and 14.12% mIOU improvement
+over current unsupervised segmentation approaches on COCO-All and COCO-Stuff
+datasets, respectively. We provide qualitative and quantitative results on five
+benchmark datasets, demonstrating the efficacy of the proposed approach.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                    </div>
+                </details>
+            </article>
+            <article>
+                <details>
+                    <Summary>
+                        Machine Learning <span class="chip" style="font-size: 60%">11</span>
+                    </Summary>
+                    <div class="details-content">
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ <span class="highlight-title">Pre-Train</span>ing Protein Bi-level Representation Through Span Mask Strategy
+  On 3D Protein Chains 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2402.01481v4">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2402.01481v4.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Jiale Zhao, Wanru Zhuang, Jia Song, Yaqi Li, Shuqi Lu
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  In recent years, there has been a surge in the development of 3D
+structure-based pre-trained protein models, representing a significant
+advancement over pre-trained protein language models in various downstream
+tasks. However, most existing structure-based pre-trained models primarily
+focus on the residue level, i.e., alpha carbon atoms, while ignoring other
+atoms like side chain atoms. We argue that modeling proteins at both residue
+and atom levels is important since the side chain atoms can also be crucial for
+numerous downstream tasks, for example, molecular docking. Nevertheless, we
+find that naively combining residue and atom information during pre-training
+typically fails. We identify a key reason is the information leakage caused by
+the inclusion of atom structure in the input, which renders residue-level
+pre-training tasks trivial and results in insufficiently expressive residue
+representations. To address this issue, we introduce a span mask pre-training
+strategy on 3D protein chains to learn meaningful representations of both
+residues and atoms. This leads to a simple yet effective approach to learning
+protein representation suitable for diverse downstream tasks. Extensive
+experimental results on binding site prediction and function prediction tasks
+demonstrate our proposed pre-training approach significantly outperforms other
+methods. Our code will be made public.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Physics of Language Models: Part 1, Learning Hierarchical Language
+  Structures 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2305.13673v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2305.13673v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Zeyuan Allen-Zhu, Yuanzhi Li
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Transformer-based language models are effective but complex, and
+understanding their inner workings is a significant challenge. Previous
+research has primarily explored how these models handle simple tasks like name
+copying or selection, and we extend this by investigating how these models
+grasp complex, recursive language structures defined by context-free grammars
+(CFGs). We introduce a family of synthetic CFGs that produce hierarchical
+rules, capable of generating lengthy sentences (e.g., hundreds of tokens) that
+are locally ambiguous and require dynamic programming to parse. Despite this
+complexity, we demonstrate that generative models like GPT can accurately learn
+this CFG language and generate sentences based on it. We explore the model's
+internals, revealing that its hidden states precisely capture the structure of
+CFGs, and its attention patterns resemble the information passing in a dynamic
+programming algorithm.
+  This paper also presents several corollaries, including showing why
+positional embedding is inferior to relative attention or rotary embedding;
+demonstrating that encoder-based models (e.g., BERT, deBERTa) cannot learn very
+deeply nested CFGs as effectively as generative models (e.g., GPT); and
+highlighting the necessity of adding structural and syntactic errors to the
+pretraining data to make the model more robust to corrupted language prefixes.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>V2+V3 polishes writing; V3 includes Figures 6 and 10 for better
+  illustrations of our results</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Slot Abstractors: Toward Scalable Abstract Visual Reasoning 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2403.03458v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2403.03458v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Shanka Subhra Mondal, Jonathan D. Cohen, Taylor W. Webb
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Abstract visual reasoning is a characteristically human ability, allowing the
+identification of relational patterns that are abstracted away from object
+features, and the systematic generalization of those patterns to unseen
+problems. Recent work has demonstrated strong systematic generalization in
+visual reasoning tasks involving multi-object inputs, through the integration
+of slot-based methods used for extracting object-centric representations
+coupled with strong inductive biases for relational abstraction. However, this
+approach was limited to problems containing a single rule, and was not scalable
+to visual reasoning problems containing a large number of objects. Other recent
+work proposed Abstractors, an extension of Transformers that incorporates
+strong relational inductive biases, thereby inheriting the Transformer's
+scalability and multi-head architecture, but it has yet to be demonstrated how
+this approach might be applied to multi-object visual inputs. Here we combine
+the strengths of the above approaches and propose Slot Abstractors, an approach
+to abstract visual reasoning that can be scaled to problems involving a large
+number of objects and multiple relations among them. The approach displays
+state-of-the-art performance across four abstract visual reasoning tasks, as
+well as an abstract reasoning task involving real-world images.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>18 pages, 9 figures</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Scalable Multi-modal Model Predictive Control via Duality-based
+  Interaction Predictions 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2402.01116v4">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2402.01116v4.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Hansung Kim, Siddharth H. Nair, Francesco Borrelli
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  We propose a hierarchical architecture designed for scalable real-time Model
+Predictive Control (MPC) in complex, multi-modal traffic scenarios. This
+architecture comprises two key components: 1) RAID-Net, a novel attention-based
+Recurrent Neural Network that predicts relevant interactions along the MPC
+prediction horizon between the autonomous vehicle and the surrounding vehicles
+using Lagrangian duality, and 2) a reduced Stochastic MPC problem that
+eliminates irrelevant collision avoidance constraints, enhancing computational
+efficiency. Our approach is demonstrated in a simulated traffic intersection
+with interactive surrounding vehicles, showcasing a 12x speed-up in solving the
+motion planning problem. A video demonstrating the proposed architecture in
+multiple complex traffic scenarios can be found here:
+https://youtu.be/-pRiOnPb9_c. GitHub:
+https://github.com/MPC-Berkeley/hmpc_raidnet
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted at IEEE Intelligent Vehicles Symposium 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Peeking with PEAK: Sequential, Nonparametric Composite Hypothesis Tests
+  for Means of Multiple Data Streams <span class="chip">ICML 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2402.06122v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2402.06122v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Brian Cho, Kyra Gan, Nathan Kallus
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  We propose a novel nonparametric sequential test for composite hypotheses for
+means of multiple data streams. Our proposed method, \emph{peeking with
+expectation-based averaged capital} (PEAK), builds upon the testing-by-betting
+framework and provides a non-asymptotic $\alpha$-level test across any stopping
+time. Our contributions are two-fold: (1) we propose a novel betting scheme and
+provide theoretical guarantees on type-I error control, power, and asymptotic
+growth rate/$e$-power in the setting of a single data stream; (2) we introduce
+PEAK, a generalization of this betting scheme to multiple streams, that (i)
+avoids using wasteful union bounds via averaging, (ii) is a test of power one
+under mild regularity conditions on the sampling scheme of the streams, and
+(iii) reduces computational overhead when applying the testing-as-betting
+approaches for pure-exploration bandit problems. We illustrate the practical
+benefits of PEAK using both synthetic and real-world HeartSteps datasets. Our
+experiments show that PEAK provides up to an 85\% reduction in the number of
+samples before stopping compared to existing stopping rules for
+pure-exploration bandit problems, and matches the performance of
+state-of-the-art sequential tests while improving upon computational
+complexity.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>To appear at the Forty-first International Conference on Machine
+  Learning (ICML 2024)</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Least Squares Regression Can Exhibit Under-Parameterized Double Descent 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2305.14689v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2305.14689v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Xinyue Li, Rishi Sonthalia
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  The relationship between the number of training data points, the number of
+parameters, and the generalization capabilities has been widely studied.
+Previous work has shown that double descent can occur in the over-parameterized
+regime, and believe that the standard bias-variance trade-off holds in the
+under-parameterized regime. These works provide multiple reasons for the
+existence of the peak. We postulate that the location of the peak depends on
+the technical properties of both the spectrum as well as the eigenvectors of
+the sample covariance. We present two simple examples that provably exhibit
+double descent in the under-parameterized regime and do not seem to occur for
+reasons provided in prior work.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Hypergraph-MLP: Learning on Hypergraphs without Message Passing <span class="chip">ICASSP 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2312.09778v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2312.09778v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Bohan Tang, Siheng Chen, Xiaowen Dong
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Hypergraphs are vital in modelling data with higher-order relations
+containing more than two entities, gaining prominence in machine learning and
+signal processing. Many hypergraph neural networks leverage message passing
+over hypergraph structures to enhance node representation learning, yielding
+impressive performances in tasks like hypergraph node classification. However,
+these message-passing-based models face several challenges, including
+oversmoothing as well as high latency and sensitivity to structural
+perturbations at inference time. To tackle those challenges, we propose an
+alternative approach where we integrate the information about hypergraph
+structures into training supervision without explicit message passing, thus
+also removing the reliance on it at inference. Specifically, we introduce
+Hypergraph-MLP, a novel learning framework for hypergraph-structured data,
+where the learning model is a straightforward multilayer perceptron (MLP)
+supervised by a loss function based on a notion of signal smoothness on
+hypergraphs. Experiments on hypergraph node classification tasks demonstrate
+that Hypergraph-MLP achieves competitive performance compared to existing
+baselines, and is considerably faster and more robust against structural
+perturbations at inference.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted by ICASSP 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Preference Fine-Tuning of LLMs Should Leverage Suboptimal, On-Policy
+  Data <span class="chip">ICML</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2404.14367v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2404.14367v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Fahim Tajwar, Anikait Singh, Archit Sharma, Rafael Rafailov, Jeff Schneider, Tengyang Xie, Stefano Ermon, Chelsea Finn, Aviral Kumar
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Learning from preference labels plays a crucial role in fine-tuning large
+language models. There are several distinct approaches for preference
+fine-tuning, including supervised learning, on-policy reinforcement learning
+(RL), and contrastive learning. Different methods come with different
+implementation tradeoffs and performance differences, and existing empirical
+findings present different conclusions, for instance, some results show that
+online RL is quite important to attain good fine-tuning results, while others
+find (offline) contrastive or even purely supervised methods sufficient. This
+raises a natural question: what kind of approaches are important for
+fine-tuning with preference data and why? In this paper, we answer this
+question by performing a rigorous analysis of a number of fine-tuning
+techniques on didactic and full-scale LLM problems. Our main finding is that,
+in general, approaches that use on-policy sampling or attempt to push down the
+likelihood on certain responses (i.e., employ a "negative gradient") outperform
+offline and maximum likelihood objectives. We conceptualize our insights and
+unify methods that use on-policy sampling or negative gradient under a notion
+of mode-seeking objectives for categorical distributions. Mode-seeking
+objectives are able to alter probability mass on specific bins of a categorical
+distribution at a fast rate compared to maximum likelihood, allowing them to
+relocate masses across bins more effectively. Our analysis prescribes
+actionable insights for preference fine-tuning of LLMs and informs how data
+should be collected for maximal improvement.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>International Conference on Machine Learning (ICML), 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ A Primal-Dual Algorithm for Offline Constrained Reinforcement Learning
+  with Linear MDPs 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2402.04493v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2402.04493v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Kihyuk Hong, Ambuj Tewari
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  We study offline reinforcement learning (RL) with linear MDPs under the
+infinite-horizon discounted setting which aims to learn a policy that maximizes
+the expected discounted cumulative reward using a pre-collected dataset.
+Existing algorithms for this setting either require a uniform data coverage
+assumptions or are computationally inefficient for finding an
+$\epsilon$-optimal policy with $O(\epsilon^{-2})$ sample complexity. In this
+paper, we propose a primal dual algorithm for offline RL with linear MDPs in
+the infinite-horizon discounted setting. Our algorithm is the first
+computationally efficient algorithm in this setting that achieves sample
+complexity of $O(\epsilon^{-2})$ with partial data coverage assumption. Our
+work is an improvement upon a recent work that requires $O(\epsilon^{-4})$
+samples. Moreover, we extend our algorithm to work in the offline constrained
+RL setting that enforces constraints on additional reward signals.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Improving Prototypical Part Networks with Reward Reweighing,
+  Reselection, and Retraining 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2307.03887v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2307.03887v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Aaron J. Li, Robin Netzorg, Zhihan Cheng, Zhuoqin Zhang, Bin Yu
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  In recent years, work has gone into developing deep interpretable methods for
+image classification that clearly attributes a model's output to specific
+features of the data. One such of these methods is the Prototypical Part
+Network (ProtoPNet), which attempts to classify images based on meaningful
+parts of the input. While this architecture is able to produce visually
+interpretable classifications, it often learns to classify based on parts of
+the image that are not semantically meaningful. To address this problem, we
+propose the Reward Reweighing, Reselecting, and Retraining (R3) post-processing
+framework, which performs three additional corrective updates to a pretrained
+ProtoPNet in an offline and efficient manner. The first two steps involve
+learning a reward model based on collected human feedback and then aligning the
+prototypes with human preferences. The final step is retraining, which realigns
+the base features and the classifier layer of the original model with the
+updated prototypes. We find that our R3 framework consistently improves both
+the interpretability and the predictive accuracy of ProtoPNet and its variants.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ GaLore: Memory-Efficient LLM Training by Gradient Low-Rank Projection <span class="chip">ICML 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2403.03507v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2403.03507v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Jiawei Zhao, Zhenyu Zhang, Beidi Chen, Zhangyang Wang, Anima Anandkumar, Yuandong Tian
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Training Large Language Models (LLMs) presents significant memory challenges,
+predominantly due to the growing size of weights and optimizer states. Common
+memory-reduction approaches, such as low-rank adaptation (LoRA), add a
+trainable low-rank matrix to the frozen pre-trained weight in each layer,
+reducing trainable parameters and optimizer states. However, such approaches
+typically underperform training with full-rank weights in both pre-training and
+fine-tuning stages since they limit the parameter search to a low-rank subspace
+and alter the training dynamics, and further, may require full-rank warm start.
+In this work, we propose Gradient Low-Rank Projection (GaLore), a training
+strategy that allows full-parameter learning but is more memory-efficient than
+common low-rank adaptation methods such as LoRA. Our approach reduces memory
+usage by up to 65.5% in optimizer states while maintaining both efficiency and
+performance for pre-training on LLaMA 1B and 7B architectures with C4 dataset
+with up to 19.7B tokens, and on fine-tuning RoBERTa on GLUE tasks. Our 8-bit
+GaLore further reduces optimizer memory by up to 82.5% and total training
+memory by 63.3%, compared to a BF16 baseline. Notably, we demonstrate, for the
+first time, the feasibility of pre-training a 7B model on consumer GPUs with
+24GB memory (e.g., NVIDIA RTX 4090) without model parallel, checkpointing, or
+offloading strategies.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>ICML 2024 (Oral)</span>
+                                        </div>
+                                </details>
+                            </article>
+                    </div>
+                </details>
+            </article>
+            <article>
+                <details>
+                    <Summary>
+                        Multimedia <span class="chip" style="font-size: 60%">7</span>
+                    </Summary>
+                    <div class="details-content">
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Robust Multi-Modal Speech In-Painting: A Sequence-to-Sequence Approach 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.00901v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.00901v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Mahsa Kadkhodaei Elyaderani, Shahram Shirani
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  The process of reconstructing missing parts of speech audio from context is
+called speech in-painting. Human perception of speech is inherently
+multi-modal, involving both audio and visual (AV) cues. In this paper, we
+introduce and study a sequence-to-sequence (seq2seq) speech in-painting model
+that incorporates AV features. Our approach extends AV speech in-painting
+techniques to scenarios where both audio and visual data may be jointly
+corrupted. To achieve this, we employ a multi-modal training paradigm that
+boosts the robustness of our model across various conditions involving acoustic
+and visual distortions. This makes our distortion-aware model a plausible
+solution for real-world challenging environments. We compare our method with
+existing transformer-based and recurrent neural network-based models, which
+attempt to reconstruct missing speech gaps ranging from a few milliseconds to
+over a second. Our experimental results demonstrate that our novel seq2seq
+architecture outperforms the state-of-the-art transformer solution by 38.8% in
+terms of enhancing speech quality and 7.14% in terms of improving speech
+intelligibility. We exploit a multi-task learning framework that simultaneously
+performs lip-reading (transcribing video components to text) while
+reconstructing missing parts of the associated speech.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Towards Point Cloud Compression for Machine Perception: A Simple and
+  Strong Baseline by Learning the Octree Depth Level Predictor 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.00791v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.00791v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Lei Liu, Zhihao Hu, Zhenghao Chen
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Point cloud compression has garnered significant interest in computer vision.
+However, existing algorithms primarily cater to human vision, while most point
+cloud data is utilized for machine vision tasks. To address this, we propose a
+point cloud compression framework that simultaneously handles both human and
+machine vision tasks. Our framework learns a scalable bit-stream, using only
+subsets for different machine vision tasks to save bit-rate, while employing
+the entire bit-stream for human vision tasks. Building on mainstream
+octree-based frameworks like VoxelContext-Net, OctAttention, and G-PCC, we
+introduce a new octree depth-level predictor. This predictor adaptively
+determines the optimal depth level for each octree constructed from a point
+cloud, controlling the bit-rate for machine vision tasks. For simpler tasks
+(\textit{e.g.}, classification) or objects/scenarios, we use fewer depth levels
+with fewer bits, saving bit-rate. Conversely, for more complex tasks
+(\textit{e.g}., segmentation) or objects/scenarios, we use deeper depth levels
+with more bits to enhance performance. Experimental results on various datasets
+(\textit{e.g}., ModelNet10, ModelNet40, ShapeNet, ScanNet, and KITTI) show that
+our point cloud compression approach improves performance for machine vision
+tasks without compromising human vision quality.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Once-for-All: Controllable Generative Image Compression with Dynamic
+  Granularity Adaption 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.00758v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.00758v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Anqi Li, Yuxi Liu, Huihui Bai, Feng Li, Runmin Cong, Meng Wang, Yao Zhao
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Although recent generative image compression methods have demonstrated
+impressive potential in optimizing the rate-distortion-perception trade-off,
+they still face the critical challenge of flexible rate adaption to diverse
+compression necessities and scenarios. To overcome this challenge, this paper
+proposes a Controllable Generative Image Compression framework, Control-GIC,
+the first capable of fine-grained bitrate adaption across a broad spectrum
+while ensuring high-fidelity and generality compression. We base Control-GIC on
+a VQGAN framework representing an image as a sequence of variable-length codes
+(i.e. VQ-indices), which can be losslessly compressed and exhibits a direct
+positive correlation with the bitrates. Therefore, drawing inspiration from the
+classical coding principle, we naturally correlate the information density of
+local image patches with their granular representations, to achieve dynamic
+adjustment of the code quantity following different granularity decisions. This
+implies we can flexibly determine a proper allocation of granularity for the
+patches to acquire desirable compression rates. We further develop a
+probabilistic conditional decoder that can trace back to historic encoded
+multi-granularity representations according to transmitted codes, and then
+reconstruct hierarchical granular features in the formalization of conditional
+probability, enabling more informative aggregation to improve reconstruction
+realism. Our experiments show that Control-GIC allows highly flexible and
+controllable bitrate adaption and even once compression on an entire dataset to
+fulfill constrained bitrate conditions. Experimental results demonstrate its
+superior performance over recent state-of-the-art methods.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Exploiting Frequency Correlation for Hyperspectral Image Reconstruction 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.00683v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.00683v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Muge Yan, Lizhi Wang, Lin Zhu, Hua Huang
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Deep priors have emerged as potent methods in hyperspectral image (HSI)
+reconstruction. While most methods emphasize space-domain learning using image
+space priors like non-local similarity, frequency-domain learning using image
+frequency priors remains neglected, limiting the reconstruction capability of
+networks. In this paper, we first propose a Hyperspectral Frequency Correlation
+(HFC) prior rooted in in-depth statistical frequency analyses of existent HSI
+datasets. Leveraging the HFC prior, we subsequently establish the frequency
+domain learning composed of a Spectral-wise self-Attention of Frequency (SAF)
+and a Spectral-spatial Interaction of Frequency (SIF) targeting low-frequency
+and high-frequency components, respectively. The outputs of SAF and SIF are
+adaptively merged by a learnable gating filter, thus achieving a thorough
+exploitation of image frequency priors. Integrating the frequency domain
+learning and the existing space domain learning, we finally develop the
+Correlation-driven Mixing Domains Transformer (CMDT) for HSI reconstruction.
+Extensive experiments highlight that our method surpasses various
+state-of-the-art (SOTA) methods in reconstruction quality and computational
+efficiency.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>14 pages, 11 figures</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Intelligent Text-Conditioned Music Generation 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.00626v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.00626v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Zhouyao Xie, Nikhil Yadala, Xinyi Chen, Jing Xi Liu
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  CLIP (Contrastive Language-Image Pre-Training) is a multimodal neural network
+trained on (text, image) pairs to predict the most relevant text caption given
+an image. It has been used extensively in image generation by connecting its
+output with a generative model such as VQGAN, with the most notable example
+being OpenAI's DALLE-2. In this project, we apply a similar approach to bridge
+the gap between natural language and music. Our model is split into two steps:
+first, we train a CLIP-like model on pairs of text and music over contrastive
+loss to align a piece of music with its most probable text caption. Then, we
+combine the alignment model with a music decoder to generate music. To the best
+of our knowledge, this is the first attempt at text-conditioned deep music
+generation. Our experiments show that it is possible to train the text-music
+alignment model using contrastive loss and train a decoder to generate music
+from text prompts.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Multimodal Multi-loss Fusion Network for Sentiment Analysis 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2308.00264v4">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2308.00264v4.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Zehui Wu, Ziwei Gong, Jaywon Koo, Julia Hirschberg
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  This paper investigates the optimal selection and fusion of feature encoders
+across multiple modalities and combines these in one neural network to improve
+sentiment detection. We compare different fusion methods and examine the impact
+of multi-loss training within the multi-modality fusion network, identifying
+surprisingly important findings relating to subnet performance. We have also
+found that integrating context significantly enhances model performance. Our
+best model achieves state-of-the-art performance for three datasets (CMU-MOSI,
+CMU-MOSEI and CH-SIMS). These results suggest a roadmap toward an optimized
+feature selection and fusion approach for enhancing sentiment detection in
+neural networks.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>First two authors contributed equally to the paper</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ WorDepth: Variational Language Prior for Monocular Depth Estimation 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2404.03635v4">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2404.03635v4.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Ziyao Zeng, Daniel Wang, Fengyu Yang, Hyoungseob Park, Yangchao Wu, Stefano Soatto, Byung-Woo Hong, Dong Lao, Alex Wong
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Three-dimensional (3D) reconstruction from a single image is an ill-posed
+problem with inherent ambiguities, i.e. scale. Predicting a 3D scene from text
+description(s) is similarly ill-posed, i.e. spatial arrangements of objects
+described. We investigate the question of whether two inherently ambiguous
+modalities can be used in conjunction to produce metric-scaled reconstructions.
+To test this, we focus on monocular depth estimation, the problem of predicting
+a dense depth map from a single image, but with an additional text caption
+describing the scene. To this end, we begin by encoding the text caption as a
+mean and standard deviation; using a variational framework, we learn the
+distribution of the plausible metric reconstructions of 3D scenes corresponding
+to the text captions as a prior. To "select" a specific reconstruction or depth
+map, we encode the given image through a conditional sampler that samples from
+the latent space of the variational text encoder, which is then decoded to the
+output depth map. Our approach is trained alternatingly between the text and
+image branches: in one optimization step, we predict the mean and standard
+deviation from the text description and sample from a standard Gaussian, and in
+the other, we sample using a (image) conditional sampler. Once trained, we
+directly predict depth from the encoded text using the conditional sampler. We
+demonstrate our approach on indoor (NYUv2) and outdoor (KITTI) scenarios, where
+we show that language can consistently improve performance in both.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                    </div>
+                </details>
+            </article>
+            <article>
+                <details>
+                    <Summary>
+                        Information Retrieval <span class="chip" style="font-size: 60%">5</span>
+                    </Summary>
+                    <div class="details-content">
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Maximum-Entropy Regularized Decision <span class="highlight-title">Transformer</span> with Reward Relabelling
+  for Dynamic Recommendation 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.00725v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.00725v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Xiaocong Chen, Siyu Wang, Lina Yao
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Reinforcement learning-based recommender systems have recently gained
+popularity. However, due to the typical limitations of simulation environments
+(e.g., data inefficiency), most of the work cannot be broadly applied in all
+domains. To counter these challenges, recent advancements have leveraged
+offline reinforcement learning methods, notable for their data-driven approach
+utilizing offline datasets. A prominent example of this is the Decision
+Transformer. Despite its popularity, the Decision Transformer approach has
+inherent drawbacks, particularly evident in recommendation methods based on it.
+This paper identifies two key shortcomings in existing Decision
+Transformer-based methods: a lack of stitching capability and limited
+effectiveness in online adoption. In response, we introduce a novel methodology
+named Max-Entropy enhanced Decision Transformer with Reward Relabeling for
+Offline RLRS (EDT4Rec). Our approach begins with a max entropy perspective,
+leading to the development of a max entropy enhanced exploration strategy. This
+strategy is designed to facilitate more effective exploration in online
+environments. Additionally, to augment the model's capability to stitch
+sub-optimal trajectories, we incorporate a unique reward relabeling technique.
+To validate the effectiveness and superiority of EDT4Rec, we have conducted
+comprehensive experiments across six real-world offline datasets and in an
+online simulator.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ A lexicon obtained and validated by a data-driven approach for organic
+  residues valorization in emerging and developing countries 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.00682v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.00682v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Christiane Rakotomalala, Jean-Marie Paillat, Frédéric Feder, Angel Avadí, Laurent Thuriès, Marie-Liesse Vermeire, Jean-Michel Médoc, Tom Wassenaar, Caroline Hottelart, Lilou Kieffer, Elisa Ndjie, Mathieu Picart, Jorel Tchamgoue, Alvin Tulle, Laurine Valade, Annie Boyer, Marie-Christine Duchamp, Mathieu Roche
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  The text mining method presented in this paper was used for annotation of
+terms related to biological transformation and valorization of organic residues
+in agriculture in low and middle-income country. Specialized lexicon was
+obtained through different steps: corpus and extraction of terms, annotation of
+extracted terms, selection of relevant terms.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>5 pages, 2 tables</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ COS-Mix: Cosine Similarity and Distance Fusion for Improved Information
+  Retrieval 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.00638v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.00638v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Kush Juvekar, Anupam Purwar
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  This study proposes a novel hybrid retrieval strategy for Retrieval-Augmented
+Generation (RAG) that integrates cosine similarity and cosine distance measures
+to improve retrieval performance, particularly for sparse data. The traditional
+cosine similarity measure is widely used to capture the similarity between
+vectors in high-dimensional spaces. However, it has been shown that this
+measure can yield arbitrary results in certain scenarios. To address this
+limitation, we incorporate cosine distance measures to provide a complementary
+perspective by quantifying the dissimilarity between vectors. Our approach is
+experimented on proprietary data, unlike recent publications that have used
+open-source datasets. The proposed method demonstrates enhanced retrieval
+performance and provides a more comprehensive understanding of the semantic
+relationships between documents or items. This hybrid strategy offers a
+promising solution for efficiently and accurately retrieving relevant
+information in knowledge-intensive applications, leveraging techniques such as
+BM25 (sparse) retrieval , vector (Dense) retrieval, and cosine distance based
+retrieval to facilitate efficient information retrieval.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Making Recommender Systems More Knowledgeable: A Framework to
+  Incorporate Side Information 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.00615v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.00615v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Yukun Jiang, Leo Guo, Xinyi Chen, Jing Xi Liu
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Session-based recommender systems typically focus on using only the triplet
+(user_id, timestamp, item_id) to make predictions of users' next actions. In
+this paper, we aim to utilize side information to help recommender systems
+catch patterns and signals otherwise undetectable. Specifically, we propose a
+general framework for incorporating item-specific side information into the
+recommender system to enhance its performance without much modification on the
+original model architecture. Experimental results on several models and
+datasets prove that with side information, our recommender system outperforms
+state-of-the-art models by a considerable margin and converges much faster.
+Additionally, we propose a new type of loss to regularize the attention
+mechanism used by recommender systems and evaluate its influence on model
+performance. Furthermore, through analysis, we put forward a few insights on
+potential further improvements.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>15 pages, 8 figures</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Know Your Neighborhood: General and Zero-Shot Capable Binary Function
+  Search Powered by Call Graphlets 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.02606v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.02606v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Joshua Collyer, Tim Watson, Iain Phillips
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Binary code similarity detection is an important problem with applications in
+areas like malware analysis, vulnerability research and plagiarism detection.
+This paper proposes a novel graph neural network architecture combined with a
+novel graph data representation called call graphlets. A call graphlet encodes
+the neighborhood around each function in a binary executable, capturing the
+local and global context through a series of statistical features. A
+specialized graph neural network model is then designed to operate on this
+graph representation, learning to map it to a feature vector that encodes
+semantic code similarities using deep metric learning. The proposed approach is
+evaluated across four distinct datasets covering different architectures,
+compiler toolchains, and optimization levels. Experimental results demonstrate
+that the combination of call graphlets and the novel graph neural network
+architecture achieves state-of-the-art performance compared to baseline
+techniques across cross-architecture, mono-architecture and zero shot tasks. In
+addition, our proposed approach also performs well when evaluated against an
+out-of-domain function inlining task. Overall, the work provides a general and
+effective graph neural network-based solution for conducting binary code
+similarity detection.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                    </div>
+                </details>
+            </article>
+    </section>
+    <section class="day-container">
+        <div class="date">
+            <time datetime="2024-06-01T00:00:00Z">2024-06-01</time>
+        </div>
+            <article>
+                <details>
+                    <Summary>
+                        Computation and Language <span class="chip" style="font-size: 60%">26</span>
+                    </Summary>
+                    <div class="details-content">
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Large Language Models Can Learn Temporal Reasoning <span class="chip">ACL24</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2401.06853v4">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2401.06853v4.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Siheng Xiong, Ali Payani, Ramana Kompella, Faramarz Fekri
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  While large language models (LLMs) have demonstrated remarkable reasoning
+capabilities, they are not without their flaws and inaccuracies. Recent studies
+have introduced various methods to mitigate these limitations. Temporal
+reasoning (TR), in particular, presents a significant challenge for LLMs due to
+its reliance on diverse temporal concepts and intricate temporal logic. In this
+paper, we propose TG-LLM, a novel framework towards language-based TR. Instead
+of reasoning over the original context, we adopt a latent representation,
+temporal graph (TG) that enhances the learning of TR. A synthetic dataset
+(TGQA), which is fully controllable and requires minimal supervision, is
+constructed for fine-tuning LLMs on this text-to-TG translation task. We
+confirmed in experiments that the capability of TG translation learned on our
+dataset can be transferred to other TR tasks and benchmarks. On top of that, we
+teach LLM to perform deliberate reasoning over the TGs via Chain-of-Thought
+(CoT) bootstrapping and graph data augmentation. We observed that those
+strategies, which maintain a balance between usefulness and diversity, bring
+more reliable CoTs and final results than the vanilla CoT distillation.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>ACL24 (main)</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Lumos : Empowering Multimodal LLMs with Scene Text Recognition <span class="chip">KDD 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2402.08017v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2402.08017v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Ashish Shenoy, Yichao Lu, Srihari Jayakumar, Debojeet Chatterjee, Mohsen Moslehpour, Pierce Chuang, Abhay Harpale, Vikas Bhardwaj, Di Xu, Shicong Zhao, Longfang Zhao, Ankit Ramchandani, Xin Luna Dong, Anuj Kumar
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  We introduce Lumos, the first end-to-end multimodal question-answering system
+with text understanding capabilities. At the core of Lumos is a Scene Text
+Recognition (STR) component that extracts text from first person point-of-view
+images, the output of which is used to augment input to a Multimodal Large
+Language Model (MM-LLM). While building Lumos, we encountered numerous
+challenges related to STR quality, overall latency, and model inference. In
+this paper, we delve into those challenges, and discuss the system
+architecture, design choices, and modeling techniques employed to overcome
+these obstacles. We also provide a comprehensive evaluation for each component,
+showcasing high quality and efficiency.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted to KDD 2024 (ADS Track)</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Conformal Language Modeling <span class="chip">ICLR 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2306.10193v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2306.10193v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Victor Quach, Adam Fisch, Tal Schuster, Adam Yala, Jae Ho Sohn, Tommi S. Jaakkola, Regina Barzilay
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  We propose a novel approach to conformal prediction for generative language
+models (LMs). Standard conformal prediction produces prediction sets -- in
+place of single predictions -- that have rigorous, statistical performance
+guarantees. LM responses are typically sampled from the model's predicted
+distribution over the large, combinatorial output space of natural language.
+Translating this process to conformal prediction, we calibrate a stopping rule
+for sampling different outputs from the LM that get added to a growing set of
+candidates until we are confident that the output set is sufficient. Since some
+samples may be low-quality, we also simultaneously calibrate and apply a
+rejection rule for removing candidates from the output set to reduce noise.
+Similar to conformal prediction, we prove that the sampled set returned by our
+procedure contains at least one acceptable answer with high probability, while
+still being empirically precise (i.e., small) on average. Furthermore, within
+this set of candidate responses, we show that we can also accurately identify
+subsets of individual components -- such as phrases or sentences -- that are
+each independently correct (e.g., that are not "hallucinations"), again with
+statistical guarantees. We demonstrate the promise of our approach on multiple
+tasks in open-domain question answering, text summarization, and radiology
+report generation using different LM variants.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>ICLR 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Formalizing and Benchmarking <span class="highlight-title">Prompt</span> Injection Attacks and Defenses <span class="chip">USENIX Security</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2310.12815v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2310.12815v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Yupei Liu, Yuqi Jia, Runpeng Geng, Jinyuan Jia, Neil Zhenqiang Gong
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  A prompt injection attack aims to inject malicious instruction/data into the
+input of an LLM-Integrated Application such that it produces results as an
+attacker desires. Existing works are limited to case studies. As a result, the
+literature lacks a systematic understanding of prompt injection attacks and
+their defenses. We aim to bridge the gap in this work. In particular, we
+propose a framework to formalize prompt injection attacks. Existing attacks are
+special cases in our framework. Moreover, based on our framework, we design a
+new attack by combining existing ones. Using our framework, we conduct a
+systematic evaluation on 5 prompt injection attacks and 10 defenses with 10
+LLMs and 7 tasks. Our work provides a common benchmark for quantitatively
+evaluating future prompt injection attacks and defenses. To facilitate research
+on this topic, we make our platform public at
+https://github.com/liu00222/Open-Prompt-Injection.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>To appear in USENIX Security Symposium 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ The Critique of Critique <span class="chip">ACL 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2401.04518v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2401.04518v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Shichao Sun, Junlong Li, Weizhe Yuan, Ruifeng Yuan, Wenjie Li, Pengfei Liu
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Critique, as a natural language description for assessing the quality of
+model-generated content, has played a vital role in the training, evaluation,
+and refinement of LLMs. However, a systematic method to evaluate the quality of
+critique is lacking. In this paper, we pioneer the critique of critique, termed
+MetaCritique, which builds specific quantification criteria. To achieve a
+reliable evaluation outcome, we propose Atomic Information Units (AIUs), which
+describe the critique in a more fine-grained manner. MetaCritique aggregates
+each AIU's judgment for the overall score. Moreover, MetaCritique delivers a
+natural language rationale for the intricate reasoning within each judgment.
+Lastly, we construct a meta-evaluation dataset covering 4 tasks across 16
+public datasets involving human-written and LLM-generated critiques.
+Experiments demonstrate that MetaCritique can achieve near-human performance.
+Our study can facilitate future research in LLM critiques based on our
+following observations and released resources: (1) superior critiques judged by
+MetaCritique can lead to better refinements, indicating that it can potentially
+enhance the alignment of existing LLMs; (2) the leaderboard of critique models
+reveals that open-source critique models commonly suffer from factuality
+issues; (3) relevant code and data are publicly available at
+https://github.com/GAIR-NLP/MetaCritique to support deeper exploration; (4) an
+API at PyPI with the usage documentation in Appendix C allows users to assess
+the critique conveniently.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted to Findings of ACL 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ tnGPS: Discovering Unknown Tensor Network Structure Search Algorithms
+  via Large Language Models (LLMs) <span class="chip">ICML2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2402.02456v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2402.02456v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Junhua Zeng, Chao Li, Zhun Sun, Qibin Zhao, Guoxu Zhou
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Tensor networks are efficient for extremely high-dimensional representation,
+but their model selection, known as tensor network structure search (TN-SS), is
+a challenging problem. Although several works have targeted TN-SS, most
+existing algorithms are manually crafted heuristics with poor performance,
+suffering from the curse of dimensionality and local convergence. In this work,
+we jump out of the box, studying how to harness large language models (LLMs) to
+automatically discover new TN-SS algorithms, replacing the involvement of human
+experts. By observing how human experts innovate in research, we model their
+common workflow and propose an automatic algorithm discovery framework called
+tnGPS. The proposed framework is an elaborate prompting pipeline that instruct
+LLMs to generate new TN-SS algorithms through iterative refinement and
+enhancement. The experimental results demonstrate that the algorithms
+discovered by tnGPS exhibit superior performance in benchmarks compared to the
+current state-of-the-art methods.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted by ICML2024, pre-printed version</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ S3D: A Simple and Cost-Effective Self-Speculative Decoding Scheme for
+  Low-Memory GPUs 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20314v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20314v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Wei Zhong, Manasa Bharadwaj
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Speculative decoding (SD) has attracted a significant amount of research
+attention due to the substantial speedup it can achieve for LLM inference.
+However, despite the high speedups they offer, speculative decoding methods
+often achieve optimal performance on high-end devices or with a substantial GPU
+memory overhead. Given limited memory and the necessity of quantization, a
+high-performing model on a high-end GPU can slow down by up to 7 times. To this
+end, we propose Skippy Simultaneous Speculative Decoding (or S3D), a
+cost-effective self-speculative SD method based on simultaneous multi-token
+decoding and mid-layer skipping. When compared against recent effective
+open-source SD systems, our method has achieved one of the top
+performance-memory ratios while requiring minimal architecture changes and
+training data. Leveraging our memory efficiency, we created a smaller yet more
+effective SD model based on Phi-3. It is 1.4 to 2 times faster than the
+quantized EAGLE model and operates in half-precision while using less VRAM.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Position: Key Claims in LLM Research Have a Long Tail of Footnotes <span class="chip">ICML 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2308.07120v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2308.07120v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Anna Rogers, Alexandra Sasha Luccioni
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Much of the recent discourse within the ML community has been centered around
+Large Language Models (LLMs), their functionality and potential -- yet not only
+do we not have a working definition of LLMs, but much of this discourse relies
+on claims and assumptions that are worth re-examining. We contribute a
+definition of LLMs, critically examine five common claims regarding their
+properties (including 'emergent properties'), and conclude with suggestions for
+future research directions and their framing.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>ICML 2024 camera-ready (https://openreview.net/forum?id=M2cwkGleRL)</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Deep Learning for Assessment of Oral Reading Fluency 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.19426v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.19426v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Mithilesh Vaidya, Binaya Kumar Sahoo, Preeti Rao
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Reading fluency assessment is a critical component of literacy programmes,
+serving to guide and monitor early education interventions. Given the resource
+intensive nature of the exercise when conducted by teachers, the development of
+automatic tools that can operate on audio recordings of oral reading is
+attractive as an objective and highly scalable solution. Multiple complex
+aspects such as accuracy, rate and expressiveness underlie human judgements of
+reading fluency. In this work, we investigate end-to-end modeling on a training
+dataset of children's audio recordings of story texts labeled by human experts.
+The pre-trained wav2vec2.0 model is adopted due its potential to alleviate the
+challenges from the limited amount of labeled data. We report the performance
+of a number of system variations on the relevant measures, and also probe the
+learned embeddings for lexical and acoustic-prosodic features known to be
+important to the perception of reading fluency.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ PrivLM-Bench: A Multi-level Privacy Evaluation Benchmark for Language
+  Models <span class="chip">ACL 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2311.04044v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2311.04044v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Haoran Li, Dadi Guo, Donghao Li, Wei Fan, Qi Hu, Xin Liu, Chunkit Chan, Duanyi Yao, Yuan Yao, Yangqiu Song
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  The rapid development of language models (LMs) brings unprecedented
+accessibility and usage for both models and users. On the one hand, powerful
+LMs achieve state-of-the-art performance over numerous downstream NLP tasks. On
+the other hand, more and more attention is paid to unrestricted model accesses
+that may bring malicious privacy risks of data leakage. To address these
+issues, many recent works propose privacy-preserving language models (PPLMs)
+with differential privacy (DP). Unfortunately, different DP implementations
+make it challenging for a fair comparison among existing PPLMs. In this paper,
+we present PrivLM-Bench, a multi-perspective privacy evaluation benchmark to
+empirically and intuitively quantify the privacy leakage of LMs. Instead of
+only reporting DP parameters, PrivLM-Bench sheds light on the neglected
+inference data privacy during actual usage. PrivLM-Bench first clearly defines
+multi-faceted privacy objectives. Then, PrivLM-Bench constructs a unified
+pipeline to perform private fine-tuning. Lastly, PrivLM-Bench performs existing
+privacy attacks on LMs with pre-defined privacy objectives as the empirical
+evaluation results. The empirical attack results are used to fairly and
+intuitively evaluate the privacy leakage of various PPLMs. We conduct extensive
+experiments on three datasets of GLUE for mainstream LMs.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>To appear at ACL 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ SLEB: Streamlining LLMs through Redundancy Verification and Elimination
+  of <span class="highlight-title">Transformer</span> Blocks 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2402.09025v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2402.09025v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Jiwon Song, Kyungseok Oh, Taesu Kim, Hyungjun Kim, Yulhwa Kim, Jae-Joon Kim
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Large language models (LLMs) have proven to be highly effective across
+various natural language processing tasks. However, their large number of
+parameters poses significant challenges for practical deployment. Pruning, a
+technique aimed at reducing the size and complexity of LLMs, offers a potential
+solution by removing redundant components from the network. Despite the promise
+of pruning, existing methods often struggle to achieve substantial end-to-end
+LLM inference speedup. In this paper, we introduce SLEB, a novel approach
+designed to streamline LLMs by eliminating redundant transformer blocks. We
+choose the transformer block as the fundamental unit for pruning, because LLMs
+exhibit block-level redundancy with high similarity between the outputs of
+neighboring blocks. This choice allows us to effectively enhance the processing
+speed of LLMs. Our experimental results demonstrate that SLEB outperforms
+previous LLM pruning methods in accelerating LLM inference while also
+maintaining superior perplexity and accuracy, making SLEB as a promising
+technique for enhancing the efficiency of LLMs. The code is available at:
+https://github.com/jiwonsong-dev/SLEB.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Enhancing Consistency and Role-Specific Knowledge Capturing by
+  Rebuilding Fictional Character's Persona 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.19778v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.19778v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Jeiyoon Park, Chanjun Park, Heuiseok Lim
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  With the recent introduction of Assistants API, it is expected that
+document-based language models will be actively used in various domains,
+especially Role-playing. However, a key challenge lies in utilizing
+protagonist's persona: Assistants API often fails to achieve with its search
+because the information extraction part is different each time and it often
+omits important information such as protagonist's backstory or relationships.
+It is hard to maintain a consistent persona simply by using the persona
+document as input to the Assistants API. To address the challenge of achieving
+stable persona consistency, we propose CharacterGPT, a novel persona
+reconstruction framework to alleviate the shortcomings of the Assistants API.
+Our method involves Character Persona Training (CPT), an effective persona
+rebuilding process that updates the character persona by extracting the
+character's traits from given summary of the novel for each character as if the
+story in a novel progresses. In our experiments, we ask each character to take
+the Big Five Inventory personality test in various settings and analyze the
+results. To assess whether it can think outside the box, we let each character
+generate short novels. Extensive experiments and human evaluation demonstrate
+that CharacterGPT presents new possibilities for role-playing agent research.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>preprint</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Estimating the Level of Dialectness Predicts Interannotator Agreement in
+  Multi-dialect Arabic <span class="highlight-title">Dataset</span>s <span class="chip">ACL 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.11282v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.11282v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Amr Keleg, Walid Magdy, Sharon Goldwater
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  On annotating multi-dialect Arabic datasets, it is common to randomly assign
+the samples across a pool of native Arabic speakers. Recent analyses
+recommended routing dialectal samples to native speakers of their respective
+dialects to build higher-quality datasets. However, automatically identifying
+the dialect of samples is hard. Moreover, the pool of annotators who are native
+speakers of specific Arabic dialects might be scarce. Arabic Level of
+Dialectness (ALDi) was recently introduced as a quantitative variable that
+measures how sentences diverge from Standard Arabic. On randomly assigning
+samples to annotators, we hypothesize that samples of higher ALDi scores are
+harder to label especially if they are written in dialects that the annotators
+do not speak. We test this by analyzing the relation between ALDi scores and
+the annotators' agreement, on 15 public datasets having raw individual sample
+annotations for various sentence-classification tasks. We find strong evidence
+supporting our hypothesis for 11 of them. Consequently, we recommend
+prioritizing routing samples of high ALDi scores to native speakers of each
+sample's dialect, for which the dialect could be automatically identified at
+higher accuracies.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted to ACL 2024 - Main (camera-ready version)</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ A Semantic Distance Metric Learning approach for Lexical Semantic Change
+  Detection <span class="chip">ACL2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2403.00226v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2403.00226v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Taichi Aida, Danushka Bollegala
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Detecting temporal semantic changes of words is an important task for various
+NLP applications that must make time-sensitive predictions. Lexical Semantic
+Change Detection (SCD) task involves predicting whether a given target word,
+$w$, changes its meaning between two different text corpora, $C_1$ and $C_2$.
+For this purpose, we propose a supervised two-staged SCD method that uses
+existing Word-in-Context (WiC) datasets. In the first stage, for a target word
+$w$, we learn two sense-aware encoders that represent the meaning of $w$ in a
+given sentence selected from a corpus. Next, in the second stage, we learn a
+sense-aware distance metric that compares the semantic representations of a
+target word across all of its occurrences in $C_1$ and $C_2$. Experimental
+results on multiple benchmark datasets for SCD show that our proposed method
+achieves strong performance in multiple languages. Additionally, our method
+achieves significant improvements on WiC benchmarks compared to a sense-aware
+encoder with conventional distance functions. Source code is available at
+https://github.com/LivNLP/svp-sdml .
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Findings of ACL2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ On Measuring Faithfulness or Self-consistency of Natural Language
+  Explanations <span class="chip">ACL 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2311.07466v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2311.07466v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Letitia Parcalabescu, Anette Frank
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Large language models (LLMs) can explain their predictions through post-hoc
+or Chain-of-Thought (CoT) explanations. But an LLM could make up reasonably
+sounding explanations that are unfaithful to its underlying reasoning. Recent
+work has designed tests that aim to judge the faithfulness of post-hoc or CoT
+explanations. In this work we argue that these faithfulness tests do not
+measure faithfulness to the models' inner workings -- but rather their
+self-consistency at output level. Our contributions are three-fold: i) We
+clarify the status of faithfulness tests in view of model explainability,
+characterising them as self-consistency tests instead. This assessment we
+underline by ii) constructing a Comparative Consistency Bank for
+self-consistency tests that for the first time compares existing tests on a
+common suite of 11 open LLMs and 5 tasks -- including iii) our new
+self-consistency measure CC-SHAP. CC-SHAP is a fine-grained measure (not a
+test) of LLM self-consistency. It compares how a model's input contributes to
+the predicted answer and to generating the explanation. Our fine-grained
+CC-SHAP metric allows us iii) to compare LLM behaviour when making predictions
+and to analyse the effect of other consistency tests at a deeper level, which
+takes us one step further towards measuring faithfulness by bringing us closer
+to the internals of the model than strictly surface output-oriented tests. Our
+code is available at \url{https://github.com/Heidelberg-NLP/CC-SHAP}
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Paper accepted for publication at ACL 2024 Main (Bangkok, Thailand);
+  10 main paper pages, 30 appendix pages</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ CriticBench: Benchmarking LLMs for Critique-Correct Reasoning <span class="chip">ACL 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2402.14809v4">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2402.14809v4.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Zicheng Lin, Zhibin Gou, Tian Liang, Ruilin Luo, Haowei Liu, Yujiu Yang
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  The ability of Large Language Models (LLMs) to critique and refine their
+reasoning is crucial for their application in evaluation, feedback provision,
+and self-improvement. This paper introduces CriticBench, a comprehensive
+benchmark designed to assess LLMs' abilities to critique and rectify their
+reasoning across a variety of tasks. CriticBench encompasses five reasoning
+domains: mathematical, commonsense, symbolic, coding, and algorithmic. It
+compiles 15 datasets and incorporates responses from three LLM families.
+Utilizing CriticBench, we evaluate and dissect the performance of 17 LLMs in
+generation, critique, and correction reasoning, i.e., GQC reasoning. Our
+findings reveal: (1) a linear relationship in GQC capabilities, with
+critique-focused training markedly enhancing performance; (2) a task-dependent
+variation in correction effectiveness, with logic-oriented tasks being more
+amenable to correction; (3) GQC knowledge inconsistencies that decrease as
+model size increases; and (4) an intriguing inter-model critiquing dynamic,
+where stronger models are better at critiquing weaker ones, while weaker models
+can surprisingly surpass stronger ones in their self-critique. We hope these
+insights into the nuanced critique-correct reasoning of LLMs will foster
+further research in LLM critique and self-improvement.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>ACL 2024 Findings</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ STYLE: Improving Domain Transferability of Asking Clarification
+  Questions in Large Language Model Powered Conversational Agents <span class="chip">ACL 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.12059v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.12059v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Yue Chen, Chen Huang, Yang Deng, Wenqiang Lei, Dingnan Jin, Jia Liu, Tat-Seng Chua
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Equipping a conversational search engine with strategies regarding when to
+ask clarification questions is becoming increasingly important across various
+domains. Attributing to the context understanding capability of LLMs and their
+access to domain-specific sources of knowledge, LLM-based clarification
+strategies feature rapid transfer to various domains in a post-hoc manner.
+However, they still struggle to deliver promising performance on unseen
+domains, struggling to achieve effective domain transferability. We take the
+first step to investigate this issue and existing methods tend to produce
+one-size-fits-all strategies across diverse domains, limiting their search
+effectiveness. In response, we introduce a novel method, called Style, to
+achieve effective domain transferability. Our experimental results indicate
+that Style bears strong domain transferability, resulting in an average search
+performance improvement of ~10% on four unseen domains.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted to Findings of ACL 2024. Camera Ready</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ CLAMBER: A Benchmark of Identifying and Clarifying Ambiguous Information
+  Needs in Large Language Models <span class="chip">ACL 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.12063v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.12063v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Tong Zhang, Peixin Qin, Yang Deng, Chen Huang, Wenqiang Lei, Junhong Liu, Dingnan Jin, Hongru Liang, Tat-Seng Chua
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Large language models (LLMs) are increasingly used to meet user information
+needs, but their effectiveness in dealing with user queries that contain
+various types of ambiguity remains unknown, ultimately risking user trust and
+satisfaction. To this end, we introduce CLAMBER, a benchmark for evaluating
+LLMs using a well-organized taxonomy. Building upon the taxonomy, we construct
+~12K high-quality data to assess the strengths, weaknesses, and potential risks
+of various off-the-shelf LLMs. Our findings indicate the limited practical
+utility of current LLMs in identifying and clarifying ambiguous user queries,
+even enhanced by chain-of-thought (CoT) and few-shot prompting. These
+techniques may result in overconfidence in LLMs and yield only marginal
+enhancements in identifying ambiguity. Furthermore, current LLMs fall short in
+generating high-quality clarifying questions due to a lack of conflict
+resolution and inaccurate utilization of inherent knowledge. In this paper,
+CLAMBER presents a guidance and promotes further research on proactive and
+trustworthy LLMs. Our dataset is available at
+https://github.com/zt991211/CLAMBER
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted to ACL 2024. Camera Ready. Our dataset is available at
+  https://github.com/zt991211/CLAMBER</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ ARAIDA: Analogical Reasoning-Augmented Interactive Data Annotation <span class="chip">ACL 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.11912v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.11912v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Chen Huang, Yiping Jin, Ilija Ilievski, Wenqiang Lei, Jiancheng Lv
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Human annotation is a time-consuming task that requires a significant amount
+of effort. To address this issue, interactive data annotation utilizes an
+annotation model to provide suggestions for humans to approve or correct.
+However, annotation models trained with limited labeled data are prone to
+generating incorrect suggestions, leading to extra human correction effort. To
+tackle this challenge, we propose Araida, an analogical reasoning-based
+approach that enhances automatic annotation accuracy in the interactive data
+annotation setting and reduces the need for human corrections. Araida involves
+an error-aware integration strategy that dynamically coordinates an annotation
+model and a k-nearest neighbors (KNN) model, giving more importance to KNN's
+predictions when predictions from the annotation model are deemed inaccurate.
+Empirical studies demonstrate that Araida is adaptable to different annotation
+tasks and models. On average, it reduces human correction labor by 11.02%
+compared to vanilla interactive data annotation methods.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted to ACL 2024. Camera Ready</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ SIBO: A Simple Booster for Parameter-Efficient Fine-Tuning <span class="chip">ACL 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2402.11896v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2402.11896v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Zhihao Wen, Jie Zhang, Yuan Fang
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Fine-tuning all parameters of large language models (LLMs) necessitates
+substantial computational power and extended time. Latest advancements in
+parameter-efficient fine-tuning (PEFT) techniques, such as Adapter tuning and
+LoRA, allow for adjustments to only a minor fraction of the parameters of these
+LLMs. Concurrently, it has been noted that the issue of over-smoothing
+diminishes the effectiveness of these Transformer-based LLMs, resulting in
+suboptimal performances in downstream tasks. In this paper, we present SIBO,
+which is a SImple BOoster to enhance PEFT, by injecting an initial residual.
+SIBO is straightforward and readily extensible to a range of state-of-the-art
+PEFT techniques to alleviate over-smoothing and enhance performance. Extensive
+experiments on 22 benchmark datasets demonstrate that SIBO significantly
+enhances the performance of various strong baselines, achieving up to 15.7% and
+23.5% improvement over existing PEFT methods on the arithmetic and commonsense
+reasoning tasks, respectively.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted by ACL 2024, 17 pages</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Video sentence grounding with temporally global textual knowledge 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2404.13611v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2404.13611v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Cai Chen, Runzhong Zhang, Jianjun Gao, Kejun Wu, Kim-Hui Yap, Yi Wang
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Temporal sentence grounding involves the retrieval of a video moment with a
+natural language query. Many existing works directly incorporate the given
+video and temporally localized query for temporal grounding, overlooking the
+inherent domain gap between different modalities. In this paper, we utilize
+pseudo-query features containing extensive temporally global textual knowledge
+sourced from the same video-query pair, to enhance the bridging of domain gaps
+and attain a heightened level of similarity between multi-modal features.
+Specifically, we propose a Pseudo-query Intermediary Network (PIN) to achieve
+an improved alignment of visual and comprehensive pseudo-query features within
+the feature space through contrastive learning. Subsequently, we utilize
+learnable prompts to encapsulate the knowledge of pseudo-queries, propagating
+them into the textual encoder and multi-modal fusion module, further enhancing
+the feature alignment between visual and language for better temporal
+grounding. Extensive experiments conducted on the Charades-STA and
+ActivityNet-Captions datasets demonstrate the effectiveness of our method.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ MM-SAP: A Comprehensive Benchmark for Assessing Self-Awareness of
+  Multimodal Large Language Models in Perception 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2401.07529v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2401.07529v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Yuhao Wang, Yusheng Liao, Heyang Liu, Hongcheng Liu, Yu Wang, Yanfeng Wang
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Recent advancements in Multimodal Large Language Models (MLLMs) have
+demonstrated exceptional capabilities in visual perception and understanding.
+However, these models also suffer from hallucinations, which limit their
+reliability as AI systems. We believe that these hallucinations are partially
+due to the models' struggle with understanding what they can and cannot
+perceive from images, a capability we refer to as self-awareness in perception.
+Despite its importance, this aspect of MLLMs has been overlooked in prior
+studies. In this paper, we aim to define and evaluate the self-awareness of
+MLLMs in perception. To do this, we first introduce the knowledge quadrant in
+perception, which helps define what MLLMs know and do not know about images.
+Using this framework, we propose a novel benchmark, the Self-Awareness in
+Perception for MLLMs (MM-SAP), specifically designed to assess this capability.
+We apply MM-SAP to a variety of popular MLLMs, offering a comprehensive
+analysis of their self-awareness and providing detailed insights. The
+experiment results reveal that current MLLMs possess limited self-awareness
+capabilities, pointing to a crucial area for future advancement in the
+development of trustworthy MLLMs. Code and data are available at
+https://github.com/YHWmz/MM-SAP.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Anchor-based Large Language Models <span class="chip">ACL2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2402.07616v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2402.07616v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Jianhui Pang, Fanghua Ye, Derek Fai Wong, Xin He, Wanshun Chen, Longyue Wang
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Large language models (LLMs) predominantly employ decoder-only transformer
+architectures, necessitating the retention of keys/values information for
+historical tokens to provide contextual information and avoid redundant
+computation. However, the substantial size and parameter volume of these LLMs
+require massive GPU memory. This memory demand increases with the length of the
+input text, leading to an urgent need for more efficient methods of information
+storage and processing. This study introduces Anchor-based LLMs (AnLLMs), which
+utilize an innovative anchor-based self-attention network (AnSAN) and also an
+anchor-based inference strategy. This approach enables LLMs to compress
+sequence information into an anchor token, reducing the keys/values cache and
+enhancing inference efficiency. Experiments on question-answering benchmarks
+reveal that AnLLMs maintain similar accuracy levels while achieving up to 99%
+keys/values cache reduction and up to 3.5 times faster inference. Despite a
+minor compromise in accuracy, the substantial enhancements of AnLLMs employing
+the AnSAN technique in resource utilization and computational efficiency
+underscore their potential for practical LLM applications.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>The paper has been accepted by the ACL2024 conference. Work was done
+  when Jianhui Pang and Fanghua Ye were interning at Tencent AI Lab</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ I Learn Better If You Speak My Language: Understanding the Superior
+  Performance of Fine-Tuning Large Language Models with LLM-Generated Responses 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2402.11192v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2402.11192v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Xuan Ren, Biao Wu, Lingqiao Liu
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  This paper explores an intriguing observation: fine-tuning a large language
+model (LLM) with responses generated by a LLM often yields better results than
+using responses generated by humans. We conduct an in-depth investigation to
+understand why this occurs. Contrary to the common belief that these instances
+is simply due to the more detailed nature of LLM-generated content, our study
+identifies another contributing factor: an LLM is inherently more "familiar"
+with LLM generated responses. This familiarity is evidenced by lower perplexity
+before fine-tuning. We design a series of experiments to understand the impact
+of the "familiarity" and our conclusion reveals that this "familiarity"
+significantly impacts learning performance. Training with LLM-generated
+responses not only enhances performance but also helps maintain the model's
+capabilities in other tasks after fine-tuning on a specific task.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Easy Problems That LLMs Get Wrong 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.19616v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.19616v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Sean Williams, James Huckle
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  We introduce a comprehensive Linguistic Benchmark designed to evaluate the
+limitations of Large Language Models (LLMs) in domains such as logical
+reasoning, spatial intelligence, and linguistic understanding, among others.
+Through a series of straightforward questions, it uncovers the significant
+limitations of well-regarded models to perform tasks that humans manage with
+ease. It also highlights the potential of prompt engineering to mitigate some
+errors and underscores the necessity for better training methodologies. Our
+findings stress the importance of grounding LLMs with human reasoning and
+common sense, emphasising the need for human-in-the-loop for enterprise
+applications. We hope this work paves the way for future research to enhance
+the usefulness and reliability of new models.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>AutogenAI Ltd. GitHub Repo:
+  https://github.com/autogenai/easy-problems-that-llms-get-wrong</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Are You Sure? Rank Them Again: Repeated Ranking For Better Preference
+  <span class="highlight-title">Dataset</span>s 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.18952v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.18952v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Peter Devine
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Training Large Language Models (LLMs) with Reinforcement Learning from AI
+Feedback (RLAIF) aligns model outputs more closely with human preferences. This
+involves an evaluator model ranking multiple candidate responses to user
+prompts. However, the rankings from popular evaluator models such as GPT-4 can
+be inconsistent. We propose the Repeat Ranking method - where we evaluate the
+same responses multiple times and train only on those responses which are
+consistently ranked. Using 2,714 prompts in 62 languages, we generated
+responses from 7 top multilingual LLMs and had GPT-4 rank them five times each.
+Evaluating on MT-Bench chat benchmarks in six languages, our method
+outperformed the standard practice of training on all available prompts. Our
+work highlights the quality versus quantity trade-off in RLAIF dataset
+generation and offers a stackable strategy for enhancing dataset and thus model
+quality.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                    </div>
+                </details>
+            </article>
+            <article>
+                <details>
+                    <Summary>
+                        Computer Vision and Pattern Recognition <span class="chip" style="font-size: 60%">39</span>
+                    </Summary>
+                    <div class="details-content">
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ DMesh: A Differentiable Mesh Representation 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2404.13445v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2404.13445v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Sanghyun Son, Matheus Gadelha, Yang Zhou, Zexiang Xu, Ming C. Lin, Yi Zhou
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  We present a differentiable representation, DMesh, for general 3D triangular
+meshes. DMesh considers both the geometry and connectivity information of a
+mesh. In our design, we first get a set of convex tetrahedra that compactly
+tessellates the domain based on Weighted Delaunay Triangulation (WDT), and
+select triangular faces on the tetrahedra to define the final mesh. We
+formulate probability of faces to exist on the actual surface in a
+differentiable manner based on the WDT. This enables DMesh to represent meshes
+of various topology in a differentiable way, and allows us to reconstruct the
+mesh under various observations, such as point cloud and multi-view images
+using gradient-based optimization. The source code and full paper is available
+at: https://sonsang.github.io/dmesh-project.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>35 pages, 22 figures. Updated with more analysis and experimental
+  results</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ XCAT-3.0: A Comprehensive Library of Personalized Digital Twins Derived
+  from CT Scans 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.11133v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.11133v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Lavsen Dahal, Mobina Ghojoghnejad, Dhrubajyoti Ghosh, Yubraj Bhandari, David Kim, Fong Chi Ho, Fakrul Islam Tushar, Sheng Luoa, Kyle J. Lafata, Ehsan Abadi, Ehsan Samei, Joseph Y. Lo, W. Paul Segars
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Virtual Imaging Trials (VIT) offer a cost-effective and scalable approach for
+evaluating medical imaging technologies. Computational phantoms, which mimic
+real patient anatomy and physiology, play a central role in VIT. However, the
+current libraries of computational phantoms face limitations, particularly in
+terms of sample size and diversity. Insufficient representation of the
+population hampers accurate assessment of imaging technologies across different
+patient groups. Traditionally, phantoms were created by manual segmentation,
+which is a laborious and time-consuming task, impeding the expansion of phantom
+libraries. This study presents a framework for realistic computational phantom
+modeling using a suite of four deep learning segmentation models, followed by
+three forms of automated organ segmentation quality control. Over 2500
+computational phantoms with up to 140 structures illustrating a sophisticated
+approach to detailed anatomical modeling are released. Phantoms are available
+in both voxelized and surface mesh formats. The framework is aggregated with an
+in-house CT scanner simulator to produce realistic CT images. The framework can
+potentially advance virtual imaging trials, facilitating comprehensive and
+reliable evaluations of medical imaging technologies. Phantoms may be requested
+at https://cvit.duke.edu/resources/, code, model weights, and sample CT images
+are available at https://xcat-3.github.io.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Deep Single Image Camera Calibration by Heatmap Regression to Recover
+  Fisheye Images Under Manhattan World Assumption <span class="chip">CVPR2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2303.17166v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2303.17166v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Nobuhiko Wakai, Satoshi Sato, Yasunori Ishii, Takayoshi Yamashita
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  A Manhattan world lying along cuboid buildings is useful for camera angle
+estimation. However, accurate and robust angle estimation from fisheye images
+in the Manhattan world has remained an open challenge because general scene
+images tend to lack constraints such as lines, arcs, and vanishing points. To
+achieve higher accuracy and robustness, we propose a learning-based calibration
+method that uses heatmap regression, which is similar to pose estimation using
+keypoints, to detect the directions of labeled image coordinates.
+Simultaneously, our two estimators recover the rotation and remove fisheye
+distortion by remapping from a general scene image. Without considering
+vanishing-point constraints, we find that additional points for learning-based
+methods can be defined. To compensate for the lack of vanishing points in
+images, we introduce auxiliary diagonal points that have the optimal 3D
+arrangement of spatial uniformity. Extensive experiments demonstrated that our
+method outperforms conventional methods on large-scale datasets and with
+off-the-shelf cameras.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted by CVPR2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Lumos : Empowering Multimodal LLMs with Scene Text Recognition <span class="chip">KDD 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2402.08017v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2402.08017v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Ashish Shenoy, Yichao Lu, Srihari Jayakumar, Debojeet Chatterjee, Mohsen Moslehpour, Pierce Chuang, Abhay Harpale, Vikas Bhardwaj, Di Xu, Shicong Zhao, Longfang Zhao, Ankit Ramchandani, Xin Luna Dong, Anuj Kumar
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  We introduce Lumos, the first end-to-end multimodal question-answering system
+with text understanding capabilities. At the core of Lumos is a Scene Text
+Recognition (STR) component that extracts text from first person point-of-view
+images, the output of which is used to augment input to a Multimodal Large
+Language Model (MM-LLM). While building Lumos, we encountered numerous
+challenges related to STR quality, overall latency, and model inference. In
+this paper, we delve into those challenges, and discuss the system
+architecture, design choices, and modeling techniques employed to overcome
+these obstacles. We also provide a comprehensive evaluation for each component,
+showcasing high quality and efficiency.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted to KDD 2024 (ADS Track)</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Low-Rank Few-Shot Adaptation of Vision-Language Models 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.18541v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.18541v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Maxime Zanella, Ismail Ben Ayed
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Recent progress in the few-shot adaptation of Vision-Language Models (VLMs)
+has further pushed their generalization capabilities, at the expense of just a
+few labeled samples within the target downstream task. However, this promising,
+already quite abundant few-shot literature has focused principally on prompt
+learning and, to a lesser extent, on adapters, overlooking the recent advances
+in Parameter-Efficient Fine-Tuning (PEFT). Furthermore, existing few-shot
+learning methods for VLMs often rely on heavy training procedures and/or
+carefully chosen, task-specific hyper-parameters, which might impede their
+applicability. In response, we introduce Low-Rank Adaptation (LoRA) in few-shot
+learning for VLMs, and show its potential on 11 datasets, in comparison to
+current state-of-the-art prompt- and adapter-based approaches. Surprisingly,
+our simple CLIP-LoRA method exhibits substantial improvements, while reducing
+the training times and keeping the same hyper-parameters in all the target
+tasks, i.e., across all the datasets and numbers of shots. Certainly, our
+surprising results do not dismiss the potential of prompt-learning and
+adapter-based research. However, we believe that our strong baseline could be
+used to evaluate progress in these emergent subjects in few-shot VLMs.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Enhanced 3D Urban Scene Reconstruction and Point Cloud Densification
+  using Gaussian Splatting and Google Earth Imagery 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.11021v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.11021v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Kyle Gao, Dening Lu, Hongjie He, Linlin Xu, Jonathan Li
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  3D urban scene reconstruction and modelling is a crucial research area in
+remote sensing with numerous applications in academia, commerce, industry, and
+administration. Recent advancements in view synthesis models have facilitated
+photorealistic 3D reconstruction solely from 2D images. Leveraging Google Earth
+imagery, we construct a 3D Gaussian Splatting model of the Waterloo region
+centered on the University of Waterloo and are able to achieve view-synthesis
+results far exceeding previous 3D view-synthesis results based on neural
+radiance fields which we demonstrate in our benchmark. Additionally, we
+retrieved the 3D geometry of the scene using the 3D point cloud extracted from
+the 3D Gaussian Splatting model which we benchmarked against our Multi-
+View-Stereo dense reconstruction of the scene, thereby reconstructing both the
+3D geometry and photorealistic lighting of the large-scale urban scene through
+3D Gaussian Splatting
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Investigating Robustness of Open-Vocabulary Foundation Object Detectors
+  under Distribution Shifts 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.14874v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.14874v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Prakash Chandra Chhipa, Kanjar De, Meenakshi Subhash Chippa, Rajkumar Saini, Marcus Liwicki
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  The challenge of Out-Of-Distribution (OOD) robustness remains a critical
+hurdle towards deploying deep vision models. Open-vocabulary object detection
+extends the capabilities of traditional object detection frameworks to
+recognize and classify objects beyond predefined categories. Investigating OOD
+robustness in open-vocabulary object detection is essential to increase the
+trustworthiness of these models. This study presents a comprehensive robustness
+evaluation of zero-shot capabilities of three recent open-vocabulary foundation
+object detection models, namely OWL-ViT, YOLO World, and Grounding DINO.
+Experiments carried out on the COCO-O and COCO-C benchmarks encompassing
+distribution shifts highlight the challenges of the models' robustness. Source
+code shall be made available to the research community on GitHub.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>13 + 3 single column pages</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ A citizen science toolkit to collect human perceptions of urban
+  environments using open street view images 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2403.00174v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2403.00174v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Matthew Danish, SM Labib, Britta Ricker, Marco Helbich
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Street View Imagery (SVI) is a valuable data source for studies (e.g.,
+environmental assessments, green space identification or land cover
+classification). While commercial SVI is available, such providers commonly
+restrict copying or reuse in ways necessary for research. Open SVI datasets are
+readily available from less restrictive sources, such as Mapillary, but due to
+the heterogeneity of the images, these require substantial preprocessing,
+filtering, and careful quality checks. We present an efficient method for
+automated downloading, processing, cropping, and filtering open SVI, to be used
+in a survey of human perceptions of the streets portrayed in these images. We
+demonstrate our open-source reusable SVI preparation and smartphone-friendly
+perception-survey software with Amsterdam (Netherlands) as the case study.
+Using a citizen science approach, we collected from 331 people 22,637 ratings
+about their perceptions for various criteria. We have published our software in
+a public repository for future re-use and reproducibility.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ ChAda-ViT : Channel Adaptive Attention for Joint Representation Learning
+  of Heterogeneous Microscopy Images 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2311.15264v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2311.15264v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Nicolas Bourriez, Ihab Bendidi, Ethan Cohen, Gabriel Watkinson, Maxime Sanchez, Guillaume Bollot, Auguste Genovesio
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Unlike color photography images, which are consistently encoded into RGB
+channels, biological images encompass various modalities, where the type of
+microscopy and the meaning of each channel varies with each experiment.
+Importantly, the number of channels can range from one to a dozen and their
+correlation is often comparatively much lower than RGB, as each of them brings
+specific information content. This aspect is largely overlooked by methods
+designed out of the bioimage field, and current solutions mostly focus on
+intra-channel spatial attention, often ignoring the relationship between
+channels, yet crucial in most biological applications. Importantly, the
+variable channel type and count prevent the projection of several experiments
+to a unified representation for large scale pre-training. In this study, we
+propose ChAda-ViT, a novel Channel Adaptive Vision Transformer architecture
+employing an Inter-Channel Attention mechanism on images with an arbitrary
+number, order and type of channels. We also introduce IDRCell100k, a bioimage
+dataset with a rich set of 79 experiments covering 7 microscope modalities,
+with a multitude of channel types, and counts varying from 1 to 10 per
+experiment. Our architecture, trained in a self-supervised manner, outperforms
+existing approaches in several biologically relevant downstream tasks.
+Additionally, it can be used to bridge the gap for the first time between
+assays with different microscopes, channel numbers or types by embedding
+various image and experimental modalities into a unified biological image
+representation. The latter should facilitate interdisciplinary studies and pave
+the way for better adoption of deep learning in biological image-based
+analyses. Code and Data available at https://github.com/nicoboou/chadavit.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ CHAIN: Enhancing Generalization in Data-Efficient GANs via lipsCHitz
+  continuity constrAIned Normalization <span class="chip">CVPR 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2404.00521v4">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2404.00521v4.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Yao Ni, Piotr Koniusz
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Generative Adversarial Networks (GANs) significantly advanced image
+generation but their performance heavily depends on abundant training data. In
+scenarios with limited data, GANs often struggle with discriminator overfitting
+and unstable training. Batch Normalization (BN), despite being known for
+enhancing generalization and training stability, has rarely been used in the
+discriminator of Data-Efficient GANs. Our work addresses this gap by
+identifying a critical flaw in BN: the tendency for gradient explosion during
+the centering and scaling steps. To tackle this issue, we present CHAIN
+(lipsCHitz continuity constrAIned Normalization), which replaces the
+conventional centering step with zero-mean regularization and integrates a
+Lipschitz continuity constraint in the scaling step. CHAIN further enhances GAN
+training by adaptively interpolating the normalized and unnormalized features,
+effectively avoiding discriminator overfitting. Our theoretical analyses firmly
+establishes CHAIN's effectiveness in reducing gradients in latent features and
+weights, improving stability and generalization in GAN training. Empirical
+evidence supports our theory. CHAIN achieves state-of-the-art results in
+data-limited scenarios on CIFAR-10/100, ImageNet, five low-shot and seven
+high-resolution few-shot image datasets. Code:
+https://github.com/MaxwellYaoNi/CHAIN
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted by CVPR 2024. 26 pages. Improve Lemma 3.1 - Prop. 3.1 logic
+  flow. Code: https://github.com/MaxwellYaoNi/CHAIN</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Direct3D: Scalable Image-to-3D Generation via 3D Latent Diffusion
+  <span class="highlight-title">Transformer</span> 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.14832v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.14832v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Shuang Wu, Youtian Lin, Feihu Zhang, Yifei Zeng, Jingxi Xu, Philip Torr, Xun Cao, Yao Yao
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Generating high-quality 3D assets from text and images has long been
+challenging, primarily due to the absence of scalable 3D representations
+capable of capturing intricate geometry distributions. In this work, we
+introduce Direct3D, a native 3D generative model scalable to in-the-wild input
+images, without requiring a multiview diffusion model or SDS optimization. Our
+approach comprises two primary components: a Direct 3D Variational Auto-Encoder
+(D3D-VAE) and a Direct 3D Diffusion Transformer (D3D-DiT). D3D-VAE efficiently
+encodes high-resolution 3D shapes into a compact and continuous latent triplane
+space. Notably, our method directly supervises the decoded geometry using a
+semi-continuous surface sampling strategy, diverging from previous methods
+relying on rendered images as supervision signals. D3D-DiT models the
+distribution of encoded 3D latents and is specifically designed to fuse
+positional information from the three feature maps of the triplane latent,
+enabling a native 3D generative model scalable to large-scale 3D datasets.
+Additionally, we introduce an innovative image-to-3D generation pipeline
+incorporating semantic and pixel-level image conditions, allowing the model to
+produce 3D shapes consistent with the provided conditional image input.
+Extensive experiments demonstrate the superiority of our large-scale
+pre-trained Direct3D over previous image-to-3D approaches, achieving
+significantly better generation quality and generalization ability, thus
+establishing a new state-of-the-art for 3D content creation. Project page:
+https://nju-3dv.github.io/projects/Direct3D/.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Spatio-Temporal Encoding of Brain Dynamics with Surface Masked
+  Autoencoders 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2308.05474v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2308.05474v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Simon Dahan, Logan Z. J. Williams, Yourong Guo, Daniel Rueckert, Emma C. Robinson
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  The development of robust and generalisable models for encoding the
+spatio-temporal dynamics of human brain activity is crucial for advancing
+neuroscientific discoveries. However, significant individual variation in the
+organisation of the human cerebral cortex makes it difficult to identify
+population-level trends in these signals. Recently, Surface Vision Transformers
+(SiTs) have emerged as a promising approach for modelling cortical signals, yet
+they face some limitations in low-data scenarios due to the lack of inductive
+biases in their architecture. To address these challenges, this paper proposes
+the surface Masked AutoEncoder (sMAE) and video surface Masked AutoEncoder
+(vsMAE) - for multivariate and spatio-temporal pre-training of cortical signals
+over regular icosahedral grids. These models are trained to reconstruct
+cortical feature maps from masked versions of the input by learning strong
+latent representations of cortical structure and function. Such representations
+translate into better modelling of individual phenotypes and enhanced
+performance in downstream tasks. The proposed approach was evaluated on
+cortical phenotype regression using data from the young adult Human Connectome
+Project (HCP) and developing HCP (dHCP). Results show that (v)sMAE pre-trained
+models improve phenotyping prediction performance on multiple tasks by $\ge
+26\%$, and offer faster convergence relative to models trained from scratch.
+Finally, we show that pre-training vision transformers on large datasets, such
+as the UK Biobank (UKB), supports transfer learning to low-data regimes. Our
+code and pre-trained models are publicly available at
+https://github.com/metrics-lab/surface-masked-autoencoders .
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted for publications for MIDL 2024; 20 figures; 7 figures</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Object Pose Estimation via the Aggregation of Diffusion Features <span class="chip">CVPR2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2403.18791v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2403.18791v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Tianfu Wang, Guosheng Hu, Hongguang Wang
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Estimating the pose of objects from images is a crucial task of 3D scene
+understanding, and recent approaches have shown promising results on very large
+benchmarks. However, these methods experience a significant performance drop
+when dealing with unseen objects. We believe that it results from the limited
+generalizability of image features. To address this problem, we have an
+in-depth analysis on the features of diffusion models, e.g. Stable Diffusion,
+which hold substantial potential for modeling unseen objects. Based on this
+analysis, we then innovatively introduce these diffusion features for object
+pose estimation. To achieve this, we propose three distinct architectures that
+can effectively capture and aggregate diffusion features of different
+granularity, greatly improving the generalizability of object pose estimation.
+Our approach outperforms the state-of-the-art methods by a considerable margin
+on three popular benchmark datasets, LM, O-LM, and T-LESS. In particular, our
+method achieves higher accuracy than the previous best arts on unseen objects:
+98.2% vs. 93.5% on Unseen LM, 85.9% vs. 76.3% on Unseen O-LM, showing the
+strong generalizability of our method. Our code is released at
+https://github.com/Tianfu18/diff-feats-pose.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted to CVPR2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Efficient Degradation-aware Any Image Restoration 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.15475v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.15475v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Eduard Zamfir, Zongwei Wu, Nancy Mehta, Danda Pani Paudel, Yulun Zhang, Radu Timofte
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Reconstructing missing details from degraded low-quality inputs poses a
+significant challenge. Recent progress in image restoration has demonstrated
+the efficacy of learning large models capable of addressing various
+degradations simultaneously. Nonetheless, these approaches introduce
+considerable computational overhead and complex learning paradigms, limiting
+their practical utility. In response, we propose \textit{DaAIR}, an efficient
+All-in-One image restorer employing a Degradation-aware Learner (DaLe) in the
+low-rank regime to collaboratively mine shared aspects and subtle nuances
+across diverse degradations, generating a degradation-aware embedding. By
+dynamically allocating model capacity to input degradations, we realize an
+efficient restorer integrating holistic and specific learning within a unified
+model. Furthermore, DaAIR introduces a cost-efficient parameter update
+mechanism that enhances degradation awareness while maintaining computational
+efficiency. Extensive comparisons across five image degradations demonstrate
+that our DaAIR outperforms both state-of-the-art All-in-One models and
+degradation-specific counterparts, affirming our efficacy and practicality. The
+source will be publicly made available at https://eduardzamfir.github.io/daair/
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ The Multiscale Surface Vision <span class="highlight-title">Transformer</span> 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2303.11909v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2303.11909v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Simon Dahan, Logan Z. J. Williams, Daniel Rueckert, Emma C. Robinson
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Surface meshes are a favoured domain for representing structural and
+functional information on the human cortex, but their complex topology and
+geometry pose significant challenges for deep learning analysis. While
+Transformers have excelled as domain-agnostic architectures for
+sequence-to-sequence learning, the quadratic cost of the self-attention
+operation remains an obstacle for many dense prediction tasks. Inspired by some
+of the latest advances in hierarchical modelling with vision transformers, we
+introduce the Multiscale Surface Vision Transformer (MS-SiT) as a backbone
+architecture for surface deep learning. The self-attention mechanism is applied
+within local-mesh-windows to allow for high-resolution sampling of the
+underlying data, while a shifted-window strategy improves the sharing of
+information between windows. Neighbouring patches are successively merged,
+allowing the MS-SiT to learn hierarchical representations suitable for any
+prediction task. Results demonstrate that the MS-SiT outperforms existing
+surface deep learning methods for neonatal phenotyping prediction tasks using
+the Developing Human Connectome Project (dHCP) dataset. Furthermore, building
+the MS-SiT backbone into a U-shaped architecture for surface segmentation
+demonstrates competitive results on cortical parcellation using the UK Biobank
+(UKB) and manually-annotated MindBoggle datasets. Code and trained models are
+publicly available at
+https://github.com/metrics-lab/surface-vision-transformers.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted for publication at MIDL 2024, 17 pages, 6 figures</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Are Semi-Dense Detector-Free Methods Good at Matching Local Features? 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2402.08671v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2402.08671v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Matthieu Vilain, Rémi Giraud, Hugo Germain, Guillaume Bourmaud
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Semi-dense detector-free approaches (SDF), such as LoFTR, are currently among
+the most popular image matching methods. While SDF methods are trained to
+establish correspondences between two images, their performances are almost
+exclusively evaluated using relative pose estimation metrics. Thus, the link
+between their ability to establish correspondences and the quality of the
+resulting estimated pose has thus far received little attention. This paper is
+a first attempt to study this link. We start with proposing a novel structured
+attention-based image matching architecture (SAM). It allows us to show a
+counter-intuitive result on two datasets (MegaDepth and HPatches): on the one
+hand SAM either outperforms or is on par with SDF methods in terms of
+pose/homography estimation metrics, but on the other hand SDF approaches are
+significantly better than SAM in terms of matching accuracy. We then propose to
+limit the computation of the matching accuracy to textured regions, and show
+that in this case SAM often surpasses SDF methods. Our findings highlight a
+strong correlation between the ability to establish accurate correspondences in
+textured regions and the accuracy of the resulting estimated pose/homography.
+Our code will be made available.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Consistency Model is an Effective Posterior Sample Approximation for
+  Diffusion Inverse Solvers 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2403.12063v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2403.12063v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Tongda Xu, Ziran Zhu, Jian Li, Dailan He, Yuanyuan Wang, Ming Sun, Ling Li, Hongwei Qin, Yan Wang, Jingjing Liu, Ya-Qin Zhang
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Diffusion Inverse Solvers (DIS) are designed to sample from the conditional
+distribution $p_{\theta}(X_0|y)$, with a predefined diffusion model
+$p_{\theta}(X_0)$, an operator $f(\cdot)$, and a measurement $y=f(x'_0)$
+derived from an unknown image $x'_0$. Existing DIS estimate the conditional
+score function by evaluating $f(\cdot)$ with an approximated posterior sample
+drawn from $p_{\theta}(X_0|X_t)$. However, most prior approximations rely on
+the posterior means, which may not lie in the support of the image
+distribution, thereby potentially diverge from the appearance of genuine
+images. Such out-of-support samples may significantly degrade the performance
+of the operator $f(\cdot)$, particularly when it is a neural network. In this
+paper, we introduces a novel approach for posterior approximation that
+guarantees to generate valid samples within the support of the image
+distribution, and also enhances the compatibility with neural network-based
+operators $f(\cdot)$. We first demonstrate that the solution of the Probability
+Flow Ordinary Differential Equation (PF-ODE) with an initial value $x_t$ yields
+an effective posterior sample $p_{\theta}(X_0|X_t=x_t)$. Based on this
+observation, we adopt the Consistency Model (CM), which is distilled from
+PF-ODE, for posterior sampling. Furthermore, we design a novel family of DIS
+using only CM. Through extensive experiments, we show that our proposed method
+for posterior sample approximation substantially enhance the effectiveness of
+DIS for neural network operators $f(\cdot)$ (e.g., in semantic segmentation).
+Additionally, our experiments demonstrate the effectiveness of the new CM-based
+inversion techniques. The source code is provided in the supplementary
+material.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Neural Flow Diffusion Models: Learnable Forward Process for Improved
+  Diffusion Modelling 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2404.12940v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2404.12940v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Grigory Bartosh, Dmitry Vetrov, Christian A. Naesseth
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Conventional diffusion models typically relies on a fixed forward process,
+which implicitly defines complex marginal distributions over latent variables.
+This can often complicate the reverse process' task in learning generative
+trajectories, and results in costly inference for diffusion models. To address
+these limitations, we introduce Neural Flow Diffusion Models (NFDM), a novel
+framework that enhances diffusion models by supporting a broader range of
+forward processes beyond the standard Gaussian. We also propose a novel
+parameterization technique for learning the forward process. Our framework
+provides an end-to-end, simulation-free optimization objective, effectively
+minimizing a variational upper bound on the negative log-likelihood.
+Experimental results demonstrate NFDM's strong performance, evidenced by
+state-of-the-art likelihood estimation. Furthermore, we investigate NFDM's
+capacity for learning generative dynamics with specific characteristics, such
+as deterministic straight lines trajectories, and demonstrate how the framework
+may be adopted for learning bridges between two distributions. The results
+underscores NFDM's versatility and its potential for a wide range of
+applications.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Long-Tail Learning with Foundation Model: Heavy Fine-Tuning Hurts <span class="chip">ICML 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2309.10019v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2309.10019v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Jiang-Xin Shi, Tong Wei, Zhi Zhou, Jie-Jing Shao, Xin-Yan Han, Yu-Feng Li
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  The fine-tuning paradigm in addressing long-tail learning tasks has sparked
+significant interest since the emergence of foundation models. Nonetheless, how
+fine-tuning impacts performance in long-tail learning was not explicitly
+quantified. In this paper, we disclose that heavy fine-tuning may even lead to
+non-negligible performance deterioration on tail classes, and lightweight
+fine-tuning is more effective. The reason is attributed to inconsistent class
+conditions caused by heavy fine-tuning. With the observation above, we develop
+a low-complexity and accurate long-tail learning algorithms LIFT with the goal
+of facilitating fast prediction and compact models by adaptive lightweight
+fine-tuning. Experiments clearly verify that both the training time and the
+learned parameters are significantly reduced with more accurate predictive
+performance compared with state-of-the-art approaches. The implementation code
+is available at https://github.com/shijxcs/LIFT.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted by ICML 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ SimPro: A Simple Probabilistic Framework Towards Realistic Long-Tailed
+  Semi-Supervised Learning <span class="chip">ICML2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2402.13505v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2402.13505v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Chaoqun Du, Yizeng Han, Gao Huang
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Recent advancements in semi-supervised learning have focused on a more
+realistic yet challenging task: addressing imbalances in labeled data while the
+class distribution of unlabeled data remains both unknown and potentially
+mismatched. Current approaches in this sphere often presuppose rigid
+assumptions regarding the class distribution of unlabeled data, thereby
+limiting the adaptability of models to only certain distribution ranges. In
+this study, we propose a novel approach, introducing a highly adaptable
+framework, designated as SimPro, which does not rely on any predefined
+assumptions about the distribution of unlabeled data. Our framework, grounded
+in a probabilistic model, innovatively refines the expectation-maximization
+(EM) algorithm by explicitly decoupling the modeling of conditional and
+marginal class distributions. This separation facilitates a closed-form
+solution for class distribution estimation during the maximization phase,
+leading to the formulation of a Bayes classifier. The Bayes classifier, in
+turn, enhances the quality of pseudo-labels in the expectation phase.
+Remarkably, the SimPro framework not only comes with theoretical guarantees but
+also is straightforward to implement. Moreover, we introduce two novel class
+distributions broadening the scope of the evaluation. Our method showcases
+consistent state-of-the-art performance across diverse benchmarks and data
+distribution scenarios. Our code is available at
+https://github.com/LeapLabTHU/SimPro.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>ICML2024 camera-ready version</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Leveraging edge detection and neural networks for better UAV
+  localization 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2404.06207v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2404.06207v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Theo Di Piazza, Enric Meinhardt-Llopis, Gabriele Facciolo, Benedicte Bascle, Corentin Abgrall, Jean-Clement Devaux
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  We propose a novel method for geolocalizing Unmanned Aerial Vehicles (UAVs)
+in environments lacking Global Navigation Satellite Systems (GNSS). Current
+state-of-the-art techniques employ an offline-trained encoder to generate a
+vector representation (embedding) of the UAV's current view, which is then
+compared with pre-computed embeddings of geo-referenced images to determine the
+UAV's position. Here, we demonstrate that the performance of these methods can
+be significantly enhanced by preprocessing the images to extract their edges,
+which exhibit robustness to seasonal and illumination variations. Furthermore,
+we establish that utilizing edges enhances resilience to orientation and
+altitude inaccuracies. Additionally, we introduce a confidence criterion for
+localization. Our findings are substantiated through synthetic experiments.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted for publication in IGARSS2024. 4 pages, 3 figures, 3 tables</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ CeCNN: Copula-enhanced convolutional neural networks in joint prediction
+  of refraction error and axial length based on ultra-widefield fundus images 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2311.03967v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2311.03967v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Chong Zhong, Yang Li, Danjuan Yang, Meiyan Li, Xingyao Zhou, Bo Fu, Catherine C. Liu, A. H. Welsh
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Ultra-widefield (UWF) fundus images are replacing traditional fundus images
+in screening, detection, prediction, and treatment of complications related to
+myopia because their much broader visual range is advantageous for highly
+myopic eyes. Spherical equivalent (SE) is extensively used as the main myopia
+outcome measure, and axial length (AL) has drawn increasing interest as an
+important ocular component for assessing myopia. Cutting-edge studies show that
+SE and AL are strongly correlated. Using the joint information from SE and AL
+is potentially better than using either separately. In the deep learning
+community, though there is research on multiple-response tasks with a 3D image
+biomarker, dependence among responses is only sporadically taken into
+consideration. Inspired by the spirit that information extracted from the data
+by statistical methods can improve the prediction accuracy of deep learning
+models, we formulate a class of multivariate response regression models with a
+higher-order tensor biomarker, for the bivariate tasks of
+regression-classification and regression-regression. Specifically, we propose a
+copula-enhanced convolutional neural network (CeCNN) framework that
+incorporates the dependence between responses through a Gaussian copula (with
+parameters estimated from a warm-up CNN) and uses the induced copula-likelihood
+loss with the backbone CNNs. We establish the statistical framework and
+algorithms for the aforementioned two bivariate tasks. We show that the CeCNN
+has better prediction accuracy after adding the dependency information to the
+backbone models. The modeling and the proposed CeCNN algorithm are applicable
+beyond the UWF scenario and can be effective with other backbones beyond ResNet
+and LeNet.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ <span class="highlight-title">GPT</span>4RoI: Instruction Tuning Large Language Model on Region-of-Interest 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2307.03601v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2307.03601v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Shilong Zhang, Peize Sun, Shoufa Chen, Min Xiao, Wenqi Shao, Wenwei Zhang, Yu Liu, Kai Chen, Ping Luo
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Visual instruction tuning large language model(LLM) on image-text pairs has
+achieved general-purpose vision-language abilities. However, the lack of
+region-text pairs limits their advancements to fine-grained multimodal
+understanding. In this paper, we propose spatial instruction tuning, which
+introduces the reference to the region-of-interest(RoI) in the instruction.
+Before sending to LLM, the reference is replaced by RoI features and
+interleaved with language embeddings as a sequence. Our model GPT4RoI, trained
+on 7 region-text pair datasets, brings an unprecedented interactive and
+conversational experience compared to previous image-level models. (1)
+Interaction beyond language: Users can interact with our model by both language
+and drawing bounding boxes to flexibly adjust the referring granularity. (2)
+Versatile multimodal abilities: A variety of attribute information within each
+RoI can be mined by GPT4RoI, e.g., color, shape, material, action, etc.
+Furthermore, it can reason about multiple RoIs based on common sense. On the
+Visual Commonsense Reasoning(VCR) dataset, GPT4RoI achieves a remarkable
+accuracy of 81.6%, surpassing all existing models by a significant margin (the
+second place is 75.6%) and almost reaching human-level performance of 85.0%.
+The code, dataset, and demo can be found at
+https://github.com/jshilong/GPT4RoI.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Code has been released at https://github.com/jshilong/GPT4RoI</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Champ: Controllable and Consistent Human Image Animation with 3D
+  Parametric Guidance 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2403.14781v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2403.14781v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Shenhao Zhu, Junming Leo Chen, Zuozhuo Dai, Qingkun Su, Yinghui Xu, Xun Cao, Yao Yao, Hao Zhu, Siyu Zhu
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  In this study, we introduce a methodology for human image animation by
+leveraging a 3D human parametric model within a latent diffusion framework to
+enhance shape alignment and motion guidance in curernt human generative
+techniques. The methodology utilizes the SMPL(Skinned Multi-Person Linear)
+model as the 3D human parametric model to establish a unified representation of
+body shape and pose. This facilitates the accurate capture of intricate human
+geometry and motion characteristics from source videos. Specifically, we
+incorporate rendered depth images, normal maps, and semantic maps obtained from
+SMPL sequences, alongside skeleton-based motion guidance, to enrich the
+conditions to the latent diffusion model with comprehensive 3D shape and
+detailed pose attributes. A multi-layer motion fusion module, integrating
+self-attention mechanisms, is employed to fuse the shape and motion latent
+representations in the spatial domain. By representing the 3D human parametric
+model as the motion guidance, we can perform parametric shape alignment of the
+human body between the reference image and the source video motion.
+Experimental evaluations conducted on benchmark datasets demonstrate the
+methodology's superior ability to generate high-quality human animations that
+accurately capture both pose and shape variations. Furthermore, our approach
+also exhibits superior generalization capabilities on the proposed in-the-wild
+dataset. Project page: https://fudan-generative-vision.github.io/champ.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ A Detector-oblivious Multi-arm Network for Keypoint Matching 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2104.00947v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2104.00947v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Xuelun Shen, Qian Hu, Xin Li, Cheng Wang
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  This paper presents a matching network to establish point correspondence
+between images. We propose a Multi-Arm Network (MAN) to learn region overlap
+and depth, which can greatly improve the keypoint matching robustness while
+bringing little computational cost during the inference stage. Another design
+that makes this framework different from many existing learning based pipelines
+that require re-training when a different keypoint detector is adopted, our
+network can directly work with different keypoint detectors without such a
+time-consuming re-training process. Comprehensive experiments conducted on
+outdoor and indoor datasets demonstrated that our proposed MAN outperforms
+state-of-the-art methods.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ SpikeReveal: Unlocking Temporal Sequences from Real Blurry Inputs with
+  Spike Streams 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2403.09486v4">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2403.09486v4.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Kang Chen, Shiyan Chen, Jiyuan Zhang, Baoyue Zhang, Yajing Zheng, Tiejun Huang, Zhaofei Yu
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Reconstructing a sequence of sharp images from the blurry input is crucial
+for enhancing our insights into the captured scene and poses a significant
+challenge due to the limited temporal features embedded in the image. Spike
+cameras, sampling at rates up to 40,000 Hz, have proven effective in capturing
+motion features and beneficial for solving this ill-posed problem. Nonetheless,
+existing methods fall into the supervised learning paradigm, which suffers from
+notable performance degradation when applied to real-world scenarios that
+diverge from the synthetic training data domain. Moreover, the quality of
+reconstructed images is capped by the generated images based on motion analysis
+interpolation, which inherently differs from the actual scene, affecting the
+generalization ability of these methods in real high-speed scenarios. To
+address these challenges, we propose the first self-supervised framework for
+the task of spike-guided motion deblurring. Our approach begins with the
+formulation of a spike-guided deblurring model that explores the theoretical
+relationships among spike streams, blurry images, and their corresponding sharp
+sequences. We subsequently develop a self-supervised cascaded framework to
+alleviate the issues of spike noise and spatial-resolution mismatching
+encountered in the deblurring model. With knowledge distillation and
+re-blurring loss, we further design a lightweight deblur network to generate
+high-quality sequences with brightness and texture consistency with the
+original input. Quantitative and qualitative experiments conducted on our
+real-world and synthetic datasets with spikes validate the superior
+generalization of the proposed framework. Our code, data and trained models
+will be available at \url{https://github.com/chenkang455/S-SDM}.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>14 pages</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Restoration by Generation with Constrained Priors <span class="chip">CVPR 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2312.17161v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2312.17161v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Zheng Ding, Xuaner Zhang, Zhuowen Tu, Zhihao Xia
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  The inherent generative power of denoising diffusion models makes them
+well-suited for image restoration tasks where the objective is to find the
+optimal high-quality image within the generative space that closely resembles
+the input image. We propose a method to adapt a pretrained diffusion model for
+image restoration by simply adding noise to the input image to be restored and
+then denoise. Our method is based on the observation that the space of a
+generative model needs to be constrained. We impose this constraint by
+finetuning the generative model with a set of anchor images that capture the
+characteristics of the input image. With the constrained space, we can then
+leverage the sampling strategy used for generation to do image restoration. We
+evaluate against previous methods and show superior performances on multiple
+real-world restoration datasets in preserving identity and image quality. We
+also demonstrate an important and practical application on personalized
+restoration, where we use a personal album as the anchor images to constrain
+the generative space. This approach allows us to produce results that
+accurately preserve high-frequency details, which previous works are unable to
+do. Project webpage: https://gen2res.github.io.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>CVPR 2024 (Highlight)</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Video sentence grounding with temporally global textual knowledge 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2404.13611v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2404.13611v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Cai Chen, Runzhong Zhang, Jianjun Gao, Kejun Wu, Kim-Hui Yap, Yi Wang
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Temporal sentence grounding involves the retrieval of a video moment with a
+natural language query. Many existing works directly incorporate the given
+video and temporally localized query for temporal grounding, overlooking the
+inherent domain gap between different modalities. In this paper, we utilize
+pseudo-query features containing extensive temporally global textual knowledge
+sourced from the same video-query pair, to enhance the bridging of domain gaps
+and attain a heightened level of similarity between multi-modal features.
+Specifically, we propose a Pseudo-query Intermediary Network (PIN) to achieve
+an improved alignment of visual and comprehensive pseudo-query features within
+the feature space through contrastive learning. Subsequently, we utilize
+learnable prompts to encapsulate the knowledge of pseudo-queries, propagating
+them into the textual encoder and multi-modal fusion module, further enhancing
+the feature alignment between visual and language for better temporal
+grounding. Extensive experiments conducted on the Charades-STA and
+ActivityNet-Captions datasets demonstrate the effectiveness of our method.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Boosting Model Resilience via Implicit Adversarial Data Augmentation <span class="chip">IJCAI 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2404.16307v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2404.16307v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Xiaoling Zhou, Wei Ye, Zhemg Lee, Rui Xie, Shikun Zhang
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Data augmentation plays a pivotal role in enhancing and diversifying training
+data. Nonetheless, consistently improving model performance in varied learning
+scenarios, especially those with inherent data biases, remains challenging. To
+address this, we propose to augment the deep features of samples by
+incorporating their adversarial and anti-adversarial perturbation
+distributions, enabling adaptive adjustment in the learning difficulty tailored
+to each sample's specific characteristics. We then theoretically reveal that
+our augmentation process approximates the optimization of a surrogate loss
+function as the number of augmented copies increases indefinitely. This insight
+leads us to develop a meta-learning-based framework for optimizing classifiers
+with this novel loss, introducing the effects of augmentation while bypassing
+the explicit augmentation process. We conduct extensive experiments across four
+common biased learning scenarios: long-tail learning, generalized long-tail
+learning, noisy label learning, and subpopulation shift learning. The empirical
+results demonstrate that our method consistently achieves state-of-the-art
+performance, highlighting its broad adaptability.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>9 pages, 6 figures, accepted by IJCAI 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ MM-SAP: A Comprehensive Benchmark for Assessing Self-Awareness of
+  Multimodal Large Language Models in Perception 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2401.07529v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2401.07529v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Yuhao Wang, Yusheng Liao, Heyang Liu, Hongcheng Liu, Yu Wang, Yanfeng Wang
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Recent advancements in Multimodal Large Language Models (MLLMs) have
+demonstrated exceptional capabilities in visual perception and understanding.
+However, these models also suffer from hallucinations, which limit their
+reliability as AI systems. We believe that these hallucinations are partially
+due to the models' struggle with understanding what they can and cannot
+perceive from images, a capability we refer to as self-awareness in perception.
+Despite its importance, this aspect of MLLMs has been overlooked in prior
+studies. In this paper, we aim to define and evaluate the self-awareness of
+MLLMs in perception. To do this, we first introduce the knowledge quadrant in
+perception, which helps define what MLLMs know and do not know about images.
+Using this framework, we propose a novel benchmark, the Self-Awareness in
+Perception for MLLMs (MM-SAP), specifically designed to assess this capability.
+We apply MM-SAP to a variety of popular MLLMs, offering a comprehensive
+analysis of their self-awareness and providing detailed insights. The
+experiment results reveal that current MLLMs possess limited self-awareness
+capabilities, pointing to a crucial area for future advancement in the
+development of trustworthy MLLMs. Code and data are available at
+https://github.com/YHWmz/MM-SAP.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ PGDS: Pose-Guidance Deep Supervision for Mitigating Clothes-Changing in
+  Person Re-Identification 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2312.05634v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2312.05634v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Quoc-Huy Trinh, Nhat-Tan Bui, Dinh-Hieu Hoang, Phuoc-Thao Vo Thi, Hai-Dang Nguyen, Debesh Jha, Ulas Bagci, Ngan Le, Minh-Triet Tran
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Person Re-Identification (Re-ID) task seeks to enhance the tracking of
+multiple individuals by surveillance cameras. It supports multimodal tasks,
+including text-based person retrieval and human matching. One of the most
+significant challenges faced in Re-ID is clothes-changing, where the same
+person may appear in different outfits. While previous methods have made
+notable progress in maintaining clothing data consistency and handling clothing
+change data, they still rely excessively on clothing information, which can
+limit performance due to the dynamic nature of human appearances. To mitigate
+this challenge, we propose the Pose-Guidance Deep Supervision (PGDS), an
+effective framework for learning pose guidance within the Re-ID task. It
+consists of three modules: a human encoder, a pose encoder, and a Pose-to-Human
+Projection module (PHP). Our framework guides the human encoder, i.e., the main
+re-identification model, with pose information from the pose encoder through
+multiple layers via the knowledge transfer mechanism from the PHP module,
+helping the human encoder learn body parts information without increasing
+computation resources in the inference stage. Through extensive experiments,
+our method surpasses the performance of current state-of-the-art methods,
+demonstrating its robustness and effectiveness for real-world applications. Our
+code is available at https://github.com/huyquoctrinh/PGDS.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted at AVSS 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Out-of-distribution Partial Label Learning 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2403.06681v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2403.06681v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Jintao Huang, Yiu-Ming Cheung, Chi-Man Vong
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Partial Label Learning (PLL) tackles model learning from the data with
+inexact labels under the assumption that training and test objects are in the
+same distribution, i.e., closed-set scenario. Nevertheless, this assumption
+does not hold in real-world open-set scenarios where test data may come from
+Out-Of-Distribution (OOD), resulting in object detection failure and hence
+significantly compromising the PLL model's security and trustworthiness. This
+is a previously unexplored problem called Out-Of-Distribution Partial Label
+Learning (OODPLL) that our newly proposed PLOOD framework can effectively
+resolve. During the training phase, our framework leverages self-supervised
+learning strategy to generate positive and negative samples for each object,
+emulating in and out-of-distributions respectively. Under these distributions,
+PLL methods can learn discriminative features for OOD objects. In the inference
+phase, a novel Partial Energy (PE) scoring technique is proposed which
+leverages the label confidence established during the above training phase to
+mine the actual labels. In this way, the issue of inexact labeling in PLL can
+be effectively addressed for significantly better performance in OOD object
+detection. PLOOD is compared with SOTA PLL models and OOD scores on CIFAR-10
+and CIFAR-100 datasets against various OOD datasets. The results demonstrate
+the effectiveness of our PLOOD framework, significantly outperforming SOTA PLL
+models and marking a substantial advancement in addressing PLL problems in
+real-world OOD scenarios.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Zone Evaluation: Revealing Spatial Bias in Object Detection 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2310.13215v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2310.13215v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Zhaohui Zheng, Yuming Chen, Qibin Hou, Xiang Li, Ping Wang, Ming-Ming Cheng
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  A fundamental limitation of object detectors is that they suffer from
+"spatial bias", and in particular perform less satisfactorily when detecting
+objects near image borders. For a long time, there has been a lack of effective
+ways to measure and identify spatial bias, and little is known about where it
+comes from and what degree it is. To this end, we present a new zone evaluation
+protocol, extending from the traditional evaluation to a more generalized one,
+which measures the detection performance over zones, yielding a series of Zone
+Precisions (ZPs). For the first time, we provide numerical results, showing
+that the object detectors perform quite unevenly across the zones.
+Surprisingly, the detector's performance in the 96% border zone of the image
+does not reach the AP value (Average Precision, commonly regarded as the
+average detection performance in the entire image zone). To better understand
+spatial bias, a series of heuristic experiments are conducted. Our
+investigation excludes two intuitive conjectures about spatial bias that the
+object scale and the absolute positions of objects barely influence the spatial
+bias. We find that the key lies in the human-imperceptible divergence in data
+patterns between objects in different zones, thus eventually forming a visible
+performance gap between the zones. With these findings, we finally discuss a
+future direction for object detection, namely, spatial disequilibrium problem,
+aiming at pursuing a balanced detection ability over the entire image zone. By
+broadly evaluating 10 popular object detectors and 5 detection datasets, we
+shed light on the spatial bias of object detectors. We hope this work could
+raise a focus on detection robustness. The source codes, evaluation protocols,
+and tutorials are publicly available at https://github.com/Zzh-tju/ZoneEval.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted by IEEE TPAMI</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ TIGER: Text-Instructed 3D Gaussian Retrieval and Coherent Editing 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.14455v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.14455v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Teng Xu, Jiamin Chen, Peng Chen, Youjia Zhang, Junqing Yu, Wei Yang
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Editing objects within a scene is a critical functionality required across a
+broad spectrum of applications in computer vision and graphics. As 3D Gaussian
+Splatting (3DGS) emerges as a frontier in scene representation, the effective
+modification of 3D Gaussian scenes has become increasingly vital. This process
+entails accurately retrieve the target objects and subsequently performing
+modifications based on instructions. Though available in pieces, existing
+techniques mainly embed sparse semantics into Gaussians for retrieval, and rely
+on an iterative dataset update paradigm for editing, leading to over-smoothing
+or inconsistency issues. To this end, this paper proposes a systematic
+approach, namely TIGER, for coherent text-instructed 3D Gaussian retrieval and
+editing. In contrast to the top-down language grounding approach for 3D
+Gaussians, we adopt a bottom-up language aggregation strategy to generate a
+denser language embedded 3D Gaussians that supports open-vocabulary retrieval.
+To overcome the over-smoothing and inconsistency issues in editing, we propose
+a Coherent Score Distillation (CSD) that aggregates a 2D image editing
+diffusion model and a multi-view diffusion model for score distillation,
+producing multi-view consistent editing with much finer details. In various
+experiments, we demonstrate that our TIGER is able to accomplish more
+consistent and realistic edits than prior work.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Grounding DINO 1.5: Advance the "Edge" of Open-Set Object Detection 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.10300v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.10300v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Tianhe Ren, Qing Jiang, Shilong Liu, Zhaoyang Zeng, Wenlong Liu, Han Gao, Hongjie Huang, Zhengyu Ma, Xiaoke Jiang, Yihao Chen, Yuda Xiong, Hao Zhang, Feng Li, Peijun Tang, Kent Yu, Lei Zhang
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  This paper introduces Grounding DINO 1.5, a suite of advanced open-set object
+detection models developed by IDEA Research, which aims to advance the "Edge"
+of open-set object detection. The suite encompasses two models: Grounding DINO
+1.5 Pro, a high-performance model designed for stronger generalization
+capability across a wide range of scenarios, and Grounding DINO 1.5 Edge, an
+efficient model optimized for faster speed demanded in many applications
+requiring edge deployment. The Grounding DINO 1.5 Pro model advances its
+predecessor by scaling up the model architecture, integrating an enhanced
+vision backbone, and expanding the training dataset to over 20 million images
+with grounding annotations, thereby achieving a richer semantic understanding.
+The Grounding DINO 1.5 Edge model, while designed for efficiency with reduced
+feature scales, maintains robust detection capabilities by being trained on the
+same comprehensive dataset. Empirical results demonstrate the effectiveness of
+Grounding DINO 1.5, with the Grounding DINO 1.5 Pro model attaining a 54.3 AP
+on the COCO detection benchmark and a 55.7 AP on the LVIS-minival zero-shot
+transfer benchmark, setting new records for open-set object detection.
+Furthermore, the Grounding DINO 1.5 Edge model, when optimized with TensorRT,
+achieves a speed of 75.2 FPS while attaining a zero-shot performance of 36.2 AP
+on the LVIS-minival benchmark, making it more suitable for edge computing
+scenarios. Model examples and demos with API will be released at
+https://github.com/IDEA-Research/Grounding-DINO-1.5-API
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>homepage: https://deepdataspace.com/home</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Attentive Graph Enhanced Region Representation Learning 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2307.03212v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2307.03212v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Weiliang Chen, Qianqian Ren, Jinbao Li
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Representing urban regions accurately and comprehensively is essential for
+various urban planning and analysis tasks. Recently, with the expansion of the
+city, modeling long-range spatial dependencies with multiple data sources plays
+an important role in urban region representation. In this paper, we propose the
+Attentive Graph Enhanced Region Representation Learning (ATGRL) model, which
+aims to capture comprehensive dependencies from multiple graphs and learn rich
+semantic representations of urban regions. Specifically, we propose a
+graph-enhanced learning module to construct regional graphs by incorporating
+mobility flow patterns, point of interests (POIs) functions, and check-in
+semantics with noise filtering. Then, we present a multi-graph aggregation
+module to capture both local and global spatial dependencies between regions by
+integrating information from multiple graphs. In addition, we design a
+dual-stage fusion module to facilitate information sharing between different
+views and efficiently fuse multi-view representations for urban region
+embedding using an improved linear attention mechanism. Finally, extensive
+experiments on real-world datasets for three downstream tasks demonstrate the
+superior performance of our model compared to state-of-the-art methods.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Benchmarking Fish <span class="highlight-title">Dataset</span> and Evaluation Metric in Keypoint Detection --
+  Towards Precise Fish Morphological Assessment in Aquaculture Breeding <span class="chip">IJCAI2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.12476v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.12476v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Weizhen Liu, Jiayu Tan, Guangyu Lan, Ao Li, Dongye Li, Le Zhao, Xiaohui Yuan, Nanqing Dong
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Accurate phenotypic analysis in aquaculture breeding necessitates the
+quantification of subtle morphological phenotypes. Existing datasets suffer
+from limitations such as small scale, limited species coverage, and inadequate
+annotation of keypoints for measuring refined and complex morphological
+phenotypes of fish body parts. To address this gap, we introduce FishPhenoKey,
+a comprehensive dataset comprising 23,331 high-resolution images spanning six
+fish species. Notably, FishPhenoKey includes 22 phenotype-oriented annotations,
+enabling the capture of intricate morphological phenotypes. Motivated by the
+nuanced evaluation of these subtle morphologies, we also propose a new
+evaluation metric, Percentage of Measured Phenotype (PMP). It is designed to
+assess the accuracy of individual keypoint positions and is highly sensitive to
+the phenotypes measured using the corresponding keypoints. To enhance keypoint
+detection accuracy, we further propose a novel loss, Anatomically-Calibrated
+Regularization (ACR), that can be integrated into keypoint detection models,
+leveraging biological insights to refine keypoint localization. Our
+contributions set a new benchmark in fish phenotype analysis, addressing the
+challenges of precise morphological quantification and opening new avenues for
+research in sustainable aquaculture and genetic studies. Our dataset and code
+are available at https://github.com/WeizhenLiuBioinform/Fish-Phenotype-Detect.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted by IJCAI2024, Code:
+  https://github.com/WeizhenLiuBioinform/Fish-Phenotype-Detect</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ PaLM2-VAdapter: Progressively Aligned Language Model Makes a Strong
+  Vision-language Adapter 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2402.10896v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2402.10896v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Junfei Xiao, Zheng Xu, Alan Yuille, Shen Yan, Boyu Wang
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  This paper demonstrates that a progressively aligned language model can
+effectively bridge frozen vision encoders and large language models (LLMs).
+While the fundamental architecture and pre-training methods of vision encoders
+and LLMs have been extensively studied, the architecture and training strategy
+of vision-language adapters vary significantly across recent works. Our
+research undertakes a thorough exploration of the state-of-the-art perceiver
+resampler architecture and builds a strong baseline. However, we observe that
+the vision-language alignment with perceiver resampler exhibits slow
+convergence and limited scalability with a lack of direct supervision. To
+address this issue, we propose PaLM2-VAdapter, employing a progressively
+aligned language model as the vision-language adapter. Compared to the strong
+baseline with perceiver resampler, our method empirically shows faster
+convergence, higher performance, and stronger scalability. Extensive
+experiments across various Visual Question Answering (VQA) and captioning tasks
+on both images and videos demonstrate that our model exhibits state-of-the-art
+visual understanding and multi-modal reasoning capabilities. Notably, our
+method achieves these advancements with 30~70% fewer parameters than the
+state-of-the-art large vision-language models, marking a significant efficiency
+improvement.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Technical report, 15 pages; v2 fix typos, add additional results in
+  appendix</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ BiomedParse: a biomedical foundation model for image parsing of
+  everything everywhere all at once 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.12971v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.12971v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Theodore Zhao, Yu Gu, Jianwei Yang, Naoto Usuyama, Ho Hin Lee, Tristan Naumann, Jianfeng Gao, Angela Crabtree, Brian Piening, Carlo Bifulco, Mu Wei, Hoifung Poon, Sheng Wang
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Biomedical image analysis is fundamental for biomedical discovery in cell
+biology, pathology, radiology, and many other biomedical domains. Holistic
+image analysis comprises interdependent subtasks such as segmentation,
+detection, and recognition of relevant objects. Here, we propose BiomedParse, a
+biomedical foundation model for imaging parsing that can jointly conduct
+segmentation, detection, and recognition for 82 object types across 9 imaging
+modalities. Through joint learning, we can improve accuracy for individual
+tasks and enable novel applications such as segmenting all relevant objects in
+an image through a text prompt, rather than requiring users to laboriously
+specify the bounding box for each object. We leveraged readily available
+natural-language labels or descriptions accompanying those datasets and use
+GPT-4 to harmonize the noisy, unstructured text information with established
+biomedical object ontologies. We created a large dataset comprising over six
+million triples of image, segmentation mask, and textual description. On image
+segmentation, we showed that BiomedParse is broadly applicable, outperforming
+state-of-the-art methods on 102,855 test image-mask-label triples across 9
+imaging modalities (everything). On object detection, which aims to locate a
+specific object of interest, BiomedParse again attained state-of-the-art
+performance, especially on objects with irregular shapes (everywhere). On
+object recognition, which aims to identify all objects in a given image along
+with their semantic types, we showed that BiomedParse can simultaneously
+segment and label all biomedical objects in an image (all at once). In summary,
+BiomedParse is an all-in-one tool for biomedical image analysis by jointly
+solving segmentation, detection, and recognition for all major biomedical image
+modalities, paving the path for efficient and accurate image-based biomedical
+discovery.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Project page: https://aka.ms/biomedparse-project</span>
+                                        </div>
+                                </details>
+                            </article>
+                    </div>
+                </details>
+            </article>
+            <article>
+                <details>
+                    <Summary>
+                        Information Retrieval <span class="chip" style="font-size: 60%">14</span>
+                    </Summary>
+                    <div class="details-content">
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ On Overcoming Miscalibrated Conversational Priors in LLM-based Chatbots <span class="chip">UAI'24</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.01633v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.01633v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Christine Herlihy, Jennifer Neville, Tobias Schnabel, Adith Swaminathan
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  We explore the use of Large Language Model (LLM-based) chatbots to power
+recommender systems. We observe that the chatbots respond poorly when they
+encounter under-specified requests (e.g., they make incorrect assumptions,
+hedge with a long response, or refuse to answer). We conjecture that such
+miscalibrated response tendencies (i.e., conversational priors) can be
+attributed to LLM fine-tuning using annotators -- single-turn annotations may
+not capture multi-turn conversation utility, and the annotators' preferences
+may not even be representative of users interacting with a recommender system.
+  We first analyze public LLM chat logs to conclude that query
+under-specification is common. Next, we study synthetic recommendation problems
+with configurable latent item utilities and frame them as Partially Observed
+Decision Processes (PODP). We find that pre-trained LLMs can be sub-optimal for
+PODPs and derive better policies that clarify under-specified queries when
+appropriate. Then, we re-calibrate LLMs by prompting them with learned control
+messages to approximate the improved policy. Finally, we show empirically that
+our lightweight learning approach effectively uses logged conversation data to
+re-calibrate the response strategies of LLM-based chatbots for recommendation
+tasks.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Preprint of UAI'24 conference publication</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ An LLM-based Recommender System Environment 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.01631v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.01631v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Nathan Corecco, Giorgio Piatti, Luca A. Lanzendörfer, Flint Xiaofeng Fan, Roger Wattenhofer
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Reinforcement learning (RL) has gained popularity in the realm of recommender
+systems due to its ability to optimize long-term rewards and guide users in
+discovering relevant content. However, the successful implementation of RL in
+recommender systems is challenging because of several factors, including the
+limited availability of online data for training on-policy methods. This
+scarcity requires expensive human interaction for online model training.
+Furthermore, the development of effective evaluation frameworks that accurately
+reflect the quality of models remains a fundamental challenge in recommender
+systems. To address these challenges, we propose a comprehensive framework for
+synthetic environments that simulate human behavior by harnessing the
+capabilities of large language models (LLMs). We complement our framework with
+in-depth ablation studies and demonstrate its effectiveness with experiments on
+movie and book recommendations. By utilizing LLMs as synthetic users, this work
+introduces a modular and novel framework for training RL-based recommender
+systems. The software, including the RL environment, is publicly available.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ RecDiff: Diffusion Model for Social Recommendation 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.01629v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.01629v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Zongwei Li, Lianghao Xia, Chao Huang
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Social recommendation has emerged as a powerful approach to enhance
+personalized recommendations by leveraging the social connections among users,
+such as following and friend relations observed in online social platforms. The
+fundamental assumption of social recommendation is that socially-connected
+users exhibit homophily in their preference patterns. This means that users
+connected by social ties tend to have similar tastes in user-item activities,
+such as rating and purchasing. However, this assumption is not always valid due
+to the presence of irrelevant and false social ties, which can contaminate user
+embeddings and adversely affect recommendation accuracy. To address this
+challenge, we propose a novel diffusion-based social denoising framework for
+recommendation (RecDiff). Our approach utilizes a simple yet effective
+hidden-space diffusion paradigm to alleivate the noisy effect in the compressed
+and dense representation space. By performing multi-step noise diffusion and
+removal, RecDiff possesses a robust ability to identify and eliminate noise
+from the encoded user representations, even when the noise levels vary. The
+diffusion module is optimized in a downstream task-aware manner, thereby
+maximizing its ability to enhance the recommendation process. We conducted
+extensive experiments to evaluate the efficacy of our framework, and the
+results demonstrate its superiority in terms of recommendation accuracy,
+training efficiency, and denoising effectiveness. The source code for the model
+implementation is publicly available at: https://github.com/HKUDS/RecDiff.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ A Practice-Friendly Two-Stage LLM-Enhanced Paradigm in Sequential
+  Recommendation 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.00333v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.00333v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Dugang Liu, Shenxian Xian, Xiaolin Lin, Xiaolian Zhang, Hong Zhu, Yuan Fang, Zhen Chen, Zhong Ming
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  The training paradigm integrating large language models (LLM) is gradually
+reshaping sequential recommender systems (SRS) and has shown promising results.
+However, most existing LLM-enhanced methods rely on rich textual information on
+the item side and instance-level supervised fine-tuning (SFT) to inject
+collaborative information into LLM, which is inefficient and limited in many
+applications. To alleviate these problems, this paper proposes a novel
+practice-friendly two-stage LLM-enhanced paradigm (TSLRec) for SRS.
+Specifically, in the information reconstruction stage, we design a new
+user-level SFT task for collaborative information injection with the assistance
+of a pre-trained SRS model, which is more efficient and compatible with limited
+text information. We aim to let LLM try to infer the latent category of each
+item and reconstruct the corresponding user's preference distribution for all
+categories from the user's interaction sequence. In the information
+augmentation stage, we feed each item into LLM to obtain a set of enhanced
+embeddings that combine collaborative information and LLM inference
+capabilities. These embeddings can then be used to help train various future
+SRS models. Finally, we verify the effectiveness and efficiency of our TSLRec
+on three SRS benchmark datasets.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ BeFA: A General Behavior-driven Feature Adapter for Multimedia
+  Recommendation 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.00323v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.00323v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Qile Fan, Penghang Yu, Zhiyi Tan, Bing-Kun Bao, Guanming Lu
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Multimedia recommender systems focus on utilizing behavioral information and
+content information to model user preferences. Typically, it employs
+pre-trained feature encoders to extract content features, then fuses them with
+behavioral features. However, pre-trained feature encoders often extract
+features from the entire content simultaneously, including excessive
+preference-irrelevant details. We speculate that it may result in the extracted
+features not containing sufficient features to accurately reflect user
+preferences. To verify our hypothesis, we introduce an attribution analysis
+method for visually and intuitively analyzing the content features. The results
+indicate that certain products' content features exhibit the issues of
+information drift}and information omission,reducing the expressive ability of
+features. Building upon this finding, we propose an effective and efficient
+general Behavior-driven Feature Adapter (BeFA) to tackle these issues. This
+adapter reconstructs the content feature with the guidance of behavioral
+information, enabling content features accurately reflecting user preferences.
+Extensive experiments demonstrate the effectiveness of the adapter across all
+multimedia recommendation methods. The code will be publicly available upon the
+paper's acceptance.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ KGLink: A column type annotation method that combines knowledge graph
+  and <span class="highlight-title">pre-train</span>ed language model <span class="chip">ICDE 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.00318v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.00318v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Yubo Wang, Hao Xin, Lei Chen
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  The semantic annotation of tabular data plays a crucial role in various
+downstream tasks. Previous research has proposed knowledge graph (KG)-based and
+deep learning-based methods, each with its inherent limitations. KG-based
+methods encounter difficulties annotating columns when there is no match for
+column cells in the KG. Moreover, KG-based methods can provide multiple
+predictions for one column, making it challenging to determine the semantic
+type with the most suitable granularity for the dataset. This type granularity
+issue limits their scalability.
+  On the other hand, deep learning-based methods face challenges related to the
+valuable context missing issue. This occurs when the information within the
+table is insufficient for determining the correct column type.
+  This paper presents KGLink, a method that combines WikiData KG information
+with a pre-trained deep learning language model for table column annotation,
+effectively addressing both type granularity and valuable context missing
+issues. Through comprehensive experiments on widely used tabular datasets
+encompassing numeric and string columns with varying type granularity, we
+showcase the effectiveness and efficiency of KGLink. By leveraging the
+strengths of KGLink, we successfully surmount challenges related to type
+granularity and valuable context issues, establishing it as a robust solution
+for the semantic annotation of tabular data.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>To be published in ICDE 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Large Language Models for Relevance Judgment in Product Search <span class="chip">SIGIR 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.00247v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.00247v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Navid Mehrdad, Hrushikesh Mohapatra, Mossaab Bagdouri, Prijith Chandran, Alessandro Magnani, Xunfan Cai, Ajit Puthenputhussery, Sachin Yadav, Tony Lee, ChengXiang Zhai, Ciya Liao
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  High relevance of retrieved and re-ranked items to the search query is the
+cornerstone of successful product search, yet measuring relevance of items to
+queries is one of the most challenging tasks in product information retrieval,
+and quality of product search is highly influenced by the precision and scale
+of available relevance-labelled data. In this paper, we present an array of
+techniques for leveraging Large Language Models (LLMs) for automating the
+relevance judgment of query-item pairs (QIPs) at scale. Using a unique dataset
+of multi-million QIPs, annotated by human evaluators, we test and optimize
+hyper parameters for finetuning billion-parameter LLMs with and without Low
+Rank Adaption (LoRA), as well as various modes of item attribute concatenation
+and prompting in LLM finetuning, and consider trade offs in item attribute
+inclusion for quality of relevance predictions. We demonstrate considerable
+improvement over baselines of prior generations of LLMs, as well as
+off-the-shelf models, towards relevance annotations on par with the human
+relevance evaluators. Our findings have immediate implications for the growing
+field of relevance judgment automation in product search.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>10 pages, 1 figure, 11 tables - SIGIR 2024, LLM4Eval</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ mdendro: An R package for extended agglomerative hierarchical clustering 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2309.13333v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2309.13333v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Alberto Fernández, Sergio Gómez
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  "mdendro" is an R package that provides a comprehensive collection of linkage
+methods for agglomerative hierarchical clustering on a matrix of proximity data
+(distances or similarities), returning a multifurcated dendrogram or
+multidendrogram. Multidendrograms can group more than two clusters at the same
+time, solving the nonuniqueness problem that arises when there are ties in the
+data. This problem causes that different binary dendrograms are possible
+depending both on the order of the input data and on the criterion used to
+break ties. Weighted and unweighted versions of the most common linkage methods
+are included in the package, which also implements two parametric linkage
+methods. In addition, package "mdendro" provides five descriptive measures to
+analyze the resulting dendrograms: cophenetic correlation coefficient, space
+distortion ratio, agglomeration coefficient, chaining coefficient and tree
+balance.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>27 pages, 13 figures. Software available at CRAN
+  (https://cran.r-project.org/package=mdendro) and Github
+  (https://sergio-gomez.github.io/mdendro/)</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ FABULA: Intelligence Report Generation Using Retrieval-Augmented
+  Narrative Construction 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2310.13848v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2310.13848v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Priyanka Ranade, Anupam Joshi
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Narrative construction is the process of representing disparate event
+information into a logical plot structure that models an end to end story.
+Intelligence analysis is an example of a domain that can benefit tremendously
+from narrative construction techniques, particularly in aiding analysts during
+the largely manual and costly process of synthesizing event information into
+comprehensive intelligence reports. Manual intelligence report generation is
+often prone to challenges such as integrating dynamic event information,
+writing fine-grained queries, and closing information gaps. This motivates the
+development of a system that retrieves and represents critical aspects of
+events in a form that aids in automatic generation of intelligence reports.
+  We introduce a Retrieval Augmented Generation (RAG) approach to augment
+prompting of an autoregressive decoder by retrieving structured information
+asserted in a knowledge graph to generate targeted information based on a
+narrative plot model. We apply our approach to the problem of neural
+intelligence report generation and introduce FABULA, framework to augment
+intelligence analysis workflows using RAG. An analyst can use FABULA to query
+an Event Plot Graph (EPG) to retrieve relevant event plot points, which can be
+used to augment prompting of a Large Language Model (LLM) during intelligence
+report generation. Our evaluation studies show that the plot points included in
+the generated intelligence reports have high semantic relevance, high
+coherency, and low data redundancy.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Interpreting Conversational Dense Retrieval by Rewriting-Enhanced
+  Inversion of Session Embedding <span class="chip">ACL 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2402.12774v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2402.12774v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Yiruo Cheng, Kelong Mao, Zhicheng Dou
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Conversational dense retrieval has shown to be effective in conversational
+search. However, a major limitation of conversational dense retrieval is their
+lack of interpretability, hindering intuitive understanding of model behaviors
+for targeted improvements. This paper presents CONVINV, a simple yet effective
+approach to shed light on interpretable conversational dense retrieval models.
+CONVINV transforms opaque conversational session embeddings into explicitly
+interpretable text while faithfully maintaining their original retrieval
+performance as much as possible. Such transformation is achieved by training a
+recently proposed Vec2Text model based on the ad-hoc query encoder, leveraging
+the fact that the session and query embeddings share the same space in existing
+conversational dense retrieval. To further enhance interpretability, we propose
+to incorporate external interpretable query rewrites into the transformation
+process. Extensive evaluations on three conversational search benchmarks
+demonstrate that CONVINV can yield more interpretable text and faithfully
+preserve original retrieval performance than baselines. Our work connects
+opaque session embeddings with transparent query rewriting, paving the way
+toward trustworthy conversational search.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted by ACL 2024. Repo: https://github.com/Ariya12138/ConvInv</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ RLTP: Reinforcement Learning to Pace for Delayed Impression Modeling in
+  Preloaded Ads <span class="chip">KDD 2023</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2302.02592v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2302.02592v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Penghui Wei, Yongqiang Chen, Shaoguo Liu, Liang Wang, Bo Zheng
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  To increase brand awareness, many advertisers conclude contracts with
+advertising platforms to purchase traffic and then deliver advertisements to
+target audiences. In a whole delivery period, advertisers usually desire a
+certain impression count for the ads, and they also expect that the delivery
+performance is as good as possible (e.g., obtaining high click-through rate).
+Advertising platforms employ pacing algorithms to satisfy the demands via
+adjusting the selection probabilities to traffic requests in real-time.
+However, the delivery procedure is also affected by the strategies from
+publishers, which cannot be controlled by advertising platforms. Preloading is
+a widely used strategy for many types of ads (e.g., video ads) to make sure
+that the response time for displaying after a traffic request is legitimate,
+which results in delayed impression phenomenon. Traditional pacing algorithms
+cannot handle the preloading nature well because they rely on immediate
+feedback signals, and may fail to guarantee the demands from advertisers.
+  In this paper, we focus on a new research problem of impression pacing for
+preloaded ads, and propose a Reinforcement Learning To Pace framework RLTP. It
+learns a pacing agent that sequentially produces selection probabilities in the
+whole delivery period. To jointly optimize the two objectives of impression
+count and delivery performance, RLTP employs tailored reward estimator to
+satisfy the guaranteed impression count, penalize the over-delivery and
+maximize the traffic value. Experiments on large-scale industrial datasets
+verify that RLTP outperforms baseline pacing algorithms by a large margin. We
+have deployed the RLTP framework online to our advertising platform, and
+results show that it achieves significant uplift to core metrics including
+delivery completion rate and click-through rate.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>KDD 2023 (Applied Data Science Track). The first two authors
+  contributed equally</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ A Two-Stage Adaptation of Large Language Models for Text Ranking <span class="chip">ACL 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2311.16720v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2311.16720v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Longhui Zhang, Yanzhao Zhang, Dingkun Long, Pengjun Xie, Meishan Zhang, Min Zhang
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Text ranking is a critical task in information retrieval. Recent advances in
+pre-trained language models (PLMs), especially large language models (LLMs),
+present new opportunities for applying them to text ranking. While supervised
+fine-tuning (SFT) with ranking data has been widely explored to better align
+PLMs with text ranking goals, previous studies have focused primarily on
+encoder-only and encoder-decoder PLMs. Research on leveraging decoder-only LLMs
+for text ranking remains scarce. An exception to this is RankLLaMA, which uses
+direct SFT to explore LLaMA's potential for text ranking. In this work, we
+propose a two-stage progressive paradigm to better adapt LLMs to text ranking.
+First, we conduct continual pre-training (CPT) of LLMs on a large
+weakly-supervised corpus. Second, we perform SFT, and propose an improved
+optimization strategy building upon RankLLaMA. Our experimental results on
+multiple benchmarks show that our approach outperforms previous methods in both
+in-domain and out-domain scenarios.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted to Findings of ACL 2024. Code and models available at
+  https://github.com/Alibaba-NLP/RankingGPT</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Large Language Models meet Collaborative Filtering: An Efficient
+  All-round LLM-based Recommender System <span class="chip">KDD 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2404.11343v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2404.11343v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Sein Kim, Hongseok Kang, Seungyoon Choi, Donghyun Kim, Minchul Yang, Chanyoung Park
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Collaborative filtering recommender systems (CF-RecSys) have shown successive
+results in enhancing the user experience on social media and e-commerce
+platforms. However, as CF-RecSys struggles under cold scenarios with sparse
+user-item interactions, recent strategies have focused on leveraging modality
+information of user/items (e.g., text or images) based on pre-trained modality
+encoders and Large Language Models (LLMs). Despite their effectiveness under
+cold scenarios, we observe that they underperform simple traditional
+collaborative filtering models under warm scenarios due to the lack of
+collaborative knowledge. In this work, we propose an efficient All-round
+LLM-based Recommender system, called A-LLMRec, that excels not only in the cold
+scenario but also in the warm scenario. Our main idea is to enable an LLM to
+directly leverage the collaborative knowledge contained in a pre-trained
+state-of-the-art CF-RecSys so that the emergent ability of the LLM as well as
+the high-quality user/item embeddings that are already trained by the
+state-of-the-art CF-RecSys can be jointly exploited. This approach yields two
+advantages: (1) model-agnostic, allowing for integration with various existing
+CF-RecSys, and (2) efficiency, eliminating the extensive fine-tuning typically
+required for LLM-based recommenders. Our extensive experiments on various
+real-world datasets demonstrate the superiority of A-LLMRec in various
+scenarios, including cold/warm, few-shot, cold user, and cross-domain
+scenarios. Beyond the recommendation task, we also show the potential of
+A-LLMRec in generating natural language outputs based on the understanding of
+the collaborative knowledge by performing a favorite genre prediction task. Our
+code is available at https://github.com/ghdtjr/A-LLMRec .
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>KDD 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ FedHCDR: Federated Cross-Domain Recommendation with Hypergraph Signal
+  Decoupling 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2403.02630v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2403.02630v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Hongyu Zhang, Dongyi Zheng, Lin Zhong, Xu Yang, Jiyuan Feng, Yunqing Feng, Qing Liao
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  In recent years, Cross-Domain Recommendation (CDR) has drawn significant
+attention, which utilizes user data from multiple domains to enhance the
+recommendation performance. However, current CDR methods require sharing user
+data across domains, thereby violating the General Data Protection Regulation
+(GDPR). Consequently, numerous approaches have been proposed for Federated
+Cross-Domain Recommendation (FedCDR). Nevertheless, the data heterogeneity
+across different domains inevitably influences the overall performance of
+federated learning. In this study, we propose FedHCDR, a novel Federated
+Cross-Domain Recommendation framework with Hypergraph signal decoupling.
+Specifically, to address the data heterogeneity across domains, we introduce an
+approach called hypergraph signal decoupling (HSD) to decouple the user
+features into domain-exclusive and domain-shared features. The approach employs
+high-pass and low-pass hypergraph filters to decouple domain-exclusive and
+domain-shared user representations, which are trained by the local-global
+bi-directional transfer algorithm. In addition, a hypergraph contrastive
+learning (HCL) module is devised to enhance the learning of domain-shared user
+relationship information by perturbing the user hypergraph. Extensive
+experiments conducted on three real-world scenarios demonstrate that FedHCDR
+outperforms existing baselines significantly.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>16 pages, 5 figures</span>
+                                        </div>
+                                </details>
+                            </article>
+                    </div>
+                </details>
+            </article>
+            <article>
+                <details>
+                    <Summary>
+                        Multimedia <span class="chip" style="font-size: 60%">4</span>
+                    </Summary>
+                    <div class="details-content">
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Arabic Handwritten Text for Person Biometric Identification: A Deep
+  Learning Approach 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.00409v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.00409v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Mazen Balat, Youssef Mohamed, Ahmed Heakl, Ahmed Zaky
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  This study thoroughly investigates how well deep learning models can
+recognize Arabic handwritten text for person biometric identification. It
+compares three advanced architectures -- ResNet50, MobileNetV2, and
+EfficientNetB7 -- using three widely recognized datasets: AHAWP, Khatt, and
+LAMIS-MSHD. Results show that EfficientNetB7 outperforms the others, achieving
+test accuracies of 98.57\%, 99.15\%, and 99.79\% on AHAWP, Khatt, and
+LAMIS-MSHD datasets, respectively. EfficientNetB7's exceptional performance is
+credited to its innovative techniques, including compound scaling, depth-wise
+separable convolutions, and squeeze-and-excitation blocks. These features allow
+the model to extract more abstract and distinctive features from handwritten
+text images. The study's findings hold significant implications for enhancing
+identity verification and authentication systems, highlighting the potential of
+deep learning in Arabic handwritten text recognition for person biometric
+identification.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>6 pages, 11 figures, 4 tables, International IEEE Conference on the
+  Intelligent Methods, Systems, and Applications (IMSA)</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ BeFA: A General Behavior-driven Feature Adapter for Multimedia
+  Recommendation 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.00323v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.00323v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Qile Fan, Penghang Yu, Zhiyi Tan, Bing-Kun Bao, Guanming Lu
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Multimedia recommender systems focus on utilizing behavioral information and
+content information to model user preferences. Typically, it employs
+pre-trained feature encoders to extract content features, then fuses them with
+behavioral features. However, pre-trained feature encoders often extract
+features from the entire content simultaneously, including excessive
+preference-irrelevant details. We speculate that it may result in the extracted
+features not containing sufficient features to accurately reflect user
+preferences. To verify our hypothesis, we introduce an attribution analysis
+method for visually and intuitively analyzing the content features. The results
+indicate that certain products' content features exhibit the issues of
+information drift}and information omission,reducing the expressive ability of
+features. Building upon this finding, we propose an effective and efficient
+general Behavior-driven Feature Adapter (BeFA) to tackle these issues. This
+adapter reconstructs the content feature with the guidance of behavioral
+information, enabling content features accurately reflecting user preferences.
+Extensive experiments demonstrate the effectiveness of the adapter across all
+multimedia recommendation methods. The code will be publicly available upon the
+paper's acceptance.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Frieren: Efficient Video-to-Audio Generation with Rectified Flow
+  Matching 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.00320v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.00320v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Yongqi Wang, Wenxiang Guo, Rongjie Huang, Jiawei Huang, Zehan Wang, Fuming You, Ruiqi Li, Zhou Zhao
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Video-to-audio (V2A) generation aims to synthesize content-matching audio
+from silent video, and it remains challenging to build V2A models with high
+generation quality, efficiency, and visual-audio temporal synchrony. We propose
+Frieren, a V2A model based on rectified flow matching. Frieren regresses the
+conditional transport vector field from noise to spectrogram latent with
+straight paths and conducts sampling by solving ODE, outperforming
+autoregressive and score-based models in terms of audio quality. By employing a
+non-autoregressive vector field estimator based on a feed-forward transformer
+and channel-level cross-modal feature fusion with strong temporal alignment,
+our model generates audio that is highly synchronized with the input video.
+Furthermore, through reflow and one-step distillation with guided vector field,
+our model can generate decent audio in a few, or even only one sampling step.
+Experiments indicate that Frieren achieves state-of-the-art performance in both
+generation quality and temporal alignment on VGGSound, with alignment accuracy
+reaching 97.22%, and 6.2% improvement in inception score over the strong
+diffusion-based baseline. Audio samples are available at
+http://frieren-v2a.github.io .
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ ITEACH-Net: Inverted Teacher-studEnt seArCH Network for Emotion
+  Recognition in Conversation 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2312.15583v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2312.15583v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Haiyang Sun, Zheng Lian, Chenglong Wang, Kang Chen, Licai Sun, Bin Liu, Jianhua Tao
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  There remain two critical challenges that hinder the development of ERC.
+Firstly, there is a lack of exploration into mining deeper insights from the
+data itself for conversational emotion tasks. Secondly, the systems exhibit
+vulnerability to random modality feature missing, which is a common occurrence
+in realistic settings. Focusing on these two key challenges, we propose a novel
+framework for incomplete multimodal learning in ERC, called "Inverted
+Teacher-studEnt seArCH Network (ITEACH-Net)." ITEACH-Net comprises two novel
+components: the Emotion Context Changing Encoder (ECCE) and the Inverted
+Teacher-Student (ITS) framework. Specifically, leveraging the tendency for
+emotional states to exhibit local stability within conversational contexts,
+ECCE captures these patterns and further perceives their evolution over time.
+Recognizing the varying challenges of handling incomplete versus complete data,
+ITS employs a teacher-student framework to decouple the respective
+computations. Subsequently, through Neural Architecture Search, the student
+model develops enhanced computational capabilities for handling incomplete data
+compared to the teacher model. During testing, we design a novel evaluation
+method, testing the model's performance under different missing rate conditions
+without altering the model weights. We conduct experiments on three benchmark
+ERC datasets, and the results demonstrate that our ITEACH-Net outperforms
+existing methods in incomplete multimodal ERC. We believe ITEACH-Net can
+inspire relevant research on the intrinsic nature of emotions within
+conversation scenarios and pave a more robust route for incomplete learning
+techniques. Codes will be made available.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                    </div>
+                </details>
+            </article>
+    </section>
+    <section class="day-container">
+        <div class="date">
+            <time datetime="2024-05-31T00:00:00Z">2024-05-31</time>
+        </div>
+            <article>
+                <details>
+                    <Summary>
+                        Computation and Language <span class="chip" style="font-size: 60%">109</span>
+                    </Summary>
+                    <div class="details-content">
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Video-MME: The First-Ever Comprehensive Evaluation Benchmark of
+  Multi-modal LLMs in Video Analysis 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.21075v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.21075v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Chaoyou Fu, Yuhan Dai, Yondong Luo, Lei Li, Shuhuai Ren, Renrui Zhang, Zihan Wang, Chenyu Zhou, Yunhang Shen, Mengdan Zhang, Peixian Chen, Yanwei Li, Shaohui Lin, Sirui Zhao, Ke Li, Tong Xu, Xiawu Zheng, Enhong Chen, Rongrong Ji, Xing Sun
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  In the quest for artificial general intelligence, Multi-modal Large Language
+Models (MLLMs) have emerged as a focal point in recent advancements. However,
+the predominant focus remains on developing their capabilities in static image
+understanding. The potential of MLLMs in processing sequential visual data is
+still insufficiently explored, highlighting the absence of a comprehensive,
+high-quality assessment of their performance. In this paper, we introduce
+Video-MME, the first-ever full-spectrum, Multi-Modal Evaluation benchmark of
+MLLMs in Video analysis. Our work distinguishes from existing benchmarks
+through four key features: 1) Diversity in video types, spanning 6 primary
+visual domains with 30 subfields to ensure broad scenario generalizability; 2)
+Duration in temporal dimension, encompassing both short-, medium-, and
+long-term videos, ranging from 11 seconds to 1 hour, for robust contextual
+dynamics; 3) Breadth in data modalities, integrating multi-modal inputs besides
+video frames, including subtitles and audios, to unveil the all-round
+capabilities of MLLMs; 4) Quality in annotations, utilizing rigorous manual
+labeling by expert annotators to facilitate precise and reliable model
+assessment. 900 videos with a total of 256 hours are manually selected and
+annotated by repeatedly viewing all the video content, resulting in 2,700
+question-answer pairs. With Video-MME, we extensively evaluate various
+state-of-the-art MLLMs, including GPT-4 series and Gemini 1.5 Pro, as well as
+open-source image models like InternVL-Chat-V1.5 and video models like
+LLaVA-NeXT-Video. Our experiments reveal that Gemini 1.5 Pro is the
+best-performing commercial model, significantly outperforming the open-source
+models. Our dataset along with these findings underscores the need for further
+improvements in handling longer sequences and multi-modal data. Project Page:
+https://video-mme.github.io
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Project Page: https://video-mme.github.io</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Generalization Beyond Data Imbalance: A Controlled Study on CLIP for
+  Transferable Insights 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.21070v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.21070v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Xin Wen, Bingchen Zhao, Yilun Chen, Jiangmiao Pang, Xiaojuan Qi
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Severe data imbalance naturally exists among web-scale vision-language
+datasets. Despite this, we find CLIP pre-trained thereupon exhibits notable
+robustness to the data imbalance compared to supervised learning, and
+demonstrates significant effectiveness in learning generalizable
+representations. With an aim to investigate the reasons behind this finding, we
+conduct controlled experiments to study various underlying factors, and reveal
+that CLIP's pretext task forms a dynamic classification problem wherein only a
+subset of classes is present in training. This isolates the bias from dominant
+classes and implicitly balances the learning signal. Furthermore, the
+robustness and discriminability of CLIP improve with more descriptive language
+supervision, larger data scale, and broader open-world concepts, which are
+inaccessible to supervised learning. Our study not only uncovers the mechanisms
+behind CLIP's generalizability beyond data imbalance but also provides
+transferable insights for the research community. The findings are validated in
+both supervised and self-supervised learning, enabling models trained on
+imbalanced data to achieve CLIP-level performance on diverse recognition tasks.
+Code will be available at: https://github.com/CVMI-Lab/clip-beyond-tail.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Code <span class="highlight-title">Pretrain</span>ing Improves Entity Tracking Abilities of Language Models 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.21068v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.21068v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Najoung Kim, Sebastian Schuster, Shubham Toshniwal
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Recent work has provided indirect evidence that pretraining language models
+on code improves the ability of models to track state changes of discourse
+entities expressed in natural language. In this work, we systematically test
+this claim by comparing pairs of language models on their entity tracking
+performance. Critically, the pairs consist of base models and models trained on
+top of these base models with additional code data. We extend this analysis to
+additionally examine the effect of math training, another highly structured
+data type, and alignment tuning, an important step for enhancing the usability
+of models. We find clear evidence that models additionally trained on large
+amounts of code outperform the base models. On the other hand, we find no
+consistent benefit of additional math training or alignment tuning across
+various model families.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Grammar-Aligned Decoding 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.21047v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.21047v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Kanghee Park, Jiayu Wang, Taylor Berg-Kirkpatrick, Nadia Polikarpova, Loris D'Antoni
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Large Language Models (LLMs) struggle with reliably generating highly
+structured outputs, such as program code, mathematical formulas, or well-formed
+markup. Constrained decoding approaches mitigate this problem by greedily
+restricting what tokens an LLM can output at each step to guarantee that the
+output matches a given constraint. Specifically, in grammar-constrained
+decoding (GCD), the LLM's output must follow a given grammar. In this paper we
+demonstrate that GCD techniques (and in general constrained decoding
+techniques) can distort the LLM's distribution, leading to outputs that are
+grammatical but appear with likelihoods that are not proportional to the ones
+given by the LLM, and so ultimately are low-quality. We call the problem of
+aligning sampling with a grammar constraint, grammar-aligned decoding (GAD),
+and propose adaptive sampling with approximate expected futures (ASAp), a
+decoding algorithm that guarantees the output to be grammatical while provably
+producing outputs that match the conditional probability of the LLM's
+distribution conditioned on the given grammar constraint. Our algorithm uses
+prior sample outputs to soundly overapproximate the future grammaticality of
+different output prefixes. Our evaluation on code generation and structured NLP
+tasks shows how ASAp often produces outputs with higher likelihood (according
+to the LLM's distribution) than existing GCD techniques, while still enforcing
+the desired grammatical constraints.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Exploratory Preference Optimization: Harnessing Implicit
+  Q*-Approximation for Sample-Efficient RLHF 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.21046v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.21046v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Tengyang Xie, Dylan J. Foster, Akshay Krishnamurthy, Corby Rosset, Ahmed Awadallah, Alexander Rakhlin
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Reinforcement learning from human feedback (RLHF) has emerged as a central
+tool for language model alignment. We consider online exploration in RLHF,
+which exploits interactive access to human or AI feedback by deliberately
+encouraging the model to produce diverse, maximally informative responses. By
+allowing RLHF to confidently stray from the pre-trained model, online
+exploration offers the possibility of novel, potentially super-human
+capabilities, but its full potential as a paradigm for language model training
+has yet to be realized, owing to computational and statistical bottlenecks in
+directly adapting existing reinforcement learning techniques. We propose a new
+algorithm for online exploration in RLHF, Exploratory Preference Optimization
+(XPO), which is simple and practical -- a one-line change to (online) Direct
+Preference Optimization (DPO; Rafailov et al., 2023) -- yet enjoys the
+strongest known provable guarantees and promising empirical performance. XPO
+augments the DPO objective with a novel and principled exploration bonus,
+empowering the algorithm to explore outside the support of the initial model
+and human feedback data. In theory, we show that XPO is provably
+sample-efficient and converges to a near-optimal language model policy under
+natural exploration conditions, irrespective of whether the initial model has
+good coverage. Our analysis, which builds on the observation that DPO
+implicitly performs a form of $Q^{\star}$-approximation (or, Bellman error
+minimization), combines previously disparate techniques from language modeling
+and theoretical reinforcement learning in a serendipitous fashion through the
+perspective of KL-regularized Markov decision processes. Empirically, we find
+that XPO is more sample-efficient than non-exploratory DPO variants in a
+preliminary evaluation.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Direct Alignment of Language Models via Quality-Aware Self-Refinement 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.21040v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.21040v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Runsheng Yu, Yong Wang, Xiaoqi Jiao, Youzhi Zhang, James T. Kwok
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Reinforcement Learning from Human Feedback (RLHF) has been commonly used to
+align the behaviors of Large Language Models (LLMs) with human preferences.
+Recently, a popular alternative is Direct Policy Optimization (DPO), which
+replaces an LLM-based reward model with the policy itself, thus obviating the
+need for extra memory and training time to learn the reward model. However, DPO
+does not consider the relative qualities of the positive and negative
+responses, and can lead to sub-optimal training outcomes. To alleviate this
+problem, we investigate the use of intrinsic knowledge within the on-the-fly
+fine-tuning LLM to obtain relative qualities and help to refine the loss
+function. Specifically, we leverage the knowledge of the LLM to design a
+refinement function to estimate the quality of both the positive and negative
+responses. We show that the constructed refinement function can help
+self-refine the loss function under mild assumptions. The refinement function
+is integrated into DPO and its variant Identity Policy Optimization (IPO).
+Experiments across various evaluators indicate that they can improve the
+performance of the fine-tuned models over DPO and IPO.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ LACIE: Listener-Aware Finetuning for Confidence Calibration in Large
+  Language Models 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.21028v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.21028v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Elias Stengel-Eskin, Peter Hase, Mohit Bansal
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  When answering questions, LLMs can convey not only an answer, but a level of
+confidence about the answer being correct. This includes explicit confidence
+markers (e.g. giving a numeric score) as well as implicit markers, like an
+authoritative tone or elaborating with additional knowledge. For LLMs to be
+trustworthy knowledge sources, the confidence they convey should match their
+actual expertise; however, most current models tend towards overconfidence. To
+calibrate both implicit and explicit confidence markers, we introduce a
+pragmatic, listener-aware finetuning method (LACIE) that models the listener,
+considering not only whether an answer is right, but whether it will be
+accepted by a listener. We cast calibration as preference optimization,
+creating data via a two-agent game, where a speaker model's outputs are judged
+by a simulated listener. We then finetune three LLMs (Mistral-7B, Llama3-8B,
+Llama3-70B) with LACIE, and show that the resulting models are better
+calibrated w.r.t. a simulated listener. Crucially, these trends transfer to
+human listeners, helping them correctly predict model correctness: we conduct a
+human evaluation where annotators accept or reject an LLM's answers, finding
+that training with LACIE results in 47% fewer incorrect answers being accepted
+while maintaining the same level of acceptance for correct answers.
+Furthermore, LACIE generalizes to another dataset, resulting in a large
+increase in truthfulness on TruthfulQA when trained on TriviaQA. Our analysis
+indicates that LACIE leads to a better confidence separation between correct
+and incorrect examples. Qualitatively, we find that a LACIE-trained model
+hedges more and implicitly signals certainty when it is correct by using an
+authoritative tone or including details. Finally, LACIE finetuning leads to an
+emergent increase in model abstention (e.g. saying "I don't know") for answers
+that are likely wrong.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>17 pages. Code: https://github.com/esteng/pragmatic_calibration</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ You Only Scan Once: Efficient Multi-dimension Sequential Modeling with
+  LightNet 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.21022v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.21022v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Zhen Qin, Yuxin Mao, Xuyang Shen, Dong Li, Jing Zhang, Yuchao Dai, Yiran Zhong
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Linear attention mechanisms have gained prominence in causal language models
+due to their linear computational complexity and enhanced speed. However, the
+inherent decay mechanism in linear attention presents challenges when applied
+to multi-dimensional sequence modeling tasks, such as image processing and
+multi-modal learning. In these scenarios, the utilization of sequential
+scanning to establish a global receptive field necessitates multiple scans for
+multi-dimensional data, thereby leading to inefficiencies. This paper
+identifies the inefficiency caused by a multiplicative linear recurrence and
+proposes an efficient alternative additive linear recurrence to avoid the
+issue, as it can handle multi-dimensional data within a single scan. We further
+develop an efficient multi-dimensional sequential modeling framework called
+LightNet based on the new recurrence. Moreover, we present two new
+multi-dimensional linear relative positional encoding methods, MD-TPE and
+MD-LRPE to enhance the model's ability to discern positional information in
+multi-dimensional scenarios. Our empirical evaluations across various tasks,
+including image classification, image generation, bidirectional language
+modeling, and autoregressive language modeling, demonstrate the efficacy of
+LightNet, showcasing its potential as a versatile and efficient solution for
+multi-dimensional sequential modeling.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Technical report. Yiran Zhong is the corresponding author. The code
+  is available at https://github.com/OpenNLPLab/LightNet</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Improved Techniques for Optimization-Based Jailbreaking on Large
+  Language Models 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.21018v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.21018v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Xiaojun Jia, Tianyu Pang, Chao Du, Yihao Huang, Jindong Gu, Yang Liu, Xiaochun Cao, Min Lin
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Large language models (LLMs) are being rapidly developed, and a key component
+of their widespread deployment is their safety-related alignment. Many
+red-teaming efforts aim to jailbreak LLMs, where among these efforts, the
+Greedy Coordinate Gradient (GCG) attack's success has led to a growing interest
+in the study of optimization-based jailbreaking techniques. Although GCG is a
+significant milestone, its attacking efficiency remains unsatisfactory. In this
+paper, we present several improved (empirical) techniques for
+optimization-based jailbreaks like GCG. We first observe that the single target
+template of "Sure" largely limits the attacking performance of GCG; given this,
+we propose to apply diverse target templates containing harmful self-suggestion
+and/or guidance to mislead LLMs. Besides, from the optimization aspects, we
+propose an automatic multi-coordinate updating strategy in GCG (i.e.,
+adaptively deciding how many tokens to replace in each step) to accelerate
+convergence, as well as tricks like easy-to-hard initialisation. Then, we
+combine these improved technologies to develop an efficient jailbreak method,
+dubbed $\mathcal{I}$-GCG. In our experiments, we evaluate on a series of
+benchmarks (such as NeurIPS 2023 Red Teaming Track). The results demonstrate
+that our improved techniques can help GCG outperform state-of-the-art
+jailbreaking attacks and achieve nearly 100% attack success rate. The code is
+released at https://github.com/jiaxiaojunQAQ/I-GCG.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Towards a Fluid computer 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20999v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20999v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Robert Cardona, Eva Miranda, Daniel Peralta-Salas
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  In 1991, Moore [20] raised a question about whether hydrodynamics is capable
+of performing computations. Similarly, in 2016, Tao [25] asked whether a
+mechanical system, including a fluid flow, can simulate a universal Turing
+machine. In this expository article, we review the construction in [8] of a
+"Fluid computer" in dimension 3 that combines techniques in symbolic dynamics
+with the connection between steady Euler flows and contact geometry unveiled by
+Etnyre and Ghrist. In addition, we argue that the metric that renders the
+vector field Beltrami cannot be critical in the Chern-Hamilton sense [9]. We
+also sketch the completely different construction for the Euclidean metric in
+$\mathbb R^3$ as given in [7]. These results reveal the existence of
+undecidable fluid particle paths. We conclude the article with a list of open
+problems.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>11 pages, 3 figures</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ CWRCzech: 100M Query-Document Czech Click <span class="highlight-title">Dataset</span> and Its Application to
+  Web Relevance Ranking <span class="chip">SIGIR 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20994v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20994v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Josef Vonášek, Milan Straka, Rostislav Krč, Lenka Lasoňová, Ekaterina Egorova, Jana Straková, Jakub Náplava
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  We present CWRCzech, Click Web Ranking dataset for Czech, a 100M
+query-document Czech click dataset for relevance ranking with user behavior
+data collected from search engine logs of Seznam.cz. To the best of our
+knowledge, CWRCzech is the largest click dataset with raw text published so
+far. It provides document positions in the search results as well as
+information about user behavior: 27.6M clicked documents and 10.8M dwell times.
+In addition, we also publish a manually annotated Czech test for the relevance
+task, containing nearly 50k query-document pairs, each annotated by at least 2
+annotators. Finally, we analyze how the user behavior data improve relevance
+ranking and show that models trained on data automatically harnessed at
+sufficient scale can surpass the performance of models trained on human
+annotated data. CWRCzech is published under an academic non-commercial license
+and is available to the research community at
+https://github.com/seznam/CWRCzech.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted to SIGIR 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ SaySelf: Teaching LLMs to Express Confidence with Self-Reflective
+  Rationales 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20974v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20974v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Tianyang Xu, Shujin Wu, Shizhe Diao, Xiaoze Liu, Xingyao Wang, Yangyi Chen, Jing Gao
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Large language models (LLMs) often generate inaccurate or fabricated
+information and generally fail to indicate their confidence, which limits their
+broader applications. Previous work elicits confidence from LLMs by direct or
+self-consistency prompting, or constructing specific datasets for supervised
+finetuning. The prompting-based approaches have inferior performance, and the
+training-based approaches are limited to binary or inaccurate group-level
+confidence estimates. In this work, we present the advanced SaySelf, a training
+framework that teaches LLMs to express more accurate fine-grained confidence
+estimates. In addition, beyond the confidence scores, SaySelf initiates the
+process of directing LLMs to produce self-reflective rationales that clearly
+identify gaps in their parametric knowledge and explain their uncertainty. This
+is achieved by using an LLM to automatically summarize the uncertainties in
+specific knowledge via natural language. The summarization is based on the
+analysis of the inconsistency in multiple sampled reasoning chains, and the
+resulting data is utilized for supervised fine-tuning. Moreover, we utilize
+reinforcement learning with a meticulously crafted reward function to calibrate
+the confidence estimates, motivating LLMs to deliver accurate, high-confidence
+predictions and to penalize overconfidence in erroneous outputs. Experimental
+results in both in-distribution and out-of-distribution datasets demonstrate
+the effectiveness of SaySelf in reducing the confidence calibration error and
+maintaining the task performance. We show that the generated self-reflective
+rationales are reasonable and can further contribute to the calibration. The
+code is made public at \url{https://github.com/xu1868/SaySelf}.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>The code is available at \url{https://github.com/xu1868/SaySelf}</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ LCQ: Low-Rank Codebook based Quantization for Large Language Models 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20973v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20973v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Wen-Pu Cai, Wu-Jun Li
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Large language models~(LLMs) have recently demonstrated promising performance
+in many tasks. However, the high storage and computational cost of LLMs has
+become a challenge for deploying LLMs. Weight quantization has been widely used
+for model compression, which can reduce both storage and computational cost.
+Most existing weight quantization methods for LLMs use a rank-one codebook for
+quantization, which results in substantial accuracy loss when the compression
+ratio is high. In this paper, we propose a novel weight quantization method,
+called low-rank codebook based quantization~(LCQ), for LLMs. LCQ adopts a
+low-rank codebook, the rank of which can be larger than one, for quantization.
+Experiments show that LCQ can achieve better accuracy than existing methods
+with a negligibly extra storage cost.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>10 pages, 5 figures</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Superlatives in Context: Explicit and Implicit Domain Restrictions for
+  Superlative Frames 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20967v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20967v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Valentina Pyatkin, Bonnie Webber, Ido Dagan, Reut Tsarfaty
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Superlatives are used to single out elements with a maximal/minimal property.
+Semantically, superlatives perform a set comparison: something (or some things)
+has the min/max property out of a set. As such, superlatives provide an ideal
+phenomenon for studying implicit phenomena and discourse restrictions. While
+this comparison set is often not explicitly defined, its (implicit)
+restrictions can be inferred from the discourse context the expression appears
+in. In this work we provide an extensive computational study on the semantics
+of superlatives. We propose a unified account of superlative semantics which
+allows us to derive a broad-coverage annotation schema. Using this unified
+schema we annotated a multi-domain dataset of superlatives and their semantic
+interpretations. We specifically focus on interpreting implicit or ambiguous
+superlative expressions, by analyzing how the discourse context restricts the
+set of interpretations. In a set of experiments we then analyze how well models
+perform at variations of predicting superlative semantics, with and without
+context. We show that the fine-grained semantics of superlatives in context can
+be challenging for contemporary models, including GPT-4.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>11 pages</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Large Language Models are Zero-Shot Next Location Predictors 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20962v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20962v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Ciro Beneduce, Bruno Lepri, Massimiliano Luca
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Predicting the locations an individual will visit in the future is crucial
+for solving many societal issues like disease diffusion and reduction of
+pollution among many others. The models designed to tackle next-location
+prediction, however, require a significant amount of individual-level
+information to be trained effectively. Such data may be scarce or even
+unavailable in some geographic regions or peculiar scenarios (e.g., cold-start
+in recommendation systems). Moreover, the design of a next-location predictor
+able to generalize or geographically transfer knowledge is still an open
+research challenge. Recent advances in natural language processing have led to
+a rapid diffusion of Large Language Models (LLMs) which have shown good
+generalization and reasoning capabilities. These insights, coupled with the
+recent findings that LLMs are rich in geographical knowledge, allowed us to
+believe that these models can act as zero-shot next-location predictors. This
+paper evaluates the capabilities of many popular LLMs in this role,
+specifically Llama, GPT-3.5 and Mistral 7B. After designing a proper prompt, we
+tested the models on three real-world mobility datasets. The results show that
+LLMs can obtain accuracies up to 32.4%, a significant relative improvement of
+over 600% when compared to sophisticated DL models specifically designed for
+human mobility. Moreover, we show that other LLMs are unable to perform the
+task properly. To prevent positively biased results, we also propose a
+framework inspired by other studies to test data contamination. Finally, we
+explored the possibility of using LLMs as text-based explainers for
+next-location prediction showing that can effectively provide an explanation
+for their decision. Notably, 7B models provide more generic, but still
+reliable, explanations compared to larger counterparts. Code:
+github.com/ssai-trento/LLM-zero-shot-NL
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ A Robot Walks into a Bar: Can Language Models Serve asCreativity Support
+  Tools for Comedy? An Evaluation of LLMs' Humour Alignment with Comedians 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20956v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20956v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Piotr Wojciech Mirowski, Juliette Love, Kory W. Mathewson, Shakir Mohamed
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  We interviewed twenty professional comedians who perform live shows in front
+of audiences and who use artificial intelligence in their artistic process as
+part of 3-hour workshops on ``AI x Comedy'' conducted at the Edinburgh Festival
+Fringe in August 2023 and online. The workshop consisted of a comedy writing
+session with large language models (LLMs), a human-computer interaction
+questionnaire to assess the Creativity Support Index of AI as a writing tool,
+and a focus group interrogating the comedians' motivations for and processes of
+using AI, as well as their ethical concerns about bias, censorship and
+copyright. Participants noted that existing moderation strategies used in
+safety filtering and instruction-tuned LLMs reinforced hegemonic viewpoints by
+erasing minority groups and their perspectives, and qualified this as a form of
+censorship. At the same time, most participants felt the LLMs did not succeed
+as a creativity support tool, by producing bland and biased comedy tropes, akin
+to ``cruise ship comedy material from the 1950s, but a bit less racist''. Our
+work extends scholarship about the subtle difference between, one the one hand,
+harmful speech, and on the other hand, ``offensive'' language as a practice of
+resistance, satire and ``punching up''. We also interrogate the global value
+alignment behind such language models, and discuss the importance of
+community-based value alignment and data ownership to build AI tools that
+better suit artists' needs.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>15 pages, 1 figure, published at ACM FAccT 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ OR-Bench: An Over-Refusal Benchmark for Large Language Models 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20947v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20947v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Justin Cui, Wei-Lin Chiang, Ion Stoica, Cho-Jui Hsieh
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Large Language Models (LLMs) require careful safety alignment to prevent
+malicious outputs. While significant research focuses on mitigating harmful
+content generation, the enhanced safety often come with the side effect of
+over-refusal, where the LLMs may reject innocuous prompts and become less
+helpful. Although the issue of over-refusal has been empirically observed, a
+systematic measurement is challenging due to the difficulty of crafting prompts
+that appear harmful but are benign. This study proposes a novel method for
+automatically generating large-scale sets of ``seemingly toxic prompts''
+(benign prompts likely rejected by LLMs). Leveraging this technique, we
+introduce OR-Bench, the first large-scale over-refusal benchmark. OR-Bench
+comprises 80,000 seemingly toxic prompts across 10 common rejection categories,
+a subset of around 1,000 hard prompts that are challenging even for
+state-of-the-art LLMs, and an additional 600 toxic prompts to prevent
+indiscriminate responses. We then conduct a comprehensive study to measure the
+over-refusal of 25 popular LLMs across 8 model families. Our datasets are
+available at https://huggingface.co/datasets/bench-llm/OR-Bench and the
+corresponding demo can be found at
+https://huggingface.co/spaces/bench-llm/or-bench. We hope this benchmark can
+help the community develop better safety aligned models.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>version 1</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Learning to Estimate System Specifications in Linear Temporal Logic
+  using <span class="highlight-title">Transformer</span>s and Mamba 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20917v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20917v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        İlker Işık, Ebru Aydin Gol, Ramazan Gokberk Cinbis
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Temporal logic is a framework for representing and reasoning about
+propositions that evolve over time. It is commonly used for specifying
+requirements in various domains, including hardware and software systems, as
+well as robotics. Specification mining or formula generation involves
+extracting temporal logic formulae from system traces and has numerous
+applications, such as detecting bugs and improving interpretability. Although
+there has been a surge of deep learning-based methods for temporal logic
+satisfiability checking in recent years, the specification mining literature
+has been lagging behind in adopting deep learning methods despite their many
+advantages, such as scalability. In this paper, we introduce autoregressive
+models that can generate linear temporal logic formulae from traces, towards
+addressing the specification mining problem. We propose multiple architectures
+for this task: transformer encoder-decoder, decoder-only transformer, and
+Mamba, which is an emerging alternative to transformer models. Additionally, we
+devise a metric for quantifying the distinctiveness of the generated formulae
+and a straightforward algorithm to enforce the syntax constraints. Our
+experiments show that the proposed architectures yield promising results,
+generating correct and distinct formulae at a fraction of the compute cost
+needed for the combinatorial baseline.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>20 pages, 15 figures</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Enhancing Vision Models for Text-Heavy Content Understanding and
+  Interaction 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20906v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20906v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Adithya TG, Adithya SK, Abhinav R Bharadwaj, Abhiram HA, Dr. Surabhi Narayan
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Interacting and understanding with text heavy visual content with multiple
+images is a major challenge for traditional vision models. This paper is on
+enhancing vision models' capability to comprehend or understand and learn from
+images containing a huge amount of textual information from the likes of
+textbooks and research papers which contain multiple images like graphs, etc
+and tables in them with different types of axes and scales. The approach
+involves dataset preprocessing, fine tuning which is by using instructional
+oriented data and evaluation. We also built a visual chat application
+integrating CLIP for image encoding and a model from the Massive Text Embedding
+Benchmark which is developed to consider both textual and visual inputs. An
+accuracy of 96.71% was obtained. The aim of the project is to increase and also
+enhance the advance vision models' capabilities in understanding complex visual
+textual data interconnected data, contributing to multimodal AI.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>5 pages, 4 figures (including 1 graph)</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Preemptive Answer "Attacks" on Chain-of-Thought Reasoning <span class="chip">ACL'24</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20902v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20902v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Rongwu Xu, Zehan Qi, Wei Xu
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Large language models (LLMs) showcase impressive reasoning capabilities when
+coupled with Chain-of-Thought (CoT) prompting. However, the robustness of this
+approach warrants further investigation. In this paper, we introduce a novel
+scenario termed preemptive answers, where the LLM obtains an answer before
+engaging in reasoning. This situation can arise inadvertently or induced by
+malicious users by prompt injection attacks. Experiments reveal that preemptive
+answers significantly impair the model's reasoning capability across various
+CoT methods and a broad spectrum of datasets. To bolster the robustness of
+reasoning, we propose two measures aimed at mitigating this issue to some
+extent.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted to ACL'24 (Findings). Camera-ready version</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Large Language Models: A New Approach for Privacy Policy Analysis at
+  Scale 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20900v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20900v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        David Rodriguez, Ian Yang, Jose M. Del Alamo, Norman Sadeh
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  The number and dynamic nature of web and mobile applications presents
+significant challenges for assessing their compliance with data protection
+laws. In this context, symbolic and statistical Natural Language Processing
+(NLP) techniques have been employed for the automated analysis of these
+systems' privacy policies. However, these techniques typically require
+labor-intensive and potentially error-prone manually annotated datasets for
+training and validation. This research proposes the application of Large
+Language Models (LLMs) as an alternative for effectively and efficiently
+extracting privacy practices from privacy policies at scale. Particularly, we
+leverage well-known LLMs such as ChatGPT and Llama 2, and offer guidance on the
+optimal design of prompts, parameters, and models, incorporating advanced
+strategies such as few-shot learning. We further illustrate its capability to
+detect detailed and varied privacy practices accurately. Using several renowned
+datasets in the domain as a benchmark, our evaluation validates its exceptional
+performance, achieving an F1 score exceeding 93%. Besides, it does so with
+reduced costs, faster processing times, and fewer technical knowledge
+requirements. Consequently, we advocate for LLM-based solutions as a sound
+alternative to traditional NLP techniques for the automated analysis of privacy
+policies at scale.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ A comparison of correspondence analysis with PMI-based word embedding
+  methods 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20895v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20895v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Qianqian Qi, David J. Hessen, Peter G. M. van der Heijden
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Popular word embedding methods such as GloVe and Word2Vec are related to the
+factorization of the pointwise mutual information (PMI) matrix. In this paper,
+we link correspondence analysis (CA) to the factorization of the PMI matrix. CA
+is a dimensionality reduction method that uses singular value decomposition
+(SVD), and we show that CA is mathematically close to the weighted
+factorization of the PMI matrix. In addition, we present variants of CA that
+turn out to be successful in the factorization of the word-context matrix, i.e.
+CA applied to a matrix where the entries undergo a square-root transformation
+(ROOT-CA) and a root-root transformation (ROOTROOT-CA). An empirical comparison
+among CA- and PMI-based methods shows that overall results of ROOT-CA and
+ROOTROOT-CA are slightly better than those of the PMI-based methods.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ clembench-2024: A Challenging, Dynamic, Complementary, Multilingual
+  Benchmark and Underlying Flexible Framework for LLMs as Multi-Action Agents 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20859v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20859v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Anne Beyer, Kranti Chalamalasetti, Sherzod Hakimov, Brielen Madureira, Philipp Sadler, David Schlangen
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  It has been established in recent work that Large Language Models (LLMs) can
+be prompted to "self-play" conversational games that probe certain capabilities
+(general instruction following, strategic goal orientation, language
+understanding abilities), where the resulting interactive game play can be
+automatically scored. In this paper, we take one of the proposed frameworks for
+setting up such game-play environments, and further test its usefulness as an
+evaluation instrument, along a number of dimensions: We show that it can easily
+keep up with new developments while avoiding data contamination, we show that
+the tests implemented within it are not yet saturated (human performance is
+substantially higher than that of even the best models), and we show that it
+lends itself to investigating additional questions, such as the impact of the
+prompting language on performance. We believe that the approach forms a good
+basis for making decisions on model choice for building applied interactive
+systems, and perhaps ultimately setting up a closed-loop development
+environment of system and simulated evaluator.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>under review</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Towards Spoken Language Understanding via Multi-level Multi-grained
+  Contrastive Learning 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20852v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20852v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Xuxin Cheng, Wanshi Xu, Zhihong Zhu, Hongxiang Li, Yuexian Zou
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Spoken language understanding (SLU) is a core task in task-oriented dialogue
+systems, which aims at understanding the user's current goal through
+constructing semantic frames. SLU usually consists of two subtasks, including
+intent detection and slot filling. Although there are some SLU frameworks joint
+modeling the two subtasks and achieving high performance, most of them still
+overlook the inherent relationships between intents and slots and fail to
+achieve mutual guidance between the two subtasks. To solve the problem, we
+propose a multi-level multi-grained SLU framework MMCL to apply contrastive
+learning at three levels, including utterance level, slot level, and word level
+to enable intent and slot to mutually guide each other. For the utterance
+level, our framework implements coarse granularity contrastive learning and
+fine granularity contrastive learning simultaneously. Besides, we also apply
+the self-distillation method to improve the robustness of the model.
+Experimental results and further analysis demonstrate that our proposed model
+achieves new state-of-the-art results on two public multi-intent SLU datasets,
+obtaining a 2.6 overall accuracy improvement on the MixATIS dataset compared to
+previous best models.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Improving Reward Models with Synthetic Critiques 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20850v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20850v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Zihuiwen Ye, Fraser Greenlee-Scott, Max Bartolo, Phil Blunsom, Jon Ander Campos, Matthias Gallé
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Reward models (RM) play a critical role in aligning language models through
+the process of reinforcement learning from human feedback. RMs are trained to
+predict a score reflecting human preference, which requires significant time
+and cost for human annotation. Additionally, RMs tend to quickly overfit on
+superficial features in the training set, hindering their generalization
+performance on unseen distributions. We propose a novel approach using
+synthetic natural language critiques generated by large language models to
+provide additional feedback, evaluating aspects such as instruction following,
+correctness, and style. This offers richer signals and more robust features for
+RMs to assess and score on. We demonstrate that high-quality critiques improve
+the performance and data efficiency of RMs initialized from different
+pretrained models. Conversely, we also show that low-quality critiques
+negatively impact performance. Furthermore, incorporating critiques enhances
+the interpretability and robustness of RM training.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Don't Buy it! Reassessing the Ad Understanding Abilities of Contrastive
+  Multimodal Models <span class="chip">ACL 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20846v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20846v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        A. Bavaresco, A. Testoni, R. Fernández
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Image-based advertisements are complex multimodal stimuli that often contain
+unusual visual elements and figurative language. Previous research on automatic
+ad understanding has reported impressive zero-shot accuracy of contrastive
+vision-and-language models (VLMs) on an ad-explanation retrieval task. Here, we
+examine the original task setup and show that contrastive VLMs can solve it by
+exploiting grounding heuristics. To control for this confound, we introduce
+TRADE, a new evaluation test set with adversarial grounded explanations. While
+these explanations look implausible to humans, we show that they "fool" four
+different contrastive VLMs. Our findings highlight the need for an improved
+operationalisation of automatic ad understanding that truly evaluates VLMs'
+multimodal reasoning abilities. We make our code and TRADE available at
+https://github.com/dmg-illc/trade .
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted to the main conference ACL 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Outliers and Calibration Sets have Diminishing Effect on Quantization of
+  Modern LLMs 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20835v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20835v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Davide Paglieri, Saurabh Dash, Tim Rocktäschel, Jack Parker-Holder
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Post-Training Quantization (PTQ) enhances the efficiency of Large Language
+Models (LLMs) by enabling faster operation and compatibility with more
+accessible hardware through reduced memory usage, at the cost of small
+performance drops. We explore the role of calibration sets in PTQ, specifically
+their effect on hidden activations in various notable open-source LLMs.
+Calibration sets are crucial for evaluating activation magnitudes and
+identifying outliers, which can distort the quantization range and negatively
+impact performance. Our analysis reveals a marked contrast in quantization
+effectiveness across models. The older OPT model, which much of the
+quantization literature is based on, shows significant performance
+deterioration and high susceptibility to outliers with varying calibration
+sets. In contrast, newer models like Llama-2 7B, Llama-3 8B, Command-R 35B, and
+Mistral 7B demonstrate strong robustness, with Mistral 7B showing near-immunity
+to outliers and stable activations. These findings suggest a shift in PTQ
+strategies might be needed. As advancements in pre-training methods reduce the
+relevance of outliers, there is an emerging need to reassess the fundamentals
+of current quantization literature. The emphasis should pivot towards
+optimizing inference speed, rather than primarily focusing on outlier
+preservation, to align with the evolving characteristics of state-of-the-art
+LLMs.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ That's Optional: A Contemporary Exploration of "that" Omission in
+  English Subordinate Clauses <span class="chip">ACL2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20833v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20833v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Ella Rabinovich
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  The Uniform Information Density (UID) hypothesis posits that speakers
+optimize the communicative properties of their utterances by avoiding spikes in
+information, thereby maintaining a relatively uniform information profile over
+time. This paper investigates the impact of UID principles on syntactic
+reduction, specifically focusing on the optional omission of the connector
+"that" in English subordinate clauses. Building upon previous research, we
+extend our investigation to a larger corpus of written English, utilize
+contemporary large language models (LLMs) and extend the information-uniformity
+principles by the notion of entropy, to estimate the UID manifestations in the
+usecase of syntactic reduction choices.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>ACL2024 (main conference), 8 pages</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Self-Augmented Preference Optimization: Off-Policy Paradigms for
+  Language Model Alignment 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20830v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20830v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Yueqin Yin, Zhendong Wang, Yujia Xie, Weizhu Chen, Mingyuan Zhou
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Traditional language model alignment methods, such as Direct Preference
+Optimization (DPO), are limited by their dependence on static, pre-collected
+paired preference data, which hampers their adaptability and practical
+applicability. To overcome this limitation, we introduce Self-Augmented
+Preference Optimization (SAPO), an effective and scalable training paradigm
+that does not require existing paired data. Building on the self-play concept,
+which autonomously generates negative responses, we further incorporate an
+off-policy learning pipeline to enhance data exploration and exploitation.
+Specifically, we employ an Exponential Moving Average (EMA) model in
+conjunction with a replay buffer to enable dynamic updates of response
+segments, effectively integrating real-time feedback with insights from
+historical data. Our comprehensive evaluations of the LLaMA3-8B and Mistral-7B
+models across benchmarks, including the Open LLM Leaderboard, IFEval,
+AlpacaEval 2.0, and MT-Bench, demonstrate that SAPO matches or surpasses
+established offline contrastive baselines, such as DPO and Odds Ratio
+Preference Optimization, and outperforms offline self-play methods like SPIN.
+Our code is available at https://github.com/yinyueqin/SAPO
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ An iterated learning model of language change that mixes supervised and
+  unsupervised learning 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20818v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20818v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Jack Bunyan, Seth Bullock, Conor Houghton
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  The iterated learning model is an agent-based model of language change in
+which language is transmitted from a tutor to a pupil which itself becomes a
+tutor to a new pupil, and so on. Languages that are stable, expressive, and
+compositional arise spontaneously as a consequence of a language transmission
+bottleneck. Previous models have implemented an agent's mapping from signals to
+meanings using an artificial neural network decoder, but have relied on an
+unrealistic and computationally expensive process of obversion to implement the
+associated encoder, mapping from meanings to signals. Here, a new model is
+presented in which both decoder and encoder are neural networks, trained
+separately through supervised learning, and trained together through
+unsupervised learning in the form of an autoencoder. This avoids the
+substantial computational burden entailed in obversion and introduces a mixture
+of supervised and unsupervised learning as observed during human development.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Multilingual Text Style Transfer: <span class="highlight-title">Dataset</span>s & Models for Indian Languages 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20805v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20805v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Sourabrata Mukherjee, Atul Kr. Ojha, Akanksha Bansal, Deepak Alok, John P. McCrae, Ondřej Dušek
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Text style transfer (TST) involves altering the linguistic style of a text
+while preserving its core content. This paper focuses on sentiment transfer, a
+vital TST subtask (Mukherjee et al., 2022a), across a spectrum of Indian
+languages: Hindi, Magahi, Malayalam, Marathi, Punjabi, Odia, Telugu, and Urdu,
+expanding upon previous work on English-Bangla sentiment transfer (Mukherjee et
+al., 2023). We introduce dedicated datasets of 1,000 positive and 1,000
+negative style-parallel sentences for each of these eight languages. We then
+evaluate the performance of various benchmark models categorized into parallel,
+non-parallel, cross-lingual, and shared learning approaches, including the
+Llama2 and GPT-3.5 large language models (LLMs). Our experiments highlight the
+significance of parallel data in TST and demonstrate the effectiveness of the
+Masked Style Filling (MSF) approach (Mukherjee et al., 2023) in non-parallel
+techniques. Moreover, cross-lingual and joint multilingual learning methods
+show promise, offering insights into selecting optimal models tailored to the
+specific language and task requirements. To the best of our knowledge, this
+work represents the first comprehensive exploration of the TST task as
+sentiment transfer across a diverse set of languages.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Ovis: Structural Embedding Alignment for Multimodal Large Language Model 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20797v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20797v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Shiyin Lu, Yang Li, Qing-Guo Chen, Zhao Xu, Weihua Luo, Kaifu Zhang, Han-Jia Ye
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Current Multimodal Large Language Models (MLLMs) typically integrate a
+pre-trained LLM with another pre-trained vision transformer through a
+connector, such as an MLP, endowing the LLM with visual capabilities. However,
+the misalignment between two embedding strategies in MLLMs -- the structural
+textual embeddings based on an embedding look-up table and the continuous
+embeddings generated directly by the vision encoder -- makes challenges for a
+more seamless fusion of visual and textual information. We propose Ovis, a
+novel MLLM architecture designed to structurally align visual and textual
+embeddings. Ovis integrates an additional learnable visual embedding table into
+the visual encoder's process. To capture rich visual semantics, each image
+patch indexes the visual embedding table multiple times, resulting in a final
+visual embedding that is a probabilistic combination of the indexed embeddings.
+This structural approach mirrors the method used for generating textual
+embeddings. Empirical evaluations on various multimodal benchmarks demonstrate
+that Ovis outperforms open-source MLLMs of similar parameter scales and even
+surpasses the proprietary model Qwen-VL-Plus overall. These results highlight
+the potential of Ovis' structured visual representation for advancing MLLM
+architectural design and promoting more effective multimodal learning. Both the
+source code and the training dataset of Ovis will be made publicly available.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Improving code-mixed hate detection by native sample mixing: A case
+  study for Hindi-English code-mixed scenario 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20755v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20755v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Debajyoti Mazumder, Aakash Kumar, Jasabanta Patro
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Hate detection has long been a challenging task for the NLP community. The
+task becomes complex in a code-mixed environment because the models must
+understand the context and the hate expressed through language alteration.
+Compared to the monolingual setup, we see very less work on code-mixed hate as
+large-scale annotated hate corpora are unavailable to make the study. To
+overcome this bottleneck, we propose using native language hate samples. We
+hypothesise that in the era of multilingual language models (MLMs), hate in
+code-mixed settings can be detected by majorly relying on the native language
+samples. Even though the NLP literature reports the effectiveness of MLMs on
+hate detection in many cross-lingual settings, their extensive evaluation in a
+code-mixed scenario is yet to be done. This paper attempts to fill this gap
+through rigorous empirical experiments. We considered the Hindi-English
+code-mixed setup as a case study as we have the linguistic expertise for the
+same. Some of the interesting observations we got are: (i) adding native hate
+samples in the code-mixed training set, even in small quantity, improved the
+performance of MLMs for code-mixed hate detection, (ii) MLMs trained with
+native samples alone observed to be detecting code-mixed hate to a large
+extent, (iii) The visualisation of attention scores revealed that, when native
+samples were included in training, MLMs could better focus on the hate emitting
+words in the code-mixed context, and (iv) finally, when hate is subjective or
+sarcastic, naively mixing native samples doesn't help much to detect code-mixed
+hate. We will release the data and code repository to reproduce the reported
+results.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Generated from XeLaTeX</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ FinGen: A <span class="highlight-title">Dataset</span> for Argument Generation in Finance 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20708v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20708v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Chung-Chi Chen, Hiroya Takamura, Ichiro Kobayashi, Yusuke Miyao
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Thinking about the future is one of the important activities that people do
+in daily life. Futurists also pay a lot of effort into figuring out possible
+scenarios for the future. We argue that the exploration of this direction is
+still in an early stage in the NLP research. To this end, we propose three
+argument generation tasks in the financial application scenario. Our
+experimental results show these tasks are still big challenges for
+representative generation models. Based on our empirical results, we further
+point out several unresolved issues and challenges in this research direction.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ It is Simple Sometimes: A Study On Improving Aspect-Based Sentiment
+  Analysis Performance <span class="chip">ACL</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20703v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20703v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Laura Cabello, Uchenna Akujuobi
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Aspect-Based Sentiment Analysis (ABSA) involves extracting opinions from
+textual data about specific entities and their corresponding aspects through
+various complementary subtasks. Several prior research has focused on
+developing ad hoc designs of varying complexities for these subtasks. In this
+paper, we present a generative framework extensible to any ABSA subtask. We
+build upon the instruction tuned model proposed by Scaria et al. (2023), who
+present an instruction-based model with task descriptions followed by
+in-context examples on ABSA subtasks. We propose PFInstruct, an extension to
+this instruction learning paradigm by appending an NLP-related task prefix to
+the task description. This simple approach leads to improved performance across
+all tested SemEval subtasks, surpassing previous state-of-the-art (SOTA) on the
+ATE subtask (Rest14) by +3.28 F1-score, and on the AOOE subtask by an average
+of +5.43 F1-score across SemEval datasets. Furthermore, we explore the impact
+of the prefix-enhanced prompt quality on the ABSA subtasks and find that even a
+noisy prefix enhances model performance compared to the baseline. Our method
+also achieves competitive results on a biomedical domain dataset (ERSA).
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted to ACL Findings 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Unveiling the Lexical Sensitivity of LLMs: Combinatorial Optimization
+  for <span class="highlight-title">Prompt</span> Enhancement 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20701v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20701v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Pengwei Zhan, Zhen Xu, Qian Tan, Jie Song, Ru Xie
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Large language models (LLMs) demonstrate exceptional instruct-following
+ability to complete various downstream tasks. Although this impressive ability
+makes LLMs flexible task solvers, their performance in solving tasks also
+heavily relies on instructions. In this paper, we reveal that LLMs are
+over-sensitive to lexical variations in task instructions, even when the
+variations are imperceptible to humans. By providing models with neighborhood
+instructions, which are closely situated in the latent representation space and
+differ by only one semantically similar word, the performance on downstream
+tasks can be vastly different. Following this property, we propose a black-box
+Combinatorial Optimization framework for Prompt Lexical Enhancement (COPLE).
+COPLE performs iterative lexical optimization according to the feedback from a
+batch of proxy tasks, using a search strategy related to word influence.
+Experiments show that even widely-used human-crafted prompts for current
+benchmarks suffer from the lexical sensitivity of models, and COPLE recovers
+the declined model ability in both instruct-following and solving downstream
+tasks.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Joint Embeddings for Graph Instruction Tuning 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20684v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20684v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Vlad Argatu, Aaron Haag, Oliver Lohse
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Large Language Models (LLMs) have achieved impressive performance in text
+understanding and have become an essential tool for building smart assistants.
+Originally focusing on text, they have been enhanced with multimodal
+capabilities in recent works that successfully built visual instruction
+following assistants. As far as the graph modality goes, however, no such
+assistants have yet been developed. Graph structures are complex in that they
+represent relation between different features and are permutation invariant.
+Moreover, representing them in purely textual form does not always lead to good
+LLM performance even for finetuned models. As a result, there is a need to
+develop a new method to integrate graphs in LLMs for general graph
+understanding. This work explores the integration of the graph modality in LLM
+for general graph instruction following tasks. It aims at producing a deep
+learning model that enhances an underlying LLM with graph embeddings and trains
+it to understand them and to produce, given an instruction, an answer grounded
+in the graph representation. The approach performs significantly better than a
+graph to text approach and remains consistent even for larger graphs.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Conference Preprint</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Unraveling and Mitigating Retriever Inconsistencies in
+  Retrieval-Augmented Large Language Models <span class="chip">ACL 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20680v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20680v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Mingda Li, Xinyu Li, Yifan Chen, Wenfeng Xuan, Weinan Zhang
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Although Retrieval-Augmented Large Language Models (RALMs) demonstrate their
+superiority in terms of factuality, they do not consistently outperform the
+original retrieval-free Language Models (LMs). Our experiments reveal that this
+example-level performance inconsistency exists not only between
+retrieval-augmented and retrieval-free LM but also among different retrievers.
+To understand this phenomenon, we investigate the degeneration behavior of
+RALMs and theoretically decompose it into four categories. Further analysis
+based on our decomposition reveals that the innate difference in knowledge
+sources and the unpredictable degeneration of the reader model contribute most
+to the inconsistency. Drawing from our analysis, we introduce Ensemble of
+Retrievers (EoR), a trainable framework that can adaptively retrieve from
+different knowledge sources and effectively decrease unpredictable reader
+errors. Our experiments on Open Domain Question Answering show that EoR
+substantially improves performance over the RALM with a single retriever by
+considerably reducing inconsistent behaviors.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>ACL 2024 (findings)</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Position Coupling: Leveraging Task Structure for Improved Length
+  Generalization of <span class="highlight-title">Transformer</span>s 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20671v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20671v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Hanseul Cho, Jaeyoung Cha, Pranjal Awasthi, Srinadh Bhojanapalli, Anupam Gupta, Chulhee Yun
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Even for simple arithmetic tasks like integer addition, it is challenging for
+Transformers to generalize to longer sequences than those encountered during
+training. To tackle this problem, we propose position coupling, a simple yet
+effective method that directly embeds the structure of the tasks into the
+positional encoding of a (decoder-only) Transformer. Taking a departure from
+the vanilla absolute position mechanism assigning unique position IDs to each
+of the tokens, we assign the same position IDs to two or more "relevant"
+tokens; for integer addition tasks, we regard digits of the same significance
+as in the same position. On the empirical side, we show that with the proposed
+position coupling, a small (1-layer) Transformer trained on 1 to 30-digit
+additions can generalize up to 200-digit additions (6.67x of the trained
+length). On the theoretical side, we prove that a 1-layer Transformer with
+coupled positions can solve the addition task involving exponentially many
+digits, whereas any 1-layer Transformer without positional information cannot
+entirely solve it. We also demonstrate that position coupling can be applied to
+other algorithmic tasks such as addition with multiple summands, Nx2
+multiplication, copy/reverse, and a two-dimensional task.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>73 pages, 20 figures, 90 tables</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ DORY: Deliberative <span class="highlight-title">Prompt</span> Recovery for LLM 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20657v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20657v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Lirong Gao, Ru Peng, Yiming Zhang, Junbo Zhao
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Prompt recovery in large language models (LLMs) is crucial for understanding
+how LLMs work and addressing concerns regarding privacy, copyright, etc. The
+trend towards inference-only APIs complicates this task by restricting access
+to essential outputs for recovery. To tackle this challenge, we extract
+prompt-related information from limited outputs and identify a strong(negative)
+correlation between output probability-based uncertainty and the success of
+prompt recovery. This finding led to the development of Deliberative PrOmpt
+RecoverY (DORY), our novel approach that leverages uncertainty to recover
+prompts accurately. DORY involves reconstructing drafts from outputs, refining
+these with hints, and filtering out noise based on uncertainty. Our evaluation
+across diverse LLMs and prompt benchmarks shows that DORY outperforms existing
+baselines, improving performance by approximately 10.82% and establishing a new
+state-of-the-art record in prompt recovery tasks. Significantly, DORY operates
+using a single LLM without any external resources or model, offering a
+cost-effective, user-friendly prompt recovery solution.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Passage-specific <span class="highlight-title">Prompt</span> Tuning for Passage Reranking in Question
+  Answering with Large Language Models <span class="chip">SIGIR24</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20654v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20654v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Xuyang Wu, Zhiyuan Peng, Sravanthi Rajanala, Hsin-Tai Wu, Yi Fang
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Effective passage retrieval and reranking methods have been widely utilized
+to identify suitable candidates in open-domain question answering tasks, recent
+studies have resorted to LLMs for reranking the retrieved passages by the
+log-likelihood of the question conditioned on each passage. Although these
+methods have demonstrated promising results, the performance is notably
+sensitive to the human-written prompt (or hard prompt), and fine-tuning LLMs
+can be computationally intensive and time-consuming. Furthermore, this approach
+limits the leverage of question-passage relevance pairs and passage-specific
+knowledge to enhance the ranking capabilities of LLMs. In this paper, we
+propose passage-specific prompt tuning for reranking in open-domain question
+answering (PSPT): a parameter-efficient method that fine-tunes learnable
+passage-specific soft prompts, incorporating passage-specific knowledge from a
+limited set of question-passage relevance pairs. The method involves ranking
+retrieved passages based on the log-likelihood of the model generating the
+question conditioned on each passage and the learned soft prompt. We conducted
+extensive experiments utilizing the Llama-2-chat-7B model across three publicly
+available open-domain question answering datasets and the results demonstrate
+the effectiveness of the proposed approach.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted at Gen-IR@SIGIR24</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Reward-based Input Construction for Cross-document Relation Extraction <span class="chip">ACL 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20649v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20649v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Byeonghu Na, Suhyeon Jo, Yeongmin Kim, Il-Chul Moon
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Relation extraction (RE) is a fundamental task in natural language
+processing, aiming to identify relations between target entities in text. While
+many RE methods are designed for a single sentence or document, cross-document
+RE has emerged to address relations across multiple long documents. Given the
+nature of long documents in cross-document RE, extracting document embeddings
+is challenging due to the length constraints of pre-trained language models.
+Therefore, we propose REward-based Input Construction (REIC), the first
+learning-based sentence selector for cross-document RE. REIC extracts sentences
+based on relational evidence, enabling the RE module to effectively infer
+relations. Since supervision of evidence sentences is generally unavailable, we
+train REIC using reinforcement learning with RE prediction scores as rewards.
+Experimental results demonstrate the superiority of our method over heuristic
+methods for different RE structures and backbones in cross-document RE. Our
+code is publicly available at https://github.com/aailabkaist/REIC.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted at ACL 2024 main conference</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Shotluck Holmes: A Family of Efficient Small-Scale Large Language Vision
+  Models For Video Captioning and Summarization 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20648v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20648v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Richard Luo, Austin Peng, Adithya Vasudev, Rishabh Jain
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Video is an increasingly prominent and information-dense medium, yet it poses
+substantial challenges for language models. A typical video consists of a
+sequence of shorter segments, or shots, that collectively form a coherent
+narrative. Each shot is analogous to a word in a sentence where multiple data
+streams of information (such as visual and auditory data) must be processed
+simultaneously. Comprehension of the entire video requires not only
+understanding the visual-audio information of each shot but also requires that
+the model links the ideas between each shot to generate a larger,
+all-encompassing story. Despite significant progress in the field, current
+works often overlook videos' more granular shot-by-shot semantic information.
+In this project, we propose a family of efficient large language vision models
+(LLVMs) to boost video summarization and captioning called Shotluck Holmes. By
+leveraging better pretraining and data collection strategies, we extend the
+abilities of existing small LLVMs from being able to understand a picture to
+being able to understand a sequence of frames. Specifically, we show that
+Shotluck Holmes achieves better performance than state-of-the-art results on
+the Shot2Story video captioning and summary task with significantly smaller and
+more computationally efficient models.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Large Language Models Enhanced Sequential Recommendation for Long-tail
+  User and Item 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20646v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20646v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Qidong Liu, Xian Wu, Xiangyu Zhao, Yejing Wang, Zijian Zhang, Feng Tian, Yefeng Zheng
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Sequential recommendation systems (SRS) serve the purpose of predicting
+users' subsequent preferences based on their past interactions and have been
+applied across various domains such as e-commerce and social networking
+platforms. However, practical SRS encounters challenges due to the fact that
+most users engage with only a limited number of items, while the majority of
+items are seldom consumed. These challenges, termed as the long-tail user and
+long-tail item dilemmas, often create obstacles for traditional SRS methods.
+Mitigating these challenges is crucial as they can significantly impact user
+satisfaction and business profitability. While some research endeavors have
+alleviated these issues, they still grapple with issues such as seesaw or noise
+stemming from the scarcity of interactions. The emergence of large language
+models (LLMs) presents a promising avenue to address these challenges from a
+semantic standpoint. In this study, we introduce the Large Language Models
+Enhancement framework for Sequential Recommendation (LLM-ESR), which leverages
+semantic embeddings from LLMs to enhance SRS performance without increasing
+computational overhead. To combat the long-tail item challenge, we propose a
+dual-view modeling approach that fuses semantic information from LLMs with
+collaborative signals from traditional SRS. To address the long-tail user
+challenge, we introduce a retrieval augmented self-distillation technique to
+refine user preference representations by incorporating richer interaction data
+from similar users. Through comprehensive experiments conducted on three
+authentic datasets using three widely used SRS models, our proposed enhancement
+framework demonstrates superior performance compared to existing methodologies.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ ToxVidLLM: A Multimodal LLM-based Framework for Toxicity Detection in
+  Code-Mixed Videos <span class="chip">ACL</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20628v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20628v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Krishanu Maity, A. S. Poornash, Sriparna Saha, Pushpak Bhattacharyya
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  In an era of rapidly evolving internet technology, the surge in multimodal
+content, including videos, has expanded the horizons of online communication.
+However, the detection of toxic content in this diverse landscape, particularly
+in low-resource code-mixed languages, remains a critical challenge. While
+substantial research has addressed toxic content detection in textual data, the
+realm of video content, especially in non-English languages, has been
+relatively underexplored. This paper addresses this research gap by introducing
+a benchmark dataset, the first of its kind, consisting of 931 videos with 4021
+code-mixed Hindi-English utterances collected from YouTube. Each utterance
+within this dataset has been meticulously annotated for toxicity, severity, and
+sentiment labels. We have developed an advanced Multimodal Multitask framework
+built for Toxicity detection in Video Content by leveraging Large Language
+Models (LLMs), crafted for the primary objective along with the additional
+tasks of conducting sentiment and severity analysis. ToxVidLLM incorporates
+three key modules the Encoder module, Cross-Modal Synchronization module, and
+Multitask module crafting a generic multimodal LLM customized for intricate
+video classification tasks. Our experiments reveal that incorporating multiple
+modalities from the videos substantially enhances the performance of toxic
+content detection by achieving an Accuracy and Weighted F1 score of 94.29% and
+94.35%, respectively.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>ACL Findings 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Leveraging Large Language Models for Entity Matching 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20624v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20624v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Qianyu Huang, Tongfang Zhao
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Entity matching (EM) is a critical task in data integration, aiming to
+identify records across different datasets that refer to the same real-world
+entities. Traditional methods often rely on manually engineered features and
+rule-based systems, which struggle with diverse and unstructured data. The
+emergence of Large Language Models (LLMs) such as GPT-4 offers transformative
+potential for EM, leveraging their advanced semantic understanding and
+contextual capabilities. This vision paper explores the application of LLMs to
+EM, discussing their advantages, challenges, and future research directions.
+Additionally, we review related work on applying weak supervision and
+unsupervised approaches to EM, highlighting how LLMs can enhance these methods.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ FineRadScore: A Radiology Report Line-by-Line Evaluation Technique
+  Generating Corrections with Severity Scores 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20613v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20613v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Alyssa Huang, Oishi Banerjee, Kay Wu, Eduardo Pontes Reis, Pranav Rajpurkar
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  The current gold standard for evaluating generated chest x-ray (CXR) reports
+is through radiologist annotations. However, this process can be extremely
+time-consuming and costly, especially when evaluating large numbers of reports.
+In this work, we present FineRadScore, a Large Language Model (LLM)-based
+automated evaluation metric for generated CXR reports. Given a candidate report
+and a ground-truth report, FineRadScore gives the minimum number of
+line-by-line corrections required to go from the candidate to the ground-truth
+report. Additionally, FineRadScore provides an error severity rating with each
+correction and generates comments explaining why the correction was needed. We
+demonstrate that FineRadScore's corrections and error severity scores align
+with radiologist opinions. We also show that, when used to judge the quality of
+the report as a whole, FineRadScore aligns with radiologists as well as current
+state-of-the-art automated CXR evaluation metrics. Finally, we analyze
+FineRadScore's shortcomings to provide suggestions for future improvements.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ UniBias: Unveiling and Mitigating LLM Bias through Internal Attention
+  and FFN Manipulation 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20612v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20612v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Hanzhang Zhou, Zijian Feng, Zixiao Zhu, Junlang Qian, Kezhi Mao
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Large language models (LLMs) have demonstrated impressive capabilities in
+various tasks using the in-context learning (ICL) paradigm. However, their
+effectiveness is often compromised by inherent bias, leading to prompt
+brittleness, i.e., sensitivity to design settings such as example selection,
+order, and prompt formatting. Previous studies have addressed LLM bias through
+external adjustment of model outputs, but the internal mechanisms that lead to
+such bias remain unexplored. Our work delves into these mechanisms,
+particularly investigating how feedforward neural networks (FFNs) and attention
+heads result in the bias of LLMs. By Interpreting the contribution of
+individual FFN vectors and attention heads, we identify the biased LLM
+components that skew LLMs' prediction toward specific labels. To mitigate these
+biases, we introduce UniBias, an inference-only method that effectively
+identifies and eliminates biased FFN vectors and attention heads. Extensive
+experiments across 12 NLP datasets demonstrate that UniBias significantly
+enhances ICL performance and alleviates prompt brittleness of LLMs.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Bi-Directional <span class="highlight-title">Transformer</span>s vs. word2vec: Discovering Vulnerabilities in
+  Lifted Compiled Code 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20611v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20611v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Gary A. McCully, John D. Hastings, Shengjie Xu, Adam Fortier
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Detecting vulnerabilities within compiled binaries is challenging due to lost
+high-level code structures and other factors such as architectural
+dependencies, compilers, and optimization options. To address these obstacles,
+this research explores vulnerability detection by using natural language
+processing (NLP) embedding techniques with word2vec, BERT, and RoBERTa to learn
+semantics from intermediate representation (LLVM) code. Long short-term memory
+(LSTM) neural networks were trained on embeddings from encoders created using
+approximately 118k LLVM functions from the Juliet dataset. This study is
+pioneering in its comparison of word2vec models with multiple bidirectional
+transformer (BERT, RoBERTa) embeddings built using LLVM code to train neural
+networks to detect vulnerabilities in compiled binaries. word2vec Continuous
+Bag of Words (CBOW) models achieved 92.3% validation accuracy in detecting
+vulnerabilities, outperforming word2vec Skip-Gram, BERT, and RoBERTa. This
+suggests that complex contextual NLP embeddings may not provide advantages over
+simpler word2vec models for this task when a limited number (e.g. 118K) of data
+samples are used to train the bidirectional transformer-based models. The
+comparative results provide novel insights into selecting optimal embeddings
+for learning compiler-independent semantic code representations to advance
+machine learning detection of vulnerabilities in compiled binaries.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>8 pages, 0 figures, IEEE 4th Cyber Awareness and Research Symposium
+  2024 (CARS'24)</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Identifying while Learning for Document Event Causality Identification <span class="chip">ACL 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20608v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20608v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Cheng Liu, Wei Xiang, Bang Wang
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Event Causality Identification (ECI) aims to detect whether there exists a
+causal relation between two events in a document. Existing studies adopt a kind
+of identifying after learning paradigm, where events' representations are first
+learned and then used for the identification. Furthermore, they mainly focus on
+the causality existence, but ignoring causal direction. In this paper, we take
+care of the causal direction and propose a new identifying while learning mode
+for the ECI task. We argue that a few causal relations can be easily identified
+with high confidence, and the directionality and structure of these identified
+causalities can be utilized to update events' representations for boosting next
+round of causality identification. To this end, this paper designs an
+*iterative learning and identifying framework*: In each iteration, we construct
+an event causality graph, on which events' causal structure representations are
+updated for boosting causal identification. Experiments on two public datasets
+show that our approach outperforms the state-of-the-art algorithms in both
+evaluations for causality existence identification and direction
+identification.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted at ACL 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Masked Language Modeling Becomes Conditional Density Estimation for
+  Tabular Data Synthesis 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20602v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20602v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Seunghwan An, Gyeongdong Woo, Jaesung Lim, ChangHyun Kim, Sungchul Hong, Jong-June Jeon
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  In this paper, our goal is to generate synthetic data for heterogeneous
+(mixed-type) tabular datasets with high machine learning utility (MLu). Given
+that the MLu performance relies on accurately approximating the conditional
+distributions, we focus on devising a synthetic data generation method based on
+conditional distribution estimation. We propose a novel synthetic data
+generation method, MaCoDE, by redefining the multi-class classification task of
+Masked Language Modeling (MLM) as histogram-based non-parametric conditional
+density estimation. Our proposed method enables estimating conditional
+densities across arbitrary combinations of target and conditional variables.
+Furthermore, we demonstrate that our proposed method bridges the theoretical
+gap between distributional learning and MLM. To validate the effectiveness of
+our proposed model, we conduct synthetic data generation experiments on 10
+real-world datasets. Given the analogy between predicting masked input tokens
+in MLM and missing data imputation, we also evaluate the performance of
+multiple imputations on incomplete datasets with various missing data
+mechanisms. Moreover, our proposed model offers the advantage of enabling
+adjustments to data privacy levels without requiring re-training.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ DAFNet: Dynamic Auxiliary Fusion for Sequential Model Editing in Large
+  Language Models <span class="chip">ACL2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20588v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20588v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Taolin Zhang, Qizhou Chen, Dongyang Li, Chengyu Wang, Xiaofeng He, Longtao Huang, Hui Xue, Jun Huang
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Recently, while large language models (LLMs) have demonstrated impressive
+results, they still suffer from hallucination, i.e., the generation of false
+information. Model editing is the task of fixing factual mistakes in LLMs; yet,
+most previous works treat it as a one-time task, paying little attention to
+ever-emerging mistakes generated by LLMs. We address the task of sequential
+model editing (SME) that aims to rectify mistakes continuously. A Dynamic
+Auxiliary Fusion Network (DAFNet) is designed to enhance the semantic
+interaction among the factual knowledge within the entire sequence, preventing
+catastrophic forgetting during the editing process of multiple knowledge
+triples. Specifically, (1) for semantic fusion within a relation triple, we
+aggregate the intra-editing attention flow into auto-regressive self-attention
+with token-level granularity in LLMs. We further leverage multi-layer diagonal
+inter-editing attention flow to update the weighted representations of the
+entire sequence-level granularity. (2) Considering that auxiliary parameters
+are required to store the knowledge for sequential editing, we construct a new
+dataset named \textbf{DAFSet}, fulfilling recent, popular, long-tail and robust
+properties to enhance the generality of sequential editing. Experiments show
+DAFNet significantly outperforms strong baselines in single-turn and sequential
+editing. The usage of DAFSet also consistently improves the performance of
+other auxiliary network-based methods in various scenarios
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>ACL2024 findings</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ GAMedX: Generative AI-based Medical Entity Data Extractor Using Large
+  Language Models 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20585v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20585v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Mohammed-Khalil Ghali, Abdelrahman Farrag, Hajar Sakai, Hicham El Baz, Yu Jin, Sarah Lam
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  In the rapidly evolving field of healthcare and beyond, the integration of
+generative AI in Electronic Health Records (EHRs) represents a pivotal
+advancement, addressing a critical gap in current information extraction
+techniques. This paper introduces GAMedX, a Named Entity Recognition (NER)
+approach utilizing Large Language Models (LLMs) to efficiently extract entities
+from medical narratives and unstructured text generated throughout various
+phases of the patient hospital visit. By addressing the significant challenge
+of processing unstructured medical text, GAMedX leverages the capabilities of
+generative AI and LLMs for improved data extraction. Employing a unified
+approach, the methodology integrates open-source LLMs for NER, utilizing
+chained prompts and Pydantic schemas for structured output to navigate the
+complexities of specialized medical jargon. The findings reveal significant
+ROUGE F1 score on one of the evaluation datasets with an accuracy of 98\%. This
+innovation enhances entity extraction, offering a scalable, cost-effective
+solution for automated forms filling from unstructured data. As a result,
+GAMedX streamlines the processing of unstructured narratives, and sets a new
+standard in NER applications, contributing significantly to theoretical and
+practical advancements beyond the medical technology sphere.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ The Point of View of a Sentiment: Towards Clinician Bias Detection in
+  Psychiatric Notes <span class="chip">NAACL 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20582v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20582v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Alissa A. Valentine, Lauren A. Lepow, Alexander W. Charney, Isotta Landi
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  In psychiatry, negative patient descriptions and stigmatizing language can
+contribute to healthcare disparities in two ways: (1) read by patients they can
+harm their trust and engagement with the medical center; (2) read by future
+providers they may negatively influence the future perspective of a patient. By
+leveraging large language models, this work aims to identify the sentiment
+expressed in psychiatric clinical notes based on the reader's point of view.
+Extracting sentences from the Mount Sinai Health System's large and diverse
+clinical notes, we used prompts and in-context learning to adapt three large
+language models (GPT-3.5, Llama 2, Mistral) to classify the sentiment conveyed
+by the sentences according to the provider or non-provider point of view.
+Results showed that GPT-3.5 aligns best to provider point of view, whereas
+Mistral aligns best to non-provider point of view.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Oral presentation at NAACL 2024 Queer in AI Workshop</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Open Ko-LLM Leaderboard: Evaluating Large Language Models in Korean with
+  Ko-H5 Benchmark <span class="chip">ACL 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20574v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20574v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Chanjun Park, Hyeonwoo Kim, Dahyun Kim, Seonghwan Cho, Sanghoon Kim, Sukyung Lee, Yungi Kim, Hwalsuk Lee
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  This paper introduces the Open Ko-LLM Leaderboard and the Ko-H5 Benchmark as
+vital tools for evaluating Large Language Models (LLMs) in Korean.
+Incorporating private test sets while mirroring the English Open LLM
+Leaderboard, we establish a robust evaluation framework that has been well
+integrated in the Korean LLM community. We perform data leakage analysis that
+shows the benefit of private test sets along with a correlation study within
+the Ko-H5 benchmark and temporal analyses of the Ko-H5 score. Moreover, we
+present empirical support for the need to expand beyond set benchmarks. We hope
+the Open Ko-LLM Leaderboard sets precedent for expanding LLM evaluation to
+foster more linguistic diversity.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted at ACL 2024 Main</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Generalization or Memorization: Data Contamination and Trustworthy
+  Evaluation for Large Language Models <span class="chip">ACL</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2402.15938v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2402.15938v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Yihong Dong, Xue Jiang, Huanyu Liu, Zhi Jin, Bin Gu, Mengfei Yang, Ge Li
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Recent statements about the impressive capabilities of large language models
+(LLMs) are usually supported by evaluating on open-access benchmarks.
+Considering the vast size and wide-ranging sources of LLMs' training data, it
+could explicitly or implicitly include test data, leading to LLMs being more
+susceptible to data contamination. However, due to the opacity of training
+data, the black-box access of models, and the rapid growth of synthetic
+training data, detecting and mitigating data contamination for LLMs faces
+significant challenges. In this paper, we propose CDD, which stands for
+Contamination Detection via output Distribution for LLMs. CDD necessitates only
+the sampled texts to detect data contamination, by identifying the peakedness
+of LLM's output distribution. To mitigate the impact of data contamination in
+evaluation, we also present TED: Trustworthy Evaluation via output
+Distribution, based on the correction of LLM's output distribution. To
+facilitate this study, we introduce two benchmarks, i.e., DetCon and ComiEval,
+for data contamination detection and contamination mitigation evaluation tasks.
+Extensive experimental results show that CDD achieves the average relative
+improvements of 21.8\%-30.2\% over other contamination detection approaches in
+terms of Accuracy, F1 Score, and AUC metrics, and can effectively detect
+implicit contamination. TED substantially mitigates performance improvements up
+to 66.9\% attributed to data contamination across various contamination setups.
+In real-world applications, we reveal that ChatGPT exhibits a high potential to
+suffer from data contamination on HumanEval benchmark.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted to ACL</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ SpeechVerse: A Large-scale Generalizable Audio Language Model 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.08295v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.08295v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Nilaksh Das, Saket Dingliwal, Srikanth Ronanki, Rohit Paturi, Zhaocheng Huang, Prashant Mathur, Jie Yuan, Dhanush Bekal, Xing Niu, Sai Muralidhar Jayanthi, Xilai Li, Karel Mundnich, Monica Sunkara, Sundararajan Srinivasan, Kyu J Han, Katrin Kirchhoff
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Large language models (LLMs) have shown incredible proficiency in performing
+tasks that require semantic understanding of natural language instructions.
+Recently, many works have further expanded this capability to perceive
+multimodal audio and text inputs, but their capabilities are often limited to
+specific fine-tuned tasks such as automatic speech recognition and translation.
+We therefore develop SpeechVerse, a robust multi-task training and curriculum
+learning framework that combines pre-trained speech and text foundation models
+via a small set of learnable parameters, while keeping the pre-trained models
+frozen during training. The models are instruction finetuned using continuous
+latent representations extracted from the speech foundation model to achieve
+optimal zero-shot performance on a diverse range of speech processing tasks
+using natural language instructions. We perform extensive benchmarking that
+includes comparing our model performance against traditional baselines across
+several datasets and tasks. Furthermore, we evaluate the model's capability for
+generalized instruction following by testing on out-of-domain datasets, novel
+prompts, and unseen tasks. Our empirical experiments reveal that our multi-task
+SpeechVerse model is even superior to conventional task-specific baselines on 9
+out of the 11 tasks.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Single Column, 13 page</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ SOUL: Unlocking the Power of Second-Order Optimization for LLM
+  Unlearning 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2404.18239v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2404.18239v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Jinghan Jia, Yihua Zhang, Yimeng Zhang, Jiancheng Liu, Bharat Runwal, James Diffenderfer, Bhavya Kailkhura, Sijia Liu
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Large Language Models (LLMs) have highlighted the necessity of effective
+unlearning mechanisms to comply with data regulations and ethical AI practices.
+LLM unlearning aims at removing undesired data influences and associated model
+capabilities without compromising utility out of the scope of unlearning. While
+interest in studying LLM unlearning is growing,the impact of the optimizer
+choice for LLM unlearning remains under-explored. In this work, we shed light
+on the significance of optimizer selection in LLM unlearning for the first
+time, establishing a clear connection between {second-order optimization} and
+influence unlearning (a classical approach using influence functions to update
+the model for data influence removal). This insight propels us to develop a
+second-order unlearning framework, termed SOUL, built upon the second-order
+clipped stochastic optimization (Sophia)-based LLM training method. SOUL
+extends the static, one-shot model update using influence unlearning to a
+dynamic, iterative unlearning process. Our extensive experiments show that SOUL
+consistently outperforms conventional first-order methods across various
+unlearning tasks, models, and metrics, suggesting the promise of second-order
+optimization in providing a scalable and easily implementable solution for LLM
+unlearning.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ API Pack: A Massive Multi-Programming Language <span class="highlight-title">Dataset</span> for API Call
+  Generation 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2402.09615v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2402.09615v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Zhen Guo, Adriana Meza Soria, Wei Sun, Yikang Shen, Rameswar Panda
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  We introduce API Pack, a massive multi-programming language dataset
+containing more than 1 million instruction-API call pairs to improve the API
+call generation capabilities of large language models. By fine-tuning
+CodeLlama-13B on 20,000 Python instances from API Pack, we achieved around 10%
+and 5% higher accuracy compared to GPT-3.5 and GPT-4, respectively, in
+generating unseen API calls. Fine-tuning on API Pack enables cross-programming
+language generalization by leveraging a large amount of data in one language
+and small amounts of data from other languages. Scaling the training data to 1
+million instances further improves the model's generalization to new APIs not
+encountered during training. We open-source the API Pack dataset, trained
+models, and associated source code at https://github.com/zguo0525/API-Pack to
+facilitate further research.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ II-MMR: Identifying and Improving Multi-modal Multi-hop Reasoning in
+  Visual Question Answering <span class="chip">ACL 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2402.11058v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2402.11058v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Jihyung Kil, Farideh Tavazoee, Dongyeop Kang, Joo-Kyung Kim
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Visual Question Answering (VQA) often involves diverse reasoning scenarios
+across Vision and Language (V&L). Most prior VQA studies, however, have merely
+focused on assessing the model's overall accuracy without evaluating it on
+different reasoning cases. Furthermore, some recent works observe that
+conventional Chain-of-Thought (CoT) prompting fails to generate effective
+reasoning for VQA, especially for complex scenarios requiring multi-hop
+reasoning. In this paper, we propose II-MMR, a novel idea to identify and
+improve multi-modal multi-hop reasoning in VQA. In specific, II-MMR takes a VQA
+question with an image and finds a reasoning path to reach its answer using two
+novel language promptings: (i) answer prediction-guided CoT prompt, or (ii)
+knowledge triplet-guided prompt. II-MMR then analyzes this path to identify
+different reasoning cases in current VQA benchmarks by estimating how many hops
+and what types (i.e., visual or beyond-visual) of reasoning are required to
+answer the question. On popular benchmarks including GQA and A-OKVQA, II-MMR
+observes that most of their VQA questions are easy to answer, simply demanding
+"single-hop" reasoning, whereas only a few questions require "multi-hop"
+reasoning. Moreover, while the recent V&L model struggles with such complex
+multi-hop reasoning questions even using the traditional CoT method, II-MMR
+shows its effectiveness across all reasoning cases in both zero-shot and
+fine-tuning settings.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted to ACL 2024 Findings</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ TrojanRAG: Retrieval-Augmented Generation Can Be Backdoor Driver in
+  Large Language Models 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.13401v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.13401v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Pengzhou Cheng, Yidong Ding, Tianjie Ju, Zongru Wu, Wei Du, Ping Yi, Zhuosheng Zhang, Gongshen Liu
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Large language models (LLMs) have raised concerns about potential security
+threats despite performing significantly in Natural Language Processing (NLP).
+Backdoor attacks initially verified that LLM is doing substantial harm at all
+stages, but the cost and robustness have been criticized. Attacking LLMs is
+inherently risky in security review, while prohibitively expensive. Besides,
+the continuous iteration of LLMs will degrade the robustness of backdoors. In
+this paper, we propose TrojanRAG, which employs a joint backdoor attack in the
+Retrieval-Augmented Generation, thereby manipulating LLMs in universal attack
+scenarios. Specifically, the adversary constructs elaborate target contexts and
+trigger sets. Multiple pairs of backdoor shortcuts are orthogonally optimized
+by contrastive learning, thus constraining the triggering conditions to a
+parameter subspace to improve the matching. To improve the recall of the RAG
+for the target contexts, we introduce a knowledge graph to construct structured
+data to achieve hard matching at a fine-grained level. Moreover, we normalize
+the backdoor scenarios in LLMs to analyze the real harm caused by backdoors
+from both attackers' and users' perspectives and further verify whether the
+context is a favorable tool for jailbreaking models. Extensive experimental
+results on truthfulness, language understanding, and harmfulness show that
+TrojanRAG exhibits versatility threats while maintaining retrieval capabilities
+on normal queries.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>19 pages, 14 figures, 4 tables</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Calibrated Self-Rewarding Vision Language Models 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.14622v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.14622v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Yiyang Zhou, Zhiyuan Fan, Dongjie Cheng, Sihan Yang, Zhaorun Chen, Chenhang Cui, Xiyao Wang, Yun Li, Linjun Zhang, Huaxiu Yao
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Large Vision-Language Models (LVLMs) have made substantial progress by
+integrating pre-trained large language models (LLMs) and vision models through
+instruction tuning. Despite these advancements, LVLMs often exhibit the
+hallucination phenomenon, where generated text responses appear linguistically
+plausible but contradict the input image, indicating a misalignment between
+image and text pairs. This misalignment arises because the model tends to
+prioritize textual information over visual input, even when both the language
+model and visual representations are of high quality. Existing methods leverage
+additional models or human annotations to curate preference data and enhance
+modality alignment through preference optimization. These approaches may not
+effectively reflect the target LVLM's preferences, making the curated
+preferences easily distinguishable. Our work addresses these challenges by
+proposing the Calibrated Self-Rewarding (CSR) approach, which enables the model
+to self-improve by iteratively generating candidate responses, evaluating the
+reward for each response, and curating preference data for fine-tuning. In the
+reward modeling, we employ a step-wise strategy and incorporate visual
+constraints into the self-rewarding process to place greater emphasis on visual
+input. Empirical results demonstrate that CSR enhances performance and reduces
+hallucinations across ten benchmarks and tasks, achieving substantial
+improvements over existing methods by 7.62%. Our empirical results are further
+supported by rigorous theoretical analysis, under mild assumptions, verifying
+the effectiveness of introducing visual constraints into the self-rewarding
+paradigm. Additionally, CSR shows compatibility with different vision-language
+models and the ability to incrementally improve performance through iterative
+fine-tuning. Our data and code are available at
+https://github.com/YiyangZhou/CSR.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>fix some typos and add acknowledgement section in V3</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Use Your INSTINCT: INSTruction optimization for LLMs usIng Neural
+  bandits Coupled with <span class="highlight-title">Transformer</span>s <span class="chip">ICML 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2310.02905v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2310.02905v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Xiaoqiang Lin, Zhaoxuan Wu, Zhongxiang Dai, Wenyang Hu, Yao Shu, See-Kiong Ng, Patrick Jaillet, Bryan Kian Hsiang Low
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Large language models (LLMs) have shown remarkable instruction-following
+capabilities and achieved impressive performances in various applications.
+However, the performances of LLMs depend heavily on the instructions given to
+them, which are typically manually tuned with substantial human efforts. Recent
+work has used the query-efficient Bayesian optimization (BO) algorithm to
+automatically optimize the instructions given to black-box LLMs. However, BO
+usually falls short when optimizing highly sophisticated (e.g.,
+high-dimensional) objective functions, such as the functions mapping an
+instruction to the performance of an LLM. This is mainly due to the limited
+expressive power of the Gaussian process (GP) which is used by BO as a
+surrogate to model the objective function. Meanwhile, it has been repeatedly
+shown that neural networks (NNs), especially pre-trained transformers, possess
+strong expressive power and can model highly complex functions. So, we adopt a
+neural bandit algorithm which replaces the GP in BO by an NN surrogate to
+optimize instructions for black-box LLMs. More importantly, the neural bandit
+algorithm allows us to naturally couple the NN surrogate with the hidden
+representation learned by a pre-trained transformer (i.e., an open-source LLM),
+which significantly boosts its performance. These motivate us to propose our
+INSTruction optimization usIng Neural bandits Coupled with Transformers
+(INSTINCT) algorithm. We perform instruction optimization for ChatGPT and use
+extensive experiments to show that INSTINCT consistently outperforms baselines
+in different tasks, e.g., various instruction induction tasks and the task of
+improving zero-shot chain-of-thought instructions. Our code is available at
+https://github.com/xqlin98/INSTINCT.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted to ICML 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Not Just Novelty: A Longitudinal Study on Utility and Customization of
+  an AI Workflow 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2402.09894v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2402.09894v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Tao Long, Katy Ilonka Gero, Lydia B. Chilton
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Generative AI brings novel and impressive abilities to help people in
+everyday tasks. There are many AI workflows that solve real and complex
+problems by chaining AI outputs together with human interaction. Although there
+is an undeniable lure of AI, it is uncertain how useful generative AI workflows
+are after the novelty wears off. Additionally, workflows built with generative
+AI have the potential to be easily customized to fit users' individual needs,
+but do users take advantage of this? We conducted a three-week longitudinal
+study with 12 users to understand the familiarization and customization of
+generative AI tools for science communication. Our study revealed that there
+exists a familiarization phase, during which users were exploring the novel
+capabilities of the workflow and discovering which aspects they found useful.
+After this phase, users understood the workflow and were able to anticipate the
+outputs. Surprisingly, after familiarization the perceived utility of the
+system was rated higher than before, indicating that the perceived utility of
+AI is not just a novelty effect. The increase in benefits mainly comes from
+end-users' ability to customize prompts, and thus potentially appropriate the
+system to their own needs. This points to a future where generative AI systems
+can allow us to design for appropriation.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>22 pages, 16 figures. ACM Conference on Designing Interactive Systems
+  (DIS 2024)</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Online Cascade Learning for Efficient Inference over Streams <span class="chip">ICML 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2402.04513v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2402.04513v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Lunyiu Nie, Zhimin Ding, Erdong Hu, Christopher Jermaine, Swarat Chaudhuri
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Large Language Models (LLMs) have a natural role in answering complex queries
+about data streams, but the high computational cost of LLM inference makes them
+infeasible in many such tasks. We propose online cascade learning, the first
+approach to address this challenge. The objective here is to learn a "cascade"
+of models, starting with lower-capacity models (such as logistic regression)
+and ending with a powerful LLM, along with a deferral policy that determines
+the model to be used on a given input. We formulate the task of learning
+cascades online as an imitation-learning problem, where smaller models are
+updated over time imitating the collected LLM demonstrations, and give a
+no-regret algorithm for the problem. Experimental results across four
+benchmarks show that our method parallels LLMs in accuracy while cutting down
+inference costs by as much as 90% with strong robustness against input
+distribution shifts, underscoring its efficacy and adaptability in stream
+processing.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>ICML 2024 Main Conference Paper</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ SemRel2024: A Collection of Semantic Textual Relatedness <span class="highlight-title">Dataset</span>s for 13
+  Languages <span class="chip">ACL 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2402.08638v5">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2402.08638v5.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Nedjma Ousidhoum, Shamsuddeen Hassan Muhammad, Mohamed Abdalla, Idris Abdulmumin, Ibrahim Said Ahmad, Sanchit Ahuja, Alham Fikri Aji, Vladimir Araujo, Abinew Ali Ayele, Pavan Baswani, Meriem Beloucif, Chris Biemann, Sofia Bourhim, Christine De Kock, Genet Shanko Dekebo, Oumaima Hourrane, Gopichand Kanumolu, Lokesh Madasu, Samuel Rutunda, Manish Shrivastava, Thamar Solorio, Nirmal Surange, Hailegnaw Getaneh Tilaye, Krishnapriya Vishnubhotla, Genta Winata, Seid Muhie Yimam, Saif M. Mohammad
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Exploring and quantifying semantic relatedness is central to representing
+language and holds significant implications across various NLP tasks. While
+earlier NLP research primarily focused on semantic similarity, often within the
+English language context, we instead investigate the broader phenomenon of
+semantic relatedness. In this paper, we present \textit{SemRel}, a new semantic
+relatedness dataset collection annotated by native speakers across 13
+languages: \textit{Afrikaans, Algerian Arabic, Amharic, English, Hausa, Hindi,
+Indonesian, Kinyarwanda, Marathi, Moroccan Arabic, Modern Standard Arabic,
+Spanish,} and \textit{Telugu}. These languages originate from five distinct
+language families and are predominantly spoken in Africa and Asia -- regions
+characterised by a relatively limited availability of NLP resources. Each
+instance in the SemRel datasets is a sentence pair associated with a score that
+represents the degree of semantic textual relatedness between the two
+sentences. The scores are obtained using a comparative annotation framework. We
+describe the data collection and annotation processes, challenges when building
+the datasets, baseline experiments, and their impact and utility in NLP.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted to the Findings of ACL 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Zipper: A Multi-Tower Decoder Architecture for Fusing Modalities <span class="chip">NeurIPS</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.18669v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.18669v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Vicky Zayats, Peter Chen, Melissa Ferrari, Dirk Padfield
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Integrating multiple generative foundation models, especially those trained
+on different modalities, into something greater than the sum of its parts poses
+significant challenges. Two key hurdles are the availability of aligned data
+(concepts that contain similar meaning but is expressed differently in
+different modalities), and effectively leveraging unimodal representations in
+cross-domain generative tasks, without compromising their original unimodal
+capabilities.
+  We propose Zipper, a multi-tower decoder architecture that addresses these
+concerns by using cross-attention to flexibly compose multimodal generative
+models from independently pre-trained unimodal decoders. In our experiments
+fusing speech and text modalities, we show the proposed architecture performs
+very competitively in scenarios with limited aligned text-speech data. We also
+showcase the flexibility of our model to selectively maintain unimodal (e.g.,
+text-to-text generation) generation performance by freezing the corresponding
+modal tower (e.g. text). In cross-modal tasks such as automatic speech
+recognition (ASR) where the output modality is text, we show that freezing the
+text backbone results in negligible performance degradation. In cross-modal
+tasks such as text-to-speech generation (TTS) where the output modality is
+speech, we show that using a pre-trained speech backbone results in superior
+performance to the baseline.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Under review at NeurIPS</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ TRAM: Benchmarking Temporal Reasoning for Large Language Models <span class="chip">ACL 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2310.00835v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2310.00835v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Yuqing Wang, Yun Zhao
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Reasoning about time is essential for understanding the nuances of events
+described in natural language. Previous research on this topic has been limited
+in scope, characterized by a lack of standardized benchmarks that would allow
+for consistent evaluations across different studies. In this paper, we
+introduce TRAM, a temporal reasoning benchmark composed of ten datasets,
+encompassing various temporal aspects of events such as order, arithmetic,
+frequency, and duration, designed to facilitate a comprehensive evaluation of
+the TeR capabilities of large language models (LLMs). We evaluate popular LLMs
+like GPT-4 and Llama2 in zero-shot and few-shot scenarios, and establish
+baselines with BERT-based and domain-specific models. Our findings indicate
+that the best-performing model lags significantly behind human performance. It
+is our aspiration that TRAM will spur further progress in enhancing the TeR
+capabilities of LLMs.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Findings of ACL 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Learning to Model the World with Language <span class="chip">ICML 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2308.01399v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2308.01399v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Jessy Lin, Yuqing Du, Olivia Watkins, Danijar Hafner, Pieter Abbeel, Dan Klein, Anca Dragan
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  To interact with humans and act in the world, agents need to understand the
+range of language that people use and relate it to the visual world. While
+current agents can learn to execute simple language instructions, we aim to
+build agents that leverage diverse language -- language like "this button turns
+on the TV" or "I put the bowls away" -- that conveys general knowledge,
+describes the state of the world, provides interactive feedback, and more. Our
+key idea is that agents should interpret such diverse language as a signal that
+helps them predict the future: what they will observe, how the world will
+behave, and which situations will be rewarded. This perspective unifies
+language understanding with future prediction as a powerful self-supervised
+learning objective. We instantiate this in Dynalang, an agent that learns a
+multimodal world model to predict future text and image representations, and
+learns to act from imagined model rollouts. While current methods that learn
+language-conditioned policies degrade in performance with more diverse types of
+language, we show that Dynalang learns to leverage environment descriptions,
+game rules, and instructions to excel on tasks ranging from game-playing to
+navigating photorealistic home scans. Finally, we show that our method enables
+additional capabilities due to learning a generative model: Dynalang can be
+pretrained on text-only data, enabling learning from offline datasets, and
+generate language grounded in an environment.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>ICML 2024. Website: https://dynalang.github.io/</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ NoticIA: A Clickbait Article Summarization <span class="highlight-title">Dataset</span> in Spanish 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2404.07611v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2404.07611v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Iker García-Ferrero, Begoña Altuna
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  We present NoticIA, a dataset consisting of 850 Spanish news articles
+featuring prominent clickbait headlines, each paired with high-quality,
+single-sentence generative summarizations written by humans. This task demands
+advanced text understanding and summarization abilities, challenging the
+models' capacity to infer and connect diverse pieces of information to meet the
+user's informational needs generated by the clickbait headline. We evaluate the
+Spanish text comprehension capabilities of a wide range of state-of-the-art
+large language models. Additionally, we use the dataset to train
+ClickbaitFighter, a task-specific model that achieves near-human performance in
+this task.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted in the journal Procesamiento del Lenguaje Natural</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Position: Stop Making Unscientific AGI Performance Claims <span class="chip">ICML</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2402.03962v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2402.03962v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Patrick Altmeyer, Andrew M. Demetriou, Antony Bartlett, Cynthia C. S. Liem
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Developments in the field of Artificial Intelligence (AI), and particularly
+large language models (LLMs), have created a 'perfect storm' for observing
+'sparks' of Artificial General Intelligence (AGI) that are spurious. Like
+simpler models, LLMs distill meaningful representations in their latent
+embeddings that have been shown to correlate with external variables.
+Nonetheless, the correlation of such representations has often been linked to
+human-like intelligence in the latter but not the former. We probe models of
+varying complexity including random projections, matrix decompositions, deep
+autoencoders and transformers: all of them successfully distill information
+that can be used to predict latent or external variables and yet none of them
+have previously been linked to AGI. We argue and empirically demonstrate that
+the finding of meaningful patterns in latent spaces of models cannot be seen as
+evidence in favor of AGI. Additionally, we review literature from the social
+sciences that shows that humans are prone to seek such patterns and
+anthropomorphize. We conclude that both the methodological setup and common
+public image of AI are ideal for the misinterpretation that correlations
+between model representations and some variables of interest are 'caused' by
+the model's understanding of underlying 'ground truth' relationships. We,
+therefore, call for the academic community to exercise extra caution, and to be
+keenly aware of principles of academic integrity, in interpreting and
+communicating about AI research outcomes.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>21 pages, 15 figures. Pre-print to be published at International
+  Conference on Machine Learning (ICML) 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ The Earth is Flat because...: Investigating LLMs' Belief towards
+  Misinformation via Persuasive Conversation <span class="chip">ACL'24</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2312.09085v5">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2312.09085v5.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Rongwu Xu, Brian S. Lin, Shujian Yang, Tianqi Zhang, Weiyan Shi, Tianwei Zhang, Zhixuan Fang, Wei Xu, Han Qiu
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Large language models (LLMs) encapsulate vast amounts of knowledge but still
+remain vulnerable to external misinformation. Existing research mainly studied
+this susceptibility behavior in a single-turn setting. However, belief can
+change during a multi-turn conversation, especially a persuasive one.
+Therefore, in this study, we delve into LLMs' susceptibility to persuasive
+conversations, particularly on factual questions that they can answer
+correctly. We first curate the Farm (i.e., Fact to Misinform) dataset, which
+contains factual questions paired with systematically generated persuasive
+misinformation. Then, we develop a testing framework to track LLMs' belief
+changes in a persuasive dialogue. Through extensive experiments, we find that
+LLMs' correct beliefs on factual knowledge can be easily manipulated by various
+persuasive strategies.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted to ACL'24 (Main). Camera-ready version</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Aya 23: Open Weight Releases to Further Multilingual Progress 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.15032v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.15032v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Viraat Aryabumi, John Dang, Dwarak Talupuru, Saurabh Dash, David Cairuz, Hangyu Lin, Bharat Venkitesh, Madeline Smith, Jon Ander Campos, Yi Chern Tan, Kelly Marchisio, Max Bartolo, Sebastian Ruder, Acyr Locatelli, Julia Kreutzer, Nick Frosst, Aidan Gomez, Phil Blunsom, Marzieh Fadaee, Ahmet Üstün, Sara Hooker
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  This technical report introduces Aya 23, a family of multilingual language
+models. Aya 23 builds on the recent release of the Aya model (\"Ust\"un et al.,
+2024), focusing on pairing a highly performant pre-trained model with the
+recently released Aya collection (Singh et al., 2024). The result is a powerful
+multilingual large language model serving 23 languages, expanding state-of-art
+language modeling capabilities to approximately half of the world's population.
+The Aya model covered 101 languages whereas Aya 23 is an experiment in depth vs
+breadth, exploring the impact of allocating more capacity to fewer languages
+that are included during pre-training. Aya 23 outperforms both previous
+massively multilingual models like Aya 101 for the languages it covers, as well
+as widely used models like Gemma, Mistral and Mixtral on an extensive range of
+discriminative and generative tasks. We release the open weights for both the
+8B and 35B models as part of our continued commitment for expanding access to
+multilingual progress.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Multi-hop Question Answering 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2204.09140v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2204.09140v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Vaibhav Mavi, Anubhav Jangra, Adam Jatowt
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  The task of Question Answering (QA) has attracted significant research
+interest for long. Its relevance to language understanding and knowledge
+retrieval tasks, along with the simple setting makes the task of QA crucial for
+strong AI systems. Recent success on simple QA tasks has shifted the focus to
+more complex settings. Among these, Multi-Hop QA (MHQA) is one of the most
+researched tasks over the recent years. In broad terms, MHQA is the task of
+answering natural language questions that involve extracting and combining
+multiple pieces of information and doing multiple steps of reasoning. An
+example of a multi-hop question would be "The Argentine PGA Championship record
+holder has won how many tournaments worldwide?". Answering the question would
+need two pieces of information: "Who is the record holder for Argentine PGA
+Championship tournaments?" and "How many tournaments did [Answer of Sub Q1]
+win?". The ability to answer multi-hop questions and perform multi step
+reasoning can significantly improve the utility of NLP systems. Consequently,
+the field has seen a surge with high quality datasets, models and evaluation
+strategies. The notion of 'multiple hops' is somewhat abstract which results in
+a large variety of tasks that require multi-hop reasoning. This leads to
+different datasets and models that differ significantly from each other and
+makes the field challenging to generalize and survey. We aim to provide a
+general and formal definition of the MHQA task, and organize and summarize
+existing MHQA frameworks. We also outline some best practices for building MHQA
+datasets. This book provides a systematic and thorough introduction as well as
+the structuring of the existing attempts to this highly interesting, yet quite
+challenging task.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Published at Foundations and Trends in Information Retrieval</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Cooperate or Collapse: Emergence of Sustainability Behaviors in a
+  Society of LLM Agents 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2404.16698v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2404.16698v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Giorgio Piatti, Zhijing Jin, Max Kleiman-Weiner, Bernhard Schölkopf, Mrinmaya Sachan, Rada Mihalcea
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  As AI systems pervade human life, ensuring that large language models (LLMs)
+make safe decisions is a significant challenge. This paper introduces the
+Governance of the Commons Simulation (GovSim), a generative simulation platform
+designed to study strategic interactions and cooperative decision-making in
+LLMs. Using GovSim, we investigate the dynamics of sustainable resource sharing
+in a society of AI agents. This environment allows us to study the influence of
+ethical considerations, strategic planning, and negotiation skills on
+cooperative outcomes for AI agents. We develop an LLM-based agent architecture
+designed for these social dilemmas and test it with a variety of LLMs. We find
+that all but the most powerful LLM agents fail to achieve a sustainable
+equilibrium in GovSim. Ablations reveal that successful multi-agent
+communication between agents is critical for achieving cooperation in these
+cases. Furthermore, our analyses show that the failure to achieve sustainable
+cooperation in most LLMs stems from their inability to formulate and analyze
+hypotheses about the long-term effects of their actions on the equilibrium of
+the group. Finally, we show that agents that leverage
+``Universalization''-based reasoning, a theory of moral thinking, are able to
+achieve significantly greater sustainability. Taken together, GovSim enables us
+to study the mechanisms that underlie sustainable self-government with
+significant specificity and scale. We open source the full suite of our
+research results, including the simulation environment, agent prompts, and a
+comprehensive web interface.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Revised version</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Dynamic Context Pruning for Efficient and Interpretable Autoregressive
+  <span class="highlight-title">Transformer</span>s 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2305.15805v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2305.15805v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Sotiris Anagnostidis, Dario Pavllo, Luca Biggio, Lorenzo Noci, Aurelien Lucchi, Thomas Hofmann
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Autoregressive Transformers adopted in Large Language Models (LLMs) are hard
+to scale to long sequences. Despite several works trying to reduce their
+computational cost, most of LLMs still adopt attention layers between all pairs
+of tokens in the sequence, thus incurring a quadratic cost. In this study, we
+present a novel approach that dynamically prunes contextual information while
+preserving the model's expressiveness, resulting in reduced memory and
+computational requirements during inference. Our method employs a learnable
+mechanism that determines which uninformative tokens can be dropped from the
+context at any point across the generation process. By doing so, our approach
+not only addresses performance concerns but also enhances interpretability,
+providing valuable insight into the model's decision-making process. Our
+technique can be applied to existing pre-trained models through a
+straightforward fine-tuning process, and the pruning strength can be specified
+by a sparsity parameter. Notably, our empirical findings demonstrate that we
+can effectively prune up to 80\% of the context without significant performance
+degradation on downstream tasks, offering a valuable tool for mitigating
+inference costs. Our reference implementation achieves up to $2\times$ increase
+in inference throughput and even greater memory savings.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ NLP Verification: Towards a General Methodology for Certifying
+  Robustness 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2403.10144v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2403.10144v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Marco Casadio, Tanvi Dinkar, Ekaterina Komendantskaya, Luca Arnaboldi, Matthew L. Daggitt, Omri Isac, Guy Katz, Verena Rieser, Oliver Lemon
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Deep neural networks have exhibited substantial success in the field of
+Natural Language Processing and ensuring their safety and reliability is
+crucial: there are safety critical contexts where such models must be robust to
+variability or attack, and give guarantees over their output. Unlike Computer
+Vision, NLP lacks a unified verification methodology and, despite recent
+advancements in literature, they are often light on the pragmatical issues of
+NLP verification. In this paper, we attempt to distil and evaluate general
+components of an NLP verification pipeline, that emerges from the progress in
+the field to date. Our contributions are two-fold. Firstly, we give a general
+(i.e. algorithm-independent) characterisation of verifiable subspaces that
+result from embedding sentences into continuous spaces. We identify, and give
+an effective method to deal with, the technical challenge of semantic
+generalisability of verified subspaces; and propose it as a standard metric in
+the NLP verification pipelines (alongside with the standard metrics of model
+accuracy and model verifiability). Secondly, we propose a general methodology
+to analyse the effect of the embedding gap -- a problem that refers to the
+discrepancy between verification of geometric subspaces, and the semantic
+meaning of sentences which the geometric subspaces are supposed to represent.
+In extreme cases, poor choices in embedding of sentences may invalidate
+verification results. We propose a number of practical NLP methods that can
+help to quantify the effects of the embedding gap; and in particular we propose
+the metric of falsifiability of semantic subspaces as another fundamental
+metric to be reported as part of the NLP verification pipeline. We believe that
+together these general principles pave the way towards a more consolidated and
+effective development of this new domain.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ LLMs achieve adult human performance on higher-order theory of mind
+  tasks 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.18870v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.18870v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Winnie Street, John Oliver Siy, Geoff Keeling, Adrien Baranes, Benjamin Barnett, Michael McKibben, Tatenda Kanyere, Alison Lentz, Blaise Aguera y Arcas, Robin I. M. Dunbar
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  This paper examines the extent to which large language models (LLMs) have
+developed higher-order theory of mind (ToM); the human ability to reason about
+multiple mental and emotional states in a recursive manner (e.g. I think that
+you believe that she knows). This paper builds on prior work by introducing a
+handwritten test suite -- Multi-Order Theory of Mind Q&A -- and using it to
+compare the performance of five LLMs to a newly gathered adult human benchmark.
+We find that GPT-4 and Flan-PaLM reach adult-level and near adult-level
+performance on ToM tasks overall, and that GPT-4 exceeds adult performance on
+6th order inferences. Our results suggest that there is an interplay between
+model size and finetuning for the realisation of ToM abilities, and that the
+best-performing LLMs have developed a generalised capacity for ToM. Given the
+role that higher-order ToM plays in a wide range of cooperative and competitive
+human behaviours, these findings have significant implications for user-facing
+LLM applications.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ A Tale of Tails: Model Collapse as a Change of Scaling Laws 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2402.07043v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2402.07043v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Elvis Dohmatob, Yunzhen Feng, Pu Yang, Francois Charton, Julia Kempe
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  As AI model size grows, neural scaling laws have become a crucial tool to
+predict the improvements of large models when increasing capacity and the size
+of original (human or natural) training data. Yet, the widespread use of
+popular models means that the ecosystem of online data and text will co-evolve
+to progressively contain increased amounts of synthesized data. In this paper
+we ask: How will the scaling laws change in the inevitable regime where
+synthetic data makes its way into the training corpus? Will future models,
+still improve, or be doomed to degenerate up to total (model) collapse? We
+develop a theoretical framework of model collapse through the lens of scaling
+laws. We discover a wide range of decay phenomena, analyzing loss of scaling,
+shifted scaling with number of generations, the ''un-learning" of skills, and
+grokking when mixing human and synthesized data. Our theory is validated by
+large-scale experiments with a transformer on an arithmetic task and text
+generation using the large language model Llama2.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Improved Out-of-Scope Intent Classification with Dual Encoding and
+  Threshold-based Re-Classification 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.19967v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.19967v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Hossam M. Zawbaa, Wael Rashwan, Sourav Dutta, Haytham Assem
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Detecting out-of-scope user utterances is essential for task-oriented
+dialogues and intent classification. Current methodologies face difficulties
+with the unpredictable distribution of outliers and often rely on assumptions
+about data distributions. We present the Dual Encoder for Threshold-Based
+Re-Classification (DETER) to address these challenges. This end-to-end
+framework efficiently detects out-of-scope intents without requiring
+assumptions on data distributions or additional post-processing steps. The core
+of DETER utilizes dual text encoders, the Universal Sentence Encoder (USE) and
+the Transformer-based Denoising AutoEncoder (TSDAE), to generate user utterance
+embeddings, which are classified through a branched neural architecture.
+Further, DETER generates synthetic outliers using self-supervision and
+incorporates out-of-scope phrases from open-domain datasets. This approach
+ensures a comprehensive training set for out-of-scope detection. Additionally,
+a threshold-based re-classification mechanism refines the model's initial
+predictions. Evaluations on the CLINC-150, Stackoverflow, and Banking77
+datasets demonstrate DETER's efficacy. Our model outperforms previous
+benchmarks, increasing up to 13% and 5% in F1 score for known and unknown
+intents on CLINC-150 and Stackoverflow, and 16% for known and 24% % for unknown
+intents on Banking77. The source code has been released at
+https://github.com/Hossam-Mohammed-tech/Intent_Classification_OOS.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ LSTM-based Deep Neural Network With A Focus on Sentence Representation
+  for Sequential Sentence Classification in Medical Scientific Abstracts 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2401.15854v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2401.15854v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Phat Lam, Lam Pham, Tin Nguyen, Hieu Tang, Michael Seidl, Medina Andresel, Alexander Schindler
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  The Sequential Sentence Classification task within the domain of medical
+abstracts, termed as SSC, involves the categorization of sentences into
+pre-defined headings based on their roles in conveying critical information in
+the abstract. In the SSC task, sentences are sequentially related to each
+other. For this reason, the role of sentence embeddings is crucial for
+capturing both the semantic information between words in the sentence and the
+contextual relationship of sentences within the abstract, which then enhances
+the SSC system performance. In this paper, we propose a LSTM-based deep
+learning network with a focus on creating comprehensive sentence representation
+at the sentence level. To demonstrate the efficacy of the created sentence
+representation, a system utilizing these sentence embeddings is also developed,
+which consists of a Convolutional-Recurrent neural network (C-RNN) at the
+abstract level and a multi-layer perception network (MLP) at the segment level.
+Our proposed system yields highly competitive results compared to
+state-of-the-art systems and further enhances the F1 scores of the baseline by
+1.0%, 2.8%, and 2.6% on the benchmark datasets PudMed 200K RCT, PudMed 20K RCT
+and NICTA-PIBOSO, respectively. This indicates the significant impact of
+improving sentence representation on boosting model performance.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Submitted to FedCSIS 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Two Optimizers Are Better Than One: LLM Catalyst for Enhancing
+  Gradient-Based Optimization 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.19732v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.19732v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Zixian Guo, Ming Liu, Zhilong Ji, Jinfeng Bai, Yiwen Guo, Wangmeng Zuo
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Learning a skill generally relies on both practical experience by doer and
+insightful high-level guidance by instructor. Will this strategy also work well
+for solving complex non-convex optimization problems? Here, a common
+gradient-based optimizer acts like a disciplined doer, making locally optimal
+update at each step. Recent methods utilize large language models (LLMs) to
+optimize solutions for concrete problems by inferring from natural language
+instructions, akin to a high-level instructor. In this paper, we show that
+these two optimizers are complementary to each other, suggesting a
+collaborative optimization approach. The gradient-based optimizer and LLM-based
+optimizer are combined in an interleaved manner. We instruct LLMs using task
+descriptions and timely optimization trajectories recorded during
+gradient-based optimization. Inferred results from LLMs are used as restarting
+points for the next stage of gradient optimization. By leveraging both the
+locally rigorous gradient-based optimizer and the high-level deductive
+LLM-based optimizer, our combined optimization method consistently yields
+improvements over competitive baseline prompt tuning methods. Our results
+demonstrate the synergistic effect of conventional gradient-based optimization
+and the inference ability of LLMs. The code is released at
+https://github.com/guozix/LLM-catalyst.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Greed is All You Need: An Evaluation of Tokenizer Inference Methods <span class="chip">ACL 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2403.01289v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2403.01289v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Omri Uzan, Craig W. Schmidt, Chris Tanner, Yuval Pinter
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  While subword tokenizers such as BPE and WordPiece are typically used to
+build vocabularies for NLP models, the method of decoding text into a sequence
+of tokens from these vocabularies is often left unspecified, or ill-suited to
+the method in which they were constructed. We provide a controlled analysis of
+seven tokenizer inference methods across four different algorithms and three
+vocabulary sizes, performed on a novel intrinsic evaluation suite we curated
+for English, combining measures rooted in morphology, cognition, and
+information theory. We show that for the most commonly used tokenizers, greedy
+inference performs surprisingly well; and that SaGe, a recently-introduced
+contextually-informed tokenizer, outperforms all others on morphological
+alignment.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>ACL 2024 (main)</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ ELF: Encoding Speaker-Specific Latent Speech Feature for Speech
+  Synthesis <span class="chip">ICML 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2311.11745v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2311.11745v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Jungil Kong, Junmo Lee, Jeongmin Kim, Beomjeong Kim, Jihoon Park, Dohee Kong, Changheon Lee, Sangjin Kim
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  In this work, we propose a novel method for modeling numerous speakers, which
+enables expressing the overall characteristics of speakers in detail like a
+trained multi-speaker model without additional training on the target speaker's
+dataset. Although various works with similar purposes have been actively
+studied, their performance has not yet reached that of trained multi-speaker
+models due to their fundamental limitations. To overcome previous limitations,
+we propose effective methods for feature learning and representing target
+speakers' speech characteristics by discretizing the features and conditioning
+them to a speech synthesis model. Our method obtained a significantly higher
+similarity mean opinion score (SMOS) in subjective similarity evaluation than
+seen speakers of a high-performance multi-speaker model, even with unseen
+speakers. The proposed method also outperforms a zero-shot method by
+significant margins. Furthermore, our method shows remarkable performance in
+generating new artificial speakers. In addition, we demonstrate that the
+encoded latent features are sufficiently informative to reconstruct an original
+speaker's speech completely. It implies that our method can be used as a
+general methodology to encode and reconstruct speakers' characteristics in
+various tasks.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>ICML 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ When MOE Meets LLMs: Parameter Efficient Fine-tuning for Multi-task
+  Medical Applications <span class="chip">SIGIR'24</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2310.18339v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2310.18339v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Qidong Liu, Xian Wu, Xiangyu Zhao, Yuanshao Zhu, Derong Xu, Feng Tian, Yefeng Zheng
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  The recent surge in Large Language Models (LLMs) has garnered significant
+attention across numerous fields. Fine-tuning is often required to fit general
+LLMs for a specific domain, like the web-based healthcare system. However, two
+problems arise during fine-tuning LLMs for medical applications. One is the
+task variety problem, which involves distinct tasks in real-world medical
+scenarios. The variety often leads to sub-optimal fine-tuning for data
+imbalance and seesaw problems. Besides, the large amount of parameters in LLMs
+leads to huge time and computation consumption by fine-tuning. To address these
+two problems, we propose a novel parameter efficient fine-tuning framework for
+multi-task medical applications, dubbed as MOELoRA. The designed framework aims
+to absorb both the benefits of mixture-of-expert (MOE) for multi-task learning
+and low-rank adaptation (LoRA) for parameter efficient fine-tuning. For
+unifying MOE and LoRA, we devise multiple experts as the trainable parameters,
+where each expert consists of a pair of low-rank matrices to retain the small
+size of trainable parameters. Then, a task-motivated gate function for all
+MOELoRA layers is proposed, which can control the contributions of each expert
+and produce distinct parameters for various tasks. We conduct experiments on a
+multi-task medical dataset, indicating MOELoRA outperforms the existing
+parameter efficient fine-tuning methods. The code is available online.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>accepted by SIGIR'24</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Learning to Use Tools via Cooperative and Interactive Agents 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2403.03031v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2403.03031v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Zhengliang Shi, Shen Gao, Xiuyi Chen, Lingyong Yan, Haibo Shi, Dawei Yin, Zhumin Chen, Pengjie Ren, Suzan Verberne, Zhaochun Ren
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Tool learning empowers large language models (LLMs) as agents to use external
+tools to extend their capability. Existing methods employ one single LLM-based
+agent to iteratively select and execute tools, thereafter incorporating the
+result into the next action prediction. However, they still suffer from
+potential performance degradation when addressing complex tasks due to: (1) the
+limitation of the inherent capability of a single LLM to perform diverse
+actions, and (2) the struggle to adaptively correct mistakes when the task
+fails. To mitigate these problems, we propose the ConAgents, a Cooperative and
+interactive Agents framework, which modularizes the workflow of tool learning
+into Grounding, Execution, and Observing agents. We also introduce an iterative
+calibration (IterCali) method, enabling the agents to adapt themselves based on
+the feedback from the tool environment. Experiments conducted on three datasets
+demonstrate the superiority of our ConAgents (e.g., 6 point improvement over
+the SOTA baseline). We further provide fine-granularity analysis for the
+efficiency and consistency of our framework.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>working in process, 20 pages</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Just Rewrite It Again: A Post-Processing Method for Enhanced Semantic
+  Similarity and Privacy Preservation of Differentially Private Rewritten Text 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.19831v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.19831v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Stephen Meisenbacher, Florian Matthes
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  The study of Differential Privacy (DP) in Natural Language Processing often
+views the task of text privatization as a $\textit{rewriting}$ task, in which
+sensitive input texts are rewritten to hide explicit or implicit private
+information. In order to evaluate the privacy-preserving capabilities of a DP
+text rewriting mechanism, $\textit{empirical privacy}$ tests are frequently
+employed. In these tests, an adversary is modeled, who aims to infer sensitive
+information (e.g., gender) about the author behind a (privatized) text. Looking
+to improve the empirical protections provided by DP rewriting methods, we
+propose a simple post-processing method based on the goal of aligning rewritten
+texts with their original counterparts, where DP rewritten texts are rewritten
+$\textit{again}$. Our results show that such an approach not only produces
+outputs that are more semantically reminiscent of the original inputs, but also
+texts which score on average better in empirical privacy evaluations.
+Therefore, our approach raises the bar for DP rewriting methods in their
+empirical privacy evaluations, providing an extra layer of protection against
+malicious adversaries.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>10 pages, 2 figures, 2 tables. Accepted to ARES 2024 (IWAPS)</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Why Not Transform Chat Large Language Models to Non-English? 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.13923v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.13923v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Xiang Geng, Ming Zhu, Jiahuan Li, Zhejian Lai, Wei Zou, Shuaijie She, Jiaxin Guo, Xiaofeng Zhao, Yinglu Li, Yuang Li, Chang Su, Yanqing Zhao, Xinglin Lyu, Min Zhang, Jiajun Chen, Hao Yang, Shujian Huang
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  The scarcity of non-English data limits the development of non-English large
+language models (LLMs). Transforming English-centric LLMs to non-English has
+been identified as an effective and resource-efficient method. Previous works
+start from base LLMs and perform knowledge distillation (KD) with data
+generated by stronger LLMs, e.g. GPT-4. Compared to base LLMs, chat LLMs are
+further optimized for advanced abilities, e.g. multi-turn conversation and
+human preference alignment, and thus more powerful in both helpfulness and
+safety. However, transforming a chat LLM involves two critical issues: (1) How
+can we effectively transfer advanced abilities without their supervised data?
+(2) How can we prevent the original knowledge from catastrophic forgetting
+during transformation? We target these issues by introducing a simple framework
+called TransLLM. For the first issue, TransLLM divides the transfer problem
+into some common sub-tasks with the translation chain-of-thought, which uses
+the translation as the bridge between English and non-English step-by-step. We
+further enhance the performance of sub-tasks with publicly available data. For
+the second issue, we propose a method comprising two synergistic components:
+low-rank adaptation for training to maintain the original LLM parameters, and
+recovery KD, which utilizes data generated by the chat LLM itself to recover
+the original knowledge from the frozen parameters. In the experiments, we
+transform the LLaMA-2-chat-7B to the Thai language. Our method, using only
+single-turn data, outperforms strong baselines and ChatGPT on multi-turn
+benchmark MT-bench. Furthermore, our method, without safety data, rejects more
+harmful queries of safety benchmark AdvBench than both ChatGPT and GPT-4.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ SPOR: A Comprehensive and Practical Evaluation Method for Compositional
+  Generalization in Data-to-Text Generation <span class="chip">ACL 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.10650v6">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.10650v6.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Ziyao Xu, Houfeng Wang
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Compositional generalization is an important ability of language models and
+has many different manifestations. For data-to-text generation, previous
+research on this ability is limited to a single manifestation called
+Systematicity and lacks consideration of large language models (LLMs), which
+cannot fully cover practical application scenarios. In this work, we propose
+SPOR, a comprehensive and practical evaluation method for compositional
+generalization in data-to-text generation. SPOR includes four aspects of
+manifestations (Systematicity, Productivity, Order invariance, and Rule
+learnability) and allows high-quality evaluation without additional manual
+annotations based on existing datasets. We demonstrate SPOR on two different
+datasets and evaluate some existing language models including LLMs. We find
+that the models are deficient in various aspects of the evaluation and need
+further improvement. Our work shows the necessity for comprehensive research on
+different manifestations of compositional generalization in data-to-text
+generation and provides a framework for evaluation.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted to ACL 2024 main conference</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Improving Text Embeddings with Large Language Models <span class="chip">ACL 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2401.00368v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2401.00368v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Liang Wang, Nan Yang, Xiaolong Huang, Linjun Yang, Rangan Majumder, Furu Wei
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  In this paper, we introduce a novel and simple method for obtaining
+high-quality text embeddings using only synthetic data and less than 1k
+training steps. Unlike existing methods that often depend on multi-stage
+intermediate pre-training with billions of weakly-supervised text pairs,
+followed by fine-tuning with a few labeled datasets, our method does not
+require building complex training pipelines or relying on manually collected
+datasets that are often constrained by task diversity and language coverage. We
+leverage proprietary LLMs to generate diverse synthetic data for hundreds of
+thousands of text embedding tasks across 93 languages. We then fine-tune
+open-source decoder-only LLMs on the synthetic data using standard contrastive
+loss. Experiments demonstrate that our method achieves strong performance on
+highly competitive text embedding benchmarks without using any labeled data.
+Furthermore, when fine-tuned with a mixture of synthetic and labeled data, our
+model sets new state-of-the-art results on the BEIR and MTEB benchmarks.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted by ACL 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Quantum linear algebra is all you need for <span class="highlight-title">Transformer</span> architectures 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2402.16714v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2402.16714v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Naixu Guo, Zhan Yu, Matthew Choi, Aman Agrawal, Kouhei Nakaji, Alán Aspuru-Guzik, Patrick Rebentrost
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Generative machine learning methods such as large-language models are
+revolutionizing the creation of text and images. While these models are
+powerful they also harness a large amount of computational resources. The
+transformer is a key component in large language models that aims to generate a
+suitable completion of a given partial sequence. In this work, we investigate
+transformer architectures under the lens of fault-tolerant quantum computing.
+The input model is one where trained weight matrices are given as block
+encodings and we construct the query, key, and value matrices for the
+transformer. We show how to prepare a block encoding of the self-attention
+matrix, with a new subroutine for the row-wise application of the softmax
+function. In addition, we combine quantum subroutines to construct important
+building blocks in the transformer, the residual connection and layer
+normalization, and the feed-forward neural network. Our subroutines prepare an
+amplitude encoding of the transformer output, which can be measured to obtain a
+prediction. Based on common open-source large-language models, we provide
+insights into the behavior of important parameters determining the run time of
+the quantum algorithm. We discuss the potential and challenges for obtaining a
+quantum advantage.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>31 pages, 4 figures, 2 tables, comments are welcome</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Enabling Weak LLMs to Judge Response Reliability via Meta Ranking 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2402.12146v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2402.12146v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Zijun Liu, Boqun Kou, Peng Li, Ming Yan, Ji Zhang, Fei Huang, Yang Liu
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Despite the strong performance of large language models (LLMs) across a wide
+range of tasks, they still have reliability issues. Previous studies indicate
+that strong LLMs like GPT-4-turbo excel in evaluating the reliability of
+responses from LLMs, but face efficiency and local deployment issues. Thus, to
+enable weak LLMs to effectively assess the reliability of LLM responses, we
+propose a novel cross-query-comparison-based method called $\textit{Meta
+Ranking}$ (MR). Unlike previous few-shot methods that solely based on
+in-context learning capabilities in LLMs, MR assesses reliability by pairwisely
+ranking the target query-response pair with multiple reference query-response
+pairs. We found that MR is highly effective in error detection for LLM
+responses, where weak LLMs, such as Phi-2, could surpass strong baselines like
+GPT-3.5-turbo, requiring only five reference samples and significantly
+improving efficiency. We further demonstrate that MR can enhance strong LLMs'
+performance in two practical applications: model cascading and instruction
+tuning. In model cascading, we combine open- and closed-source LLMs to achieve
+performance comparable to GPT-4-turbo with lower costs. In instruction tuning,
+we use MR for iterative training data filtering, significantly reducing data
+processing time and enabling LLaMA-7B and Phi-2 to surpass Alpaca-13B with
+fewer training tokens. These results underscore the high potential of MR in
+both efficiency and effectiveness.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Preprint, under review. 28 pages</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Learning to Learn for Few-shot Continual Active Learning 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2311.03732v4">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2311.03732v4.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Stella Ho, Ming Liu, Shang Gao, Longxiang Gao
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Continual learning strives to ensure stability in solving previously seen
+tasks while demonstrating plasticity in a novel domain. Recent advances in
+continual learning are mostly confined to a supervised learning setting,
+especially in NLP domain. In this work, we consider a few-shot continual active
+learning setting where labeled data are inadequate, and unlabeled data are
+abundant but with a limited annotation budget. We exploit meta-learning and
+propose a method, called Meta-Continual Active Learning. This method
+sequentially queries the most informative examples from a pool of unlabeled
+data for annotation to enhance task-specific performance and tackle continual
+learning problems through meta-objective. Specifically, we employ meta-learning
+and experience replay to address inter-task confusion and catastrophic
+forgetting. We further incorporate textual augmentations to avoid memory
+over-fitting caused by experience replay and sample queries, thereby ensuring
+generalization. We conduct extensive experiments on benchmark text
+classification datasets from diverse domains to validate the feasibility and
+effectiveness of meta-continual active learning. We also analyze the impact of
+different active learning strategies on various meta continual learning models.
+The experimental results demonstrate that introducing randomness into sample
+selection is the best default strategy for maintaining generalization in
+meta-continual learning framework.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ One Token Can Help! Learning Scalable and Pluggable Virtual Tokens for
+  Retrieval-Augmented Large Language Models <span class="chip">SP</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.19670v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.19670v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Yutao Zhu, Zhaoheng Huang, Zhicheng Dou, Ji-Rong Wen
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Retrieval-augmented generation (RAG) is a promising way to improve large
+language models (LLMs) for generating more factual, accurate, and up-to-date
+content. Existing methods either optimize prompts to guide LLMs in leveraging
+retrieved information or directly fine-tune the LLMs to adapt to RAG scenarios.
+Although fine-tuning can yield better performance, it often compromises the
+LLMs' general generation capabilities by modifying their parameters. This
+limitation poses challenges in practical applications, especially when LLMs are
+already deployed, as parameter adjustments may affect their original
+functionality. To address this, we propose a novel method that involves
+learning scalable and pluggable virtual tokens for RAG. By maintaining the
+LLMs' original parameters and fine-tuning only the embeddings of these
+pluggable tokens, our approach not only enhances LLMs' performance but also
+preserves their general generation capacities. Furthermore, we design several
+training strategies to improve the scalability, flexibility, and
+generalizability of our method. Comprehensive experiments across nine
+question-answering tasks demonstrate the superiority of our approach.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>working in progress, repo: https://github.com/DaoD/SPRING/</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Experimental Design for Active Transductive Inference in Large Language
+  Models 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2404.08846v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2404.08846v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Subhojyoti Mukherjee, Anusha Lalitha, Aniket Deshmukh, Ge Liu, Yifei Ma, Branislav Kveton
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  One emergent ability of large language models (LLMs) is that query-specific
+examples can be included in the prompt at inference time. In this work, we use
+active learning for adaptive prompt design and call it Active In-context Prompt
+Design (AIPD). We design the LLM prompt by adaptively choosing few-shot
+examples from a training set to optimize performance on a test set. The
+training examples are initially unlabeled and we obtain the label of the most
+informative ones, which maximally reduces uncertainty in the LLM prediction. We
+propose two algorithms, GO and SAL, which differ in how the few-shot examples
+are chosen. We analyze these algorithms in linear models: first GO and then use
+its equivalence with SAL. We experiment with many different tasks in small,
+medium-sized, and large language models; and show that GO and SAL outperform
+other methods for choosing few-shot examples in the LLM prompt at inference
+time.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Iterative Feature Boosting for Explainable Speech Emotion Recognition <span class="chip">ICML</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20172v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20172v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Alaa Nfissi, Wassim Bouachir, Nizar Bouguila, Brian Mishara
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  In speech emotion recognition (SER), using predefined features without
+considering their practical importance may lead to high dimensional datasets,
+including redundant and irrelevant information. Consequently, high-dimensional
+learning often results in decreasing model accuracy while increasing
+computational complexity. Our work underlines the importance of carefully
+considering and analyzing features in order to build efficient SER systems. We
+present a new supervised SER method based on an efficient feature engineering
+approach. We pay particular attention to the explainability of results to
+evaluate feature relevance and refine feature sets. This is performed
+iteratively through feature evaluation loop, using Shapley values to boost
+feature selection and improve overall framework performance. Our approach
+allows thus to balance the benefits between model performance and transparency.
+The proposed method outperforms human-level performance (HLP) and
+state-of-the-art machine learning methods in emotion recognition on the TESS
+dataset.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Published in: 2023 International Conference on Machine Learning and
+  Applications (ICMLA)</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Nearest Neighbor Speculative Decoding for LLM Generation and Attribution 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.19325v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.19325v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Minghan Li, Xilun Chen, Ari Holtzman, Beidi Chen, Jimmy Lin, Wen-tau Yih, Xi Victoria Lin
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Large language models (LLMs) often hallucinate and lack the ability to
+provide attribution for their generations. Semi-parametric LMs, such as kNN-LM,
+approach these limitations by refining the output of an LM for a given prompt
+using its nearest neighbor matches in a non-parametric data store. However,
+these models often exhibit slow inference speeds and produce non-fluent texts.
+In this paper, we introduce Nearest Neighbor Speculative Decoding (NEST), a
+novel semi-parametric language modeling approach that is capable of
+incorporating real-world text spans of arbitrary length into the LM generations
+and providing attribution to their sources. NEST performs token-level retrieval
+at each inference step to compute a semi-parametric mixture distribution and
+identify promising span continuations in a corpus. It then uses an approximate
+speculative decoding procedure that accepts a prefix of the retrieved span or
+generates a new token. NEST significantly enhances the generation quality and
+attribution rate of the base LM across a variety of knowledge-intensive tasks,
+surpassing the conventional kNN-LM method and performing competitively with
+in-context retrieval augmentation. In addition, NEST substantially improves the
+generation speed, achieving a 1.8x speedup in inference time when applied to
+Llama-2-Chat 70B.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Any2Point: Empowering Any-modality Large Models for Efficient 3D
+  Understanding 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2404.07989v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2404.07989v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Yiwen Tang, Ray Zhang, Jiaming Liu, Zoey Guo, Dong Wang, Zhigang Wang, Bin Zhao, Shanghang Zhang, Peng Gao, Hongsheng Li, Xuelong Li
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Large foundation models have recently emerged as a prominent focus of
+interest, attaining superior performance in widespread scenarios. Due to the
+scarcity of 3D data, many efforts have been made to adapt pre-trained
+transformers from vision to 3D domains. However, such 2D-to-3D approaches are
+still limited, due to the potential loss of spatial geometries and high
+computation cost. More importantly, their frameworks are mainly designed for 2D
+models, lacking a general any-to-3D paradigm. In this paper, we introduce
+Any2Point, a parameter-efficient method to empower any-modality large models
+(vision, language, audio) for 3D understanding. Given a frozen transformer from
+any source modality, we propose a 3D-to-any (1D or 2D) virtual projection
+strategy that correlates the input 3D points to the original 1D or 2D positions
+within the source modality. This mechanism enables us to assign each 3D token
+with a positional encoding paired with the pre-trained model, which avoids 3D
+geometry loss caused by the true projection and better motivates the
+transformer for 3D learning with 1D/2D positional priors. Then, within each
+transformer block, we insert an any-to-3D guided adapter module for
+parameter-efficient fine-tuning. The adapter incorporates prior spatial
+knowledge from the source modality to guide the local feature aggregation of 3D
+tokens, compelling the semantic adaption of any-modality transformers. We
+conduct extensive experiments to showcase the effectiveness and efficiency of
+our method. Code and models are released at
+https://github.com/Ivan-Tang-3D/Any2Point.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Code and models are released at
+  https://github.com/Ivan-Tang-3D/Any2Point</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Tailoring with Targeted Precision: Edit-Based Agents for Open-Domain
+  Procedure Customization <span class="chip">ACL 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2311.09510v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2311.09510v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Yash Kumar Lal, Li Zhang, Faeze Brahman, Bodhisattwa Prasad Majumder, Peter Clark, Niket Tandon
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  How-to procedures, such as how to plant a garden, are now used by millions of
+users, but sometimes need customizing to meet a user's specific needs, e.g.,
+planting a garden without pesticides. Our goal is to measure and improve an
+LLM's ability to perform such customization. Our approach is to test several
+simple multi-LLM-agent architectures for customization, as well as an
+end-to-end LLM, using a new evaluation set, called CustomPlans, of over 200
+WikiHow procedures each with a customization need. We find that a simple
+architecture with two LLM agents used sequentially performs best, one that
+edits a generic how-to procedure and one that verifies its executability,
+significantly outperforming (10.5% absolute) an end-to-end prompted LLM. This
+suggests that LLMs can be configured reasonably effectively for procedure
+customization. This also suggests that multi-agent editing architectures may be
+worth exploring further for other customization applications (e.g. coding,
+creative writing) in the future.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Camera ready version accepted to Findings of ACL 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Spoken Question Answering and Speech Continuation Using
+  Spectrogram-Powered LLM <span class="chip">ICLR 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2305.15255v4">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2305.15255v4.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Eliya Nachmani, Alon Levkovitch, Roy Hirsch, Julian Salazar, Chulayuth Asawaroengchai, Soroosh Mariooryad, Ehud Rivlin, RJ Skerry-Ryan, Michelle Tadmor Ramanovich
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  We present Spectron, a novel approach to adapting pre-trained large language
+models (LLMs) to perform spoken question answering (QA) and speech
+continuation. By endowing the LLM with a pre-trained speech encoder, our model
+becomes able to take speech inputs and generate speech outputs. The entire
+system is trained end-to-end and operates directly on spectrograms, simplifying
+our architecture. Key to our approach is a training objective that jointly
+supervises speech recognition, text continuation, and speech synthesis using
+only paired speech-text pairs, enabling a `cross-modal' chain-of-thought within
+a single decoding pass. Our method surpasses existing spoken language models in
+speaker preservation and semantic coherence. Furthermore, the proposed model
+improves upon direct initialization in retaining the knowledge of the original
+LLM as demonstrated through spoken QA datasets. We release our audio samples
+(https://michelleramanovich.github.io/spectron/spectron) and spoken QA dataset
+(https://github.com/google-research-datasets/LLAMA1-Test-Set).
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>ICLR 2024 camera-ready</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ From Symbolic Tasks to Code Generation: Diversification Yields Better
+  Task Performers 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.19787v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.19787v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Dylan Zhang, Justin Wang, Francois Charton
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Instruction tuning -- tuning large language models on instruction-output
+pairs -- is a promising technique for making models better adapted to the real
+world. Yet, the key factors driving the model's capability to understand and
+follow instructions not seen during training remain under-explored. Our
+investigation begins with a series of synthetic experiments within the
+theoretical framework of a Turing-complete algorithm called Markov algorithm,
+which allows fine-grained control over the instruction-tuning data.
+Generalization and robustness with respect to the training distribution emerge
+once a diverse enough set of tasks is provided, even though very few examples
+are provided for each task. We extend these initial results to a real-world
+application scenario of code generation and find that a more diverse
+instruction set, extending beyond code-related tasks, improves the performance
+of code generation. Our observations suggest that a more diverse semantic space
+for instruction-tuning sets greatly improves the model's ability to follow
+instructions and perform tasks.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ MedFLIP: Medical Vision-and-Language <span class="highlight-title">Self-supervised</span> Fast <span class="highlight-title">Pre-Train</span>ing
+  with Masked Autoencoder 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2403.04626v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2403.04626v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Lei Li, Tianfang Zhang, Xinglin Zhang, Jiaqi Liu, Bingqi Ma, Yan Luo, Tao Chen
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Within the domain of medical analysis, extensive research has explored the
+potential of mutual learning between Masked Autoencoders(MAEs) and multimodal
+data. However, the impact of MAEs on intermodality remains a key challenge. We
+introduce MedFLIP, a Fast Language-Image Pre-training method for Medical
+analysis. We explore MAEs for zero-shot learning with crossed domains, which
+enhances the model's ability to learn from limited data, a common scenario in
+medical diagnostics. We verify that masking an image does not affect
+inter-modal learning. Furthermore, we propose the SVD loss to enhance the
+representation learning for characteristics of medical images, aiming to
+improve classification accuracy by leveraging the structural intricacies of
+such data. Our theory posits that masking encourages semantic preservation,
+robust feature extraction, regularization, domain adaptation, and invariance
+learning. Lastly, we validate using language will improve the zero-shot
+performance for the medical image analysis. MedFLIP's scaling of the masking
+process marks an advancement in the field, offering a pathway to rapid and
+precise medical image analysis without the traditional computational
+bottlenecks. Through experiments and validation, MedFLIP demonstrates efficient
+performance improvements, helps for future research and application in medical
+diagnostics.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Extreme Miscalibration and the Illusion of Adversarial Robustness 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2402.17509v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2402.17509v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Vyas Raina, Samson Tan, Volkan Cevher, Aditya Rawal, Sheng Zha, George Karypis
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Deep learning-based Natural Language Processing (NLP) models are vulnerable
+to adversarial attacks, where small perturbations can cause a model to
+misclassify. Adversarial Training (AT) is often used to increase model
+robustness. However, we have discovered an intriguing phenomenon: deliberately
+or accidentally miscalibrating models masks gradients in a way that interferes
+with adversarial attack search methods, giving rise to an apparent increase in
+robustness. We show that this observed gain in robustness is an illusion of
+robustness (IOR), and demonstrate how an adversary can perform various forms of
+test-time temperature calibration to nullify the aforementioned interference
+and allow the adversarial attack to find adversarial examples. Hence, we urge
+the NLP community to incorporate test-time temperature scaling into their
+robustness evaluations to ensure that any observed gains are genuine. Finally,
+we show how the temperature can be scaled during \textit{training} to improve
+genuine robustness.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Learning Syntax Without Planting Trees: Understanding When and Why
+  <span class="highlight-title">Transformer</span>s Generalize Hierarchically 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2404.16367v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2404.16367v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Kabir Ahuja, Vidhisha Balachandran, Madhur Panwar, Tianxing He, Noah A. Smith, Navin Goyal, Yulia Tsvetkov
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Transformers trained on natural language data have been shown to learn its
+hierarchical structure and generalize to sentences with unseen syntactic
+structures without explicitly encoding any structural bias. In this work, we
+investigate sources of inductive bias in transformer models and their training
+that could cause such generalization behavior to emerge. We extensively
+experiment with transformer models trained on multiple synthetic datasets and
+with different training objectives and show that while other objectives e.g.
+sequence-to-sequence modeling, prefix language modeling, often failed to lead
+to hierarchical generalization, models trained with the language modeling
+objective consistently learned to generalize hierarchically. We then conduct
+pruning experiments to study how transformers trained with the language
+modeling objective encode hierarchical structure. When pruned, we find joint
+existence of subnetworks within the model with different generalization
+behaviors (subnetworks corresponding to hierarchical structure and linear
+order). Finally, we take a Bayesian perspective to further uncover
+transformers' preference for hierarchical generalization: We establish a
+correlation between whether transformers generalize hierarchically on a dataset
+and whether the simplest explanation of that dataset is provided by a
+hierarchical grammar compared to regular grammars exhibiting linear
+generalization.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Code now available: https://github.com/kabirahuja2431/transformers-hg</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ ReEval: Automatic Hallucination Evaluation for Retrieval-Augmented Large
+  Language Models via Transferable Adversarial Attacks <span class="chip">NAACL 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2310.12516v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2310.12516v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Xiaodong Yu, Hao Cheng, Xiaodong Liu, Dan Roth, Jianfeng Gao
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Despite remarkable advancements in mitigating hallucinations in large
+language models (LLMs) by retrieval augmentation, it remains challenging to
+measure the reliability of LLMs using static question-answering (QA) data.
+Specifically, given the potential of data contamination (e.g., leading to
+memorization), good static benchmark performance does not ensure that model can
+reliably use the provided evidence for responding, which is essential to avoid
+hallucination when the required knowledge is new or private. Inspired by
+adversarial machine learning, we investigate the feasibility of automatically
+perturbing existing static one for dynamic evaluation. Specifically, this paper
+presents ReEval, an LLM-based framework using prompt chaining to perturb the
+original evidence for generating new test cases for evaluating the LLMs'
+reliability in using new evidence for answering.
+  We implement ReEval using ChatGPT and evaluate the resulting variants of two
+popular open-domain QA datasets on a collection of LLMs under various prompting
+settings. Our generated data is human-readable and useful to trigger
+hallucination in LLM. Accurate models on static data are observed to produce
+unsupported answers from the perturbed evidence, with pronounced accuracy drops
+across LLMs including GPT-4. We find that our adversarial examples are
+transferable across all considered LLMs. The examples generated by a small
+model can be used to evaluate a much larger model, making our approach
+cost-effective.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>NAACL 2024 Findings</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ The Multi-Range Theory of Translation Quality Measurement: MQM scoring
+  models and Statistical Quality Control 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.16969v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.16969v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Arle Lommel, Serge Gladkoff, Alan Melby, Sue Ellen Wright, Ingemar Strandvik, Katerina Gasova, Angelika Vaasa, Andy Benzo, Romina Marazzato Sparano, Monica Foresi, Johani Innis, Lifeng Han, Goran Nenadic
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  The year 2024 marks the 10th anniversary of the Multidimensional Quality
+Metrics (MQM) framework for analytic translation quality evaluation. The MQM
+error typology has been widely used by practitioners in the translation and
+localization industry and has served as the basis for many derivative projects.
+The annual Conference on Machine Translation (WMT) shared tasks on both human
+and automatic translation quality evaluations used the MQM error typology.
+  The metric stands on two pillars: error typology and the scoring model. The
+scoring model calculates the quality score from annotation data, detailing how
+to convert error type and severity counts into numeric scores to determine if
+the content meets specifications. Previously, only the raw scoring model had
+been published. This April, the MQM Council published the Linear Calibrated
+Scoring Model, officially presented herein, along with the Non-Linear Scoring
+Model, which had not been published before.
+  This paper details the latest MQM developments and presents a universal
+approach to translation quality measurement across three sample size ranges. It
+also explains why Statistical Quality Control should be used for very small
+sample sizes, starting from a single sentence.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>working paper, 20 pages</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Robust Infidelity: When Faithfulness Measures on Masked Language Models
+  Are Misleading 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2308.06795v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2308.06795v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Evan Crothers, Herna Viktor, Nathalie Japkowicz
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  A common approach to quantifying neural text classifier interpretability is
+to calculate faithfulness metrics based on iteratively masking salient input
+tokens and measuring changes in the model prediction. We propose that this
+property is better described as "sensitivity to iterative masking", and
+highlight pitfalls in using this measure for comparing text classifier
+interpretability. We show that iterative masking produces large variation in
+faithfulness scores between otherwise comparable Transformer encoder text
+classifiers. We then demonstrate that iteratively masked samples produce
+embeddings outside the distribution seen during training, resulting in
+unpredictable behaviour. We further explore task-specific considerations that
+undermine principled comparison of interpretability using iterative masking,
+such as an underlying similarity to salience-based adversarial attacks. Our
+findings give insight into how these behaviours affect neural text classifiers,
+and provide guidance on how sensitivity to iterative masking should be
+interpreted.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ An NLP Crosswalk Between the Common Core State Standards and NAEP Item
+  Specifications <span class="chip">CCS</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.17284v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.17284v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Gregory Camilli
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Natural language processing (NLP) is rapidly developing for applications in
+educational assessment. In this paper, I describe an NLP-based procedure that
+can be used to support subject matter experts in establishing a crosswalk
+between item specifications and content standards. This paper extends recent
+work by proposing and demonstrating the use of multivariate similarity based on
+embedding vectors for sentences or texts. In particular, a hybrid regression
+procedure is demonstrated for establishing the match of each content standard
+to multiple item specifications. The procedure is used to evaluate the match of
+the Common Core State Standards (CCSS) for mathematics at grade 4 to the
+corresponding item specifications for the 2026 National Assessment of
+Educational Progress (NAEP).
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Deleted repeated sections. Corrected proper nouns. Corrected type in
+  CCSS sentences</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ <span class="highlight-title">Transformer</span>s Learn Higher-Order Optimization Methods for In-Context
+  Learning: A Study with Linear Models 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2310.17086v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2310.17086v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Deqing Fu, Tian-Qi Chen, Robin Jia, Vatsal Sharan
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Transformers excel at in-context learning (ICL) -- learning from
+demonstrations without parameter updates -- but how they do so remains a
+mystery. Recent work suggests that Transformers may internally run Gradient
+Descent (GD), a first-order optimization method, to perform ICL. In this paper,
+we instead demonstrate that Transformers learn to approximate higher-order
+optimization methods for ICL. For in-context linear regression, Transformers
+share a similar convergence rate as Iterative Newton's Method; both are
+exponentially faster than GD. Empirically, predictions from successive
+Transformer layers closely match different iterations of Newton's Method
+linearly, with each middle layer roughly computing 3 iterations; thus,
+Transformers and Newton's method converge at roughly the same rate. In
+contrast, Gradient Descent converges exponentially more slowly. We also show
+that Transformers can learn in-context on ill-conditioned data, a setting where
+Gradient Descent struggles but Iterative Newton succeeds. Finally, to
+corroborate our empirical findings, we prove that Transformers can implement
+$k$ iterations of Newton's method with $k + \mathcal{O}(1)$ layers.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                    </div>
+                </details>
+            </article>
+            <article>
+                <details>
+                    <Summary>
+                        Computer Vision and Pattern Recognition <span class="chip" style="font-size: 60%">122</span>
+                    </Summary>
+                    <div class="details-content">
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Video-MME: The First-Ever Comprehensive Evaluation Benchmark of
+  Multi-modal LLMs in Video Analysis 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.21075v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.21075v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Chaoyou Fu, Yuhan Dai, Yondong Luo, Lei Li, Shuhuai Ren, Renrui Zhang, Zihan Wang, Chenyu Zhou, Yunhang Shen, Mengdan Zhang, Peixian Chen, Yanwei Li, Shaohui Lin, Sirui Zhao, Ke Li, Tong Xu, Xiawu Zheng, Enhong Chen, Rongrong Ji, Xing Sun
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  In the quest for artificial general intelligence, Multi-modal Large Language
+Models (MLLMs) have emerged as a focal point in recent advancements. However,
+the predominant focus remains on developing their capabilities in static image
+understanding. The potential of MLLMs in processing sequential visual data is
+still insufficiently explored, highlighting the absence of a comprehensive,
+high-quality assessment of their performance. In this paper, we introduce
+Video-MME, the first-ever full-spectrum, Multi-Modal Evaluation benchmark of
+MLLMs in Video analysis. Our work distinguishes from existing benchmarks
+through four key features: 1) Diversity in video types, spanning 6 primary
+visual domains with 30 subfields to ensure broad scenario generalizability; 2)
+Duration in temporal dimension, encompassing both short-, medium-, and
+long-term videos, ranging from 11 seconds to 1 hour, for robust contextual
+dynamics; 3) Breadth in data modalities, integrating multi-modal inputs besides
+video frames, including subtitles and audios, to unveil the all-round
+capabilities of MLLMs; 4) Quality in annotations, utilizing rigorous manual
+labeling by expert annotators to facilitate precise and reliable model
+assessment. 900 videos with a total of 256 hours are manually selected and
+annotated by repeatedly viewing all the video content, resulting in 2,700
+question-answer pairs. With Video-MME, we extensively evaluate various
+state-of-the-art MLLMs, including GPT-4 series and Gemini 1.5 Pro, as well as
+open-source image models like InternVL-Chat-V1.5 and video models like
+LLaVA-NeXT-Video. Our experiments reveal that Gemini 1.5 Pro is the
+best-performing commercial model, significantly outperforming the open-source
+models. Our dataset along with these findings underscores the need for further
+improvements in handling longer sequences and multi-modal data. Project Page:
+https://video-mme.github.io
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Project Page: https://video-mme.github.io</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Latent Intrinsics Emerge from Training to Relight 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.21074v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.21074v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Xiao Zhang, William Gao, Seemandhar Jain, Michael Maire, David. A. Forsyth, Anand Bhattad
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Image relighting is the task of showing what a scene from a source image
+would look like if illuminated differently. Inverse graphics schemes recover an
+explicit representation of geometry and a set of chosen intrinsics, then
+relight with some form of renderer. However error control for inverse graphics
+is difficult, and inverse graphics methods can represent only the effects of
+the chosen intrinsics. This paper describes a relighting method that is
+entirely data-driven, where intrinsics and lighting are each represented as
+latent variables. Our approach produces SOTA relightings of real scenes, as
+measured by standard metrics. We show that albedo can be recovered from our
+latent intrinsics without using any example albedos, and that the albedos
+recovered are competitive with SOTA methods.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Generalization Beyond Data Imbalance: A Controlled Study on CLIP for
+  Transferable Insights 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.21070v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.21070v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Xin Wen, Bingchen Zhao, Yilun Chen, Jiangmiao Pang, Xiaojuan Qi
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Severe data imbalance naturally exists among web-scale vision-language
+datasets. Despite this, we find CLIP pre-trained thereupon exhibits notable
+robustness to the data imbalance compared to supervised learning, and
+demonstrates significant effectiveness in learning generalizable
+representations. With an aim to investigate the reasons behind this finding, we
+conduct controlled experiments to study various underlying factors, and reveal
+that CLIP's pretext task forms a dynamic classification problem wherein only a
+subset of classes is present in training. This isolates the bias from dominant
+classes and implicitly balances the learning signal. Furthermore, the
+robustness and discriminability of CLIP improve with more descriptive language
+supervision, larger data scale, and broader open-world concepts, which are
+inaccessible to supervised learning. Our study not only uncovers the mechanisms
+behind CLIP's generalizability beyond data imbalance but also provides
+transferable insights for the research community. The findings are validated in
+both supervised and self-supervised learning, enabling models trained on
+imbalanced data to achieve CLIP-level performance on diverse recognition tasks.
+Code will be available at: https://github.com/CVMI-Lab/clip-beyond-tail.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Mixed Diffusion for 3D Indoor Scene Synthesis <span class="chip">SP</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.21066v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.21066v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Siyi Hu, Diego Martin Arroyo, Stephanie Debats, Fabian Manhardt, Luca Carlone, Federico Tombari
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Realistic conditional 3D scene synthesis significantly enhances and
+accelerates the creation of virtual environments, which can also provide
+extensive training data for computer vision and robotics research among other
+applications. Diffusion models have shown great performance in related
+applications, e.g., making precise arrangements of unordered sets. However,
+these models have not been fully explored in floor-conditioned scene synthesis
+problems. We present MiDiffusion, a novel mixed discrete-continuous diffusion
+model architecture, designed to synthesize plausible 3D indoor scenes from
+given room types, floor plans, and potentially pre-existing objects. We
+represent a scene layout by a 2D floor plan and a set of objects, each defined
+by its category, location, size, and orientation. Our approach uniquely
+implements structured corruption across the mixed discrete semantic and
+continuous geometric domains, resulting in a better conditioned problem for the
+reverse denoising step. We evaluate our approach on the 3D-FRONT dataset. Our
+experimental results demonstrate that MiDiffusion substantially outperforms
+state-of-the-art autoregressive and diffusion models in floor-conditioned 3D
+scene synthesis. In addition, our models can handle partial object constraints
+via a corruption-and-masking strategy without task specific training. We show
+MiDiffusion maintains clear advantages over existing approaches in scene
+completion and furniture arrangement experiments.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>19 pages, 14 figures. Under review. Code to be released at:
+  https://github.com/MIT-SPARK/MiDiffusion</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Unified Directly Denoising for Both Variance Preserving and Variance
+  Exploding Diffusion Models 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.21059v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.21059v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Jingjing Wang, Dan Zhang, Feng Luo
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Previous work has demonstrated that, in the Variance Preserving (VP)
+scenario, the nascent Directly Denoising Diffusion Models (DDDM) can generate
+high-quality images in one step while achieving even better performance in
+multistep sampling. However, the Pseudo-LPIPS loss used in DDDM leads to
+concerns about the bias in assessment. Here, we propose a unified DDDM (uDDDM)
+framework that generates images in one-step/multiple steps for both Variance
+Preserving (VP) and Variance Exploding (VE) cases. We provide theoretical
+proofs of the existence and uniqueness of the model's solution paths, as well
+as the non-intersecting property of the sampling paths. Additionally, we
+propose an adaptive Pseudo-Huber loss function to balance the convergence to
+the true solution and the stability of convergence process.Through a
+comprehensive evaluation, we demonstrate that uDDDMs achieve FID scores
+comparable to the best-performing methods available for CIFAR-10 in both VP and
+VE. Specifically, uDDDM achieves one-step generation on CIFAR10 with FID of
+2.63 and 2.53 for VE and VP respectively. By extending the sampling to 1000
+steps, we further reduce FID score to 1.71 and 1.65 for VE and VP respectively,
+setting state-of-the-art performance in both cases.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ An Organic Weed Control Prototype using Directed Energy and Deep
+  Learning 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.21056v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.21056v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Deng Cao, Hongbo Zhang, Rajveer Dhillon
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Organic weed control is a vital to improve crop yield with a sustainable
+approach. In this work, a directed energy weed control robot prototype
+specifically designed for organic farms is proposed. The robot uses a novel
+distributed array robot (DAR) unit for weed treatment. Soybean and corn
+databases are built to train deep learning neural nets to perform weed
+recognition. The initial deep learning neural nets show a high performance in
+classifying crops. The robot uses a patented directed energy plant eradication
+recipe that is completely organic and UV-C free, with no chemical damage or
+physical disturbance to the soil. The deep learning can classify 8 common weed
+species in a soybean field under natural environment with up to 98% accuracy.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Spectrum-Aware Parameter Efficient Fine-Tuning for Diffusion Models 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.21050v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.21050v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Xinxi Zhang, Song Wen, Ligong Han, Felix Juefei-Xu, Akash Srivastava, Junzhou Huang, Hao Wang, Molei Tao, Dimitris N. Metaxas
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Adapting large-scale pre-trained generative models in a parameter-efficient
+manner is gaining traction. Traditional methods like low rank adaptation
+achieve parameter efficiency by imposing constraints but may not be optimal for
+tasks requiring high representation capacity. We propose a novel spectrum-aware
+adaptation framework for generative models. Our method adjusts both singular
+values and their basis vectors of pretrained weights. Using the Kronecker
+product and efficient Stiefel optimizers, we achieve parameter-efficient
+adaptation of orthogonal matrices. We introduce Spectral Orthogonal
+Decomposition Adaptation (SODA), which balances computational efficiency and
+representation capacity. Extensive evaluations on text-to-image diffusion
+models demonstrate SODA's effectiveness, offering a spectrum-aware alternative
+to existing fine-tuning methods.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Kaleido Diffusion: Improving Conditional Diffusion Models with
+  Autoregressive Latent Modeling 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.21048v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.21048v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Jiatao Gu, Ying Shen, Shuangfei Zhai, Yizhe Zhang, Navdeep Jaitly, Joshua M. Susskind
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Diffusion models have emerged as a powerful tool for generating high-quality
+images from textual descriptions. Despite their successes, these models often
+exhibit limited diversity in the sampled images, particularly when sampling
+with a high classifier-free guidance weight. To address this issue, we present
+Kaleido, a novel approach that enhances the diversity of samples by
+incorporating autoregressive latent priors. Kaleido integrates an
+autoregressive language model that encodes the original caption and generates
+latent variables, serving as abstract and intermediary representations for
+guiding and facilitating the image generation process. In this paper, we
+explore a variety of discrete latent representations, including textual
+descriptions, detection bounding boxes, object blobs, and visual tokens. These
+representations diversify and enrich the input conditions to the diffusion
+models, enabling more diverse outputs. Our experimental results demonstrate
+that Kaleido effectively broadens the diversity of the generated image samples
+from a given textual description while maintaining high image quality.
+Furthermore, we show that Kaleido adheres closely to the guidance provided by
+the generated latent variables, demonstrating its capability to effectively
+control and direct the image generation process.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>22 pages, 14 figures</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ You Only Scan Once: Efficient Multi-dimension Sequential Modeling with
+  LightNet 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.21022v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.21022v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Zhen Qin, Yuxin Mao, Xuyang Shen, Dong Li, Jing Zhang, Yuchao Dai, Yiran Zhong
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Linear attention mechanisms have gained prominence in causal language models
+due to their linear computational complexity and enhanced speed. However, the
+inherent decay mechanism in linear attention presents challenges when applied
+to multi-dimensional sequence modeling tasks, such as image processing and
+multi-modal learning. In these scenarios, the utilization of sequential
+scanning to establish a global receptive field necessitates multiple scans for
+multi-dimensional data, thereby leading to inefficiencies. This paper
+identifies the inefficiency caused by a multiplicative linear recurrence and
+proposes an efficient alternative additive linear recurrence to avoid the
+issue, as it can handle multi-dimensional data within a single scan. We further
+develop an efficient multi-dimensional sequential modeling framework called
+LightNet based on the new recurrence. Moreover, we present two new
+multi-dimensional linear relative positional encoding methods, MD-TPE and
+MD-LRPE to enhance the model's ability to discern positional information in
+multi-dimensional scenarios. Our empirical evaluations across various tasks,
+including image classification, image generation, bidirectional language
+modeling, and autoregressive language modeling, demonstrate the efficacy of
+LightNet, showcasing its potential as a versatile and efficient solution for
+multi-dimensional sequential modeling.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Technical report. Yiran Zhong is the corresponding author. The code
+  is available at https://github.com/OpenNLPLab/LightNet</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ MpoxSLDNet: A Novel CNN Model for Detecting Monkeypox Lesions and
+  Performance Comparison with <span class="highlight-title">Pre-train</span>ed Models 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.21016v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.21016v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Fatema Jannat Dihan, Saydul Akbar Murad, Abu Jafar Md Muzahid, K. M. Aslam Uddin, Mohammed J. F. Alenazi, Anupam Kumar Bairagi, Sujit Biswas
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Monkeypox virus (MPXV) is a zoonotic virus that poses a significant threat to
+public health, particularly in remote parts of Central and West Africa. Early
+detection of monkeypox lesions is crucial for effective treatment. However, due
+to its similarity with other skin diseases, monkeypox lesion detection is a
+challenging task. To detect monkeypox, many researchers used various
+deep-learning models such as MobileNetv2, VGG16, ResNet50, InceptionV3,
+DenseNet121, EfficientNetB3, MobileNetV2, and Xception. However, these models
+often require high storage space due to their large size. This study aims to
+improve the existing challenges by introducing a CNN model named MpoxSLDNet
+(Monkeypox Skin Lesion Detector Network) to facilitate early detection and
+categorization of Monkeypox lesions and Non-Monkeypox lesions in digital
+images. Our model represents a significant advancement in the field of
+monkeypox lesion detection by offering superior performance metrics, including
+precision, recall, F1-score, accuracy, and AUC, compared to traditional
+pre-trained models such as VGG16, ResNet50, and DenseNet121. The key novelty of
+our approach lies in MpoxSLDNet's ability to achieve high detection accuracy
+while requiring significantly less storage space than existing models. By
+addressing the challenge of high storage requirements, MpoxSLDNet presents a
+practical solution for early detection and categorization of monkeypox lesions
+in resource-constrained healthcare settings. In this study, we have used
+"Monkeypox Skin Lesion Dataset" comprising 1428 skin images of monkeypox
+lesions and 1764 skin images of Non-Monkeypox lesions. Dataset's limitations
+could potentially impact the model's ability to generalize to unseen cases.
+However, the MpoxSLDNet model achieved a validation accuracy of 94.56%,
+compared to 86.25%, 84.38%, and 67.19% for VGG16, DenseNet121, and ResNet50,
+respectively.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ StrucTexTv3: An Efficient Vision-Language Model for Text-rich Image
+  Perception, Comprehension, and Beyond 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.21013v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.21013v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Pengyuan Lyu, Yulin Li, Hao Zhou, Weihong Ma, Xingyu Wan, Qunyi Xie, Liang Wu, Chengquan Zhang, Kun Yao, Errui Ding, Jingdong Wang
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Text-rich images have significant and extensive value, deeply integrated into
+various aspects of human life. Notably, both visual cues and linguistic symbols
+in text-rich images play crucial roles in information transmission but are
+accompanied by diverse challenges. Therefore, the efficient and effective
+understanding of text-rich images is a crucial litmus test for the capability
+of Vision-Language Models. We have crafted an efficient vision-language model,
+StrucTexTv3, tailored to tackle various intelligent tasks for text-rich images.
+The significant design of StrucTexTv3 is presented in the following aspects:
+Firstly, we adopt a combination of an effective multi-scale reduced visual
+transformer and a multi-granularity token sampler (MG-Sampler) as a visual
+token generator, successfully solving the challenges of high-resolution input
+and complex representation learning for text-rich images. Secondly, we enhance
+the perception and comprehension abilities of StrucTexTv3 through instruction
+learning, seamlessly integrating various text-oriented tasks into a unified
+framework. Thirdly, we have curated a comprehensive collection of high-quality
+text-rich images, abbreviated as TIM-30M, encompassing diverse scenarios like
+incidental scenes, office documents, web pages, and screenshots, thereby
+improving the robustness of our model. Our method achieved SOTA results in
+text-rich image perception tasks, and significantly improved performance in
+comprehension tasks. Among multimodal models with LLM decoder of approximately
+1.8B parameters, it stands out as a leader, which also makes the deployment of
+edge devices feasible. In summary, the StrucTexTv3 model, featuring efficient
+structural design, outstanding performance, and broad adaptability, offers
+robust support for diverse intelligent application tasks involving text-rich
+images, thus exhibiting immense potential for widespread application.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Hard Cases Detection in Motion Prediction by Vision-Language Foundation
+  Models 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20991v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20991v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Yi Yang, Qingwen Zhang, Kei Ikemura, Nazre Batool, John Folkesson
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Addressing hard cases in autonomous driving, such as anomalous road users,
+extreme weather conditions, and complex traffic interactions, presents
+significant challenges. To ensure safety, it is crucial to detect and manage
+these scenarios effectively for autonomous driving systems. However, the rarity
+and high-risk nature of these cases demand extensive, diverse datasets for
+training robust models. Vision-Language Foundation Models (VLMs) have shown
+remarkable zero-shot capabilities as being trained on extensive datasets. This
+work explores the potential of VLMs in detecting hard cases in autonomous
+driving. We demonstrate the capability of VLMs such as GPT-4v in detecting hard
+cases in traffic participant motion prediction on both agent and scenario
+levels. We introduce a feasible pipeline where VLMs, fed with sequential image
+frames with designed prompts, effectively identify challenging agents or
+scenarios, which are verified by existing prediction models. Moreover, by
+taking advantage of this detection of hard cases by VLMs, we further improve
+the training efficiency of the existing motion prediction pipeline by
+performing data selection for the training samples suggested by GPT. We show
+the effectiveness and feasibility of our pipeline incorporating VLMs with
+state-of-the-art methods on NuScenes datasets. The code is accessible at
+https://github.com/KTH-RPL/Detect_VLM.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>IEEE Intelligent Vehicles Symposium (IV) 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Early Stopping Criteria for Training Generative Adversarial Networks in
+  Biomedical Imaging <span class="chip">SC 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20987v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20987v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Muhammad Muneeb Saad, Mubashir Husain Rehmani, Ruairi O'Reilly
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Generative Adversarial Networks (GANs) have high computational costs to train
+their complex architectures. Throughout the training process, GANs' output is
+analyzed qualitatively based on the loss and synthetic images' diversity and
+quality. Based on this qualitative analysis, training is manually halted once
+the desired synthetic images are generated. By utilizing an early stopping
+criterion, the computational cost and dependence on manual oversight can be
+reduced yet impacted by training problems such as mode collapse,
+non-convergence, and instability. This is particularly prevalent in biomedical
+imagery, where training problems degrade the diversity and quality of synthetic
+images, and the high computational cost associated with training makes complex
+architectures increasingly inaccessible. This work proposes a novel early
+stopping criteria to quantitatively detect training problems, halt training,
+and reduce the computational costs associated with synthesizing biomedical
+images. Firstly, the range of generator and discriminator loss values is
+investigated to assess whether mode collapse, non-convergence, and instability
+occur sequentially, concurrently, or interchangeably throughout the training of
+GANs. Secondly, utilizing these occurrences in conjunction with the Mean
+Structural Similarity Index (MS-SSIM) and Fr\'echet Inception Distance (FID)
+scores of synthetic images forms the basis of the proposed early stopping
+criteria. This work helps identify the occurrence of training problems in GANs
+using low-resource computational cost and reduces training time to generate
+diversified and high-quality synthetic images.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>This paper is accepted at the 35th IEEE Irish Signals and Systems
+  Conference (ISSC 2024)</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Uncertainty Quantification for Bird's Eye View Semantic Segmentation:
+  Methods and Benchmarks 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20986v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20986v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Linlin Yu, Bowen Yang, Tianhao Wang, Kangshuo Li, Feng Chen
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  The fusion of raw features from multiple sensors on an autonomous vehicle to
+create a Bird's Eye View (BEV) representation is crucial for planning and
+control systems. There is growing interest in using deep learning models for
+BEV semantic segmentation. Anticipating segmentation errors and improving the
+explainability of DNNs is essential for autonomous driving, yet it is
+under-studied. This paper introduces a benchmark for predictive uncertainty
+quantification in BEV segmentation. The benchmark assesses various approaches
+across three popular datasets using two representative backbones and focuses on
+the effectiveness of predicted uncertainty in identifying misclassified and
+out-of-distribution (OOD) pixels, as well as calibration. Empirical findings
+highlight the challenges in uncertainty quantification. Our results find that
+evidential deep learning based approaches show the most promise by efficiently
+quantifying aleatoric and epistemic uncertainty. We propose the
+Uncertainty-Focal-Cross-Entropy (UFCE) loss, designed for highly imbalanced
+data, which consistently improves the segmentation quality and calibration.
+Additionally, we introduce a vacuity-scaled regularization term that enhances
+the model's focus on high uncertainty pixels, improving epistemic uncertainty
+quantification.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ DeCo: Decoupling Token Compression from Semantic Abstraction in
+  Multimodal Large Language Models 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20985v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20985v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Linli Yao, Lei Li, Shuhuai Ren, Lean Wang, Yuanxin Liu, Xu Sun, Lu Hou
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  The visual projector, which bridges the vision and language modalities and
+facilitates cross-modal alignment, serves as a crucial component in MLLMs.
+However, measuring the effectiveness of projectors in vision-language alignment
+remains under-explored, which currently can only be inferred from the
+performance of MLLMs on downstream tasks. Motivated by the problem, this study
+examines the projector module by interpreting the vision-language semantic flow
+within MLLMs. Specifically, we trace back the semantic relevance flow from
+generated language tokens to raw visual encoder patches and the intermediate
+outputs produced by projectors. Our findings reveal that compressive projectors
+(e.g., QFormer), abstract visual patches into a limited set of semantic
+concepts, such as objects or attributes, resulting in a 'double abstraction'
+phenomenon. This involves a first visual semantic abstraction by the projector
+referring to pre-defined query tokens, and a second extraction by the LLM based
+on text instructions. The double abstraction is inefficient in training and
+will result in cumulative vision semantics deficiency. To mitigate this issue,
+we propose the key insight of 'Decouple Compression from Abstraction (DeCo),
+that is compressing the visual token number at the patch level by projectors
+and allowing the LLM to handle visual semantic abstraction entirely.
+Consequently, we adopt a simple compressor, i.e., 2D Adaptive Pooling, to
+downsample visual patches in a parameter-free manner. Empirical evaluation
+demonstrates that DeCo surpasses traditional compressive projectors regarding
+both performance and efficiency. It achieves performance gains of 0.9%, 7.1%,
+and 2.9% across the MLLM Benchmarks, Visual Localization, and Open-ended VQA
+tasks with fewer trainable parameters and faster convergence speed.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Generative Adversarial Networks in Ultrasound Imaging: Extending Field
+  of View Beyond Conventional Limits 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20981v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20981v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Matej Gazda, Samuel Kadoury, Jakub Gazda, Peter Drotar
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Transthoracic Echocardiography (TTE) is a fundamental, non-invasive
+diagnostic tool in cardiovascular medicine, enabling detailed visualization of
+cardiac structures crucial for diagnosing various heart conditions. Despite its
+widespread use, TTE ultrasound imaging faces inherent limitations, notably the
+trade-off between field of view (FoV) and resolution. This paper introduces a
+novel application of conditional Generative Adversarial Networks (cGANs),
+specifically designed to extend the FoV in TTE ultrasound imaging while
+maintaining high resolution. Our proposed cGAN architecture, termed echoGAN,
+demonstrates the capability to generate realistic anatomical structures through
+outpainting, effectively broadening the viewable area in medical imaging. This
+advancement has the potential to enhance both automatic and manual ultrasound
+navigation, offering a more comprehensive view that could significantly reduce
+the learning curve associated with ultrasound imaging and aid in more accurate
+diagnoses. The results confirm that echoGAN reliably reproduce detailed cardiac
+features, thereby promising a significant step forward in the field of
+non-invasive cardiac naviagation and diagnostics.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Neural Gaussian Scale-Space Fields <span class="chip">SIGGRAPH 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20980v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20980v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Felix Mujkanovic, Ntumba Elie Nsampi, Christian Theobalt, Hans-Peter Seidel, Thomas Leimkühler
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Gaussian scale spaces are a cornerstone of signal representation and
+processing, with applications in filtering, multiscale analysis, anti-aliasing,
+and many more. However, obtaining such a scale space is costly and cumbersome,
+in particular for continuous representations such as neural fields. We present
+an efficient and lightweight method to learn the fully continuous, anisotropic
+Gaussian scale space of an arbitrary signal. Based on Fourier feature
+modulation and Lipschitz bounding, our approach is trained self-supervised,
+i.e., training does not require any manual filtering. Our neural Gaussian
+scale-space fields faithfully capture multiscale representations across a broad
+range of modalities, and support a diverse set of applications. These include
+images, geometry, light-stage data, texture anti-aliasing, and multiscale
+optimization.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>15 pages; SIGGRAPH 2024; project page at
+  https://neural-gaussian-scale-space-fields.mpi-inf.mpg.de</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         <span class="highlight-title">★</span> Amortizing intractable inference in diffusion models for vision,
+  language, and control 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20971v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20971v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Siddarth Venkatraman, Moksh Jain, Luca Scimeca, Minsu Kim, Marcin Sendera, Mohsin Hasan, Luke Rowe, Sarthak Mittal, Pablo Lemos, Emmanuel Bengio, Alexandre Adam, Jarrid Rector-Brooks, <span class="highlight-author">Yoshua Bengio</span>, Glen Berseth, Nikolay Malkin
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Diffusion models have emerged as effective distribution estimators in vision,
+language, and reinforcement learning, but their use as priors in downstream
+tasks poses an intractable posterior inference problem. This paper studies
+amortized sampling of the posterior over data, $\mathbf{x}\sim p^{\rm
+post}(\mathbf{x})\propto p(\mathbf{x})r(\mathbf{x})$, in a model that consists
+of a diffusion generative model prior $p(\mathbf{x})$ and a black-box
+constraint or likelihood function $r(\mathbf{x})$. We state and prove the
+asymptotic correctness of a data-free learning objective, relative trajectory
+balance, for training a diffusion model that samples from this posterior, a
+problem that existing methods solve only approximately or in restricted cases.
+Relative trajectory balance arises from the generative flow network perspective
+on diffusion models, which allows the use of deep reinforcement learning
+techniques to improve mode coverage. Experiments illustrate the broad potential
+of unbiased inference of arbitrary posteriors under diffusion priors: in vision
+(classifier guidance), language (infilling under a discrete diffusion LLM), and
+multimodal data (text-to-image generation). Beyond generative modeling, we
+apply relative trajectory balance to the problem of continuous control with a
+score-based behavior prior, achieving state-of-the-art results on benchmarks in
+offline reinforcement learning.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Code: https://github.com/GFNOrg/diffusion-finetuning</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Fast yet Safe: Early-Exiting with Risk Control 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20915v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20915v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Metod Jazbec, Alexander Timans, Tin Hadži Veljković, Kaspar Sakmann, Dan Zhang, Christian A. Naesseth, Eric Nalisnick
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Scaling machine learning models significantly improves their performance.
+However, such gains come at the cost of inference being slow and
+resource-intensive. Early-exit neural networks (EENNs) offer a promising
+solution: they accelerate inference by allowing intermediate layers to exit and
+produce a prediction early. Yet a fundamental issue with EENNs is how to
+determine when to exit without severely degrading performance. In other words,
+when is it 'safe' for an EENN to go 'fast'? To address this issue, we
+investigate how to adapt frameworks of risk control to EENNs. Risk control
+offers a distribution-free, post-hoc solution that tunes the EENN's exiting
+mechanism so that exits only occur when the output is of sufficient quality. We
+empirically validate our insights on a range of vision and language tasks,
+demonstrating that risk control can produce substantial computational savings,
+all the while preserving user-specified performance goals.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>25 pages, 11 figures, 4 tables (incl. appendix)</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Predicting ptychography probe positions using single-shot phase
+  retrieval neural network 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20910v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20910v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Ming Du, Tao Zhou, Junjing Deng, Daniel J. Ching, Steven Henke, Mathew J. Cherukara
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Ptychography is a powerful imaging technique that is used in a variety of
+fields, including materials science, biology, and nanotechnology. However, the
+accuracy of the reconstructed ptychography image is highly dependent on the
+accuracy of the recorded probe positions which often contain errors. These
+errors are typically corrected jointly with phase retrieval through numerical
+optimization approaches. When the error accumulates along the scan path or when
+the error magnitude is large, these approaches may not converge with
+satisfactory result. We propose a fundamentally new approach for ptychography
+probe position prediction for data with large position errors, where a neural
+network is used to make single-shot phase retrieval on individual diffraction
+patterns, yielding the object image at each scan point. The pairwise offsets
+among these images are then found using a robust image registration method, and
+the results are combined to yield the complete scan path by constructing and
+solving a linear equation. We show that our method can achieve good position
+prediction accuracy for data with large and accumulating errors on the order of
+$10^2$ pixels, a magnitude that often makes optimization-based algorithms fail
+to converge. For ptychography instruments without sophisticated position
+control equipment such as interferometers, our method is of significant
+practical potential.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Enhancing Vision Models for Text-Heavy Content Understanding and
+  Interaction 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20906v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20906v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Adithya TG, Adithya SK, Abhinav R Bharadwaj, Abhiram HA, Dr. Surabhi Narayan
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Interacting and understanding with text heavy visual content with multiple
+images is a major challenge for traditional vision models. This paper is on
+enhancing vision models' capability to comprehend or understand and learn from
+images containing a huge amount of textual information from the likes of
+textbooks and research papers which contain multiple images like graphs, etc
+and tables in them with different types of axes and scales. The approach
+involves dataset preprocessing, fine tuning which is by using instructional
+oriented data and evaluation. We also built a visual chat application
+integrating CLIP for image encoding and a model from the Massive Text Embedding
+Benchmark which is developed to consider both textual and visual inputs. An
+accuracy of 96.71% was obtained. The aim of the project is to increase and also
+enhance the advance vision models' capabilities in understanding complex visual
+textual data interconnected data, contributing to multimodal AI.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>5 pages, 4 figures (including 1 graph)</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ MALT: Multi-scale Action Learning <span class="highlight-title">Transformer</span> for Online Action
+  Detection 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20892v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20892v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Zhipeng Yang, Ruoyu Wang, Yang Tan, Liping Xie
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Online action detection (OAD) aims to identify ongoing actions from streaming
+video in real-time, without access to future frames. Since these actions
+manifest at varying scales of granularity, ranging from coarse to fine,
+projecting an entire set of action frames to a single latent encoding may
+result in a lack of local information, necessitating the acquisition of action
+features across multiple scales. In this paper, we propose a multi-scale action
+learning transformer (MALT), which includes a novel recurrent decoder (used for
+feature fusion) that includes fewer parameters and can be trained more
+efficiently. A hierarchical encoder with multiple encoding branches is further
+proposed to capture multi-scale action features. The output from the preceding
+branch is then incrementally input to the subsequent branch as part of a
+cross-attention calculation. In this way, output features transition from
+coarse to fine as the branches deepen. We also introduce an explicit frame
+scoring mechanism employing sparse attention, which filters irrelevant frames
+more efficiently, without requiring an additional network. The proposed method
+achieved state-of-the-art performance on two benchmark datasets (THUMOS'14 and
+TVSeries), outperforming all existing models used for comparison, with an mAP
+of 0.2% for THUMOS'14 and an mcAP of 0.1% for TVseries.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>8 pages, 3 figures</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ S4Fusion: Saliency-aware Selective State Space Model for Infrared
+  Visible Image Fusion 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20881v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20881v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Haolong Ma, Hui Li, Chunyang Cheng, Gaoang Wang, Xiaoning Song, Xiaojun Wu
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  As one of the tasks in Image Fusion, Infrared and Visible Image Fusion aims
+to integrate complementary information captured by sensors of different
+modalities into a single image. The Selective State Space Model (SSSM), known
+for its ability to capture long-range dependencies, has demonstrated its
+potential in the field of computer vision. However, in image fusion, current
+methods underestimate the potential of SSSM in capturing the global spatial
+information of both modalities. This limitation prevents the simultaneous
+consideration of the global spatial information from both modalities during
+interaction, leading to a lack of comprehensive perception of salient targets.
+Consequently, the fusion results tend to bias towards one modality instead of
+adaptively preserving salient targets. To address this issue, we propose the
+Saliency-aware Selective State Space Fusion Model (S4Fusion). In our S4Fusion,
+the designed Cross-Modal Spatial Awareness Module (CMSA) can simultaneously
+focus on global spatial information from both modalities while facilitating
+their interaction, thereby comprehensively capturing complementary information.
+Additionally, S4Fusion leverages a pre-trained network to perceive uncertainty
+in the fused images. By minimizing this uncertainty, S4Fusion adaptively
+highlights salient targets from both images. Extensive experiments demonstrate
+that our approach produces high-quality images and enhances performance in
+downstream tasks.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>NurIPS, Under review</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Investigating Calibration and Corruption Robustness of Post-hoc Pruned
+  Perception CNNs: An Image Classification Benchmark Study 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20876v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20876v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Pallavi Mitra, Gesina Schwalbe, Nadja Klein
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Convolutional Neural Networks (CNNs) have achieved state-of-the-art
+performance in many computer vision tasks. However, high computational and
+storage demands hinder their deployment into resource-constrained environments,
+such as embedded devices. Model pruning helps to meet these restrictions by
+reducing the model size, while maintaining superior performance. Meanwhile,
+safety-critical applications pose more than just resource and performance
+constraints. In particular, predictions must not be overly confident, i.e.,
+provide properly calibrated uncertainty estimations (proper uncertainty
+calibration), and CNNs must be robust against corruptions like naturally
+occurring input perturbations (natural corruption robustness). This work
+investigates the important trade-off between uncertainty calibration, natural
+corruption robustness, and performance for current state-of-research post-hoc
+CNN pruning techniques in the context of image classification tasks. Our study
+reveals that post-hoc pruning substantially improves the model's uncertainty
+calibration, performance, and natural corruption robustness, sparking hope for
+safe and robust embedded CNNs.Furthermore, uncertainty calibration and natural
+corruption robustness are not mutually exclusive targets under pruning, as
+evidenced by the improved safety aspects obtained by post-hoc unstructured
+pruning with increasing compression.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>11 pages, 3 figures</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Responsible AI for Earth Observation 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20868v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20868v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Pedram Ghamisi, Weikang Yu, Andrea Marinoni, Caroline M. Gevaert, Claudio Persello, Sivasakthy Selvakumaran, Manuela Girotto, Benjamin P. Horton, Philippe Rufin, Patrick Hostert, Fabio Pacifici, Peter M. Atkinson
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  The convergence of artificial intelligence (AI) and Earth observation (EO)
+technologies has brought geoscience and remote sensing into an era of
+unparalleled capabilities. AI's transformative impact on data analysis,
+particularly derived from EO platforms, holds great promise in addressing
+global challenges such as environmental monitoring, disaster response and
+climate change analysis. However, the rapid integration of AI necessitates a
+careful examination of the responsible dimensions inherent in its application
+within these domains. In this paper, we represent a pioneering effort to
+systematically define the intersection of AI and EO, with a central focus on
+responsible AI practices. Specifically, we identify several critical components
+guiding this exploration from both academia and industry perspectives within
+the EO field: AI and EO for social good, mitigating unfair biases, AI security
+in EO, geo-privacy and privacy-preserving measures, as well as maintaining
+scientific excellence, open data, and guiding AI usage based on ethical
+principles. Furthermore, the paper explores potential opportunities and
+emerging trends, providing valuable insights for future research endeavors.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Automatic Channel Pruning for Multi-Head Attention 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20867v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20867v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Eunho Lee, Youngbae Hwang
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Despite the strong performance of Transformers, their quadratic computation
+complexity presents challenges in applying them to vision tasks. Automatic
+pruning is one of effective methods for reducing computation complexity without
+heuristic approaches. However, directly applying it to multi-head attention is
+not straightforward due to channel misalignment. In this paper, we propose an
+automatic channel pruning method to take into account the multi-head attention
+mechanism. First, we incorporate channel similarity-based weights into the
+pruning indicator to preserve more informative channels in each head. Then, we
+adjust pruning indicator to enforce removal of channels in equal proportions
+across all heads, preventing the channel misalignment. We also add a reweight
+module to compensate for information loss resulting from channel removal, and
+an effective initialization step for pruning indicator based on difference of
+attention between original structure and each channel. Our proposed method can
+be used to not only original attention, but also linear attention, which is
+more efficient as linear complexity with respect to the number of tokens. On
+ImageNet-1K, applying our pruning method to the FLattenTransformer, which
+includes both attention mechanisms, shows outperformed accuracy for several
+MACs compared with previous state-of-the-art efficient models and pruned
+methods. Code will be available soon.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ MeshXL: Neural Coordinate Field for Generative 3D Foundation Models 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20853v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20853v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Sijin Chen, Xin Chen, Anqi Pang, Xianfang Zeng, Wei Cheng, Yijun Fu, Fukun Yin, Yanru Wang, Zhibin Wang, Chi Zhang, Jingyi Yu, Gang Yu, Bin Fu, Tao Chen
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  The polygon mesh representation of 3D data exhibits great flexibility, fast
+rendering speed, and storage efficiency, which is widely preferred in various
+applications. However, given its unstructured graph representation, the direct
+generation of high-fidelity 3D meshes is challenging. Fortunately, with a
+pre-defined ordering strategy, 3D meshes can be represented as sequences, and
+the generation process can be seamlessly treated as an auto-regressive problem.
+In this paper, we validate the Neural Coordinate Field (NeurCF), an explicit
+coordinate representation with implicit neural embeddings, is a
+simple-yet-effective representation for large-scale sequential mesh modeling.
+After that, we present MeshXL, a family of generative pre-trained
+auto-regressive models, which addresses the process of 3D mesh generation with
+modern large language model approaches. Extensive experiments show that MeshXL
+is able to generate high-quality 3D meshes, and can also serve as foundation
+models for various down-stream applications.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ MegActor: Harness the Power of Raw Video for Vivid Portrait Animation 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20851v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20851v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Shurong Yang, Huadong Li, Juhao Wu, Minhao Jing, Linze Li, Renhe Ji, Jiajun Liang, Haoqiang Fan
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Despite raw driving videos contain richer information on facial expressions
+than intermediate representations such as landmarks in the field of portrait
+animation, they are seldom the subject of research. This is due to two
+challenges inherent in portrait animation driven with raw videos: 1)
+significant identity leakage; 2) Irrelevant background and facial details such
+as wrinkles degrade performance. To harnesses the power of the raw videos for
+vivid portrait animation, we proposed a pioneering conditional diffusion model
+named as MegActor. First, we introduced a synthetic data generation framework
+for creating videos with consistent motion and expressions but inconsistent IDs
+to mitigate the issue of ID leakage. Second, we segmented the foreground and
+background of the reference image and employed CLIP to encode the background
+details. This encoded information is then integrated into the network via a
+text embedding module, thereby ensuring the stability of the background.
+Finally, we further style transfer the appearance of the reference image to the
+driving video to eliminate the influence of facial details in the driving
+videos. Our final model was trained solely on public datasets, achieving
+results comparable to commercial models. We hope this will help the open-source
+community.The code is available at
+https://github.com/megvii-research/MegFaceAnimate.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ einspace: Searching for Neural Architectures from Fundamental Operations 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20838v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20838v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Linus Ericsson, Miguel Espinosa, Chenhongyi Yang, Antreas Antoniou, Amos Storkey, Shay B. Cohen, Steven McDonagh, Elliot J. Crowley
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Neural architecture search (NAS) finds high performing networks for a given
+task. Yet the results of NAS are fairly prosaic; they did not e.g. create a
+shift from convolutional structures to transformers. This is not least because
+the search spaces in NAS often aren't diverse enough to include such
+transformations a priori. Instead, for NAS to provide greater potential for
+fundamental design shifts, we need a novel expressive search space design which
+is built from more fundamental operations. To this end, we introduce einspace,
+a search space based on a parameterised probabilistic context-free grammar. Our
+space is versatile, supporting architectures of various sizes and complexities,
+while also containing diverse network operations which allow it to model
+convolutions, attention components and more. It contains many existing
+competitive architectures, and provides flexibility for discovering new ones.
+Using this search space, we perform experiments to find novel architectures as
+well as improvements on existing ones on the diverse Unseen NAS datasets. We
+show that competitive architectures can be obtained by searching from scratch,
+and we consistently find large improvements when initialising the search with
+strong baselines. We believe that this work is an important advancement towards
+a transformative NAS paradigm where search space expressivity and strategic
+search initialisation play key roles.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Project page at https://linusericsson.github.io/einspace/</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Retrieval Meets Reasoning: Even High-school Textbook Knowledge Benefits
+  Multimodal Reasoning 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20834v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20834v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Cheng Tan, Jingxuan Wei, Linzhuang Sun, Zhangyang Gao, Siyuan Li, Bihui Yu, Ruifeng Guo, Stan Z. Li
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Large language models equipped with retrieval-augmented generation (RAG)
+represent a burgeoning field aimed at enhancing answering capabilities by
+leveraging external knowledge bases. Although the application of RAG with
+language-only models has been extensively explored, its adaptation into
+multimodal vision-language models remains nascent. Going beyond mere answer
+generation, the primary goal of multimodal RAG is to cultivate the models'
+ability to reason in response to relevant queries. To this end, we introduce a
+novel multimodal RAG framework named RMR (Retrieval Meets Reasoning). The RMR
+framework employs a bi-modal retrieval module to identify the most relevant
+question-answer pairs, which then serve as scaffolds for the multimodal
+reasoning process. This training-free approach not only encourages the model to
+engage deeply with the reasoning processes inherent in the retrieved content
+but also facilitates the generation of answers that are precise and richly
+interpretable. Surprisingly, utilizing solely the ScienceQA dataset, collected
+from elementary and high school science curricula, RMR significantly boosts the
+performance of various vision-language models across a spectrum of benchmark
+datasets, including A-OKVQA, MMBench, and SEED. These outcomes highlight the
+substantial potential of our multimodal retrieval and reasoning mechanism to
+improve the reasoning capabilities of vision-language models.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Under review</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Rethinking Open-World Semi-Supervised Learning: Distribution Mismatch
+  and Inductive Inference <span class="chip">CVPR</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20829v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20829v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Seongheon Park, Hyuk Kwon, Kwanghoon Sohn, Kibok Lee
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Open-world semi-supervised learning (OWSSL) extends conventional
+semi-supervised learning to open-world scenarios by taking account of novel
+categories in unlabeled datasets. Despite the recent advancements in OWSSL, the
+success often relies on the assumptions that 1) labeled and unlabeled datasets
+share the same balanced class prior distribution, which does not generally hold
+in real-world applications, and 2) unlabeled training datasets are utilized for
+evaluation, where such transductive inference might not adequately address
+challenges in the wild. In this paper, we aim to generalize OWSSL by addressing
+them. Our work suggests that practical OWSSL may require different training
+settings, evaluation methods, and learning strategies compared to those
+prevalent in the existing literature.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>CVPR Workshop on Computer Vision in the Wild (CVinW), 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Context-aware Difference Distilling for Multi-change Captioning <span class="chip">ACL 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20810v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20810v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Yunbin Tu, Liang Li, Li Su, Zheng-Jun Zha, Chenggang Yan, Qingming Huang
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Multi-change captioning aims to describe complex and coupled changes within
+an image pair in natural language. Compared with single-change captioning, this
+task requires the model to have higher-level cognition ability to reason an
+arbitrary number of changes. In this paper, we propose a novel context-aware
+difference distilling (CARD) network to capture all genuine changes for
+yielding sentences. Given an image pair, CARD first decouples context features
+that aggregate all similar/dissimilar semantics, termed common/difference
+context features. Then, the consistency and independence constraints are
+designed to guarantee the alignment/discrepancy of common/difference context
+features. Further, the common context features guide the model to mine locally
+unchanged features, which are subtracted from the pair to distill locally
+difference features. Next, the difference context features augment the locally
+difference features to ensure that all changes are distilled. In this way, we
+obtain an omni-representation of all changes, which is translated into
+linguistic sentences by a transformer decoder. Extensive experiments on three
+public datasets show CARD performs favourably against state-of-the-art
+methods.The code is available at https://github.com/tuyunbin/CARD.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted by ACL 2024 main conference (long paper)</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Ovis: Structural Embedding Alignment for Multimodal Large Language Model 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20797v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20797v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Shiyin Lu, Yang Li, Qing-Guo Chen, Zhao Xu, Weihua Luo, Kaifu Zhang, Han-Jia Ye
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Current Multimodal Large Language Models (MLLMs) typically integrate a
+pre-trained LLM with another pre-trained vision transformer through a
+connector, such as an MLP, endowing the LLM with visual capabilities. However,
+the misalignment between two embedding strategies in MLLMs -- the structural
+textual embeddings based on an embedding look-up table and the continuous
+embeddings generated directly by the vision encoder -- makes challenges for a
+more seamless fusion of visual and textual information. We propose Ovis, a
+novel MLLM architecture designed to structurally align visual and textual
+embeddings. Ovis integrates an additional learnable visual embedding table into
+the visual encoder's process. To capture rich visual semantics, each image
+patch indexes the visual embedding table multiple times, resulting in a final
+visual embedding that is a probabilistic combination of the indexed embeddings.
+This structural approach mirrors the method used for generating textual
+embeddings. Empirical evaluations on various multimodal benchmarks demonstrate
+that Ovis outperforms open-source MLLMs of similar parameter scales and even
+surpasses the proprietary model Qwen-VL-Plus overall. These results highlight
+the potential of Ovis' structured visual representation for advancing MLLM
+architectural design and promoting more effective multimodal learning. Both the
+source code and the training dataset of Ovis will be made publicly available.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ InsightSee: Advancing Multi-agent Vision-Language Models for Enhanced
+  Visual Understanding 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20795v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20795v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Huaxiang Zhang, Yaojia Mu, Guo-Niu Zhu, Zhongxue Gan
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Accurate visual understanding is imperative for advancing autonomous systems
+and intelligent robots. Despite the powerful capabilities of vision-language
+models (VLMs) in processing complex visual scenes, precisely recognizing
+obscured or ambiguously presented visual elements remains challenging. To
+tackle such issues, this paper proposes InsightSee, a multi-agent framework to
+enhance VLMs' interpretative capabilities in handling complex visual
+understanding scenarios. The framework comprises a description agent, two
+reasoning agents, and a decision agent, which are integrated to refine the
+process of visual information interpretation. The design of these agents and
+the mechanisms by which they can be enhanced in visual information processing
+are presented. Experimental results demonstrate that the InsightSee framework
+not only boosts performance on specific visual tasks but also retains the
+original models' strength. The proposed framework outperforms state-of-the-art
+algorithms in 6 out of 9 benchmark tests, with a substantial advancement in
+multimodal understanding.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ GS-Phong: Meta-Learned 3D Gaussians for Relightable Novel View Synthesis 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20791v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20791v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Yumeng He, Yunbo Wang, Xiaokang Yang
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Decoupling the illumination in 3D scenes is crucial for novel view synthesis
+and relighting. In this paper, we propose a novel method for representing a
+scene illuminated by a point light using a set of relightable 3D Gaussian
+points. Inspired by the Blinn-Phong model, our approach decomposes the scene
+into ambient, diffuse, and specular components, enabling the synthesis of
+realistic lighting effects. To facilitate the decomposition of geometric
+information independent of lighting conditions, we introduce a novel bilevel
+optimization-based meta-learning framework. The fundamental idea is to view the
+rendering tasks under various lighting positions as a multi-task learning
+problem, which our meta-learning approach effectively addresses by generalizing
+the learned Gaussian geometries not only across different viewpoints but also
+across diverse light positions. Experimental results demonstrate the
+effectiveness of our approach in terms of training efficiency and rendering
+quality compared to existing methods for free-viewpoint relighting.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ CoMoFusion: Fast and High-quality Fusion of Infrared and Visible Image
+  with Consistency Model 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20764v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20764v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Zhiming Meng, Hui Li, Zeyang Zhang, Zhongwei Shen, Yunlong Yu, Xiaoning Song, Xiaojun Wu
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Generative models are widely utilized to model the distribution of fused
+images in the field of infrared and visible image fusion. However, current
+generative models based fusion methods often suffer from unstable training and
+slow inference speed. To tackle this problem, a novel fusion method based on
+consistency model is proposed, termed as CoMoFusion, which can generate the
+high-quality images and achieve fast image inference speed. In specific, the
+consistency model is used to construct multi-modal joint features in the latent
+space with the forward and reverse process. Then, the infrared and visible
+features extracted by the trained consistency model are fed into fusion module
+to generate the final fused image. In order to enhance the texture and salient
+information of fused images, a novel loss based on pixel value selection is
+also designed. Extensive experiments on public datasets illustrate that our
+method obtains the SOTA fusion performance compared with the existing fusion
+methods.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Information Theoretic Text-to-Image Alignment 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20759v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20759v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Chao Wang, Giulio Franzese, Alessandro Finamore, Massimo Gallo, Pietro Michiardi
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Diffusion models for Text-to-Image (T2I) conditional generation have seen
+tremendous success recently. Despite their success, accurately capturing user
+intentions with these models still requires a laborious trial and error
+process. This challenge is commonly identified as a model alignment problem, an
+issue that has attracted considerable attention by the research community.
+Instead of relying on fine-grained linguistic analyses of prompts, human
+annotation, or auxiliary vision-language models to steer image generation, in
+this work we present a novel method that relies on an information-theoretic
+alignment measure. In a nutshell, our method uses self-supervised fine-tuning
+and relies on point-wise mutual information between prompts and images to
+define a synthetic training set to induce model alignment. Our comparative
+analysis shows that our method is on-par or superior to the state-of-the-art,
+yet requires nothing but a pre-trained denoising network to estimate MI and a
+lightweight fine-tuning strategy.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Diffusion Models Are Innate One-Step Generators 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20750v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20750v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Bowen Zheng, Tianming Yang
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Diffusion Models (DMs) have achieved great success in image generation and
+other fields. By fine sampling through the trajectory defined by the SDE/ODE
+solver based on a well-trained score model, DMs can generate remarkable
+high-quality results. However, this precise sampling often requires multiple
+steps and is computationally demanding. To address this problem, instance-based
+distillation methods have been proposed to distill a one-step generator from a
+DM by having a simpler student model mimic a more complex teacher model. Yet,
+our research reveals an inherent limitations in these methods: the teacher
+model, with more steps and more parameters, occupies different local minima
+compared to the student model, leading to suboptimal performance when the
+student model attempts to replicate the teacher. To avoid this problem, we
+introduce a novel distributional distillation method, which uses an exclusive
+distributional loss. This method exceeds state-of-the-art (SOTA) results while
+requiring significantly fewer training images. Additionally, we show that DMs'
+layers are activated differently at different time steps, leading to an
+inherent capability to generate images in a single step. Freezing most of the
+convolutional layers in a DM during distributional distillation leads to
+further performance improvements. Our method achieves the SOTA results on
+CIFAR-10 (FID 1.54), AFHQv2 64x64 (FID 1.23), FFHQ 64x64 (FID 0.85) and
+ImageNet 64x64 (FID 1.16) with great efficiency. Most of those results are
+obtained with only 5 million training images within 6 hours on 8 A100 GPUs.
+This breakthrough not only enhances the understanding of efficient image
+generation models but also offers a scalable framework for advancing the state
+of the art in various applications.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>9 pages, 4 figures and 4 tables on the main contents</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Trajectory Forecasting through Low-Rank Adaptation of Discrete Latent
+  Codes 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20743v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20743v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Riccardo Benaglia, Angelo Porrello, Pietro Buzzega, Simone Calderara, Rita Cucchiara
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Trajectory forecasting is crucial for video surveillance analytics, as it
+enables the anticipation of future movements for a set of agents, e.g.
+basketball players engaged in intricate interactions with long-term intentions.
+Deep generative models offer a natural learning approach for trajectory
+forecasting, yet they encounter difficulties in achieving an optimal balance
+between sampling fidelity and diversity. We address this challenge by
+leveraging Vector Quantized Variational Autoencoders (VQ-VAEs), which utilize a
+discrete latent space to tackle the issue of posterior collapse. Specifically,
+we introduce an instance-based codebook that allows tailored latent
+representations for each example. In a nutshell, the rows of the codebook are
+dynamically adjusted to reflect contextual information (i.e., past motion
+patterns extracted from the observed trajectories). In this way, the
+discretization process gains flexibility, leading to improved reconstructions.
+Notably, instance-level dynamics are injected into the codebook through
+low-rank updates, which restrict the customization of the codebook to a lower
+dimension space. The resulting discrete space serves as the basis of the
+subsequent step, which regards the training of a diffusion-based predictive
+model. We show that such a two-fold framework, augmented with instance-level
+discretization, leads to accurate and diverse forecasts, yielding
+state-of-the-art performance on three established benchmarks.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>15 pages, 3 figures, 5 tables</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Language Augmentation in CLIP for Improved Anatomy Detection on
+  Multi-modal Medical Images 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20735v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20735v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Mansi Kakkar, Dattesh Shanbhag, Chandan Aladahalli, Gurunath Reddy M
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Vision-language models have emerged as a powerful tool for previously
+challenging multi-modal classification problem in the medical domain. This
+development has led to the exploration of automated image description
+generation for multi-modal clinical scans, particularly for radiology report
+generation. Existing research has focused on clinical descriptions for specific
+modalities or body regions, leaving a gap for a model providing entire-body
+multi-modal descriptions. In this paper, we address this gap by automating the
+generation of standardized body station(s) and list of organ(s) across the
+whole body in multi-modal MR and CT radiological images. Leveraging the
+versatility of the Contrastive Language-Image Pre-training (CLIP), we refine
+and augment the existing approach through multiple experiments, including
+baseline model fine-tuning, adding station(s) as a superset for better
+correlation between organs, along with image and language augmentations. Our
+proposed approach demonstrates 47.6% performance improvement over baseline
+PubMedCLIP.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>$\copyright$ 2024 IEEE. Accepted in 46th Annual International
+  Conference of the IEEE Engineering in Medicine and Biology Society (EMBC)
+  2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Extreme Point Supervised Instance Segmentation <span class="chip">CVPR 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20729v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20729v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Hyeonjun Lee, Sehyun Hwang, Suha Kwak
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  This paper introduces a novel approach to learning instance segmentation
+using extreme points, i.e., the topmost, leftmost, bottommost, and rightmost
+points, of each object. These points are readily available in the modern
+bounding box annotation process while offering strong clues for precise
+segmentation, and thus allows to improve performance at the same annotation
+cost with box-supervised methods. Our work considers extreme points as a part
+of the true instance mask and propagates them to identify potential foreground
+and background points, which are all together used for training a pseudo label
+generator. Then pseudo labels given by the generator are in turn used for
+supervised learning of our final model. On three public benchmarks, our method
+significantly outperforms existing box-supervised methods, further narrowing
+the gap with its fully supervised counterpart. In particular, our model
+generates high-quality masks when a target object is separated into multiple
+parts, where previous box-supervised methods often fail.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>CVPR 2024 Accepted</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ GI-NAS: Boosting Gradient Inversion Attacks through Adaptive Neural
+  Architecture Search 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20725v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20725v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Wenbo Yu, Hao Fang, Bin Chen, Xiaohang Sui, Chuan Chen, Hao Wu, Shu-Tao Xia, Ke Xu
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Gradient Inversion Attacks invert the transmitted gradients in Federated
+Learning (FL) systems to reconstruct the sensitive data of local clients and
+have raised considerable privacy concerns. A majority of gradient inversion
+methods rely heavily on explicit prior knowledge (e.g., a well pre-trained
+generative model), which is often unavailable in realistic scenarios. To
+alleviate this issue, researchers have proposed to leverage the implicit prior
+knowledge of an over-parameterized network. However, they only utilize a fixed
+neural architecture for all the attack settings. This would hinder the adaptive
+use of implicit architectural priors and consequently limit the
+generalizability. In this paper, we further exploit such implicit prior
+knowledge by proposing Gradient Inversion via Neural Architecture Search
+(GI-NAS), which adaptively searches the network and captures the implicit
+priors behind neural architectures. Extensive experiments verify that our
+proposed GI-NAS can achieve superior attack performance compared to
+state-of-the-art gradient inversion methods, even under more practical settings
+with high-resolution images, large-sized batches, and advanced defense
+strategies.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ ContextGS: Compact 3D Gaussian Splatting with Anchor Level Context Model 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20721v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20721v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Yufei Wang, Zhihao Li, Lanqing Guo, Wenhan Yang, Alex C. Kot, Bihan Wen
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Recently, 3D Gaussian Splatting (3DGS) has become a promising framework for
+novel view synthesis, offering fast rendering speeds and high fidelity.
+However, the large number of Gaussians and their associated attributes require
+effective compression techniques. Existing methods primarily compress neural
+Gaussians individually and independently, i.e., coding all the neural Gaussians
+at the same time, with little design for their interactions and spatial
+dependence. Inspired by the effectiveness of the context model in image
+compression, we propose the first autoregressive model at the anchor level for
+3DGS compression in this work. We divide anchors into different levels and the
+anchors that are not coded yet can be predicted based on the already coded ones
+in all the coarser levels, leading to more accurate modeling and higher coding
+efficiency. To further improve the efficiency of entropy coding, e.g., to code
+the coarsest level with no already coded anchors, we propose to introduce a
+low-dimensional quantized feature as the hyperprior for each anchor, which can
+be effectively compressed. Our work pioneers the context model in the anchor
+level for 3DGS representation, yielding an impressive size reduction of over
+100 times compared to vanilla 3DGS and 15 times compared to the most recent
+state-of-the-art work Scaffold-GS, while achieving comparable or even higher
+rendering quality.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Power of Cooperative Supervision: Multiple Teachers Framework for
+  Enhanced 3D Semi-Supervised Object Detection 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20720v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20720v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Jin-Hee Lee, Jae-Keun Lee, Je-Seok Kim, Soon Kwon
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  To ensure safe urban driving for autonomous platforms, it is crucial not only
+to develop high-performance object detection techniques but also to establish a
+diverse and representative dataset that captures various urban environments and
+object characteristics. To address these two issues, we have constructed a
+multi-class 3D LiDAR dataset reflecting diverse urban environments and object
+characteristics, and developed a robust 3D semi-supervised object detection
+(SSOD) based on a multiple teachers framework. This SSOD framework categorizes
+similar classes and assigns specialized teachers to each category. Through
+collaborative supervision among these category-specialized teachers, the
+student network becomes increasingly proficient, leading to a highly effective
+object detector. We propose a simple yet effective augmentation technique,
+Pie-based Point Compensating Augmentation (PieAug), to enable the teacher
+network to generate high-quality pseudo-labels. Extensive experiments on the
+WOD, KITTI, and our datasets validate the effectiveness of our proposed method
+and the quality of our dataset. Experimental results demonstrate that our
+approach consistently outperforms existing state-of-the-art 3D semi-supervised
+object detection methods across all datasets. We plan to release our
+multi-class LiDAR dataset and the source code available on our Github
+repository in the near future.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>under review</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Climate Variable Downscaling with Conditional Normalizing Flows 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20719v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20719v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Christina Winkler, Paula Harder, David Rolnick
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Predictions of global climate models typically operate on coarse spatial
+scales due to the large computational costs of climate simulations. This has
+led to a considerable interest in methods for statistical downscaling, a
+similar process to super-resolution in the computer vision context, to provide
+more local and regional climate information. In this work, we apply conditional
+normalizing flows to the task of climate variable downscaling. We showcase its
+successful performance on an ERA5 water content dataset for different
+upsampling factors. Additionally, we show that the method allows us to assess
+the predictive uncertainty in terms of standard deviation from the fitted
+conditional distribution mean.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Cyclic image generation using chaotic dynamics 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20717v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20717v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Takaya Tanaka, Yutaka Yamaguti
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Successive image generation using cyclic transformations is demonstrated by
+extending the CycleGAN model to transform images among three different
+categories. Repeated application of the trained generators produces sequences
+of images that transition among the different categories. The generated image
+sequences occupy a more limited region of the image space compared with the
+original training dataset. Quantitative evaluation using precision and recall
+metrics indicates that the generated images have high quality but reduced
+diversity relative to the training dataset. Such successive generation
+processes are characterized as chaotic dynamics in terms of dynamical system
+theory. Positive Lyapunov exponents estimated from the generated trajectories
+confirm the presence of chaotic dynamics, with the Lyapunov dimension of the
+attractor found to be comparable to the intrinsic dimension of the training
+data manifold. The results suggest that chaotic dynamics in the image space
+defined by the deep generative model contribute to the diversity of the
+generated images, constituting a novel approach for multi-class image
+generation. This model can be interpreted as an extension of classical
+associative memory to perform hetero-association among image categories.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Revisiting Mutual Information Maximization for Generalized Category
+  Discovery 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20711v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20711v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Zhaorui Tan, Chengrui Zhang, Xi Yang, Jie Sun, Kaizhu Huang
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Generalized category discovery presents a challenge in a realistic scenario,
+which requires the model's generalization ability to recognize unlabeled
+samples from known and unknown categories. This paper revisits the challenge of
+generalized category discovery through the lens of information maximization
+(InfoMax) with a probabilistic parametric classifier. Our findings reveal that
+ensuring independence between known and unknown classes while concurrently
+assuming a uniform probability distribution across all classes, yields an
+enlarged margin among known and unknown classes that promotes the model's
+performance. To achieve the aforementioned independence, we propose a novel
+InfoMax-based method, Regularized Parametric InfoMax (RPIM), which adopts
+pseudo labels to supervise unlabeled samples during InfoMax, while proposing a
+regularization to ensure the quality of the pseudo labels. Additionally, we
+introduce novel semantic-bias transformation to refine the features from the
+pre-trained model instead of direct fine-tuning to rescue the computational
+costs. Extensive experiments on six benchmark datasets validate the
+effectiveness of our method. RPIM significantly improves the performance
+regarding unknown classes, surpassing the state-of-the-art method by an average
+margin of 3.5%.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Preprint version</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ R$^2$-Gaussian: Rectifying Radiative Gaussian Splatting for Tomographic
+  Reconstruction 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20693v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20693v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Ruyi Zha, Tao Jun Lin, Yuanhao Cai, Jiwen Cao, Yanhao Zhang, Hongdong Li
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  3D Gaussian splatting (3DGS) has shown promising results in image rendering
+and surface reconstruction. However, its potential in volumetric reconstruction
+tasks, such as X-ray computed tomography, remains under-explored. This paper
+introduces R2-Gaussian, the first 3DGS-based framework for sparse-view
+tomographic reconstruction. By carefully deriving X-ray rasterization
+functions, we discover a previously unknown integration bias in the standard
+3DGS formulation, which hampers accurate volume retrieval. To address this
+issue, we propose a novel rectification technique via refactoring the
+projection from 3D to 2D Gaussians. Our new method presents three key
+innovations: (1) introducing tailored Gaussian kernels, (2) extending
+rasterization to X-ray imaging, and (3) developing a CUDA-based differentiable
+voxelizer. Extensive experiments demonstrate that our method outperforms
+state-of-the-art approaches by 0.93 dB in PSNR and 0.014 in SSIM. Crucially, it
+delivers high-quality results in 3 minutes, which is 12x faster than NeRF-based
+methods and on par with traditional algorithms. The superior performance and
+rapid convergence of our method highlight its practical value.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Conditioning GAN Without Training <span class="highlight-title">Dataset</span> 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20687v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20687v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Kidist Amde Mekonnen
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Deep learning algorithms have a large number of trainable parameters often
+with sizes of hundreds of thousands or more. Training this algorithm requires a
+large amount of training data and generating a sufficiently large dataset for
+these algorithms is costly\cite{noguchi2019image}.
+  GANs are generative neural networks that use two deep learning networks that
+are competing with each other. The networks are generator and discriminator
+networks. The generator tries to generate realistic images which resemble the
+actual training dataset by approximating the training data distribution and the
+discriminator is trained to classify images as real or
+fake(generated)\cite{goodfellow2016nips}. Training these GAN algorithms also
+requires a large amount of training dataset\cite{noguchi2019image}.
+  In this study, the aim is to address the question, "Given an unconditioned
+pretrained generator network and a pretrained classifier, is it feasible to
+develop a conditioned generator without relying on any training dataset?"
+  The paper begins with a general introduction to the problem. The subsequent
+sections are structured as follows: Section 2 provides background information
+on the problem. Section 3 reviews relevant literature on the topic. Section 4
+outlines the methodology employed in this study. Section 5 presents the
+experimental results. Section 6 discusses the findings and proposes potential
+future research directions. Finally, Section 7 offers concluding remarks.
+  The implementation can be accessed
+\href{https://github.com/kidist-amde/BigGAN-PyTorch}{here}.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>5 pages, 2 figures, Part of my MSc project course, School Project
+  Course 2022</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Enhancing Counterfactual Image Generation Using Mahalanobis Distance
+  with Distribution Preferences in Feature Space 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20685v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20685v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Yukai Zhang, Ao Xu, Zihao Li, Tieru Wu
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  In the realm of Artificial Intelligence (AI), the importance of Explainable
+Artificial Intelligence (XAI) is increasingly recognized, particularly as AI
+models become more integral to our lives. One notable single-instance XAI
+approach is counterfactual explanation, which aids users in comprehending a
+model's decisions and offers guidance on altering these decisions. Specifically
+in the context of image classification models, effective image counterfactual
+explanations can significantly enhance user understanding. This paper
+introduces a novel method for computing feature importance within the feature
+space of a black-box model. By employing information fusion techniques, our
+method maximizes the use of data to address feature counterfactual explanations
+in the feature space. Subsequently, we utilize an image generation model to
+transform these feature counterfactual explanations into image counterfactual
+explanations. Our experiments demonstrate that the counterfactual explanations
+generated by our method closely resemble the original images in both pixel and
+feature spaces. Additionally, our method outperforms established baselines,
+achieving impressive experimental results.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Adv-KD: Adversarial Knowledge Distillation for Faster Diffusion Sampling 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20675v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20675v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Kidist Amde Mekonnen, Nicola Dall'Asen, Paolo Rota
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Diffusion Probabilistic Models (DPMs) have emerged as a powerful class of
+deep generative models, achieving remarkable performance in image synthesis
+tasks. However, these models face challenges in terms of widespread adoption
+due to their reliance on sequential denoising steps during sample generation.
+This dependence leads to substantial computational requirements, making them
+unsuitable for resource-constrained or real-time processing systems. To address
+these challenges, we propose a novel method that integrates denoising phases
+directly into the model's architecture, thereby reducing the need for
+resource-intensive computations. Our approach combines diffusion models with
+generative adversarial networks (GANs) through knowledge distillation, enabling
+more efficient training and evaluation. By utilizing a pre-trained diffusion
+model as a teacher model, we train a student model through adversarial
+learning, employing layerwise transformations for denoising and submodules for
+predicting the teacher model's output at various points in time. This
+integration significantly reduces the number of parameters and denoising steps
+required, leading to improved sampling speed at test time. We validate our
+method with extensive experiments, demonstrating comparable performance with
+reduced computational requirements compared to existing approaches. By enabling
+the deployment of diffusion models on resource-constrained devices, our
+research mitigates their computational burden and paves the way for wider
+accessibility and practical use across the research community and end-users.
+  Our code is publicly available at https://github.com/kidist-amde/Adv-KD
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>7 pages, 11 figures, ELLIS Doctoral Symposium 2023 in Helsinki,
+  Finland</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ 4Diffusion: Multi-view Video Diffusion Model for 4D Generation 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20674v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20674v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Haiyu Zhang, Xinyuan Chen, Yaohui Wang, Xihui Liu, Yunhong Wang, Yu Qiao
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Current 4D generation methods have achieved noteworthy efficacy with the aid
+of advanced diffusion generative models. However, these methods lack multi-view
+spatial-temporal modeling and encounter challenges in integrating diverse prior
+knowledge from multiple diffusion models, resulting in inconsistent temporal
+appearance and flickers. In this paper, we propose a novel 4D generation
+pipeline, namely 4Diffusion aimed at generating spatial-temporally consistent
+4D content from a monocular video. We first design a unified diffusion model
+tailored for multi-view video generation by incorporating a learnable motion
+module into a frozen 3D-aware diffusion model to capture multi-view
+spatial-temporal correlations. After training on a curated dataset, our
+diffusion model acquires reasonable temporal consistency and inherently
+preserves the generalizability and spatial consistency of the 3D-aware
+diffusion model. Subsequently, we propose 4D-aware Score Distillation Sampling
+loss, which is based on our multi-view video diffusion model, to optimize 4D
+representation parameterized by dynamic NeRF. This aims to eliminate
+discrepancies arising from multiple diffusion models, allowing for generating
+spatial-temporally consistent 4D content. Moreover, we devise an anchor loss to
+enhance the appearance details and facilitate the learning of dynamic NeRF.
+Extensive qualitative and quantitative experiments demonstrate that our method
+achieves superior performance compared to previous methods.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Project Page: https://aejion.github.io/4diffusion/</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Investigating and unmasking feature-level vulnerabilities of CNNs to
+  adversarial perturbations 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20672v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20672v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Davide Coppola, Hwee Kuan Lee
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  This study explores the impact of adversarial perturbations on Convolutional
+Neural Networks (CNNs) with the aim of enhancing the understanding of their
+underlying mechanisms. Despite numerous defense methods proposed in the
+literature, there is still an incomplete understanding of this phenomenon.
+Instead of treating the entire model as vulnerable, we propose that specific
+feature maps learned during training contribute to the overall vulnerability.
+To investigate how the hidden representations learned by a CNN affect its
+vulnerability, we introduce the Adversarial Intervention framework. Experiments
+were conducted on models trained on three well-known computer vision datasets,
+subjecting them to attacks of different nature. Our focus centers on the
+effects that adversarial perturbations to a model's initial layer have on the
+overall behavior of the model. Empirical results revealed compelling insights:
+a) perturbing selected channel combinations in shallow layers causes
+significant disruptions; b) the channel combinations most responsible for the
+disruptions are common among different types of attacks; c) despite shared
+vulnerable combinations of channels, different attacks affect hidden
+representations with varying magnitudes; d) there exists a positive correlation
+between a kernel's magnitude and its vulnerability. In conclusion, this work
+introduces a novel framework to study the vulnerability of a CNN model to
+adversarial perturbations, revealing insights that contribute to a deeper
+understanding of the phenomenon. The identified properties pave the way for the
+development of efficient ad-hoc defense mechanisms in future applications.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>22 pages, 15 figures (including appendix)</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Fourier123: One Image to High-Quality 3D Object Generation with Hybrid
+  Fourier Score Distillation 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20669v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20669v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Shuzhou Yang, Yu Wang, Haijie Li, Jiarui Meng, Xiandong Meng, Jian Zhang
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Single image-to-3D generation is pivotal for crafting controllable 3D assets.
+Given its underconstrained nature, we leverage geometric priors from a 3D novel
+view generation diffusion model and appearance priors from a 2D image
+generation method to guide the optimization process. We note that a disparity
+exists between the training datasets of 2D and 3D diffusion models, leading to
+their outputs showing marked differences in appearance. Specifically, 2D models
+tend to deliver more detailed visuals, whereas 3D models produce consistent yet
+over-smooth results across different views. Hence, we optimize a set of 3D
+Gaussians using 3D priors in spatial domain to ensure geometric consistency,
+while exploiting 2D priors in the frequency domain through Fourier transform
+for higher visual quality. This 2D-3D hybrid Fourier Score Distillation
+objective function (dubbed hy-FSD), can be integrated into existing 3D
+generation methods, yielding significant performance improvements. With this
+technique, we further develop an image-to-3D generation pipeline to create
+high-quality 3D objects within one minute, named Fourier123. Extensive
+experiments demonstrate that Fourier123 excels in efficient generation with
+rapid convergence speed and visual-friendly generation results.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ MASA: Motion-aware Masked Autoencoder with Semantic Alignment for Sign
+  Language Recognition 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20666v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20666v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Weichao Zhao, Hezhen Hu, Wengang Zhou, Yunyao Mao, Min Wang, Houqiang Li
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Sign language recognition (SLR) has long been plagued by insufficient model
+representation capabilities. Although current pre-training approaches have
+alleviated this dilemma to some extent and yielded promising performance by
+employing various pretext tasks on sign pose data, these methods still suffer
+from two primary limitations: 1) Explicit motion information is usually
+disregarded in previous pretext tasks, leading to partial information loss and
+limited representation capability. 2) Previous methods focus on the local
+context of a sign pose sequence, without incorporating the guidance of the
+global meaning of lexical signs. To this end, we propose a Motion-Aware masked
+autoencoder with Semantic Alignment (MASA) that integrates rich motion cues and
+global semantic information in a self-supervised learning paradigm for SLR. Our
+framework contains two crucial components, i.e., a motion-aware masked
+autoencoder (MA) and a momentum semantic alignment module (SA). Specifically,
+in MA, we introduce an autoencoder architecture with a motion-aware masked
+strategy to reconstruct motion residuals of masked frames, thereby explicitly
+exploring dynamic motion cues among sign pose sequences. Moreover, in SA, we
+embed our framework with global semantic awareness by aligning the embeddings
+of different augmented samples from the input sequence in the shared latent
+space. In this way, our framework can simultaneously learn local motion cues
+and global semantic features for comprehensive sign language representation.
+Furthermore, we conduct extensive experiments to validate the effectiveness of
+our method, achieving new state-of-the-art performance on four public
+benchmarks.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted by TCSVT 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ GenMix: Combining Generative and Mixture Data Augmentation for Medical
+  Image Classification 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20650v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20650v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Hansang Lee, Haeil Lee, Helen Hong
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  In this paper, we propose a novel data augmentation technique called GenMix,
+which combines generative and mixture approaches to leverage the strengths of
+both methods. While generative models excel at creating new data patterns, they
+face challenges such as mode collapse in GANs and difficulties in training
+diffusion models, especially with limited medical imaging data. On the other
+hand, mixture models enhance class boundary regions but tend to favor the major
+class in scenarios with class imbalance. To address these limitations, GenMix
+integrates both approaches to complement each other. GenMix operates in two
+stages: (1) training a generative model to produce synthetic images, and (2)
+performing mixup between synthetic and real data. This process improves the
+quality and diversity of synthetic data while simultaneously benefiting from
+the new pattern learning of generative models and the boundary enhancement of
+mixture models. We validate the effectiveness of our method on the task of
+classifying focal liver lesions (FLLs) in CT images. Our results demonstrate
+that GenMix enhances the performance of various generative models, including
+DCGAN, StyleGAN, Textual Inversion, and Diffusion Models. Notably, the proposed
+method with Textual Inversion outperforms other methods without fine-tuning
+diffusion model on the FLL dataset.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Shotluck Holmes: A Family of Efficient Small-Scale Large Language Vision
+  Models For Video Captioning and Summarization 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20648v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20648v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Richard Luo, Austin Peng, Adithya Vasudev, Rishabh Jain
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Video is an increasingly prominent and information-dense medium, yet it poses
+substantial challenges for language models. A typical video consists of a
+sequence of shorter segments, or shots, that collectively form a coherent
+narrative. Each shot is analogous to a word in a sentence where multiple data
+streams of information (such as visual and auditory data) must be processed
+simultaneously. Comprehension of the entire video requires not only
+understanding the visual-audio information of each shot but also requires that
+the model links the ideas between each shot to generate a larger,
+all-encompassing story. Despite significant progress in the field, current
+works often overlook videos' more granular shot-by-shot semantic information.
+In this project, we propose a family of efficient large language vision models
+(LLVMs) to boost video summarization and captioning called Shotluck Holmes. By
+leveraging better pretraining and data collection strategies, we extend the
+abilities of existing small LLVMs from being able to understand a picture to
+being able to understand a sequence of frames. Specifically, we show that
+Shotluck Holmes achieves better performance than state-of-the-art results on
+the Shot2Story video captioning and summary task with significantly smaller and
+more computationally efficient models.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Learning Gaze-aware Compositional GAN 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20643v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20643v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Nerea Aranjuelo, Siyu Huang, Ignacio Arganda-Carreras, Luis Unzueta, Oihana Otaegui, Hanspeter Pfister, Donglai Wei
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Gaze-annotated facial data is crucial for training deep neural networks
+(DNNs) for gaze estimation. However, obtaining these data is labor-intensive
+and requires specialized equipment due to the challenge of accurately
+annotating the gaze direction of a subject. In this work, we present a
+generative framework to create annotated gaze data by leveraging the benefits
+of labeled and unlabeled data sources. We propose a Gaze-aware Compositional
+GAN that learns to generate annotated facial images from a limited labeled
+dataset. Then we transfer this model to an unlabeled data domain to take
+advantage of the diversity it provides. Experiments demonstrate our approach's
+effectiveness in generating within-domain image augmentations in the ETH-XGaze
+dataset and cross-domain augmentations in the CelebAMask-HQ dataset domain for
+gaze estimation DNN training. We also show additional applications of our work,
+which include facial image editing and gaze redirection.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted by ETRA 2024 as Full paper, and as journal paper in
+  Proceedings of the ACM on Computer Graphics and Interactive Techniques</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Action-OOD: An End-to-End Skeleton-Based Model for Robust
+  Out-of-Distribution Human Action Detection 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20633v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20633v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Jing Xu, Anqi Zhu, Jingyu Lin, Qiuhong Ke, Cunjian Chen
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Human action recognition is a crucial task in computer vision systems.
+However, in real-world scenarios, human actions often fall outside the
+distribution of training data, requiring a model to both recognize
+in-distribution (ID) actions and reject out-of-distribution (OOD) ones. Despite
+its importance, there has been limited research on OOD detection in human
+actions. Existing works on OOD detection mainly focus on image data with RGB
+structure, and many methods are post-hoc in nature. While these methods are
+convenient and computationally efficient, they often lack sufficient accuracy
+and fail to consider the presence of OOD samples. To address these challenges,
+we propose a novel end-to-end skeleton-based model called Action-OOD,
+specifically designed for OOD human action detection. Unlike some existing
+approaches that may require prior knowledge of existing OOD data distribution,
+our model solely utilizes in-distribution (ID) data during the training stage,
+effectively mitigating the overconfidence issue prevalent in OOD detection. We
+introduce an attention-based feature fusion block, which enhances the model's
+capability to recognize unknown classes while preserving classification
+accuracy for known classes. Further, we present a novel energy-based loss
+function and successfully integrate it with the traditional cross-entropy loss
+to maximize the separation of data distributions between ID and OOD. Through
+extensive experiments conducted on NTU-RGB+D 60, NTU-RGB+D 120, and
+Kinetics-400 datasets, we demonstrate the superior performance of our proposed
+approach compared to state-of-the-art methods. Our findings underscore the
+effectiveness of classic OOD detection techniques in the context of
+skeleton-based action recognition tasks, offering promising avenues for future
+research in this field. Code will be available at:
+https://github.com/YilliaJing/Action-OOD.git.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Under consideration at Computer Vision and Image Understanding</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ ToxVidLLM: A Multimodal LLM-based Framework for Toxicity Detection in
+  Code-Mixed Videos <span class="chip">ACL</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20628v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20628v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Krishanu Maity, A. S. Poornash, Sriparna Saha, Pushpak Bhattacharyya
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  In an era of rapidly evolving internet technology, the surge in multimodal
+content, including videos, has expanded the horizons of online communication.
+However, the detection of toxic content in this diverse landscape, particularly
+in low-resource code-mixed languages, remains a critical challenge. While
+substantial research has addressed toxic content detection in textual data, the
+realm of video content, especially in non-English languages, has been
+relatively underexplored. This paper addresses this research gap by introducing
+a benchmark dataset, the first of its kind, consisting of 931 videos with 4021
+code-mixed Hindi-English utterances collected from YouTube. Each utterance
+within this dataset has been meticulously annotated for toxicity, severity, and
+sentiment labels. We have developed an advanced Multimodal Multitask framework
+built for Toxicity detection in Video Content by leveraging Large Language
+Models (LLMs), crafted for the primary objective along with the additional
+tasks of conducting sentiment and severity analysis. ToxVidLLM incorporates
+three key modules the Encoder module, Cross-Modal Synchronization module, and
+Multitask module crafting a generic multimodal LLM customized for intricate
+video classification tasks. Our experiments reveal that incorporating multiple
+modalities from the videos substantially enhances the performance of toxic
+content detection by achieving an Accuracy and Weighted F1 score of 94.29% and
+94.35%, respectively.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>ACL Findings 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ EPIDetect: Video-based convulsive seizure detection in chronic epilepsy
+  mouse model for anti-epilepsy drug screening 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20614v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20614v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Junming Ren, Zhoujian Xiao, Yujia Zhang, Yujie Yang, Ling He, Ezra Yoon, Stephen Temitayo Bello, Xi Chen, Dapeng Wu, Micky Tortorella, Jufang He
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  In the preclinical translational studies, drug candidates with remarkable
+anti-epileptic efficacy demonstrate long-term suppression of spontaneous
+recurrent seizures (SRSs), particularly convulsive seizures (CSs), in mouse
+models of chronic epilepsy. However, the current methods for monitoring CSs
+have limitations in terms of invasiveness, specific laboratory settings, high
+cost, and complex operation, which hinder drug screening efforts. In this
+study, a camera-based system for automated detection of CSs in chronically
+epileptic mice is first established to screen potential anti-epilepsy drugs.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Revisiting and Maximizing Temporal Knowledge in Semi-supervised Semantic
+  Segmentation 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20610v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20610v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Wooseok Shin, Hyun Joon Park, Jin Sob Kim, Sung Won Han
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  In semi-supervised semantic segmentation, the Mean Teacher- and
+co-training-based approaches are employed to mitigate confirmation bias and
+coupling problems. However, despite their high performance, these approaches
+frequently involve complex training pipelines and a substantial computational
+burden, limiting the scalability and compatibility of these methods. In this
+paper, we propose a PrevMatch framework that effectively mitigates the
+aforementioned limitations by maximizing the utilization of the temporal
+knowledge obtained during the training process. The PrevMatch framework relies
+on two core strategies: (1) we reconsider the use of temporal knowledge and
+thus directly utilize previous models obtained during training to generate
+additional pseudo-label guidance, referred to as previous guidance. (2) we
+design a highly randomized ensemble strategy to maximize the effectiveness of
+the previous guidance. Experimental results on four benchmark semantic
+segmentation datasets confirm that the proposed method consistently outperforms
+existing methods across various evaluation protocols. In particular, with
+DeepLabV3+ and ResNet-101 network settings, PrevMatch outperforms the existing
+state-of-the-art method, Diverse Co-training, by +1.6 mIoU on Pascal VOC with
+only 92 annotated images, while achieving 2.4 times faster training.
+Furthermore, the results indicate that PrevMatch induces stable optimization,
+particularly in benefiting classes that exhibit poor performance. Code is
+available at https://github.com/wooseok-shin/PrevMatch
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>14 pages, 5 figures, submitted to IEEE TPAMI. This work has been
+  submitted to the IEEE for possible publication. Copyright may be transferred
+  without notice, after which this version may no longer be accessible</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Textual Inversion and <span class="highlight-title">Self-supervised</span> Refinement for Radiology Report
+  Generation 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20607v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20607v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Yuanjiang Luo, Hongxiang Li, Xuan Wu, Meng Cao, Xiaoshuang Huang, Zhihong Zhu, Peixi Liao, Hu Chen, Yi Zhang
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Existing mainstream approaches follow the encoder-decoder paradigm for
+generating radiology reports. They focus on improving the network structure of
+encoders and decoders, which leads to two shortcomings: overlooking the
+modality gap and ignoring report content constraints. In this paper, we
+proposed Textual Inversion and Self-supervised Refinement (TISR) to address the
+above two issues. Specifically, textual inversion can project text and image
+into the same space by representing images as pseudo words to eliminate the
+cross-modeling gap. Subsequently, self-supervised refinement refines these
+pseudo words through contrastive loss computation between images and texts,
+enhancing the fidelity of generated reports to images. Notably, TISR is
+orthogonal to most existing methods, plug-and-play. We conduct experiments on
+two widely-used public datasets and achieve significant improvements on various
+baselines, which demonstrates the effectiveness and generalization of TISR. The
+code will be available soon.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Vision-Language Meets the Skeleton: Progressively Distillation with
+  Cross-Modal Knowledge for 3D Action Representation Learning 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20606v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20606v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Yang Chen, Tian He, Junfeng Fu, Ling Wang, Jingcai Guo, Hong Cheng
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Supervised and self-supervised learning are two main training paradigms for
+skeleton-based human action recognition. However, the former one-hot
+classification requires labor-intensive predefined action categories
+annotations, while the latter involves skeleton transformations (e.g.,
+cropping) in the pretext tasks that may impair the skeleton structure. To
+address these challenges, we introduce a novel skeleton-based training
+framework (C$^2$VL) based on Cross-modal Contrastive learning that uses the
+progressive distillation to learn task-agnostic human skeleton action
+representation from the Vision-Language knowledge prompts. Specifically, we
+establish the vision-language action concept space through vision-language
+knowledge prompts generated by pre-trained large multimodal models (LMMs),
+which enrich the fine-grained details that the skeleton action space lacks.
+Moreover, we propose the intra-modal self-similarity and inter-modal
+cross-consistency softened targets in the cross-modal contrastive process to
+progressively control and guide the degree of pulling vision-language knowledge
+prompts and corresponding skeletons closer. These soft instance discrimination
+and self-knowledge distillation strategies contribute to the learning of better
+skeleton-based action representations from the noisy skeleton-vision-language
+pairs. During the inference phase, our method requires only the skeleton data
+as the input for action recognition and no longer for vision-language prompts.
+Extensive experiments show that our method achieves state-of-the-art results on
+NTU RGB+D 60, NTU RGB+D 120, and PKU-MMD datasets. The code will be available
+in the future.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Searching for internal symbols underlying deep learning 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20605v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20605v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Jung H. Lee, Sujith Vijayan
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Deep learning (DL) enables deep neural networks (DNNs) to automatically learn
+complex tasks or rules from given examples without instructions or guiding
+principles. As we do not engineer DNNs' functions, it is extremely difficult to
+diagnose their decisions, and multiple lines of studies proposed to explain
+principles of DNNs/DL operations. Notably, one line of studies suggests that
+DNNs may learn concepts, the high level features recognizable to humans. Thus,
+we hypothesized that DNNs develop abstract codes, not necessarily recognizable
+to humans, which can be used to augment DNNs' decision-making. To address this
+hypothesis, we combined foundation segmentation models and unsupervised
+learning to extract internal codes and identify potential use of abstract codes
+to make DL's decision-making more reliable and safer.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>10 pages, 7 figures, 3 tables and Appendix</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Generalized Semi-Supervised Learning via <span class="highlight-title">Self-Supervised</span> Feature
+  Adaptation <span class="chip">NeurIPS 2023</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20596v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20596v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Jiachen Liang, Ruibing Hou, Hong Chang, Bingpeng Ma, Shiguang Shan, Xilin Chen
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Traditional semi-supervised learning (SSL) assumes that the feature
+distributions of labeled and unlabeled data are consistent which rarely holds
+in realistic scenarios. In this paper, we propose a novel SSL setting, where
+unlabeled samples are drawn from a mixed distribution that deviates from the
+feature distribution of labeled samples. Under this setting, previous SSL
+methods tend to predict wrong pseudo-labels with the model fitted on labeled
+data, resulting in noise accumulation. To tackle this issue, we propose
+Self-Supervised Feature Adaptation (SSFA), a generic framework for improving
+SSL performance when labeled and unlabeled data come from different
+distributions. SSFA decouples the prediction of pseudo-labels from the current
+model to improve the quality of pseudo-labels. Particularly, SSFA incorporates
+a self-supervised task into the SSL framework and uses it to adapt the feature
+extractor of the model to the unlabeled data. In this way, the extracted
+features better fit the distribution of unlabeled data, thereby generating
+high-quality pseudo-labels. Extensive experiments show that our proposed SSFA
+is applicable to various pseudo-label-based SSL learners and significantly
+improves performance in labeled, unlabeled, and even unseen distributions.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>10 pages; Accepted by NeurIPS 2023</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Disrupting Diffusion: Token-Level Attention Erasure Attack against
+  Diffusion-based Customization 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20584v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20584v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Yisu Liu, Jinyang An, Wanqian Zhang, Dayan Wu, Jingzi Gu, Zheng Lin, Weiping Wang
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  With the development of diffusion-based customization methods like
+DreamBooth, individuals now have access to train the models that can generate
+their personalized images. Despite the convenience, malicious users have
+misused these techniques to create fake images, thereby triggering a privacy
+security crisis. In light of this, proactive adversarial attacks are proposed
+to protect users against customization. The adversarial examples are trained to
+distort the customization model's outputs and thus block the misuse. In this
+paper, we propose DisDiff (Disrupting Diffusion), a novel adversarial attack
+method to disrupt the diffusion model outputs. We first delve into the
+intrinsic image-text relationships, well-known as cross-attention, and
+empirically find that the subject-identifier token plays an important role in
+guiding image generation. Thus, we propose the Cross-Attention Erasure module
+to explicitly "erase" the indicated attention maps and disrupt the text
+guidance. Besides,we analyze the influence of the sampling process of the
+diffusion model on Projected Gradient Descent (PGD) attack and introduce a
+novel Merit Sampling Scheduler to adaptively modulate the perturbation updating
+amplitude in a step-aware manner. Our DisDiff outperforms the state-of-the-art
+methods by 12.75% of FDFR scores and 7.25% of ISM scores across two facial
+benchmarks and two commonly used prompts on average.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Under review</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Universal evaluation and design of imaging systems using information
+  estimation 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20559v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20559v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Henry Pinkard, Leyla Kabuli, Eric Markley, Tiffany Chien, Jiantao Jiao, Laura Waller
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Information theory, which describes the transmission of signals in the
+presence of noise, has enabled the development of reliable communication
+systems that underlie the modern world. Imaging systems can also be viewed as a
+form of communication, in which information about the object is "transmitted"
+through images. However, the application of information theory to imaging
+systems has been limited by the challenges of accounting for their physical
+constraints. Here, we introduce a framework that addresses these limitations by
+modeling the probabilistic relationship between objects and their measurements.
+Using this framework, we develop a method to estimate information using only a
+dataset of noisy measurements, without making any assumptions about the image
+formation process. We demonstrate that these estimates comprehensively quantify
+measurement quality across a diverse range of imaging systems and applications.
+Furthermore, we introduce Information-Driven Encoder Analysis Learning (IDEAL),
+a technique to optimize the design of imaging hardware for maximum information
+capture. This work provides new insights into the fundamental performance
+limits of imaging systems and offers powerful new tools for their analysis and
+design.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ II-MMR: Identifying and Improving Multi-modal Multi-hop Reasoning in
+  Visual Question Answering <span class="chip">ACL 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2402.11058v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2402.11058v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Jihyung Kil, Farideh Tavazoee, Dongyeop Kang, Joo-Kyung Kim
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Visual Question Answering (VQA) often involves diverse reasoning scenarios
+across Vision and Language (V&L). Most prior VQA studies, however, have merely
+focused on assessing the model's overall accuracy without evaluating it on
+different reasoning cases. Furthermore, some recent works observe that
+conventional Chain-of-Thought (CoT) prompting fails to generate effective
+reasoning for VQA, especially for complex scenarios requiring multi-hop
+reasoning. In this paper, we propose II-MMR, a novel idea to identify and
+improve multi-modal multi-hop reasoning in VQA. In specific, II-MMR takes a VQA
+question with an image and finds a reasoning path to reach its answer using two
+novel language promptings: (i) answer prediction-guided CoT prompt, or (ii)
+knowledge triplet-guided prompt. II-MMR then analyzes this path to identify
+different reasoning cases in current VQA benchmarks by estimating how many hops
+and what types (i.e., visual or beyond-visual) of reasoning are required to
+answer the question. On popular benchmarks including GQA and A-OKVQA, II-MMR
+observes that most of their VQA questions are easy to answer, simply demanding
+"single-hop" reasoning, whereas only a few questions require "multi-hop"
+reasoning. Moreover, while the recent V&L model struggles with such complex
+multi-hop reasoning questions even using the traditional CoT method, II-MMR
+shows its effectiveness across all reasoning cases in both zero-shot and
+fine-tuning settings.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted to ACL 2024 Findings</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ <span class="highlight-title">★</span> Hierarchical World Models as Visual Whole-Body Humanoid Controllers 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.18418v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.18418v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Nicklas Hansen, Jyothir S V, Vlad Sobal, <span class="highlight-author">Yann LeCun</span>, Xiaolong Wang, Hao Su
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Whole-body control for humanoids is challenging due to the high-dimensional
+nature of the problem, coupled with the inherent instability of a bipedal
+morphology. Learning from visual observations further exacerbates this
+difficulty. In this work, we explore highly data-driven approaches to visual
+whole-body humanoid control based on reinforcement learning, without any
+simplifying assumptions, reward design, or skill primitives. Specifically, we
+propose a hierarchical world model in which a high-level agent generates
+commands based on visual observations for a low-level agent to execute, both of
+which are trained with rewards. Our approach produces highly performant control
+policies in 8 tasks with a simulated 56-DoF humanoid, while synthesizing
+motions that are broadly preferred by humans. Code and videos:
+https://nicklashansen.com/rlpuppeteer
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Code and videos at https://nicklashansen.com/rlpuppeteer</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ GAD-Generative Learning for HD Map-Free Autonomous Driving 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.00515v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.00515v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Weijian Sun, Yanbo Jia, Qi Zeng, Zihao Liu, Jiang Liao, Yue Li, Xianfeng Li
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Deep-learning-based techniques have been widely adopted for autonomous
+driving software stacks for mass production in recent years, focusing primarily
+on perception modules, with some work extending this method to prediction
+modules. However, the downstream planning and control modules are still
+designed with hefty handcrafted rules, dominated by optimization-based methods
+such as quadratic programming or model predictive control. This results in a
+performance bottleneck for autonomous driving systems in that corner cases
+simply cannot be solved by enumerating hand-crafted rules. We present a
+deep-learning-based approach that brings prediction, decision, and planning
+modules together with the attempt to overcome the rule-based methods'
+deficiency in real-world applications of autonomous driving, especially for
+urban scenes. The DNN model we proposed is solely trained with 10 hours of
+human driver data, and it supports all mass-production ADAS features available
+on the market to date. This method is deployed onto a Jiyue test car with no
+modification to its factory-ready sensor set and compute platform. the
+feasibility, usability, and commercial potential are demonstrated in this
+article.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Calibrated Self-Rewarding Vision Language Models 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.14622v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.14622v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Yiyang Zhou, Zhiyuan Fan, Dongjie Cheng, Sihan Yang, Zhaorun Chen, Chenhang Cui, Xiyao Wang, Yun Li, Linjun Zhang, Huaxiu Yao
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Large Vision-Language Models (LVLMs) have made substantial progress by
+integrating pre-trained large language models (LLMs) and vision models through
+instruction tuning. Despite these advancements, LVLMs often exhibit the
+hallucination phenomenon, where generated text responses appear linguistically
+plausible but contradict the input image, indicating a misalignment between
+image and text pairs. This misalignment arises because the model tends to
+prioritize textual information over visual input, even when both the language
+model and visual representations are of high quality. Existing methods leverage
+additional models or human annotations to curate preference data and enhance
+modality alignment through preference optimization. These approaches may not
+effectively reflect the target LVLM's preferences, making the curated
+preferences easily distinguishable. Our work addresses these challenges by
+proposing the Calibrated Self-Rewarding (CSR) approach, which enables the model
+to self-improve by iteratively generating candidate responses, evaluating the
+reward for each response, and curating preference data for fine-tuning. In the
+reward modeling, we employ a step-wise strategy and incorporate visual
+constraints into the self-rewarding process to place greater emphasis on visual
+input. Empirical results demonstrate that CSR enhances performance and reduces
+hallucinations across ten benchmarks and tasks, achieving substantial
+improvements over existing methods by 7.62%. Our empirical results are further
+supported by rigorous theoretical analysis, under mild assumptions, verifying
+the effectiveness of introducing visual constraints into the self-rewarding
+paradigm. Additionally, CSR shows compatibility with different vision-language
+models and the ability to incrementally improve performance through iterative
+fine-tuning. Our data and code are available at
+https://github.com/YiyangZhou/CSR.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>fix some typos and add acknowledgement section in V3</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Pre- to Post-Contrast Breast MRI Synthesis for Enhanced Tumour
+  Segmentation <span class="chip">SP</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2311.10879v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2311.10879v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Richard Osuala, Smriti Joshi, Apostolia Tsirikoglou, Lidia Garrucho, Walter H. L. Pinaya, Oliver Diaz, Karim Lekadir
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Despite its benefits for tumour detection and treatment, the administration
+of contrast agents in dynamic contrast-enhanced MRI (DCE-MRI) is associated
+with a range of issues, including their invasiveness, bioaccumulation, and a
+risk of nephrogenic systemic fibrosis. This study explores the feasibility of
+producing synthetic contrast enhancements by translating pre-contrast
+T1-weighted fat-saturated breast MRI to their corresponding first DCE-MRI
+sequence leveraging the capabilities of a generative adversarial network (GAN).
+Additionally, we introduce a Scaled Aggregate Measure (SAMe) designed for
+quantitatively evaluating the quality of synthetic data in a principled manner
+and serving as a basis for selecting the optimal generative model. We assess
+the generated DCE-MRI data using quantitative image quality metrics and apply
+them to the downstream task of 3D breast tumour segmentation. Our results
+highlight the potential of post-contrast DCE-MRI synthesis in enhancing the
+robustness of breast tumour segmentation models via data augmentation. Our code
+is available at https://github.com/RichardObi/pre_post_synthesis.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted as oral presentation at SPIE Medical Imaging 2024 (Image
+  Processing)</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Behind Every Domain There is a Shift: Adapting Distortion-aware Vision
+  <span class="highlight-title">Transformer</span>s for Panoramic Semantic Segmentation <span class="chip">CVPR 2022</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2207.11860v5">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2207.11860v5.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Jiaming Zhang, Kailun Yang, Hao Shi, Simon Reiß, Kunyu Peng, Chaoxiang Ma, Haodong Fu, Philip H. S. Torr, Kaiwei Wang, Rainer Stiefelhagen
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  In this paper, we address panoramic semantic segmentation which is
+under-explored due to two critical challenges: (1) image distortions and object
+deformations on panoramas; (2) lack of semantic annotations in the 360{\deg}
+imagery. To tackle these problems, first, we propose the upgraded Transformer
+for Panoramic Semantic Segmentation, i.e., Trans4PASS+, equipped with
+Deformable Patch Embedding (DPE) and Deformable MLP (DMLPv2) modules for
+handling object deformations and image distortions whenever (before or after
+adaptation) and wherever (shallow or deep levels). Second, we enhance the
+Mutual Prototypical Adaptation (MPA) strategy via pseudo-label rectification
+for unsupervised domain adaptive panoramic segmentation. Third, aside from
+Pinhole-to-Panoramic (Pin2Pan) adaptation, we create a new dataset (SynPASS)
+with 9,080 panoramic images, facilitating Synthetic-to-Real (Syn2Real)
+adaptation scheme in 360{\deg} imagery. Extensive experiments are conducted,
+which cover indoor and outdoor scenarios, and each of them is investigated with
+Pin2Pan and Syn2Real regimens. Trans4PASS+ achieves state-of-the-art
+performances on four domain adaptive panoramic semantic segmentation
+benchmarks. Code is available at https://github.com/jamycheung/Trans4PASS.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted to IEEE Transactions on Pattern Analysis and Machine
+  Intelligence (TPAMI). Extended version of CVPR 2022 paper arXiv:2203.01452.
+  Code is available at https://github.com/jamycheung/Trans4PASS</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Towards Imbalanced Motion: Part-Decoupling Network for Video Portrait
+  Segmentation 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2307.16565v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2307.16565v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Tianshu Yu, Changqun Xia, Jia Li
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Video portrait segmentation (VPS), aiming at segmenting prominent foreground
+portraits from video frames, has received much attention in recent years.
+However, simplicity of existing VPS datasets leads to a limitation on extensive
+research of the task. In this work, we propose a new intricate large-scale
+Multi-scene Video Portrait Segmentation dataset MVPS consisting of 101 video
+clips in 7 scenario categories, in which 10,843 sampled frames are finely
+annotated at pixel level. The dataset has diverse scenes and complicated
+background environments, which is the most complex dataset in VPS to our best
+knowledge. Through the observation of a large number of videos with portraits
+during dataset construction, we find that due to the joint structure of human
+body, motion of portraits is part-associated, which leads that different parts
+are relatively independent in motion. That is, motion of different parts of the
+portraits is imbalanced. Towards this imbalance, an intuitive and reasonable
+idea is that different motion states in portraits can be better exploited by
+decoupling the portraits into parts. To achieve this, we propose a
+Part-Decoupling Network (PDNet) for video portrait segmentation. Specifically,
+an Inter-frame Part-Discriminated Attention (IPDA) module is proposed which
+unsupervisedly segments portrait into parts and utilizes different
+attentiveness on discriminative features specified to each different part. In
+this way, appropriate attention can be imposed to portrait parts with
+imbalanced motion to extract part-discriminated correlations, so that the
+portraits can be segmented more accurately. Experimental results demonstrate
+that our method achieves leading performance with the comparison to
+state-of-the-art methods.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ The Victim and The Beneficiary: Exploiting a Poisoned Model to Train a
+  Clean Model on Poisoned Data <span class="chip">ICCV</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2404.11265v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2404.11265v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Zixuan Zhu, Rui Wang, Cong Zou, Lihua Jing
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Recently, backdoor attacks have posed a serious security threat to the
+training process of deep neural networks (DNNs). The attacked model behaves
+normally on benign samples but outputs a specific result when the trigger is
+present. However, compared with the rocketing progress of backdoor attacks,
+existing defenses are difficult to deal with these threats effectively or
+require benign samples to work, which may be unavailable in real scenarios. In
+this paper, we find that the poisoned samples and benign samples can be
+distinguished with prediction entropy. This inspires us to propose a novel
+dual-network training framework: The Victim and The Beneficiary (V&B), which
+exploits a poisoned model to train a clean model without extra benign samples.
+Firstly, we sacrifice the Victim network to be a powerful poisoned sample
+detector by training on suspicious samples. Secondly, we train the Beneficiary
+network on the credible samples selected by the Victim to inhibit backdoor
+injection. Thirdly, a semi-supervised suppression strategy is adopted for
+erasing potential backdoors and improving model performance. Furthermore, to
+better inhibit missed poisoned samples, we propose a strong data augmentation
+method, AttentionMix, which works well with our proposed V&B framework.
+Extensive experiments on two widely used datasets against 6 state-of-the-art
+attacks demonstrate that our framework is effective in preventing backdoor
+injection and robust to various attacks while maintaining the performance on
+benign samples. Our code is available at https://github.com/Zixuan-Zhu/VaB.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>13 pages, 6 figures, published to ICCV</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ HQ-DiT: Efficient Diffusion <span class="highlight-title">Transformer</span> with FP4 Hybrid Quantization 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.19751v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.19751v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Wenxuan Liu, Sai Qian Zhang
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Diffusion Transformers (DiTs) have recently gained substantial attention in
+both industrial and academic fields for their superior visual generation
+capabilities, outperforming traditional diffusion models that use U-Net.
+However,the enhanced performance of DiTs also comes with high parameter counts
+and implementation costs, seriously restricting their use on resource-limited
+devices such as mobile phones. To address these challenges, we introduce the
+Hybrid Floating-point Quantization for DiT(HQ-DiT), an efficient post-training
+quantization method that utilizes 4-bit floating-point (FP) precision on both
+weights and activations for DiT inference. Compared to fixed-point quantization
+(e.g., INT8), FP quantization, complemented by our proposed clipping range
+selection mechanism, naturally aligns with the data distribution within DiT,
+resulting in a minimal quantization error. Furthermore, HQ-DiT also implements
+a universal identity mathematical transform to mitigate the serious
+quantization error caused by the outliers. The experimental results demonstrate
+that DiT can achieve extremely low-precision quantization (i.e., 4 bits) with
+negligible impact on performance. Our approach marks the first instance where
+both weights and activations in DiTs are quantized to just 4 bits, with only a
+0.12 increase in sFID on ImageNet.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ A Pixel Is Worth More Than One 3D Gaussians in Single-View 3D
+  Reconstruction 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20310v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20310v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Jianghao Shen, Xue Nan, Tianfu Wu
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Learning 3D scene representation from a single-view image is a long-standing
+fundamental problem in computer vision, with the inherent ambiguity in
+predicting contents unseen from the input view. Built on the recently proposed
+3D Gaussian Splatting (3DGS), the Splatter Image method has made promising
+progress on fast single-image novel view synthesis via learning a single 3D
+Gaussian for each pixel based on the U-Net feature map of an input image.
+However, it has limited expressive power to represent occluded components that
+are not observable in the input view. To address this problem, this paper
+presents a Hierarchical Splatter Image method in which a pixel is worth more
+than one 3D Gaussians. Specifically, each pixel is represented by a parent 3D
+Gaussian and a small number of child 3D Gaussians. Parent 3D Gaussians are
+learned as done in the vanilla Splatter Image. Child 3D Gaussians are learned
+via a lightweight Multi-Layer Perceptron (MLP) which takes as input the
+projected image features of a parent 3D Gaussian and the embedding of a target
+camera view. Both parent and child 3D Gaussians are learned end-to-end in a
+stage-wise way. The joint condition of input image features from eyes of the
+parent Gaussians and the target camera position facilitates learning to
+allocate child Gaussians to ``see the unseen'', recovering the occluded details
+that are often missed by parent Gaussians.
+  In experiments, the proposed method is tested on the ShapeNet-SRN and CO3D
+datasets with state-of-the-art performance obtained, especially showing
+promising capabilities of reconstructing occluded contents in the input view.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>preprint, under review</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Memory Consolidation Enables Long-Context Video Understanding 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2402.05861v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2402.05861v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Ivana Balažević, Yuge Shi, Pinelopi Papalampidi, Rahma Chaabouni, Skanda Koppula, Olivier J. Hénaff
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Most transformer-based video encoders are limited to short temporal contexts
+due to their quadratic complexity. While various attempts have been made to
+extend this context, this has often come at the cost of both conceptual and
+computational complexity. We propose to instead re-purpose existing pre-trained
+video transformers by simply fine-tuning them to attend to memories derived
+non-parametrically from past activations. By leveraging redundancy reduction,
+our memory-consolidated vision transformer (MC-ViT) effortlessly extends its
+context far into the past and exhibits excellent scaling behavior when learning
+from longer videos. In doing so, MC-ViT sets a new state-of-the-art in
+long-context video understanding on EgoSchema, Perception Test, and Diving48,
+outperforming methods that benefit from orders of magnitude more parameters.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Deep Learning-Based Object Pose Estimation: A Comprehensive <span class="highlight-title">Survey</span> 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.07801v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.07801v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Jian Liu, Wei Sun, Hui Yang, Zhiwen Zeng, Chongpei Liu, Jin Zheng, Xingyu Liu, Hossein Rahmani, Nicu Sebe, Ajmal Mian
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Object pose estimation is a fundamental computer vision problem with broad
+applications in augmented reality and robotics. Over the past decade, deep
+learning models, due to their superior accuracy and robustness, have
+increasingly supplanted conventional algorithms reliant on engineered point
+pair features. Nevertheless, several challenges persist in contemporary
+methods, including their dependency on labeled training data, model
+compactness, robustness under challenging conditions, and their ability to
+generalize to novel unseen objects. A recent survey discussing the progress
+made on different aspects of this area, outstanding challenges, and promising
+future directions, is missing. To fill this gap, we discuss the recent advances
+in deep learning-based object pose estimation, covering all three formulations
+of the problem, \emph{i.e.}, instance-level, category-level, and unseen object
+pose estimation. Our survey also covers multiple input data modalities,
+degrees-of-freedom of output poses, object properties, and downstream tasks,
+providing the readers with a holistic understanding of this field.
+Additionally, it discusses training paradigms of different domains, inference
+modes, application areas, evaluation metrics, and benchmark datasets, as well
+as reports the performance of current state-of-the-art methods on these
+benchmarks, thereby facilitating the readers in selecting the most suitable
+method for their application. Finally, the survey identifies key challenges,
+reviews the prevailing trends along with their pros and cons, and identifies
+promising directions for future research. We also keep tracing the latest works
+at https://github.com/CNJianLiu/Awesome-Object-Pose-Estimation.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>27 pages, 7 figures</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ From CNNs to Shift-Invariant Twin Models Based on Complex Wavelets 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2212.00394v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2212.00394v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Hubert Leterme, Kévin Polisano, Valérie Perrier, Karteek Alahari
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  We propose a novel method to increase shift invariance and prediction
+accuracy in convolutional neural networks. Specifically, we replace the
+first-layer combination "real-valued convolutions + max pooling" (RMax) by
+"complex-valued convolutions + modulus" (CMod), which is stable to
+translations, or shifts. To justify our approach, we claim that CMod and RMax
+produce comparable outputs when the convolution kernel is band-pass and
+oriented (Gabor-like filter). In this context, CMod can therefore be considered
+as a stable alternative to RMax. To enforce this property, we constrain the
+convolution kernels to adopt such a Gabor-like structure. The corresponding
+architecture is called mathematical twin, because it employs a well-defined
+mathematical operator to mimic the behavior of the original, freely-trained
+model. Our approach achieves superior accuracy on ImageNet and CIFAR-10
+classification tasks, compared to prior methods based on low-pass filtering.
+Arguably, our approach's emphasis on retaining high-frequency details
+contributes to a better balance between shift invariance and information
+preservation, resulting in improved performance. Furthermore, it has a lower
+computational cost and memory footprint than concurrent work, making it a
+promising solution for practical implementation.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ CLIP-QDA: An Explainable Concept Bottleneck Model 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2312.00110v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2312.00110v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Rémi Kazmierczak, Eloïse Berthier, Goran Frehse, Gianni Franchi
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  In this paper, we introduce an explainable algorithm designed from a
+multi-modal foundation model, that performs fast and explainable image
+classification. Drawing inspiration from CLIP-based Concept Bottleneck Models
+(CBMs), our method creates a latent space where each neuron is linked to a
+specific word. Observing that this latent space can be modeled with simple
+distributions, we use a Mixture of Gaussians (MoG) formalism to enhance the
+interpretability of this latent space. Then, we introduce CLIP-QDA, a
+classifier that only uses statistical values to infer labels from the concepts.
+In addition, this formalism allows for both local and global explanations.
+These explanations come from the inner design of our architecture, our work is
+part of a new family of greybox models, combining performances of opaque
+foundation models and the interpretability of transparent models. Our empirical
+findings show that in instances where the MoG assumption holds, CLIP-QDA
+achieves similar accuracy with state-of-the-art methods CBMs. Our explanations
+compete with existing XAI methods while being faster to compute.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ TIC-TAC: A Framework for Improved Covariance Estimation in Deep
+  Heteroscedastic Regression <span class="chip">ICML 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2310.18953v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2310.18953v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Megh Shukla, Mathieu Salzmann, Alexandre Alahi
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Deep heteroscedastic regression involves jointly optimizing the mean and
+covariance of the predicted distribution using the negative log-likelihood.
+However, recent works show that this may result in sub-optimal convergence due
+to the challenges associated with covariance estimation. While the literature
+addresses this by proposing alternate formulations to mitigate the impact of
+the predicted covariance, we focus on improving the predicted covariance
+itself. We study two questions: (1) Does the predicted covariance truly capture
+the randomness of the predicted mean? (2) In the absence of supervision, how
+can we quantify the accuracy of covariance estimation? We address (1) with a
+Taylor Induced Covariance (TIC), which captures the randomness of the predicted
+mean by incorporating its gradient and curvature through the second order
+Taylor polynomial. Furthermore, we tackle (2) by introducing a Task Agnostic
+Correlations (TAC) metric, which combines the notion of correlations and
+absolute error to evaluate the covariance. We evaluate TIC-TAC across multiple
+experiments spanning synthetic and real-world datasets. Our results show that
+not only does TIC accurately learn the covariance, it additionally facilitates
+an improved convergence of the negative log-likelihood. Our code is available
+at https://github.com/vita-epfl/TIC-TAC
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>ICML 2024. Please feel free to provide feedback!</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ 360Loc: A <span class="highlight-title">Dataset</span> and Benchmark for Omnidirectional Visual Localization
+  with Cross-device Queries <span class="chip">CVPR 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2311.17389v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2311.17389v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Huajian Huang, Changkun Liu, Yipeng Zhu, Hui Cheng, Tristan Braud, Sai-Kit Yeung
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Portable 360$^\circ$ cameras are becoming a cheap and efficient tool to
+establish large visual databases. By capturing omnidirectional views of a
+scene, these cameras could expedite building environment models that are
+essential for visual localization. However, such an advantage is often
+overlooked due to the lack of valuable datasets. This paper introduces a new
+benchmark dataset, 360Loc, composed of 360$^\circ$ images with ground truth
+poses for visual localization. We present a practical implementation of
+360$^\circ$ mapping combining 360$^\circ$ images with lidar data to generate
+the ground truth 6DoF poses. 360Loc is the first dataset and benchmark that
+explores the challenge of cross-device visual positioning, involving
+360$^\circ$ reference frames, and query frames from pinhole, ultra-wide FoV
+fisheye, and 360$^\circ$ cameras. We propose a virtual camera approach to
+generate lower-FoV query frames from 360$^\circ$ images, which ensures a fair
+comparison of performance among different query types in visual localization
+tasks. We also extend this virtual camera approach to feature matching-based
+and pose regression-based methods to alleviate the performance loss caused by
+the cross-device domain gap, and evaluate its effectiveness against
+state-of-the-art baselines. We demonstrate that omnidirectional visual
+localization is more robust in challenging large-scale scenes with symmetries
+and repetitive structures. These results provide new insights into 360-camera
+mapping and omnidirectional visual localization with cross-device queries.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>CVPR 2024. Project Page: https://huajianup.github.io/research/360Loc/</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Exposure Bracketing is All You Need for Unifying Image Restoration and
+  Enhancement Tasks 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2401.00766v4">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2401.00766v4.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Zhilu Zhang, Shuohao Zhang, Renlong Wu, Zifei Yan, Wangmeng Zuo
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  It is highly desired but challenging to acquire high-quality photos with
+clear content in low-light environments. Although multi-image processing
+methods (using burst, dual-exposure, or multi-exposure images) have made
+significant progress in addressing this issue, they typically focus on specific
+restoration or enhancement problems, and do not fully explore the potential of
+utilizing multiple images. Motivated by the fact that multi-exposure images are
+complementary in denoising, deblurring, high dynamic range imaging, and
+super-resolution, we propose to utilize exposure bracketing photography to
+unify image restoration and enhancement tasks in this work. Due to the
+difficulty in collecting real-world pairs, we suggest a solution that first
+pre-trains the model with synthetic paired data and then adapts it to
+real-world unlabeled images. In particular, a temporally modulated recurrent
+network (TMRNet) and self-supervised adaptation method are proposed. Moreover,
+we construct a data simulation pipeline to synthesize pairs and collect
+real-world images from 200 nighttime scenarios. Experiments on both datasets
+show that our method performs favorably against the state-of-the-art
+multi-image processing ones. The dataset, code, and pre-trained models are
+available at https://github.com/cszhilu1998/BracketIRE.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>21 pages</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Attention-aware Semantic Communications for Collaborative Inference 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2404.07217v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2404.07217v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Jiwoong Im, Nayoung Kwon, Taewoo Park, Jiheon Woo, Jaeho Lee, Yongjune Kim
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  We propose a communication-efficient collaborative inference framework in the
+domain of edge inference, focusing on the efficient use of vision transformer
+(ViT) models. The partitioning strategy of conventional collaborative inference
+fails to reduce communication cost because of the inherent architecture of ViTs
+maintaining consistent layer dimensions across the entire transformer encoder.
+Therefore, instead of employing the partitioning strategy, our framework
+utilizes a lightweight ViT model on the edge device, with the server deploying
+a complicated ViT model. To enhance communication efficiency and achieve the
+classification accuracy of the server model, we propose two strategies: 1)
+attention-aware patch selection and 2) entropy-aware image transmission.
+Attention-aware patch selection leverages the attention scores generated by the
+edge device's transformer encoder to identify and select the image patches
+critical for classification. This strategy enables the edge device to transmit
+only the essential patches to the server, significantly improving communication
+efficiency. Entropy-aware image transmission uses min-entropy as a metric to
+accurately determine whether to depend on the lightweight model on the edge
+device or to request the inference from the server model. In our framework, the
+lightweight ViT model on the edge device acts as a semantic encoder,
+efficiently identifying and selecting the crucial image information required
+for the classification task. Our experiments demonstrate that the proposed
+collaborative inference framework can reduce communication overhead by 68% with
+only a minimal loss in accuracy compared to the server model on the ImageNet
+dataset.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Multilinear Mixture of Experts: Scalable Expert Specialization through
+  Factorization 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2402.12550v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2402.12550v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        James Oldfield, Markos Georgopoulos, Grigorios G. Chrysos, Christos Tzelepis, Yannis Panagakis, Mihalis A. Nicolaou, Jiankang Deng, Ioannis Patras
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  The Mixture of Experts (MoE) paradigm provides a powerful way to decompose
+dense layers into smaller, modular computations often more amenable to human
+interpretation, debugging, and editability. However, a major challenge lies in
+the computational cost of scaling the number of experts high enough to achieve
+fine-grained specialization. In this paper, we propose the Multilinear Mixture
+of Experts ($\mu$MoE) layer to address this, focusing on vision models.
+$\mu$MoE layers enable scalable expert specialization by performing an implicit
+computation on prohibitively large weight tensors entirely in factorized form.
+Consequently, $\mu$MoEs (1) avoid the restrictively high inference-time costs
+of 'soft' MoEs, yet (2) do not inherit the training issues of the popular
+'sparse' MoEs' discrete (non-differentiable) expert routing. We present both
+qualitative and quantitative evidence that scaling $\mu$MoE layers when
+fine-tuning foundation models for vision tasks leads to more specialized
+experts at the class-level, further enabling manual bias correction in CelebA
+attribute classification. Finally, we show qualitative results demonstrating
+the expert specialism achieved when pre-training large GPT2 and MLP-Mixer
+models with parameter-matched $\mu$MoE blocks at every layer, maintaining
+comparable accuracy. Our code is available at:
+https://github.com/james-oldfield/muMoE.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Github: https://github.com/james-oldfield/muMoE. Project page:
+  https://james-oldfield.github.io/muMoE/</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ MEGA: Masked Generative Autoencoder for Human Mesh Recovery 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.18839v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.18839v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Guénolé Fiche, Simon Leglaive, Xavier Alameda-Pineda, Francesc Moreno-Noguer
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Human Mesh Recovery (HMR) from a single RGB image is a highly ambiguous
+problem, as similar 2D projections can correspond to multiple 3D
+interpretations. Nevertheless, most HMR methods overlook this ambiguity and
+make a single prediction without accounting for the associated uncertainty. A
+few approaches generate a distribution of human meshes, enabling the sampling
+of multiple predictions; however, none of them is competitive with the latest
+single-output model when making a single prediction. This work proposes a new
+approach based on masked generative modeling. By tokenizing the human pose and
+shape, we formulate the HMR task as generating a sequence of discrete tokens
+conditioned on an input image. We introduce MEGA, a MaskEd Generative
+Autoencoder trained to recover human meshes from images and partial human mesh
+token sequences. Given an image, our flexible generation scheme allows us to
+predict a single human mesh in deterministic mode or to generate multiple human
+meshes in stochastic mode. MEGA enables us to propose multiple outputs and to
+evaluate the uncertainty of the predictions. Experiments on in-the-wild
+benchmarks show that MEGA achieves state-of-the-art performance in
+deterministic and stochastic modes, outperforming single-output and
+multi-output approaches.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Retrieval-Augmented Generation for AI-Generated Content: A <span class="highlight-title">Survey</span> 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2402.19473v5">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2402.19473v5.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Penghao Zhao, Hailin Zhang, Qinhan Yu, Zhengren Wang, Yunteng Geng, Fangcheng Fu, Ling Yang, Wentao Zhang, Jie Jiang, Bin Cui
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Advancements in model algorithms, the growth of foundational models, and
+access to high-quality datasets have propelled the evolution of Artificial
+Intelligence Generated Content (AIGC). Despite its notable successes, AIGC
+still faces hurdles such as updating knowledge, handling long-tail data,
+mitigating data leakage, and managing high training and inference costs.
+Retrieval-Augmented Generation (RAG) has recently emerged as a paradigm to
+address such challenges. In particular, RAG introduces the information
+retrieval process, which enhances the generation process by retrieving relevant
+objects from available data stores, leading to higher accuracy and better
+robustness. In this paper, we comprehensively review existing efforts that
+integrate RAG technique into AIGC scenarios. We first classify RAG foundations
+according to how the retriever augments the generator, distilling the
+fundamental abstractions of the augmentation methodologies for various
+retrievers and generators. This unified perspective encompasses all RAG
+scenarios, illuminating advancements and pivotal technologies that help with
+potential future progress. We also summarize additional enhancements methods
+for RAG, facilitating effective engineering and implementation of RAG systems.
+Then from another view, we survey on practical applications of RAG across
+different modalities and tasks, offering valuable references for researchers
+and practitioners. Furthermore, we introduce the benchmarks for RAG, discuss
+the limitations of current RAG systems, and suggest potential directions for
+future research. Github: https://github.com/PKU-DAIR/RAG-Survey.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Citing 334 papers, 21 pages, 1 table, 12 figures. Project:
+  https://github.com/PKU-DAIR/RAG-Survey</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Having Second Thoughts? Let's hear it 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2311.15356v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2311.15356v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Jung H. Lee, Sujith Vijayan
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Deep learning models loosely mimic bottom-up signal pathways from low-order
+sensory areas to high-order cognitive areas. After training, DL models can
+outperform humans on some domain-specific tasks, but their decision-making
+process has been known to be easily disrupted. Since the human brain consists
+of multiple functional areas highly connected to one another and relies on
+intricate interplays between bottom-up and top-down (from high-order to
+low-order areas) processing, we hypothesize that incorporating top-down signal
+processing may make DL models more robust. To address this hypothesis, we
+propose a certification process mimicking selective attention and test if it
+could make DL models more robust. Our empirical evaluations suggest that this
+newly proposed certification can improve DL models' accuracy and help us build
+safety measures to alleviate their vulnerabilities with both artificial and
+natural adversarial examples.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>10 pages, 6 figures, 3 table and Append/Supplementary materials.
+  Section 3 has been substantially revised</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ DP-IQA: Utilizing Diffusion Prior for Blind Image Quality Assessment in
+  the Wild 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.19996v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.19996v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Honghao Fu, Yufei Wang, Wenhan Yang, Bihan Wen
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Image quality assessment (IQA) plays a critical role in selecting
+high-quality images and guiding compression and enhancement methods in a series
+of applications. The blind IQA, which assesses the quality of in-the-wild
+images containing complex authentic distortions without reference images, poses
+greater challenges. Existing methods are limited to modeling a uniform
+distribution with local patches and are bothered by the gap between low and
+high-level visions (caused by widely adopted pre-trained classification
+networks). In this paper, we propose a novel IQA method called diffusion
+priors-based IQA (DP-IQA), which leverages the prior knowledge from the
+pre-trained diffusion model with its excellent powers to bridge semantic gaps
+in the perception of the visual quality of images. Specifically, we use
+pre-trained stable diffusion as the backbone, extract multi-level features from
+the denoising U-Net during the upsampling process at a specified timestep, and
+decode them to estimate the image quality score. The text and image adapters
+are adopted to mitigate the domain gap for downstream tasks and correct the
+information loss caused by the variational autoencoder bottleneck. Finally, we
+distill the knowledge in the above model into a CNN-based student model,
+significantly reducing the parameter to enhance applicability, with the student
+model performing similarly or even better than the teacher model surprisingly.
+Experimental results demonstrate that our DP-IQA achieves state-of-the-art
+results on various in-the-wild datasets with better generalization capability,
+which shows the superiority of our method in global modeling and utilizing the
+hierarchical feature clues of diffusion for evaluating image quality.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Awesome Multi-modal Object Tracking 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.14200v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.14200v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Chunhui Zhang, Li Liu, Hao Wen, Xi Zhou, Yanfeng Wang
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Multi-modal object tracking (MMOT) is an emerging field that combines data
+from various modalities, \eg vision (RGB), depth, thermal infrared, event,
+language and audio, to estimate the state of an arbitrary object in a video
+sequence. It is of great significance for many applications such as autonomous
+driving and intelligent surveillance. In recent years, MMOT has received more
+and more attention. However, existing MMOT algorithms mainly focus on two
+modalities (\eg RGB+depth, RGB+thermal infrared, and RGB+language). To leverage
+more modalities, some recent efforts have been made to learn a unified visual
+object tracking model for any modality. Additionally, some large-scale
+multi-modal tracking benchmarks have been established by simultaneously
+providing more than two modalities, such as vision-language-audio (\eg
+WebUAV-3M) and vision-depth-language (\eg UniMod1K). To track the latest
+progress in MMOT, we conduct a comprehensive investigation in this report.
+Specifically, we first divide existing MMOT tasks into five main categories,
+\ie RGBL tracking, RGBE tracking, RGBD tracking, RGBT tracking, and
+miscellaneous (RGB+X), where X can be any modality, such as language, depth,
+and event. Then, we analyze and summarize each MMOT task, focusing on widely
+used datasets and mainstream tracking algorithms based on their technical
+paradigms (\eg self-supervised learning, prompt learning, knowledge
+distillation, generative models, and state space models). Finally, we maintain
+a continuously updated paper list for MMOT at
+https://github.com/983632847/Awesome-Multimodal-Object-Tracking.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>A continuously updated project to track the latest progress in
+  multi-modal object tracking</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ 4DHands: Reconstructing Interactive Hands in 4D with <span class="highlight-title">Transformer</span>s 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20330v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20330v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Dixuan Lin, Yuxiang Zhang, Mengcheng Li, Yebin Liu, Wei Jing, Qi Yan, Qianying Wang, Hongwen Zhang
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  In this paper, we introduce 4DHands, a robust approach to recovering
+interactive hand meshes and their relative movement from monocular inputs. Our
+approach addresses two major limitations of previous methods: lacking a unified
+solution for handling various hand image inputs and neglecting the positional
+relationship of two hands within images. To overcome these challenges, we
+develop a transformer-based architecture with novel tokenization and feature
+fusion strategies. Specifically, we propose a Relation-aware Two-Hand
+Tokenization (RAT) method to embed positional relation information into the
+hand tokens. In this way, our network can handle both single-hand and two-hand
+inputs and explicitly leverage relative hand positions, facilitating the
+reconstruction of intricate hand interactions in real-world scenarios. As such
+tokenization indicates the relative relationship of two hands, it also supports
+more effective feature fusion. To this end, we further develop a
+Spatio-temporal Interaction Reasoning (SIR) module to fuse hand tokens in 4D
+with attention and decode them into 3D hand meshes and relative temporal
+movements. The efficacy of our approach is validated on several benchmark
+datasets. The results on in-the-wild videos and real-world scenarios
+demonstrate the superior performances of our approach for interactive hand
+reconstruction. More video results can be found on the project page:
+https://4dhands.github.io.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>More demo videos can be seen at our project page:
+  https://4dhands.github.io</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Visual Attention Analysis in Online Learning 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20091v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20091v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Miriam Navarro, Álvaro Becerra, Roberto Daza, Ruth Cobos, Aythami Morales, Julian Fierrez
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  In this paper, we present an approach in the Multimodal Learning Analytics
+field. Within this approach, we have developed a tool to visualize and analyze
+eye movement data collected during learning sessions in online courses. The
+tool is named VAAD (an acronym for Visual Attention Analysis Dashboard). These
+eye movement data have been gathered using an eye-tracker and subsequently
+processed and visualized for interpretation. The purpose of the tool is to
+conduct a descriptive analysis of the data by facilitating its visualization,
+enabling the identification of differences and learning patterns among various
+learner populations. Additionally, it integrates a predictive module capable of
+anticipating learner activities during a learning session. Consequently, VAAD
+holds the potential to offer valuable insights into online learning behaviors
+from both descriptive and predictive perspectives.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted in CEDI 2024 (VII Congreso Espa\~nol de Inform\'atica), A
+  Coru\~na, Spain</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ MotionGS : Compact Gaussian Splatting SLAM by Motion Filter 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.11129v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.11129v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Xinli Guo, Weidong Zhang, Ruonan Liu, Peng Han, Hongtian Chen
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  With their high-fidelity scene representation capability, the attention of
+SLAM field is deeply attracted by the Neural Radiation Field (NeRF) and 3D
+Gaussian Splatting (3DGS). Recently, there has been a surge in NeRF-based SLAM,
+while 3DGS-based SLAM is sparse. A novel 3DGS-based SLAM approach with a fusion
+of deep visual feature, dual keyframe selection and 3DGS is presented in this
+paper. Compared with the existing methods, the proposed tracking is achieved by
+feature extraction and motion filter on each frame. The joint optimization of
+poses and 3D Gaussians runs through the entire mapping process. Additionally,
+the coarse-to-fine pose estimation and compact Gaussian scene representation
+are implemented by dual keyframe selection and novel loss functions.
+Experimental results demonstrate that the proposed algorithm not only
+outperforms the existing methods in tracking and mapping, but also has less
+memory usage.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ CoDeGAN: Contrastive Disentanglement for Generative Adversarial Network 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2103.03636v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2103.03636v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Jiangwei Zhao, Zejia Liu, Xiaohan Guo, Lili Pan
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Disentanglement, a critical concern in interpretable machine learning, has
+also garnered significant attention from the computer vision community. Many
+existing GAN-based class disentanglement (unsupervised) approaches, such as
+InfoGAN and its variants, primarily aim to maximize the mutual information (MI)
+between the generated image and its latent codes. However, this focus may lead
+to a tendency for the network to generate highly similar images when presented
+with the same latent class factor, potentially resulting in mode collapse or
+mode dropping. To alleviate this problem, we propose \texttt{CoDeGAN}
+(Contrastive Disentanglement for Generative Adversarial Networks), where we
+relax similarity constraints for disentanglement from the image domain to the
+feature domain. This modification not only enhances the stability of GAN
+training but also improves their disentangling capabilities. Moreover, we
+integrate self-supervised pre-training into CoDeGAN to learn semantic
+representations, significantly facilitating unsupervised disentanglement.
+Extensive experimental results demonstrate the superiority of our method over
+state-of-the-art approaches across multiple benchmarks. The code is available
+at https://github.com/learninginvision/CoDeGAN.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ VQ-HPS: Human Pose and Shape Estimation in a Vector-Quantized Latent
+  Space 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2312.08291v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2312.08291v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Guénolé Fiche, Simon Leglaive, Xavier Alameda-Pineda, Antonio Agudo, Francesc Moreno-Noguer
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Previous works on Human Pose and Shape Estimation (HPSE) from RGB images can
+be broadly categorized into two main groups: parametric and non-parametric
+approaches. Parametric techniques leverage a low-dimensional statistical body
+model for realistic results, whereas recent non-parametric methods achieve
+higher precision by directly regressing the 3D coordinates of the human body
+mesh. This work introduces a novel paradigm to address the HPSE problem,
+involving a low-dimensional discrete latent representation of the human mesh
+and framing HPSE as a classification task. Instead of predicting body model
+parameters or 3D vertex coordinates, we focus on predicting the proposed
+discrete latent representation, which can be decoded into a registered human
+mesh. This innovative paradigm offers two key advantages. Firstly, predicting a
+low-dimensional discrete representation confines our predictions to the space
+of anthropomorphic poses and shapes even when little training data is
+available. Secondly, by framing the problem as a classification task, we can
+harness the discriminative power inherent in neural networks. The proposed
+model, VQ-HPS, predicts the discrete latent representation of the mesh. The
+experimental results demonstrate that VQ-HPS outperforms the current
+state-of-the-art non-parametric approaches while yielding results as realistic
+as those produced by parametric methods when trained with little data. VQ-HPS
+also shows promising results when training on large-scale datasets,
+highlighting the significant potential of the classification approach for HPSE.
+See the project page at https://g-fiche.github.io/research-pages/vqhps/
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Data-Efficient 3D Visual Grounding via Order-Aware Referring 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2403.16539v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2403.16539v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Tung-Yu Wu, Sheng-Yu Huang, Yu-Chiang Frank Wang
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  3D visual grounding aims to identify the target object within a 3D point
+cloud scene referred to by a natural language description. Previous works
+usually require significant data relating to point color and their descriptions
+to exploit the corresponding complicated verbo-visual relations. In our work,
+we introduce Vigor, a novel Data-Efficient 3D Visual Grounding framework via
+Order-aware Referring. Vigor leverages LLM to produce a desirable referential
+order from the input description for 3D visual grounding. With the proposed
+stacked object-referring blocks, the predicted anchor objects in the above
+order allow one to locate the target object progressively without supervision
+on the identities of anchor objects or exact relations between anchor/target
+objects. In addition, we present an order-aware warm-up training strategy,
+which augments referential orders for pre-training the visual grounding
+framework. This allows us to better capture the complex verbo-visual relations
+and benefit the desirable data-efficient learning scheme. Experimental results
+on the NR3D and ScanRefer datasets demonstrate our superiority in low-resource
+scenarios. In particular, Vigor surpasses current state-of-the-art frameworks
+by 9.3% and 7.6% grounding accuracy under 1% data and 10% data settings on the
+NR3D dataset, respectively.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ FedLPPA: Learning Personalized <span class="highlight-title">Prompt</span> and Aggregation for Federated
+  Weakly-supervised Medical Image Segmentation 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2402.17502v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2402.17502v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Li Lin, Yixiang Liu, Jiewei Wu, Pujin Cheng, Zhiyuan Cai, Kenneth K. Y. Wong, Xiaoying Tang
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Federated learning (FL) effectively mitigates the data silo challenge brought
+about by policies and privacy concerns, implicitly harnessing more data for
+deep model training. However, traditional centralized FL models grapple with
+diverse multi-center data, especially in the face of significant data
+heterogeneity, notably in medical contexts. In the realm of medical image
+segmentation, the growing imperative to curtail annotation costs has amplified
+the importance of weakly-supervised techniques which utilize sparse annotations
+such as points, scribbles, etc. A pragmatic FL paradigm shall accommodate
+diverse annotation formats across different sites, which research topic remains
+under-investigated. In such context, we propose a novel personalized FL
+framework with learnable prompt and aggregation (FedLPPA) to uniformly leverage
+heterogeneous weak supervision for medical image segmentation. In FedLPPA, a
+learnable universal knowledge prompt is maintained, complemented by multiple
+learnable personalized data distribution prompts and prompts representing the
+supervision sparsity. Integrated with sample features through a dual-attention
+mechanism, those prompts empower each local task decoder to adeptly adjust to
+both the local distribution and the supervision form. Concurrently, a
+dual-decoder strategy, predicated on prompt similarity, is introduced for
+enhancing the generation of pseudo-labels in weakly-supervised learning,
+alleviating overfitting and noise accumulation inherent to local data, while an
+adaptable aggregation method is employed to customize the task decoder on a
+parameter-wise basis. Extensive experiments on four distinct medical image
+segmentation tasks involving different modalities underscore the superiority of
+FedLPPA, with its efficacy closely parallels that of fully supervised
+centralized training. Our code and data will be available.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>12 pages, 10 figures</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Two Optimizers Are Better Than One: LLM Catalyst for Enhancing
+  Gradient-Based Optimization 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.19732v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.19732v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Zixian Guo, Ming Liu, Zhilong Ji, Jinfeng Bai, Yiwen Guo, Wangmeng Zuo
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Learning a skill generally relies on both practical experience by doer and
+insightful high-level guidance by instructor. Will this strategy also work well
+for solving complex non-convex optimization problems? Here, a common
+gradient-based optimizer acts like a disciplined doer, making locally optimal
+update at each step. Recent methods utilize large language models (LLMs) to
+optimize solutions for concrete problems by inferring from natural language
+instructions, akin to a high-level instructor. In this paper, we show that
+these two optimizers are complementary to each other, suggesting a
+collaborative optimization approach. The gradient-based optimizer and LLM-based
+optimizer are combined in an interleaved manner. We instruct LLMs using task
+descriptions and timely optimization trajectories recorded during
+gradient-based optimization. Inferred results from LLMs are used as restarting
+points for the next stage of gradient optimization. By leveraging both the
+locally rigorous gradient-based optimizer and the high-level deductive
+LLM-based optimizer, our combined optimization method consistently yields
+improvements over competitive baseline prompt tuning methods. Our results
+demonstrate the synergistic effect of conventional gradient-based optimization
+and the inference ability of LLMs. The code is released at
+https://github.com/guozix/LLM-catalyst.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ N-Dimensional Gaussians for Fitting of High Dimensional Functions 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20067v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20067v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Stavros Diolatzis, Tobias Zirr, Alexandr Kuznetsov, Georgios Kopanas, Anton Kaplanyan
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  In the wake of many new ML-inspired approaches for reconstructing and
+representing high-quality 3D content, recent hybrid and explicitly learned
+representations exhibit promising performance and quality characteristics.
+However, their scaling to higher dimensions is challenging, e.g. when
+accounting for dynamic content with respect to additional parameters such as
+material properties, illumination, or time. In this paper, we tackle these
+challenges for an explicit representations based on Gaussian mixture models.
+With our solutions, we arrive at efficient fitting of compact N-dimensional
+Gaussian mixtures and enable efficient evaluation at render time: For fast
+fitting and evaluation, we introduce a high-dimensional culling scheme that
+efficiently bounds N-D Gaussians, inspired by Locality Sensitive Hashing. For
+adaptive refinement yet compact representation, we introduce a loss-adaptive
+density control scheme that incrementally guides the use of additional capacity
+towards missing details. With these tools we can for the first time represent
+complex appearance that depends on many input dimensions beyond position or
+viewing angle within a compact, explicit representation optimized in minutes
+and rendered in milliseconds.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>https://www.sdiolatz.info/ndg-fitting/</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Scale-Invariant Feature Disentanglement via Adversarial Learning for
+  UAV-based Object Detection 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.15465v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.15465v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Fan Liu, Liang Yao, Chuanyi Zhang, Ting Wu, Xinlei Zhang, Xiruo Jiang, Jun Zhou
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Detecting objects from Unmanned Aerial Vehicles (UAV) is often hindered by a
+large number of small objects, resulting in low detection accuracy. To address
+this issue, mainstream approaches typically utilize multi-stage inferences.
+Despite their remarkable detecting accuracies, real-time efficiency is
+sacrificed, making them less practical to handle real applications. To this
+end, we propose to improve the single-stage inference accuracy through learning
+scale-invariant features. Specifically, a Scale-Invariant Feature Disentangling
+module is designed to disentangle scale-related and scale-invariant features.
+Then an Adversarial Feature Learning scheme is employed to enhance
+disentanglement. Finally, scale-invariant features are leveraged for robust
+UAV-based object detection. Furthermore, we construct a multi-modal UAV object
+detection dataset, State-Air, which incorporates annotated UAV state
+parameters. We apply our approach to three state-of-the-art lightweight
+detection frameworks on three benchmark datasets, including State-Air.
+Extensive experiments demonstrate that our approach can effectively improve
+model accuracy. Our code and dataset are provided in Supplementary Materials
+and will be publicly available once the paper is accepted.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Benchmarking and Improving Detail Image Caption 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.19092v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.19092v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Hongyuan Dong, Jiawen Li, Bohong Wu, Jiacong Wang, Yuan Zhang, Haoyuan Guo
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Image captioning has long been regarded as a fundamental task in visual
+understanding. Recently, however, few large vision-language model (LVLM)
+research discusses model's image captioning performance because of the outdated
+short-caption benchmarks and unreliable evaluation metrics. In this work, we
+propose to benchmark detail image caption task by curating high-quality
+evaluation datasets annotated by human experts, GPT-4V and Gemini-1.5-Pro. We
+also design a more reliable caption evaluation metric called CAPTURE (CAPtion
+evaluation by exTracting and coUpling coRE information). CAPTURE extracts
+visual elements, e.g., objects, attributes and relations from captions, and
+then matches these elements through three stages, achieving the highest
+consistency with expert judgements over other rule-based or model-based caption
+metrics. The proposed benchmark and metric provide reliable evaluation for
+LVLM's detailed image captioning ability. Guided by this evaluation, we further
+explore to unleash LVLM's detail caption capabilities by synthesizing
+high-quality data through a five-stage data construction pipeline. Our pipeline
+only uses a given LVLM itself and other open-source tools, without any human or
+GPT-4V annotation in the loop. Experiments show that the proposed data
+construction strategy significantly improves model-generated detail caption
+data quality for LVLMs with leading performance, and the data quality can be
+further improved in a self-looping paradigm. All code and dataset will be
+publicly available at https://github.com/foundation-multimodal-models/CAPTURE.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ SparseDrive: End-to-End Autonomous Driving via Sparse Scene
+  Representation 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.19620v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.19620v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Wenchao Sun, Xuewu Lin, Yining Shi, Chuang Zhang, Haoran Wu, Sifa Zheng
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  The well-established modular autonomous driving system is decoupled into
+different standalone tasks, e.g. perception, prediction and planning, suffering
+from information loss and error accumulation across modules. In contrast,
+end-to-end paradigms unify multi-tasks into a fully differentiable framework,
+allowing for optimization in a planning-oriented spirit. Despite the great
+potential of end-to-end paradigms, both the performance and efficiency of
+existing methods are not satisfactory, particularly in terms of planning
+safety. We attribute this to the computationally expensive BEV (bird's eye
+view) features and the straightforward design for prediction and planning. To
+this end, we explore the sparse representation and review the task design for
+end-to-end autonomous driving, proposing a new paradigm named SparseDrive.
+Concretely, SparseDrive consists of a symmetric sparse perception module and a
+parallel motion planner. The sparse perception module unifies detection,
+tracking and online mapping with a symmetric model architecture, learning a
+fully sparse representation of the driving scene. For motion prediction and
+planning, we review the great similarity between these two tasks, leading to a
+parallel design for motion planner. Based on this parallel design, which models
+planning as a multi-modal problem, we propose a hierarchical planning selection
+strategy , which incorporates a collision-aware rescore module, to select a
+rational and safe trajectory as the final planning output. With such effective
+designs, SparseDrive surpasses previous state-of-the-arts by a large margin in
+performance of all tasks, while achieving much higher training and inference
+efficiency. Code will be avaliable at https://github.com/swc-17/SparseDrive for
+facilitating future research.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ NoiseBoost: Alleviating Hallucination with Noise Perturbation for
+  Multimodal Large Language Models 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20081v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20081v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Kai Wu, Boyuan Jiang, Zhengkai Jiang, Qingdong He, Donghao Luo, Shengzhi Wang, Qingwen Liu, Chengjie Wang
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Multimodal large language models (MLLMs) contribute a powerful mechanism to
+understanding visual information building on large language models. However,
+MLLMs are notorious for suffering from hallucinations, especially when
+generating lengthy, detailed descriptions for images. Our analysis reveals that
+hallucinations stem from the inherent summarization mechanism of large language
+models, leading to excessive dependence on linguistic tokens while neglecting
+vision information. In this paper, we propose NoiseBoost, a broadly applicable
+and simple method for alleviating hallucinations for MLLMs through the
+integration of noise feature perturbations. Noise perturbation acts as a
+regularizer, facilitating a balanced distribution of attention weights among
+visual and linguistic tokens. Despite its simplicity, NoiseBoost consistently
+enhances the performance of MLLMs across common training strategies, including
+supervised fine-tuning and reinforcement learning. Further, NoiseBoost
+pioneerly enables semi-supervised learning for MLLMs, unleashing the power of
+unlabeled data. Comprehensive experiments demonstrate that NoiseBoost improves
+dense caption accuracy by 8.1% with human evaluation and achieves comparable
+results with 50% of the data by mining unlabeled data. Code and models are
+available at https://kaiwu5.github.io/noiseboost.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>14 pages, 5 figures with supplementary material</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ An Efficient and Multi-private Key Secure Aggregation for Federated
+  Learning 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2306.08970v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2306.08970v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Xue Yang, Zifeng Liu, Xiaohu Tang, Rongxing Lu, Bo Liu
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  With the emergence of privacy leaks in federated learning, secure aggregation
+protocols that mainly adopt either homomorphic encryption or threshold secret
+sharing have been widely developed for federated learning to protect the
+privacy of the local training data of each client. However, these existing
+protocols suffer from many shortcomings, such as the dependence on a trusted
+third party, the vulnerability to clients being corrupted, low efficiency, the
+trade-off between security and fault tolerance, etc. To solve these
+disadvantages, we propose an efficient and multi-private key secure aggregation
+scheme for federated learning. Specifically, we skillfully modify the variant
+ElGamal encryption technique to achieve homomorphic addition operation, which
+has two important advantages: 1) The server and each client can freely select
+public and private keys without introducing a trust third party and 2) Compared
+to the variant ElGamal encryption, the plaintext space is relatively large,
+which is more suitable for the deep model. Besides, for the high dimensional
+deep model parameter, we introduce a super-increasing sequence to compress
+multi-dimensional data into 1-D, which can greatly reduce encryption and
+decryption times as well as communication for ciphertext transmission. Detailed
+security analyses show that our proposed scheme achieves the semantic security
+of both individual local gradients and the aggregated result while achieving
+optimal robustness in tolerating both client collusion and dropped clients.
+Extensive simulations demonstrate that the accuracy of our scheme is almost the
+same as the non-private approach, while the efficiency of our scheme is much
+better than the state-of-the-art homomorphic encryption-based secure
+aggregation schemes. More importantly, the efficiency advantages of our scheme
+will become increasingly prominent as the number of model parameters increases.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ ReasonPix2Pix: Instruction Reasoning <span class="highlight-title">Dataset</span> for Advanced Image Editing 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.11190v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.11190v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Ying Jin, Pengyang Ling, Xiaoyi Dong, Pan Zhang, Jiaqi Wang, Dahua Lin
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Instruction-based image editing focuses on equipping a generative model with
+the capacity to adhere to human-written instructions for editing images.
+Current approaches typically comprehend explicit and specific instructions.
+However, they often exhibit a deficiency in executing active reasoning
+capacities required to comprehend instructions that are implicit or
+insufficiently defined. To enhance active reasoning capabilities and impart
+intelligence to the editing model, we introduce ReasonPix2Pix, a comprehensive
+reasoning-attentive instruction editing dataset. The dataset is characterized
+by 1) reasoning instruction, 2) more realistic images from fine-grained
+categories, and 3) increased variances between input and edited images. When
+fine-tuned with our dataset under supervised conditions, the model demonstrates
+superior performance in instructional editing tasks, independent of whether the
+tasks require reasoning or not. The code will be available at
+https://github.com/Jin-Ying/ReasonPix2Pix.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Scaling White-Box <span class="highlight-title">Transformer</span>s for Vision 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20299v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20299v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Jinrui Yang, Xianhang Li, Druv Pai, Yuyin Zhou, Yi Ma, Yaodong Yu, Cihang Xie
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  CRATE, a white-box transformer architecture designed to learn compressed and
+sparse representations, offers an intriguing alternative to standard vision
+transformers (ViTs) due to its inherent mathematical interpretability. Despite
+extensive investigations into the scaling behaviors of language and vision
+transformers, the scalability of CRATE remains an open question which this
+paper aims to address. Specifically, we propose CRATE-$\alpha$, featuring
+strategic yet minimal modifications to the sparse coding block in the CRATE
+architecture design, and a light training recipe designed to improve the
+scalability of CRATE. Through extensive experiments, we demonstrate that
+CRATE-$\alpha$ can effectively scale with larger model sizes and datasets. For
+example, our CRATE-$\alpha$-B substantially outperforms the prior best CRATE-B
+model accuracy on ImageNet classification by 3.7%, achieving an accuracy of
+83.2%. Meanwhile, when scaling further, our CRATE-$\alpha$-L obtains an
+ImageNet classification accuracy of 85.1%. More notably, these model
+performance improvements are achieved while preserving, and potentially even
+enhancing the interpretability of learned CRATE models, as we demonstrate
+through showing that the learned token representations of increasingly larger
+trained CRATE-$\alpha$ models yield increasingly higher-quality unsupervised
+object segmentation of images. The project page is
+https://rayjryang.github.io/CRATE-alpha/.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>project page: https://rayjryang.github.io/CRATE-alpha/</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Efficient Meta-Learning Enabled Lightweight Multiscale Few-Shot Object
+  Detection in Remote Sensing Images 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2404.18426v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2404.18426v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Wenbin Guan, Zijiu Yang, Xiaohong Wu, Liqiong Chen, Feng Huang, Xiaohai He, Honggang Chen
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Presently, the task of few-shot object detection (FSOD) in remote sensing
+images (RSIs) has become a focal point of attention. Numerous few-shot
+detectors, particularly those based on two-stage detectors, face challenges
+when dealing with the multiscale complexities inherent in RSIs. Moreover, these
+detectors present impractical characteristics in real-world applications,
+mainly due to their unwieldy model parameters when handling large amount of
+data. In contrast, we recognize the advantages of one-stage detectors,
+including high detection speed and a global receptive field. Consequently, we
+choose the YOLOv7 one-stage detector as a baseline and subject it to a novel
+meta-learning training framework. This transformation allows the detector to
+adeptly address FSOD tasks while capitalizing on its inherent advantage of
+lightweight. Additionally, we thoroughly investigate the samples generated by
+the meta-learning strategy and introduce a novel meta-sampling approach to
+retain samples produced by our designed meta-detection head. Coupled with our
+devised meta-cross loss, we deliberately utilize "negative samples" that are
+often overlooked to extract valuable knowledge from them. This approach serves
+to enhance detection accuracy and efficiently refine the overall meta-learning
+strategy. To validate the effectiveness of our proposed detector, we conducted
+performance comparisons with current state-of-the-art detectors using the DIOR
+and NWPU VHR-10.v2 datasets, yielding satisfactory results.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Multimodal Cross-Domain Few-Shot Learning for Egocentric Action
+  Recognition 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.19917v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.19917v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Masashi Hatano, Ryo Hachiuma, Ryo Fujii, Hideo Saito
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  We address a novel cross-domain few-shot learning task (CD-FSL) with
+multimodal input and unlabeled target data for egocentric action recognition.
+This paper simultaneously tackles two critical challenges associated with
+egocentric action recognition in CD-FSL settings: (1) the extreme domain gap in
+egocentric videos (\eg, daily life vs. industrial domain) and (2) the
+computational cost for real-world applications. We propose MM-CDFSL, a
+domain-adaptive and computationally efficient approach designed to enhance
+adaptability to the target domain and improve inference speed. To address the
+first challenge, we propose the incorporation of multimodal distillation into
+the student RGB model using teacher models. Each teacher model is trained
+independently on source and target data for its respective modality. Leveraging
+only unlabeled target data during multimodal distillation enhances the student
+model's adaptability to the target domain. We further introduce ensemble masked
+inference, a technique that reduces the number of input tokens through masking.
+In this approach, ensemble prediction mitigates the performance degradation
+caused by masking, effectively addressing the second issue. Our approach
+outperformed the state-of-the-art CD-FSL approaches with a substantial margin
+on multiple egocentric datasets, improving by an average of 6.12/6.10 points
+for 1-shot/5-shot settings while achieving $2.2$ times faster inference speed.
+Project page: https://masashi-hatano.github.io/MM-CDFSL/
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Directly Denoising Diffusion Models 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.13540v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.13540v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Dan Zhang, Jingjing Wang, Feng Luo
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  In this paper, we present the Directly Denoising Diffusion Model (DDDM): a
+simple and generic approach for generating realistic images with few-step
+sampling, while multistep sampling is still preserved for better performance.
+DDDMs require no delicately designed samplers nor distillation on pre-trained
+distillation models. DDDMs train the diffusion model conditioned on an
+estimated target that was generated from previous training iterations of its
+own. To generate images, samples generated from the previous time step are also
+taken into consideration, guiding the generation process iteratively. We
+further propose Pseudo-LPIPS, a novel metric loss that is more robust to
+various values of hyperparameter. Despite its simplicity, the proposed approach
+can achieve strong performance in benchmark datasets. Our model achieves FID
+scores of 2.57 and 2.33 on CIFAR-10 in one-step and two-step sampling
+respectively, surpassing those obtained from GANs and distillation-based
+models. By extending the sampling to 1000 steps, we further reduce FID score to
+1.79, aligning with state-of-the-art methods in the literature. For ImageNet
+64x64, our approach stands as a competitive contender against leading models.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ ParSEL: Parameterized Shape Editing with Language 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20319v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20319v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Aditya Ganeshan, Ryan Y. Huang, Xianghao Xu, R. Kenny Jones, Daniel Ritchie
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  The ability to edit 3D assets from natural language presents a compelling
+paradigm to aid in the democratization of 3D content creation. However, while
+natural language is often effective at communicating general intent, it is
+poorly suited for specifying precise manipulation. To address this gap, we
+introduce ParSEL, a system that enables controllable editing of high-quality 3D
+assets from natural language. Given a segmented 3D mesh and an editing request,
+ParSEL produces a parameterized editing program. Adjusting the program
+parameters allows users to explore shape variations with a precise control over
+the magnitudes of edits. To infer editing programs which align with an input
+edit request, we leverage the abilities of large-language models (LLMs).
+However, while we find that LLMs excel at identifying initial edit operations,
+they often fail to infer complete editing programs, and produce outputs that
+violate shape semantics. To overcome this issue, we introduce Analytical Edit
+Propagation (AEP), an algorithm which extends a seed edit with additional
+operations until a complete editing program has been formed. Unlike prior
+methods, AEP searches for analytical editing operations compatible with a range
+of possible user edits through the integration of computer algebra systems for
+geometric analysis. Experimentally we demonstrate ParSEL's effectiveness in
+enabling controllable editing of 3D objects through natural language requests
+over alternative system designs.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Iris-SAM: Iris Segmentation Using a Foundation Model 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2402.06497v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2402.06497v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Parisa Farmanifard, Arun Ross
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Iris segmentation is a critical component of an iris biometric system and it
+involves extracting the annular iris region from an ocular image. In this work,
+we develop a pixel-level iris segmentation model from a foundational model,
+viz., Segment Anything Model (SAM), that has been successfully used for
+segmenting arbitrary objects. The primary contribution of this work lies in the
+integration of different loss functions during the fine-tuning of SAM on ocular
+images. In particular, the importance of Focal Loss is borne out in the
+fine-tuning process since it strategically addresses the class imbalance
+problem (i.e., iris versus non-iris pixels). Experiments on ND-IRIS-0405,
+CASIA-Iris-Interval-v3, and IIT-Delhi-Iris datasets convey the efficacy of the
+trained model for the task of iris segmentation. For instance, on the
+ND-IRIS-0405 dataset, an average segmentation accuracy of 99.58% was achieved,
+compared to the best baseline performance of 89.75%.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ The University of California San Francisco Brain Metastases Stereotactic
+  Radiosurgery (UCSF-BMSR) MRI <span class="highlight-title">Dataset</span> 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2304.07248v4">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2304.07248v4.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Jeffrey D. Rudie, Rachit Saluja, David A. Weiss, Pierre Nedelec, Evan Calabrese, John B. Colby, Benjamin Laguna, John Mongan, Steve Braunstein, Christopher P. Hess, Andreas M. Rauschecker, Leo P. Sugrue, Javier E. Villanueva-Meyer
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  The University of California San Francisco Brain Metastases Stereotactic
+Radiosurgery (UCSF-BMSR) dataset is a public, clinical, multimodal brain MRI
+dataset consisting of 560 brain MRIs from 412 patients with expert annotations
+of 5136 brain metastases. Data consists of registered and skull stripped T1
+post-contrast, T1 pre-contrast, FLAIR and subtraction (T1 pre-contrast - T1
+post-contrast) images and voxelwise segmentations of enhancing brain metastases
+in NifTI format. The dataset also includes patient demographics, surgical
+status and primary cancer types. The UCSF-BSMR has been made publicly available
+in the hopes that researchers will use these data to push the boundaries of AI
+applications for brain metastases. The dataset is freely available for
+non-commercial use at https://imagingdatasets.ucsf.edu/dataset/1
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>15 pages, 2 tables, 2 figures</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Any2Point: Empowering Any-modality Large Models for Efficient 3D
+  Understanding 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2404.07989v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2404.07989v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Yiwen Tang, Ray Zhang, Jiaming Liu, Zoey Guo, Dong Wang, Zhigang Wang, Bin Zhao, Shanghang Zhang, Peng Gao, Hongsheng Li, Xuelong Li
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Large foundation models have recently emerged as a prominent focus of
+interest, attaining superior performance in widespread scenarios. Due to the
+scarcity of 3D data, many efforts have been made to adapt pre-trained
+transformers from vision to 3D domains. However, such 2D-to-3D approaches are
+still limited, due to the potential loss of spatial geometries and high
+computation cost. More importantly, their frameworks are mainly designed for 2D
+models, lacking a general any-to-3D paradigm. In this paper, we introduce
+Any2Point, a parameter-efficient method to empower any-modality large models
+(vision, language, audio) for 3D understanding. Given a frozen transformer from
+any source modality, we propose a 3D-to-any (1D or 2D) virtual projection
+strategy that correlates the input 3D points to the original 1D or 2D positions
+within the source modality. This mechanism enables us to assign each 3D token
+with a positional encoding paired with the pre-trained model, which avoids 3D
+geometry loss caused by the true projection and better motivates the
+transformer for 3D learning with 1D/2D positional priors. Then, within each
+transformer block, we insert an any-to-3D guided adapter module for
+parameter-efficient fine-tuning. The adapter incorporates prior spatial
+knowledge from the source modality to guide the local feature aggregation of 3D
+tokens, compelling the semantic adaption of any-modality transformers. We
+conduct extensive experiments to showcase the effectiveness and efficiency of
+our method. Code and models are released at
+https://github.com/Ivan-Tang-3D/Any2Point.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Code and models are released at
+  https://github.com/Ivan-Tang-3D/Any2Point</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ KerasCV and KerasNLP: Vision and Language Power-Ups 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20247v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20247v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Matthew Watson, Divyashree Shivakumar Sreepathihalli, Francois Chollet, Martin Gorner, Kiranbir Sodhia, Ramesh Sampath, Tirth Patel, Haifeng Jin, Neel Kovelamudi, Gabriel Rasskin, Samaneh Saadat, Luke Wood, Chen Qian, Jonathan Bischof, Ian Stenbit, Abheesht Sharma, Anshuman Mishra
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  We present the Keras domain packages KerasCV and KerasNLP, extensions of the
+Keras API for Computer Vision and Natural Language Processing workflows,
+capable of running on either JAX, TensorFlow, or PyTorch. These domain packages
+are designed to enable fast experimentation, with a focus on ease-of-use and
+performance. We adopt a modular, layered design: at the library's lowest level
+of abstraction, we provide building blocks for creating models and data
+preprocessing pipelines, and at the library's highest level of abstraction, we
+provide pretrained ``task" models for popular architectures such as Stable
+Diffusion, YOLOv8, GPT2, BERT, Mistral, CLIP, Gemma, T5, etc. Task models have
+built-in preprocessing, pretrained weights, and can be fine-tuned on raw
+inputs. To enable efficient training, we support XLA compilation for all
+models, and run all preprocessing via a compiled graph of TensorFlow operations
+using the tf.data API. The libraries are fully open-source (Apache 2.0 license)
+and available on GitHub.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Submitted to Journal of Machine Learning Open Source Software</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ SNeurodCNN: Structure-focused Neurodegeneration Convolutional Neural
+  Network for Modelling and Classification of Alzheimer's Disease 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2401.03922v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2401.03922v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Simisola Odimayo, Chollette C. Olisah, Khadija Mohammed
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Alzheimer's disease (AD), the predominant form of dementia, is a growing
+global challenge, emphasizing the urgent need for accurate and early diagnosis.
+Current clinical diagnoses rely on radiologist expert interpretation, which is
+prone to human error. Deep learning has thus far shown promise for early AD
+diagnosis. However, existing methods often overlook focal structural atrophy
+critical for enhanced understanding of the cerebral cortex neurodegeneration.
+This paper proposes a deep learning framework that includes a novel
+structure-focused neurodegeneration CNN architecture named SNeurodCNN and an
+image brightness enhancement preprocessor using gamma correction. The
+SNeurodCNN architecture takes as input the focal structural atrophy features
+resulting from segmentation of brain structures captured through magnetic
+resonance imaging (MRI). As a result, the architecture considers only necessary
+CNN components, which comprises of two downsampling convolutional blocks and
+two fully connected layers, for achieving the desired classification task, and
+utilises regularisation techniques to regularise learnable parameters.
+Leveraging mid-sagittal and para-sagittal brain image viewpoints from the
+Alzheimer's Disease Neuroimaging Initiative (ADNI) dataset, our framework
+demonstrated exceptional performance. The para-sagittal viewpoint achieved
+97.8% accuracy, 97.0% specificity, and 98.5% sensitivity, while the
+mid-sagittal viewpoint offered deeper insights with 98.1% accuracy, 97.2%
+specificity, and 99.0% sensitivity. Model analysis revealed the ability of
+SNeurodCNN to capture the structural dynamics of mild cognitive impairment
+(MCI) and AD in the frontal lobe, occipital lobe, cerebellum, temporal, and
+parietal lobe, suggesting its potential as a brain structural change
+digi-biomarker for early AD diagnosis. This work can be reproduced using code
+we made available on GitHub.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>36 Pages, 10 figures, 4 tables</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Liver Fat Quantification Network with Body Shape 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.11386v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.11386v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Qiyue Wang, Wu Xue, Xiaoke Zhang, Fang Jin, James Hahn
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  It is critically important to detect the content of liver fat as it is
+related to cardiac complications and cardiovascular disease mortality. However,
+existing methods are either associated with high cost and/or medical
+complications (e.g., liver biopsy, imaging technology) or only roughly estimate
+the grades of steatosis. In this paper, we propose a deep neural network to
+estimate the percentage of liver fat using only body shapes. The proposed is
+composed of a flexible baseline network and a lightweight Attention module. The
+attention module is trained to generate discriminative and diverse features
+which significant improve the performance. In order to validate the method, we
+perform extensive tests on the public medical dataset. The results verify that
+our proposed method yields state-of-the-art performance with Root mean squared
+error (RMSE) of 5.26% and R-Squared value over 0.8. It offers an accurate and
+more accessible assessment of hepatic steatosis.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Autonomous Driving with Spiking Neural Networks 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.19687v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.19687v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Rui-Jie Zhu, Ziqing Wang, Leilani Gilpin, Jason K. Eshraghian
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Autonomous driving demands an integrated approach that encompasses
+perception, prediction, and planning, all while operating under strict energy
+constraints to enhance scalability and environmental sustainability. We present
+Spiking Autonomous Driving (SAD), the first unified Spiking Neural Network
+(SNN) to address the energy challenges faced by autonomous driving systems
+through its event-driven and energy-efficient nature. SAD is trained end-to-end
+and consists of three main modules: perception, which processes inputs from
+multi-view cameras to construct a spatiotemporal bird's eye view; prediction,
+which utilizes a novel dual-pathway with spiking neurons to forecast future
+states; and planning, which generates safe trajectories considering predicted
+occupancy, traffic rules, and ride comfort. Evaluated on the nuScenes dataset,
+SAD achieves competitive performance in perception, prediction, and planning
+tasks, while drawing upon the energy efficiency of SNNs. This work highlights
+the potential of neuromorphic computing to be applied to energy-efficient
+autonomous driving, a critical step toward sustainable and safety-critical
+automotive technology. Our code is available at
+\url{https://github.com/ridgerchu/SAD}.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ MedFLIP: Medical Vision-and-Language <span class="highlight-title">Self-supervised</span> Fast <span class="highlight-title">Pre-Train</span>ing
+  with Masked Autoencoder 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2403.04626v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2403.04626v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Lei Li, Tianfang Zhang, Xinglin Zhang, Jiaqi Liu, Bingqi Ma, Yan Luo, Tao Chen
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Within the domain of medical analysis, extensive research has explored the
+potential of mutual learning between Masked Autoencoders(MAEs) and multimodal
+data. However, the impact of MAEs on intermodality remains a key challenge. We
+introduce MedFLIP, a Fast Language-Image Pre-training method for Medical
+analysis. We explore MAEs for zero-shot learning with crossed domains, which
+enhances the model's ability to learn from limited data, a common scenario in
+medical diagnostics. We verify that masking an image does not affect
+inter-modal learning. Furthermore, we propose the SVD loss to enhance the
+representation learning for characteristics of medical images, aiming to
+improve classification accuracy by leveraging the structural intricacies of
+such data. Our theory posits that masking encourages semantic preservation,
+robust feature extraction, regularization, domain adaptation, and invariance
+learning. Lastly, we validate using language will improve the zero-shot
+performance for the medical image analysis. MedFLIP's scaling of the masking
+process marks an advancement in the field, offering a pathway to rapid and
+precise medical image analysis without the traditional computational
+bottlenecks. Through experiments and validation, MedFLIP demonstrates efficient
+performance improvements, helps for future research and application in medical
+diagnostics.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ HoSNN: Adversarially-Robust Homeostatic Spiking Neural Networks with
+  Adaptive Firing Thresholds 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2308.10373v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2308.10373v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Hejia Geng, Peng Li
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  While spiking neural networks (SNNs) offer a promising neurally-inspired
+model of computation, they are vulnerable to adversarial attacks. We present
+the first study that draws inspiration from neural homeostasis to design a
+threshold-adapting leaky integrate-and-fire (TA-LIF) neuron model and utilize
+TA-LIF neurons to construct the adversarially robust homeostatic SNNs (HoSNNs)
+for improved robustness. The TA-LIF model incorporates a self-stabilizing
+dynamic thresholding mechanism, offering a local feedback control solution to
+the minimization of each neuron's membrane potential error caused by
+adversarial disturbance. Theoretical analysis demonstrates favorable dynamic
+properties of TA-LIF neurons in terms of the bounded-input bounded-output
+stability and suppressed time growth of membrane potential error, underscoring
+their superior robustness compared with the standard LIF neurons. When trained
+with weak FGSM attacks (attack budget = 2/255) and tested with much stronger
+PGD attacks (attack budget = 8/255), our HoSNNs significantly improve model
+accuracy on several datasets: from 30.54% to 74.91% on FashionMNIST, from 0.44%
+to 35.06% on SVHN, from 0.56% to 42.63% on CIFAR10, from 0.04% to 16.66% on
+CIFAR100, over the conventional LIF-based SNNs.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ HD Maps are Lane Detection Generalizers: A Novel Generative Framework
+  for Single-Source Domain Generalization <span class="chip">CVPR</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2311.16589v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2311.16589v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Daeun Lee, Minhyeok Heo, Jiwon Kim
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Lane detection is a vital task for vehicles to navigate and localize their
+position on the road. To ensure reliable driving, lane detection models must
+have robust generalization performance in various road environments. However,
+despite the advanced performance in the trained domain, their generalization
+performance still falls short of expectations due to the domain discrepancy. To
+bridge this gap, we propose a novel generative framework using HD Maps for
+Single-Source Domain Generalization (SSDG) in lane detection. We first generate
+numerous front-view images from lane markings of HD Maps. Next, we
+strategically select a core subset among the generated images using (i) lane
+structure and (ii) road surrounding criteria to maximize their diversity. In
+the end, utilizing this core set, we train lane detection models to boost their
+generalization performance. We validate that our generative framework from HD
+Maps outperforms the Domain Adaptation model MLDA with +3.01%p accuracy
+improvement, even though we do not access the target domain images.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted by CVPR Data-Driven Autonomous Driving Simulation Workshop,
+  2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                    </div>
+                </details>
+            </article>
+            <article>
+                <details>
+                    <Summary>
+                        Information Retrieval <span class="chip" style="font-size: 60%">13</span>
+                    </Summary>
+                    <div class="details-content">
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ CWRCzech: 100M Query-Document Czech Click <span class="highlight-title">Dataset</span> and Its Application to
+  Web Relevance Ranking <span class="chip">SIGIR 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20994v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20994v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Josef Vonášek, Milan Straka, Rostislav Krč, Lenka Lasoňová, Ekaterina Egorova, Jana Straková, Jakub Náplava
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  We present CWRCzech, Click Web Ranking dataset for Czech, a 100M
+query-document Czech click dataset for relevance ranking with user behavior
+data collected from search engine logs of Seznam.cz. To the best of our
+knowledge, CWRCzech is the largest click dataset with raw text published so
+far. It provides document positions in the search results as well as
+information about user behavior: 27.6M clicked documents and 10.8M dwell times.
+In addition, we also publish a manually annotated Czech test for the relevance
+task, containing nearly 50k query-document pairs, each annotated by at least 2
+annotators. Finally, we analyze how the user behavior data improve relevance
+ranking and show that models trained on data automatically harnessed at
+sufficient scale can surpass the performance of models trained on human
+annotated data. CWRCzech is published under an academic non-commercial license
+and is available to the research community at
+https://github.com/seznam/CWRCzech.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted to SIGIR 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ SelfGNN: <span class="highlight-title">Self-Supervised</span> Graph Neural Networks for Sequential
+  Recommendation <span class="chip">SIGIR'24</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20878v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20878v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Yuxi Liu, Lianghao Xia, Chao Huang
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Sequential recommendation effectively addresses information overload by
+modeling users' temporal and sequential interaction patterns. To overcome the
+limitations of supervision signals, recent approaches have adopted
+self-supervised learning techniques in recommender systems. However, there are
+still two critical challenges that remain unsolved. Firstly, existing
+sequential models primarily focus on long-term modeling of individual
+interaction sequences, overlooking the valuable short-term collaborative
+relationships among the behaviors of different users. Secondly, real-world data
+often contain noise, particularly in users' short-term behaviors, which can
+arise from temporary intents or misclicks. Such noise negatively impacts the
+accuracy of both graph and sequence models, further complicating the modeling
+process. To address these challenges, we propose a novel framework called
+Self-Supervised Graph Neural Network (SelfGNN) for sequential recommendation.
+The SelfGNN framework encodes short-term graphs based on time intervals and
+utilizes Graph Neural Networks (GNNs) to learn short-term collaborative
+relationships. It captures long-term user and item representations at multiple
+granularity levels through interval fusion and dynamic behavior modeling.
+Importantly, our personalized self-augmented learning structure enhances model
+robustness by mitigating noise in short-term graphs based on long-term user
+interests and personal stability. Extensive experiments conducted on four
+real-world datasets demonstrate that SelfGNN outperforms various
+state-of-the-art baselines. Our model implementation codes are available at
+https://github.com/HKUDS/SelfGNN.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted by SIGIR'24</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Popularity-Aware Alignment and Contrast for Mitigating Popularity Bias <span class="chip">KDD 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20718v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20718v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Miaomiao Cai, Lei Chen, Yifan Wang, Haoyue Bai, Peijie Sun, Le Wu, Min Zhang, Meng Wang
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Collaborative Filtering (CF) typically suffers from the significant challenge
+of popularity bias due to the uneven distribution of items in real-world
+datasets. This bias leads to a significant accuracy gap between popular and
+unpopular items. It not only hinders accurate user preference understanding but
+also exacerbates the Matthew effect in recommendation systems. To alleviate
+popularity bias, existing efforts focus on emphasizing unpopular items or
+separating the correlation between item representations and their popularity.
+Despite the effectiveness, existing works still face two persistent challenges:
+(1) how to extract common supervision signals from popular items to improve the
+unpopular item representations, and (2) how to alleviate the representation
+separation caused by popularity bias. In this work, we conduct an empirical
+analysis of popularity bias and propose Popularity-Aware Alignment and Contrast
+(PAAC) to address two challenges. Specifically, we use the common supervisory
+signals modeled in popular item representations and propose a novel
+popularity-aware supervised alignment module to learn unpopular item
+representations. Additionally, we suggest re-weighting the contrastive learning
+loss to mitigate the representation separation from a popularity-centric
+perspective. Finally, we validate the effectiveness and rationale of PAAC in
+mitigating popularity bias through extensive experiments on three real-world
+datasets. Our code is available at
+https://github.com/miaomiao-cai2/KDD2024-PAAC.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted by KDD 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Information Maximization via Variational Autoencoders for Cross-Domain
+  Recommendation 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20710v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20710v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Xuying Ning, Wujiang Xu, Xiaolei Liu, Mingming Ha, Qiongxu Ma, Youru Li, Linxun Chen, Yongfeng Zhang
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Cross-Domain Sequential Recommendation (CDSR) methods aim to address the data
+sparsity and cold-start problems present in Single-Domain Sequential
+Recommendation (SDSR). Existing CDSR methods typically rely on overlapping
+users, designing complex cross-domain modules to capture users' latent
+interests that can propagate across different domains. However, their
+propagated informative information is limited to the overlapping users and the
+users who have rich historical behavior records. As a result, these methods
+often underperform in real-world scenarios, where most users are
+non-overlapping (cold-start) and long-tailed. In this research, we introduce a
+new CDSR framework named Information Maximization Variational Autoencoder
+(\textbf{\texttt{IM-VAE}}). Here, we suggest using a Pseudo-Sequence Generator
+to enhance the user's interaction history input for downstream fine-grained
+CDSR models to alleviate the cold-start issues. We also propose a Generative
+Recommendation Framework combined with three regularizers inspired by the
+mutual information maximization (MIM) theory \cite{mcgill1954multivariate} to
+capture the semantic differences between a user's interests shared across
+domains and those specific to certain domains, as well as address the
+informational gap between a user's actual interaction sequences and the
+pseudo-sequences generated. To the best of our knowledge, this paper is the
+first CDSR work that considers the information disentanglement and denoising of
+pseudo-sequences in the open-world recommendation scenario. Empirical
+experiments illustrate that \texttt{IM-VAE} outperforms the state-of-the-art
+approaches on two real-world cross-domain datasets on all sorts of users,
+including cold-start and tailed users, demonstrating the effectiveness of
+\texttt{IM-VAE} in open-world recommendation.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Passage-specific <span class="highlight-title">Prompt</span> Tuning for Passage Reranking in Question
+  Answering with Large Language Models <span class="chip">SIGIR24</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20654v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20654v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Xuyang Wu, Zhiyuan Peng, Sravanthi Rajanala, Hsin-Tai Wu, Yi Fang
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Effective passage retrieval and reranking methods have been widely utilized
+to identify suitable candidates in open-domain question answering tasks, recent
+studies have resorted to LLMs for reranking the retrieved passages by the
+log-likelihood of the question conditioned on each passage. Although these
+methods have demonstrated promising results, the performance is notably
+sensitive to the human-written prompt (or hard prompt), and fine-tuning LLMs
+can be computationally intensive and time-consuming. Furthermore, this approach
+limits the leverage of question-passage relevance pairs and passage-specific
+knowledge to enhance the ranking capabilities of LLMs. In this paper, we
+propose passage-specific prompt tuning for reranking in open-domain question
+answering (PSPT): a parameter-efficient method that fine-tunes learnable
+passage-specific soft prompts, incorporating passage-specific knowledge from a
+limited set of question-passage relevance pairs. The method involves ranking
+retrieved passages based on the log-likelihood of the model generating the
+question conditioned on each passage and the learned soft prompt. We conducted
+extensive experiments utilizing the Llama-2-chat-7B model across three publicly
+available open-domain question answering datasets and the results demonstrate
+the effectiveness of the proposed approach.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted at Gen-IR@SIGIR24</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Large Language Models Enhanced Sequential Recommendation for Long-tail
+  User and Item 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20646v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20646v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Qidong Liu, Xian Wu, Xiangyu Zhao, Yejing Wang, Zijian Zhang, Feng Tian, Yefeng Zheng
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Sequential recommendation systems (SRS) serve the purpose of predicting
+users' subsequent preferences based on their past interactions and have been
+applied across various domains such as e-commerce and social networking
+platforms. However, practical SRS encounters challenges due to the fact that
+most users engage with only a limited number of items, while the majority of
+items are seldom consumed. These challenges, termed as the long-tail user and
+long-tail item dilemmas, often create obstacles for traditional SRS methods.
+Mitigating these challenges is crucial as they can significantly impact user
+satisfaction and business profitability. While some research endeavors have
+alleviated these issues, they still grapple with issues such as seesaw or noise
+stemming from the scarcity of interactions. The emergence of large language
+models (LLMs) presents a promising avenue to address these challenges from a
+semantic standpoint. In this study, we introduce the Large Language Models
+Enhancement framework for Sequential Recommendation (LLM-ESR), which leverages
+semantic embeddings from LLMs to enhance SRS performance without increasing
+computational overhead. To combat the long-tail item challenge, we propose a
+dual-view modeling approach that fuses semantic information from LLMs with
+collaborative signals from traditional SRS. To address the long-tail user
+challenge, we introduce a retrieval augmented self-distillation technique to
+refine user preference representations by incorporating richer interaction data
+from similar users. Through comprehensive experiments conducted on three
+authentic datasets using three widely used SRS models, our proposed enhancement
+framework demonstrates superior performance compared to existing methodologies.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Causal Distillation for Alleviating Performance Heterogeneity in
+  Recommender Systems 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20626v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20626v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Shengyu Zhang, Ziqi Jiang, Jiangchao Yao, Fuli Feng, Kun Kuang, Zhou Zhao, Shuo Li, Hongxia Yang, Tat-Seng Chua, Fei Wu
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Recommendation performance usually exhibits a long-tail distribution over
+users -- a small portion of head users enjoy much more accurate recommendation
+services than the others. We reveal two sources of this performance
+heterogeneity problem: the uneven distribution of historical interactions (a
+natural source); and the biased training of recommender models (a model
+source). As addressing this problem cannot sacrifice the overall performance, a
+wise choice is to eliminate the model bias while maintaining the natural
+heterogeneity. The key to debiased training lies in eliminating the effect of
+confounders that influence both the user's historical behaviors and the next
+behavior. The emerging causal recommendation methods achieve this by modeling
+the causal effect between user behaviors, however potentially neglect
+unobserved confounders (\eg, friend suggestions) that are hard to measure in
+practice. To address unobserved confounders, we resort to the front-door
+adjustment (FDA) in causal theory and propose a causal multi-teacher
+distillation framework (CausalD). FDA requires proper mediators in order to
+estimate the causal effects of historical behaviors on the next behavior. To
+achieve this, we equip CausalD with multiple heterogeneous recommendation
+models to model the mediator distribution. Then, the causal effect estimated by
+FDA is the expectation of recommendation prediction over the mediator
+distribution and the prior distribution of historical behaviors, which is
+technically achieved by multi-teacher ensemble. To pursue efficient inference,
+CausalD further distills multiple teachers into one student model to directly
+infer the causal effect for making recommendations.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>TKDE 2023</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Knowledge Enhanced Multi-intent <span class="highlight-title">Transformer</span> Network for Recommendation <span class="chip">WWW 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20565v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20565v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Ding Zou, Wei Wei, Feida Zhu, Chuanyu Xu, Tao Zhang, Chengfu Huo
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Incorporating Knowledge Graphs into Recommendation has attracted growing
+attention in industry, due to the great potential of KG in providing abundant
+supplementary information and interpretability for the underlying models.
+However, simply integrating KG into recommendation usually brings in negative
+feedback in industry, due to the ignorance of the following two factors: i)
+users' multiple intents, which involve diverse nodes in KG. For example, in
+e-commerce scenarios, users may exhibit preferences for specific styles,
+brands, or colors. ii) knowledge noise, which is a prevalent issue in Knowledge
+Enhanced Recommendation (KGR) and even more severe in industry scenarios. The
+irrelevant knowledge properties of items may result in inferior model
+performance compared to approaches that do not incorporate knowledge. To tackle
+these challenges, we propose a novel approach named Knowledge Enhanced
+Multi-intent Transformer Network for Recommendation (KGTN), comprising two
+primary modules: Global Intents Modeling with Graph Transformer, and Knowledge
+Contrastive Denoising under Intents. Specifically, Global Intents with Graph
+Transformer focuses on capturing learnable user intents, by incorporating
+global signals from user-item-relation-entity interactions with a graph
+transformer, meanwhile learning intent-aware user/item representations.
+Knowledge Contrastive Denoising under Intents is dedicated to learning precise
+and robust representations. It leverages intent-aware representations to sample
+relevant knowledge, and proposes a local-global contrastive mechanism to
+enhance noise-irrelevant representation learning. Extensive experiments
+conducted on benchmark datasets show the superior performance of our proposed
+method over the state-of-the-arts. And online A/B testing results on Alibaba
+large-scale industrial recommendation platform also indicate the real-scenario
+effectiveness of KGTN.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accept By The Web Conf 2024 (WWW 2024) Industry Track. arXiv admin
+  note: text overlap with arXiv:2204.08807</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ LLM-RankFusion: Mitigating Intrinsic Inconsistency in LLM-based Ranking 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.00231v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.00231v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Yifan Zeng, Ojas Tendolkar, Raymond Baartmans, Qingyun Wu, Huazheng Wang, Lizhong Chen
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Ranking passages by prompting a large language model (LLM) can achieve
+promising performance in modern information retrieval (IR) systems. A common
+approach is to sort the ranking list by prompting LLMs for pairwise comparison.
+However, sorting-based methods require consistent comparisons to correctly sort
+the passages, which we show that LLMs often violate. We identify two kinds of
+intrinsic inconsistency in LLM-based pairwise comparisons: order inconsistency
+which leads to conflicting results when switching the passage order, and
+transitive inconsistency which leads to non-transitive triads among all
+preference pairs. In this paper, we propose LLM-RankFusion, an LLM-based
+ranking framework that mitigates these inconsistencies and produces a robust
+ranking list. LLM-RankFusion mitigates order inconsistency using in-context
+learning (ICL) to demonstrate order-agnostic comparisons and calibration to
+estimate the underlying preference probability between two passages. We then
+address transitive inconsistency by aggregating the ranking results from
+multiple rankers. In our experiments, we empirically show that LLM-RankFusion
+can significantly reduce inconsistent pairwise comparison results, and improve
+the ranking quality by making the final ranking list more robust.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ ImplicitSLIM and How it Improves Embedding-based Collaborative Filtering <span class="chip">ICLR 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.00198v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.00198v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Ilya Shenbin, Sergey Nikolenko
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  We present ImplicitSLIM, a novel unsupervised learning approach for sparse
+high-dimensional data, with applications to collaborative filtering. Sparse
+linear methods (SLIM) and their variations show outstanding performance, but
+they are memory-intensive and hard to scale. ImplicitSLIM improves
+embedding-based models by extracting embeddings from SLIM-like models in a
+computationally cheap and memory-efficient way, without explicit learning of
+heavy SLIM-like models. We show that ImplicitSLIM improves performance and
+speeds up convergence for both state of the art and classical collaborative
+filtering methods. The source code for ImplicitSLIM, related models, and
+applications is available at https://github.com/ilya-shenbin/ImplicitSLIM.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Published as a conference paper at ICLR 2024; authors' version</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Multi-hop Question Answering 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2204.09140v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2204.09140v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Vaibhav Mavi, Anubhav Jangra, Adam Jatowt
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  The task of Question Answering (QA) has attracted significant research
+interest for long. Its relevance to language understanding and knowledge
+retrieval tasks, along with the simple setting makes the task of QA crucial for
+strong AI systems. Recent success on simple QA tasks has shifted the focus to
+more complex settings. Among these, Multi-Hop QA (MHQA) is one of the most
+researched tasks over the recent years. In broad terms, MHQA is the task of
+answering natural language questions that involve extracting and combining
+multiple pieces of information and doing multiple steps of reasoning. An
+example of a multi-hop question would be "The Argentine PGA Championship record
+holder has won how many tournaments worldwide?". Answering the question would
+need two pieces of information: "Who is the record holder for Argentine PGA
+Championship tournaments?" and "How many tournaments did [Answer of Sub Q1]
+win?". The ability to answer multi-hop questions and perform multi step
+reasoning can significantly improve the utility of NLP systems. Consequently,
+the field has seen a surge with high quality datasets, models and evaluation
+strategies. The notion of 'multiple hops' is somewhat abstract which results in
+a large variety of tasks that require multi-hop reasoning. This leads to
+different datasets and models that differ significantly from each other and
+makes the field challenging to generalize and survey. We aim to provide a
+general and formal definition of the MHQA task, and organize and summarize
+existing MHQA frameworks. We also outline some best practices for building MHQA
+datasets. This book provides a systematic and thorough introduction as well as
+the structuring of the existing attempts to this highly interesting, yet quite
+challenging task.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Published at Foundations and Trends in Information Retrieval</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Taxonomy of Mathematical Plagiarism <span class="chip">ECIR</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2401.16969v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2401.16969v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Ankit Satpute, Andre Greiner-Petter, Noah Gießing, Isabel Beckenbach, Moritz Schubotz, Olaf Teschke, Akiko Aizawa, Bela Gipp
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Plagiarism is a pressing concern, even more so with the availability of large
+language models. Existing plagiarism detection systems reliably find copied and
+moderately reworded text but fail for idea plagiarism, especially in
+mathematical science, which heavily uses formal mathematical notation. We make
+two contributions. First, we establish a taxonomy of mathematical content reuse
+by annotating potentially plagiarised 122 scientific document pairs. Second, we
+analyze the best-performing approaches to detect plagiarism and mathematical
+content similarity on the newly established taxonomy. We found that the
+best-performing methods for plagiarism and math content similarity achieve an
+overall detection score (PlagDet) of 0.06 and 0.16, respectively. The
+best-performing methods failed to detect most cases from all seven newly
+established math similarity types. Outlined contributions will benefit research
+in plagiarism detection systems, recommender systems, question-answering
+systems, and search engines. We make our experiment's code and annotated
+dataset available to the community:
+https://github.com/gipplab/Taxonomy-of-Mathematical-Plagiarism
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>46th European Conference on Information Retrieval (ECIR)</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Improving Text Embeddings with Large Language Models <span class="chip">ACL 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2401.00368v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2401.00368v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Liang Wang, Nan Yang, Xiaolong Huang, Linjun Yang, Rangan Majumder, Furu Wei
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  In this paper, we introduce a novel and simple method for obtaining
+high-quality text embeddings using only synthetic data and less than 1k
+training steps. Unlike existing methods that often depend on multi-stage
+intermediate pre-training with billions of weakly-supervised text pairs,
+followed by fine-tuning with a few labeled datasets, our method does not
+require building complex training pipelines or relying on manually collected
+datasets that are often constrained by task diversity and language coverage. We
+leverage proprietary LLMs to generate diverse synthetic data for hundreds of
+thousands of text embedding tasks across 93 languages. We then fine-tune
+open-source decoder-only LLMs on the synthetic data using standard contrastive
+loss. Experiments demonstrate that our method achieves strong performance on
+highly competitive text embedding benchmarks without using any labeled data.
+Furthermore, when fine-tuned with a mixture of synthetic and labeled data, our
+model sets new state-of-the-art results on the BEIR and MTEB benchmarks.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted by ACL 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                    </div>
+                </details>
+            </article>
+            <article>
+                <details>
+                    <Summary>
+                        Machine Learning <span class="chip" style="font-size: 60%">150</span>
+                    </Summary>
+                    <div class="details-content">
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Generalization Beyond Data Imbalance: A Controlled Study on CLIP for
+  Transferable Insights 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.21070v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.21070v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Xin Wen, Bingchen Zhao, Yilun Chen, Jiangmiao Pang, Xiaojuan Qi
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Severe data imbalance naturally exists among web-scale vision-language
+datasets. Despite this, we find CLIP pre-trained thereupon exhibits notable
+robustness to the data imbalance compared to supervised learning, and
+demonstrates significant effectiveness in learning generalizable
+representations. With an aim to investigate the reasons behind this finding, we
+conduct controlled experiments to study various underlying factors, and reveal
+that CLIP's pretext task forms a dynamic classification problem wherein only a
+subset of classes is present in training. This isolates the bias from dominant
+classes and implicitly balances the learning signal. Furthermore, the
+robustness and discriminability of CLIP improve with more descriptive language
+supervision, larger data scale, and broader open-world concepts, which are
+inaccessible to supervised learning. Our study not only uncovers the mechanisms
+behind CLIP's generalizability beyond data imbalance but also provides
+transferable insights for the research community. The findings are validated in
+both supervised and self-supervised learning, enabling models trained on
+imbalanced data to achieve CLIP-level performance on diverse recognition tasks.
+Code will be available at: https://github.com/CVMI-Lab/clip-beyond-tail.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Recurrent neural networks: vanishing and exploding gradients are not the
+  end of the story 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.21064v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.21064v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Nicolas Zucchet, Antonio Orvieto
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Recurrent neural networks (RNNs) notoriously struggle to learn long-term
+memories, primarily due to vanishing and exploding gradients. The recent
+success of state-space models (SSMs), a subclass of RNNs, to overcome such
+difficulties challenges our theoretical understanding. In this paper, we delve
+into the optimization challenges of RNNs and discover that, as the memory of a
+network increases, changes in its parameters result in increasingly large
+output variations, making gradient-based learning highly sensitive, even
+without exploding gradients. Our analysis further reveals the importance of the
+element-wise recurrence design pattern combined with careful parametrizations
+in mitigating this effect. This feature is present in SSMs, as well as in other
+architectures, such as LSTMs. Overall, our insights provide a new explanation
+for some of the difficulties in gradient-based learning of RNNs and why some
+architectures perform better than others.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Neural Network Verification with Branch-and-Bound for General
+  Nonlinearities 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.21063v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.21063v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Zhouxing Shi, Qirui Jin, Zico Kolter, Suman Jana, Cho-Jui Hsieh, Huan Zhang
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Branch-and-bound (BaB) is among the most effective methods for neural network
+(NN) verification. However, existing works on BaB have mostly focused on NNs
+with piecewise linear activations, especially ReLU networks. In this paper, we
+develop a general framework, named GenBaB, to conduct BaB for general
+nonlinearities in general computational graphs based on linear bound
+propagation. To decide which neuron to branch, we design a new branching
+heuristic which leverages linear bounds as shortcuts to efficiently estimate
+the potential improvement after branching. To decide nontrivial branching
+points for general nonlinear functions, we propose to optimize branching points
+offline, which can be efficiently leveraged during verification with a lookup
+table. We demonstrate the effectiveness of our GenBaB on verifying a wide range
+of NNs, including networks with activation functions such as Sigmoid, Tanh,
+Sine and GeLU, as well as networks involving multi-dimensional nonlinear
+operations such as multiplications in LSTMs and Vision Transformers. Our
+framework also allows the verification of general nonlinear computation graphs
+and enables verification applications beyond simple neural networks,
+particularly for AC Optimal Power Flow (ACOPF). GenBaB is part of the latest
+$\alpha,\!\beta$-CROWN, the winner of the 4th International Verification of
+Neural Networks Competition (VNN-COMP 2023).
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Preprint</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Graph External Attention Enhanced <span class="highlight-title">Transformer</span> <span class="chip">ICML 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.21061v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.21061v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Jianqing Liang, Min Chen, Jiye Liang
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  The Transformer architecture has recently gained considerable attention in
+the field of graph representation learning, as it naturally overcomes several
+limitations of Graph Neural Networks (GNNs) with customized attention
+mechanisms or positional and structural encodings. Despite making some
+progress, existing works tend to overlook external information of graphs,
+specifically the correlation between graphs. Intuitively, graphs with similar
+structures should have similar representations. Therefore, we propose Graph
+External Attention (GEA) -- a novel attention mechanism that leverages multiple
+external node/edge key-value units to capture inter-graph correlations
+implicitly. On this basis, we design an effective architecture called Graph
+External Attention Enhanced Transformer (GEAET), which integrates local
+structure and global interaction information for more comprehensive graph
+representations. Extensive experiments on benchmark datasets demonstrate that
+GEAET achieves state-of-the-art empirical performance. The source code is
+available for reproducibility at: https://github.com/icm1018/GEAET.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>In Proceedings of ICML 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ <span class="highlight-title">Transformer</span>s are SSMs: Generalized Models and Efficient Algorithms
+  Through Structured State Space Duality <span class="chip">ICML 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.21060v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.21060v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Tri Dao, Albert Gu
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  While Transformers have been the main architecture behind deep learning's
+success in language modeling, state-space models (SSMs) such as Mamba have
+recently been shown to match or outperform Transformers at small to medium
+scale. We show that these families of models are actually quite closely
+related, and develop a rich framework of theoretical connections between SSMs
+and variants of attention, connected through various decompositions of a
+well-studied class of structured semiseparable matrices. Our state space
+duality (SSD) framework allows us to design a new architecture (Mamba-2) whose
+core layer is an a refinement of Mamba's selective SSM that is 2-8X faster,
+while continuing to be competitive with Transformers on language modeling.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>ICML 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Spectrum-Aware Parameter Efficient Fine-Tuning for Diffusion Models 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.21050v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.21050v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Xinxi Zhang, Song Wen, Ligong Han, Felix Juefei-Xu, Akash Srivastava, Junzhou Huang, Hao Wang, Molei Tao, Dimitris N. Metaxas
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Adapting large-scale pre-trained generative models in a parameter-efficient
+manner is gaining traction. Traditional methods like low rank adaptation
+achieve parameter efficiency by imposing constraints but may not be optimal for
+tasks requiring high representation capacity. We propose a novel spectrum-aware
+adaptation framework for generative models. Our method adjusts both singular
+values and their basis vectors of pretrained weights. Using the Kronecker
+product and efficient Stiefel optimizers, we achieve parameter-efficient
+adaptation of orthogonal matrices. We introduce Spectral Orthogonal
+Decomposition Adaptation (SODA), which balances computational efficiency and
+representation capacity. Extensive evaluations on text-to-image diffusion
+models demonstrate SODA's effectiveness, offering a spectrum-aware alternative
+to existing fine-tuning methods.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Grammar-Aligned Decoding 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.21047v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.21047v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Kanghee Park, Jiayu Wang, Taylor Berg-Kirkpatrick, Nadia Polikarpova, Loris D'Antoni
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Large Language Models (LLMs) struggle with reliably generating highly
+structured outputs, such as program code, mathematical formulas, or well-formed
+markup. Constrained decoding approaches mitigate this problem by greedily
+restricting what tokens an LLM can output at each step to guarantee that the
+output matches a given constraint. Specifically, in grammar-constrained
+decoding (GCD), the LLM's output must follow a given grammar. In this paper we
+demonstrate that GCD techniques (and in general constrained decoding
+techniques) can distort the LLM's distribution, leading to outputs that are
+grammatical but appear with likelihoods that are not proportional to the ones
+given by the LLM, and so ultimately are low-quality. We call the problem of
+aligning sampling with a grammar constraint, grammar-aligned decoding (GAD),
+and propose adaptive sampling with approximate expected futures (ASAp), a
+decoding algorithm that guarantees the output to be grammatical while provably
+producing outputs that match the conditional probability of the LLM's
+distribution conditioned on the given grammar constraint. Our algorithm uses
+prior sample outputs to soundly overapproximate the future grammaticality of
+different output prefixes. Our evaluation on code generation and structured NLP
+tasks shows how ASAp often produces outputs with higher likelihood (according
+to the LLM's distribution) than existing GCD techniques, while still enforcing
+the desired grammatical constraints.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Exploratory Preference Optimization: Harnessing Implicit
+  Q*-Approximation for Sample-Efficient RLHF 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.21046v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.21046v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Tengyang Xie, Dylan J. Foster, Akshay Krishnamurthy, Corby Rosset, Ahmed Awadallah, Alexander Rakhlin
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Reinforcement learning from human feedback (RLHF) has emerged as a central
+tool for language model alignment. We consider online exploration in RLHF,
+which exploits interactive access to human or AI feedback by deliberately
+encouraging the model to produce diverse, maximally informative responses. By
+allowing RLHF to confidently stray from the pre-trained model, online
+exploration offers the possibility of novel, potentially super-human
+capabilities, but its full potential as a paradigm for language model training
+has yet to be realized, owing to computational and statistical bottlenecks in
+directly adapting existing reinforcement learning techniques. We propose a new
+algorithm for online exploration in RLHF, Exploratory Preference Optimization
+(XPO), which is simple and practical -- a one-line change to (online) Direct
+Preference Optimization (DPO; Rafailov et al., 2023) -- yet enjoys the
+strongest known provable guarantees and promising empirical performance. XPO
+augments the DPO objective with a novel and principled exploration bonus,
+empowering the algorithm to explore outside the support of the initial model
+and human feedback data. In theory, we show that XPO is provably
+sample-efficient and converges to a near-optimal language model policy under
+natural exploration conditions, irrespective of whether the initial model has
+good coverage. Our analysis, which builds on the observation that DPO
+implicitly performs a form of $Q^{\star}$-approximation (or, Bellman error
+minimization), combines previously disparate techniques from language modeling
+and theoretical reinforcement learning in a serendipitous fashion through the
+perspective of KL-regularized Markov decision processes. Empirically, we find
+that XPO is more sample-efficient than non-exploratory DPO variants in a
+preliminary evaluation.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ An Attention-Based Multi-Context Convolutional Encoder-Decoder Neural
+  Network for Work Zone Traffic Impact Prediction 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.21045v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.21045v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Qinhua Jiang, Xishun Liao, Yaofa Gong, Jiaqi Ma
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Work zone is one of the major causes of non-recurrent traffic congestion and
+road incidents. Despite the significance of its impact, studies on predicting
+the traffic impact of work zones remain scarce. In this paper, we propose a
+data integration pipeline that enhances the utilization of work zone and
+traffic data from diversified platforms, and introduce a novel deep learning
+model to predict the traffic speed and incident likelihood during planned work
+zone events. The proposed model transforms traffic patterns into 2D space-time
+images for both model input and output and employs an attention-based
+multi-context convolutional encoder-decoder architecture to capture the
+spatial-temporal dependencies between work zone events and traffic variations.
+Trained and validated on four years of archived work zone traffic data from
+Maryland, USA, the model demonstrates superior performance over baseline models
+in predicting traffic speed, incident likelihood, and inferred traffic
+attributes such as queue length and congestion timings (i.e., start time and
+duration). Specifically, the proposed model outperforms the baseline models by
+reducing the prediction error of traffic speed by 5% to 34%, queue length by
+11% to 29%, congestion timing by 6% to 17%, and increasing the accuracy of
+incident predictions by 5% to 7%. Consequently, this model offers substantial
+promise for enhancing the planning and traffic management of work zones.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Target Networks and Over-parameterization Stabilize Off-policy
+  Bootstrapping with Function Approximation 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.21043v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.21043v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Fengdi Che, Chenjun Xiao, Jincheng Mei, Bo Dai, Ramki Gummadi, Oscar A Ramirez, Christopher K Harris, A. Rupam Mahmood, Dale Schuurmans
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  We prove that the combination of a target network and over-parameterized
+linear function approximation establishes a weaker convergence condition for
+bootstrapped value estimation in certain cases, even with off-policy data. Our
+condition is naturally satisfied for expected updates over the entire
+state-action space or learning with a batch of complete trajectories from
+episodic Markov decision processes. Notably, using only a target network or an
+over-parameterized model does not provide such a convergence guarantee.
+Additionally, we extend our results to learning with truncated trajectories,
+showing that convergence is achievable for all tasks with minor modifications,
+akin to value truncation for the final states in trajectories. Our primary
+result focuses on temporal difference estimation for prediction, providing
+high-probability value estimation error bounds and empirical analysis on
+Baird's counterexample and a Four-room task. Furthermore, we explore the
+control setting, demonstrating that similar convergence conditions apply to
+Q-learning.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Comparing information content of representation spaces for
+  disentanglement with VAE ensembles 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.21042v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.21042v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Kieran A. Murphy, Sam Dillavou, Dani S. Bassett
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Disentanglement is the endeavour to use machine learning to divide
+information about a dataset into meaningful fragments. In practice these
+fragments are representation (sub)spaces, often the set of channels in the
+latent space of a variational autoencoder (VAE). Assessments of disentanglement
+predominantly employ metrics that are coarse-grained at the model level, but
+this approach can obscure much about the process of information fragmentation.
+Here we propose to study the learned channels in aggregate, as the fragments of
+information learned by an ensemble of repeat training runs. Additionally, we
+depart from prior work where measures of similarity between individual
+subspaces neglected the nature of data embeddings as probability distributions.
+Instead, we view representation subspaces as communication channels that
+perform a soft clustering of the data; consequently, we generalize two classic
+information-theoretic measures of similarity between clustering assignments to
+compare representation spaces. We develop a lightweight method of estimation
+based on fingerprinting representation subspaces by their ability to
+distinguish dataset samples, allowing us to identify, analyze, and leverage
+meaningful structure in ensembles of VAEs trained on synthetic and natural
+datasets. Using this fully unsupervised pipeline we identify "hotspots" in the
+space of information fragments: groups of nearly identical representation
+subspaces that appear repeatedly in an ensemble of VAEs, particularly as
+regularization is increased. Finally, we leverage the proposed methodology to
+achieve ensemble learning with VAEs, boosting the information content of a set
+of weak learners -- a capability not possible with previous methods of
+assessing channel similarity.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Code:
+  https://github.com/murphyka/representation-space-info-comparison</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ A-PETE: Adaptive Prototype Explanations of Tree Ensembles 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.21036v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.21036v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Jacek Karolczak, Jerzy Stefanowski
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  The need for interpreting machine learning models is addressed through
+prototype explanations within the context of tree ensembles. An algorithm named
+Adaptive Prototype Explanations of Tree Ensembles (A-PETE) is proposed to
+automatise the selection of prototypes for these classifiers. Its unique
+characteristics is using a specialised distance measure and a modified k-medoid
+approach. Experiments demonstrated its competitive predictive accuracy with
+respect to earlier explanation algorithms. It also provides a a sufficient
+number of prototypes for the purpose of interpreting the random forest
+classifier.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Fusion-PSRO: Nash Policy Fusion for Policy Space Response Oracles 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.21027v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.21027v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Jiesong Lian, Yucong Huang, Mingzhi Wang, Chengdong Ma, Yixue Hao, Ying Wen, Yaodong Yang
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  For solving zero-sum games involving non-transitivity, a common approach is
+to maintain population policies to approximate the Nash Equilibrium (NE).
+Previous research has shown that the Policy Space Response Oracle (PSRO) is an
+effective multi-agent reinforcement learning framework for these games.
+However, repeatedly training new policies from scratch to approximate the Best
+Response (BR) to opponents' mixed policies at each iteration is inefficient and
+costly. While some PSRO methods initialize a new BR policy by inheriting from
+past BR policies, this approach limits the exploration of new policies,
+especially against challenging opponents.To address this issue, we propose
+Fusion-PSRO, which uses model fusion to initialize the policy for better
+approximation to BR. With Top-k probabilities from NE, we select high-quality
+base policies and fuse them into a new BR policy through model averaging. This
+approach allows the initialized policy to incorporate multiple expert policies,
+making it easier to handle difficult opponents compared to inheriting or
+initializing from scratch. Additionally, our method only modifies the policy
+initialization, enabling its application to nearly all PSRO variants without
+additional training overhead.Our experiments with non-transitive matrix games,
+Leduc poker, and the more complex Liars Dice demonstrate that Fusion-PSRO
+enhances the performance of nearly all PSRO variants, achieving lower
+exploitability.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>20 pages, 5 figures</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Beyond Conventional Parametric Modeling: Data-Driven Framework for
+  Estimation and Prediction of Time Activity Curves in Dynamic PET Imaging 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.21021v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.21021v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Niloufar Zakariaei, Arman Rahmim, Eldad Haber
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Dynamic Positron Emission Tomography (dPET) imaging and Time-Activity Curve
+(TAC) analyses are essential for understanding and quantifying the
+biodistribution of radiopharmaceuticals over time and space. Traditional
+compartmental modeling, while foundational, commonly struggles to fully capture
+the complexities of biological systems, including non-linear dynamics and
+variability. This study introduces an innovative data-driven neural
+network-based framework, inspired by Reaction Diffusion systems, designed to
+address these limitations. Our approach, which adaptively fits TACs from dPET,
+enables the direct calibration of diffusion coefficients and reaction terms
+from observed data, offering significant improvements in predictive accuracy
+and robustness over traditional methods, especially in complex biological
+scenarios. By more accurately modeling the spatio-temporal dynamics of
+radiopharmaceuticals, our method advances modeling of pharmacokinetic and
+pharmacodynamic processes, enabling new possibilities in quantitative nuclear
+medicine.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Improved Techniques for Optimization-Based Jailbreaking on Large
+  Language Models 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.21018v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.21018v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Xiaojun Jia, Tianyu Pang, Chao Du, Yihao Huang, Jindong Gu, Yang Liu, Xiaochun Cao, Min Lin
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Large language models (LLMs) are being rapidly developed, and a key component
+of their widespread deployment is their safety-related alignment. Many
+red-teaming efforts aim to jailbreak LLMs, where among these efforts, the
+Greedy Coordinate Gradient (GCG) attack's success has led to a growing interest
+in the study of optimization-based jailbreaking techniques. Although GCG is a
+significant milestone, its attacking efficiency remains unsatisfactory. In this
+paper, we present several improved (empirical) techniques for
+optimization-based jailbreaks like GCG. We first observe that the single target
+template of "Sure" largely limits the attacking performance of GCG; given this,
+we propose to apply diverse target templates containing harmful self-suggestion
+and/or guidance to mislead LLMs. Besides, from the optimization aspects, we
+propose an automatic multi-coordinate updating strategy in GCG (i.e.,
+adaptively deciding how many tokens to replace in each step) to accelerate
+convergence, as well as tricks like easy-to-hard initialisation. Then, we
+combine these improved technologies to develop an efficient jailbreak method,
+dubbed $\mathcal{I}$-GCG. In our experiments, we evaluate on a series of
+benchmarks (such as NeurIPS 2023 Red Teaming Track). The results demonstrate
+that our improved techniques can help GCG outperform state-of-the-art
+jailbreaking attacks and achieve nearly 100% attack success rate. The code is
+released at https://github.com/jiaxiaojunQAQ/I-GCG.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ G-<span class="highlight-title">Transformer</span> for Conditional Average Potential Outcome Estimation over
+  Time 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.21012v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.21012v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Konstantin Hess, Dennis Frauen, Valentyn Melnychuk, Stefan Feuerriegel
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Estimating potential outcomes for treatments over time based on observational
+data is important for personalized decision-making in medicine. Yet, existing
+neural methods for this task suffer from either (a) bias or (b) large variance.
+In order to address both limitations, we introduce the G-transformer (GT). Our
+GT is a novel, neural end-to-end model designed for unbiased, low-variance
+estimation of conditional average potential outcomes (CAPOs) over time.
+Specifically, our GT is the first neural model to perform regression-based
+iterative G-computation for CAPOs in the time-varying setting. We evaluate the
+effectiveness of our GT across various experiments. In sum, this work
+represents a significant step towards personalized decision-making from
+electronic health records.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Explaining Predictions by Characteristic Rules <span class="chip">ECML</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.21003v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.21003v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Amr Alkhatib, Henrik Boström, Michalis Vazirgiannis
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Characteristic rules have been advocated for their ability to improve
+interpretability over discriminative rules within the area of rule learning.
+However, the former type of rule has not yet been used by techniques for
+explaining predictions. A novel explanation technique, called CEGA
+(Characteristic Explanatory General Association rules), is proposed, which
+employs association rule mining to aggregate multiple explanations generated by
+any standard local explanation technique into a set of characteristic rules. An
+empirical investigation is presented, in which CEGA is compared to two
+state-of-the-art methods, Anchors and GLocalX, for producing local and
+aggregated explanations in the form of discriminative rules. The results
+suggest that the proposed approach provides a better trade-off between fidelity
+and complexity compared to the two state-of-the-art approaches; CEGA and
+Anchors significantly outperform GLocalX with respect to fidelity, while CEGA
+and GLocalX significantly outperform Anchors with respect to the number of
+generated rules. The effect of changing the format of the explanations of CEGA
+to discriminative rules and using LIME and SHAP as local explanation techniques
+instead of Anchors are also investigated. The results show that the
+characteristic explanatory rules still compete favorably with rules in the
+standard discriminative format. The results also indicate that using CEGA in
+combination with either SHAP or Anchors consistently leads to a higher fidelity
+compared to using LIME as the local explanation technique.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Machine Learning and Knowledge Discovery in Databases. ECML PKDD 2022</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Information limits and Thouless-Anderson-Palmer equations for spiked
+  matrix models with structured noise 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20993v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20993v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Jean Barbier, Francesco Camilli, Marco Mondelli, Yizhou Xu
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  We consider a prototypical problem of Bayesian inference for a structured
+spiked model: a low-rank signal is corrupted by additive noise. While both
+information-theoretic and algorithmic limits are well understood when the noise
+is i.i.d. Gaussian, the more realistic case of structured noise still proves to
+be challenging. To capture the structure while maintaining mathematical
+tractability, a line of work has focused on rotationally invariant noise.
+However, existing studies either provide sub-optimal algorithms or they are
+limited to a special class of noise ensembles. In this paper, we establish the
+first characterization of the information-theoretic limits for a noise matrix
+drawn from a general trace ensemble. These limits are then achieved by an
+efficient algorithm inspired by the theory of adaptive Thouless-Anderson-Palmer
+(TAP) equations. Our approach leverages tools from statistical physics (replica
+method) and random matrix theory (generalized spherical integrals), and it
+unveils the equivalence between the rotationally invariant model and a
+surrogate Gaussian model.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Hard Cases Detection in Motion Prediction by Vision-Language Foundation
+  Models 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20991v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20991v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Yi Yang, Qingwen Zhang, Kei Ikemura, Nazre Batool, John Folkesson
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Addressing hard cases in autonomous driving, such as anomalous road users,
+extreme weather conditions, and complex traffic interactions, presents
+significant challenges. To ensure safety, it is crucial to detect and manage
+these scenarios effectively for autonomous driving systems. However, the rarity
+and high-risk nature of these cases demand extensive, diverse datasets for
+training robust models. Vision-Language Foundation Models (VLMs) have shown
+remarkable zero-shot capabilities as being trained on extensive datasets. This
+work explores the potential of VLMs in detecting hard cases in autonomous
+driving. We demonstrate the capability of VLMs such as GPT-4v in detecting hard
+cases in traffic participant motion prediction on both agent and scenario
+levels. We introduce a feasible pipeline where VLMs, fed with sequential image
+frames with designed prompts, effectively identify challenging agents or
+scenarios, which are verified by existing prediction models. Moreover, by
+taking advantage of this detection of hard cases by VLMs, we further improve
+the training efficiency of the existing motion prediction pipeline by
+performing data selection for the training samples suggested by GPT. We show
+the effectiveness and feasibility of our pipeline incorporating VLMs with
+state-of-the-art methods on NuScenes datasets. The code is accessible at
+https://github.com/KTH-RPL/Detect_VLM.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>IEEE Intelligent Vehicles Symposium (IV) 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Locking Machine Learning Models into Hardware 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20990v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20990v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Eleanor Clifford, Adhithya Saravanan, Harry Langford, Cheng Zhang, Yiren Zhao, Robert Mullins, Ilia Shumailov, Jamie Hayes
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Modern Machine Learning models are expensive IP and business competitiveness
+often depends on keeping this IP confidential. This in turn restricts how these
+models are deployed -- for example it is unclear how to deploy a model
+on-device without inevitably leaking the underlying model. At the same time,
+confidential computing technologies such as Multi-Party Computation or
+Homomorphic encryption remain impractical for wide adoption. In this paper we
+take a different approach and investigate feasibility of ML-specific mechanisms
+that deter unauthorized model use by restricting the model to only be usable on
+specific hardware, making adoption on unauthorized hardware inconvenient. That
+way, even if IP is compromised, it cannot be trivially used without specialised
+hardware or major model adjustment. In a sense, we seek to enable cheap locking
+of machine learning models into specific hardware. We demonstrate that locking
+mechanisms are feasible by either targeting efficiency of model
+representations, such making models incompatible with quantisation, or tie the
+model's operation on specific characteristics of hardware, such as number of
+cycles for arithmetic operations. We demonstrate that locking comes with
+negligible work and latency overheads, while significantly restricting
+usability of the resultant model on unauthorized hardware.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>10 pages, 2 figures of main text; 14 pages, 16 figures of appendices</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Communication-Efficient Distributed Deep Learning via Federated Dynamic
+  Averaging 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20988v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20988v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Michail Theologitis, Georgios Frangias, Georgios Anestis, Vasilis Samoladas, Antonios Deligiannakis
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Driven by the ever-growing volume and decentralized nature of data, coupled
+with the escalating size of modern models, distributed deep learning (DDL) has
+been entrenched as the preferred paradigm for training. However, frequent
+synchronization of DL models, encompassing millions to many billions of
+parameters, creates a communication bottleneck, severely hindering scalability.
+Worse yet, DDL algorithms typically waste valuable bandwidth, and make
+themselves less practical in bandwidth-constrained federated settings, by
+relying on overly simplistic, periodic, and rigid synchronization schedules. To
+address these shortcomings, we propose Federated Dynamic Averaging (FDA), a
+communication-efficient DDL strategy that dynamically triggers synchronization
+based on the value of the model variance. Through extensive experiments across
+a wide range of learning tasks we demonstrate that FDA reduces communication
+cost by orders of magnitude, compared to both traditional and cutting-edge
+communication-efficient algorithms. Remarkably, FDA achieves this without
+sacrificing convergence speed - in stark contrast to the trade-offs encountered
+in the field. Additionally, we show that FDA maintains robust performance
+across diverse data heterogeneity settings.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Early Stopping Criteria for Training Generative Adversarial Networks in
+  Biomedical Imaging <span class="chip">SC 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20987v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20987v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Muhammad Muneeb Saad, Mubashir Husain Rehmani, Ruairi O'Reilly
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Generative Adversarial Networks (GANs) have high computational costs to train
+their complex architectures. Throughout the training process, GANs' output is
+analyzed qualitatively based on the loss and synthetic images' diversity and
+quality. Based on this qualitative analysis, training is manually halted once
+the desired synthetic images are generated. By utilizing an early stopping
+criterion, the computational cost and dependence on manual oversight can be
+reduced yet impacted by training problems such as mode collapse,
+non-convergence, and instability. This is particularly prevalent in biomedical
+imagery, where training problems degrade the diversity and quality of synthetic
+images, and the high computational cost associated with training makes complex
+architectures increasingly inaccessible. This work proposes a novel early
+stopping criteria to quantitatively detect training problems, halt training,
+and reduce the computational costs associated with synthesizing biomedical
+images. Firstly, the range of generator and discriminator loss values is
+investigated to assess whether mode collapse, non-convergence, and instability
+occur sequentially, concurrently, or interchangeably throughout the training of
+GANs. Secondly, utilizing these occurrences in conjunction with the Mean
+Structural Similarity Index (MS-SSIM) and Fr\'echet Inception Distance (FID)
+scores of synthetic images forms the basis of the proposed early stopping
+criteria. This work helps identify the occurrence of training problems in GANs
+using low-resource computational cost and reduces training time to generate
+diversified and high-quality synthetic images.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>This paper is accepted at the 35th IEEE Irish Signals and Systems
+  Conference (ISSC 2024)</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Uncertainty Quantification for Bird's Eye View Semantic Segmentation:
+  Methods and Benchmarks 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20986v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20986v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Linlin Yu, Bowen Yang, Tianhao Wang, Kangshuo Li, Feng Chen
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  The fusion of raw features from multiple sensors on an autonomous vehicle to
+create a Bird's Eye View (BEV) representation is crucial for planning and
+control systems. There is growing interest in using deep learning models for
+BEV semantic segmentation. Anticipating segmentation errors and improving the
+explainability of DNNs is essential for autonomous driving, yet it is
+under-studied. This paper introduces a benchmark for predictive uncertainty
+quantification in BEV segmentation. The benchmark assesses various approaches
+across three popular datasets using two representative backbones and focuses on
+the effectiveness of predicted uncertainty in identifying misclassified and
+out-of-distribution (OOD) pixels, as well as calibration. Empirical findings
+highlight the challenges in uncertainty quantification. Our results find that
+evidential deep learning based approaches show the most promise by efficiently
+quantifying aleatoric and epistemic uncertainty. We propose the
+Uncertainty-Focal-Cross-Entropy (UFCE) loss, designed for highly imbalanced
+data, which consistently improves the segmentation quality and calibration.
+Additionally, we introduce a vacuity-scaled regularization term that enhances
+the model's focus on high uncertainty pixels, improving epistemic uncertainty
+quantification.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Bayesian Design Principles for Offline-to-Online Reinforcement Learning <span class="chip">ICML</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20984v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20984v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Hao Hu, Yiqin Yang, Jianing Ye, Chengjie Wu, Ziqing Mai, Yujing Hu, Tangjie Lv, Changjie Fan, Qianchuan Zhao, Chongjie Zhang
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Offline reinforcement learning (RL) is crucial for real-world applications
+where exploration can be costly or unsafe. However, offline learned policies
+are often suboptimal, and further online fine-tuning is required. In this
+paper, we tackle the fundamental dilemma of offline-to-online fine-tuning: if
+the agent remains pessimistic, it may fail to learn a better policy, while if
+it becomes optimistic directly, performance may suffer from a sudden drop. We
+show that Bayesian design principles are crucial in solving such a dilemma.
+Instead of adopting optimistic or pessimistic policies, the agent should act in
+a way that matches its belief in optimal policies.
+  Such a probability-matching agent can avoid a sudden performance drop while
+still being guaranteed to find the optimal policy. Based on our theoretical
+findings, we introduce a novel algorithm that outperforms existing methods on
+various benchmarks, demonstrating the efficacy of our approach. Overall, the
+proposed approach provides a new perspective on offline-to-online RL that has
+the potential to enable more effective learning from offline data.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Forty-first International Conference on Machine Learning (ICML), 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Neural Gaussian Scale-Space Fields <span class="chip">SIGGRAPH 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20980v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20980v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Felix Mujkanovic, Ntumba Elie Nsampi, Christian Theobalt, Hans-Peter Seidel, Thomas Leimkühler
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Gaussian scale spaces are a cornerstone of signal representation and
+processing, with applications in filtering, multiscale analysis, anti-aliasing,
+and many more. However, obtaining such a scale space is costly and cumbersome,
+in particular for continuous representations such as neural fields. We present
+an efficient and lightweight method to learn the fully continuous, anisotropic
+Gaussian scale space of an arbitrary signal. Based on Fourier feature
+modulation and Lipschitz bounding, our approach is trained self-supervised,
+i.e., training does not require any manual filtering. Our neural Gaussian
+scale-space fields faithfully capture multiscale representations across a broad
+range of modalities, and support a diverse set of applications. These include
+images, geometry, light-stage data, texture anti-aliasing, and multiscale
+optimization.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>15 pages; SIGGRAPH 2024; project page at
+  https://neural-gaussian-scale-space-fields.mpi-inf.mpg.de</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ ACE: A Model Poisoning Attack on Contribution Evaluation Methods in
+  Federated Learning <span class="chip">USENIX Security</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20975v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20975v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Zhangchen Xu, Fengqing Jiang, Luyao Niu, Jinyuan Jia, Bo Li, Radha Poovendran
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  In Federated Learning (FL), a set of clients collaboratively train a machine
+learning model (called global model) without sharing their local training data.
+The local training data of clients is typically non-i.i.d. and heterogeneous,
+resulting in varying contributions from individual clients to the final
+performance of the global model. In response, many contribution evaluation
+methods were proposed, where the server could evaluate the contribution made by
+each client and incentivize the high-contributing clients to sustain their
+long-term participation in FL. Existing studies mainly focus on developing new
+metrics or algorithms to better measure the contribution of each client.
+However, the security of contribution evaluation methods of FL operating in
+adversarial environments is largely unexplored. In this paper, we propose the
+first model poisoning attack on contribution evaluation methods in FL, termed
+ACE. Specifically, we show that any malicious client utilizing ACE could
+manipulate the parameters of its local model such that it is evaluated to have
+a high contribution by the server, even when its local training data is indeed
+of low quality. We perform both theoretical analysis and empirical evaluations
+of ACE. Theoretically, we show our design of ACE can effectively boost the
+malicious client's perceived contribution when the server employs the
+widely-used cosine distance metric to measure contribution. Empirically, our
+results show ACE effectively and efficiently deceive five state-of-the-art
+contribution evaluation methods. In addition, ACE preserves the accuracy of the
+final global models on testing inputs. We also explore six countermeasures to
+defend ACE. Our results show they are inadequate to thwart ACE, highlighting
+the urgent need for new defenses to safeguard the contribution evaluation
+methods in FL.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>To appear in the 33rd USENIX Security Symposium, 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ SaySelf: Teaching LLMs to Express Confidence with Self-Reflective
+  Rationales 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20974v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20974v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Tianyang Xu, Shujin Wu, Shizhe Diao, Xiaoze Liu, Xingyao Wang, Yangyi Chen, Jing Gao
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Large language models (LLMs) often generate inaccurate or fabricated
+information and generally fail to indicate their confidence, which limits their
+broader applications. Previous work elicits confidence from LLMs by direct or
+self-consistency prompting, or constructing specific datasets for supervised
+finetuning. The prompting-based approaches have inferior performance, and the
+training-based approaches are limited to binary or inaccurate group-level
+confidence estimates. In this work, we present the advanced SaySelf, a training
+framework that teaches LLMs to express more accurate fine-grained confidence
+estimates. In addition, beyond the confidence scores, SaySelf initiates the
+process of directing LLMs to produce self-reflective rationales that clearly
+identify gaps in their parametric knowledge and explain their uncertainty. This
+is achieved by using an LLM to automatically summarize the uncertainties in
+specific knowledge via natural language. The summarization is based on the
+analysis of the inconsistency in multiple sampled reasoning chains, and the
+resulting data is utilized for supervised fine-tuning. Moreover, we utilize
+reinforcement learning with a meticulously crafted reward function to calibrate
+the confidence estimates, motivating LLMs to deliver accurate, high-confidence
+predictions and to penalize overconfidence in erroneous outputs. Experimental
+results in both in-distribution and out-of-distribution datasets demonstrate
+the effectiveness of SaySelf in reducing the confidence calibration error and
+maintaining the task performance. We show that the generated self-reflective
+rationales are reasonable and can further contribute to the calibration. The
+code is made public at \url{https://github.com/xu1868/SaySelf}.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>The code is available at \url{https://github.com/xu1868/SaySelf}</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ LCQ: Low-Rank Codebook based Quantization for Large Language Models 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20973v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20973v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Wen-Pu Cai, Wu-Jun Li
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Large language models~(LLMs) have recently demonstrated promising performance
+in many tasks. However, the high storage and computational cost of LLMs has
+become a challenge for deploying LLMs. Weight quantization has been widely used
+for model compression, which can reduce both storage and computational cost.
+Most existing weight quantization methods for LLMs use a rank-one codebook for
+quantization, which results in substantial accuracy loss when the compression
+ratio is high. In this paper, we propose a novel weight quantization method,
+called low-rank codebook based quantization~(LCQ), for LLMs. LCQ adopts a
+low-rank codebook, the rank of which can be larger than one, for quantization.
+Experiments show that LCQ can achieve better accuracy than existing methods
+with a negligibly extra storage cost.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>10 pages, 5 figures</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         <span class="highlight-title">★</span> Amortizing intractable inference in diffusion models for vision,
+  language, and control 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20971v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20971v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Siddarth Venkatraman, Moksh Jain, Luca Scimeca, Minsu Kim, Marcin Sendera, Mohsin Hasan, Luke Rowe, Sarthak Mittal, Pablo Lemos, Emmanuel Bengio, Alexandre Adam, Jarrid Rector-Brooks, <span class="highlight-author">Yoshua Bengio</span>, Glen Berseth, Nikolay Malkin
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Diffusion models have emerged as effective distribution estimators in vision,
+language, and reinforcement learning, but their use as priors in downstream
+tasks poses an intractable posterior inference problem. This paper studies
+amortized sampling of the posterior over data, $\mathbf{x}\sim p^{\rm
+post}(\mathbf{x})\propto p(\mathbf{x})r(\mathbf{x})$, in a model that consists
+of a diffusion generative model prior $p(\mathbf{x})$ and a black-box
+constraint or likelihood function $r(\mathbf{x})$. We state and prove the
+asymptotic correctness of a data-free learning objective, relative trajectory
+balance, for training a diffusion model that samples from this posterior, a
+problem that existing methods solve only approximately or in restricted cases.
+Relative trajectory balance arises from the generative flow network perspective
+on diffusion models, which allows the use of deep reinforcement learning
+techniques to improve mode coverage. Experiments illustrate the broad potential
+of unbiased inference of arbitrary posteriors under diffusion priors: in vision
+(classifier guidance), language (infilling under a discrete diffusion LLM), and
+multimodal data (text-to-image generation). Beyond generative modeling, we
+apply relative trajectory balance to the problem of continuous control with a
+score-based behavior prior, achieving state-of-the-art results on benchmarks in
+offline reinforcement learning.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Code: https://github.com/GFNOrg/diffusion-finetuning</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ PUAL: A Classifier on Trifurcate Positive-Unlabeled Data 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20970v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20970v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Xiaoke Wang, Xiaochen Yang, Rui Zhu, Jing-Hao Xue
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Positive-unlabeled (PU) learning aims to train a classifier using the data
+containing only labeled-positive instances and unlabeled instances. However,
+existing PU learning methods are generally hard to achieve satisfactory
+performance on trifurcate data, where the positive instances distribute on both
+sides of the negative instances. To address this issue, firstly we propose a PU
+classifier with asymmetric loss (PUAL), by introducing a structure of
+asymmetric loss on positive instances into the objective function of the global
+and local learning classifier. Then we develop a kernel-based algorithm to
+enable PUAL to obtain non-linear decision boundary. We show that, through
+experiments on both simulated and real-world datasets, PUAL can achieve
+satisfactory classification on trifurcate data.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>24 pages, 6 figures</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Aligning Multiclass Neural Network Classifier Criterion with Task
+  Performance via $F_β$-Score 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20954v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20954v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Nathan Tsoi, Deyuan Li, Taesoo Daniel Lee, Marynel Vázquez
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Multiclass neural network classifiers are typically trained using
+cross-entropy loss. Following training, the performance of this same neural
+network is evaluated using an application-specific metric based on the
+multiclass confusion matrix, such as the Macro $F_\beta$-Score. It is
+questionable whether the use of cross-entropy will yield a classifier that
+aligns with the intended application-specific performance criteria,
+particularly in scenarios where there is a need to emphasize one aspect of
+classifier performance. For example, if greater precision is preferred over
+recall, the $\beta$ value in the $F_\beta$ evaluation metric can be adjusted
+accordingly, but the cross-entropy objective remains unaware of this preference
+during training. We propose a method that addresses this training-evaluation
+gap for multiclass neural network classifiers such that users can train these
+models informed by the desired final $F_\beta$-Score. Following prior work in
+binary classification, we utilize the concepts of the soft-set confusion
+matrices and a piecewise-linear approximation of the Heaviside step function.
+Our method extends the $2 \times 2$ binary soft-set confusion matrix to a
+multiclass $d \times d$ confusion matrix and proposes dynamic adaptation of the
+threshold value $\tau$, which parameterizes the piecewise-linear Heaviside
+approximation during run-time. We present a theoretical analysis that shows
+that our method can be used to optimize for a soft-set based approximation of
+Macro-$F_\beta$ that is a consistent estimator of Macro-$F_\beta$, and our
+extensive experiments show the practical effectiveness of our approach.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Effective Interplay between Sparsity and Quantization: From Theory to
+  Practice 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20935v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20935v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Simla Burcu Harma, Ayan Chakraborty, Elizaveta Kostenok, Danila Mishin, Dongho Ha, Babak Falsafi, Martin Jaggi, Ming Liu, Yunho Oh, Suvinay Subramanian, Amir Yazdanbakhsh
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  The increasing size of deep neural networks necessitates effective model
+compression to improve computational efficiency and reduce their memory
+footprint. Sparsity and quantization are two prominent compression methods that
+have individually demonstrated significant reduction in computational and
+memory footprints while preserving model accuracy. While effective, the
+interplay between these two methods remains an open question. In this paper, we
+investigate the interaction between these two methods and assess whether their
+combination impacts final model accuracy. We mathematically prove that applying
+sparsity before quantization is the optimal sequence for these operations,
+minimizing error in computation. Our empirical studies across a wide range of
+models, including OPT and Llama model families (125M-8B) and ViT corroborate
+these theoretical findings. In addition, through rigorous analysis, we
+demonstrate that sparsity and quantization are not orthogonal; their
+interaction can significantly harm model accuracy, with quantization error
+playing a dominant role in this degradation. Our findings extend to the
+efficient deployment of large models in resource-limited compute platforms and
+reduce serving cost, offering insights into best practices for applying these
+compression methods to maximize efficacy without compromising accuracy.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Concentration Bounds for Optimized Certainty Equivalent Risk Estimation 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20933v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20933v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Ayon Ghosh, L. A. Prashanth, Krishna Jagannathan
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  We consider the problem of estimating the Optimized Certainty Equivalent
+(OCE) risk from independent and identically distributed (i.i.d.) samples. For
+the classic sample average approximation (SAA) of OCE, we derive mean-squared
+error as well as concentration bounds (assuming sub-Gaussianity). Further, we
+analyze an efficient stochastic approximation-based OCE estimator, and derive
+finite sample bounds for the same. To show the applicability of our bounds, we
+consider a risk-aware bandit problem, with OCE as the risk. For this problem,
+we derive bound on the probability of mis-identification. Finally, we conduct
+numerical experiments to validate the theoretical findings.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Learning to Estimate System Specifications in Linear Temporal Logic
+  using <span class="highlight-title">Transformer</span>s and Mamba 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20917v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20917v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        İlker Işık, Ebru Aydin Gol, Ramazan Gokberk Cinbis
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Temporal logic is a framework for representing and reasoning about
+propositions that evolve over time. It is commonly used for specifying
+requirements in various domains, including hardware and software systems, as
+well as robotics. Specification mining or formula generation involves
+extracting temporal logic formulae from system traces and has numerous
+applications, such as detecting bugs and improving interpretability. Although
+there has been a surge of deep learning-based methods for temporal logic
+satisfiability checking in recent years, the specification mining literature
+has been lagging behind in adopting deep learning methods despite their many
+advantages, such as scalability. In this paper, we introduce autoregressive
+models that can generate linear temporal logic formulae from traces, towards
+addressing the specification mining problem. We propose multiple architectures
+for this task: transformer encoder-decoder, decoder-only transformer, and
+Mamba, which is an emerging alternative to transformer models. Additionally, we
+devise a metric for quantifying the distinctiveness of the generated formulae
+and a straightforward algorithm to enforce the syntax constraints. Our
+experiments show that the proposed architectures yield promising results,
+generating correct and distinct formulae at a fraction of the compute cost
+needed for the combinatorial baseline.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>20 pages, 15 figures</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Fast yet Safe: Early-Exiting with Risk Control 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20915v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20915v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Metod Jazbec, Alexander Timans, Tin Hadži Veljković, Kaspar Sakmann, Dan Zhang, Christian A. Naesseth, Eric Nalisnick
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Scaling machine learning models significantly improves their performance.
+However, such gains come at the cost of inference being slow and
+resource-intensive. Early-exit neural networks (EENNs) offer a promising
+solution: they accelerate inference by allowing intermediate layers to exit and
+produce a prediction early. Yet a fundamental issue with EENNs is how to
+determine when to exit without severely degrading performance. In other words,
+when is it 'safe' for an EENN to go 'fast'? To address this issue, we
+investigate how to adapt frameworks of risk control to EENNs. Risk control
+offers a distribution-free, post-hoc solution that tunes the EENN's exiting
+mechanism so that exits only occur when the output is of sufficient quality. We
+empirically validate our insights on a range of vision and language tasks,
+demonstrating that risk control can produce substantial computational savings,
+all the while preserving user-specified performance goals.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>25 pages, 11 figures, 4 tables (incl. appendix)</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ VENI, VINDy, VICI: a variational reduced-order modeling framework with
+  uncertainty quantification 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20905v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20905v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Paolo Conti, Jonas Kneifl, Andrea Manzoni, Attilio Frangi, Jörg Fehr, Steven L. Brunton, J. Nathan Kutz
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  The simulation of many complex phenomena in engineering and science requires
+solving expensive, high-dimensional systems of partial differential equations
+(PDEs). To circumvent this, reduced-order models (ROMs) have been developed to
+speed up computations. However, when governing equations are unknown or
+partially known, typically ROMs lack interpretability and reliability of the
+predicted solutions.
+  In this work we present a data-driven, non-intrusive framework for building
+ROMs where the latent variables and dynamics are identified in an interpretable
+manner and uncertainty is quantified. Starting from a limited amount of
+high-dimensional, noisy data the proposed framework constructs an efficient ROM
+by leveraging variational autoencoders for dimensionality reduction along with
+a newly introduced, variational version of sparse identification of nonlinear
+dynamics (SINDy), which we refer to as Variational Identification of Nonlinear
+Dynamics (VINDy).
+  In detail, the method consists of Variational Encoding of Noisy Inputs (VENI)
+to identify the distribution of reduced coordinates. Simultaneously, we learn
+the distribution of the coefficients of a pre-determined set of candidate
+functions by VINDy. Once trained offline, the identified model can be queried
+for new parameter instances and new initial conditions to compute the
+corresponding full-time solutions. The probabilistic setup enables uncertainty
+quantification as the online testing consists of Variational Inference
+naturally providing Certainty Intervals (VICI). In this work we showcase the
+effectiveness of the newly proposed VINDy method in identifying interpretable
+and accurate dynamical system for the R\"ossler system with different noise
+intensities and sources. Then the performance of the overall method - named
+VENI, VINDy, VICI - is tested on PDE benchmarks including structural mechanics
+and fluid dynamics.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Sheaf HyperNetworks for Personalized Federated Learning 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20882v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20882v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Bao Nguyen, Lorenzo Sani, Xinchi Qiu, Pietro Liò, Nicholas D. Lane
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Graph hypernetworks (GHNs), constructed by combining graph neural networks
+(GNNs) with hypernetworks (HNs), leverage relational data across various
+domains such as neural architecture search, molecular property prediction and
+federated learning. Despite GNNs and HNs being individually successful, we show
+that GHNs present problems compromising their performance, such as
+over-smoothing and heterophily. Moreover, we cannot apply GHNs directly to
+personalized federated learning (PFL) scenarios, where a priori client relation
+graph may be absent, private, or inaccessible. To mitigate these limitations in
+the context of PFL, we propose a novel class of HNs, sheaf hypernetworks
+(SHNs), which combine cellular sheaf theory with HNs to improve parameter
+sharing for PFL. We thoroughly evaluate SHNs across diverse PFL tasks,
+including multi-class classification, traffic and weather forecasting.
+Additionally, we provide a methodology for constructing client relation graphs
+in scenarios where such graphs are unavailable. We show that SHNs consistently
+outperform existing PFL solutions in complex non-IID scenarios. While the
+baselines' performance fluctuates depending on the task, SHNs show improvements
+of up to 2.7% in accuracy and 5.3% in lower mean squared error over the
+best-performing baseline.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>25 pages, 12 figures, 7 tables, pre-print under review</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Flow matching achieves minimax optimal convergence 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20879v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20879v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Kenji Fukumizu, Taiji Suzuki, Noboru Isobe, Kazusato Oko, Masanori Koyama
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Flow matching (FM) has gained significant attention as a simulation-free
+generative model. Unlike diffusion models, which are based on stochastic
+differential equations, FM employs a simpler approach by solving an ordinary
+differential equation with an initial condition from a normal distribution,
+thus streamlining the sample generation process. This paper discusses the
+convergence properties of FM in terms of the $p$-Wasserstein distance, a
+measure of distributional discrepancy. We establish that FM can achieve the
+minmax optimal convergence rate for $1 \leq p \leq 2$, presenting the first
+theoretical evidence that FM can reach convergence rates comparable to those of
+diffusion models. Our analysis extends existing frameworks by examining a
+broader class of mean and variance functions for the vector fields and
+identifies specific conditions necessary to attain these optimal rates.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Waveform Design for Over-the-Air Computing 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20877v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20877v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Nikos G. Evgenidis, Nikos A. Mitsiou, Sotiris A. Tegos, Panagiotis D. Diamantoulakis, Panagiotis Sarigiannidis, Ioannis T. Rekanos, George K. Karagiannidis
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  In response to the increasing number of devices anticipated in
+next-generation networks, a shift toward over-the-air (OTA) computing has been
+proposed. Leveraging the superposition of multiple access channels, OTA
+computing enables efficient resource management by supporting simultaneous
+uncoded transmission in the time and the frequency domain. Thus, to advance the
+integration of OTA computing, our study presents a theoretical analysis
+addressing practical issues encountered in current digital communication
+transceivers, such as time sampling error and intersymbol interference (ISI).
+To this end, we examine the theoretical mean squared error (MSE) for OTA
+transmission under time sampling error and ISI, while also exploring methods
+for minimizing the MSE in the OTA transmission. Utilizing alternating
+optimization, we also derive optimal power policies for both the devices and
+the base station. Additionally, we propose a novel deep neural network
+(DNN)-based approach to design waveforms enhancing OTA transmission performance
+under time sampling error and ISI. To ensure fair comparison with existing
+waveforms like the raised cosine (RC) and the better-than-raised-cosine (BRTC),
+we incorporate a custom loss function integrating energy and bandwidth
+constraints, along with practical design considerations such as waveform
+symmetry. Simulation results validate our theoretical analysis and demonstrate
+performance gains of the designed pulse over RC and BTRC waveforms. To
+facilitate testing of our results without necessitating the DNN structure
+recreation, we provide curve fitting parameters for select DNN-based waveforms
+as well.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>14 pages</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Enhancing Efficiency of Safe Reinforcement Learning via Sample
+  Manipulation 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20860v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20860v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Shangding Gu, Laixi Shi, Yuhao Ding, Alois Knoll, Costas Spanos, Adam Wierman, Ming Jin
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Safe reinforcement learning (RL) is crucial for deploying RL agents in
+real-world applications, as it aims to maximize long-term rewards while
+satisfying safety constraints. However, safe RL often suffers from sample
+inefficiency, requiring extensive interactions with the environment to learn a
+safe policy. We propose Efficient Safe Policy Optimization (ESPO), a novel
+approach that enhances the efficiency of safe RL through sample manipulation.
+ESPO employs an optimization framework with three modes: maximizing rewards,
+minimizing costs, and balancing the trade-off between the two. By dynamically
+adjusting the sampling process based on the observed conflict between reward
+and safety gradients, ESPO theoretically guarantees convergence, optimization
+stability, and improved sample complexity bounds. Experiments on the
+Safety-MuJoCo and Omnisafe benchmarks demonstrate that ESPO significantly
+outperforms existing primal-based and primal-dual-based baselines in terms of
+reward maximization and constraint satisfaction. Moreover, ESPO achieves
+substantial gains in sample efficiency, requiring 25--29% fewer samples than
+baselines, and reduces training time by 21--38%.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ SLIM: a Scalable Light-weight Root Cause Analysis for Imbalanced Data in
+  Microservice 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20848v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20848v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Rui Ren, Jingbang Yang, Linxiao Yang, Xinyue Gu, Liang Sun
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  The newly deployed service -- one kind of change service, could lead to a new
+type of minority fault. Existing state-of-the-art methods for fault
+localization rarely consider the imbalanced fault classification in change
+service. This paper proposes a novel method that utilizes decision rule sets to
+deal with highly imbalanced data by optimizing the F1 score subject to
+cardinality constraints. The proposed method greedily generates the rule with
+maximal marginal gain and uses an efficient minorize-maximization (MM) approach
+to select rules iteratively, maximizing a non-monotone submodular lower bound.
+Compared with existing fault localization algorithms, our algorithm can adapt
+to the imbalanced fault scenario of change service, and provide interpretable
+fault causes which are easy to understand and verify. Our method can also be
+deployed in the online training setting, with only about 15% training overhead
+compared to the current SOTA methods. Empirical studies showcase that our
+algorithm outperforms existing fault localization algorithms in both accuracy
+and model interpretability.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ einspace: Searching for Neural Architectures from Fundamental Operations 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20838v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20838v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Linus Ericsson, Miguel Espinosa, Chenhongyi Yang, Antreas Antoniou, Amos Storkey, Shay B. Cohen, Steven McDonagh, Elliot J. Crowley
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Neural architecture search (NAS) finds high performing networks for a given
+task. Yet the results of NAS are fairly prosaic; they did not e.g. create a
+shift from convolutional structures to transformers. This is not least because
+the search spaces in NAS often aren't diverse enough to include such
+transformations a priori. Instead, for NAS to provide greater potential for
+fundamental design shifts, we need a novel expressive search space design which
+is built from more fundamental operations. To this end, we introduce einspace,
+a search space based on a parameterised probabilistic context-free grammar. Our
+space is versatile, supporting architectures of various sizes and complexities,
+while also containing diverse network operations which allow it to model
+convolutions, attention components and more. It contains many existing
+competitive architectures, and provides flexibility for discovering new ones.
+Using this search space, we perform experiments to find novel architectures as
+well as improvements on existing ones on the diverse Unseen NAS datasets. We
+show that competitive architectures can be obtained by searching from scratch,
+and we consistently find large improvements when initialising the search with
+strong baselines. We believe that this work is an important advancement towards
+a transformative NAS paradigm where search space expressivity and strategic
+search initialisation play key roles.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Project page at https://linusericsson.github.io/einspace/</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Solving partial differential equations with sampled neural networks 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20836v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20836v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Chinmay Datar, Taniya Kapoor, Abhishek Chandra, Qing Sun, Iryna Burak, Erik Lien Bolager, Anna Veselovska, Massimo Fornasier, Felix Dietrich
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Approximation of solutions to partial differential equations (PDE) is an
+important problem in computational science and engineering. Using neural
+networks as an ansatz for the solution has proven a challenge in terms of
+training time and approximation accuracy. In this contribution, we discuss how
+sampling the hidden weights and biases of the ansatz network from data-agnostic
+and data-dependent probability distributions allows us to progress on both
+challenges. In most examples, the random sampling schemes outperform iterative,
+gradient-based optimization of physics-informed neural networks regarding
+training time and accuracy by several orders of magnitude. For time-dependent
+PDE, we construct neural basis functions only in the spatial domain and then
+solve the associated ordinary differential equation with classical methods from
+scientific computing over a long time horizon. This alleviates one of the
+greatest challenges for neural PDE solvers because it does not require us to
+parameterize the solution in time. For second-order elliptic PDE in Barron
+spaces, we prove the existence of sampled networks with $L^2$ convergence to
+the solution. We demonstrate our approach on several time-dependent and static
+PDEs. We also illustrate how sampled networks can effectively solve inverse
+problems in this setting. Benefits compared to common numerical schemes include
+spectral convergence and mesh-free construction of basis functions.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>16 pages, 15 figures</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Outliers and Calibration Sets have Diminishing Effect on Quantization of
+  Modern LLMs 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20835v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20835v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Davide Paglieri, Saurabh Dash, Tim Rocktäschel, Jack Parker-Holder
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Post-Training Quantization (PTQ) enhances the efficiency of Large Language
+Models (LLMs) by enabling faster operation and compatibility with more
+accessible hardware through reduced memory usage, at the cost of small
+performance drops. We explore the role of calibration sets in PTQ, specifically
+their effect on hidden activations in various notable open-source LLMs.
+Calibration sets are crucial for evaluating activation magnitudes and
+identifying outliers, which can distort the quantization range and negatively
+impact performance. Our analysis reveals a marked contrast in quantization
+effectiveness across models. The older OPT model, which much of the
+quantization literature is based on, shows significant performance
+deterioration and high susceptibility to outliers with varying calibration
+sets. In contrast, newer models like Llama-2 7B, Llama-3 8B, Command-R 35B, and
+Mistral 7B demonstrate strong robustness, with Mistral 7B showing near-immunity
+to outliers and stable activations. These findings suggest a shift in PTQ
+strategies might be needed. As advancements in pre-training methods reduce the
+relevance of outliers, there is an emerging need to reassess the fundamentals
+of current quantization literature. The emphasis should pivot towards
+optimizing inference speed, rather than primarily focusing on outlier
+preservation, to align with the evolving characteristics of state-of-the-art
+LLMs.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Self-Augmented Preference Optimization: Off-Policy Paradigms for
+  Language Model Alignment 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20830v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20830v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Yueqin Yin, Zhendong Wang, Yujia Xie, Weizhu Chen, Mingyuan Zhou
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Traditional language model alignment methods, such as Direct Preference
+Optimization (DPO), are limited by their dependence on static, pre-collected
+paired preference data, which hampers their adaptability and practical
+applicability. To overcome this limitation, we introduce Self-Augmented
+Preference Optimization (SAPO), an effective and scalable training paradigm
+that does not require existing paired data. Building on the self-play concept,
+which autonomously generates negative responses, we further incorporate an
+off-policy learning pipeline to enhance data exploration and exploitation.
+Specifically, we employ an Exponential Moving Average (EMA) model in
+conjunction with a replay buffer to enable dynamic updates of response
+segments, effectively integrating real-time feedback with insights from
+historical data. Our comprehensive evaluations of the LLaMA3-8B and Mistral-7B
+models across benchmarks, including the Open LLM Leaderboard, IFEval,
+AlpacaEval 2.0, and MT-Bench, demonstrate that SAPO matches or surpasses
+established offline contrastive baselines, such as DPO and Odds Ratio
+Preference Optimization, and outperforms offline self-play methods like SPIN.
+Our code is available at https://github.com/yinyueqin/SAPO
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Rethinking Open-World Semi-Supervised Learning: Distribution Mismatch
+  and Inductive Inference <span class="chip">CVPR</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20829v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20829v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Seongheon Park, Hyuk Kwon, Kwanghoon Sohn, Kibok Lee
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Open-world semi-supervised learning (OWSSL) extends conventional
+semi-supervised learning to open-world scenarios by taking account of novel
+categories in unlabeled datasets. Despite the recent advancements in OWSSL, the
+success often relies on the assumptions that 1) labeled and unlabeled datasets
+share the same balanced class prior distribution, which does not generally hold
+in real-world applications, and 2) unlabeled training datasets are utilized for
+evaluation, where such transductive inference might not adequately address
+challenges in the wild. In this paper, we aim to generalize OWSSL by addressing
+them. Our work suggests that practical OWSSL may require different training
+settings, evaluation methods, and learning strategies compared to those
+prevalent in the existing literature.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>CVPR Workshop on Computer Vision in the Wild (CVinW), 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Analysis of clinical, dosimetric and radiomic features for predicting
+  local failure after stereotactic radiotherapy of brain metastases in
+  malignant melanoma 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20825v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20825v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Nanna E. Hartong, Ilias Sachpazidis, Oliver Blanck, Lucas Etzel, Jan C. Peeken, Stephanie E. Combs, Horst Urbach, Maxim Zaitsev, Dimos Baltas, Ilinca Popp, Anca-Ligia Grosu, Tobias Fechter
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Background: The aim of this study was to investigate the role of clinical,
+dosimetric and pretherapeutic magnetic resonance imaging (MRI) features for
+lesion-specific outcome prediction of stereotactic radiotherapy (SRT) in
+patients with brain metastases from malignant melanoma (MBM).
+  Methods: In this multicenter, retrospective analysis, we reviewed 517 MBM
+from 130 patients treated with SRT (single fraction or hypofractionated). For
+each gross tumor volume (GTV) 1576 radiomic features (RF) were calculated (788
+each for the GTV and for a 3 mm margin around the GTV). Clinical parameters,
+radiation dose and RF from pretherapeutic contrast-enhanced T1-weighted MRI
+from different institutions were evaluated with a feature processing and
+elimination pipeline in a nested cross-validation scheme.
+  Results: Seventy-two (72) of 517 lesions (13.9%) showed a local failure (LF)
+after SRT. The processing pipeline showed clinical, dosimetric and radiomic
+features providing information for LF prediction. The most prominent ones were
+the correlation of the gray level co-occurrence matrix of the margin (hazard
+ratio (HR): 0.37, confidence interval (CI): 0.23-0.58) and systemic therapy
+before SRT (HR: 0.55, CI: 0.42-0.70). The majority of RF associated with LF was
+calculated in the margin around the GTV.
+  Conclusions: Pretherapeutic MRI based RF connected with lesion-specific
+outcome after SRT could be identified, despite multicentric data and minor
+differences in imaging protocols. Image data analysis of the surrounding
+metastatic environment may provide therapy-relevant information with the
+potential to further individualize radiotherapy strategies.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Online Convex Optimisation: The Optimal Switching Regret for all
+  Segmentations Simultaneously 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20824v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20824v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Stephen Pasteris, Chris Hicks, Vasilios Mavroudis, Mark Herbster
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  We consider the classic problem of online convex optimisation. Whereas the
+notion of static regret is relevant for stationary problems, the notion of
+switching regret is more appropriate for non-stationary problems. A switching
+regret is defined relative to any segmentation of the trial sequence, and is
+equal to the sum of the static regrets of each segment. In this paper we show
+that, perhaps surprisingly, we can achieve the asymptotically optimal switching
+regret on every possible segmentation simultaneously. Our algorithm for doing
+so is very efficient: having a space and per-trial time complexity that is
+logarithmic in the time-horizon. Our algorithm also obtains novel bounds on its
+dynamic regret: being adaptive to variations in the rate of change of the
+comparator sequence.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Pursuing Overall Welfare in Federated Learning through Sequential
+  Decision Making <span class="chip">ICML 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20821v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20821v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Seok-Ju Hahn, Gi-Soo Kim, Junghye Lee
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  In traditional federated learning, a single global model cannot perform
+equally well for all clients. Therefore, the need to achieve the client-level
+fairness in federated system has been emphasized, which can be realized by
+modifying the static aggregation scheme for updating the global model to an
+adaptive one, in response to the local signals of the participating clients.
+Our work reveals that existing fairness-aware aggregation strategies can be
+unified into an online convex optimization framework, in other words, a central
+server's sequential decision making process. To enhance the decision making
+capability, we propose simple and intuitive improvements for suboptimal designs
+within existing methods, presenting AAggFF. Considering practical requirements,
+we further subdivide our method tailored for the cross-device and the
+cross-silo settings, respectively. Theoretical analyses guarantee sublinear
+regret upper bounds for both settings: $\mathcal{O}(\sqrt{T \log{K}})$ for the
+cross-device setting, and $\mathcal{O}(K \log{T})$ for the cross-silo setting,
+with $K$ clients and $T$ federation rounds. Extensive experiments demonstrate
+that the federated system equipped with AAggFF achieves better degree of
+client-level fairness than existing methods in both practical settings. Code is
+available at https://github.com/vaseline555/AAggFF
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted at ICML 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Optimally Improving Cooperative Learning in a Social Setting 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20808v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20808v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Shahrzad Haddadan, Cheng Xin, Jie Gao
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  We consider a cooperative learning scenario where a collection of networked
+agents with individually owned classifiers dynamically update their
+predictions, for the same classification task, through communication or
+observations of each other's predictions. Clearly if highly influential
+vertices use erroneous classifiers, there will be a negative effect on the
+accuracy of all the agents in the network. We ask the following question: how
+can we optimally fix the prediction of a few classifiers so as maximize the
+overall accuracy in the entire network. To this end we consider an aggregate
+and an egalitarian objective function. We show a polynomial time algorithm for
+optimizing the aggregate objective function, and show that optimizing the
+egalitarian objective function is NP-hard. Furthermore, we develop
+approximation algorithms for the egalitarian improvement. The performance of
+all of our algorithms are guaranteed by mathematical analysis and backed by
+experiments on synthetic and real data.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Shape Constraints in Symbolic Regression using Penalized Least Squares 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20800v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20800v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Viktor Martinek, Julia Reuter, Ophelia Frotscher, Sanaz Mostaghim, Markus Richter, Roland Herzog
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  We study the addition of shape constraints and their consideration during the
+parameter estimation step of symbolic regression (SR). Shape constraints serve
+as a means to introduce prior knowledge about the shape of the otherwise
+unknown model function into SR. Unlike previous works that have explored shape
+constraints in SR, we propose minimizing shape constraint violations during
+parameter estimation using gradient-based numerical optimization.
+  We test three algorithm variants to evaluate their performance in identifying
+three symbolic expressions from a synthetically generated data set. This paper
+examines two benchmark scenarios: one with varying noise levels and another
+with reduced amounts of training data. The results indicate that incorporating
+shape constraints into the expression search is particularly beneficial when
+data is scarce. Compared to using shape constraints only in the selection
+process, our approach of minimizing violations during parameter estimation
+shows a statistically significant benefit in some of our test cases, without
+being significantly worse in any instance.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Rough <span class="highlight-title">Transformer</span>s: Lightweight Continuous-Time Sequence Modelling with
+  Path Signatures 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20799v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20799v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Fernando Moreno-Pino, Álvaro Arroyo, Harrison Waldon, Xiaowen Dong, Álvaro Cartea
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Time-series data in real-world settings typically exhibit long-range
+dependencies and are observed at non-uniform intervals. In these settings,
+traditional sequence-based recurrent models struggle. To overcome this,
+researchers often replace recurrent architectures with Neural ODE-based models
+to account for irregularly sampled data and use Transformer-based architectures
+to account for long-range dependencies. Despite the success of these two
+approaches, both incur very high computational costs for input sequences of
+even moderate length. To address this challenge, we introduce the Rough
+Transformer, a variation of the Transformer model that operates on
+continuous-time representations of input sequences and incurs significantly
+lower computational costs. In particular, we propose \textit{multi-view
+signature attention}, which uses path signatures to augment vanilla attention
+and to capture both local and global (multi-scale) dependencies in the input
+data, while remaining robust to changes in the sequence length and sampling
+frequency and yielding improved spatial processing. We find that, on a variety
+of time-series-related tasks, Rough Transformers consistently outperform their
+vanilla attention counterparts while obtaining the representational benefits of
+Neural ODE-based models, all at a fraction of the computational time and memory
+resources.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Preprint. Under review. arXiv admin note: text overlap with
+  arXiv:2403.10288</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Ovis: Structural Embedding Alignment for Multimodal Large Language Model 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20797v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20797v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Shiyin Lu, Yang Li, Qing-Guo Chen, Zhao Xu, Weihua Luo, Kaifu Zhang, Han-Jia Ye
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Current Multimodal Large Language Models (MLLMs) typically integrate a
+pre-trained LLM with another pre-trained vision transformer through a
+connector, such as an MLP, endowing the LLM with visual capabilities. However,
+the misalignment between two embedding strategies in MLLMs -- the structural
+textual embeddings based on an embedding look-up table and the continuous
+embeddings generated directly by the vision encoder -- makes challenges for a
+more seamless fusion of visual and textual information. We propose Ovis, a
+novel MLLM architecture designed to structurally align visual and textual
+embeddings. Ovis integrates an additional learnable visual embedding table into
+the visual encoder's process. To capture rich visual semantics, each image
+patch indexes the visual embedding table multiple times, resulting in a final
+visual embedding that is a probabilistic combination of the indexed embeddings.
+This structural approach mirrors the method used for generating textual
+embeddings. Empirical evaluations on various multimodal benchmarks demonstrate
+that Ovis outperforms open-source MLLMs of similar parameter scales and even
+surpasses the proprietary model Qwen-VL-Plus overall. These results highlight
+the potential of Ovis' structured visual representation for advancing MLLM
+architectural design and promoting more effective multimodal learning. Both the
+source code and the training dataset of Ovis will be made publicly available.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Model Interpretation and Explainability: Towards Creating Transparency
+  in Prediction Models 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20794v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20794v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Donald Kridel, Jacob Dineen, Daniel Dolk, David Castillo
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Explainable AI (XAI) has a counterpart in analytical modeling which we refer
+to as model explainability. We tackle the issue of model explainability in the
+context of prediction models. We analyze a dataset of loans from a credit card
+company and apply three stages: execute and compare four different prediction
+methods, apply the best known explainability techniques in the current
+literature to the model training sets to identify feature importance (FI)
+(static case), and finally to cross-check whether the FI set holds up under
+what if prediction scenarios for continuous and categorical variables (dynamic
+case). We found inconsistency in FI identification between the static and
+dynamic cases. We summarize the state of the art in model explainability and
+suggest further research to advance the field.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ GS-Phong: Meta-Learned 3D Gaussians for Relightable Novel View Synthesis 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20791v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20791v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Yumeng He, Yunbo Wang, Xiaokang Yang
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Decoupling the illumination in 3D scenes is crucial for novel view synthesis
+and relighting. In this paper, we propose a novel method for representing a
+scene illuminated by a point light using a set of relightable 3D Gaussian
+points. Inspired by the Blinn-Phong model, our approach decomposes the scene
+into ambient, diffuse, and specular components, enabling the synthesis of
+realistic lighting effects. To facilitate the decomposition of geometric
+information independent of lighting conditions, we introduce a novel bilevel
+optimization-based meta-learning framework. The fundamental idea is to view the
+rendering tasks under various lighting positions as a multi-task learning
+problem, which our meta-learning approach effectively addresses by generalizing
+the learned Gaussian geometries not only across different viewpoints but also
+across diverse light positions. Experimental results demonstrate the
+effectiveness of our approach in terms of training efficiency and rendering
+quality compared to existing methods for free-viewpoint relighting.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Intersectional Unfairness Discovery <span class="chip">ICML-2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20790v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20790v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Gezheng Xu, Qi Chen, Charles Ling, Boyu Wang, Changjian Shui
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  AI systems have been shown to produce unfair results for certain subgroups of
+population, highlighting the need to understand bias on certain sensitive
+attributes. Current research often falls short, primarily focusing on the
+subgroups characterized by a single sensitive attribute, while neglecting the
+nature of intersectional fairness of multiple sensitive attributes. This paper
+focuses on its one fundamental aspect by discovering diverse high-bias
+subgroups under intersectional sensitive attributes. Specifically, we propose a
+Bias-Guided Generative Network (BGGN). By treating each bias value as a reward,
+BGGN efficiently generates high-bias intersectional sensitive attributes.
+Experiments on real-world text and image datasets demonstrate a diverse and
+efficient discovery of BGGN. To further evaluate the generated unseen but
+possible unfair intersectional sensitive attributes, we formulate them as
+prompts and use modern generative AI to produce new texts and images. The
+results of frequently generating biased data provides new insights of
+discovering potential unfairness in popular modern generative AI systems.
+Warning: This paper contains generative examples that are offensive in nature.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>ICML-2024 Camera-ready</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Reinforcement Learning for Sociohydrology 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20772v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20772v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Tirthankar Roy, Shivendra Srivastava, Beichen Zhang
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  In this study, we discuss how reinforcement learning (RL) provides an
+effective and efficient framework for solving sociohydrology problems. The
+efficacy of RL for these types of problems is evident because of its ability to
+update policies in an iterative manner - something that is also foundational to
+sociohydrology, where we are interested in representing the co-evolution of
+human-water interactions. We present a simple case study to demonstrate the
+implementation of RL in a problem of runoff reduction through management
+decisions related to changes in land-use land-cover (LULC). We then discuss the
+benefits of RL for these types of problems and share our perspectives on the
+future research directions in this area.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Improving Generalization and Convergence by Enhancing Implicit
+  Regularization 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20763v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20763v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Mingze Wang, Haotian He, Jinbo Wang, Zilin Wang, Guanhua Huang, Feiyu Xiong, Zhiyu Li, Weinan E, Lei Wu
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  In this work, we propose an Implicit Regularization Enhancement (IRE)
+framework to accelerate the discovery of flat solutions in deep learning,
+thereby improving generalization and convergence. Specifically, IRE decouples
+the dynamics of flat and sharp directions, which boosts the sharpness reduction
+along flat directions while maintaining the training stability in sharp
+directions. We show that IRE can be practically incorporated with {\em generic
+base optimizers} without introducing significant computational overload.
+Experiments show that IRE consistently improves the generalization performance
+for image classification tasks across a variety of benchmark datasets
+(CIFAR-10/100, ImageNet) and models (ResNets and ViTs). Surprisingly, IRE also
+achieves a $2\times$ {\em speed-up} compared to AdamW in the pre-training of
+Llama models (of sizes ranging from 60M to 229M) on datasets including
+Wikitext-103, Minipile, and Openwebtext. Moreover, we provide theoretical
+guarantees, showing that IRE can substantially accelerate the convergence
+towards flat minima in Sharpness-aware Minimization (SAM).
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>35 pages</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Share Your Secrets for Privacy! Confidential Forecasting with Vertical
+  Federated Learning <span class="chip">ECAI 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20761v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20761v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Aditya Shankar, Lydia Y. Chen, Jérémie Decouchant, Dimitra Gkorou, Rihan Hai
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Vertical federated learning (VFL) is a promising area for time series
+forecasting in industrial applications, such as predictive maintenance and
+machine control. Critical challenges to address in manufacturing include data
+privacy and over-fitting on small and noisy datasets during both training and
+inference. Additionally, to increase industry adaptability, such forecasting
+models must scale well with the number of parties while ensuring strong
+convergence and low-tuning complexity. We address those challenges and propose
+'Secret-shared Time Series Forecasting with VFL' (STV), a novel framework that
+exhibits the following key features: i) a privacy-preserving algorithm for
+forecasting with SARIMAX and autoregressive trees on vertically partitioned
+data; ii) serverless forecasting using secret sharing and multi-party
+computation; iii) novel N-party algorithms for matrix multiplication and
+inverse operations for direct parameter optimization, giving strong convergence
+with minimal hyperparameter tuning complexity. We conduct evaluations on six
+representative datasets from public and industry-specific contexts. Our results
+demonstrate that STV's forecasting accuracy is comparable to those of
+centralized approaches. They also show that our direct optimization can
+outperform centralized methods, which include state-of-the-art diffusion models
+and long-short-term memory, by 23.81% on forecasting accuracy. We also conduct
+a scalability analysis by examining the communication costs of direct and
+iterative optimization to navigate the choice between the two. Code and
+appendix are available: https://github.com/adis98/STV
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Submitted to the 27TH EUROPEAN CONFERENCE ON ARTIFICIAL INTELLIGENCE
+  (ECAI 2024)</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Information Theoretic Text-to-Image Alignment 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20759v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20759v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Chao Wang, Giulio Franzese, Alessandro Finamore, Massimo Gallo, Pietro Michiardi
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Diffusion models for Text-to-Image (T2I) conditional generation have seen
+tremendous success recently. Despite their success, accurately capturing user
+intentions with these models still requires a laborious trial and error
+process. This challenge is commonly identified as a model alignment problem, an
+issue that has attracted considerable attention by the research community.
+Instead of relying on fine-grained linguistic analyses of prompts, human
+annotation, or auxiliary vision-language models to steer image generation, in
+this work we present a novel method that relies on an information-theoretic
+alignment measure. In a nutshell, our method uses self-supervised fine-tuning
+and relies on point-wise mutual information between prompts and images to
+define a synthetic training set to induce model alignment. Our comparative
+analysis shows that our method is on-par or superior to the state-of-the-art,
+yet requires nothing but a pre-trained denoising network to estimate MI and a
+lightweight fine-tuning strategy.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ OpenTensor: Reproducing Faster Matrix Multiplication Discovering
+  Algorithms 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20748v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20748v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Yiwen Sun, Wenye Li
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  OpenTensor is a reproduction of AlphaTensor, which discovered a new algorithm
+that outperforms the state-of-the-art methods for matrix multiplication by Deep
+Reinforcement Learning (DRL). While AlphaTensor provides a promising framework
+for solving scientific problems, it is really hard to reproduce due to the
+massive tricks and lack of source codes. In this paper, we clean up the
+algorithm pipeline, clarify the technical details, and make some improvements
+to the training process. Computational results show that OpenTensor can
+successfully find efficient matrix multiplication algorithms.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Trajectory Forecasting through Low-Rank Adaptation of Discrete Latent
+  Codes 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20743v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20743v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Riccardo Benaglia, Angelo Porrello, Pietro Buzzega, Simone Calderara, Rita Cucchiara
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Trajectory forecasting is crucial for video surveillance analytics, as it
+enables the anticipation of future movements for a set of agents, e.g.
+basketball players engaged in intricate interactions with long-term intentions.
+Deep generative models offer a natural learning approach for trajectory
+forecasting, yet they encounter difficulties in achieving an optimal balance
+between sampling fidelity and diversity. We address this challenge by
+leveraging Vector Quantized Variational Autoencoders (VQ-VAEs), which utilize a
+discrete latent space to tackle the issue of posterior collapse. Specifically,
+we introduce an instance-based codebook that allows tailored latent
+representations for each example. In a nutshell, the rows of the codebook are
+dynamically adjusted to reflect contextual information (i.e., past motion
+patterns extracted from the observed trajectories). In this way, the
+discretization process gains flexibility, leading to improved reconstructions.
+Notably, instance-level dynamics are injected into the codebook through
+low-rank updates, which restrict the customization of the codebook to a lower
+dimension space. The resulting discrete space serves as the basis of the
+subsequent step, which regards the training of a diffusion-based predictive
+model. We show that such a two-fold framework, augmented with instance-level
+discretization, leads to accurate and diverse forecasts, yielding
+state-of-the-art performance on three established benchmarks.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>15 pages, 3 figures, 5 tables</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Federated Random Forest for Partially Overlapping Clinical Data 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20738v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20738v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Youngjun Park, Cord Eric Schmidt, Benedikt Marcel Batton, Anne-Christin Hauschild
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  In the healthcare sector, a consciousness surrounding data privacy and
+corresponding data protection regulations, as well as heterogeneous and
+non-harmonized data, pose huge challenges to large-scale data analysis.
+Moreover, clinical data often involves partially overlapping features, as some
+observations may be missing due to various reasons, such as differences in
+procedures, diagnostic tests, or other recorded patient history information
+across hospitals or institutes. To address the challenges posed by partially
+overlapping features and incomplete data in clinical datasets, a comprehensive
+approach is required. Particularly in the domain of medical data, promising
+outcomes are achieved by federated random forests whenever features align.
+However, for most standard algorithms, like random forest, it is essential that
+all data sets have identical parameters. Therefore, in this work the concept of
+federated random forest is adapted to a setting with partially overlapping
+features. Moreover, our research assesses the effectiveness of the newly
+developed federated random forest models for partially overlapping clinical
+data. For aggregating the federated, globally optimized model, only features
+available locally at each site can be used. We tackled two issues in
+federation: (i) the quantity of involved parties, (ii) the varying overlap of
+features. This evaluation was conducted across three clinical datasets. The
+federated random forest model even in cases where only a subset of features
+overlaps consistently demonstrates superior performance compared to its local
+counterpart. This holds true across various scenarios, including datasets with
+imbalanced classes. Consequently, federated random forests for partially
+overlapped data offer a promising solution to transcend barriers in
+collaborative research and corporate cooperation.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Maximum Temperature Prediction Using Remote Sensing Data Via
+  Convolutional Neural Network 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20731v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20731v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Lorenzo Innocenti, Giacomo Blanco, Luca Barco, Claudio Rossi
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Urban heat islands, defined as specific zones exhibiting substantially higher
+temperatures than their immediate environs, pose significant threats to
+environmental sustainability and public health. This study introduces a novel
+machine-learning model that amalgamates data from the Sentinel-3 satellite,
+meteorological predictions, and additional remote sensing inputs. The primary
+aim is to generate detailed spatiotemporal maps that forecast the peak
+temperatures within a 24-hour period in Turin. Experimental results validate
+the model's proficiency in predicting temperature patterns, achieving a Mean
+Absolute Error (MAE) of 2.09 degrees Celsius for the year 2023 at a resolution
+of 20 meters per pixel, thereby enriching our knowledge of urban climatic
+behavior. This investigation enhances the understanding of urban microclimates,
+emphasizing the importance of cross-disciplinary data integration, and laying
+the groundwork for informed policy-making aimed at alleviating the negative
+impacts of extreme urban temperatures.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>4 pages, submitted to IEEE MetroLivEnv 2024 conference</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Learning on Large Graphs using Intersecting Communities 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20724v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20724v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Ben Finkelshtein, İsmail İlkan Ceylan, Michael Bronstein, Ron Levie
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Message Passing Neural Networks (MPNNs) are a staple of graph machine
+learning. MPNNs iteratively update each node's representation in an input graph
+by aggregating messages from the node's neighbors, which necessitates a memory
+complexity of the order of the number of graph edges. This complexity might
+quickly become prohibitive for large graphs provided they are not very sparse.
+In this paper, we propose a novel approach to alleviate this problem by
+approximating the input graph as an intersecting community graph (ICG) -- a
+combination of intersecting cliques. The key insight is that the number of
+communities required to approximate a graph does not depend on the graph size.
+We develop a new constructive version of the Weak Graph Regularity Lemma to
+efficiently construct an approximating ICG for any input graph. We then devise
+an efficient graph learning algorithm operating directly on ICG in linear
+memory and time with respect to the number of nodes (rather than edges). This
+offers a new and fundamentally different pipeline for learning on very large
+non-sparse graphs, whose applicability is demonstrated empirically on node
+classification tasks and spatio-temporal data processing.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Cyclic image generation using chaotic dynamics 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20717v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20717v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Takaya Tanaka, Yutaka Yamaguti
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Successive image generation using cyclic transformations is demonstrated by
+extending the CycleGAN model to transform images among three different
+categories. Repeated application of the trained generators produces sequences
+of images that transition among the different categories. The generated image
+sequences occupy a more limited region of the image space compared with the
+original training dataset. Quantitative evaluation using precision and recall
+metrics indicates that the generated images have high quality but reduced
+diversity relative to the training dataset. Such successive generation
+processes are characterized as chaotic dynamics in terms of dynamical system
+theory. Positive Lyapunov exponents estimated from the generated trajectories
+confirm the presence of chaotic dynamics, with the Lyapunov dimension of the
+attractor found to be comparable to the intrinsic dimension of the training
+data manifold. The results suggest that chaotic dynamics in the image space
+defined by the deep generative model contribute to the diversity of the
+generated images, constituting a novel approach for multi-class image
+generation. This model can be interpreted as an extension of classical
+associative memory to perform hetero-association among image categories.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ In-Context Decision <span class="highlight-title">Transformer</span>: Reinforcement Learning via Hierarchical
+  Chain-of-Thought 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20692v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20692v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Sili Huang, Jifeng Hu, Hechang Chen, Lichao Sun, Bo Yang
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  In-context learning is a promising approach for offline reinforcement
+learning (RL) to handle online tasks, which can be achieved by providing task
+prompts. Recent works demonstrated that in-context RL could emerge with
+self-improvement in a trial-and-error manner when treating RL tasks as an
+across-episodic sequential prediction problem. Despite the self-improvement not
+requiring gradient updates, current works still suffer from high computational
+costs when the across-episodic sequence increases with task horizons. To this
+end, we propose an In-context Decision Transformer (IDT) to achieve
+self-improvement in a high-level trial-and-error manner. Specifically, IDT is
+inspired by the efficient hierarchical structure of human decision-making and
+thus reconstructs the sequence to consist of high-level decisions instead of
+low-level actions that interact with environments. As one high-level decision
+can guide multi-step low-level actions, IDT naturally avoids excessively long
+sequences and solves online tasks more efficiently. Experimental results show
+that IDT achieves state-of-the-art in long-horizon tasks over current
+in-context RL methods. In particular, the online evaluation time of our IDT is
+\textbf{36$\times$} times faster than baselines in the D4RL benchmark and
+\textbf{27$\times$} times faster in the Grid World benchmark.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Unleashing the Potential of Diffusion Models for Incomplete Data
+  Imputation 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20690v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20690v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Hengrui Zhang, Liancheng Fang, Philip S. Yu
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  This paper introduces DiffPuter, an iterative method for missing data
+imputation that leverages the Expectation-Maximization (EM) algorithm and
+Diffusion Models. By treating missing data as hidden variables that can be
+updated during model training, we frame the missing data imputation task as an
+EM problem. During the M-step, DiffPuter employs a diffusion model to learn the
+joint distribution of both the observed and currently estimated missing data.
+In the E-step, DiffPuter re-estimates the missing data based on the conditional
+probability given the observed data, utilizing the diffusion model learned in
+the M-step. Starting with an initial imputation, DiffPuter alternates between
+the M-step and E-step until convergence. Through this iterative process,
+DiffPuter progressively refines the complete data distribution, yielding
+increasingly accurate estimations of the missing data. Our theoretical analysis
+demonstrates that the unconditional training and conditional sampling processes
+of the diffusion model align precisely with the objectives of the M-step and
+E-step, respectively. Empirical evaluations across 10 diverse datasets and
+comparisons with 16 different imputation methods highlight DiffPuter's superior
+performance. Notably, DiffPuter achieves an average improvement of 8.10% in MAE
+and 5.64% in RMSE compared to the most competitive existing method.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Conditioning GAN Without Training <span class="highlight-title">Dataset</span> 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20687v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20687v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Kidist Amde Mekonnen
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Deep learning algorithms have a large number of trainable parameters often
+with sizes of hundreds of thousands or more. Training this algorithm requires a
+large amount of training data and generating a sufficiently large dataset for
+these algorithms is costly\cite{noguchi2019image}.
+  GANs are generative neural networks that use two deep learning networks that
+are competing with each other. The networks are generator and discriminator
+networks. The generator tries to generate realistic images which resemble the
+actual training dataset by approximating the training data distribution and the
+discriminator is trained to classify images as real or
+fake(generated)\cite{goodfellow2016nips}. Training these GAN algorithms also
+requires a large amount of training dataset\cite{noguchi2019image}.
+  In this study, the aim is to address the question, "Given an unconditioned
+pretrained generator network and a pretrained classifier, is it feasible to
+develop a conditioned generator without relying on any training dataset?"
+  The paper begins with a general introduction to the problem. The subsequent
+sections are structured as follows: Section 2 provides background information
+on the problem. Section 3 reviews relevant literature on the topic. Section 4
+outlines the methodology employed in this study. Section 5 presents the
+experimental results. Section 6 discusses the findings and proposes potential
+future research directions. Finally, Section 7 offers concluding remarks.
+  The implementation can be accessed
+\href{https://github.com/kidist-amde/BigGAN-PyTorch}{here}.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>5 pages, 2 figures, Part of my MSc project course, School Project
+  Course 2022</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Enhancing Counterfactual Image Generation Using Mahalanobis Distance
+  with Distribution Preferences in Feature Space 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20685v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20685v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Yukai Zhang, Ao Xu, Zihao Li, Tieru Wu
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  In the realm of Artificial Intelligence (AI), the importance of Explainable
+Artificial Intelligence (XAI) is increasingly recognized, particularly as AI
+models become more integral to our lives. One notable single-instance XAI
+approach is counterfactual explanation, which aids users in comprehending a
+model's decisions and offers guidance on altering these decisions. Specifically
+in the context of image classification models, effective image counterfactual
+explanations can significantly enhance user understanding. This paper
+introduces a novel method for computing feature importance within the feature
+space of a black-box model. By employing information fusion techniques, our
+method maximizes the use of data to address feature counterfactual explanations
+in the feature space. Subsequently, we utilize an image generation model to
+transform these feature counterfactual explanations into image counterfactual
+explanations. Our experiments demonstrate that the counterfactual explanations
+generated by our method closely resemble the original images in both pixel and
+feature spaces. Additionally, our method outperforms established baselines,
+achieving impressive experimental results.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ No-Regret Learning for Fair Multi-Agent Social Welfare Optimization 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20678v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20678v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Mengxiao Zhang, Ramiro Deo-Campo Vuong, Haipeng Luo
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  We consider the problem of online multi-agent Nash social welfare (NSW)
+maximization. While previous works of Hossain et al. [2021], Jones et al.
+[2023] study similar problems in stochastic multi-agent multi-armed bandits and
+show that $\sqrt{T}$-regret is possible after $T$ rounds, their fairness
+measure is the product of all agents' rewards, instead of their NSW (that is,
+their geometric mean). Given the fundamental role of NSW in the fairness
+literature, it is more than natural to ask whether no-regret fair learning with
+NSW as the objective is possible. In this work, we provide a complete answer to
+this question in various settings. Specifically, in stochastic $N$-agent
+$K$-armed bandits, we develop an algorithm with
+$\widetilde{\mathcal{O}}\left(K^{\frac{2}{N}}T^{\frac{N-1}{N}}\right)$ regret
+and prove that the dependence on $T$ is tight, making it a sharp contrast to
+the $\sqrt{T}$-regret bounds of Hossain et al. [2021], Jones et al. [2023]. We
+then consider a more challenging version of the problem with adversarial
+rewards. Somewhat surprisingly, despite NSW being a concave function, we prove
+that no algorithm can achieve sublinear regret. To circumvent such negative
+results, we further consider a setting with full-information feedback and
+design two algorithms with $\sqrt{T}$-regret: the first one has no dependence
+on $N$ at all and is applicable to not just NSW but a broad class of welfare
+functions, while the second one has better dependence on $K$ and is preferable
+when $N$ is small. Finally, we also show that logarithmic regret is possible
+whenever there exists one agent who is indifferent about different arms.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Provably Efficient Interactive-Grounded Learning with Personalized
+  Reward 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20677v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20677v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Mengxiao Zhang, Yuheng Zhang, Haipeng Luo, Paul Mineiro
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Interactive-Grounded Learning (IGL) [Xie et al., 2021] is a powerful
+framework in which a learner aims at maximizing unobservable rewards through
+interacting with an environment and observing reward-dependent feedback on the
+taken actions. To deal with personalized rewards that are ubiquitous in
+applications such as recommendation systems, Maghakian et al. [2022] study a
+version of IGL with context-dependent feedback, but their algorithm does not
+come with theoretical guarantees. In this work, we consider the same problem
+and provide the first provably efficient algorithms with sublinear regret under
+realizability. Our analysis reveals that the step-function estimator of prior
+work can deviate uncontrollably due to finite-sample effects. Our solution is a
+novel Lipschitz reward estimator which underestimates the true reward and
+enjoys favorable generalization performances. Building on this estimator, we
+propose two algorithms, one based on explore-then-exploit and the other based
+on inverse-gap weighting. We apply IGL to learning from image feedback and
+learning from text feedback, which are reward-free settings that arise in
+practice. Experimental results showcase the importance of using our Lipschitz
+reward estimator and the overall effectiveness of our algorithms.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Adv-KD: Adversarial Knowledge Distillation for Faster Diffusion Sampling 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20675v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20675v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Kidist Amde Mekonnen, Nicola Dall'Asen, Paolo Rota
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Diffusion Probabilistic Models (DPMs) have emerged as a powerful class of
+deep generative models, achieving remarkable performance in image synthesis
+tasks. However, these models face challenges in terms of widespread adoption
+due to their reliance on sequential denoising steps during sample generation.
+This dependence leads to substantial computational requirements, making them
+unsuitable for resource-constrained or real-time processing systems. To address
+these challenges, we propose a novel method that integrates denoising phases
+directly into the model's architecture, thereby reducing the need for
+resource-intensive computations. Our approach combines diffusion models with
+generative adversarial networks (GANs) through knowledge distillation, enabling
+more efficient training and evaluation. By utilizing a pre-trained diffusion
+model as a teacher model, we train a student model through adversarial
+learning, employing layerwise transformations for denoising and submodules for
+predicting the teacher model's output at various points in time. This
+integration significantly reduces the number of parameters and denoising steps
+required, leading to improved sampling speed at test time. We validate our
+method with extensive experiments, demonstrating comparable performance with
+reduced computational requirements compared to existing approaches. By enabling
+the deployment of diffusion models on resource-constrained devices, our
+research mitigates their computational burden and paves the way for wider
+accessibility and practical use across the research community and end-users.
+  Our code is publicly available at https://github.com/kidist-amde/Adv-KD
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>7 pages, 11 figures, ELLIS Doctoral Symposium 2023 in Helsinki,
+  Finland</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Position Coupling: Leveraging Task Structure for Improved Length
+  Generalization of <span class="highlight-title">Transformer</span>s 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20671v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20671v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Hanseul Cho, Jaeyoung Cha, Pranjal Awasthi, Srinadh Bhojanapalli, Anupam Gupta, Chulhee Yun
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Even for simple arithmetic tasks like integer addition, it is challenging for
+Transformers to generalize to longer sequences than those encountered during
+training. To tackle this problem, we propose position coupling, a simple yet
+effective method that directly embeds the structure of the tasks into the
+positional encoding of a (decoder-only) Transformer. Taking a departure from
+the vanilla absolute position mechanism assigning unique position IDs to each
+of the tokens, we assign the same position IDs to two or more "relevant"
+tokens; for integer addition tasks, we regard digits of the same significance
+as in the same position. On the empirical side, we show that with the proposed
+position coupling, a small (1-layer) Transformer trained on 1 to 30-digit
+additions can generalize up to 200-digit additions (6.67x of the trained
+length). On the theoretical side, we prove that a 1-layer Transformer with
+coupled positions can solve the addition task involving exponentially many
+digits, whereas any 1-layer Transformer without positional information cannot
+entirely solve it. We also demonstrate that position coupling can be applied to
+other algorithmic tasks such as addition with multiple summands, Nx2
+multiplication, copy/reverse, and a two-dimensional task.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>73 pages, 20 figures, 90 tables</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Improving Paratope and Epitope Prediction by Multi-Modal Contrastive
+  Learning and Interaction Informativeness Estimation <span class="chip">IJCAI 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20668v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20668v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Zhiwei Wang, Yongkang Wang, Wen Zhang
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Accurately predicting antibody-antigen binding residues, i.e., paratopes and
+epitopes, is crucial in antibody design. However, existing methods solely focus
+on uni-modal data (either sequence or structure), disregarding the
+complementary information present in multi-modal data, and most methods predict
+paratopes and epitopes separately, overlooking their specific spatial
+interactions. In this paper, we propose a novel Multi-modal contrastive
+learning and Interaction informativeness estimation-based method for Paratope
+and Epitope prediction, named MIPE, by using both sequence and structure data
+of antibodies and antigens. MIPE implements a multi-modal contrastive learning
+strategy, which maximizes representations of binding and non-binding residues
+within each modality and meanwhile aligns uni-modal representations towards
+effective modal representations. To exploit the spatial interaction
+information, MIPE also incorporates an interaction informativeness estimation
+that computes the estimated interaction matrices between antibodies and
+antigens, thereby approximating them to the actual ones. Extensive experiments
+demonstrate the superiority of our method compared to baselines. Additionally,
+the ablation studies and visualizations demonstrate the superiority of MIPE
+owing to the better representations acquired through multi-modal contrastive
+learning and the interaction patterns comprehended by the interaction
+informativeness estimation.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>This paper is accepted by IJCAI 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Weak Robust Compatibility Between Learning Algorithms and Counterfactual
+  Explanation Generation Algorithms 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20664v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20664v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Ao Xu, Tieru Wu
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Counterfactual explanation generation is a powerful method for Explainable
+Artificial Intelligence. It can help users understand why machine learning
+models make specific decisions, and how to change those decisions. Evaluating
+the robustness of counterfactual explanation algorithms is therefore crucial.
+Previous literature has widely studied the robustness based on the perturbation
+of input instances. However, the robustness defined from the perspective of
+perturbed instances is sometimes biased, because this definition ignores the
+impact of learning algorithms on robustness. In this paper, we propose a more
+reasonable definition, Weak Robust Compatibility, based on the perspective of
+explanation strength. In practice, we propose WRC-Test to help us generate more
+robust counterfactuals. Meanwhile, we designed experiments to verify the
+effectiveness of WRC-Test. Theoretically, we introduce the concepts of PAC
+learning theory and define the concept of PAC WRC-Approximability. Based on
+reasonable assumptions, we establish oracle inequalities about weak robustness,
+which gives a sufficient condition for PAC WRC-Approximability.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Sign is Not a Remedy: Multiset-to-Multiset Message Passing for Learning
+  on Heterophilic Graphs <span class="chip">ICML 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20652v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20652v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Langzhang Liang, Sunwoo Kim, Kijung Shin, Zenglin Xu, Shirui Pan, Yuan Qi
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Graph Neural Networks (GNNs) have gained significant attention as a powerful
+modeling and inference method, especially for homophilic graph-structured data.
+To empower GNNs in heterophilic graphs, where adjacent nodes exhibit dissimilar
+labels or features, Signed Message Passing (SMP) has been widely adopted.
+However, there is a lack of theoretical and empirical analysis regarding the
+limitations of SMP. In this work, we unveil some potential pitfalls of SMP and
+their remedies. We first identify two limitations of SMP: undesirable
+representation update for multi-hop neighbors and vulnerability against
+oversmoothing issues. To overcome these challenges, we propose a novel message
+passing function called Multiset to Multiset GNN(M2M-GNN). Our theoretical
+analyses and extensive experiments demonstrate that M2M-GNN effectively
+alleviates the aforementioned limitations of SMP, yielding superior performance
+in comparison
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Published as a conference paper at ICML 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Reward-based Input Construction for Cross-document Relation Extraction <span class="chip">ACL 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20649v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20649v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Byeonghu Na, Suhyeon Jo, Yeongmin Kim, Il-Chul Moon
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Relation extraction (RE) is a fundamental task in natural language
+processing, aiming to identify relations between target entities in text. While
+many RE methods are designed for a single sentence or document, cross-document
+RE has emerged to address relations across multiple long documents. Given the
+nature of long documents in cross-document RE, extracting document embeddings
+is challenging due to the length constraints of pre-trained language models.
+Therefore, we propose REward-based Input Construction (REIC), the first
+learning-based sentence selector for cross-document RE. REIC extracts sentences
+based on relational evidence, enabling the RE module to effectively infer
+relations. Since supervision of evidence sentences is generally unavailable, we
+train REIC using reinforcement learning with RE prediction scores as rewards.
+Experimental results demonstrate the superiority of our method over heuristic
+methods for different RE structures and backbones in cross-document RE. Our
+code is publicly available at https://github.com/aailabkaist/REIC.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted at ACL 2024 main conference</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Shotluck Holmes: A Family of Efficient Small-Scale Large Language Vision
+  Models For Video Captioning and Summarization 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20648v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20648v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Richard Luo, Austin Peng, Adithya Vasudev, Rishabh Jain
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Video is an increasingly prominent and information-dense medium, yet it poses
+substantial challenges for language models. A typical video consists of a
+sequence of shorter segments, or shots, that collectively form a coherent
+narrative. Each shot is analogous to a word in a sentence where multiple data
+streams of information (such as visual and auditory data) must be processed
+simultaneously. Comprehension of the entire video requires not only
+understanding the visual-audio information of each shot but also requires that
+the model links the ideas between each shot to generate a larger,
+all-encompassing story. Despite significant progress in the field, current
+works often overlook videos' more granular shot-by-shot semantic information.
+In this project, we propose a family of efficient large language vision models
+(LLVMs) to boost video summarization and captioning called Shotluck Holmes. By
+leveraging better pretraining and data collection strategies, we extend the
+abilities of existing small LLVMs from being able to understand a picture to
+being able to understand a sequence of frames. Specifically, we show that
+Shotluck Holmes achieves better performance than state-of-the-art results on
+the Shot2Story video captioning and summary task with significantly smaller and
+more computationally efficient models.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Resampling methods for Private Statistical Inference 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2402.07131v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2402.07131v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Karan Chadha, John Duchi, Rohith Kuditipudi
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  We consider the task of constructing confidence intervals with differential
+privacy. We propose two private variants of the non-parametric bootstrap, which
+privately compute the median of the results of multiple "little" bootstraps run
+on partitions of the data and give asymptotic bounds on the coverage error of
+the resulting confidence intervals. For a fixed differential privacy parameter
+$\epsilon$, our methods enjoy the same error rates as that of the non-private
+bootstrap to within logarithmic factors in the sample size $n$. We empirically
+validate the performance of our methods for mean estimation, median estimation,
+and logistic regression with both real and synthetic data. Our methods achieve
+similar coverage accuracy to existing methods (and non-private baselines) while
+providing notably shorter ($\gtrsim 10$ times) confidence intervals than
+previous approaches.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>45 pages</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Mamba: Linear-Time Sequence Modeling with Selective State Spaces 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2312.00752v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2312.00752v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Albert Gu, Tri Dao
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Foundation models, now powering most of the exciting applications in deep
+learning, are almost universally based on the Transformer architecture and its
+core attention module. Many subquadratic-time architectures such as linear
+attention, gated convolution and recurrent models, and structured state space
+models (SSMs) have been developed to address Transformers' computational
+inefficiency on long sequences, but they have not performed as well as
+attention on important modalities such as language. We identify that a key
+weakness of such models is their inability to perform content-based reasoning,
+and make several improvements. First, simply letting the SSM parameters be
+functions of the input addresses their weakness with discrete modalities,
+allowing the model to selectively propagate or forget information along the
+sequence length dimension depending on the current token. Second, even though
+this change prevents the use of efficient convolutions, we design a
+hardware-aware parallel algorithm in recurrent mode. We integrate these
+selective SSMs into a simplified end-to-end neural network architecture without
+attention or even MLP blocks (Mamba). Mamba enjoys fast inference (5$\times$
+higher throughput than Transformers) and linear scaling in sequence length, and
+its performance improves on real data up to million-length sequences. As a
+general sequence model backbone, Mamba achieves state-of-the-art performance
+across several modalities such as language, audio, and genomics. On language
+modeling, our Mamba-3B model outperforms Transformers of the same size and
+matches Transformers twice its size, both in pretraining and downstream
+evaluation.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Generalization or Memorization: Data Contamination and Trustworthy
+  Evaluation for Large Language Models <span class="chip">ACL</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2402.15938v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2402.15938v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Yihong Dong, Xue Jiang, Huanyu Liu, Zhi Jin, Bin Gu, Mengfei Yang, Ge Li
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Recent statements about the impressive capabilities of large language models
+(LLMs) are usually supported by evaluating on open-access benchmarks.
+Considering the vast size and wide-ranging sources of LLMs' training data, it
+could explicitly or implicitly include test data, leading to LLMs being more
+susceptible to data contamination. However, due to the opacity of training
+data, the black-box access of models, and the rapid growth of synthetic
+training data, detecting and mitigating data contamination for LLMs faces
+significant challenges. In this paper, we propose CDD, which stands for
+Contamination Detection via output Distribution for LLMs. CDD necessitates only
+the sampled texts to detect data contamination, by identifying the peakedness
+of LLM's output distribution. To mitigate the impact of data contamination in
+evaluation, we also present TED: Trustworthy Evaluation via output
+Distribution, based on the correction of LLM's output distribution. To
+facilitate this study, we introduce two benchmarks, i.e., DetCon and ComiEval,
+for data contamination detection and contamination mitigation evaluation tasks.
+Extensive experimental results show that CDD achieves the average relative
+improvements of 21.8\%-30.2\% over other contamination detection approaches in
+terms of Accuracy, F1 Score, and AUC metrics, and can effectively detect
+implicit contamination. TED substantially mitigates performance improvements up
+to 66.9\% attributed to data contamination across various contamination setups.
+In real-world applications, we reveal that ChatGPT exhibits a high potential to
+suffer from data contamination on HumanEval benchmark.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted to ACL</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ P4: Towards private, personalized, and Peer-to-Peer learning 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.17697v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.17697v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Mohammad Mahdi Maheri, Sandra Siby, Sina Abdollahi, Anastasia Borovykh, Hamed Haddadi
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Personalized learning is a proposed approach to address the problem of data
+heterogeneity in collaborative machine learning. In a decentralized setting,
+the two main challenges of personalization are client clustering and data
+privacy. In this paper, we address these challenges by developing P4
+(Personalized Private Peer-to-Peer) a method that ensures that each client
+receives a personalized model while maintaining differential privacy guarantee
+of each client's local dataset during and after the training. Our approach
+includes the design of a lightweight algorithm to identify similar clients and
+group them in a private, peer-to-peer (P2P) manner. Once grouped, we develop
+differentially-private knowledge distillation for clients to co-train with
+minimal impact on accuracy. We evaluate our proposed method on three benchmark
+datasets (FEMNIST or Federated EMNIST, CIFAR-10 and CIFAR-100) and two
+different neural network architectures (Linear and CNN-based networks) across a
+range of privacy parameters. The results demonstrate the potential of P4, as it
+outperforms the state-of-the-art of differential private P2P by up to 40
+percent in terms of accuracy. We also show the practicality of P4 by
+implementing it on resource constrained devices, and validating that it has
+minimal overhead, e.g., about 7 seconds to run collaborative training between
+two clients.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Dynamic Conditional Optimal Transport through Simulation-Free Flows 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2404.04240v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2404.04240v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Gavin Kerrigan, Giosue Migliorini, Padhraic Smyth
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  We study the geometry of conditional optimal transport (COT) and prove a
+dynamical formulation which generalizes the Benamou-Brenier Theorem. Equipped
+with these tools, we propose a simulation-free flow-based method for
+conditional generative modeling. Our method couples an arbitrary source
+distribution to a specified target distribution through a triangular COT plan,
+and a conditional generative model is obtained by approximating the geodesic
+path of measures induced by this COT plan. Our theory and methods are
+applicable in infinite-dimensional settings, making them well suited for a wide
+class of Bayesian inverse problems. Empirically, we demonstrate that our method
+is competitive on several challenging conditional generation tasks, including
+an infinite-dimensional inverse problem.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ SOUL: Unlocking the Power of Second-Order Optimization for LLM
+  Unlearning 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2404.18239v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2404.18239v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Jinghan Jia, Yihua Zhang, Yimeng Zhang, Jiancheng Liu, Bharat Runwal, James Diffenderfer, Bhavya Kailkhura, Sijia Liu
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Large Language Models (LLMs) have highlighted the necessity of effective
+unlearning mechanisms to comply with data regulations and ethical AI practices.
+LLM unlearning aims at removing undesired data influences and associated model
+capabilities without compromising utility out of the scope of unlearning. While
+interest in studying LLM unlearning is growing,the impact of the optimizer
+choice for LLM unlearning remains under-explored. In this work, we shed light
+on the significance of optimizer selection in LLM unlearning for the first
+time, establishing a clear connection between {second-order optimization} and
+influence unlearning (a classical approach using influence functions to update
+the model for data influence removal). This insight propels us to develop a
+second-order unlearning framework, termed SOUL, built upon the second-order
+clipped stochastic optimization (Sophia)-based LLM training method. SOUL
+extends the static, one-shot model update using influence unlearning to a
+dynamic, iterative unlearning process. Our extensive experiments show that SOUL
+consistently outperforms conventional first-order methods across various
+unlearning tasks, models, and metrics, suggesting the promise of second-order
+optimization in providing a scalable and easily implementable solution for LLM
+unlearning.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ API Pack: A Massive Multi-Programming Language <span class="highlight-title">Dataset</span> for API Call
+  Generation 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2402.09615v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2402.09615v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Zhen Guo, Adriana Meza Soria, Wei Sun, Yikang Shen, Rameswar Panda
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  We introduce API Pack, a massive multi-programming language dataset
+containing more than 1 million instruction-API call pairs to improve the API
+call generation capabilities of large language models. By fine-tuning
+CodeLlama-13B on 20,000 Python instances from API Pack, we achieved around 10%
+and 5% higher accuracy compared to GPT-3.5 and GPT-4, respectively, in
+generating unseen API calls. Fine-tuning on API Pack enables cross-programming
+language generalization by leveraging a large amount of data in one language
+and small amounts of data from other languages. Scaling the training data to 1
+million instances further improves the model's generalization to new APIs not
+encountered during training. We open-source the API Pack dataset, trained
+models, and associated source code at https://github.com/zguo0525/API-Pack to
+facilitate further research.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ An Accelerated Gradient Method for Convex Smooth Simple Bilevel
+  Optimization 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2402.08097v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2402.08097v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Jincheng Cao, Ruichen Jiang, Erfan Yazdandoost Hamedani, Aryan Mokhtari
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  In this paper, we focus on simple bilevel optimization problems, where we
+minimize a convex smooth objective function over the optimal solution set of
+another convex smooth constrained optimization problem. We present a novel
+bilevel optimization method that locally approximates the solution set of the
+lower-level problem using a cutting plane approach and employs an accelerated
+gradient-based update to reduce the upper-level objective function over the
+approximated solution set. We measure the performance of our method in terms of
+suboptimality and infeasibility errors and provide non-asymptotic convergence
+guarantees for both error criteria. Specifically, when the feasible set is
+compact, we show that our method requires at most
+$\mathcal{O}(\max\{1/\sqrt{\epsilon_{f}}, 1/\epsilon_g\})$ iterations to find a
+solution that is $\epsilon_f$-suboptimal and $\epsilon_g$-infeasible. Moreover,
+under the additional assumption that the lower-level objective satisfies the
+$r$-th H\"olderian error bound, we show that our method achieves an iteration
+complexity of
+$\mathcal{O}(\max\{\epsilon_{f}^{-\frac{2r-1}{2r}},\epsilon_{g}^{-\frac{2r-1}{2r}}\})$,
+which matches the optimal complexity of single-level convex constrained
+optimization when $r=1$.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Collective Variable Free Transition Path Sampling with Generative Flow
+  Network 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.19961v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.19961v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Kiyoung Seong, Seonghyun Park, Seonghwan Kim, Woo Youn Kim, Sungsoo Ahn
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Understanding transition paths between meta-stable states in molecular
+systems is fundamental for material design and drug discovery. However,
+sampling these paths via molecular dynamics simulations is computationally
+prohibitive due to the high-energy barriers between the meta-stable states.
+Recent machine learning approaches are often restricted to simple systems or
+rely on collective variables (CVs) extracted from expensive domain knowledge.
+In this work, we propose to leverage generative flow networks (GFlowNets) to
+sample transition paths without relying on CVs. We reformulate the problem as
+amortized energy-based sampling over molecular trajectories and train a bias
+potential by minimizing the squared log-ratio between the target distribution
+and the generator, derived from the flow matching objective of GFlowNets. Our
+evaluation on three proteins (Alanine Dipeptide, Polyproline, and Chignolin)
+demonstrates that our approach, called TPS-GFN, generates more realistic and
+diverse transition paths than the previous CV-free machine learning approach.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>9 pages, 5 figures, 2 tables</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Stochastic Online Fisher Markets: Static Pricing Limits and Adaptive
+  Enhancements 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2205.00825v4">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2205.00825v4.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Devansh Jalota, Yinyu Ye
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Fisher markets are one of the most fundamental models for resource
+allocation. However, the problem of computing equilibrium prices in Fisher
+markets typically relies on complete knowledge of users' budgets and utility
+functions and requires transactions to happen in a static market where all
+users are present simultaneously. Motivated by these practical considerations,
+we study an online variant of Fisher markets, wherein users with privately
+known utility and budget parameters, drawn i.i.d. from a distribution, arrive
+sequentially. In this setting, we first study the limitations of static pricing
+algorithms, which set uniform prices for all users, along two performance
+metrics: (i) regret, i.e., the optimality gap in the objective of the
+Eisenberg-Gale program between an online algorithm and an oracle with complete
+information, and (ii) capacity violations, i.e., the over-consumption of goods
+relative to their capacities. Given the limitations of static pricing, we
+design adaptive posted-pricing algorithms, one with knowledge of the
+distribution of users' budget and utility parameters and another that adjusts
+prices solely based on past observations of user consumption, i.e., revealed
+preference feedback, with improved performance guarantees. Finally, we present
+numerical experiments to compare our revealed preference algorithm's
+performance to several benchmarks.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ <span class="highlight-title">★</span> Hierarchical World Models as Visual Whole-Body Humanoid Controllers 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.18418v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.18418v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Nicklas Hansen, Jyothir S V, Vlad Sobal, <span class="highlight-author">Yann LeCun</span>, Xiaolong Wang, Hao Su
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Whole-body control for humanoids is challenging due to the high-dimensional
+nature of the problem, coupled with the inherent instability of a bipedal
+morphology. Learning from visual observations further exacerbates this
+difficulty. In this work, we explore highly data-driven approaches to visual
+whole-body humanoid control based on reinforcement learning, without any
+simplifying assumptions, reward design, or skill primitives. Specifically, we
+propose a hierarchical world model in which a high-level agent generates
+commands based on visual observations for a low-level agent to execute, both of
+which are trained with rewards. Our approach produces highly performant control
+policies in 8 tasks with a simulated 56-DoF humanoid, while synthesizing
+motions that are broadly preferred by humans. Code and videos:
+https://nicklashansen.com/rlpuppeteer
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Code and videos at https://nicklashansen.com/rlpuppeteer</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Mastering Long-Tail Complexity on Graphs: Characterization, Learning,
+  and Generalization <span class="chip">KDD 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2305.09938v4">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2305.09938v4.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Haohui Wang, Baoyu Jing, Kaize Ding, Yada Zhu, Wei Cheng, Si Zhang, Yonghui Fan, Liqing Zhang, Dawei Zhou
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  In the context of long-tail classification on graphs, the vast majority of
+existing work primarily revolves around the development of model debiasing
+strategies, intending to mitigate class imbalances and enhance the overall
+performance. Despite the notable success, there is very limited literature that
+provides a theoretical tool for characterizing the behaviors of long-tail
+classes in graphs and gaining insight into generalization performance in
+real-world scenarios. To bridge this gap, we propose a generalization bound for
+long-tail classification on graphs by formulating the problem in the fashion of
+multi-task learning, i.e., each task corresponds to the prediction of one
+particular class. Our theoretical results show that the generalization
+performance of long-tail classification is dominated by the overall loss range
+and the task complexity. Building upon the theoretical findings, we propose a
+novel generic framework HierTail for long-tail classification on graphs. In
+particular, we start with a hierarchical task grouping module that allows us to
+assign related tasks into hypertasks and thus control the complexity of the
+task space; then, we further design a balanced contrastive learning module to
+adaptively balance the gradients of both head and tail classes to control the
+loss range across all tasks in a unified fashion. Extensive experiments
+demonstrate the effectiveness of HierTail in characterizing long-tail classes
+on real graphs, which achieves up to 12.9% improvement over the leading
+baseline method in accuracy.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted at KDD 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Active Inference and Reinforcement Learning: A unified inference on
+  continuous state and action spaces under partial observability 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2212.07946v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2212.07946v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Parvin Malekzadeh, Konstantinos N. Plataniotis
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Reinforcement learning (RL) has garnered significant attention for developing
+decision-making agents that aim to maximize rewards, specified by an external
+supervisor, within fully observable environments. However, many real-world
+problems involve partial observations, formulated as partially observable
+Markov decision processes (POMDPs). Previous studies have tackled RL in POMDPs
+by either incorporating the memory of past actions and observations or by
+inferring the true state of the environment from observed data. However,
+aggregating observed data over time becomes impractical in continuous spaces.
+Moreover, inference-based RL approaches often require many samples to perform
+well, as they focus solely on reward maximization and neglect uncertainty in
+the inferred state. Active inference (AIF) is a framework formulated in POMDPs
+and directs agents to select actions by minimizing a function called expected
+free energy (EFE). This supplies reward-maximizing (exploitative) behaviour, as
+in RL, with information-seeking (exploratory) behaviour. Despite this
+exploratory behaviour of AIF, its usage is limited to discrete spaces due to
+the computational challenges associated with EFE. In this paper, we propose a
+unified principle that establishes a theoretical connection between AIF and RL,
+enabling seamless integration of these two approaches and overcoming their
+aforementioned limitations in continuous space POMDP settings. We substantiate
+our findings with theoretical analysis, providing novel perspectives for
+utilizing AIF in the design of artificial agents. Experimental results
+demonstrate the superior learning capabilities of our method in solving
+continuous space partially observable tasks. Notably, our approach harnesses
+information-seeking exploration, enabling it to effectively solve reward-free
+problems and rendering explicit task reward design by an external supervisor
+optional.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>90 pages including appendices</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Calibrated Self-Rewarding Vision Language Models 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.14622v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.14622v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Yiyang Zhou, Zhiyuan Fan, Dongjie Cheng, Sihan Yang, Zhaorun Chen, Chenhang Cui, Xiyao Wang, Yun Li, Linjun Zhang, Huaxiu Yao
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Large Vision-Language Models (LVLMs) have made substantial progress by
+integrating pre-trained large language models (LLMs) and vision models through
+instruction tuning. Despite these advancements, LVLMs often exhibit the
+hallucination phenomenon, where generated text responses appear linguistically
+plausible but contradict the input image, indicating a misalignment between
+image and text pairs. This misalignment arises because the model tends to
+prioritize textual information over visual input, even when both the language
+model and visual representations are of high quality. Existing methods leverage
+additional models or human annotations to curate preference data and enhance
+modality alignment through preference optimization. These approaches may not
+effectively reflect the target LVLM's preferences, making the curated
+preferences easily distinguishable. Our work addresses these challenges by
+proposing the Calibrated Self-Rewarding (CSR) approach, which enables the model
+to self-improve by iteratively generating candidate responses, evaluating the
+reward for each response, and curating preference data for fine-tuning. In the
+reward modeling, we employ a step-wise strategy and incorporate visual
+constraints into the self-rewarding process to place greater emphasis on visual
+input. Empirical results demonstrate that CSR enhances performance and reduces
+hallucinations across ten benchmarks and tasks, achieving substantial
+improvements over existing methods by 7.62%. Our empirical results are further
+supported by rigorous theoretical analysis, under mild assumptions, verifying
+the effectiveness of introducing visual constraints into the self-rewarding
+paradigm. Additionally, CSR shows compatibility with different vision-language
+models and the ability to incrementally improve performance through iterative
+fine-tuning. Our data and code are available at
+https://github.com/YiyangZhou/CSR.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>fix some typos and add acknowledgement section in V3</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Open Ad Hoc Teamwork with Cooperative Game Theory <span class="chip">ICML 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2402.15259v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2402.15259v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Jianhong Wang, Yang Li, Yuan Zhang, Wei Pan, Samuel Kaski
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Ad hoc teamwork poses a challenging problem, requiring the design of an agent
+to collaborate with teammates without prior coordination or joint training.
+Open ad hoc teamwork (OAHT) further complicates this challenge by considering
+environments with a changing number of teammates, referred to as open teams.
+One promising solution in practice to this problem is leveraging the
+generalizability of graph neural networks to handle an unrestricted number of
+agents, named graph-based policy learning (GPL). However, its joint Q-value
+representation over a coordination graph lacks convincing explanations. In this
+paper, we establish a new theory to understand the joint Q-value representation
+for OAHT, from the perspective of cooperative game theory, and validate its
+learning paradigm. Building on our theory, we propose a novel algorithm named
+CIAO, compatible with GPL framework, with additional provable implementation
+tricks that can facilitate learning. The demos of experimental results are
+available on https://sites.google.com/view/ciao2024, and the code of
+experiments is published on https://github.com/hsvgbkhgbv/CIAO.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Published at ICML 2024, 29 pages</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Use Your INSTINCT: INSTruction optimization for LLMs usIng Neural
+  bandits Coupled with <span class="highlight-title">Transformer</span>s <span class="chip">ICML 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2310.02905v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2310.02905v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Xiaoqiang Lin, Zhaoxuan Wu, Zhongxiang Dai, Wenyang Hu, Yao Shu, See-Kiong Ng, Patrick Jaillet, Bryan Kian Hsiang Low
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Large language models (LLMs) have shown remarkable instruction-following
+capabilities and achieved impressive performances in various applications.
+However, the performances of LLMs depend heavily on the instructions given to
+them, which are typically manually tuned with substantial human efforts. Recent
+work has used the query-efficient Bayesian optimization (BO) algorithm to
+automatically optimize the instructions given to black-box LLMs. However, BO
+usually falls short when optimizing highly sophisticated (e.g.,
+high-dimensional) objective functions, such as the functions mapping an
+instruction to the performance of an LLM. This is mainly due to the limited
+expressive power of the Gaussian process (GP) which is used by BO as a
+surrogate to model the objective function. Meanwhile, it has been repeatedly
+shown that neural networks (NNs), especially pre-trained transformers, possess
+strong expressive power and can model highly complex functions. So, we adopt a
+neural bandit algorithm which replaces the GP in BO by an NN surrogate to
+optimize instructions for black-box LLMs. More importantly, the neural bandit
+algorithm allows us to naturally couple the NN surrogate with the hidden
+representation learned by a pre-trained transformer (i.e., an open-source LLM),
+which significantly boosts its performance. These motivate us to propose our
+INSTruction optimization usIng Neural bandits Coupled with Transformers
+(INSTINCT) algorithm. We perform instruction optimization for ChatGPT and use
+extensive experiments to show that INSTINCT consistently outperforms baselines
+in different tasks, e.g., various instruction induction tasks and the task of
+improving zero-shot chain-of-thought instructions. Our code is available at
+https://github.com/xqlin98/INSTINCT.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted to ICML 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Pre- to Post-Contrast Breast MRI Synthesis for Enhanced Tumour
+  Segmentation <span class="chip">SP</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2311.10879v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2311.10879v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Richard Osuala, Smriti Joshi, Apostolia Tsirikoglou, Lidia Garrucho, Walter H. L. Pinaya, Oliver Diaz, Karim Lekadir
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Despite its benefits for tumour detection and treatment, the administration
+of contrast agents in dynamic contrast-enhanced MRI (DCE-MRI) is associated
+with a range of issues, including their invasiveness, bioaccumulation, and a
+risk of nephrogenic systemic fibrosis. This study explores the feasibility of
+producing synthetic contrast enhancements by translating pre-contrast
+T1-weighted fat-saturated breast MRI to their corresponding first DCE-MRI
+sequence leveraging the capabilities of a generative adversarial network (GAN).
+Additionally, we introduce a Scaled Aggregate Measure (SAMe) designed for
+quantitatively evaluating the quality of synthetic data in a principled manner
+and serving as a basis for selecting the optimal generative model. We assess
+the generated DCE-MRI data using quantitative image quality metrics and apply
+them to the downstream task of 3D breast tumour segmentation. Our results
+highlight the potential of post-contrast DCE-MRI synthesis in enhancing the
+robustness of breast tumour segmentation models via data augmentation. Our code
+is available at https://github.com/RichardObi/pre_post_synthesis.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted as oral presentation at SPIE Medical Imaging 2024 (Image
+  Processing)</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Primal Dual Continual Learning: Balancing Stability and Plasticity
+  through Adaptive Memory Allocation 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2310.00154v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2310.00154v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Juan Elenter, Navid NaderiAlizadeh, Tara Javidi, Alejandro Ribeiro
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Continual learning is inherently a constrained learning problem. The goal is
+to learn a predictor under a no-forgetting requirement. Although several prior
+studies formulate it as such, they do not solve the constrained problem
+explicitly. In this work, we show that it is both possible and beneficial to
+undertake the constrained optimization problem directly. To do this, we
+leverage recent results in constrained learning through Lagrangian duality. We
+focus on memory-based methods, where a small subset of samples from previous
+tasks can be stored in a replay buffer. In this setting, we analyze two
+versions of the continual learning problem: a coarse approach with constraints
+at the task level and a fine approach with constraints at the sample level. We
+show that dual variables indicate the sensitivity of the optimal value of the
+continual learning problem with respect to constraint perturbations. We then
+leverage this result to partition the buffer in the coarse approach, allocating
+more resources to harder tasks, and to populate the buffer in the fine
+approach, including only impactful samples. We derive a deviation bound on dual
+variables as sensitivity indicators, and empirically corroborate this result in
+diverse continual learning benchmarks. We also discuss the limitations of these
+methods with respect to the amount of memory available and the expressiveness
+of the parametrization.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ eXponential FAmily Dynamical Systems (XFADS): Large-scale nonlinear
+  Gaussian state-space modeling 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2403.01371v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2403.01371v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Matthew Dowling, Yuan Zhao, Il Memming Park
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  State-space graphical models and the variational autoencoder framework
+provide a principled apparatus for learning dynamical systems from data.
+State-of-the-art probabilistic approaches are often able to scale to large
+problems at the cost of flexibility of the variational posterior or
+expressivity of the dynamics model. However, those consolidations can be
+detrimental if the ultimate goal is to learn a generative model capable of
+explaining the spatiotemporal structure of the data and making accurate
+forecasts. We introduce a low-rank structured variational autoencoding
+framework for nonlinear Gaussian state-space graphical models capable of
+capturing dense covariance structures that are important for learning dynamical
+systems with predictive capabilities. Our inference algorithm exploits the
+covariance structures that arise naturally from sample based approximate
+Gaussian message passing and low-rank amortized posterior updates --
+effectively performing approximate variational smoothing with time complexity
+scaling linearly in the state dimensionality. In comparisons with other deep
+state-space model architectures our approach consistently demonstrates the
+ability to learn a more predictive generative model. Furthermore, when applied
+to neural physiological recordings, our approach is able to learn a dynamical
+system capable of forecasting population spiking and behavioral correlates from
+a small portion of single trials.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Online Cascade Learning for Efficient Inference over Streams <span class="chip">ICML 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2402.04513v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2402.04513v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Lunyiu Nie, Zhimin Ding, Erdong Hu, Christopher Jermaine, Swarat Chaudhuri
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Large Language Models (LLMs) have a natural role in answering complex queries
+about data streams, but the high computational cost of LLM inference makes them
+infeasible in many such tasks. We propose online cascade learning, the first
+approach to address this challenge. The objective here is to learn a "cascade"
+of models, starting with lower-capacity models (such as logistic regression)
+and ending with a powerful LLM, along with a deferral policy that determines
+the model to be used on a given input. We formulate the task of learning
+cascades online as an imitation-learning problem, where smaller models are
+updated over time imitating the collected LLM demonstrations, and give a
+no-regret algorithm for the problem. Experimental results across four
+benchmarks show that our method parallels LLMs in accuracy while cutting down
+inference costs by as much as 90% with strong robustness against input
+distribution shifts, underscoring its efficacy and adaptability in stream
+processing.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>ICML 2024 Main Conference Paper</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Zipper: A Multi-Tower Decoder Architecture for Fusing Modalities <span class="chip">NeurIPS</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.18669v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.18669v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Vicky Zayats, Peter Chen, Melissa Ferrari, Dirk Padfield
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Integrating multiple generative foundation models, especially those trained
+on different modalities, into something greater than the sum of its parts poses
+significant challenges. Two key hurdles are the availability of aligned data
+(concepts that contain similar meaning but is expressed differently in
+different modalities), and effectively leveraging unimodal representations in
+cross-domain generative tasks, without compromising their original unimodal
+capabilities.
+  We propose Zipper, a multi-tower decoder architecture that addresses these
+concerns by using cross-attention to flexibly compose multimodal generative
+models from independently pre-trained unimodal decoders. In our experiments
+fusing speech and text modalities, we show the proposed architecture performs
+very competitively in scenarios with limited aligned text-speech data. We also
+showcase the flexibility of our model to selectively maintain unimodal (e.g.,
+text-to-text generation) generation performance by freezing the corresponding
+modal tower (e.g. text). In cross-modal tasks such as automatic speech
+recognition (ASR) where the output modality is text, we show that freezing the
+text backbone results in negligible performance degradation. In cross-modal
+tasks such as text-to-speech generation (TTS) where the output modality is
+speech, we show that using a pre-trained speech backbone results in superior
+performance to the baseline.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Under review at NeurIPS</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Learning to Model the World with Language <span class="chip">ICML 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2308.01399v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2308.01399v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Jessy Lin, Yuqing Du, Olivia Watkins, Danijar Hafner, Pieter Abbeel, Dan Klein, Anca Dragan
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  To interact with humans and act in the world, agents need to understand the
+range of language that people use and relate it to the visual world. While
+current agents can learn to execute simple language instructions, we aim to
+build agents that leverage diverse language -- language like "this button turns
+on the TV" or "I put the bowls away" -- that conveys general knowledge,
+describes the state of the world, provides interactive feedback, and more. Our
+key idea is that agents should interpret such diverse language as a signal that
+helps them predict the future: what they will observe, how the world will
+behave, and which situations will be rewarded. This perspective unifies
+language understanding with future prediction as a powerful self-supervised
+learning objective. We instantiate this in Dynalang, an agent that learns a
+multimodal world model to predict future text and image representations, and
+learns to act from imagined model rollouts. While current methods that learn
+language-conditioned policies degrade in performance with more diverse types of
+language, we show that Dynalang learns to leverage environment descriptions,
+game rules, and instructions to excel on tasks ranging from game-playing to
+navigating photorealistic home scans. Finally, we show that our method enables
+additional capabilities due to learning a generative model: Dynalang can be
+pretrained on text-only data, enabling learning from offline datasets, and
+generate language grounded in an environment.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>ICML 2024. Website: https://dynalang.github.io/</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ GUIDE: Guidance-based Incremental Learning with Diffusion Models 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2403.03938v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2403.03938v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Bartosz Cywiński, Kamil Deja, Tomasz Trzciński, Bartłomiej Twardowski, Łukasz Kuciński
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  We introduce GUIDE, a novel continual learning approach that directs
+diffusion models to rehearse samples at risk of being forgotten. Existing
+generative strategies combat catastrophic forgetting by randomly sampling
+rehearsal examples from a generative model. Such an approach contradicts
+buffer-based approaches where sampling strategy plays an important role. We
+propose to bridge this gap by incorporating classifier guidance into the
+diffusion process to produce rehearsal examples specifically targeting
+information forgotten by a continuously trained model. This approach enables
+the generation of samples from preceding task distributions, which are more
+likely to be misclassified in the context of recently encountered classes. Our
+experimental results show that GUIDE significantly reduces catastrophic
+forgetting, outperforming conventional random sampling approaches and
+surpassing recent state-of-the-art methods in continual learning with
+generative replay.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Bayesian Program Learning by Decompiling Amortized Knowledge 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2306.07856v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2306.07856v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Alessandro B. Palmarini, Christopher G. Lucas, N. Siddharth
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  DreamCoder is an inductive program synthesis system that, whilst solving
+problems, learns to simplify search in an iterative wake-sleep procedure. The
+cost of search is amortized by training a neural search policy, reducing search
+breadth and effectively "compiling" useful information to compose program
+solutions across tasks. Additionally, a library of program components is learnt
+to compress and express discovered solutions in fewer components, reducing
+search depth. We present a novel approach for library learning that directly
+leverages the neural search policy, effectively "decompiling" its amortized
+knowledge to extract relevant program components. This provides stronger
+amortized inference: the amortized knowledge learnt to reduce search breadth is
+now also used to reduce search depth. We integrate our approach with DreamCoder
+and demonstrate faster domain proficiency with improved generalization on a
+range of domains, particularly when fewer example solutions are available.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Unity by Diversity: Improved Representation Learning in Multimodal VAEs 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2403.05300v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2403.05300v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Thomas M. Sutter, Yang Meng, Andrea Agostini, Daphné Chopard, Norbert Fortin, Julia E. Vogt, Bahbak Shahbaba, Stephan Mandt
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Variational Autoencoders for multimodal data hold promise for many tasks in
+data analysis, such as representation learning, conditional generation, and
+imputation. Current architectures either share the encoder output, decoder
+input, or both across modalities to learn a shared representation. Such
+architectures impose hard constraints on the model. In this work, we show that
+a better latent representation can be obtained by replacing these hard
+constraints with a soft constraint. We propose a new mixture-of-experts prior,
+softly guiding each modality's latent representation towards a shared aggregate
+posterior. This approach results in a superior latent representation and allows
+each encoding to preserve information better from its uncompressed original
+features. In extensive experiments on multiple benchmark datasets and two
+challenging real-world datasets, we show improved learned latent
+representations and imputation of missing data modalities compared to existing
+methods.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Enhancing Deep Traffic Forecasting Models with Dynamic Regression 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2301.06650v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2301.06650v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Vincent Zhihao Zheng, Seongjin Choi, Lijun Sun
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Deep learning models for traffic forecasting often assume the residual is
+independent and isotropic across time and space. This assumption simplifies
+loss functions such as mean absolute error, but real-world residual processes
+often exhibit significant autocorrelation and structured spatiotemporal
+correlation. This paper introduces a dynamic regression (DR) framework to
+enhance existing spatiotemporal traffic forecasting models by incorporating
+structured learning for the residual process. We assume the residual of the
+base model (i.e., a well-developed traffic forecasting model) follows a
+matrix-variate seasonal autoregressive (AR) model, which is seamlessly
+integrated into the training process through the redesign of the loss function.
+Importantly, the parameters of the DR framework are jointly optimized alongside
+the base model. We evaluate the effectiveness of the proposed framework on
+state-of-the-art (SOTA) deep traffic forecasting models using both speed and
+flow datasets, demonstrating improved performance and providing interpretable
+AR coefficients and spatiotemporal covariance matrices.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ CARTE: <span class="highlight-title">Pretrain</span>ing and Transfer for Tabular Learning 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2402.16785v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2402.16785v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Myung Jun Kim, Léo Grinsztajn, Gaël Varoquaux
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Pretrained deep-learning models are the go-to solution for images or text.
+However, for tabular data the standard is still to train tree-based models.
+Indeed, transfer learning on tables hits the challenge of data integration:
+finding correspondences, correspondences in the entries (entity matching) where
+different words may denote the same entity, correspondences across columns
+(schema matching), which may come in different orders, names... We propose a
+neural architecture that does not need such correspondences. As a result, we
+can pretrain it on background data that has not been matched. The architecture
+-- CARTE for Context Aware Representation of Table Entries -- uses a graph
+representation of tabular (or relational) data to process tables with different
+columns, string embedding of entries and columns names to model an open
+vocabulary, and a graph-attentional network to contextualize entries with
+column names and neighboring entries. An extensive benchmark shows that CARTE
+facilitates learning, outperforming a solid set of baselines including the best
+tree-based models. CARTE also enables joint learning across tables with
+unmatched columns, enhancing a small table with bigger ones. CARTE opens the
+door to large pretrained models for tabular data.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Intelligent and Miniaturized Neural Interfaces: An Emerging Era in
+  Neurotechnology 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.10780v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.10780v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Mahsa Shoaran, Uisub Shin, MohammadAli Shaeri
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Integrating smart algorithms on neural devices presents significant
+opportunities for various brain disorders. In this paper, we review the latest
+advancements in the development of three categories of intelligent neural
+prostheses featuring embedded signal processing on the implantable or wearable
+device. These include: 1) Neural interfaces for closed-loop symptom tracking
+and responsive stimulation; 2) Neural interfaces for emerging network-related
+conditions, such as psychiatric disorders; and 3) Intelligent BMI SoCs for
+movement recovery following paralysis.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ EvoluNet: Advancing Dynamic Non-IID Transfer Learning on Graphs <span class="chip">ICML 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2305.00664v5">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2305.00664v5.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Haohui Wang, Yuzhen Mao, Yujun Yan, Yaoqing Yang, Jianhui Sun, Kevin Choi, Balaji Veeramani, Alison Hu, Edward Bowen, Tyler Cody, Dawei Zhou
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Non-IID transfer learning on graphs is crucial in many high-stakes domains.
+The majority of existing works assume stationary distribution for both source
+and target domains. However, real-world graphs are intrinsically dynamic,
+presenting challenges in terms of domain evolution and dynamic discrepancy
+between source and target domains. To bridge the gap, we shift the problem to
+the dynamic setting and pose the question: given the label-rich source graphs
+and the label-scarce target graphs both observed in previous T timestamps, how
+can we effectively characterize the evolving domain discrepancy and optimize
+the generalization performance of the target domain at the incoming T+1
+timestamp? To answer it, we propose a generalization bound for dynamic non-IID
+transfer learning on graphs, which implies the generalization performance is
+dominated by domain evolution and domain discrepancy between source and target
+graphs. Inspired by the theoretical results, we introduce a novel generic
+framework named EvoluNet. It leverages a transformer-based temporal encoding
+module to model temporal information of the evolving domains and then uses a
+dynamic domain unification module to efficiently learn domain-invariant
+representations across the source and target domains. Finally, EvoluNet
+outperforms the state-of-the-art models by up to 12.1%, demonstrating its
+effectiveness in transferring knowledge from dynamic source graphs to dynamic
+target graphs.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted at ICML 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ TIC-TAC: A Framework for Improved Covariance Estimation in Deep
+  Heteroscedastic Regression <span class="chip">ICML 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2310.18953v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2310.18953v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Megh Shukla, Mathieu Salzmann, Alexandre Alahi
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Deep heteroscedastic regression involves jointly optimizing the mean and
+covariance of the predicted distribution using the negative log-likelihood.
+However, recent works show that this may result in sub-optimal convergence due
+to the challenges associated with covariance estimation. While the literature
+addresses this by proposing alternate formulations to mitigate the impact of
+the predicted covariance, we focus on improving the predicted covariance
+itself. We study two questions: (1) Does the predicted covariance truly capture
+the randomness of the predicted mean? (2) In the absence of supervision, how
+can we quantify the accuracy of covariance estimation? We address (1) with a
+Taylor Induced Covariance (TIC), which captures the randomness of the predicted
+mean by incorporating its gradient and curvature through the second order
+Taylor polynomial. Furthermore, we tackle (2) by introducing a Task Agnostic
+Correlations (TAC) metric, which combines the notion of correlations and
+absolute error to evaluate the covariance. We evaluate TIC-TAC across multiple
+experiments spanning synthetic and real-world datasets. Our results show that
+not only does TIC accurately learn the covariance, it additionally facilitates
+an improved convergence of the negative log-likelihood. Our code is available
+at https://github.com/vita-epfl/TIC-TAC
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>ICML 2024. Please feel free to provide feedback!</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Multivariate Probabilistic Time Series Forecasting with Correlated
+  Errors 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2402.01000v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2402.01000v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Vincent Zhihao Zheng, Lijun Sun
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Accurately modeling the correlation structure of errors is essential for
+reliable uncertainty quantification in probabilistic time series forecasting.
+Recent deep learning models for multivariate time series have developed
+efficient parameterizations for time-varying contemporaneous covariance, but
+they often assume temporal independence of errors for simplicity. However,
+real-world data frequently exhibit significant error autocorrelation and
+cross-lag correlation due to factors such as missing covariates. In this paper,
+we present a plug-and-play method that learns the covariance structure of
+errors over multiple steps for autoregressive models with Gaussian-distributed
+errors. To achieve scalable inference and computational efficiency, we model
+the contemporaneous covariance using a low-rank-plus-diagonal parameterization
+and characterize cross-covariance through a group of independent latent
+temporal processes. The learned covariance matrix can be used to calibrate
+predictions based on observed residuals. We evaluate our method on
+probabilistic models built on RNN and Transformer architectures, and the
+results confirm the effectiveness of our approach in enhancing predictive
+accuracy and uncertainty quantification without significantly increasing the
+parameter size.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>This paper extends the work presented in arXiv:2305.17028 to a
+  multivariate setting</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Accuracy Booster: Enabling 4-bit Fixed-point Arithmetic for DNN Training 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2211.10737v4">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2211.10737v4.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Simla Burcu Harma, Ayan Chakraborty, Nicholas Sperry, Babak Falsafi, Martin Jaggi, Yunho Oh
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  The unprecedented demand for computing resources to train DNN models has led
+to a search for minimal numerical encoding. Recent state-of-the-art (SOTA)
+proposals advocate for multi-level scaled narrow bitwidth numerical formats. In
+this paper, we show that single-level scaling is sufficient to maintain
+training accuracy while maximizing arithmetic density. We identify a previously
+proposed single-level scaled format for 8-bit training, Hybrid Block Floating
+Point (HBFP), as the optimal candidate to minimize. We perform a full-scale
+exploration of the HBFP design space using mathematical tools to study the
+interplay among various parameters and identify opportunities for even smaller
+encodings across layers and epochs. Based on our findings, we propose Accuracy
+Booster, a mixed-mantissa HBFP technique that uses 4-bit mantissas for over 99%
+of all arithmetic operations in training and 6-bit mantissas only in the last
+epoch and first/last layers. We show Accuracy Booster enables increasing
+arithmetic density over all other SOTA formats by at least 2.3x while achieving
+state-of-the-art accuracies in 4-bit training.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ CycleFormer : TSP Solver Based on Language Modeling 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20042v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20042v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Jieun Yook, Junpyo Seo, Joon Huh, Han Joon Byun, Byung-ro Mooon
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  We propose a new transformer model for the Traveling Salesman Problem (TSP)
+called CycleFormer. We identified distinctive characteristics that need to be
+considered when applying a conventional transformer model to TSP and aimed to
+fully incorporate these elements into the TSP-specific transformer. Unlike the
+token sets in typical language models, which are limited and static, the token
+(node) set in TSP is unlimited and dynamic. To exploit this fact to the
+fullest, we equated the encoder output with the decoder linear layer and
+directly connected the context vector of the encoder to the decoder encoding.
+Additionally, we added a positional encoding to the encoder tokens that
+reflects the two-dimensional nature of TSP, and devised a circular positional
+encoding for the decoder tokens that considers the cyclic properties of a tour.
+By incorporating these ideas, CycleFormer outperforms state-of-the-art (SOTA)
+transformer models for TSP from TSP-50 to TSP-500. Notably, on TSP-500, the
+optimality gap was reduced by approximately 2.8 times, from 3.09% to 1.10%,
+compared to the existing SOTA. The code will be made available at
+https://github.com/Giventicket/CycleFormer.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ The Common Intuition to Transfer Learning Can Win or Lose: Case Studies
+  for Linear Regression 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2103.05621v4">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2103.05621v4.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Yehuda Dar, Daniel LeJeune, Richard G. Baraniuk
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  We study a fundamental transfer learning process from source to target linear
+regression tasks, including overparameterized settings where there are more
+learned parameters than data samples. The target task learning is addressed by
+using its training data together with the parameters previously computed for
+the source task. We define a transfer learning approach to the target task as a
+linear regression optimization with a regularization on the distance between
+the to-be-learned target parameters and the already-learned source parameters.
+We analytically characterize the generalization performance of our transfer
+learning approach and demonstrate its ability to resolve the peak in
+generalization errors in double descent phenomena of the minimum L2-norm
+solution to linear regression. Moreover, we show that for sufficiently related
+tasks, the optimally tuned transfer learning approach can outperform the
+optimally tuned ridge regression method, even when the true parameter vector
+conforms to an isotropic Gaussian prior distribution. Namely, we demonstrate
+that transfer learning can beat the minimum mean square error (MMSE) solution
+of the independent target task. Our results emphasize the ability of transfer
+learning to extend the solution space to the target task and, by that, to have
+an improved MMSE solution. We formulate the linear MMSE solution to our
+transfer learning setting and point out its key differences from the common
+design philosophy to transfer learning.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Gameplay Filters: Safe Robot Walking through Adversarial Imagination 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.00846v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.00846v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Duy P. Nguyen, Kai-Chieh Hsu, Wenhao Yu, Jie Tan, Jaime F. Fisac
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Ensuring the safe operation of legged robots in uncertain, novel environments
+is crucial to their widespread adoption. Despite recent advances in safety
+filters that can keep arbitrary task-driven policies from incurring safety
+failures, existing solutions for legged robot locomotion still rely on
+simplified dynamics and may fail when the robot is perturbed away from
+predefined stable gaits. This paper presents a general approach that leverages
+offline game-theoretic reinforcement learning to synthesize a highly robust
+safety filter for high-order nonlinear dynamics. This gameplay filter then
+maintains runtime safety by continually simulating adversarial futures and
+precluding task-driven actions that would cause it to lose future games (and
+thereby violate safety). Validated on a 36-dimensional quadruped robot
+locomotion task, the gameplay safety filter exhibits inherent robustness to the
+sim-to-real gap without manual tuning or heuristic designs. Physical
+experiments demonstrate the effectiveness of the gameplay safety filter under
+perturbations, such as tugging and unmodeled irregular terrains, while
+simulation studies shed light on how to trade off computation and
+conservativeness without compromising safety.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ IncomeSCM: From tabular data set to time-series simulator and causal
+  estimation benchmark 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.16069v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.16069v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Fredrik D. Johansson
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Evaluating observational estimators of causal effects demands information
+that is rarely available: unconfounded interventions and outcomes from the
+population of interest, created either by randomization or adjustment. As a
+result, it is customary to fall back on simulators when creating benchmark
+tasks. Simulators offer great control but are often too simplistic to make
+challenging tasks, either because they are hand-designed and lack the nuances
+of real-world data, or because they are fit to observational data without
+structural constraints. In this work, we propose a general, repeatable strategy
+for turning observational data into sequential structural causal models and
+challenging estimation tasks by following two simple principles: 1) fitting
+real-world data where possible, and 2) creating complexity by composing simple,
+hand-designed mechanisms. We implement these ideas in a highly configurable
+software package and apply it to the well-known Adult income data set to
+construct the \tt IncomeSCM simulator. From this, we devise multiple estimation
+tasks and sample data sets to compare established estimators of causal effects.
+The tasks present a suitable challenge, with effect estimates varying greatly
+in quality between methods, despite similar performance in the modeling of
+factual outcomes, highlighting the need for dedicated causal estimators and
+model selection criteria.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Attention-aware Semantic Communications for Collaborative Inference 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2404.07217v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2404.07217v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Jiwoong Im, Nayoung Kwon, Taewoo Park, Jiheon Woo, Jaeho Lee, Yongjune Kim
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  We propose a communication-efficient collaborative inference framework in the
+domain of edge inference, focusing on the efficient use of vision transformer
+(ViT) models. The partitioning strategy of conventional collaborative inference
+fails to reduce communication cost because of the inherent architecture of ViTs
+maintaining consistent layer dimensions across the entire transformer encoder.
+Therefore, instead of employing the partitioning strategy, our framework
+utilizes a lightweight ViT model on the edge device, with the server deploying
+a complicated ViT model. To enhance communication efficiency and achieve the
+classification accuracy of the server model, we propose two strategies: 1)
+attention-aware patch selection and 2) entropy-aware image transmission.
+Attention-aware patch selection leverages the attention scores generated by the
+edge device's transformer encoder to identify and select the image patches
+critical for classification. This strategy enables the edge device to transmit
+only the essential patches to the server, significantly improving communication
+efficiency. Entropy-aware image transmission uses min-entropy as a metric to
+accurately determine whether to depend on the lightweight model on the edge
+device or to request the inference from the server model. In our framework, the
+lightweight ViT model on the edge device acts as a semantic encoder,
+efficiently identifying and selecting the crucial image information required
+for the classification task. Our experiments demonstrate that the proposed
+collaborative inference framework can reduce communication overhead by 68% with
+only a minimal loss in accuracy compared to the server model on the ImageNet
+dataset.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Interpretable Knowledge Tracing via Response Influence-based
+  Counterfactual Reasoning <span class="chip">ICDE'24</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2312.10045v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2312.10045v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Jiajun Cui, Minghe Yu, Bo Jiang, Aimin Zhou, Jianyong Wang, Wei Zhang
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Knowledge tracing (KT) plays a crucial role in computer-aided education and
+intelligent tutoring systems, aiming to assess students' knowledge proficiency
+by predicting their future performance on new questions based on their past
+response records. While existing deep learning knowledge tracing (DLKT) methods
+have significantly improved prediction accuracy and achieved state-of-the-art
+results, they often suffer from a lack of interpretability. To address this
+limitation, current approaches have explored incorporating psychological
+influences to achieve more explainable predictions, but they tend to overlook
+the potential influences of historical responses. In fact, understanding how
+models make predictions based on response influences can enhance the
+transparency and trustworthiness of the knowledge tracing process, presenting
+an opportunity for a new paradigm of interpretable KT. However, measuring
+unobservable response influences is challenging. In this paper, we resort to
+counterfactual reasoning that intervenes in each response to answer
+\textit{what if a student had answered a question incorrectly that he/she
+actually answered correctly, and vice versa}. Based on this, we propose RCKT, a
+novel response influence-based counterfactual knowledge tracing framework. RCKT
+generates response influences by comparing prediction outcomes from factual
+sequences and constructed counterfactual sequences after interventions.
+Additionally, we introduce maximization and inference techniques to leverage
+accumulated influences from different past responses, further improving the
+model's performance and credibility. Extensive experimental results demonstrate
+that our RCKT method outperforms state-of-the-art knowledge tracing methods on
+four datasets against six baselines, and provides credible interpretations of
+response influences.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>ICDE'24 (fixing a few typos). Source code at
+  https://github.com/JJCui96/RCKT. Keywords: knowledge tracing, interpretable
+  machine learning, counterfactual reasoning, artificial intelligence for
+  education</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Deciphering RNA Secondary Structure Prediction: A Probabilistic K-Rook
+  Matching Perspective <span class="chip">ICML 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2212.14041v4">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2212.14041v4.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Cheng Tan, Zhangyang Gao, Hanqun Cao, Xingran Chen, Ge Wang, Lirong Wu, Jun Xia, Jiangbin Zheng, Stan Z. Li
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  The secondary structure of ribonucleic acid (RNA) is more stable and
+accessible in the cell than its tertiary structure, making it essential for
+functional prediction. Although deep learning has shown promising results in
+this field, current methods suffer from poor generalization and high
+complexity. In this work, we reformulate the RNA secondary structure prediction
+as a K-Rook problem, thereby simplifying the prediction process into
+probabilistic matching within a finite solution space. Building on this
+innovative perspective, we introduce RFold, a simple yet effective method that
+learns to predict the most matching K-Rook solution from the given sequence.
+RFold employs a bi-dimensional optimization strategy that decomposes the
+probabilistic matching problem into row-wise and column-wise components to
+reduce the matching complexity, simplifying the solving process while
+guaranteeing the validity of the output. Extensive experiments demonstrate that
+RFold achieves competitive performance and about eight times faster inference
+efficiency than the state-of-the-art approaches. The code and Colab demo are
+available in
+\href{http://github.com/A4Bio/RFold}{http://github.com/A4Bio/RFold}.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted by ICML 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ BioT5+: Towards Generalized Biological Understanding with IUPAC
+  Integration and Multi-task Tuning <span class="chip">ACL 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2402.17810v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2402.17810v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Qizhi Pei, Lijun Wu, Kaiyuan Gao, Xiaozhuan Liang, Yin Fang, Jinhua Zhu, Shufang Xie, Tao Qin, Rui Yan
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Recent research trends in computational biology have increasingly focused on
+integrating text and bio-entity modeling, especially in the context of
+molecules and proteins. However, previous efforts like BioT5 faced challenges
+in generalizing across diverse tasks and lacked a nuanced understanding of
+molecular structures, particularly in their textual representations (e.g.,
+IUPAC). This paper introduces BioT5+, an extension of the BioT5 framework,
+tailored to enhance biological research and drug discovery. BioT5+ incorporates
+several novel features: integration of IUPAC names for molecular understanding,
+inclusion of extensive bio-text and molecule data from sources like bioRxiv and
+PubChem, the multi-task instruction tuning for generality across tasks, and a
+numerical tokenization technique for improved processing of numerical data.
+These enhancements allow BioT5+ to bridge the gap between molecular
+representations and their textual descriptions, providing a more holistic
+understanding of biological entities, and largely improving the grounded
+reasoning of bio-text and bio-sequences. The model is pre-trained and
+fine-tuned with a large number of experiments, including \emph{3 types of
+problems (classification, regression, generation), 15 kinds of tasks, and 21
+total benchmark datasets}, demonstrating the remarkable performance and
+state-of-the-art results in most cases. BioT5+ stands out for its ability to
+capture intricate relationships in biological data, thereby contributing
+significantly to bioinformatics and computational biology. Our code is
+available at \url{https://github.com/QizhiPei/BioT5}.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted by ACL 2024 (Findings)</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Multilinear Mixture of Experts: Scalable Expert Specialization through
+  Factorization 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2402.12550v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2402.12550v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        James Oldfield, Markos Georgopoulos, Grigorios G. Chrysos, Christos Tzelepis, Yannis Panagakis, Mihalis A. Nicolaou, Jiankang Deng, Ioannis Patras
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  The Mixture of Experts (MoE) paradigm provides a powerful way to decompose
+dense layers into smaller, modular computations often more amenable to human
+interpretation, debugging, and editability. However, a major challenge lies in
+the computational cost of scaling the number of experts high enough to achieve
+fine-grained specialization. In this paper, we propose the Multilinear Mixture
+of Experts ($\mu$MoE) layer to address this, focusing on vision models.
+$\mu$MoE layers enable scalable expert specialization by performing an implicit
+computation on prohibitively large weight tensors entirely in factorized form.
+Consequently, $\mu$MoEs (1) avoid the restrictively high inference-time costs
+of 'soft' MoEs, yet (2) do not inherit the training issues of the popular
+'sparse' MoEs' discrete (non-differentiable) expert routing. We present both
+qualitative and quantitative evidence that scaling $\mu$MoE layers when
+fine-tuning foundation models for vision tasks leads to more specialized
+experts at the class-level, further enabling manual bias correction in CelebA
+attribute classification. Finally, we show qualitative results demonstrating
+the expert specialism achieved when pre-training large GPT2 and MLP-Mixer
+models with parameter-matched $\mu$MoE blocks at every layer, maintaining
+comparable accuracy. Our code is available at:
+https://github.com/james-oldfield/muMoE.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Github: https://github.com/james-oldfield/muMoE. Project page:
+  https://james-oldfield.github.io/muMoE/</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Dynamic Context Pruning for Efficient and Interpretable Autoregressive
+  <span class="highlight-title">Transformer</span>s 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2305.15805v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2305.15805v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Sotiris Anagnostidis, Dario Pavllo, Luca Biggio, Lorenzo Noci, Aurelien Lucchi, Thomas Hofmann
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Autoregressive Transformers adopted in Large Language Models (LLMs) are hard
+to scale to long sequences. Despite several works trying to reduce their
+computational cost, most of LLMs still adopt attention layers between all pairs
+of tokens in the sequence, thus incurring a quadratic cost. In this study, we
+present a novel approach that dynamically prunes contextual information while
+preserving the model's expressiveness, resulting in reduced memory and
+computational requirements during inference. Our method employs a learnable
+mechanism that determines which uninformative tokens can be dropped from the
+context at any point across the generation process. By doing so, our approach
+not only addresses performance concerns but also enhances interpretability,
+providing valuable insight into the model's decision-making process. Our
+technique can be applied to existing pre-trained models through a
+straightforward fine-tuning process, and the pruning strength can be specified
+by a sparsity parameter. Notably, our empirical findings demonstrate that we
+can effectively prune up to 80\% of the context without significant performance
+degradation on downstream tasks, offering a valuable tool for mitigating
+inference costs. Our reference implementation achieves up to $2\times$ increase
+in inference throughput and even greater memory savings.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Online <span class="highlight-title">Prompt</span> Pricing based on Combinatorial Multi-Armed Bandit and
+  Hierarchical Stackelberg Game 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.15154v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.15154v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Meiling Li, Hongrun Ren, Haixu Xiong, Zhenxing Qian, Xinpeng Zhang
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Generation models have shown promising performance in various tasks, making
+trading around machine learning models possible. In this paper, we aim at a
+novel prompt trading scenario, prompt bundle trading (PBT) system, and propose
+an online pricing mechanism. Based on the combinatorial multi-armed bandit
+(CMAB) and three-stage hierarchical Stackelburg (HS) game, our pricing
+mechanism considers the profits of the consumer, platform, and seller,
+simultaneously achieving the profit satisfaction of these three participants.
+We break down the pricing issue into two steps, namely unknown category
+selection and incentive strategy optimization. The former step is to select a
+set of categories with the highest qualities, and the latter is to derive the
+optimal strategy for each participant based on the chosen categories. Unlike
+the existing fixed pricing mode, the PBT pricing mechanism we propose is more
+flexible and diverse, which is more in accord with the transaction needs of
+real-world scenarios. We test our method on a simulated text-to-image dataset.
+The experimental results demonstrate the effectiveness of our algorithm, which
+provides a feasible price-setting standard for the prompt marketplaces.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Performative Reinforcement Learning in Gradually Shifting Environments 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2402.09838v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2402.09838v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Ben Rank, Stelios Triantafyllou, Debmalya Mandal, Goran Radanovic
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  When Reinforcement Learning (RL) agents are deployed in practice, they might
+impact their environment and change its dynamics. We propose a new framework to
+model this phenomenon, where the current environment depends on the deployed
+policy as well as its previous dynamics. This is a generalization of
+Performative RL (PRL) [Mandal et al., 2023]. Unlike PRL, our framework allows
+to model scenarios where the environment gradually adjusts to a deployed
+policy. We adapt two algorithms from the performative prediction literature to
+our setting and propose a novel algorithm called Mixed Delayed Repeated
+Retraining (MDRR). We provide conditions under which these algorithms converge
+and compare them using three metrics: number of retrainings, approximation
+guarantee, and number of samples per deployment. MDRR is the first algorithm in
+this setting which combines samples from multiple deployments in its training.
+This makes MDRR particularly suitable for scenarios where the environment's
+response strongly depends on its previous dynamics, which are common in
+practice. We experimentally compare the algorithms using a simulation-based
+testbed and our results show that MDRR converges significantly faster than
+previous approaches.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Equivariant Deep Weight Space Alignment <span class="chip">ICML 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2310.13397v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2310.13397v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Aviv Navon, Aviv Shamsian, Ethan Fetaya, Gal Chechik, Nadav Dym, Haggai Maron
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Permutation symmetries of deep networks make basic operations like model
+merging and similarity estimation challenging. In many cases, aligning the
+weights of the networks, i.e., finding optimal permutations between their
+weights, is necessary. Unfortunately, weight alignment is an NP-hard problem.
+Prior research has mainly focused on solving relaxed versions of the alignment
+problem, leading to either time-consuming methods or sub-optimal solutions. To
+accelerate the alignment process and improve its quality, we propose a novel
+framework aimed at learning to solve the weight alignment problem, which we
+name Deep-Align. To that end, we first prove that weight alignment adheres to
+two fundamental symmetries and then, propose a deep architecture that respects
+these symmetries. Notably, our framework does not require any labeled data. We
+provide a theoretical analysis of our approach and evaluate Deep-Align on
+several types of network architectures and learning setups. Our experimental
+results indicate that a feed-forward pass with Deep-Align produces better or
+equivalent alignments compared to those produced by current optimization
+algorithms. Additionally, our alignments can be used as an effective
+initialization for other methods, leading to improved solutions with a
+significant speedup in convergence.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>ICML 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ NLP Verification: Towards a General Methodology for Certifying
+  Robustness 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2403.10144v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2403.10144v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Marco Casadio, Tanvi Dinkar, Ekaterina Komendantskaya, Luca Arnaboldi, Matthew L. Daggitt, Omri Isac, Guy Katz, Verena Rieser, Oliver Lemon
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Deep neural networks have exhibited substantial success in the field of
+Natural Language Processing and ensuring their safety and reliability is
+crucial: there are safety critical contexts where such models must be robust to
+variability or attack, and give guarantees over their output. Unlike Computer
+Vision, NLP lacks a unified verification methodology and, despite recent
+advancements in literature, they are often light on the pragmatical issues of
+NLP verification. In this paper, we attempt to distil and evaluate general
+components of an NLP verification pipeline, that emerges from the progress in
+the field to date. Our contributions are two-fold. Firstly, we give a general
+(i.e. algorithm-independent) characterisation of verifiable subspaces that
+result from embedding sentences into continuous spaces. We identify, and give
+an effective method to deal with, the technical challenge of semantic
+generalisability of verified subspaces; and propose it as a standard metric in
+the NLP verification pipelines (alongside with the standard metrics of model
+accuracy and model verifiability). Secondly, we propose a general methodology
+to analyse the effect of the embedding gap -- a problem that refers to the
+discrepancy between verification of geometric subspaces, and the semantic
+meaning of sentences which the geometric subspaces are supposed to represent.
+In extreme cases, poor choices in embedding of sentences may invalidate
+verification results. We propose a number of practical NLP methods that can
+help to quantify the effects of the embedding gap; and in particular we propose
+the metric of falsifiability of semantic subspaces as another fundamental
+metric to be reported as part of the NLP verification pipeline. We believe that
+together these general principles pave the way towards a more consolidated and
+effective development of this new domain.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ A Tale of Tails: Model Collapse as a Change of Scaling Laws 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2402.07043v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2402.07043v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Elvis Dohmatob, Yunzhen Feng, Pu Yang, Francois Charton, Julia Kempe
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  As AI model size grows, neural scaling laws have become a crucial tool to
+predict the improvements of large models when increasing capacity and the size
+of original (human or natural) training data. Yet, the widespread use of
+popular models means that the ecosystem of online data and text will co-evolve
+to progressively contain increased amounts of synthesized data. In this paper
+we ask: How will the scaling laws change in the inevitable regime where
+synthetic data makes its way into the training corpus? Will future models,
+still improve, or be doomed to degenerate up to total (model) collapse? We
+develop a theoretical framework of model collapse through the lens of scaling
+laws. We discover a wide range of decay phenomena, analyzing loss of scaling,
+shifted scaling with number of generations, the ''un-learning" of skills, and
+grokking when mixing human and synthesized data. Our theory is validated by
+large-scale experiments with a transformer on an arithmetic task and text
+generation using the large language model Llama2.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ High-dimensional robust regression under heavy-tailed data: Asymptotics
+  and Universality 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2309.16476v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2309.16476v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Urte Adomaityte, Leonardo Defilippis, Bruno Loureiro, Gabriele Sicuro
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  We investigate the high-dimensional properties of robust regression
+estimators in the presence of heavy-tailed contamination of both the covariates
+and response functions. In particular, we provide a sharp asymptotic
+characterisation of M-estimators trained on a family of elliptical covariate
+and noise data distributions including cases where second and higher moments do
+not exist. We show that, despite being consistent, the Huber loss with
+optimally tuned location parameter $\delta$ is suboptimal in the
+high-dimensional regime in the presence of heavy-tailed noise, highlighting the
+necessity of further regularisation to achieve optimal performance. This result
+also uncovers the existence of a transition in $\delta$ as a function of the
+sample complexity and contamination. Moreover, we derive the decay rates for
+the excess risk of ridge regression. We show that, while it is both optimal and
+universal for covariate distributions with finite second moment, its decay rate
+can be considerably faster when the covariates' second moment does not exist.
+Finally, we show that our formulas readily generalise to a richer family of
+models and data distributions, such as generalised linear estimation with
+arbitrary convex regularisation trained on mixture models.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>13 pages + Supplementary information</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ FedSheafHN: Personalized Federated Learning on Graph-structured Data <span class="chip">ICML 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.16056v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.16056v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Wenfei Liang, Yanan Zhao, Rui She, Yiming Li, Wee Peng Tay
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Personalized subgraph Federated Learning (FL) is a task that customizes Graph
+Neural Networks (GNNs) to individual client needs, accommodating diverse data
+distributions. However, applying hypernetworks in FL, while aiming to
+facilitate model personalization, often encounters challenges due to inadequate
+representation of client-specific characteristics. To overcome these
+limitations, we propose a model called FedSheafHN, using enhanced collaboration
+graph embedding and efficient personalized model parameter generation.
+Specifically, our model embeds each client's local subgraph into a
+server-constructed collaboration graph. We utilize sheaf diffusion in the
+collaboration graph to learn client representations. Our model improves the
+integration and interpretation of complex client characteristics. Furthermore,
+our model ensures the generation of personalized models through advanced
+hypernetworks optimized for parallel operations across clients. Empirical
+evaluations demonstrate that FedSheafHN outperforms existing methods in most
+scenarios, in terms of client model performance on various graph-structured
+datasets. It also has fast model convergence and effective new clients
+generalization.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>This paper was submitted to ICML 2024 in Feb 2024. You can find a
+  record
+  here:https://github.com/CarrieWFF/ICML-2024-submission-recording/blob/main/Screenshot%20of%20FedSheafHN%20submission%20to%20ICML%202024.png</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Anatomical Region Recognition and Real-time Bone Tracking Methods by
+  Dynamically Decoding A-Mode Ultrasound Signals 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.19542v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.19542v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Bangyu Lan, Stefano Stramigioli, Kenan Niu
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Accurate bone tracking is crucial for kinematic analysis in orthopedic
+surgery and prosthetic robotics. Traditional methods (e.g., skin markers) are
+subject to soft tissue artifacts, and the bone pins used in surgery introduce
+the risk of additional trauma and infection. For electromyography (EMG), its
+inability to directly measure joint angles requires complex algorithms for
+kinematic estimation. To address these issues, A-mode ultrasound-based tracking
+has been proposed as a non-invasive and safe alternative. However, this
+approach suffers from limited accuracy in peak detection when processing
+received ultrasound signals. To build a precise and real-time bone tracking
+approach, this paper introduces a deep learning-based method for anatomical
+region recognition and bone tracking using A-mode ultrasound signals,
+specifically focused on the knee joint. The algorithm is capable of
+simultaneously performing bone tracking and identifying the anatomical region
+where the A-mode ultrasound transducer is placed. It contains the fully
+connection between all encoding and decoding layers of the cascaded U-Nets to
+focus only on the signal region that is most likely to have the bone peak, thus
+pinpointing the exact location of the peak and classifying the anatomical
+region of the signal. The experiment showed a 97% accuracy in the
+classification of the anatomical regions and a precision of around 0.5$\pm$1mm
+under dynamic tracking conditions for various anatomical areas surrounding the
+knee joint. In general, this approach shows great potential beyond the
+traditional method, in terms of the accuracy achieved and the recognition of
+the anatomical region where the ultrasound has been attached as an additional
+functionality.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Permutation Decision Trees 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2306.02617v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2306.02617v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Harikrishnan N B, Arham Jain, Nithin Nagaraj
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Decision Tree is a well understood Machine Learning model that is based on
+minimizing impurities in the internal nodes. The most common impurity measures
+are Shannon entropy and Gini impurity. These impurity measures are insensitive
+to the order of training data and hence the final tree obtained is invariant to
+any permutation of the data. This is a limitation in terms of modeling when
+there are temporal order dependencies between data instances. In this research,
+we propose the adoption of Effort-To-Compress (ETC) - a complexity measure, for
+the first time, as an alternative impurity measure. Unlike Shannon entropy and
+Gini impurity, structural impurity based on ETC is able to capture order
+dependencies in the data, thus obtaining potentially different decision trees
+for different permutations of the same data instances, a concept we term as
+Permutation Decision Trees (PDT). We then introduce the notion of Permutation
+Bagging achieved using permutation decision trees without the need for random
+feature selection and sub-sampling. We conduct a performance comparison between
+Permutation Decision Trees and classical decision trees across various
+real-world datasets, including Appendicitis, Breast Cancer Wisconsin, Diabetes
+Pima Indian, Ionosphere, Iris, Sonar, and Wine. Our findings reveal that PDT
+demonstrates comparable performance to classical decision trees across most
+datasets. Remarkably, in certain instances, PDT even slightly surpasses the
+performance of classical decision trees. In comparing Permutation Bagging with
+Random Forest, we attain comparable performance to Random Forest models
+consisting of 50 to 1000 trees, using merely 21 trees. This highlights the
+efficiency and effectiveness of Permutation Bagging in achieving comparable
+performance outcomes with significantly fewer trees.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>15 pages, 8 figures</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Simplifying <span class="highlight-title">Transformer</span> Blocks <span class="chip">ICLR 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2311.01906v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2311.01906v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Bobby He, Thomas Hofmann
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  A simple design recipe for deep Transformers is to compose identical building
+blocks. But standard transformer blocks are far from simple, interweaving
+attention and MLP sub-blocks with skip connections & normalisation layers in
+precise arrangements. This complexity leads to brittle architectures, where
+seemingly minor changes can significantly reduce training speed, or render
+models untrainable.
+  In this work, we ask to what extent the standard transformer block can be
+simplified? Combining signal propagation theory and empirical observations, we
+motivate modifications that allow many block components to be removed with no
+loss of training speed, including skip connections, projection or value
+parameters, sequential sub-blocks and normalisation layers. In experiments on
+both autoregressive decoder-only and BERT encoder-only models, our simplified
+transformers emulate the per-update training speed and performance of standard
+transformers, while enjoying 15% faster training throughput, and using 15%
+fewer parameters.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>ICLR 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Towards Climate Variable Prediction with Conditioned Spatio-Temporal
+  Normalizing Flows 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2311.06958v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2311.06958v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Christina Winkler, David Rolnick
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  This study investigates how conditional normalizing flows can be applied to
+remote sensing data products in climate science for spatio-temporal prediction.
+The method is chosen due to its desired properties such as exact likelihood
+computation, predictive uncertainty estimation and efficient inference and
+sampling which facilitates faster exploration of climate scenarios.
+Experimental findings reveal that the conditioned spatio-temporal flow
+surpasses both deterministic and stochastic baselines in prolonged rollout
+scenarios. It exhibits stable extrapolation beyond the training time horizon
+for extended rollout durations. These findings contribute valuable insights to
+the field of spatio-temporal modeling, with potential applications spanning
+diverse scientific disciplines.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>5 pages</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Visual Attention Analysis in Online Learning 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20091v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20091v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Miriam Navarro, Álvaro Becerra, Roberto Daza, Ruth Cobos, Aythami Morales, Julian Fierrez
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  In this paper, we present an approach in the Multimodal Learning Analytics
+field. Within this approach, we have developed a tool to visualize and analyze
+eye movement data collected during learning sessions in online courses. The
+tool is named VAAD (an acronym for Visual Attention Analysis Dashboard). These
+eye movement data have been gathered using an eye-tracker and subsequently
+processed and visualized for interpretation. The purpose of the tool is to
+conduct a descriptive analysis of the data by facilitating its visualization,
+enabling the identification of differences and learning patterns among various
+learner populations. Additionally, it integrates a predictive module capable of
+anticipating learner activities during a learning session. Consequently, VAAD
+holds the potential to offer valuable insights into online learning behaviors
+from both descriptive and predictive perspectives.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted in CEDI 2024 (VII Congreso Espa\~nol de Inform\'atica), A
+  Coru\~na, Spain</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Female mosquito detection by means of AI techniques inside release
+  containers in the context of a Sterile Insect Technique program 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2306.10843v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2306.10843v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Javier Naranjo-Alcazar, Jordi Grau-Haro, David Almenar, Pedro Zuccarello
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  The Sterile Insect Technique (SIT) is a biological pest control technique
+based on the release into the environment of sterile males of the insect
+species whose population is to be controlled. The entire SIT process involves
+mass-rearing within a biofactory, sorting of the specimens by sex,
+sterilization, and subsequent release of the sterile males into the
+environment. The reason for avoiding the release of female specimens is
+because, unlike males, females bite, with the subsequent risk of disease
+transmission. In the case of Aedes mosquito biofactories for SIT, the key point
+of the whole process is sex separation. This process is nowadays performed by a
+combination of mechanical devices and AI-based vision systems. However, there
+is still a possibility of false negatives, so a last stage of verification is
+necessary before releasing them into the environment. It is known that the
+sound produced by the flapping of adult male mosquitoes is different from that
+produced by females, so this feature can be used to detect the presence of
+females in containers prior to environmental release. This paper presents a
+study for the detection of females in Aedes mosquito release vessels for SIT
+programs. The containers used consist of PVC a tubular design of 8.8cm diameter
+and 12.5cm height. The containers were placed in an experimental setup that
+allowed the recording of the sound of mosquito flight inside of them. Each
+container was filled with 250 specimens considering the cases of (i) only male
+mosquitoes, (ii) only female mosquitoes, and (iii) 75% males and 25% females.
+Case (i) was used for training and testing, whereas cases (ii) and (iii) were
+used only for testing. Two algorithms were implemented for the detection of
+female mosquitoes: an unsupervised outlier detection algorithm (iForest) and a
+one-class SVM trained with male-only recordings.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted EUSIPCO 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Improved Out-of-Scope Intent Classification with Dual Encoding and
+  Threshold-based Re-Classification 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.19967v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.19967v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Hossam M. Zawbaa, Wael Rashwan, Sourav Dutta, Haytham Assem
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Detecting out-of-scope user utterances is essential for task-oriented
+dialogues and intent classification. Current methodologies face difficulties
+with the unpredictable distribution of outliers and often rely on assumptions
+about data distributions. We present the Dual Encoder for Threshold-Based
+Re-Classification (DETER) to address these challenges. This end-to-end
+framework efficiently detects out-of-scope intents without requiring
+assumptions on data distributions or additional post-processing steps. The core
+of DETER utilizes dual text encoders, the Universal Sentence Encoder (USE) and
+the Transformer-based Denoising AutoEncoder (TSDAE), to generate user utterance
+embeddings, which are classified through a branched neural architecture.
+Further, DETER generates synthetic outliers using self-supervision and
+incorporates out-of-scope phrases from open-domain datasets. This approach
+ensures a comprehensive training set for out-of-scope detection. Additionally,
+a threshold-based re-classification mechanism refines the model's initial
+predictions. Evaluations on the CLINC-150, Stackoverflow, and Banking77
+datasets demonstrate DETER's efficacy. Our model outperforms previous
+benchmarks, increasing up to 13% and 5% in F1 score for known and unknown
+intents on CLINC-150 and Stackoverflow, and 16% for known and 24% % for unknown
+intents on Banking77. The source code has been released at
+https://github.com/Hossam-Mohammed-tech/Intent_Classification_OOS.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Hardware-Efficient EMG Decoding for Next-Generation Hand Prostheses 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20052v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20052v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Mohammad Kalbasi, MohammadAli Shaeri, Vincent Alexandre Mendez, Solaiman Shokur, Silvestro Micera, Mahsa Shoaran
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Advancements in neural engineering have enabled the development of Robotic
+Prosthetic Hands (RPHs) aimed at restoring hand functionality. Current
+commercial RPHs offer limited control through basic on/off commands. Recent
+progresses in machine learning enable finger movement decoding with higher
+degrees of freedom, yet the high computational complexity of such models limits
+their application in portable devices. Future RPH designs must balance
+portability, low power consumption, and high decoding accuracy to be practical
+for individuals with disabilities. To this end, we introduce a novel
+attractor-based neural network to realize on-chip movement decoding for
+next-generation portable RPHs. The proposed architecture comprises an encoder,
+an attention layer, an attractor network, and a refinement regressor. We tested
+our model on four healthy subjects and achieved a decoding accuracy of 80.3%.
+Our proposed model is over 120 and 50 times more compact compared to
+state-of-the-art LSTM and CNN models, respectively, with comparable (or
+superior) decoding accuracy. Therefore, it exhibits minimal hardware complexity
+and can be effectively integrated as a System-on-Chip.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>\{copyright} 2024 IEEE. Personal use of this material is permitted.
+  Permission from IEEE must be obtained for all other uses, in any current or
+  future media, including reprinting/republishing this material for advertising
+  or promotional purposes, creating new collective works, for resale or
+  redistribution to servers or lists, or reuse of any copyrighted component of
+  this work in other works</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ CoDeGAN: Contrastive Disentanglement for Generative Adversarial Network 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2103.03636v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2103.03636v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Jiangwei Zhao, Zejia Liu, Xiaohan Guo, Lili Pan
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Disentanglement, a critical concern in interpretable machine learning, has
+also garnered significant attention from the computer vision community. Many
+existing GAN-based class disentanglement (unsupervised) approaches, such as
+InfoGAN and its variants, primarily aim to maximize the mutual information (MI)
+between the generated image and its latent codes. However, this focus may lead
+to a tendency for the network to generate highly similar images when presented
+with the same latent class factor, potentially resulting in mode collapse or
+mode dropping. To alleviate this problem, we propose \texttt{CoDeGAN}
+(Contrastive Disentanglement for Generative Adversarial Networks), where we
+relax similarity constraints for disentanglement from the image domain to the
+feature domain. This modification not only enhances the stability of GAN
+training but also improves their disentangling capabilities. Moreover, we
+integrate self-supervised pre-training into CoDeGAN to learn semantic
+representations, significantly facilitating unsupervised disentanglement.
+Extensive experimental results demonstrate the superiority of our method over
+state-of-the-art approaches across multiple benchmarks. The code is available
+at https://github.com/learninginvision/CoDeGAN.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ All-in-one simulation-based inference <span class="chip">ICML 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2404.09636v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2404.09636v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Manuel Gloeckler, Michael Deistler, Christian Weilbach, Frank Wood, Jakob H. Macke
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Amortized Bayesian inference trains neural networks to solve stochastic
+inference problems using model simulations, thereby making it possible to
+rapidly perform Bayesian inference for any newly observed data. However,
+current simulation-based amortized inference methods are simulation-hungry and
+inflexible: They require the specification of a fixed parametric prior,
+simulator, and inference tasks ahead of time. Here, we present a new amortized
+inference method -- the Simformer -- which overcomes these limitations. By
+training a probabilistic diffusion model with transformer architectures, the
+Simformer outperforms current state-of-the-art amortized inference approaches
+on benchmark tasks and is substantially more flexible: It can be applied to
+models with function-valued parameters, it can handle inference scenarios with
+missing or unstructured data, and it can sample arbitrary conditionals of the
+joint distribution of parameters and data, including both posterior and
+likelihood. We showcase the performance and flexibility of the Simformer on
+simulators from ecology, epidemiology, and neuroscience, and demonstrate that
+it opens up new possibilities and application domains for amortized Bayesian
+inference on simulation-based models.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>To be published in the proceedings of the 41st International
+  Conference on Machine Learning (ICML 2024), Vienna, Austria. PMLR 235, 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ TensorKrowch: Smooth integration of tensor networks in machine learning 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2306.08595v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2306.08595v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        José Ramón Pareja Monturiol, David Pérez-García, Alejandro Pozas-Kerstjens
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Tensor networks are factorizations of high-dimensional tensors into networks
+of smaller tensors. They have applications in physics and mathematics, and
+recently have been proposed as promising machine learning architectures. To
+ease the integration of tensor networks in machine learning pipelines, we
+introduce TensorKrowch, an open source Python library built on top of PyTorch.
+Providing a user-friendly interface, TensorKrowch allows users to construct any
+tensor network, train it, and integrate it as a layer in more intricate deep
+learning models. In this paper, we describe the main functionality and basic
+usage of TensorKrowch, and provide technical details on its building blocks and
+the optimizations performed to achieve efficient operation.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>20 pages, 2 figures. The TensorKrowch GitHub repository is in
+  https://github.com/joserapa98/tensorkrowch and the TensorKrowch documentation
+  is in https://joserapa98.github.io/tensorkrowch. V3: Accepted version,
+  corrected acknowledgments</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Network Analytics for Anti-Money Laundering -- A Systematic Literature
+  <span class="highlight-title">Review</span> and Experimental Evaluation 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.19383v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.19383v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Bruno Deprez, Toon Vanderschueren, Bart Baesens, Tim Verdonck, Wouter Verbeke
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Money laundering presents a pervasive challenge, burdening society by
+financing illegal activities. To more effectively combat and detect money
+laundering, the use of network information is increasingly being explored,
+exploiting that money laundering necessarily involves interconnected parties.
+This has lead to a surge in literature on network analytics (NA) for anti-money
+laundering (AML). The literature, however, is fragmented and a comprehensive
+overview of existing work is missing. This results in limited understanding of
+the methods that may be applied and their comparative detection power.
+Therefore, this paper presents an extensive and systematic review of the
+literature. We identify and analyse 97 papers in the Web of Science and Scopus
+databases, resulting in a taxonomy of approaches following the fraud analytics
+framework of Bockel-Rickermann et al.. Moreover, this paper presents a
+comprehensive experimental framework to evaluate and compare the performance of
+prominent NA methods in a uniform setup. The framework is applied on the
+publicly available Elliptic data set and implements manual feature engineering,
+random walk-based methods, and deep learning GNNs. We conclude from the results
+that network analytics increases the predictive power of the AML model with
+graph neural networks giving the best results. An open source implementation of
+the experimental framework is provided to facilitate researchers and
+practitioners to extend upon these results and experiment on proprietary data.
+As such, we aim to promote a standardised approach towards the analysis and
+evaluation of network analytics for AML.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Relaxed Contrastive Learning for Federated Learning 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2401.04928v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2401.04928v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Seonguk Seo, Jinkyu Kim, Geeho Kim, Bohyung Han
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  We propose a novel contrastive learning framework to effectively address the
+challenges of data heterogeneity in federated learning. We first analyze the
+inconsistency of gradient updates across clients during local training and
+establish its dependence on the distribution of feature representations,
+leading to the derivation of the supervised contrastive learning (SCL)
+objective to mitigate local deviations. In addition, we show that a na\"ive
+adoption of SCL in federated learning leads to representation collapse,
+resulting in slow convergence and limited performance gains. To address this
+issue, we introduce a relaxed contrastive learning loss that imposes a
+divergence penalty on excessively similar sample pairs within each class. This
+strategy prevents collapsed representations and enhances feature
+transferability, facilitating collaborative training and leading to significant
+performance improvements. Our framework outperforms all existing federated
+learning approaches by huge margins on the standard benchmarks through
+extensive experimental results.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ End-to-End Training Induces Information Bottleneck through Layer-Role
+  Differentiation: A Comparative Analysis with Layer-wise Training 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2402.09050v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2402.09050v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Keitaro Sakamoto, Issei Sato
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  End-to-end (E2E) training, optimizing the entire model through error
+backpropagation, fundamentally supports the advancements of deep learning.
+Despite its high performance, E2E training faces the problems of memory
+consumption, parallel computing, and discrepancy with the functionalities of
+the actual brain. Various alternative methods have been proposed to overcome
+these difficulties; however, no one can yet match the performance of E2E
+training, thereby falling short in practicality. Furthermore, there is no deep
+understanding regarding differences in the trained model properties beyond the
+performance gap. In this paper, we reconsider why E2E training demonstrates a
+superior performance through a comparison with layer-wise training, a non-E2E
+method that locally sets errors. On the basis of the observation that E2E
+training has an advantage in propagating input information, we analyze the
+information plane dynamics of intermediate representations based on the
+Hilbert-Schmidt independence criterion (HSIC). The results of our normalized
+HSIC value analysis reveal the E2E training ability to exhibit different
+information dynamics across layers, in addition to efficient information
+propagation. Furthermore, we show that this layer-role differentiation leads to
+the final representation following the information bottleneck principle. It
+suggests the need to consider the cooperative interactions between layers, not
+just the final layer when analyzing the information bottleneck of deep
+learning.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>TMLR2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Quantum Theory and Application of Contextual Optimal Transport <span class="chip">ICML 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2402.14991v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2402.14991v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Nicola Mariella, Albert Akhriev, Francesco Tacchino, Christa Zoufal, Juan Carlos Gonzalez-Espitia, Benedek Harsanyi, Eugene Koskin, Ivano Tavernelli, Stefan Woerner, Marianna Rapsomaniki, Sergiy Zhuk, Jannis Born
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Optimal Transport (OT) has fueled machine learning (ML) across many domains.
+When paired data measurements $(\boldsymbol{\mu}, \boldsymbol{\nu})$ are
+coupled to covariates, a challenging conditional distribution learning setting
+arises. Existing approaches for learning a $\textit{global}$ transport map
+parameterized through a potentially unseen context utilize Neural OT and
+largely rely on Brenier's theorem. Here, we propose a first-of-its-kind quantum
+computing formulation for amortized optimization of contextualized
+transportation plans. We exploit a direct link between doubly stochastic
+matrices and unitary operators thus unravelling a natural connection between OT
+and quantum computation. We verify our method (QontOT) on synthetic and real
+data by predicting variations in cell type distributions conditioned on drug
+dosage. Importantly we conduct a 24-qubit hardware experiment on a task
+challenging for classical computers and report a performance that cannot be
+matched with our classical neural OT approach. In sum, this is a first step
+toward learning to predict contextualized transportation plans through quantum
+computing.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>ICML 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Two Optimizers Are Better Than One: LLM Catalyst for Enhancing
+  Gradient-Based Optimization 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.19732v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.19732v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Zixian Guo, Ming Liu, Zhilong Ji, Jinfeng Bai, Yiwen Guo, Wangmeng Zuo
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Learning a skill generally relies on both practical experience by doer and
+insightful high-level guidance by instructor. Will this strategy also work well
+for solving complex non-convex optimization problems? Here, a common
+gradient-based optimizer acts like a disciplined doer, making locally optimal
+update at each step. Recent methods utilize large language models (LLMs) to
+optimize solutions for concrete problems by inferring from natural language
+instructions, akin to a high-level instructor. In this paper, we show that
+these two optimizers are complementary to each other, suggesting a
+collaborative optimization approach. The gradient-based optimizer and LLM-based
+optimizer are combined in an interleaved manner. We instruct LLMs using task
+descriptions and timely optimization trajectories recorded during
+gradient-based optimization. Inferred results from LLMs are used as restarting
+points for the next stage of gradient optimization. By leveraging both the
+locally rigorous gradient-based optimizer and the high-level deductive
+LLM-based optimizer, our combined optimization method consistently yields
+improvements over competitive baseline prompt tuning methods. Our results
+demonstrate the synergistic effect of conventional gradient-based optimization
+and the inference ability of LLMs. The code is released at
+https://github.com/guozix/LLM-catalyst.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Exploratory Machine Learning with Unknown Unknowns <span class="chip">AAAI'21</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2002.01605v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2002.01605v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Peng Zhao, Jia-Wei Shan, Yu-Jie Zhang, Zhi-Hua Zhou
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  In conventional supervised learning, a training dataset is given with
+ground-truth labels from a known label set, and the learned model will classify
+unseen instances to known labels. This paper studies a new problem setting in
+which there are unknown classes in the training data misperceived as other
+labels, and thus their existence appears unknown from the given supervision. We
+attribute the unknown unknowns to the fact that the training dataset is badly
+advised by the incompletely perceived label space due to the insufficient
+feature information. To this end, we propose the exploratory machine learning,
+which examines and investigates training data by actively augmenting the
+feature space to discover potentially hidden classes. Our method consists of
+three ingredients including rejection model, feature exploration, and model
+cascade. We provide theoretical analysis to justify its superiority, and
+validate the effectiveness on both synthetic and real datasets.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>published at Artificial Intelligence, preliminary conference version
+  published at AAAI'21</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Surge Phenomenon in Optimal Learning Rate and Batch Size Scaling 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.14578v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.14578v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Shuaipeng Li, Penghao Zhao, Hailin Zhang, Xingwu Sun, Hao Wu, Dian Jiao, Weiyan Wang, Chengjun Liu, Zheng Fang, Jinbao Xue, Yangyu Tao, Bin Cui, Di Wang
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  In current deep learning tasks, Adam style optimizers such as Adam, Adagrad,
+RMSProp, Adafactor, and Lion have been widely used as alternatives to SGD style
+optimizers. These optimizers typically update model parameters using the sign
+of gradients, resulting in more stable convergence curves. The learning rate
+and the batch size are the most critical hyperparameters for optimizers, which
+require careful tuning to enable effective convergence. Previous research has
+shown that the optimal learning rate increases linearly or follows similar
+rules with batch size for SGD style optimizers. However, this conclusion is not
+applicable to Adam style optimizers. In this paper, we elucidate the connection
+between optimal learning rates and batch sizes for Adam style optimizers
+through both theoretical analysis and extensive experiments. First, we raise
+the scaling law between batch sizes and optimal learning rates in the sign of
+gradient case, in which we prove that the optimal learning rate first rises and
+then falls as the batch size increases. Moreover, the peak value of the surge
+will gradually move toward the larger batch size as training progresses.
+Second, we conducted experiments on various CV and NLP tasks and verified the
+correctness of the scaling law.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Robust Entropy Search for Safe Efficient Bayesian Optimization 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.19059v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.19059v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Dorina Weichert, Alexander Kister, Sebastian Houben, Patrick Link, Gunar Ernis
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  The practical use of Bayesian Optimization (BO) in engineering applications
+imposes special requirements: high sampling efficiency on the one hand and
+finding a robust solution on the other hand. We address the case of adversarial
+robustness, where all parameters are controllable during the optimization
+process, but a subset of them is uncontrollable or even adversely perturbed at
+the time of application. To this end, we develop an efficient information-based
+acquisition function that we call Robust Entropy Search (RES). We empirically
+demonstrate its benefits in experiments on synthetic and real-life data. The
+results showthat RES reliably finds robust optima, outperforming
+state-of-the-art algorithms.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ SecureBoost+ : A High Performance Gradient Boosting Tree Framework for
+  Large Scale Vertical Federated Learning 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2110.10927v4">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2110.10927v4.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Weijing Chen, Guoqiang Ma, Tao Fan, Yan Kang, Qian Xu, Qiang Yang
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Gradient boosting decision tree (GBDT) is a widely used ensemble algorithm in
+the industry. Its vertical federated learning version, SecureBoost, is one of
+the most popular algorithms used in cross-silo privacy-preserving modeling. As
+the area of privacy computation thrives in recent years, demands for
+large-scale and high-performance federated learning have grown dramatically in
+real-world applications. In this paper, to fulfill these requirements, we
+propose SecureBoost+ that is both novel and improved from the prior work
+SecureBoost. SecureBoost+ integrates several ciphertext calculation
+optimizations and engineering optimizations. The experimental results
+demonstrate that Secureboost+ has significant performance improvements on large
+and high dimensional data sets compared to SecureBoost. It makes effective and
+efficient large-scale vertical federated learning possible.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ An Efficient and Multi-private Key Secure Aggregation for Federated
+  Learning 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2306.08970v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2306.08970v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Xue Yang, Zifeng Liu, Xiaohu Tang, Rongxing Lu, Bo Liu
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  With the emergence of privacy leaks in federated learning, secure aggregation
+protocols that mainly adopt either homomorphic encryption or threshold secret
+sharing have been widely developed for federated learning to protect the
+privacy of the local training data of each client. However, these existing
+protocols suffer from many shortcomings, such as the dependence on a trusted
+third party, the vulnerability to clients being corrupted, low efficiency, the
+trade-off between security and fault tolerance, etc. To solve these
+disadvantages, we propose an efficient and multi-private key secure aggregation
+scheme for federated learning. Specifically, we skillfully modify the variant
+ElGamal encryption technique to achieve homomorphic addition operation, which
+has two important advantages: 1) The server and each client can freely select
+public and private keys without introducing a trust third party and 2) Compared
+to the variant ElGamal encryption, the plaintext space is relatively large,
+which is more suitable for the deep model. Besides, for the high dimensional
+deep model parameter, we introduce a super-increasing sequence to compress
+multi-dimensional data into 1-D, which can greatly reduce encryption and
+decryption times as well as communication for ciphertext transmission. Detailed
+security analyses show that our proposed scheme achieves the semantic security
+of both individual local gradients and the aggregated result while achieving
+optimal robustness in tolerating both client collusion and dropped clients.
+Extensive simulations demonstrate that the accuracy of our scheme is almost the
+same as the non-private approach, while the efficiency of our scheme is much
+better than the state-of-the-art homomorphic encryption-based secure
+aggregation schemes. More importantly, the efficiency advantages of our scheme
+will become increasingly prominent as the number of model parameters increases.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ ESM All-Atom: Multi-scale Protein Language Model for Unified Molecular
+  Modeling <span class="chip">ICML2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2403.12995v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2403.12995v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Kangjie Zheng, Siyu Long, Tianyu Lu, Junwei Yang, Xinyu Dai, Ming Zhang, Zaiqing Nie, Wei-Ying Ma, Hao Zhou
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Protein language models have demonstrated significant potential in the field
+of protein engineering. However, current protein language models primarily
+operate at the residue scale, which limits their ability to provide information
+at the atom level. This limitation prevents us from fully exploiting the
+capabilities of protein language models for applications involving both
+proteins and small molecules. In this paper, we propose ESM-AA (ESM All-Atom),
+a novel approach that enables atom-scale and residue-scale unified molecular
+modeling. ESM-AA achieves this by pre-training on multi-scale code-switch
+protein sequences and utilizing a multi-scale position encoding to capture
+relationships among residues and atoms. Experimental results indicate that
+ESM-AA surpasses previous methods in protein-molecule tasks, demonstrating the
+full utilization of protein language models. Further investigations reveal that
+through unified molecular modeling, ESM-AA not only gains molecular knowledge
+but also retains its understanding of proteins. The source codes of ESM-AA are
+publicly released at https://github.com/zhengkangjie/ESM-AA.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>ICML2024 camera-ready, update some experimental results, add github
+  url</span>
+                                        </div>
+                                </details>
+                            </article>
+                    </div>
+                </details>
+            </article>
+            <article>
+                <details>
+                    <Summary>
+                        Multimedia <span class="chip" style="font-size: 60%">6</span>
+                    </Summary>
+                    <div class="details-content">
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Conditioning GAN Without Training <span class="highlight-title">Dataset</span> 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20687v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20687v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Kidist Amde Mekonnen
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Deep learning algorithms have a large number of trainable parameters often
+with sizes of hundreds of thousands or more. Training this algorithm requires a
+large amount of training data and generating a sufficiently large dataset for
+these algorithms is costly\cite{noguchi2019image}.
+  GANs are generative neural networks that use two deep learning networks that
+are competing with each other. The networks are generator and discriminator
+networks. The generator tries to generate realistic images which resemble the
+actual training dataset by approximating the training data distribution and the
+discriminator is trained to classify images as real or
+fake(generated)\cite{goodfellow2016nips}. Training these GAN algorithms also
+requires a large amount of training dataset\cite{noguchi2019image}.
+  In this study, the aim is to address the question, "Given an unconditioned
+pretrained generator network and a pretrained classifier, is it feasible to
+develop a conditioned generator without relying on any training dataset?"
+  The paper begins with a general introduction to the problem. The subsequent
+sections are structured as follows: Section 2 provides background information
+on the problem. Section 3 reviews relevant literature on the topic. Section 4
+outlines the methodology employed in this study. Section 5 presents the
+experimental results. Section 6 discusses the findings and proposes potential
+future research directions. Finally, Section 7 offers concluding remarks.
+  The implementation can be accessed
+\href{https://github.com/kidist-amde/BigGAN-PyTorch}{here}.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>5 pages, 2 figures, Part of my MSc project course, School Project
+  Course 2022</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Adv-KD: Adversarial Knowledge Distillation for Faster Diffusion Sampling 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20675v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20675v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Kidist Amde Mekonnen, Nicola Dall'Asen, Paolo Rota
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Diffusion Probabilistic Models (DPMs) have emerged as a powerful class of
+deep generative models, achieving remarkable performance in image synthesis
+tasks. However, these models face challenges in terms of widespread adoption
+due to their reliance on sequential denoising steps during sample generation.
+This dependence leads to substantial computational requirements, making them
+unsuitable for resource-constrained or real-time processing systems. To address
+these challenges, we propose a novel method that integrates denoising phases
+directly into the model's architecture, thereby reducing the need for
+resource-intensive computations. Our approach combines diffusion models with
+generative adversarial networks (GANs) through knowledge distillation, enabling
+more efficient training and evaluation. By utilizing a pre-trained diffusion
+model as a teacher model, we train a student model through adversarial
+learning, employing layerwise transformations for denoising and submodules for
+predicting the teacher model's output at various points in time. This
+integration significantly reduces the number of parameters and denoising steps
+required, leading to improved sampling speed at test time. We validate our
+method with extensive experiments, demonstrating comparable performance with
+reduced computational requirements compared to existing approaches. By enabling
+the deployment of diffusion models on resource-constrained devices, our
+research mitigates their computational burden and paves the way for wider
+accessibility and practical use across the research community and end-users.
+  Our code is publicly available at https://github.com/kidist-amde/Adv-KD
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>7 pages, 11 figures, ELLIS Doctoral Symposium 2023 in Helsinki,
+  Finland</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Vision-Language Meets the Skeleton: Progressively Distillation with
+  Cross-Modal Knowledge for 3D Action Representation Learning 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20606v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20606v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Yang Chen, Tian He, Junfeng Fu, Ling Wang, Jingcai Guo, Hong Cheng
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Supervised and self-supervised learning are two main training paradigms for
+skeleton-based human action recognition. However, the former one-hot
+classification requires labor-intensive predefined action categories
+annotations, while the latter involves skeleton transformations (e.g.,
+cropping) in the pretext tasks that may impair the skeleton structure. To
+address these challenges, we introduce a novel skeleton-based training
+framework (C$^2$VL) based on Cross-modal Contrastive learning that uses the
+progressive distillation to learn task-agnostic human skeleton action
+representation from the Vision-Language knowledge prompts. Specifically, we
+establish the vision-language action concept space through vision-language
+knowledge prompts generated by pre-trained large multimodal models (LMMs),
+which enrich the fine-grained details that the skeleton action space lacks.
+Moreover, we propose the intra-modal self-similarity and inter-modal
+cross-consistency softened targets in the cross-modal contrastive process to
+progressively control and guide the degree of pulling vision-language knowledge
+prompts and corresponding skeletons closer. These soft instance discrimination
+and self-knowledge distillation strategies contribute to the learning of better
+skeleton-based action representations from the noisy skeleton-vision-language
+pairs. During the inference phase, our method requires only the skeleton data
+as the input for action recognition and no longer for vision-language prompts.
+Extensive experiments show that our method achieves state-of-the-art results on
+NTU RGB+D 60, NTU RGB+D 120, and PKU-MMD datasets. The code will be available
+in the future.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Advancing Ear Biometrics: Enhancing Accuracy and Robustness through Deep
+  Learning 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.00135v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.00135v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Youssef Mohamed, Zeyad Youssef, Ahmed Heakl, Ahmed Zaky
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Biometric identification is a reliable method to verify individuals based on
+their unique physical or behavioral traits, offering a secure alternative to
+traditional methods like passwords or PINs. This study focuses on ear biometric
+identification, exploiting its distinctive features for enhanced accuracy,
+reliability, and usability. While past studies typically investigate face
+recognition and fingerprint analysis, our research demonstrates the
+effectiveness of ear biometrics in overcoming limitations such as variations in
+facial expressions and lighting conditions. We utilized two datasets: AMI (700
+images from 100 individuals) and EarNV1.0 (28,412 images from 164 individuals).
+To improve the accuracy and robustness of our ear biometric identification
+system, we applied various techniques including data preprocessing and
+augmentation. Our models achieved a testing accuracy of 99.35% on the AMI
+Dataset and 98.1% on the EarNV1.0 dataset, showcasing the effectiveness of our
+approach in precisely identifying individuals based on ear biometric
+characteristics.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>6 pages, 8 figures, 3 tables, International IEEE Conference on the
+  Intelligent Methods, Systems, and Applications</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Bootstrap3D: Improving 3D Content Creation with Synthetic Data 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2406.00093v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2406.00093v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Zeyi Sun, Tong Wu, Pan Zhang, Yuhang Zang, Xiaoyi Dong, Yuanjun Xiong, Dahua Lin, Jiaqi Wang
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Recent years have witnessed remarkable progress in multi-view diffusion
+models for 3D content creation. However, there remains a significant gap in
+image quality and prompt-following ability compared to 2D diffusion models. A
+critical bottleneck is the scarcity of high-quality 3D assets with detailed
+captions. To address this challenge, we propose Bootstrap3D, a novel framework
+that automatically generates an arbitrary quantity of multi-view images to
+assist in training multi-view diffusion models. Specifically, we introduce a
+data generation pipeline that employs (1) 2D and video diffusion models to
+generate multi-view images based on constructed text prompts, and (2) our
+fine-tuned 3D-aware MV-LLaVA for filtering high-quality data and rewriting
+inaccurate captions. Leveraging this pipeline, we have generated 1 million
+high-quality synthetic multi-view images with dense descriptive captions to
+address the shortage of high-quality 3D data. Furthermore, we present a
+Training Timestep Reschedule (TTR) strategy that leverages the denoising
+process to learn multi-view consistency while maintaining the original 2D
+diffusion prior. Extensive experiments demonstrate that Bootstrap3D can
+generate high-quality multi-view images with superior aesthetic quality,
+image-text alignment, and maintained view consistency.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Project Page: https://sunzey.github.io/Bootstrap3D/</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ NeRF View Synthesis: Subjective Quality Assessment and Objective Metrics
+  Evaluation 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20078v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20078v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Pedro Martin, Antonio Rodrigues, Joao Ascenso, Maria Paula Queluz
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Neural radiance fields (NeRF) are a groundbreaking computer vision technology
+that enables the generation of high-quality, immersive visual content from
+multiple viewpoints. This capability holds significant advantages for
+applications such as virtual/augmented reality, 3D modelling and content
+creation for the film and entertainment industry. However, the evaluation of
+NeRF methods poses several challenges, including a lack of comprehensive
+datasets, reliable assessment methodologies, and objective quality metrics.
+This paper addresses the problem of NeRF quality assessment thoroughly, by
+conducting a rigorous subjective quality assessment test that considers several
+scene classes and recently proposed NeRF view synthesis methods. Additionally,
+the performance of a wide range of state-of-the-art conventional and
+learning-based full-reference 2D image and video quality assessment metrics is
+evaluated against the subjective scores of the subjective study. The
+experimental results are analyzed in depth, providing a comparative evaluation
+of several NeRF methods and objective quality metrics, across different classes
+of visual scenes, including real and synthetic content for front-face and
+360-degree camera trajectories.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                    </div>
+                </details>
+            </article>
+    </section>
+    <section class="day-container">
+        <div class="date">
+            <time datetime="2024-05-30T00:00:00Z">2024-05-30</time>
+        </div>
+            <article>
+                <details>
+                    <Summary>
+                        Computation and Language <span class="chip" style="font-size: 60%">155</span>
+                    </Summary>
+                    <div class="details-content">
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ From Zero to Hero: Cold-Start Anomaly Detection <span class="chip">ACL 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20341v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20341v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Tal Reiss, George Kour, Naama Zwerdling, Ateret Anaby-Tavor, Yedid Hoshen
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  When first deploying an anomaly detection system, e.g., to detect
+out-of-scope queries in chatbots, there are no observed data, making
+data-driven approaches ineffective. Zero-shot anomaly detection methods offer a
+solution to such "cold-start" cases, but unfortunately they are often not
+accurate enough. This paper studies the realistic but underexplored cold-start
+setting where an anomaly detection model is initialized using zero-shot
+guidance, but subsequently receives a small number of contaminated observations
+(namely, that may include anomalies). The goal is to make efficient use of both
+the zero-shot guidance and the observations. We propose ColdFusion, a method
+that effectively adapts the zero-shot anomaly detector to contaminated
+observations. To support future development of this new setting, we propose an
+evaluation suite consisting of evaluation protocols and metrics.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>ACL 2024. Our code is available at
+  https://github.com/talreiss/ColdFusion</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Xwin-LM: Strong and Scalable Alignment Practice for LLMs 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20335v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20335v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Bolin Ni, JingCheng Hu, Yixuan Wei, Houwen Peng, Zheng Zhang, Gaofeng Meng, Han Hu
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  In this work, we present Xwin-LM, a comprehensive suite of alignment
+methodologies for large language models (LLMs). This suite encompasses several
+key techniques, including supervised finetuning (SFT), reward modeling (RM),
+rejection sampling finetuning (RS), and direct preference optimization (DPO).
+The key components are as follows: (1) Xwin-LM-SFT, models initially finetuned
+with high-quality instruction data; (2) Xwin-Pair, a large-scale, multi-turn
+preference dataset meticulously annotated using GPT-4; (3) Xwin-RM, reward
+models trained on Xwin-Pair, developed at scales of 7B, 13B, and 70B
+parameters; (4) Xwin-Set, a multiwise preference dataset in which each prompt
+is linked to 64 unique responses generated by Xwin-LM-SFT and scored by
+Xwin-RM; (5) Xwin-LM-RS, models finetuned with the highest-scoring responses
+from Xwin-Set; (6) Xwin-LM-DPO, models further optimized on Xwin-Set using the
+DPO algorithm. Our evaluations on AlpacaEval and MT-bench demonstrate
+consistent and significant improvements across the pipeline, demonstrating the
+strength and scalability of Xwin-LM. The repository
+https://github.com/Xwin-LM/Xwin-LM will be continually updated to foster
+community research.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ CausalQuest: Collecting Natural Causal Questions for AI Agents 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20318v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20318v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Roberto Ceraolo, Dmitrii Kharlapenko, Amélie Reymond, Rada Mihalcea, Mrinmaya Sachan, Bernhard Schölkopf, Zhijing Jin
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Humans have an innate drive to seek out causality. Whether fuelled by
+curiosity or specific goals, we constantly question why things happen, how they
+are interconnected, and many other related phenomena. To develop AI agents
+capable of addressing this natural human quest for causality, we urgently need
+a comprehensive dataset of natural causal questions. Unfortunately, existing
+datasets either contain only artificially-crafted questions that do not reflect
+real AI usage scenarios or have limited coverage of questions from specific
+sources. To address this gap, we present CausalQuest, a dataset of 13,500
+naturally occurring questions sourced from social networks, search engines, and
+AI assistants. We formalize the definition of causal questions and establish a
+taxonomy for finer-grained classification. Through a combined effort of human
+annotators and large language models (LLMs), we carefully label the dataset. We
+find that 42% of the questions humans ask are indeed causal, with the majority
+seeking to understand the causes behind given effects. Using this dataset, we
+train efficient classifiers (up to 2.85B parameters) for the binary task of
+identifying causal questions, achieving high performance with F1 scores of up
+to 0.877. We conclude with a rich set of future research directions that can
+build upon our data and models.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ ANAH: Analytical Annotation of Hallucinations in Large Language Models <span class="chip">ACL 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20315v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20315v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Ziwei Ji, Yuzhe Gu, Wenwei Zhang, Chengqi Lyu, Dahua Lin, Kai Chen
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Reducing the `$\textit{hallucination}$' problem of Large Language Models
+(LLMs) is crucial for their wide applications. A comprehensive and fine-grained
+measurement of the hallucination is the first key step for the governance of
+this issue but is under-explored in the community. Thus, we present
+$\textbf{ANAH}$, a bilingual dataset that offers $\textbf{AN}$alytical
+$\textbf{A}$nnotation of $\textbf{H}$allucinations in LLMs within Generative
+Question Answering. Each answer sentence in our dataset undergoes rigorous
+annotation, involving the retrieval of a reference fragment, the judgment of
+the hallucination type, and the correction of hallucinated content. ANAH
+consists of ~12k sentence-level annotations for ~4.3k LLM responses covering
+over 700 topics, constructed by a human-in-the-loop pipeline. Thanks to the
+fine granularity of the hallucination annotations, we can quantitatively
+confirm that the hallucinations of LLMs progressively accumulate in the answer
+and use ANAH to train and evaluate hallucination annotators. We conduct
+extensive experiments on studying generative and discriminative annotators and
+show that, although current open-source LLMs have difficulties in fine-grained
+hallucination annotation, the generative annotator trained with ANAH can
+surpass all open-source LLMs and GPT-3.5, obtain performance competitive with
+GPT-4, and exhibits better generalization ability on unseen questions.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted by ACL 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ S3D: A Simple and Cost-Effective Self-Speculative Decoding Scheme for
+  Low-Memory GPUs 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20314v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20314v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Wei Zhong, Manasa Bharadwaj
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Speculative decoding (SD) has attracted a significant amount of research
+attention due to the substantial speedup it can achieve for LLM inference.
+However, despite the high speedups they offer, speculative decoding methods
+often achieve optimal performance on high-end devices or with a substantial GPU
+memory overhead. Given limited memory and the necessity of quantization, a
+high-performing model on a high-end GPU can slow down by up to 7 times. To this
+end, we propose Skippy Simultaneous Speculative Decoding (or S3D), a
+cost-effective self-speculative SD method based on simultaneous multi-token
+decoding and mid-layer skipping. When compared against recent effective
+open-source SD systems, our method has achieved one of the top
+performance-memory ratios while requiring minimal architecture changes and
+training data. Leveraging our memory efficiency, we created a smaller yet more
+effective SD model based on Phi-3. It is 1.4 to 2 times faster than the
+quantized EAGLE model and operates in half-precision while using less VRAM.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Large Language Models Can Self-Improve At Web Agent Tasks 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20309v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20309v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Ajay Patel, Markus Hofmarcher, Claudiu Leoveanu-Condrei, Marius-Constantin Dinu, Chris Callison-Burch, Sepp Hochreiter
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Training models to act as agents that can effectively navigate and perform
+actions in a complex environment, such as a web browser, has typically been
+challenging due to lack of training data. Large language models (LLMs) have
+recently demonstrated some capability to navigate novel environments as agents
+in a zero-shot or few-shot fashion, purely guided by natural language
+instructions as prompts. Recent research has also demonstrated LLMs have the
+capability to exceed their base performance through self-improvement, i.e.
+fine-tuning on data generated by the model itself. In this work, we explore the
+extent to which LLMs can self-improve their performance as agents in
+long-horizon tasks in a complex environment using the WebArena benchmark. In
+WebArena, an agent must autonomously navigate and perform actions on web pages
+to achieve a specified objective. We explore fine-tuning on three distinct
+synthetic training data mixtures and achieve a 31\% improvement in task
+completion rate over the base model on the WebArena benchmark through a
+self-improvement procedure. We additionally contribute novel evaluation metrics
+for assessing the performance, robustness, capabilities, and quality of
+trajectories of our fine-tuned agent models to a greater degree than simple,
+aggregate-level benchmark scores currently used to measure self-improvement.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Group Robust Preference Optimization in Reward-free RLHF 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20304v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20304v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Shyam Sundhar Ramesh, Yifan Hu, Iason Chaimalas, Viraj Mehta, Pier Giuseppe Sessa, Haitham Bou Ammar, Ilija Bogunovic
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Adapting large language models (LLMs) for specific tasks usually involves
+fine-tuning through reinforcement learning with human feedback (RLHF) on
+preference data. While these data often come from diverse labelers' groups
+(e.g., different demographics, ethnicities, company teams, etc.), traditional
+RLHF approaches adopt a "one-size-fits-all" approach, i.e., they
+indiscriminately assume and optimize a single preference model, thus not being
+robust to unique characteristics and needs of the various groups. To address
+this limitation, we propose a novel Group Robust Preference Optimization (GRPO)
+method to align LLMs to individual groups' preferences robustly. Our approach
+builds upon reward-free direct preference optimization methods, but unlike
+previous approaches, it seeks a robust policy which maximizes the worst-case
+group performance. To achieve this, GRPO adaptively and sequentially weights
+the importance of different groups, prioritizing groups with worse cumulative
+loss. We theoretically study the feasibility of GRPO and analyze its
+convergence for the log-linear policy class. By fine-tuning LLMs with GRPO
+using diverse group-based global opinion data, we significantly improved
+performance for the worst-performing groups, reduced loss imbalances across
+groups, and improved probability accuracies compared to non-robust baselines.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Preprint</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Who Writes the <span class="highlight-title">Review</span>, Human or AI? 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20285v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20285v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Panagiotis C. Theocharopoulos, Spiros V. Georgakopoulos, Sotiris K. Tasoulis, Vassilis P. Plagianakos
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  With the increasing use of Artificial Intelligence in Natural Language
+Processing, concerns have been raised regarding the detection of AI-generated
+text in various domains. This study aims to investigate this issue by proposing
+a methodology to accurately distinguish AI-generated and human-written book
+reviews. Our approach utilizes transfer learning, enabling the model to
+identify generated text across different topics while improving its ability to
+detect variations in writing style and vocabulary. To evaluate the
+effectiveness of the proposed methodology, we developed a dataset consisting of
+real book reviews and AI-generated reviews using the recently proposed Vicuna
+open-source language model. The experimental results demonstrate that it is
+feasible to detect the original source of text, achieving an accuracy rate of
+96.86%. Our efforts are oriented toward the exploration of the capabilities and
+limitations of Large Language Models in the context of text identification.
+Expanding our knowledge in these aspects will be valuable for effectively
+navigating similar models in the future and ensuring the integrity and
+authenticity of human-generated content.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ ROAST: <span class="highlight-title">Review</span>-level Opinion Aspect Sentiment Target Joint Detection 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20274v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20274v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Siva Uday Sampreeth Chebolu, Franck Dernoncourt, Nedim Lipka, Thamar Solorio
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Aspect-Based Sentiment Analysis (ABSA) has experienced tremendous expansion
+and diversity due to various shared tasks spanning several languages and fields
+and organized via SemEval workshops and Germeval. Nonetheless, a few
+shortcomings still need to be addressed, such as the lack of low-resource
+language evaluations and the emphasis on sentence-level analysis. To thoroughly
+assess ABSA techniques in the context of complete reviews, this research
+presents a novel task, Review-Level Opinion Aspect Sentiment Target (ROAST).
+ROAST seeks to close the gap between sentence-level and text-level ABSA by
+identifying every ABSA constituent at the review level. We extend the available
+datasets to enable ROAST, addressing the drawbacks noted in previous research
+by incorporating low-resource languages, numerous languages, and a variety of
+topics. Through this effort, ABSA research will be able to cover more ground
+and get a deeper comprehension of the task and its practical application in a
+variety of languages and domains (https://github.com/RiTUAL-UH/ROAST-ABSA).
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>arXiv admin note: text overlap with arXiv:2309.13297</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ ETHER: Efficient Finetuning of Large-Scale Models with Hyperplane
+  Reflections <span class="chip">ICML 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20271v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20271v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Massimo Bini, Karsten Roth, Zeynep Akata, Anna Khoreva
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Parameter-efficient finetuning (PEFT) has become ubiquitous to adapt
+foundation models to downstream task requirements while retaining their
+generalization ability. However, the amount of additionally introduced
+parameters and compute for successful adaptation and hyperparameter searches
+can explode quickly, especially when deployed at scale to serve numerous
+individual requests. To ensure effective, parameter-efficient, and
+hyperparameter-robust adaptation, we propose the ETHER transformation family,
+which performs Efficient fineTuning via HypErplane Reflections. By design,
+ETHER transformations require a minimal number of parameters, are less likely
+to deteriorate model performance, and exhibit robustness to hyperparameter and
+learning rate choices. In particular, we introduce ETHER and its relaxation
+ETHER+, which match or outperform existing PEFT methods with significantly
+fewer parameters ($\sim$$10$-$100$ times lower than LoRA or OFT) across
+multiple image synthesis and natural language tasks without exhaustive
+hyperparameter tuning. Finally, we investigate the recent emphasis on
+Hyperspherical Energy retention for adaptation and raise questions on its
+practical utility. The code is available at https://github.com/mwbini/ether.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted to ICML 2024. Code available at
+  https://github.com/mwbini/ether</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ IsraParlTweet: The Israeli Parliamentary and Twitter Resource <span class="chip">LREC</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20269v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20269v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Guy Mor-Lan, Effi Levi, Tamir Sheafer, Shaul R. Shenhav
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  We introduce IsraParlTweet, a new linked corpus of Hebrew-language
+parliamentary discussions from the Knesset (Israeli Parliament) between the
+years 1992-2023 and Twitter posts made by Members of the Knesset between the
+years 2008-2023, containing a total of 294.5 million Hebrew tokens. In addition
+to raw text, the corpus contains comprehensive metadata on speakers and Knesset
+sessions as well as several linguistic annotations. As a result, IsraParlTweet
+can be used to conduct a wide variety of quantitative and qualitative analyses
+and provide valuable insights into political discourse in Israel.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Presented at LREC-COLING 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Auto Arena of LLMs: Automating LLM Evaluations with Agent Peer-battles
+  and Committee Discussions 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20267v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20267v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Ruochen Zhao, Wenxuan Zhang, Yew Ken Chia, Deli Zhao, Lidong Bing
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  As LLMs evolve on a daily basis, there is an urgent need for a trustworthy
+evaluation method that can provide robust evaluation results in a timely
+fashion. Currently, as static benchmarks are prone to contamination concerns,
+users tend to trust human voting platforms, such as Chatbot Arena. However,
+human annotations require extensive manual efforts. To provide an automatic,
+robust, and trustworthy evaluation framework, we innovatively propose the
+Auto-Arena of LLMs, which automates the entire evaluation process with LLM
+agents. Firstly, an examiner LLM devises queries. Then, a pair of candidate
+LLMs engage in a multi-round peer-battle around the query, during which the
+LLM's true performance gaps become visible. Finally, a committee of LLM judges
+collectively discuss and determine the winner, which alleviates bias and
+promotes fairness. In our extensive experiment on the 17 newest LLMs,
+Auto-Arena shows the highest correlation with human preferences, providing a
+promising alternative to human evaluation platforms.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Evaluating Large Language Model Biases in Persona-Steered Generation <span class="chip">ACL 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20253v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20253v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Andy Liu, Mona Diab, Daniel Fried
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  The task of persona-steered text generation requires large language models
+(LLMs) to generate text that reflects the distribution of views that an
+individual fitting a persona could have. People have multifaceted personas, but
+prior work on bias in LLM-generated opinions has only explored multiple-choice
+settings or one-dimensional personas. We define an incongruous persona as a
+persona with multiple traits where one trait makes its other traits less likely
+in human survey data, e.g. political liberals who support increased military
+spending. We find that LLMs are 9.7% less steerable towards incongruous
+personas than congruous ones, sometimes generating the stereotypical stance
+associated with its demographic rather than the target stance. Models that we
+evaluate that are fine-tuned with Reinforcement Learning from Human Feedback
+(RLHF) are more steerable, especially towards stances associated with political
+liberals and women, but present significantly less diverse views of personas.
+We also find variance in LLM steerability that cannot be predicted from
+multiple-choice opinion evaluation. Our results show the importance of
+evaluating models in open-ended text generation, as it can surface new LLM
+opinion biases. Moreover, such a setup can shed light on our ability to steer
+models toward a richer and more diverse range of viewpoints.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted to Findings of ACL 2024. Code and data available at
+  https://github.com/andyjliu/persona-steered-generation-bias</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Towards Hierarchical Multi-Agent Workflows for Zero-Shot <span class="highlight-title">Prompt</span>
+  Optimization 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20252v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20252v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Yuchi Liu, Jaskirat Singh, Gaowen Liu, Ali Payani, Liang Zheng
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Large language models (LLMs) have shown great progress in responding to user
+questions, allowing for a multitude of diverse applications. Yet, the quality
+of LLM outputs heavily depends on the prompt design, where a good prompt might
+enable the LLM to answer a very challenging question correctly. Therefore,
+recent works have developed many strategies for improving the prompt, including
+both manual crafting and in-domain optimization. However, their efficacy in
+unrestricted scenarios remains questionable, as the former depends on human
+design for specific questions and the latter usually generalizes poorly to
+unseen scenarios. To address these problems, we give LLMs the freedom to design
+the best prompts according to themselves. Specifically, we include a hierarchy
+of LLMs, first constructing a prompt with precise instructions and accurate
+wording in a hierarchical manner, and then using this prompt to generate the
+final answer to the user query. We term this pipeline Hierarchical Multi-Agent
+Workflow, or HMAW. In contrast with prior works, HMAW imposes no human
+restriction and requires no training, and is completely task-agnostic while
+capable of adjusting to the nuances of the underlying task. Through both
+quantitative and qualitative experiments across multiple benchmarks, we verify
+that despite its simplicity, the proposed approach can create detailed and
+suitable prompts, further boosting the performance of current LLMs.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Retrieval Augmented Structured Generation: Business Document Information
+  Extraction As Tool Use 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20245v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20245v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Franz Louis Cesista, Rui Aguiar, Jason Kim, Paolo Acilo
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Business Document Information Extraction (BDIE) is the problem of
+transforming a blob of unstructured information (raw text, scanned documents,
+etc.) into a structured format that downstream systems can parse and use. It
+has two main tasks: Key-Information Extraction (KIE) and Line Items Recognition
+(LIR). In this paper, we argue that BDIE is best modeled as a Tool Use problem,
+where the tools are these downstream systems. We then present Retrieval
+Augmented Structured Generation (RASG), a novel general framework for BDIE that
+achieves state of the art (SOTA) results on both KIE and LIR tasks on BDIE
+benchmarks.
+  The contributions of this paper are threefold: (1) We show, with ablation
+benchmarks, that Large Language Models (LLMs) with RASG are already competitive
+with or surpasses current SOTA Large Multimodal Models (LMMs) without RASG on
+BDIE benchmarks. (2) We propose a new metric class for Line Items Recognition,
+General Line Items Recognition Metric (GLIRM), that is more aligned with
+practical BDIE use cases compared to existing metrics, such as ANLS*, DocILE,
+and GriTS. (3) We provide a heuristic algorithm for backcalculating bounding
+boxes of predicted line items and tables without the need for vision encoders.
+Finally, we claim that, while LMMs might sometimes offer marginal performance
+benefits, LLMs + RASG is oftentimes superior given real-world applications and
+constraints of BDIE.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted by IEEE 7th International Conference on Multimedia
+  Information Processing and Retrieval (MIPR), 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ TS-Align: A Teacher-Student Collaborative Framework for Scalable
+  Iterative Finetuning of Large Language Models 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20215v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20215v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Chen Zhang, Chengguang Tang, Dading Chong, Ke Shi, Guohua Tang, Feng Jiang, Haizhou Li
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Mainstream approaches to aligning large language models (LLMs) heavily rely
+on human preference data, particularly when models require periodic updates.
+The standard process for iterative alignment of LLMs involves collecting new
+human feedback for each update. However, the data collection process is costly
+and challenging to scale. To address this issue, we introduce the "TS-Align"
+framework, which fine-tunes a policy model using pairwise feedback data
+automatically mined from its outputs. This automatic mining process is
+efficiently accomplished through the collaboration between a large-scale
+teacher model and a small-scale student model. The policy fine-tuning process
+can be iteratively repeated using on-policy generations within our proposed
+teacher-student collaborative framework. Through extensive experiments, we
+demonstrate that our final aligned policy outperforms the base policy model
+with an average win rate of 69.7% across seven conversational or
+instruction-following datasets. Furthermore, we show that the ranking
+capability of the teacher is effectively distilled into the student through our
+pipeline, resulting in a small-scale yet effective reward model for policy
+model alignment.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ PostDoc: Generating Poster from a Long Multimodal Document Using Deep
+  Submodular Optimization 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20213v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20213v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Vijay Jaisankar, Sambaran Bandyopadhyay, Kalp Vyas, Varre Chaitanya, Shwetha Somasundaram
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  A poster from a long input document can be considered as a one-page
+easy-to-read multimodal (text and images) summary presented on a nice template
+with good design elements. Automatic transformation of a long document into a
+poster is a very less studied but challenging task. It involves content
+summarization of the input document followed by template generation and
+harmonization. In this work, we propose a novel deep submodular function which
+can be trained on ground truth summaries to extract multimodal content from the
+document and explicitly ensures good coverage, diversity and alignment of text
+and images. Then, we use an LLM based paraphraser and propose to generate a
+template with various design aspects conditioned on the input content. We show
+the merits of our approach through extensive automated and human evaluations.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Jina CLIP: Your CLIP Model Is Also Your Text Retriever <span class="chip">ICML2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20204v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20204v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Andreas Koukounas, Georgios Mastrapas, Michael Günther, Bo Wang, Scott Martens, Isabelle Mohr, Saba Sturua, Mohammad Kalim Akram, Joan Fontanals Martínez, Saahil Ognawala, Susana Guzman, Maximilian Werk, Nan Wang, Han Xiao
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Contrastive Language-Image Pretraining (CLIP) is widely used to train models
+to align images and texts in a common embedding space by mapping them to
+fixed-sized vectors. These models are key to multimodal information retrieval
+and related tasks. However, CLIP models generally underperform in text-only
+tasks compared to specialized text models. This creates inefficiencies for
+information retrieval systems that keep separate embeddings and models for
+text-only and multimodal tasks. We propose a novel, multi-task contrastive
+training method to address this issue, which we use to train the jina-clip-v1
+model to achieve the state-of-the-art performance on both text-image and
+text-text retrieval tasks.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>4 pages, ICML2024 workshop submission</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ TAIA: Large Language Models are Out-of-Distribution Data Learners 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20192v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20192v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Shuyang Jiang, Yusheng Liao, Ya Zhang, Yu Wang, Yanfeng Wang
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Fine-tuning on task-specific question-answer pairs is a predominant method
+for enhancing the performance of instruction-tuned large language models (LLMs)
+on downstream tasks. However, in certain specialized domains, such as
+healthcare or harmless content generation, it is nearly impossible to obtain a
+large volume of high-quality data that matches the downstream distribution. To
+improve the performance of LLMs in data-scarce domains with domain-mismatched
+data, we re-evaluated the Transformer architecture and discovered that not all
+parameter updates during fine-tuning contribute positively to downstream
+performance. Our analysis reveals that within the self-attention and
+feed-forward networks, only the fine-tuned attention parameters are
+particularly beneficial when the training set's distribution does not fully
+align with the test set. Based on this insight, we propose an effective
+inference-time intervention method: \uline{T}raining \uline{A}ll parameters but
+\uline{I}nferring with only \uline{A}ttention (\trainallInfAttn). We
+empirically validate \trainallInfAttn using two general instruction-tuning
+datasets and evaluate it on seven downstream tasks involving math, reasoning,
+and knowledge understanding across LLMs of different parameter sizes and
+fine-tuning techniques. Our comprehensive experiments demonstrate that
+\trainallInfAttn achieves superior improvements compared to both the fully
+fine-tuned model and the base model in most scenarios, with significant
+performance gains. The high tolerance of \trainallInfAttn to data mismatches
+makes it resistant to jailbreaking tuning and enhances specialized tasks using
+general data.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>25 pages</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Robo-Instruct: Simulator-Augmented Instruction Alignment For Finetuning
+  CodeLLMs 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20179v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20179v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Zichao Hu, Junyi Jessy Li, Arjun Guha, Joydeep Biswas
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Large language models (LLMs) have shown great promise at generating robot
+programs from natural language given domain-specific robot application
+programming interfaces (APIs). However, the performance gap between proprietary
+LLMs and smaller open-weight LLMs remains wide. This raises a question: Can we
+fine-tune smaller open-weight LLMs for generating domain-specific robot
+programs to close the performance gap with proprietary LLMs? While
+Self-Instruct is a promising solution by generating a diverse set of training
+data, it cannot verify the correctness of these programs. In contrast, a robot
+simulator with a well-defined world can identify execution errors but limits
+the diversity of programs that it can verify. In this work, we introduce
+Robo-Instruct, which brings the best of both worlds -- it promotes the
+diversity of Self-Instruct while providing the correctness of simulator-based
+checking. Robo-Instruct introduces RoboSim to synthesize a consistent world
+state on the fly by inferring properties relevant to the program being checked,
+and simulating actions accordingly. Furthermore, the instructions and programs
+generated by Self-Instruct may be subtly inconsistent -- such as the program
+missing a step implied by the instruction. Robo-Instruct further addresses this
+with InstAlign, an instruction-program alignment procedure that revises the
+task instruction to reflect the actual results of the generated program. Given
+a few seed task descriptions and the robot APIs, Robo-Instruct is capable of
+generating a training dataset using only a small open-weight model. This
+dataset can then be used to fine-tune small open-weight language models,
+enabling them to match or even exceed the performance of several proprietary
+LLMs, such as GPT-3.5-Turbo and Gemini-Pro.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ InstructionCP: A fast approach to transfer Large Language Models into
+  target language 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20175v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20175v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Kuang-Ming Chen, Hung-yi Lee
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  The rapid development of large language models (LLMs) in recent years has
+largely focused on English, resulting in models that respond exclusively in
+English. To adapt these models to other languages, continual pre-training (CP)
+is often employed, followed by supervised fine-tuning (SFT) to maintain
+conversational abilities. However, CP and SFT can reduce a model's ability to
+filter harmful content. We propose Instruction Continual Pre-training (InsCP),
+which integrates instruction tags into the CP process to prevent loss of
+conversational proficiency while acquiring new languages. Our experiments
+demonstrate that InsCP retains conversational and Reinforcement Learning from
+Human Feedback (RLHF) abilities. Empirical evaluations on language alignment,
+reliability, and knowledge benchmarks confirm the efficacy of InsCP. Notably,
+this approach requires only 0.1 billion tokens of high-quality
+instruction-following data, thereby reducing resource consumption.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>10 pages, 1 figure</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Iterative Feature Boosting for Explainable Speech Emotion Recognition <span class="chip">ICML</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20172v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20172v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Alaa Nfissi, Wassim Bouachir, Nizar Bouguila, Brian Mishara
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  In speech emotion recognition (SER), using predefined features without
+considering their practical importance may lead to high dimensional datasets,
+including redundant and irrelevant information. Consequently, high-dimensional
+learning often results in decreasing model accuracy while increasing
+computational complexity. Our work underlines the importance of carefully
+considering and analyzing features in order to build efficient SER systems. We
+present a new supervised SER method based on an efficient feature engineering
+approach. We pay particular attention to the explainability of results to
+evaluate feature relevance and refine feature sets. This is performed
+iteratively through feature evaluation loop, using Shapley values to boost
+feature selection and improve overall framework performance. Our approach
+allows thus to balance the benefits between model performance and transparency.
+The proposed method outperforms human-level performance (HLP) and
+state-of-the-art machine learning methods in emotion recognition on the TESS
+dataset.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Published in: 2023 International Conference on Machine Learning and
+  Applications (ICMLA)</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Reasoning about concepts with LLMs: Inconsistencies abound 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20163v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20163v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Rosario Uceda-Sosa, Karthikeyan Natesan Ramamurthy, Maria Chang, Moninder Singh
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  The ability to summarize and organize knowledge into abstract concepts is key
+to learning and reasoning. Many industrial applications rely on the consistent
+and systematic use of concepts, especially when dealing with decision-critical
+knowledge. However, we demonstrate that, when methodically questioned, large
+language models (LLMs) often display and demonstrate significant
+inconsistencies in their knowledge. Computationally, the basic aspects of the
+conceptualization of a given domain can be represented as Is-A hierarchies in a
+knowledge graph (KG) or ontology, together with a few properties or axioms that
+enable straightforward reasoning. We show that even simple ontologies can be
+used to reveal conceptual inconsistencies across several LLMs. We also propose
+strategies that domain experts can use to evaluate and improve the coverage of
+key domain concepts in LLMs of various sizes. In particular, we have been able
+to significantly enhance the performance of LLMs of various sizes with openly
+available weights using simple knowledge-graph (KG) based prompting strategies.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>15 pages, 5 figures, 3 tables</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Heidelberg-Boston @ SIGTYP 2024 Shared Task: Enhancing Low-Resource
+  Language Analysis With Character-Aware Hierarchical <span class="highlight-title">Transformer</span>s 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20145v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20145v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Frederick Riemenschneider, Kevin Krahn
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Historical languages present unique challenges to the NLP community, with one
+prominent hurdle being the limited resources available in their closed corpora.
+This work describes our submission to the constrained subtask of the SIGTYP
+2024 shared task, focusing on PoS tagging, morphological tagging, and
+lemmatization for 13 historical languages. For PoS and morphological tagging we
+adapt a hierarchical tokenization method from Sun et al. (2023) and combine it
+with the advantages of the DeBERTa-V3 architecture, enabling our models to
+efficiently learn from every character in the training data. We also
+demonstrate the effectiveness of character-level T5 models on the lemmatization
+task. Pre-trained from scratch with limited data, our models achieved first
+place in the constrained subtask, nearly reaching the performance levels of the
+unconstrained task's winner. Our code is available at
+https://github.com/bowphs/SIGTYP-2024-hierarchical-transformers
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted for publication at the 6th Workshop on Research in
+  Computational Linguistic Typology and Multilingual NLP (SIGTYP-WS) 2024; 11
+  pages, 1 figure, 9 tables</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ GNN-RAG: Graph Neural Retrieval for Large Language Model Reasoning 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20139v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20139v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Costas Mavromatis, George Karypis
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Knowledge Graphs (KGs) represent human-crafted factual knowledge in the form
+of triplets (head, relation, tail), which collectively form a graph. Question
+Answering over KGs (KGQA) is the task of answering natural questions grounding
+the reasoning to the information provided by the KG. Large Language Models
+(LLMs) are the state-of-the-art models for QA tasks due to their remarkable
+ability to understand natural language. On the other hand, Graph Neural
+Networks (GNNs) have been widely used for KGQA as they can handle the complex
+graph information stored in the KG. In this work, we introduce GNN-RAG, a novel
+method for combining language understanding abilities of LLMs with the
+reasoning abilities of GNNs in a retrieval-augmented generation (RAG) style.
+First, a GNN reasons over a dense KG subgraph to retrieve answer candidates for
+a given question. Second, the shortest paths in the KG that connect question
+entities and answer candidates are extracted to represent KG reasoning paths.
+The extracted paths are verbalized and given as input for LLM reasoning with
+RAG. In our GNN-RAG framework, the GNN acts as a dense subgraph reasoner to
+extract useful graph information, while the LLM leverages its natural language
+processing ability for ultimate KGQA. Furthermore, we develop a retrieval
+augmentation (RA) technique to further boost KGQA performance with GNN-RAG.
+Experimental results show that GNN-RAG achieves state-of-the-art performance in
+two widely used KGQA benchmarks (WebQSP and CWQ), outperforming or matching
+GPT-4 performance with a 7B tuned LLM. In addition, GNN-RAG excels on multi-hop
+and multi-entity questions outperforming competing approaches by 8.9--15.5%
+points at answer F1.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Language Models Need Inductive Biases to Count Inductively 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20131v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20131v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Yingshan Chang, Yonatan Bisk
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Counting is a fundamental example of generalization, whether viewed through
+the mathematical lens of Peano's axioms defining the natural numbers or the
+cognitive science literature for children learning to count. The argument holds
+for both cases that learning to count means learning to count infinitely. While
+few papers have tried to distill transformer "reasoning" to the simplest case
+of counting, investigating length generalization does occur throughout the
+literature. In the "train short, test long" paradigm of NLP, length refers to
+the training sentence length. In formal language recognition, length refers to
+the input sequence length, or the maximum stack size induced by a pushdown
+automata. In general problem solving, length refers to the number of hops in a
+deductive reasoning chain or the recursion depth. For all cases, counting is
+central to task success. And crucially, generalizing counting inductively is
+central to success on OOD instances. This work provides extensive empirical
+results on training language models to count. We experiment with architectures
+ranging from RNNs, Transformers, State-Space Models and RWKV. We present
+carefully-designed task formats, auxiliary tasks and positional embeddings to
+avoid limitations in generalization with OOD-position and OOD-vocabulary. We
+find that while traditional RNNs trivially achieve inductive counting,
+Transformers have to rely on positional embeddings to count out-of-domain. As
+counting is the basis for many arguments concerning the expressivity of
+Transformers, our finding calls for the community to reexamine the application
+scope of primitive functions defined in formal characterizations. Finally,
+modern RNNs also largely underperform traditional RNNs in generalizing counting
+inductively. We discuss how design choices that enable parallelized training of
+modern RNNs cause them to lose merits of a recurrent nature.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Fill in the Gap! Combining <span class="highlight-title">Self-supervised</span> Representation Learning with
+  Neural Audio Synthesis for Speech Inpainting 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20101v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20101v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Ihab Asaad, Maxime Jacquelin, Olivier Perrotin, Laurent Girin, Thomas Hueber
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Most speech self-supervised learning (SSL) models are trained with a pretext
+task which consists in predicting missing parts of the input signal, either
+future segments (causal prediction) or segments masked anywhere within the
+input (non-causal prediction). Learned speech representations can then be
+efficiently transferred to downstream tasks (e.g., automatic speech or speaker
+recognition). In the present study, we investigate the use of a speech SSL
+model for speech inpainting, that is reconstructing a missing portion of a
+speech signal from its surrounding context, i.e., fulfilling a downstream task
+that is very similar to the pretext task. To that purpose, we combine an SSL
+encoder, namely HuBERT, with a neural vocoder, namely HiFiGAN, playing the role
+of a decoder. In particular, we propose two solutions to match the HuBERT
+output with the HiFiGAN input, by freezing one and fine-tuning the other, and
+vice versa. Performance of both approaches was assessed in single- and
+multi-speaker settings, for both informed and blind inpainting configurations
+(i.e., the position of the mask is known or unknown, respectively), with
+different objective metrics and a perceptual evaluation. Performances show that
+if both solutions allow to correctly reconstruct signal portions up to the size
+of 200ms (and even 400ms in some cases), fine-tuning the SSL encoder provides a
+more accurate signal reconstruction in the single-speaker setting case, while
+freezing it (and training the neural vocoder instead) is a better strategy when
+dealing with multi-speaker data.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Divide-and-Conquer Meets Consensus: Unleashing the Power of Functions in
+  Code Generation 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20092v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20092v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Jingchang Chen, Hongxuan Tang, Zheng Chu, Qianglong Chen, Zekun Wang, Ming Liu, Bing Qin
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Despite recent progress made by large language models in code generation,
+they still struggle with programs that meet complex requirements. Recent work
+utilizes plan-and-solve decomposition to decrease the complexity and leverage
+self-tests to refine the generated program. Yet, planning deep-inside
+requirements in advance can be challenging, and the tests need to be accurate
+to accomplish self-improvement. To this end, we propose FunCoder, a code
+generation framework incorporating the divide-and-conquer strategy with
+functional consensus. Specifically, FunCoder recursively branches off
+sub-functions as smaller goals during code generation, represented by a tree
+hierarchy. These sub-functions are then composited to attain more complex
+objectives. Additionally, we designate functions via a consensus formed by
+identifying similarities in program behavior, mitigating error propagation.
+FunCoder outperforms state-of-the-art methods by +9.8% on average in HumanEval,
+MBPP, xCodeEval and MATH with GPT-3.5 and GPT-4. Moreover, our method
+demonstrates superiority on smaller models: With FunCoder, StableCode-3b
+surpasses GPT-3.5 by +18.6% and achieves 97.7% of GPT-4's performance on
+HumanEval. Further analysis reveals that our proposed dynamic function
+decomposition is capable of handling complex requirements, and the functional
+consensus prevails over self-testing in correctness evaluation.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ The Fine-Tuning Paradox: Boosting Translation Quality Without
+  Sacrificing LLM Abilities <span class="chip">ACL 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20089v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20089v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        David Stap, Eva Hasler, Bill Byrne, Christof Monz, Ke Tran
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Fine-tuning large language models (LLMs) for machine translation has shown
+improvements in overall translation quality. However, it is unclear what is the
+impact of fine-tuning on desirable LLM behaviors that are not present in neural
+machine translation models, such as steerability, inherent document-level
+translation abilities, and the ability to produce less literal translations. We
+perform an extensive translation evaluation on the LLaMA and Falcon family of
+models with model size ranging from 7 billion up to 65 billion parameters. Our
+results show that while fine-tuning improves the general translation quality of
+LLMs, several abilities degrade. In particular, we observe a decline in the
+ability to perform formality steering, to produce technical translations
+through few-shot examples, and to perform document-level translation. On the
+other hand, we observe that the model produces less literal translations after
+fine-tuning on parallel data. We show that by including monolingual data as
+part of the fine-tuning data we can maintain the abilities while simultaneously
+enhancing overall translation quality. Our findings emphasize the need for
+fine-tuning strategies that preserve the benefits of LLMs for machine
+translation.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted to ACL 2024 (long, main)</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Student Answer Forecasting: <span class="highlight-title">Transformer</span>-Driven Answer Choice Prediction
+  for Language Learning 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20079v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20079v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Elena Grazia Gado, Tommaso Martorella, Luca Zunino, Paola Mejia-Domenzain, Vinitra Swamy, Jibril Frej, Tanja Käser
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Intelligent Tutoring Systems (ITS) enhance personalized learning by
+predicting student answers to provide immediate and customized instruction.
+However, recent research has primarily focused on the correctness of the answer
+rather than the student's performance on specific answer choices, limiting
+insights into students' thought processes and potential misconceptions. To
+address this gap, we present MCQStudentBert, an answer forecasting model that
+leverages the capabilities of Large Language Models (LLMs) to integrate
+contextual understanding of students' answering history along with the text of
+the questions and answers. By predicting the specific answer choices students
+are likely to make, practitioners can easily extend the model to new answer
+choices or remove answer choices for the same multiple-choice question (MCQ)
+without retraining the model. In particular, we compare MLP, LSTM, BERT, and
+Mistral 7B architectures to generate embeddings from students' past
+interactions, which are then incorporated into a finetuned BERT's
+answer-forecasting mechanism. We apply our pipeline to a dataset of language
+learning MCQ, gathered from an ITS with over 10,000 students to explore the
+predictive accuracy of MCQStudentBert, which incorporates student interaction
+patterns, in comparison to correct answer prediction and traditional
+mastery-learning feature-based approaches. This work opens the door to more
+personalized content, modularization, and granular support.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted as a poster paper at EDM 2024: 17th International Conference
+  on Educational Data Mining in Atlanta, USA</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Would I Lie To You? Inference Time Alignment of Language Models using
+  Direct Preference Heads 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20053v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20053v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Avelina Asada Hadji-Kyriacou, Ognjen Arandjelovic
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Pre-trained Language Models (LMs) exhibit strong zero-shot and in-context
+learning capabilities; however, their behaviors are often difficult to control.
+By utilizing Reinforcement Learning from Human Feedback (RLHF), it is possible
+to fine-tune unsupervised LMs to follow instructions and produce outputs that
+reflect human preferences. Despite its benefits, RLHF has been shown to
+potentially harm a language model's reasoning capabilities and introduce
+artifacts such as hallucinations where the model may fabricate facts. To
+address this issue we introduce Direct Preference Heads (DPH), a fine-tuning
+framework that enables LMs to learn human preference signals through an
+auxiliary reward head without directly affecting the output distribution of the
+language modeling head. We perform a theoretical analysis of our objective
+function and find strong ties to Conservative Direct Preference Optimization
+(cDPO). Finally we evaluate our models on GLUE, RACE, and the GPT4All
+evaluation suite and demonstrate that our method produces models which achieve
+higher scores than those fine-tuned with Supervised Fine-Tuning (SFT) or Direct
+Preference Optimization (DPO) alone.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Kernel Language Entropy: Fine-grained Uncertainty Quantification for
+  LLMs from Semantic Similarities 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20003v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20003v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Alexander Nikitin, Jannik Kossen, Yarin Gal, Pekka Marttinen
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Uncertainty quantification in Large Language Models (LLMs) is crucial for
+applications where safety and reliability are important. In particular,
+uncertainty can be used to improve the trustworthiness of LLMs by detecting
+factually incorrect model responses, commonly called hallucinations.
+Critically, one should seek to capture the model's semantic uncertainty, i.e.,
+the uncertainty over the meanings of LLM outputs, rather than uncertainty over
+lexical or syntactic variations that do not affect answer correctness. To
+address this problem, we propose Kernel Language Entropy (KLE), a novel method
+for uncertainty estimation in white- and black-box LLMs. KLE defines positive
+semidefinite unit trace kernels to encode the semantic similarities of LLM
+outputs and quantifies uncertainty using the von Neumann entropy. It considers
+pairwise semantic dependencies between answers (or semantic clusters),
+providing more fine-grained uncertainty estimates than previous methods based
+on hard clustering of answers. We theoretically prove that KLE generalizes the
+previous state-of-the-art method called semantic entropy and empirically
+demonstrate that it improves uncertainty quantification performance across
+multiple natural language generation datasets and LLM architectures.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Improved Out-of-Scope Intent Classification with Dual Encoding and
+  Threshold-based Re-Classification 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.19967v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.19967v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Hossam M. Zawbaa, Wael Rashwan, Sourav Dutta, Haytham Assem
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Detecting out-of-scope user utterances is essential for task-oriented
+dialogues and intent classification. Current methodologies face difficulties
+with the unpredictable distribution of outliers and often rely on assumptions
+about data distributions. We present the Dual Encoder for Threshold-Based
+Re-Classification (DETER) to address these challenges. This end-to-end
+framework efficiently detects out-of-scope intents without requiring
+assumptions on data distributions or additional post-processing steps. The core
+of DETER utilizes dual text encoders, the Universal Sentence Encoder (USE) and
+the Transformer-based Denoising AutoEncoder (TSDAE), to generate user utterance
+embeddings, which are classified through a branched neural architecture.
+Further, DETER generates synthetic outliers using self-supervision and
+incorporates out-of-scope phrases from open-domain datasets. This approach
+ensures a comprehensive training set for out-of-scope detection. Additionally,
+a threshold-based re-classification mechanism refines the model's initial
+predictions. Evaluations on the CLINC-150, Stackoverflow, and Banking77
+datasets demonstrate DETER's efficacy. Our model outperforms previous
+benchmarks, increasing up to 13% and 5% in F1 score for known and unknown
+intents on CLINC-150 and Stackoverflow, and 16% for known and 24% % for unknown
+intents on Banking77. The source code has been released at
+https://github.com/Hossam-Mohammed-tech/Intent\_Classification\_OOS.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Multi-Aspect Controllable Text Generation with Disentangled
+  Counterfactual Augmentation <span class="chip">ACL 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.19958v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.19958v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Yi Liu, Xiangyu Liu, Xiangrong Zhu, Wei Hu
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Multi-aspect controllable text generation aims to control the generated texts
+in attributes from multiple aspects (e.g., "positive" from sentiment and
+"sport" from topic). For ease of obtaining training samples, existing works
+neglect attribute correlations formed by the intertwining of different
+attributes. Particularly, the stereotype formed by imbalanced attribute
+correlations significantly affects multi-aspect control. In this paper, we
+propose MAGIC, a new multi-aspect controllable text generation method with
+disentangled counterfactual augmentation. We alleviate the issue of imbalanced
+attribute correlations during training using counterfactual feature vectors in
+the attribute latent space by disentanglement. During inference, we enhance
+attribute correlations by target-guided counterfactual augmentation to further
+improve multi-aspect control. Experiments show that MAGIC outperforms
+state-of-the-art baselines in both imbalanced and balanced attribute
+correlation scenarios. Our source code and data are available at
+https://github.com/nju-websoft/MAGIC.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted in the 62nd Annual Meeting of the Association for
+  Computational Linguistics (ACL 2024)</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ GenKubeSec: LLM-Based Kubernetes Misconfiguration Detection,
+  Localization, Reasoning, and Remediation 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.19954v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.19954v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Ehud Malul, Yair Meidan, Dudu Mimran, Yuval Elovici, Asaf Shabtai
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  A key challenge associated with Kubernetes configuration files (KCFs) is that
+they are often highly complex and error-prone, leading to security
+vulnerabilities and operational setbacks. Rule-based (RB) tools for KCF
+misconfiguration detection rely on static rule sets, making them inherently
+limited and unable to detect newly-discovered misconfigurations. RB tools also
+suffer from misdetection, since mistakes are likely when coding the detection
+rules. Recent methods for detecting and remediating KCF misconfigurations are
+limited in terms of their scalability and detection coverage, or due to the
+fact that they have high expertise requirements and do not offer automated
+remediation along with misconfiguration detection. Novel approaches that employ
+LLMs in their pipeline rely on API-based, general-purpose, and mainly
+commercial models. Thus, they pose security challenges, have inconsistent
+classification performance, and can be costly. In this paper, we propose
+GenKubeSec, a comprehensive and adaptive, LLM-based method, which, in addition
+to detecting a wide variety of KCF misconfigurations, also identifies the exact
+location of the misconfigurations and provides detailed reasoning about them,
+along with suggested remediation. When empirically compared with three
+industry-standard RB tools, GenKubeSec achieved equivalent precision (0.990)
+and superior recall (0.999). When a random sample of KCFs was examined by a
+Kubernetes security expert, GenKubeSec's explanations as to misconfiguration
+localization, reasoning and remediation were 100% correct, informative and
+useful. To facilitate further advancements in this domain, we share the unique
+dataset we collected, a unified misconfiguration index we developed for label
+standardization, our experimentation code, and GenKubeSec itself as an
+open-source tool.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ KNOW: A Real-World Ontology for Knowledge Capture with Large Language
+  Models 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.19877v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.19877v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Arto Bendiken
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  We present KNOW--the Knowledge Navigator Ontology for the World--the first
+ontology designed to capture everyday knowledge to augment large language
+models (LLMs) in real-world generative AI use cases such as personal AI
+assistants. Our domain is human life, both its everyday concerns and its major
+milestones. We have limited the initial scope of the modeled concepts to only
+established human universals: spacetime (places, events) plus social (people,
+groups, organizations). The inclusion criteria for modeled concepts are
+pragmatic, beginning with universality and utility. We compare and contrast
+previous work such as Schema.org and Cyc--as well as attempts at a synthesis of
+knowledge graphs and language models--noting how LLMs already encode internally
+much of the commonsense tacit knowledge that took decades to capture in the Cyc
+project. We also make available code-generated software libraries for the 12
+most popular programming languages, enabling the direct use of ontology
+concepts in software engineering. We emphasize simplicity and developer
+experience in promoting AI interoperability.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>5 pages, 1 figure</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Is In-Context Learning Sufficient for Instruction Following in LLMs? 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.19874v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.19874v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Hao Zhao, Maksym Andriushchenko, Francesco Croce, Nicolas Flammarion
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  In-context learning (ICL) allows LLMs to learn from examples without changing
+their weights, which is a particularly promising capability for long-context
+LLMs that can potentially learn from many examples. Recently, Lin et al. (2024)
+proposed URIAL, a method using only three in-context examples to align base
+LLMs, achieving non-trivial instruction following performance. In this work, we
+show that, while effective, ICL alignment with URIAL still underperforms
+compared to instruction fine-tuning on established benchmarks such as MT-Bench
+and AlpacaEval 2.0 (LC), especially with more capable base LMs. Unlike for
+tasks such as classification, translation, or summarization, adding more ICL
+demonstrations for long-context LLMs does not systematically improve
+instruction following performance. To address this limitation, we derive a
+greedy selection approach for ICL examples that noticeably improves
+performance, yet without bridging the gap to instruction fine-tuning. Finally,
+we provide a series of ablation studies to better understand the reasons behind
+the remaining gap, and we show how some aspects of ICL depart from the existing
+knowledge and are specific to the instruction tuning setting. Overall, our work
+advances the understanding of ICL as an alignment technique. We provide our
+code at https://github.com/tml-epfl/icl-alignment.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Preprint. Code at https://github.com/tml-epfl/icl-alignment</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ DevEval: A Manually-Annotated Code Generation Benchmark Aligned with
+  Real-World Code Repositories <span class="chip">ACL 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.19856v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.19856v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Jia Li, Ge Li, Yunfei Zhao, Yongmin Li, Huanyu Liu, Hao Zhu, Lecheng Wang, Kaibo Liu, Zheng Fang, Lanshen Wang, Jiazheng Ding, Xuanming Zhang, Yuqi Zhu, Yihong Dong, Zhi Jin, Binhua Li, Fei Huang, Yongbin Li
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  How to evaluate the coding abilities of Large Language Models (LLMs) remains
+an open question. We find that existing benchmarks are poorly aligned with
+real-world code repositories and are insufficient to evaluate the coding
+abilities of LLMs.
+  To address the knowledge gap, we propose a new benchmark named DevEval, which
+has three advances. (1) DevEval aligns with real-world repositories in multiple
+dimensions, e.g., code distributions and dependency distributions. (2) DevEval
+is annotated by 13 developers and contains comprehensive annotations (e.g.,
+requirements, original repositories, reference code, and reference
+dependencies). (3) DevEval comprises 1,874 testing samples from 117
+repositories, covering 10 popular domains (e.g., Internet, Database). Based on
+DevEval, we propose repository-level code generation and evaluate 8 popular
+LLMs on DevEval (e.g., gpt-4, gpt-3.5, StarCoder 2, DeepSeek Coder, CodeLLaMa).
+Our experiments reveal these LLMs' coding abilities in real-world code
+repositories. For example, in our experiments, the highest Pass@1 of
+gpt-4-turbo is only 53.04%. We also analyze LLMs' failed cases and summarize
+their shortcomings. We hope DevEval can facilitate the development of LLMs in
+real code repositories. DevEval, prompts, and LLMs' predictions have been
+released.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted by the 62nd Annual Meeting of the Association for
+  Computational Linguistics (ACL 2024). arXiv admin note: substantial text
+  overlap with arXiv:2404.00599, arXiv:2401.06401</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Quest: Query-centric Data Synthesis Approach for Long-context Scaling of
+  Large Language Model 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.19846v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.19846v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Chaochen Gao, Xing Wu, Qi Fu, Songlin Hu
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Large language models, initially pre-trained with a limited context length,
+can better handle longer texts by continuing training on a corpus with extended
+contexts. However, obtaining effective long-context data is challenging due to
+the scarcity and uneven distribution of long documents across different
+domains. To address this issue, we propose a Query-centric data synthesis
+method, abbreviated as Quest. Quest is an interpretable method based on the
+observation that documents retrieved by similar queries are relevant but
+low-redundant, thus well-suited for synthesizing long-context data. The method
+is also scalable and capable of constructing large amounts of long-context
+data. Using Quest, we synthesize a long-context dataset up to 128k context
+length, significantly outperforming other data synthesis methods on multiple
+long-context benchmark datasets. In addition, we further verify that the Quest
+method is predictable through scaling law experiments, making it a reliable
+solution for advancing long-context models.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Improve Student's Reasoning Generalizability through Cascading
+  Decomposed CoTs Distillation 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.19842v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.19842v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Chengwei Dai, Kun Li, Wei Zhou, Songlin Hu
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Large language models (LLMs) exhibit enhanced reasoning at larger scales,
+driving efforts to distill these capabilities into smaller models via
+teacher-student learning. Previous works simply fine-tune student models on
+teachers' generated Chain-of-Thoughts (CoTs) data. Although these methods
+enhance in-domain (IND) reasoning performance, they struggle to generalize to
+out-of-domain (OOD) tasks. We believe that the widespread spurious correlations
+between questions and answers may lead the model to preset a specific answer
+which restricts the diversity and generalizability of its reasoning process. In
+this paper, we propose Cascading Decomposed CoTs Distillation (CasCoD) to
+address these issues by decomposing the traditional single-step learning
+process into two cascaded learning steps. Specifically, by restructuring the
+training objectives -- removing the answer from outputs and concatenating the
+question with the rationale as input -- CasCoD's two-step learning process
+ensures that students focus on learning rationales without interference from
+the preset answers, thus improving reasoning generalizability. Extensive
+experiments demonstrate the effectiveness of CasCoD on both IND and OOD
+benchmark reasoning datasets. Code can be found at
+https://github.com/C-W-D/CasCoD.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Just Rewrite It Again: A Post-Processing Method for Enhanced Semantic
+  Similarity and Privacy Preservation of Differentially Private Rewritten Text 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.19831v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.19831v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Stephen Meisenbacher, Florian Matthes
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  The study of Differential Privacy (DP) in Natural Language Processing often
+views the task of text privatization as a $\textit{rewriting}$ task, in which
+sensitive input texts are rewritten to hide explicit or implicit private
+information. In order to evaluate the privacy-preserving capabilities of a DP
+text rewriting mechanism, $\textit{empirical privacy}$ tests are frequently
+employed. In these tests, an adversary is modeled, who aims to infer sensitive
+information (e.g., gender) about the author behind a (privatized) text. Looking
+to improve the empirical protections provided by DP rewriting methods, we
+propose a simple post-processing method based on the goal of aligning rewritten
+texts with their original counterparts, where DP rewritten texts are rewritten
+$\textit{again}$. Our results shown that such an approach not only produces
+outputs that are more semantically reminiscent of the original inputs, but also
+texts which score on average better in empirical privacy evaluations.
+Therefore, our approach raises the bar for DP rewriting methods in their
+empirical privacy evaluations, providing an extra layer of protection against
+malicious adversaries.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>10 pages, 2 figures, 2 tables. Accepted to ARES 2024 (IWAPS)</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Unsupervised Mutual Learning of Dialogue Discourse Parsing and Topic
+  Segmentation 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.19799v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.19799v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Jiahui Xu, Feng Jiang, Anningzhe Gao, Haizhou Li
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  The advancement of large language models (LLMs) has propelled the development
+of dialogue systems. Unlike the popular ChatGPT-like assistant model, which
+only satisfies the user's preferences, task-oriented dialogue systems have also
+faced new requirements and challenges in the broader business field. They are
+expected to provide correct responses at each dialogue turn, at the same time,
+achieve the overall goal defined by the task. By understanding rhetorical
+structures and topic structures via topic segmentation and discourse parsing, a
+dialogue system may do a better planning to achieve both objectives. However,
+while both structures belong to discourse structure in linguistics, rhetorical
+structure and topic structure are mostly modeled separately or with one
+assisting the other in the prior work. The interaction between these two
+structures has not been considered for joint modeling and mutual learning.
+Furthermore, unsupervised learning techniques to achieve the above are not well
+explored. To fill this gap, we propose an unsupervised mutual learning
+framework of two structures leveraging the global and local connections between
+them. We extend the topic modeling between non-adjacent discourse units to
+ensure global structural relevance with rhetorical structures. We also
+incorporate rhetorical structures into the topic structure through a graph
+neural network model to ensure local coherence consistency. Finally, we utilize
+the similarity between the two fused structures for mutual learning. The
+experimental results demonstrate that our methods outperform all strong
+baselines on two dialogue rhetorical datasets (STAC and Molweni), as well as
+dialogue topic datasets (Doc2Dial and TIAGE).
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ SLM as Guardian: Pioneering AI Safety with Small Language Models 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.19795v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.19795v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Ohjoon Kwon, Donghyeon Jeon, Nayoung Choi, Gyu-Hwung Cho, Changbong Kim, Hyunwoo Lee, Inho Kang, Sun Kim, Taiwoo Park
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Most prior safety research of large language models (LLMs) has focused on
+enhancing the alignment of LLMs to better suit the safety requirements of
+humans. However, internalizing such safeguard features into larger models
+brought challenges of higher training cost and unintended degradation of
+helpfulness. To overcome such challenges, a modular approach employing a
+smaller LLM to detect harmful user queries is regarded as a convenient solution
+in designing LLM-based system with safety requirements.
+  In this paper, we leverage a smaller LLM for both harmful query detection and
+safeguard response generation. We introduce our safety requirements and the
+taxonomy of harmfulness categories, and then propose a multi-task learning
+mechanism fusing the two tasks into a single model. We demonstrate the
+effectiveness of our approach, providing on par or surpassing harmful query
+detection and safeguard response performance compared to the publicly available
+LLMs.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ PDDLEGO: Iterative Planning in Textual Environments 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.19793v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.19793v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Li Zhang, Peter Jansen, Tianyi Zhang, Peter Clark, Chris Callison-Burch, Niket Tandon
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Planning in textual environments have been shown to be a long-standing
+challenge even for current models. A recent, promising line of work uses LLMs
+to generate a formal representation of the environment that can be solved by a
+symbolic planner. However, existing methods rely on a fully-observed
+environment where all entity states are initially known, so a one-off
+representation can be constructed, leading to a complete plan. In contrast, we
+tackle partially-observed environments where there is initially no sufficient
+information to plan for the end-goal. We propose PDDLEGO that iteratively
+construct a planning representation that can lead to a partial plan for a given
+sub-goal. By accomplishing the sub-goal, more information is acquired to
+augment the representation, eventually achieving the end-goal. We show that
+plans produced by few-shot PDDLEGO are 43% more efficient than generating plans
+end-to-end on the Coin Collector simulation, with strong performance (98%) on
+the more complex Cooking World simulation where end-to-end LLMs fail to
+generate coherent plans (4%).
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>In *SEM 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ From Symbolic Tasks to Code Generation: Diversification Yields Better
+  Task Performers 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.19787v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.19787v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Dylan Zhang, Justin Wang, Francois Charton
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Instruction tuning -- tuning large language models on instruction-output
+pairs -- is a promising technique for making models better adapted to the real
+world. Yet, the key factors driving the model's capability to understand and
+follow instructions not seen during training remain under-explored. Our
+investigation begins with a series of synthetic experiments within the
+theoretical framework of a Turing-complete algorithm called Markov algorithm,
+which allows fine-grained control over the instruction-tuning data.
+Generalization and robustness with respect to the training distribution emerge
+once a diverse enough set of tasks is provided, even though very few examples
+are provided for each task. We extend these initial results to a real-world
+application scenario of code generation and find that a more diverse
+instruction set, extending beyond code-related tasks, improves the performance
+of code generation. Our observations suggest that a more diverse semantic space
+for instruction-tuning sets greatly improves the model's ability to follow
+instructions and perform tasks.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Dataflow-Guided Retrieval Augmentation for Repository-Level Code
+  Completion <span class="chip">ACL 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.19782v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.19782v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Wei Cheng, Yuhan Wu, Wei Hu
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Recent years have witnessed the deployment of code language models (LMs) in
+various code intelligence tasks such as code completion. Yet, it is challenging
+for pre-trained LMs to generate correct completions in private repositories.
+Previous studies retrieve cross-file context based on import relations or text
+similarity, which is insufficiently relevant to completion targets. In this
+paper, we propose a dataflow-guided retrieval augmentation approach, called
+DraCo, for repository-level code completion. DraCo parses a private repository
+into code entities and establishes their relations through an extended dataflow
+analysis, forming a repo-specific context graph. Whenever triggering code
+completion, DraCo precisely retrieves relevant background knowledge from the
+repo-specific context graph and generates well-formed prompts to query code
+LMs. Furthermore, we construct a large Python dataset, ReccEval, with more
+diverse completion targets. Our experiments demonstrate the superior accuracy
+and applicable efficiency of DraCo, improving code exact match by 3.43% and
+identifier F1-score by 3.27% on average compared to the state-of-the-art
+approach.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted in the 62nd Annual Meeting of the Association for
+  Computational Linguistics (ACL 2024)</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Enhancing Consistency and Role-Specific Knowledge Capturing by
+  Rebuilding Fictional Character's Persona 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.19778v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.19778v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Jeiyoon Park, Chanjun Park, Heuiseok Lim
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  With the recent introduction of Assistants API, it is expected that
+document-based language models will be actively used in various domains,
+especially Role-playing. However, a key challenge lies in utilizing
+protagonist's persona: Assistants API often fails to achieve with its search
+because the information extraction part is different each time and it often
+omits important information such as protagonist's backstory or relationships.
+It is hard to maintain a consistent persona simply by using the persona
+document as input to the Assistants API. To address the challenge of achieving
+stable persona consistency, we propose CharacterGPT, a novel persona
+reconstruction framework to alleviate the shortcomings of the Assistants API.
+Our method involves Character Persona Training (CPT), an effective persona
+rebuilding process that updates the character persona by extracting the
+character's traits from given summary of the novel for each character as if the
+story in a novel progresses. In our experiments, we ask each character to take
+the Big Five Inventory personality test in various settings and analyze the
+results. To assess whether it can think outside the box, we let each character
+generate short novels. Extensive experiments and human evaluation demonstrate
+that CharacterGPT presents new possibilities for role-playing agent research.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>preprint</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Enhancing Reinforcement Learning with Label-Sensitive Reward for Natural
+  Language Understanding <span class="chip">ACL2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.19763v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.19763v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Kuo Liao, Shuang Li, Meng Zhao, Liqun Liu, Mengge Xue, Zhenyu Hu, Honglin Han, Chengguo Yin
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Recent strides in large language models (LLMs) have yielded remarkable
+performance, leveraging reinforcement learning from human feedback (RLHF) to
+significantly enhance generation and alignment capabilities. However, RLHF
+encounters numerous challenges, including the objective mismatch issue, leading
+to suboptimal performance in Natural Language Understanding (NLU) tasks. To
+address this limitation, we propose a novel Reinforcement Learning framework
+enhanced with Label-sensitive Reward (RLLR) to amplify the performance of LLMs
+in NLU tasks. By incorporating label-sensitive pairs into reinforcement
+learning, our method aims to adeptly capture nuanced label-sensitive semantic
+features during RL, thereby enhancing natural language understanding.
+Experiments conducted on five diverse foundation models across eight tasks
+showcase promising results. In comparison to Supervised Fine-tuning models
+(SFT), RLLR demonstrates an average performance improvement of 1.54%. Compared
+with RLHF models, the improvement averages at 0.69%. These results reveal the
+effectiveness of our method for LLMs in NLU tasks. Code and data available at:
+https://github.com/MagiaSN/ACL2024_RLLR.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accept at ACL2024 Main</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ X-Instruction: Aligning Language Model in Low-resource Languages with
+  Self-curated Cross-lingual Instructions <span class="chip">ACL 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.19744v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.19744v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Chong Li, Wen Yang, Jiajun Zhang, Jinliang Lu, Shaonan Wang, Chengqing Zong
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Large language models respond well in high-resource languages like English
+but struggle in low-resource languages. It may arise from the lack of
+high-quality instruction following data in these languages. Directly
+translating English samples into these languages can be a solution but
+unreliable, leading to responses with translation errors and lacking
+language-specific or cultural knowledge. To address this issue, we propose a
+novel method to construct cross-lingual instruction following samples with
+instruction in English and response in low-resource languages. Specifically,
+the language model first learns to generate appropriate English instructions
+according to the natural web texts in other languages as responses. The
+candidate cross-lingual instruction tuning samples are further refined and
+diversified. We have employed this method to build a large-scale cross-lingual
+instruction tuning dataset on 10 languages, namely X-Instruction. The
+instruction data built using our method incorporate more language-specific
+knowledge compared with the naive translation method. Experimental results have
+shown that the response quality of the model tuned on X-Instruction greatly
+exceeds the model distilled from a powerful teacher model, reaching or even
+surpassing the ones of ChatGPT. In addition, we find that models tuned on
+cross-lingual instruction following samples can follow the instruction in the
+output language without further tuning.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>ACL 2024. Our codes, data and model weights are available at
+  https://github.com/ZNLP/X-Instruction</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ PertEval: Unveiling Real Knowledge Capacity of LLMs with
+  Knowledge-Invariant Perturbations 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.19740v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.19740v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Jiatong Li, Renjun Hu, Kunzhe Huang, Yan Zhuang, Qi Liu, Mengxiao Zhu, Xing Shi, Wei Lin
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Expert-designed close-ended benchmarks serve as vital tools in assessing the
+knowledge capacity of large language models (LLMs). Despite their widespread
+use, concerns have mounted regarding their reliability due to limited test
+scenarios and an unavoidable risk of data contamination. To rectify this, we
+present PertEval, a toolkit devised for in-depth probing of LLMs' knowledge
+capacity through knowledge-invariant perturbations. These perturbations employ
+human-like restatement techniques to generate on-the-fly test samples from
+static benchmarks, meticulously retaining knowledge-critical content while
+altering irrelevant details. Our toolkit further includes a suite of transition
+analyses that compare performance on raw vs. perturbed test sets to precisely
+assess LLMs' genuine knowledge capacity. Six state-of-the-art LLMs are
+re-evaluated using PertEval. Results reveal significantly inflated performance
+of the LLMs on raw benchmarks, including an absolute 21% overestimation for
+GPT-4. Additionally, through a nuanced response pattern analysis, we discover
+that PertEval retains LLMs' uncertainty to specious knowledge, potentially
+being resolved through rote memorization and leading to inflated performance.
+We also find that the detailed transition analyses by PertEval could illuminate
+weaknesses in existing LLMs' knowledge mastery and guide the development of
+refinement. Given these insights, we posit that PertEval can act as an
+essential tool that, when applied alongside any close-ended benchmark, unveils
+the true knowledge capacity of LLMs, marking a significant step toward more
+trustworthy LLM evaluation.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>23 pages, 12 figures, 10 tables</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Beyond Imitation: Learning Key Reasoning Steps from Dual
+  Chain-of-Thoughts in Reasoning Distillation 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.19737v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.19737v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Chengwei Dai, Kun Li, Wei Zhou, Songlin Hu
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  As Large Language Models (LLMs) scale up and gain powerful Chain-of-Thoughts
+(CoTs) reasoning abilities, practical resource constraints drive efforts to
+distill these capabilities into more compact Smaller Language Models (SLMs). We
+find that CoTs consist mainly of simple reasoning forms, with a small
+proportion ($\approx 4.7\%$) of key reasoning steps that truly impact
+conclusions. However, previous distillation methods typically involve
+supervised fine-tuning student SLMs only on correct CoTs data produced by
+teacher LLMs, resulting in students struggling to learn the key reasoning
+steps, instead imitating the teacher's reasoning forms and making errors or
+omissions on these steps. To address these issues, drawing an analogy to human
+learning, where analyzing mistakes according to correct solutions often reveals
+the crucial steps leading to successes or failures, we propose
+mistak\textbf{E}-\textbf{D}riven key reason\textbf{I}ng step
+distilla\textbf{T}ion (\textbf{EDIT}), a novel method that further aids SLMs
+learning key reasoning steps rather than mere simple fine-tuning. Firstly, to
+expose these crucial steps in CoTs, we design specific prompts to generate dual
+CoTs data with similar reasoning paths but divergent conclusions. Then, we
+apply the minimum edit distance algorithm on the dual CoTs data to locate these
+key steps and optimize the likelihood of these steps. Extensive experiments
+validate the effectiveness of EDIT across both in-domain and out-of-domain
+benchmark reasoning datasets. Further analysis shows that EDIT can generate
+high-quality CoTs with more correct key reasoning steps. Notably, we also
+explore how different mistake patterns affect performance and find that EDIT
+benefits more from logical errors than from knowledge or mathematical
+calculation errors in dual CoTs\footnote{Code can be found at
+\url{https://github.com/C-W-D/EDIT}}.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Two Optimizers Are Better Than One: LLM Catalyst for Enhancing
+  Gradient-Based Optimization 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.19732v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.19732v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Zixian Guo, Ming Liu, Zhilong Ji, Jinfeng Bai, Yiwen Guo, Wangmeng Zuo
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Learning a skill generally relies on both practical experience by doer and
+insightful high-level guidance by instructor. Will this strategy also work well
+for solving complex non-convex optimization problems? Here, a common
+gradient-based optimizer acts like a disciplined doer, making locally optimal
+update at each step. Recent methods utilize large language models (LLMs) to
+optimize solutions for concrete problems by inferring from natural language
+instructions, akin to a high-level instructor. In this paper, we show that
+these two optimizers are complementary to each other, suggesting a
+collaborative optimization approach. The gradient-based optimizer and LLM-based
+optimizer are combined in an interleaved manner. We instruct LLMs using task
+descriptions and timely optimization trajectories recorded during
+gradient-based optimization. Inferred results from LLMs are used as restarting
+points for the next stage of gradient optimization. By leveraging both the
+locally rigorous gradient-based optimizer and the high-level deductive
+LLM-based optimizer, our combined optimization method consistently yields
+improvements over competitive baseline prompt tuning methods. Our results
+demonstrate the synergistic effect of conventional gradient-based optimization
+and the inference ability of LLMs. The code is released at
+https://github.com/guozix/LLM-catalyst.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Enhancing Large Vision Language Models with Self-Training on Image
+  Comprehension 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.19716v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.19716v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Yihe Deng, Pan Lu, Fan Yin, Ziniu Hu, Sheng Shen, James Zou, Kai-Wei Chang, Wei Wang
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Large vision language models (LVLMs) integrate large language models (LLMs)
+with pre-trained vision encoders, thereby activating the perception capability
+of the model to understand image inputs for different queries and conduct
+subsequent reasoning. Improving this capability requires high-quality
+vision-language data, which is costly and labor-intensive to acquire.
+Self-training approaches have been effective in single-modal settings to
+alleviate the need for labeled data by leveraging model's own generation.
+However, effective self-training remains a challenge regarding the unique
+visual perception and reasoning capability of LVLMs. To address this, we
+introduce Self-Training on Image Comprehension (STIC), which emphasizes a
+self-training approach specifically for image comprehension. First, the model
+self-constructs a preference dataset for image descriptions using unlabeled
+images. Preferred responses are generated through a step-by-step prompt, while
+dis-preferred responses are generated from either corrupted images or
+misleading prompts. To further self-improve reasoning on the extracted visual
+information, we let the model reuse a small portion of existing
+instruction-tuning data and append its self-generated image descriptions to the
+prompts. We validate the effectiveness of STIC across seven different
+benchmarks, demonstrating substantial performance gains of 4.0% on average
+while using 70% less supervised fine-tuning data than the current method.
+Further studies investigate various components of STIC and highlight its
+potential to leverage vast quantities of unlabeled images for self-training.
+Code and data are made publicly available.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>19 pages, 14 figures, 6 tables</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ SpecDec++: Boosting Speculative Decoding via Adaptive Candidate Lengths 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.19715v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.19715v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Kaixuan Huang, Xudong Guo, Mengdi Wang
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Speculative decoding reduces the inference latency of a target large language
+model via utilizing a smaller and faster draft model. Its performance depends
+on a hyperparameter K -- the candidate length, i.e., the number of candidate
+tokens for the target model to verify in each round. However, previous methods
+often use simple heuristics to choose K, which may result in sub-optimal
+performance. We study the choice of the candidate length K and formulate it as
+a Markov Decision Process. We theoretically show that the optimal policy of
+this Markov decision process takes the form of a threshold policy, i.e., the
+current speculation should stop and be verified when the probability of getting
+a rejection exceeds a threshold value. Motivated by this theory, we propose
+SpecDec++, an enhanced version of speculative decoding that adaptively
+determines the candidate length on the fly. We augment the draft model with a
+trained acceptance prediction head to predict the conditional acceptance
+probability of the candidate tokens. SpecDec++ will stop the current
+speculation when the predicted probability that at least one token gets
+rejected exceeds a threshold. We implement SpecDec++ and apply it to the
+llama-2-chat 7B & 70B model pair. Our adaptive method achieves a 2.04x speedup
+on the Alpaca dataset (an additional 7.2% improvement over the baseline
+speculative decoding). On the GSM8K and HumanEval datasets, our method achieves
+a 2.26x speedup (9.4% improvement) and 2.23x speedup (11.1% improvement),
+respectively.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Significance of Chain of Thought in Gender Bias Mitigation for
+  English-Dravidian Machine Translation 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.19701v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.19701v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Lavanya Prahallad, Radhika Mamidi
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Gender bias in machine translation (MT) systems poses a significant challenge
+to achieving accurate and inclusive translations. This paper examines gender
+bias in machine translation systems for languages such as Telugu and Kannada
+from the Dravidian family, analyzing how gender inflections affect translation
+accuracy and neutrality using Google Translate and ChatGPT. It finds that while
+plural forms can reduce bias, individual-centric sentences often maintain the
+bias due to historical stereotypes. The study evaluates the Chain of Thought
+processing, noting significant bias mitigation from 80% to 4% in Telugu and
+from 40% to 0% in Kannada. It also compares Telugu and Kannada translations,
+emphasizing the need for language specific strategies to address these
+challenges and suggesting directions for future research to enhance fairness in
+both data preparation and prompts during inference.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ One Token Can Help! Learning Scalable and Pluggable Virtual Tokens for
+  Retrieval-Augmented Large Language Models <span class="chip">SP</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.19670v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.19670v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Yutao Zhu, Zhaoheng Huang, Zhicheng Dou, Ji-Rong Wen
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Retrieval-augmented generation (RAG) is a promising way to improve large
+language models (LLMs) for generating more factual, accurate, and up-to-date
+content. Existing methods either optimize prompts to guide LLMs in leveraging
+retrieved information or directly fine-tune the LLMs to adapt to RAG scenarios.
+Although fine-tuning can yield better performance, it often compromises the
+LLMs' general generation capabilities by modifying their parameters. This
+limitation poses challenges in practical applications, especially when LLMs are
+already deployed, as parameter adjustments may affect their original
+functionality. To address this, we propose a novel method that involves
+learning scalable and pluggable virtual tokens for RAG. By maintaining the
+LLMs' original parameters and fine-tuning only the embeddings of these
+pluggable tokens, our approach not only enhances LLMs' performance but also
+preserves their general generation capacities. Furthermore, we design several
+training strategies to improve the scalability, flexibility, and
+generalizability of our method. Comprehensive experiments across nine
+question-answering tasks demonstrate the superiority of our approach.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>working in progress, repo: https://github.com/DaoD/SPRING/</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ PATIENT-Ψ: Using Large Language Models to Simulate Patients for
+  Training Mental Health Professionals 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.19660v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.19660v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Ruiyi Wang, Stephanie Milani, Jamie C. Chiu, Shaun M. Eack, Travis Labrum, Samuel M. Murphy, Nev Jones, Kate Hardy, Hong Shen, Fei Fang, Zhiyu Zoey Chen
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Mental illness remains one of the most critical public health issues, with a
+significant gap between the available mental health support and patient needs.
+Many mental health professionals highlight a disconnect between their training
+and real-world patient interactions, leaving some trainees feeling unprepared
+and potentially affecting their early career success. In this paper, we propose
+PATIENT-{\Psi}, a novel patient simulation framework for cognitive behavior
+therapy (CBT) training. To build PATIENT-{\Psi}, we constructed diverse patient
+profiles and their corresponding cognitive models based on CBT principles, and
+then used large language models (LLMs) programmed with the patient cognitive
+models to act as a simulated therapy patient. We propose an interactive
+training scheme, PATIENT-{\Psi}-TRAINER, for mental health trainees to practice
+a key skill in CBT -- formulating the cognitive model of the patient -- through
+role-playing a therapy session with PATIENT-{\Psi}. To evaluate PATIENT-{\Psi},
+we conducted a user study of 4 mental health trainees and 10 experts. The
+results demonstrate that practice using PATIENT-{\Psi}-TRAINER greatly enhances
+the perceived skill acquisition and confidence of the trainees beyond existing
+forms of training such as textbooks, videos, and role-play with non-patients.
+Based on the experts' perceptions, PATIENT-{\Psi} is perceived to be closer to
+real patient interactions than GPT-4, and PATIENT-{\Psi}-TRAINER holds strong
+promise to improve trainee competencies. Our pioneering patient simulation
+training framework, using LLMs, holds great potential to enhance and advance
+mental health training, ultimately leading to improved patient care and
+outcomes. We will release all our data, code, and the training platform.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Work in progress</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Detecting Hallucinations in Large Language Model Generation: A Token
+  Probability Approach 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.19648v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.19648v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Ernesto Quevedo, Jorge Yero, Rachel Koerner, Pablo Rivas, Tomas Cerny
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Concerns regarding the propensity of Large Language Models (LLMs) to produce
+inaccurate outputs, also known as hallucinations, have escalated. Detecting
+them is vital for ensuring the reliability of applications relying on
+LLM-generated content. Current methods often demand substantial resources and
+rely on extensive LLMs or employ supervised learning with multidimensional
+features or intricate linguistic and semantic analyses difficult to reproduce
+and largely depend on using the same LLM that hallucinated. This paper
+introduces a supervised learning approach employing two simple classifiers
+utilizing only four numerical features derived from tokens and vocabulary
+probabilities obtained from other LLM evaluators, which are not necessarily the
+same. The method yields promising results, surpassing state-of-the-art outcomes
+in multiple tasks across three different benchmarks. Additionally, we provide a
+comprehensive examination of the strengths and weaknesses of our approach,
+highlighting the significance of the features utilized and the LLM employed as
+an evaluator. We have released our code publicly at
+https://github.com/Baylor-AI/HalluDetect.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>ICAI'24 - The 26th Int'l Conf on Artificial Intelligence</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ GKT: A Novel Guidance-Based Knowledge Transfer Framework For Efficient
+  Cloud-edge Collaboration LLM Deployment 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.19635v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.19635v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Yao Yao, Zuchao Li, Hai Zhao
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  The burgeoning size of Large Language Models (LLMs) has led to enhanced
+capabilities in generating responses, albeit at the expense of increased
+inference times and elevated resource demands. Existing methods of
+acceleration, predominantly hinged on knowledge distillation, generally
+necessitate fine-tuning of considerably large models, such as Llama-7B, posing
+a challenge for average users. Furthermore, present techniques for expediting
+inference and reducing costs operate independently. To address these issues, we
+introduce a novel and intuitive Guidance-based Knowledge Transfer (GKT)
+framework. This approach leverages a larger LLM as a ''teacher'' to create
+guidance prompts, paired with a smaller ''student'' model to finalize
+responses. Remarkably, GKT requires no fine-tuning and doesn't necessitate the
+teacher and student models to have the same vocabulary, allowing for extensive
+batch generation to accelerate the process while ensuring user customization.
+GKT can be seamlessly integrated into cloud-edge collaboration architectures,
+and is versatile enough for plug-and-play application across various models. It
+excels in both efficiency and affordability, epitomizing a ''cheap and
+cheerful'' solution. GKT achieves a maximum accuracy improvement of 14.18%,
+along with a 10.72 times speed-up on GSM8K and an accuracy improvement of 14.00
+% along with a 7.73 times speed-up in CSQA. When utilizing ChatGPT as teacher
+model and Llama2-70B as the student model, we can achieve 95.00% of ChatGPT's
+performance at 52% of the cost. The results highlight substantial enhancements
+in accuracy and processing speed on the GSM8K and CSQA datasets, surpassing the
+performance of using either the student or teacher models in isolation.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Easy Problems That LLMs Get Wrong 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.19616v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.19616v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Sean Williams, James Huckle
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  We introduce a comprehensive Linguistic Benchmark designed to evaluate the
+limitations of Large Language Models (LLMs) in domains such as logical
+reasoning, spatial intelligence, and linguistic understanding, among others.
+Through a series of straightforward questions, it uncovers the significant
+limitations of well-regarded models to perform tasks that humans manage with
+ease. It also highlights the potential of prompt engineering to mitigate some
+errors and underscores the necessity for better training methodologies. Our
+findings stress the importance of grounding LLMs with human reasoning and
+common sense, emphasising the need for human-in-the-loop for enterprise
+applications. We hope this work paves the way for future research to enhance
+the usefulness and reliability of new models.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>AutogenAI Ltd. Associated code at
+  https://github.com/autogenai/easy-problems-that-llms-get-wrong</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ SVFT: Parameter-Efficient Fine-Tuning with Singular Vectors 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.19597v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.19597v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Vijay Lingam, Atula Tejaswi, Aditya Vavre, Aneesh Shetty, Gautham Krishna Gudur, Joydeep Ghosh, Alex Dimakis, Eunsol Choi, Aleksandar Bojchevski, Sujay Sanghavi
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Popular parameter-efficient fine-tuning (PEFT) methods, such as LoRA and its
+variants, freeze pre-trained model weights \(W\) and inject learnable matrices
+\(\Delta W\). These \(\Delta W\) matrices are structured for efficient
+parameterization, often using techniques like low-rank approximations or
+scaling vectors. However, these methods typically show a performance gap
+compared to full fine-tuning. Although recent PEFT methods have narrowed this
+gap, they do so at the cost of additional learnable parameters. We propose
+SVFT, a simple approach that fundamentally differs from existing methods: the
+structure imposed on \(\Delta W\) depends on the specific weight matrix \(W\).
+Specifically, SVFT updates \(W\) as a sparse combination of outer products of
+its singular vectors, training only the coefficients (scales) of these sparse
+combinations. This approach allows fine-grained control over expressivity
+through the number of coefficients. Extensive experiments on language and
+vision benchmarks show that SVFT recovers up to 96% of full fine-tuning
+performance while training only 0.006 to 0.25% of parameters, outperforming
+existing methods that only recover up to 85% performance using 0.03 to 0.8% of
+the trainable parameter budget.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>17 pages, 5 figures, 14 tables</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Why Larger Language Models Do In-context Learning Differently? 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.19592v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.19592v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Zhenmei Shi, Junyi Wei, Zhuoyan Xu, Yingyu Liang
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Large language models (LLM) have emerged as a powerful tool for AI, with the
+key ability of in-context learning (ICL), where they can perform well on unseen
+tasks based on a brief series of task examples without necessitating any
+adjustments to the model parameters. One recent interesting mysterious
+observation is that models of different scales may have different ICL
+behaviors: larger models tend to be more sensitive to noise in the test
+context. This work studies this observation theoretically aiming to improve the
+understanding of LLM and ICL. We analyze two stylized settings: (1) linear
+regression with one-layer single-head linear transformers and (2) parity
+classification with two-layer multiple attention heads transformers (non-linear
+data and non-linear model). In both settings, we give closed-form optimal
+solutions and find that smaller models emphasize important hidden features
+while larger ones cover more hidden features; thus, smaller models are more
+robust to noise while larger ones are more easily distracted, leading to
+different ICL behaviors. This sheds light on where transformers pay attention
+to and how that affects ICL. Preliminary experimental results on large base and
+chat models provide positive support for our analysis.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ CoSy: Evaluating Textual Explanations of Neurons 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20331v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20331v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Laura Kopf, Philine Lou Bommer, Anna Hedström, Sebastian Lapuschkin, Marina M. -C. Höhne, Kirill Bykov
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  A crucial aspect of understanding the complex nature of Deep Neural Networks
+(DNNs) is the ability to explain learned concepts within their latent
+representations. While various methods exist to connect neurons to textual
+descriptions of human-understandable concepts, evaluating the quality of these
+explanation methods presents a major challenge in the field due to a lack of
+unified, general-purpose quantitative evaluation. In this work, we introduce
+CoSy (Concept Synthesis) -- a novel, architecture-agnostic framework to
+evaluate the quality of textual explanations for latent neurons. Given textual
+explanations, our proposed framework leverages a generative model conditioned
+on textual input to create data points representing the textual explanation.
+Then, the neuron's response to these explanation data points is compared with
+the response to control data points, providing a quality estimate of the given
+explanation. We ensure the reliability of our proposed framework in a series of
+meta-evaluation experiments and demonstrate practical value through insights
+from benchmarking various concept-based textual explanation methods for
+Computer Vision tasks, showing that tested explanation methods significantly
+differ in quality.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>10 pages, 5 figures</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ ESG-FTSE: A corpus of news articles with ESG relevance labels and use
+  cases 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20218v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20218v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Mariya Pavlova, Bernard Casey, Miaosen Wang
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  We present ESG-FTSE, the first corpus comprised of news articles with
+Environmental, Social and Governance (ESG) relevance annotations. In recent
+years, investors and regulators have pushed ESG investing to the mainstream due
+to the urgency of climate change. This has led to the rise of ESG scores to
+evaluate an investment's credentials as socially responsible. While demand for
+ESG scores is high, their quality varies wildly. Quantitative techniques can be
+applied to improve ESG scores, thus, responsible investing. To contribute to
+resource building for ESG and financial text mining, we pioneer the ESG-FTSE
+corpus. We further present the first of its kind ESG annotation schema. It has
+three levels: a binary classification (relevant versus irrelevant news
+articles), ESG classification (ESG-related news articles), and target company.
+Both supervised and unsupervised learning experiments for ESG relevance
+detection were conducted to demonstrate that the corpus can be used in
+different settings to derive accurate ESG predictions. Keywords: corpus
+annotation, ESG labels, annotation schema, news article, natural language
+processing
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>The corpus is available at
+  https://github.com/mariavpavlova/ESG-FTSE-Corpus.
+  https://aclanthology.org/2024.finnlp-1.14/</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Safe Multi-agent Reinforcement Learning with Natural Language
+  Constraints 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20018v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20018v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Ziyan Wang, Meng Fang, Tristan Tomilin, Fei Fang, Yali Du
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  The role of natural language constraints in Safe Multi-agent Reinforcement
+Learning (MARL) is crucial, yet often overlooked. While Safe MARL has vast
+potential, especially in fields like robotics and autonomous vehicles, its full
+potential is limited by the need to define constraints in pre-designed
+mathematical terms, which requires extensive domain expertise and reinforcement
+learning knowledge, hindering its broader adoption. To address this limitation
+and make Safe MARL more accessible and adaptable, we propose a novel approach
+named Safe Multi-agent Reinforcement Learning with Natural Language constraints
+(SMALL). Our method leverages fine-tuned language models to interpret and
+process free-form textual constraints, converting them into semantic embeddings
+that capture the essence of prohibited states and behaviours. These embeddings
+are then integrated into the multi-agent policy learning process, enabling
+agents to learn policies that minimize constraint violations while optimizing
+rewards. To evaluate the effectiveness of SMALL, we introduce the LaMaSafe, a
+multi-task benchmark designed to assess the performance of multiple agents in
+adhering to natural language constraints. Empirical evaluations across various
+environments demonstrate that SMALL achieves comparable rewards and
+significantly fewer constraint violations, highlighting its effectiveness in
+understanding and enforcing natural language constraints.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>23 pages, 6 figures</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Efficient LLM-Jailbreaking by Introducing Visual Modality 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20015v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20015v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Zhenxing Niu, Yuyao Sun, Haodong Ren, Haoxuan Ji, Quan Wang, Xiaoke Ma, Gang Hua, Rong Jin
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  This paper focuses on jailbreaking attacks against large language models
+(LLMs), eliciting them to generate objectionable content in response to harmful
+user queries. Unlike previous LLM-jailbreaks that directly orient to LLMs, our
+approach begins by constructing a multimodal large language model (MLLM)
+through the incorporation of a visual module into the target LLM. Subsequently,
+we conduct an efficient MLLM-jailbreak to generate jailbreaking embeddings
+embJS. Finally, we convert the embJS into text space to facilitate the
+jailbreaking of the target LLM. Compared to direct LLM-jailbreaking, our
+approach is more efficient, as MLLMs are more vulnerable to jailbreaking than
+pure LLM. Additionally, to improve the attack success rate (ASR) of
+jailbreaking, we propose an image-text semantic matching scheme to identify a
+suitable initial input. Extensive experiments demonstrate that our approach
+surpasses current state-of-the-art methods in terms of both efficiency and
+effectiveness. Moreover, our approach exhibits superior cross-class
+jailbreaking capabilities.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Video-Language Critic: Transferable Reward Functions for
+  Language-Conditioned Robotics 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.19988v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.19988v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Minttu Alakuijala, Reginald McLean, Isaac Woungang, Nariman Farsad, Samuel Kaski, Pekka Marttinen, Kai Yuan
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Natural language is often the easiest and most convenient modality for humans
+to specify tasks for robots. However, learning to ground language to behavior
+typically requires impractical amounts of diverse, language-annotated
+demonstrations collected on each target robot. In this work, we aim to separate
+the problem of what to accomplish from how to accomplish it, as the former can
+benefit from substantial amounts of external observation-only data, and only
+the latter depends on a specific robot embodiment. To this end, we propose
+Video-Language Critic, a reward model that can be trained on readily available
+cross-embodiment data using contrastive learning and a temporal ranking
+objective, and use it to score behavior traces from a separate reinforcement
+learning actor. When trained on Open X-Embodiment data, our reward model
+enables 2x more sample-efficient policy training on Meta-World tasks than a
+sparse reward only, despite a significant domain gap. Using in-domain data but
+in a challenging task generalization setting on Meta-World, we further
+demonstrate more sample-efficient training than is possible with prior
+language-conditioned reward models that are either trained with binary
+classification, use static images, or do not leverage the temporal information
+present in video data.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>10 pages in the main text, 16 pages including references and
+  supplementary materials. 4 figures and 3 tables in the main text, 1 table in
+  supplementary materials</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Similarity is Not All You Need: Endowing Retrieval Augmented Generation
+  with Multi Layered Thoughts 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.19893v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.19893v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Chunjing Gan, Dan Yang, Binbin Hu, Hanxiao Zhang, Siyuan Li, Ziqi Liu, Yue Shen, Lin Ju, Zhiqiang Zhang, Jinjie Gu, Lei Liang, Jun Zhou
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  In recent years, large language models (LLMs) have made remarkable
+achievements in various domains. However, the untimeliness and cost of
+knowledge updates coupled with hallucination issues of LLMs have curtailed
+their applications in knowledge intensive tasks, where retrieval augmented
+generation (RAG) can be of help. Nevertheless, existing retrieval augmented
+models typically use similarity as a bridge between queries and documents and
+follow a retrieve then read procedure. In this work, we argue that similarity
+is not always the panacea and totally relying on similarity would sometimes
+degrade the performance of retrieval augmented generation. To this end, we
+propose MetRag, a Multi layEred Thoughts enhanced Retrieval Augmented
+Generation framework. To begin with, beyond existing similarity oriented
+thought, we embrace a small scale utility model that draws supervision from an
+LLM for utility oriented thought and further come up with a smarter model by
+comprehensively combining the similarity and utility oriented thoughts.
+Furthermore, given the fact that the retrieved document set tends to be huge
+and using them in isolation makes it difficult to capture the commonalities and
+characteristics among them, we propose to make an LLM as a task adaptive
+summarizer to endow retrieval augmented generation with compactness-oriented
+thought. Finally, with multi layered thoughts from the precedent stages, an LLM
+is called for knowledge augmented generation. Extensive experiments on
+knowledge-intensive tasks have demonstrated the superiority of MetRag.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>12 pages</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ From Words to Actions: Unveiling the Theoretical Underpinnings of
+  LLM-Driven Autonomous Systems <span class="chip">ICML 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.19883v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.19883v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Jianliang He, Siyu Chen, Fengzhuo Zhang, Zhuoran Yang
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  In this work, from a theoretical lens, we aim to understand why large
+language model (LLM) empowered agents are able to solve decision-making
+problems in the physical world. To this end, consider a hierarchical
+reinforcement learning (RL) model where the LLM Planner and the Actor perform
+high-level task planning and low-level execution, respectively. Under this
+model, the LLM Planner navigates a partially observable Markov decision process
+(POMDP) by iteratively generating language-based subgoals via prompting. Under
+proper assumptions on the pretraining data, we prove that the pretrained LLM
+Planner effectively performs Bayesian aggregated imitation learning (BAIL)
+through in-context learning. Additionally, we highlight the necessity for
+exploration beyond the subgoals derived from BAIL by proving that naively
+executing the subgoals returned by LLM leads to a linear regret. As a remedy,
+we introduce an $\epsilon$-greedy exploration strategy to BAIL, which is proven
+to incur sublinear regret when the pretraining error is small. Finally, we
+extend our theoretical framework to include scenarios where the LLM Planner
+serves as a world model for inferring the transition model of the environment
+and to multi-agent settings, enabling coordination among multiple Actors.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted by ICML 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ SysCaps: Language Interfaces for Simulation Surrogates of Complex
+  Systems 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.19653v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.19653v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Patrick Emami, Zhaonan Li, Saumya Sinha, Truc Nguyen
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Data-driven simulation surrogates help computational scientists study complex
+systems. They can also help inform impactful policy decisions. We introduce a
+learning framework for surrogate modeling where language is used to interface
+with the underlying system being simulated. We call a language description of a
+system a "system caption", or SysCap. To address the lack of datasets of paired
+natural language SysCaps and simulation runs, we use large language models
+(LLMs) to synthesize high-quality captions. Using our framework, we train
+multimodal text and timeseries regression models for two real-world simulators
+of complex energy systems. Our experiments demonstrate the feasibility of
+designing language interfaces for real-world surrogate models at comparable
+accuracy to standard baselines. We qualitatively and quantitatively show that
+SysCaps unlock text-prompt-style surrogate modeling and new generalization
+abilities beyond what was previously possible. We will release the generated
+SysCaps datasets and our code to support follow-on studies.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>17 pages. Under review</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Perplexed by Perplexity: Perplexity-Based Data Pruning With Small
+  Reference Models 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20541v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20541v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Zachary Ankner, Cody Blakeney, Kartik Sreenivasan, Max Marion, Matthew L. Leavitt, Mansheej Paul
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  In this work, we investigate whether small language models can determine
+high-quality subsets of large-scale text datasets that improve the performance
+of larger language models. While existing work has shown that pruning based on
+the perplexity of a larger model can yield high-quality data, we investigate
+whether smaller models can be used for perplexity-based pruning and how pruning
+is affected by the domain composition of the data being pruned. We demonstrate
+that for multiple dataset compositions, perplexity-based pruning of pretraining
+data can \emph{significantly} improve downstream task performance: pruning
+based on perplexities computed with a 125 million parameter model improves the
+average performance on downstream tasks of a 3 billion parameter model by up to
+2.04 and achieves up to a $1.45\times$ reduction in pretraining steps to reach
+commensurate baseline performance. Furthermore, we demonstrate that such
+perplexity-based data pruning also yields downstream performance gains in the
+over-trained and data-constrained regimes.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Unveiling the Impact of Coding Data Instruction Fine-Tuning on Large
+  Language Models Reasoning 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20535v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20535v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Xinlu Zhang, Zhiyu Zoey Chen, Xi Ye, Xianjun Yang, Lichang Chen, William Yang Wang, Linda Ruth Petzold
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Instruction Fine-Tuning (IFT) significantly enhances the zero-shot
+capabilities of pretrained Large Language Models (LLMs). While coding data is
+known to boost reasoning abilities during LLM pretraining, its role in
+activating internal reasoning capacities during IFT remains understudied. This
+paper investigates a key question: How does coding data impact LLMs' reasoning
+capacities during the IFT stage? To explore this, we thoroughly examine the
+impact of coding data across different coding data proportions, model families,
+sizes, and reasoning domains, from various perspectives. Specifically, we
+create three IFT datasets with increasing coding data proportions, fine-tune
+six LLM backbones across different families and scales on these datasets,
+evaluate the tuned models' performance across twelve tasks in three reasoning
+domains, and analyze the outcomes from three broad-to-granular perspectives:
+overall, domain-level, and task-specific. Our holistic analysis provides
+valuable insights in each perspective. First, coding data tuning enhances the
+overall reasoning capabilities of LLMs across different model families and
+scales. Moreover, the effect of coding data varies among different domains but
+shows consistent trends across model families and scales within each domain.
+Additionally, coding data generally yields comparable task-specific benefits
+across different model families, with the optimal coding data proportions in
+IFT datasets being task-specific.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ An Automatic Question Usability Evaluation Toolkit 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20529v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20529v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Steven Moore, Eamon Costello, Huy A. Nguyen, John Stamper
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Evaluating multiple-choice questions (MCQs) involves either labor intensive
+human assessments or automated methods that prioritize readability, often
+overlooking deeper question design flaws. To address this issue, we introduce
+the Scalable Automatic Question Usability Evaluation Toolkit (SAQUET), an
+open-source tool that leverages the Item-Writing Flaws (IWF) rubric for a
+comprehensive and automated quality evaluation of MCQs. By harnessing the
+latest in large language models such as GPT-4, advanced word embeddings, and
+Transformers designed to analyze textual complexity, SAQUET effectively
+pinpoints and assesses a wide array of flaws in MCQs. We first demonstrate the
+discrepancy between commonly used automated evaluation metrics and the human
+assessment of MCQ quality. Then we evaluate SAQUET on a diverse dataset of MCQs
+across the five domains of Chemistry, Statistics, Computer Science, Humanities,
+and Healthcare, showing how it effectively distinguishes between flawed and
+flawless questions, providing a level of analysis beyond what is achievable
+with traditional metrics. With an accuracy rate of over 94% in detecting the
+presence of flaws identified by human evaluators, our findings emphasize the
+limitations of existing evaluation methods and showcase potential in improving
+the quality of educational assessments.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Artificial Intelligence in Education 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Towards Ontology-Enhanced Representation Learning for Large Language
+  Models 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20527v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20527v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Francesco Ronzano, Jay Nanavati
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Taking advantage of the widespread use of ontologies to organise and
+harmonize knowledge across several distinct domains, this paper proposes a
+novel approach to improve an embedding-Large Language Model (embedding-LLM) of
+interest by infusing the knowledge formalized by a reference ontology:
+ontological knowledge infusion aims at boosting the ability of the considered
+LLM to effectively model the knowledge domain described by the infused
+ontology. The linguistic information (i.e. concept synonyms and descriptions)
+and structural information (i.e. is-a relations) formalized by the ontology are
+utilized to compile a comprehensive set of concept definitions, with the
+assistance of a powerful generative LLM (i.e. GPT-3.5-turbo). These concept
+definitions are then employed to fine-tune the target embedding-LLM using a
+contrastive learning framework. To demonstrate and evaluate the proposed
+approach, we utilize the biomedical disease ontology MONDO. The results show
+that embedding-LLMs enhanced by ontological disease knowledge exhibit an
+improved capability to effectively evaluate the similarity of in-domain
+sentences from biomedical documents mentioning diseases, without compromising
+their out-of-domain performance.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>14 pages, 1 figure</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Automated Generation and Tagging of Knowledge Components from
+  Multiple-Choice Questions 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20526v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20526v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Steven Moore, Robin Schmucker, Tom Mitchell, John Stamper
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Knowledge Components (KCs) linked to assessments enhance the measurement of
+student learning, enrich analytics, and facilitate adaptivity. However,
+generating and linking KCs to assessment items requires significant effort and
+domain-specific knowledge. To streamline this process for higher-education
+courses, we employed GPT-4 to generate KCs for multiple-choice questions (MCQs)
+in Chemistry and E-Learning. We analyzed discrepancies between the KCs
+generated by the Large Language Model (LLM) and those made by humans through
+evaluation from three domain experts in each subject area. This evaluation
+aimed to determine whether, in instances of non-matching KCs, evaluators showed
+a preference for the LLM-generated KCs over their human-created counterparts.
+We also developed an ontology induction algorithm to cluster questions that
+assess similar KCs based on their content. Our most effective LLM strategy
+accurately matched KCs for 56% of Chemistry and 35% of E-Learning MCQs, with
+even higher success when considering the top five KC suggestions. Human
+evaluators favored LLM-generated KCs, choosing them over human-assigned ones
+approximately two-thirds of the time, a preference that was statistically
+significant across both domains. Our clustering algorithm successfully grouped
+questions by their underlying KCs without needing explicit labels or contextual
+information. This research advances the automation of KC generation and
+classification for assessment items, alleviating the need for student data or
+predefined KC labels.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Learning @ Scale 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ How Multilingual Are Large Language Models Fine-Tuned for Translation? 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20512v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20512v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Aquia Richburg, Marine Carpuat
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  A new paradigm for machine translation has recently emerged: fine-tuning
+large language models (LLM) on parallel text has been shown to outperform
+dedicated translation systems trained in a supervised fashion on much larger
+amounts of parallel data (Xu et al., 2024a; Alves et al., 2024). However, it
+remains unclear whether this paradigm can enable massively multilingual machine
+translation or whether it requires fine-tuning dedicated models for a small
+number of language pairs. How does translation fine-tuning impact the MT
+capabilities of LLMs for zero-shot languages, zero-shot language pairs, and
+translation tasks that do not involve English? To address these questions, we
+conduct an extensive empirical evaluation of the translation quality of the
+TOWER family of language models (Alves et al., 2024) on 132 translation tasks
+from the multi-parallel FLORES-200 data. We find that translation fine-tuning
+improves translation quality even for zero-shot languages on average, but that
+the impact is uneven depending on the language pairs involved. These results
+call for further research to effectively enable massively multilingual
+translation with LLMs.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ SPOT: Text Source Prediction from Originality Score Thresholding 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20505v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20505v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Edouard Yvinec, Gabriel Kasser
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  The wide acceptance of large language models (LLMs) has unlocked new
+applications and social risks. Popular countermeasures aim at detecting
+misinformation, usually involve domain specific models trained to recognize the
+relevance of any information. Instead of evaluating the validity of the
+information, we propose to investigate LLM generated text from the perspective
+of trust. In this study, we define trust as the ability to know if an input
+text was generated by a LLM or a human. To do so, we design SPOT, an efficient
+method, that classifies the source of any, standalone, text input based on
+originality score. This score is derived from the prediction of a given LLM to
+detect other LLMs. We empirically demonstrate the robustness of the method to
+the architecture, training data, evaluation data, task and compression of
+modern LLMs.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Transfer Q Star: Principled Decoding for LLM Alignment 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20495v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20495v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Souradip Chakraborty, Soumya Suvra Ghosal, Ming Yin, Dinesh Manocha, Mengdi Wang, Amrit Singh Bedi, Furong Huang
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Aligning foundation models is essential for their safe and trustworthy
+deployment. However, traditional fine-tuning methods are computationally
+intensive and require updating billions of model parameters. A promising
+alternative, alignment via decoding, adjusts the response distribution directly
+without model updates to maximize a target reward $r$, thus providing a
+lightweight and adaptable framework for alignment. However, principled decoding
+methods rely on oracle access to an optimal Q-function ($Q^*$), which is often
+unavailable in practice. Hence, prior SoTA methods either approximate this
+$Q^*$ using $Q^{\pi_{\texttt{sft}}}$ (derived from the reference $\texttt{SFT}$
+model) or rely on short-term rewards, resulting in sub-optimal decoding
+performance. In this work, we propose Transfer $Q^*$, which implicitly
+estimates the optimal value function for a target reward $r$ through a baseline
+model $\rho_{\texttt{BL}}$ aligned with a baseline reward $\rho_{\texttt{BL}}$
+(which can be different from the target reward $r$). Theoretical analyses of
+Transfer $Q^*$ provide a rigorous characterization of its optimality, deriving
+an upper bound on the sub-optimality gap and identifying a hyperparameter to
+control the deviation from the pre-trained reference $\texttt{SFT}$ model based
+on user needs. Our approach significantly reduces the sub-optimality gap
+observed in prior SoTA methods and demonstrates superior empirical performance
+across key metrics such as coherence, diversity, and quality in extensive tests
+on several synthetic and real datasets.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Phantom: General Trigger Attacks on Retrieval Augmented Language
+  Generation 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20485v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20485v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Harsh Chaudhari, Giorgio Severi, John Abascal, Matthew Jagielski, Christopher A. Choquette-Choo, Milad Nasr, Cristina Nita-Rotaru, Alina Oprea
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Retrieval Augmented Generation (RAG) expands the capabilities of modern large
+language models (LLMs) in chatbot applications, enabling developers to adapt
+and personalize the LLM output without expensive training or fine-tuning. RAG
+systems use an external knowledge database to retrieve the most relevant
+documents for a given query, providing this context to the LLM generator. While
+RAG achieves impressive utility in many applications, its adoption to enable
+personalized generative models introduces new security risks. In this work, we
+propose new attack surfaces for an adversary to compromise a victim's RAG
+system, by injecting a single malicious document in its knowledge database. We
+design Phantom, general two-step attack framework against RAG augmented LLMs.
+The first step involves crafting a poisoned document designed to be retrieved
+by the RAG system within the top-k results only when an adversarial trigger, a
+specific sequence of words acting as backdoor, is present in the victim's
+queries. In the second step, a specially crafted adversarial string within the
+poisoned document triggers various adversarial attacks in the LLM generator,
+including denial of service, reputation damage, privacy violations, and harmful
+behaviors. We demonstrate our attacks on multiple LLM architectures, including
+Gemma, Vicuna, and Llama.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Automated Focused Feedback Generation for Scientific Writing Assistance <span class="chip">ACL 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20477v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20477v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Eric Chamoun, Michael Schlichktrull, Andreas Vlachos
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Scientific writing is a challenging task, particularly for novice researchers
+who often rely on feedback from experienced peers. Recent work has primarily
+focused on improving surface form and style rather than manuscript content. In
+this paper, we propose a novel task: automated focused feedback generation for
+scientific writing assistance. We present SWIF$^{2}$T: a Scientific WrIting
+Focused Feedback Tool. It is designed to generate specific, actionable and
+coherent comments, which identify weaknesses in a scientific paper and/or
+propose revisions to it. Our approach consists of four components - planner,
+investigator, reviewer and controller - leveraging multiple Large Language
+Models (LLMs) to implement them. We compile a dataset of 300 peer reviews
+citing weaknesses in scientific papers and conduct human evaluation. The
+results demonstrate the superiority in specificity, reading comprehension, and
+overall helpfulness of SWIF$^{2}$T's feedback compared to other approaches. In
+our analysis, we also identified cases where automatically generated reviews
+were judged better than human ones, suggesting opportunities for integration of
+AI-generated feedback in scientific writing.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted to ACL 2024 (Findings)</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Extending the Massive Text Embedding Benchmark to French 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20468v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20468v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Mathieu Ciancone, Imene Kerboua, Marion Schaeffer, Wissam Siblini
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  In recent years, numerous embedding models have been made available and
+widely used for various NLP tasks. Choosing a model that performs well for
+several tasks in English has been largely simplified by the Massive Text
+Embedding Benchmark (MTEB), but extensions to other languages remain
+challenging. This is why we expand MTEB to propose the first massive benchmark
+of sentence embeddings for French. Not only we gather 22 existing datasets in
+an easy-to-use interface, but we also create three new French datasets for a
+global evaluation over 8 different tasks. We perform a large scale comparison
+with 46 carefully selected embedding models, conduct comprehensive statistical
+tests, and analyze the correlation between model performance and many of their
+characteristics. We find out that even if no model is the best on all tasks,
+large multilingual models pre-trained on sentence similarity perform
+particularly well. Our work comes with open-source code, new datasets and a
+public leaderboard.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Scalable Detection of Salient Entities in News Articles 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20461v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20461v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Eliyar Asgarieh, Kapil Thadani, Neil O'Hare
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  News articles typically mention numerous entities, a large fraction of which
+are tangential to the story. Detecting the salience of entities in articles is
+thus important to applications such as news search, analysis and summarization.
+In this work, we explore new approaches for efficient and effective salient
+entity detection by fine-tuning pretrained transformer models with
+classification heads that use entity tags or contextualized entity
+representations directly. Experiments show that these straightforward
+techniques dramatically outperform prior work across datasets with varying
+sizes and salience definitions. We also study knowledge distillation techniques
+to effectively reduce the computational cost of these models without affecting
+their accuracy. Finally, we conduct extensive analyses and ablation experiments
+to characterize the behavior of the proposed models.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Enhancing Antibiotic Stewardship using a Natural Language Approach for
+  Better Feature Representation 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20419v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20419v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Simon A. Lee, Trevor Brokowski, Jeffrey N. Chiang
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  The rapid emergence of antibiotic-resistant bacteria is recognized as a
+global healthcare crisis, undermining the efficacy of life-saving antibiotics.
+This crisis is driven by the improper and overuse of antibiotics, which
+escalates bacterial resistance. In response, this study explores the use of
+clinical decision support systems, enhanced through the integration of
+electronic health records (EHRs), to improve antibiotic stewardship. However,
+EHR systems present numerous data-level challenges, complicating the effective
+synthesis and utilization of data. In this work, we transform EHR data into a
+serialized textual representation and employ pretrained foundation models to
+demonstrate how this enhanced feature representation can aid in antibiotic
+susceptibility predictions. Our results suggest that this text representation,
+combined with foundation models, provides a valuable tool to increase
+interpretability and support antibiotic stewardship efforts.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Jailbreaking Large Language Models Against Moderation Guardrails via
+  Cipher Characters 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20413v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20413v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Haibo Jin, Andy Zhou, Joe D. Menke, Haohan Wang
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Large Language Models (LLMs) are typically harmless but remain vulnerable to
+carefully crafted prompts known as ``jailbreaks'', which can bypass protective
+measures and induce harmful behavior. Recent advancements in LLMs have
+incorporated moderation guardrails that can filter outputs, which trigger
+processing errors for certain malicious questions. Existing red-teaming
+benchmarks often neglect to include questions that trigger moderation
+guardrails, making it difficult to evaluate jailbreak effectiveness. To address
+this issue, we introduce JAMBench, a harmful behavior benchmark designed to
+trigger and evaluate moderation guardrails. JAMBench involves 160 manually
+crafted instructions covering four major risk categories at multiple severity
+levels. Furthermore, we propose a jailbreak method, JAM (Jailbreak Against
+Moderation), designed to attack moderation guardrails using jailbreak prefixes
+to bypass input-level filters and a fine-tuned shadow model functionally
+equivalent to the guardrail model to generate cipher characters to bypass
+output-level filters. Our extensive experiments on four LLMs demonstrate that
+JAM achieves higher jailbreak success ($\sim$ $\times$ 19.88) and lower
+filtered-out rates ($\sim$ $\times$ 1/6) than baselines.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>20 pages</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ SeamlessExpressiveLM: Speech Language Model for Expressive
+  Speech-to-Speech Translation with Chain-of-Thought 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20410v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20410v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Hongyu Gong, Bandhav Veluri
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Expressive speech-to-speech translation (S2ST) is a key research topic in
+seamless communication, which focuses on the preservation of semantics and
+speaker vocal style in translated speech. Early works synthesized speaker style
+aligned speech in order to directly learn the mapping from speech to target
+speech spectrogram. Without reliance on style aligned data, recent studies
+leverage the advances of language modeling (LM) and build cascaded LMs on
+semantic and acoustic tokens. This work proposes SeamlessExpressiveLM, a single
+speech language model for expressive S2ST. We decompose the complex
+source-to-target speech mapping into intermediate generation steps with
+chain-of-thought prompting. The model is first guided to translate target
+semantic content and then transfer the speaker style to multi-stream acoustic
+units. Evaluated on Spanish-to-English and Hungarian-to-English translations,
+SeamlessExpressiveLM outperforms cascaded LMs in both semantic quality and
+style transfer, meanwhile achieving better parameter efficiency.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ X<span class="highlight-title">Prompt</span>:Explaining Large Language Model's Generation via Joint <span class="highlight-title">Prompt</span>
+  Attribution 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20404v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20404v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Yurui Chang, Bochuan Cao, Yujia Wang, Jinghui Chen, Lu Lin
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Large Language Models (LLMs) have demonstrated impressive performances in
+complex text generation tasks. However, the contribution of the input prompt to
+the generated content still remains obscure to humans, underscoring the
+necessity of elucidating and explaining the causality between input and output
+pairs. Existing works for providing prompt-specific explanation often confine
+model output to be classification or next-word prediction. Few initial attempts
+aiming to explain the entire language generation often treat input prompt texts
+independently, ignoring their combinatorial effects on the follow-up
+generation. In this study, we introduce a counterfactual explanation framework
+based on joint prompt attribution, XPrompt, which aims to explain how a few
+prompt texts collaboratively influences the LLM's complete generation.
+Particularly, we formulate the task of prompt attribution for generation
+interpretation as a combinatorial optimization problem, and introduce a
+probabilistic algorithm to search for the casual input combination in the
+discrete space. We define and utilize multiple metrics to evaluate the produced
+explanations, demonstrating both faithfulness and efficiency of our framework.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Hallucination-Free? Assessing the Reliability of Leading AI Legal
+  Research Tools 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20362v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20362v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Varun Magesh, Faiz Surani, Matthew Dahl, Mirac Suzgun, Christopher D. Manning, Daniel E. Ho
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Legal practice has witnessed a sharp rise in products incorporating
+artificial intelligence (AI). Such tools are designed to assist with a wide
+range of core legal tasks, from search and summarization of caselaw to document
+drafting. But the large language models used in these tools are prone to
+"hallucinate," or make up false information, making their use risky in
+high-stakes domains. Recently, certain legal research providers have touted
+methods such as retrieval-augmented generation (RAG) as "eliminating"
+(Casetext, 2023) or "avoid[ing]" hallucinations (Thomson Reuters, 2023), or
+guaranteeing "hallucination-free" legal citations (LexisNexis, 2023). Because
+of the closed nature of these systems, systematically assessing these claims is
+challenging. In this article, we design and report on the first preregistered
+empirical evaluation of AI-driven legal research tools. We demonstrate that the
+providers' claims are overstated. While hallucinations are reduced relative to
+general-purpose chatbots (GPT-4), we find that the AI research tools made by
+LexisNexis (Lexis+ AI) and Thomson Reuters (Westlaw AI-Assisted Research and
+Ask Practical Law AI) each hallucinate between 17% and 33% of the time. We also
+document substantial differences between systems in responsiveness and
+accuracy. Our article makes four key contributions. It is the first to assess
+and report the performance of RAG-based proprietary legal AI tools. Second, it
+introduces a comprehensive, preregistered dataset for identifying and
+understanding vulnerabilities in these systems. Third, it proposes a clear
+typology for differentiating between hallucinations and accurate legal
+responses. Last, it provides evidence to inform the responsibilities of legal
+professionals in supervising and verifying AI outputs, which remains a central
+open question for the responsible integration of AI into law.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Our dataset, tool outputs, and labels will be made available upon
+  publication. This version of the manuscript (May 30, 2024) is updated to
+  reflect an evaluation of Westlaw's AI-Assisted Research</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Recurrent Drafter for Fast Speculative Decoding in Large Language Models 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2403.09919v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2403.09919v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Aonan Zhang, Chong Wang, Yi Wang, Xuanyu Zhang, Yunfei Cheng
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  In this paper, we introduce an improved approach of speculative decoding
+aimed at enhancing the efficiency of serving large language models. Our method
+capitalizes on the strengths of two established techniques: the classic
+two-model speculative decoding approach, and the more recent single-model
+approach, Medusa. Drawing inspiration from Medusa, our approach adopts a
+single-model strategy for speculative decoding. However, our method
+distinguishes itself by employing a single, lightweight draft head with a
+recurrent dependency design, akin in essence to the small, draft model uses in
+classic speculative decoding, but without the complexities of the full
+transformer architecture. And because of the recurrent dependency, we can use
+beam search to swiftly filter out undesired candidates with the draft head. The
+outcome is a method that combines the simplicity of single-model design and
+avoids the need to create a data-dependent tree attention structure only for
+inference in Medusa. We empirically demonstrate the effectiveness of the
+proposed method on several popular open source language models, along with a
+comprehensive analysis of the trade-offs involved in adopting this approach.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>11 pages, 6 figures</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Contextual Position Encoding: Learning to Count What's Important 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.18719v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.18719v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Olga Golovneva, Tianlu Wang, Jason Weston, Sainbayar Sukhbaatar
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  The attention mechanism is a critical component of Large Language Models
+(LLMs) that allows tokens in a sequence to interact with each other, but is
+order-invariant. Incorporating position encoding (PE) makes it possible to
+address by position, such as attending to the i-th token. However, current PE
+methods use token counts to derive position, and thus cannot generalize to
+higher levels of abstraction, such as attending to the i-th sentence. In this
+paper, we propose a new position encoding method, Contextual Position Encoding
+(CoPE), that allows positions to be conditioned on context by incrementing
+position only on certain tokens determined by the model. This allows more
+general position addressing such as attending to the $i$-th particular word,
+noun, or sentence. We show that CoPE can solve the selective copy, counting and
+Flip-Flop tasks where popular position embeddings fail, and improves perplexity
+on language modeling and coding tasks.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ You Need to Pay Better Attention: Rethinking the Mathematics of
+  Attention Mechanism 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2403.01643v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2403.01643v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Mehran Hosseini, Peyman Hosseini
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Scaled Dot Product Attention (SDPA) is the backbone of many modern
+deep-learning models. It is so versatile that it has been used in natural
+language, vision, and multi-modal domains with very little change compared to
+its original formulation. This paper discusses why the current formulation is
+inefficient by delving into the mathematical details of the attention
+mechanism. We propose three improvements to mitigate these inefficiencies,
+thereby, introducing three enhanced attention mechanisms: Optimised, Efficient,
+and Super Attention. Optimised and Efficient Attention have one and two matrix
+multiplications fewer per head, respectively, and 25% and 50% fewer parameters,
+respectively, than standard SDPA, but perform similarly to standard SDPA in
+both vision and natural language tasks. They can be used in all applications
+where SDPA is used while offering smaller model sizes and faster training and
+inference without noticeable loss in performance. Super Attention introduces a
+new linear transformation on the values, transforming them from the left. It
+outperforms standard SPDA on vision and natural language tasks by up to 17%
+while having one fewer matrix multiplication per head and 25% fewer parameters
+than standard SDPA. Consequently, it is also faster than standard SDPA. Super
+Attention is ideal in applications where the attention layer's context length
+is fixed, such as Vision Transformers. In addition to providing mathematical
+reasoning, we evaluate the presented attention mechanisms on several datasets
+including MNIST, CIFAR100, ImageNet, IMDB Movie Reviews, and Amazon Reviews
+datasets, as well as combined Europarl and Anki English-Spanish datasets for
+neural machine translation.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ From One to Many: Expanding the Scope of Toxicity Mitigation in Language
+  Models 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2403.03893v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2403.03893v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Luiza Pozzobon, Patrick Lewis, Sara Hooker, Beyza Ermis
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  To date, toxicity mitigation in language models has almost entirely been
+focused on single-language settings. As language models embrace multilingual
+capabilities, it's crucial our safety measures keep pace. Recognizing this
+research gap, our approach expands the scope of conventional toxicity
+mitigation to address the complexities presented by multiple languages. In the
+absence of sufficient annotated datasets across languages, we employ translated
+data to evaluate and enhance our mitigation techniques. We also compare
+finetuning mitigation approaches against retrieval-augmented techniques under
+both static and continual toxicity mitigation scenarios. This allows us to
+examine the effects of translation quality and the cross-lingual transfer on
+toxicity mitigation. We also explore how model size and data quantity affect
+the success of these mitigation efforts. Covering nine languages, our study
+represents a broad array of linguistic families and levels of resource
+availability, ranging from high to mid-resource languages. Through
+comprehensive experiments, we provide insights into the complexities of
+multilingual toxicity mitigation, offering valuable insights and paving the way
+for future research in this increasingly important field. Code and data are
+available at https://github.com/for-ai/goodtriever.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Tag-LLM: Repurposing General-Purpose LLMs for Specialized Domains 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2402.05140v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2402.05140v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Junhong Shen, Neil Tenenholtz, James Brian Hall, David Alvarez-Melis, Nicolo Fusi
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Large Language Models (LLMs) have demonstrated remarkable proficiency in
+understanding and generating natural language. However, their capabilities wane
+in highly specialized domains underrepresented in the pretraining corpus, such
+as physical and biomedical sciences. This work explores how to repurpose
+general LLMs into effective task solvers for specialized domains. We introduce
+a novel, model-agnostic framework for learning custom input tags, which are
+parameterized as continuous vectors appended to the LLM's embedding layer, to
+condition the LLM. We design two types of input tags: domain tags are used to
+delimit specialized representations (e.g., chemical formulas) and provide
+domain-relevant context; function tags are used to represent specific functions
+(e.g., predicting molecular properties) and compress function-solving
+instructions. We develop a three-stage protocol to learn these tags using
+auxiliary data and domain knowledge. By explicitly disentangling task domains
+from task functions, our method enables zero-shot generalization to unseen
+problems through diverse combinations of the input tags. It also boosts LLM's
+performance in various specialized domains, such as predicting protein or
+chemical properties and modeling drug-target interactions, outperforming expert
+models tailored to these tasks.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Unveiling Linguistic Regions in Large Language Models <span class="chip">ACL 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2402.14700v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2402.14700v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Zhihao Zhang, Jun Zhao, Qi Zhang, Tao Gui, Xuanjing Huang
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Large Language Models (LLMs) have demonstrated considerable cross-lingual
+alignment and generalization ability. Current research primarily focuses on
+improving LLMs' cross-lingual generalization capabilities. However, there is
+still a lack of research on the intrinsic mechanisms of how LLMs achieve
+cross-lingual alignment. From the perspective of region partitioning, this
+paper conducts several investigations on the linguistic competence of LLMs. We
+discover a core region in LLMs that corresponds to linguistic competence,
+accounting for approximately 1% of the total model parameters. Removing this
+core region by setting parameters to zero results in a significant performance
+decrease across 30 different languages. Furthermore, this core region exhibits
+significant dimensional dependence, perturbations to even a single parameter on
+specific dimensions leading to a loss of linguistic competence. Moreover, we
+discover that distinct monolingual regions exist for different languages, and
+disruption to these specific regions substantially reduces the LLMs'
+proficiency in those corresponding languages. Our research also indicates that
+freezing the core linguistic region during further pre-training can mitigate
+the issue of catastrophic forgetting (CF), a common phenomenon observed during
+further pre-training of LLMs. Overall, exploring the LLMs' functional regions
+provides insights into the foundation of their intelligence.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted by ACL 2024. Camera-Ready Version</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ MAP-Neo: Highly Capable and Transparent Bilingual Large Language Model
+  Series 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.19327v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.19327v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Ge Zhang, Scott Qu, Jiaheng Liu, Chenchen Zhang, Chenghua Lin, Chou Leuang Yu, Danny Pan, Esther Cheng, Jie Liu, Qunshu Lin, Raven Yuan, Tuney Zheng, Wei Pang, Xinrun Du, Yiming Liang, Yinghao Ma, Yizhi Li, Ziyang Ma, Bill Lin, Emmanouil Benetos, Huan Yang, Junting Zhou, Kaijing Ma, Minghao Liu, Morry Niu, Noah Wang, Quehry Que, Ruibo Liu, Sine Liu, Shawn Guo, Soren Gao, Wangchunshu Zhou, Xinyue Zhang, Yizhi Zhou, Yubo Wang, Yuelin Bai, Yuhan Zhang, Yuxiang Zhang, Zenith Wang, Zhenzhu Yang, Zijian Zhao, Jiajun Zhang, Wanli Ouyang, Wenhao Huang, Wenhu Chen
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Large Language Models (LLMs) have made great strides in recent years to
+achieve unprecedented performance across different tasks. However, due to
+commercial interest, the most competitive models like GPT, Gemini, and Claude
+have been gated behind proprietary interfaces without disclosing the training
+details. Recently, many institutions have open-sourced several strong LLMs like
+LLaMA-3, comparable to existing closed-source LLMs. However, only the model's
+weights are provided with most details (e.g., intermediate checkpoints,
+pre-training corpus, and training code, etc.) being undisclosed. To improve the
+transparency of LLMs, the research community has formed to open-source truly
+open LLMs (e.g., Pythia, Amber, OLMo), where more details (e.g., pre-training
+corpus and training code) are being provided. These models have greatly
+advanced the scientific study of these large models including their strengths,
+weaknesses, biases and risks. However, we observe that the existing truly open
+LLMs on reasoning, knowledge, and coding tasks are still inferior to existing
+state-of-the-art LLMs with similar model sizes. To this end, we open-source
+MAP-Neo, a highly capable and transparent bilingual language model with 7B
+parameters trained from scratch on 4.5T high-quality tokens. Our MAP-Neo is the
+first fully open-sourced bilingual LLM with comparable performance compared to
+existing state-of-the-art LLMs. Moreover, we open-source all details to
+reproduce our MAP-Neo, where the cleaned pre-training corpus, data cleaning
+pipeline, checkpoints, and well-optimized training/evaluation framework are
+provided. Finally, we hope our MAP-Neo will enhance and strengthen the open
+research community and inspire more innovations and creativities to facilitate
+the further improvements of LLMs.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>https://map-neo.github.io/</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Formalizing and Benchmarking <span class="highlight-title">Prompt</span> Injection Attacks and Defenses <span class="chip">USENIX Security</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2310.12815v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2310.12815v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Yupei Liu, Yuqi Jia, Runpeng Geng, Jinyuan Jia, Neil Zhenqiang Gong
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  A prompt injection attack aims to inject malicious instruction/data into the
+input of an LLM-Integrated Application such that it produces results as an
+attacker desires. Existing works are limited to case studies. As a result, the
+literature lacks a systematic understanding of prompt injection attacks and
+their defenses. We aim to bridge the gap in this work. In particular, we
+propose a framework to formalize prompt injection attacks. Existing attacks are
+special cases in our framework. Moreover, based on our framework, we design a
+new attack by combining existing ones. Using our framework, we conduct a
+systematic evaluation on 5 prompt injection attacks and 10 defenses with 10
+LLMs and 7 tasks. Our work provides a common benchmark for quantitatively
+evaluating future prompt injection attacks and defenses. To facilitate research
+on this topic, we make our platform public at
+https://github.com/liu00222/Open-Prompt-Injection.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>To appear in USENIX Security Symposium 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Ensemble Learning for Heterogeneous Large Language Models with Deep
+  Parallel Collaboration 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2404.12715v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2404.12715v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Yichong Huang, Xiaocheng Feng, Baohang Li, Yang Xiang, Hui Wang, Bing Qin, Ting Liu
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Large language models (LLMs) exhibit complementary strengths in various
+tasks, motivating the research of LLM ensembling. However, existing work
+focuses on training an extra reward model or fusion model to select or combine
+all candidate answers, posing a great challenge to the generalization on unseen
+data distributions. Besides, prior methods use textual responses as
+communication media, ignoring the valuable information in the internal
+representations. In this work, we propose a training-free ensemble framework
+DeePEn, fusing the informative probability distributions yielded by different
+LLMs at each decoding step. Unfortunately, the vocabulary discrepancy between
+heterogeneous LLMs directly makes averaging the distributions unfeasible due to
+the token misalignment. To address this challenge, DeePEn maps the probability
+distribution of each model from its own probability space to a universal
+relative space based on the relative representation theory, and performs
+aggregation. Next, we devise a search-based inverse transformation to transform
+the aggregated result back to the probability space of one of the ensembling
+LLMs (main model), in order to determine the next token. We conduct extensive
+experiments on ensembles of different number of LLMs, ensembles of LLMs with
+different architectures, and ensembles between the LLM and the specialist
+model. Experimental results show that (i) DeePEn achieves consistent
+improvements across six benchmarks covering subject examination, reasoning, and
+knowledge, (ii) a well-performing specialist model can benefit from a less
+effective LLM through distribution fusion, and (iii) DeePEn has complementary
+strengths with other ensemble methods such as voting.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>16 pages, 9 figures, 9 tables</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ EMS: Efficient and Effective Massively Multilingual Sentence Embedding
+  Learning 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2205.15744v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2205.15744v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Zhuoyuan Mao, Chenhui Chu, Sadao Kurohashi
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Massively multilingual sentence representation models, e.g., LASER,
+SBERT-distill, and LaBSE, help significantly improve cross-lingual downstream
+tasks. However, the use of a large amount of data or inefficient model
+architectures results in heavy computation to train a new model according to
+our preferred languages and domains. To resolve this issue, we introduce
+efficient and effective massively multilingual sentence embedding (EMS), using
+cross-lingual token-level reconstruction (XTR) and sentence-level contrastive
+learning as training objectives. Compared with related studies, the proposed
+model can be efficiently trained using significantly fewer parallel sentences
+and GPU computation resources. Empirical results showed that the proposed model
+significantly yields better or comparable results with regard to cross-lingual
+sentence retrieval, zero-shot cross-lingual genre classification, and sentiment
+classification. Ablative analyses demonstrated the efficiency and effectiveness
+of each component of the proposed model. We release the codes for model
+training and the EMS pre-trained sentence embedding model, which supports 62
+languages ( https://github.com/Mao-KU/EMS ).
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>This work is a multilingual extension of arXiv:2105.13856. This work
+  has been accepted by IEEE/ACM Transactions on Audio, Speech, and Language
+  Processing (DOI: 10.1109/TASLP.2024.3402064). Copyright has been transferred</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Not All Experts are Equal: Efficient Expert Pruning and Skipping for
+  Mixture-of-Experts Large Language Models <span class="chip">ACL2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2402.14800v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2402.14800v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Xudong Lu, Qi Liu, Yuhui Xu, Aojun Zhou, Siyuan Huang, Bo Zhang, Junchi Yan, Hongsheng Li
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  A pivotal advancement in the progress of large language models (LLMs) is the
+emergence of the Mixture-of-Experts (MoE) LLMs. Compared to traditional LLMs,
+MoE LLMs can achieve higher performance with fewer parameters, but it is still
+hard to deploy them due to their immense parameter sizes. Different from
+previous weight pruning methods that rely on specifically designed hardware,
+this paper mainly aims to enhance the deployment efficiency of MoE LLMs by
+introducing plug-and-play expert-level sparsification techniques. Specifically,
+we propose, for the first time to our best knowledge, post-training approaches
+for task-agnostic and task-specific expert pruning and skipping of MoE LLMs,
+tailored to improve deployment efficiency while maintaining model performance
+across a wide range of tasks. Extensive experiments show that our proposed
+methods can simultaneously reduce model sizes and increase the inference speed,
+while maintaining satisfactory performance. Data and code will be available at
+https://github.com/Lucky-Lance/Expert_Sparsity.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Mixture-of-Experts Large Language Models, ACL2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Federated Fine-tuning of Large Language Models under Heterogeneous Tasks
+  and Client Resources 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2402.11505v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2402.11505v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Jiamu Bai, Daoyuan Chen, Bingchen Qian, Liuyi Yao, Yaliang Li
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Federated Learning (FL) has recently been applied to the parameter-efficient
+fine-tuning of Large Language Models (LLMs). While promising, it raises
+significant challenges due to the heterogeneous resources and data
+distributions of clients. This study introduces FlexLoRA, a simple yet
+effective aggregation scheme for LLM fine-tuning, which mitigates the ``bucket
+effect'' in traditional FL that restricts the potential of clients with ample
+resources by tying them to the capabilities of the least-resourced
+participants. FlexLoRA allows for dynamic adjustment of local LoRA ranks,
+fostering the development of a global model imbued with broader, less
+task-specific knowledge. By synthesizing a full-size LoRA weight from
+individual client contributions and employing Singular Value Decomposition
+(SVD) for weight redistribution, FlexLoRA fully leverages heterogeneous client
+resources. Involving thousands of clients performing heterogeneous NLP tasks
+and client resources, our experiments validate the efficacy of FlexLoRA, with
+the federated global model achieving consistently better improvement over SOTA
+FL methods in downstream NLP task performance across various heterogeneous
+distributions. FlexLoRA's practicality is further underscored by our
+theoretical analysis and its seamless integration with existing LoRA-based FL
+methods, offering a path toward cross-device, privacy-preserving federated
+tuning for LLMs.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>19 pages, 13 tables, 9 figures</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Machine Unlearning of <span class="highlight-title">Pre-train</span>ed Large Language Models <span class="chip">ACL 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2402.15159v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2402.15159v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Jin Yao, Eli Chien, Minxin Du, Xinyao Niu, Tianhao Wang, Zezhou Cheng, Xiang Yue
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  This study investigates the concept of the `right to be forgotten' within the
+context of large language models (LLMs). We explore machine unlearning as a
+pivotal solution, with a focus on pre-trained models--a notably
+under-researched area. Our research delineates a comprehensive framework for
+machine unlearning in pre-trained LLMs, encompassing a critical analysis of
+seven diverse unlearning methods. Through rigorous evaluation using curated
+datasets from arXiv, books, and GitHub, we establish a robust benchmark for
+unlearning performance, demonstrating that these methods are over $10^5$ times
+more computationally efficient than retraining. Our results show that
+integrating gradient ascent with gradient descent on in-distribution data
+improves hyperparameter robustness. We also provide detailed guidelines for
+efficient hyperparameter tuning in the unlearning process. Our findings advance
+the discourse on ethical AI practices, offering substantive insights into the
+mechanics of machine unlearning for pre-trained LLMs and underscoring the
+potential for responsible AI development.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>ACL 2024 main. Code and data at
+  https://github.com/yaojin17/Unlearning_LLM</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Zero-Shot Hierarchical Classification on the Common Procurement
+  Vocabulary Taxonomy 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.09983v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.09983v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Federico Moiraghi, Matteo Palmonari, Davide Allavena, Federico Morando
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Classifying public tenders is a useful task for both companies that are
+invited to participate and for inspecting fraudulent activities. To facilitate
+the task for both participants and public administrations, the European Union
+presented a common taxonomy (Common Procurement Vocabulary, CPV) which is
+mandatory for tenders of certain importance; however, the contracts in which a
+CPV label is mandatory are the minority compared to all the Public
+Administrations activities. Classifying over a real-world taxonomy introduces
+some difficulties that can not be ignored. First of all, some fine-grained
+classes have an insufficient (if any) number of observations in the training
+set, while other classes are far more frequent (even thousands of times) than
+the average. To overcome those difficulties, we present a zero-shot approach,
+based on a pre-trained language model that relies only on label description and
+respects the label taxonomy. To train our proposed model, we used industrial
+data, which comes from contrattipubblici.org, a service by SpazioDati s.r.l.
+that collects public contracts stipulated in Italy in the last 25 years.
+Results show that the proposed model achieves better performance in classifying
+low-frequent classes compared to three different baselines, and is also able to
+predict never-seen classes.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Full-length version of the short paper accepted at COMPSAC 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Code Repair with LLMs gives an Exploration-Exploitation Tradeoff 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.17503v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.17503v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Hao Tang, Keya Hu, Jin Peng Zhou, Sicheng Zhong, Wei-Long Zheng, Xujie Si, Kevin Ellis
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Iteratively improving and repairing source code with large language models
+(LLMs), known as refinement, has emerged as a popular way of generating
+programs that would be too complex to construct in one shot. Given a bank of
+test cases, together with a candidate program, an LLM can improve that program
+by being prompted with failed test cases. But it remains an open question how
+to best iteratively refine code, with prior work employing simple greedy or
+breadth-first strategies. We show here that refinement exposes an
+explore-exploit tradeoff: exploit by refining the program that passes the most
+test cases, or explore by refining a lesser considered program. We frame this
+as an arm-acquiring bandit problem, which we solve with Thompson Sampling. The
+resulting LLM-based program synthesis algorithm is broadly applicable: Across
+loop invariant synthesis, visual reasoning puzzles, and competition programming
+problems, we find that our new method can solve more problems using fewer
+language model calls.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Text clustering with LLM embeddings 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2403.15112v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2403.15112v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Alina Petukhova, João P. Matos-Carvalho, Nuno Fachada
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Text clustering is an important approach for organising the growing amount of
+digital content, helping to structure and find hidden patterns in uncategorised
+data. However, the effectiveness of text clustering heavily relies on the
+choice of textual embeddings and clustering algorithms. We argue that recent
+advances in large language models (LLMs) can potentially improve this task. In
+this research, we investigated how different textual embeddings -- particularly
+those used in LLMs -- and clustering algorithms affect how text datasets are
+clustered. A series of experiments were conducted to assess how embeddings
+influence clustering results, the role played by dimensionality reduction
+through summarisation, and model size adjustment. Findings reveal that LLM
+embeddings excel at capturing subtleties in structured language, while BERT
+leads the lightweight options in performance. In addition, we observe that
+increasing model dimensionality and employing summarization techniques do not
+consistently lead to improvements in clustering efficiency, suggesting that
+these strategies require careful analysis to use in real-life models. These
+results highlight a complex balance between the need for refined text
+representation and computational feasibility in text clustering applications.
+This study extends traditional text clustering frameworks by incorporating
+embeddings from LLMs, providing a path for improved methodologies, while
+informing new avenues for future research in various types of textual analysis.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ On the Representational Capacity of Recurrent Neural Language Models 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2310.12942v5">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2310.12942v5.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Franz Nowak, Anej Svete, Li Du, Ryan Cotterell
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  This work investigates the computational expressivity of language models
+(LMs) based on recurrent neural networks (RNNs). Siegelmann and Sontag (1992)
+famously showed that RNNs with rational weights and hidden states and unbounded
+computation time are Turing complete. However, LMs define weightings over
+strings in addition to just (unweighted) language membership and the analysis
+of the computational power of RNN LMs (RLMs) should reflect this. We extend the
+Turing completeness result to the probabilistic case, showing how a rationally
+weighted RLM with unbounded computation time can simulate any deterministic
+probabilistic Turing machine (PTM) with rationally weighted transitions. Since,
+in practice, RLMs work in real-time, processing a symbol at every time step, we
+treat the above result as an upper bound on the expressivity of RLMs. We also
+provide a lower bound by showing that under the restriction to real-time
+computation, such models can simulate deterministic real-time rational PTMs.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Added requirement for non-negative probabilities to definitions 2.3
+  and 3.1, fixed typos</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ A Morphology-Based Investigation of Positional Encodings 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2404.04530v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2404.04530v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Poulami Ghosh, Shikhar Vashishth, Raj Dabre, Pushpak Bhattacharyya
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Contemporary deep learning models effectively handle languages with diverse
+morphology despite not being directly integrated into them. Morphology and word
+order are closely linked, with the latter incorporated into transformer-based
+models through positional encodings. This prompts a fundamental inquiry: Is
+there a correlation between the morphological complexity of a language and the
+utilization of positional encoding in pre-trained language models? In pursuit
+of an answer, we present the first study addressing this question, encompassing
+22 languages and 5 downstream tasks. Our findings reveal that the importance of
+positional encoding diminishes with increasing morphological complexity in
+languages. Our study motivates the need for a deeper understanding of
+positional encoding, augmenting them to better reflect the different languages
+under consideration.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Work in Progress</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Systematic Analysis for <span class="highlight-title">Pretrain</span>ed Language Model Priming for
+  Parameter-Efficient Fine-tuning 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2212.01032v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2212.01032v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Shih-Cheng Huang, Shih-Heng Wang, Min-Han Shih, Saurav Sahay, Hung-yi Lee
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Parameter-efficient (PE) methods (like Prompts or Adapters) for adapting
+pre-trained language models (PLM) to downstream tasks have been popular
+recently. However, hindrances still prevent these methods from reaching their
+full potential. For example, two significant challenges are few-shot adaptation
+and cross-task generalization. To tackle these issues, we propose a general PE
+priming framework to enhance and explore the few-shot adaptation and
+generalization ability of PE methods. In this framework, PLMs are primed with
+PE methods for rapidly adapting to various target tasks. To evaluate the
+generalization ability of these PE methods, we conduct experiments on a
+few-shot cross-domain benchmark containing 160 diverse NLP tasks. Our
+experiment not only reveals the best priming strategy but also verifies that
+priming facilitates the adaptation to target tasks.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Uncertainty of Thoughts: Uncertainty-Aware Planning Enhances Information
+  Seeking in Large Language Models 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2402.03271v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2402.03271v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Zhiyuan Hu, Chumin Liu, Xidong Feng, Yilun Zhao, See-Kiong Ng, Anh Tuan Luu, Junxian He, Pang Wei Koh, Bryan Hooi
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  In the face of uncertainty, the ability to *seek information* is of
+fundamental importance. In many practical applications, such as medical
+diagnosis and troubleshooting, the information needed to solve the task is not
+initially given and has to be actively sought by asking follow-up questions
+(for example, a doctor asking a patient for more details about their symptoms).
+In this work, we introduce Uncertainty of Thoughts (UoT), an algorithm to
+augment large language models with the ability to actively seek information by
+asking effective questions. UoT combines 1) an *uncertainty-aware simulation
+approach* which enables the model to simulate possible future scenarios and how
+likely they are to occur, 2) *uncertainty-based rewards* motivated by
+information gain which incentivizes the model to seek information, and 3) a
+*reward propagation scheme* to select the optimal question to ask in a way that
+maximizes the expected reward. In experiments on medical diagnosis,
+troubleshooting, and the `20 Questions` game, UoT achieves an average
+performance improvement of 38.1% in the rate of successful task completion
+across multiple LLMs compared with direct prompting and also improves
+efficiency (i.e., the number of questions needed to complete the task). Our
+code has been released [here](https://github.com/zhiyuanhubj/UoT)
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Update Results</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Debating with More Persuasive LLMs Leads to More Truthful Answers 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2402.06782v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2402.06782v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Akbir Khan, John Hughes, Dan Valentine, Laura Ruis, Kshitij Sachan, Ansh Radhakrishnan, Edward Grefenstette, Samuel R. Bowman, Tim Rocktäschel, Ethan Perez
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Common methods for aligning large language models (LLMs) with desired
+behaviour heavily rely on human-labelled data. However, as models grow
+increasingly sophisticated, they will surpass human expertise, and the role of
+human evaluation will evolve into non-experts overseeing experts. In
+anticipation of this, we ask: can weaker models assess the correctness of
+stronger models? We investigate this question in an analogous setting, where
+stronger models (experts) possess the necessary information to answer questions
+and weaker models (non-experts) lack this information. The method we evaluate
+is debate, where two LLM experts each argue for a different answer, and a
+non-expert selects the answer. We find that debate consistently helps both
+non-expert models and humans answer questions, achieving 76% and 88% accuracy
+respectively (naive baselines obtain 48% and 60%). Furthermore, optimising
+expert debaters for persuasiveness in an unsupervised manner improves
+non-expert ability to identify the truth in debates. Our results provide
+encouraging empirical evidence for the viability of aligning models with debate
+in the absence of ground truth.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>For code please check: https://github.com/ucl-dark/llm_debate</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Cross-Lingual Knowledge Editing in Large Language Models <span class="chip">ACL 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2309.08952v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2309.08952v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Jiaan Wang, Yunlong Liang, Zengkui Sun, Yuxuan Cao, Jiarong Xu, Fandong Meng
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Knowledge editing aims to change language models' performance on several
+special cases (i.e., editing scope) by infusing the corresponding expected
+knowledge into them. With the recent advancements in large language models
+(LLMs), knowledge editing has been shown as a promising technique to adapt LLMs
+to new knowledge without retraining from scratch. However, most of the previous
+studies neglect the multi-lingual nature of some main-stream LLMs (e.g., LLaMA,
+ChatGPT and GPT-4), and typically focus on monolingual scenarios, where LLMs
+are edited and evaluated in the same language. As a result, it is still unknown
+the effect of source language editing on a different target language. In this
+paper, we aim to figure out this cross-lingual effect in knowledge editing.
+Specifically, we first collect a large-scale cross-lingual synthetic dataset by
+translating ZsRE from English to Chinese. Then, we conduct English editing on
+various knowledge editing methods covering different paradigms, and evaluate
+their performance in Chinese, and vice versa. To give deeper analyses of the
+cross-lingual effect, the evaluation includes four aspects, i.e., reliability,
+generality, locality and portability. Furthermore, we analyze the inconsistent
+behaviors of the edited models and discuss their specific challenges. Data and
+codes are available at https://github.com/krystalan/Bi_ZsRE
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted to ACL 2024 main conference</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Chatlaw: A Multi-Agent Collaborative Legal Assistant with Knowledge
+  Graph Enhanced Mixture-of-Experts Large Language Model 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2306.16092v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2306.16092v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Jiaxi Cui, Munan Ning, Zongjian Li, Bohua Chen, Yang Yan, Hao Li, Bin Ling, Yonghong Tian, Li Yuan
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  AI legal assistants based on Large Language Models (LLMs) can provide
+accessible legal consulting services, but the hallucination problem poses
+potential legal risks. This paper presents Chatlaw, an innovative legal
+assistant utilizing a Mixture-of-Experts (MoE) model and a multi-agent system
+to enhance the reliability and accuracy of AI-driven legal services. By
+integrating knowledge graphs with artificial screening, we construct a
+high-quality legal dataset to train the MoE model. This model utilizes
+different experts to address various legal issues, optimizing the accuracy of
+legal responses. Additionally, Standardized Operating Procedures (SOP), modeled
+after real law firm workflows, significantly reduce errors and hallucinations
+in legal services. Our MoE model outperforms GPT-4 in the Lawbench and Unified
+Qualification Exam for Legal Professionals by 7.73% in accuracy and 11 points,
+respectively, and also surpasses other models in multiple dimensions during
+real-case consultations, demonstrating our robust capability for legal
+consultation.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Towards Uncertainty-Aware Language Agent <span class="chip">ACL 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2401.14016v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2401.14016v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Jiuzhou Han, Wray Buntine, Ehsan Shareghi
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  While Language Agents have achieved promising success by placing Large
+Language Models at the core of a more versatile design that dynamically
+interacts with the external world, the existing approaches neglect the notion
+of uncertainty during these interactions. We present the Uncertainty-Aware
+Language Agent (UALA), a framework that orchestrates the interaction between
+the agent and the external world using uncertainty quantification. Compared
+with other well-known counterparts like ReAct, our extensive experiments across
+3 representative tasks (HotpotQA, StrategyQA, MMLU) and various LLM sizes
+demonstrate that UALA brings a significant improvement of performance, while
+having a substantially lower reliance on the external world (i.e., reduced
+number of tool calls and tokens). Our analyses provide various insights
+including the great potential of UALA compared with agent fine-tuning, and
+underscore the unreliability of verbalised confidence of LLMs as a proxy for
+uncertainty.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Our code and data are at https://uala-agent.github.io. (accepted to
+  ACL 2024 Findings). arXiv admin note: text overlap with arXiv:2310.05915</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ PiVe: <span class="highlight-title">Prompt</span>ing with Iterative Verification Improving Graph-based
+  Generative Capability of LLMs <span class="chip">ACL 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2305.12392v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2305.12392v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Jiuzhou Han, Nigel Collier, Wray Buntine, Ehsan Shareghi
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Large language models (LLMs) have shown great abilities of solving various
+natural language tasks in different domains. Due to the training objective of
+LLMs and their pre-training data, LLMs are not very well equipped for tasks
+involving structured data generation. We propose a framework, Prompting with
+Iterative Verification (PiVe), to improve graph-based generative capability of
+LLMs. We show how a small language model could be trained to act as a verifier
+module for the output of an LLM~(i.e., ChatGPT, GPT-4), and to iteratively
+improve its performance via fine-grained corrective instructions. We also show
+how the verifier module could apply iterative corrections offline for a more
+cost-effective solution to the text-to-graph generation task. Experiments on
+three graph-based datasets show consistent improvement gained via PiVe.
+Additionally, we create GenWiki-HIQ and highlight that the verifier module can
+be used as a data augmentation tool to help improve the quality of
+automatically generated parallel text-graph datasets.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Our code and data are at https://github.com/Jiuzhouh/PiVe (accepted
+  to ACL 2024 Findings)</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Prismatic VLMs: Investigating the Design Space of Visually-Conditioned
+  Language Models <span class="chip">ICML 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2402.07865v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2402.07865v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Siddharth Karamcheti, Suraj Nair, Ashwin Balakrishna, Percy Liang, Thomas Kollar, Dorsa Sadigh
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Visually-conditioned language models (VLMs) have seen growing adoption in
+applications such as visual dialogue, scene understanding, and robotic task
+planning; adoption that has fueled a wealth of new models such as LLaVa,
+InstructBLIP, and PaLI-3. Despite the volume of new releases, key design
+decisions around image preprocessing, architecture, and optimization are
+under-explored, making it challenging to understand what factors account for
+model performance $-$ a challenge further complicated by the lack of objective,
+consistent evaluations. To address these gaps, we first compile a suite of
+standardized evaluations spanning visual question answering, object
+localization, and challenge sets that probe properties such as hallucination;
+evaluations that provide fine-grained insight VLM capabilities. Second, we
+rigorously investigate VLMs along key design axes, including pretrained visual
+representations and training from base vs. instruct-tuned language models,
+amongst others. We couple our analysis with three resource contributions: (1) a
+unified framework for evaluating VLMs, (2) optimized, flexible training code,
+and (3) checkpoints for all models, including a family of VLMs at the 7-13B
+scale that strictly outperform InstructBLIP and LLaVa v1.5, the
+state-of-the-art in open VLMs.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Published at ICML 2024. 22 pages, 11 figures. Training code and
+  models: https://github.com/TRI-ML/prismatic-vlms. Evaluation code:
+  https://github.com/TRI-ML/vlm-evaluation</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Language Models Represent Beliefs of Self and Others 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2402.18496v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2402.18496v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Wentao Zhu, Zhining Zhang, Yizhou Wang
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Understanding and attributing mental states, known as Theory of Mind (ToM),
+emerges as a fundamental capability for human social reasoning. While Large
+Language Models (LLMs) appear to possess certain ToM abilities, the mechanisms
+underlying these capabilities remain elusive. In this study, we discover that
+it is possible to linearly decode the belief status from the perspectives of
+various agents through neural activations of language models, indicating the
+existence of internal representations of self and others' beliefs. By
+manipulating these representations, we observe dramatic changes in the models'
+ToM performance, underscoring their pivotal role in the social reasoning
+process. Additionally, our findings extend to diverse social reasoning tasks
+that involve different causal inference patterns, suggesting the potential
+generalizability of these representations.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>project page: https://walter0807.github.io/RepBelief/</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Meta-Task Planning for Language Agents 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.16510v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.16510v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Cong Zhang, Derrick Goh Xin Deik, Dexun Li, Hao Zhang, Yong Liu
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  The rapid advancement of neural language models has sparked a new surge of
+intelligent agent research. Unlike traditional agents, large language
+model-based agents (LLM agents) have emerged as a promising paradigm for
+achieving artificial general intelligence (AGI) due to their superior reasoning
+and generalization capabilities. Effective planning is crucial for the success
+of LLM agents in real-world tasks, making it a highly pursued topic in the
+community. Current planning methods typically translate tasks into executable
+action sequences. However, determining a feasible or optimal sequence for
+complex tasks at fine granularity, which often requires compositing long chains
+of heterogeneous actions, remains challenging. This paper introduces Meta-Task
+Planning (MTP), a zero-shot methodology for collaborative LLM-based multi-agent
+systems that simplifies complex task planning by decomposing it into a
+hierarchy of subordinate tasks, or meta-tasks. Each meta-task is then mapped
+into executable actions. MTP was assessed on two rigorous benchmarks,
+TravelPlanner and API-Bank. Notably, MTP achieved an average $\sim40\%$ success
+rate on TravelPlanner, significantly higher than the state-of-the-art (SOTA)
+baseline ($2.92\%$), and outperforming $LLM_{api}$-4 with ReAct on API-Bank by
+$\sim14\%$, showing the immense potential of integrating LLM with multi-agent
+systems.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ ChatKBQA: A Generate-then-Retrieve Framework for Knowledge Base Question
+  Answering with Fine-tuned Large Language Models <span class="chip">ACL 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2310.08975v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2310.08975v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Haoran Luo, Haihong E, Zichen Tang, Shiyao Peng, Yikai Guo, Wentai Zhang, Chenghao Ma, Guanting Dong, Meina Song, Wei Lin, Yifan Zhu, Luu Anh Tuan
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Knowledge Base Question Answering (KBQA) aims to answer natural language
+questions over large-scale knowledge bases (KBs), which can be summarized into
+two crucial steps: knowledge retrieval and semantic parsing. However, three
+core challenges remain: inefficient knowledge retrieval, mistakes of retrieval
+adversely impacting semantic parsing, and the complexity of previous KBQA
+methods. To tackle these challenges, we introduce ChatKBQA, a novel and simple
+generate-then-retrieve KBQA framework, which proposes first generating the
+logical form with fine-tuned LLMs, then retrieving and replacing entities and
+relations with an unsupervised retrieval method, to improve both generation and
+retrieval more directly. Experimental results show that ChatKBQA achieves new
+state-of-the-art performance on standard KBQA datasets, WebQSP, and CWQ. This
+work can also be regarded as a new paradigm for combining LLMs with knowledge
+graphs (KGs) for interpretable and knowledge-required question answering. Our
+code is publicly available.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted by Findings of ACL 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ A New Benchmark for Evaluating Automatic Speech Recognition in the
+  Arabic Call Domain 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2403.04280v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2403.04280v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Qusai Abo Obaidah, Muhy Eddin Za'ter, Adnan Jaljuli, Ali Mahboub, Asma Hakouz, Bashar Al-Rfooh, Yazan Estaitia
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  This work is an attempt to introduce a comprehensive benchmark for Arabic
+speech recognition, specifically tailored to address the challenges of
+telephone conversations in Arabic language. Arabic, characterized by its rich
+dialectal diversity and phonetic complexity, presents a number of unique
+challenges for automatic speech recognition (ASR) systems. These challenges are
+further amplified in the domain of telephone calls, where audio quality,
+background noise, and conversational speech styles negatively affect
+recognition accuracy. Our work aims to establish a robust benchmark that not
+only encompasses the broad spectrum of Arabic dialects but also emulates the
+real-world conditions of call-based communications. By incorporating diverse
+dialectical expressions and accounting for the variable quality of call
+recordings, this benchmark seeks to provide a rigorous testing ground for the
+development and evaluation of ASR systems capable of navigating the
+complexities of Arabic speech in telephonic contexts. This work also attempts
+to establish a baseline performance evaluation using state-of-the-art ASR
+technologies.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Evaluation of Semantic Search and its Role in
+  Retrieved-Augmented-Generation (RAG) for Arabic Language 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2403.18350v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2403.18350v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Ali Mahboub, Muhy Eddin Za'ter, Bashar Al-Rfooh, Yazan Estaitia, Adnan Jaljuli, Asma Hakouz
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  The latest advancements in machine learning and deep learning have brought
+forth the concept of semantic similarity, which has proven immensely beneficial
+in multiple applications and has largely replaced keyword search. However,
+evaluating semantic similarity and conducting searches for a specific query
+across various documents continue to be a complicated task. This complexity is
+due to the multifaceted nature of the task, the lack of standard benchmarks,
+whereas these challenges are further amplified for Arabic language. This paper
+endeavors to establish a straightforward yet potent benchmark for semantic
+search in Arabic. Moreover, to precisely evaluate the effectiveness of these
+metrics and the dataset, we conduct our assessment of semantic search within
+the framework of retrieval augmented generation (RAG).
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Small Models, Big Insights: Leveraging Slim Proxy Models To Decide When
+  and What to Retrieve for LLMs <span class="chip">ACL 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2402.12052v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2402.12052v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Jiejun Tan, Zhicheng Dou, Yutao Zhu, Peidong Guo, Kun Fang, Ji-Rong Wen
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  The integration of large language models (LLMs) and search engines represents
+a significant evolution in knowledge acquisition methodologies. However,
+determining the knowledge that an LLM already possesses and the knowledge that
+requires the help of a search engine remains an unresolved issue. Most existing
+methods solve this problem through the results of preliminary answers or
+reasoning done by the LLM itself, but this incurs excessively high
+computational costs. This paper introduces a novel collaborative approach,
+namely SlimPLM, that detects missing knowledge in LLMs with a slim proxy model,
+to enhance the LLM's knowledge acquisition process. We employ a proxy model
+which has far fewer parameters, and take its answers as heuristic answers.
+Heuristic answers are then utilized to predict the knowledge required to answer
+the user question, as well as the known and unknown knowledge within the LLM.
+We only conduct retrieval for the missing knowledge in questions that the LLM
+does not know. Extensive experimental results on five datasets with two LLMs
+demonstrate a notable improvement in the end-to-end performance of LLMs in
+question-answering tasks, achieving or surpassing current state-of-the-art
+models with lower LLM inference costs.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted by ACL 2024 main conference. Repo:
+  https://github.com/plageon/SlimPLM</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ C$^{3}$Bench: A Comprehensive Classical Chinese Understanding Benchmark
+  for Large Language Models 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.17732v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.17732v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Jiahuan Cao, Yongxin Shi, Dezhi Peng, Yang Liu, Lianwen Jin
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Classical Chinese Understanding (CCU) holds significant value in preserving
+and exploration of the outstanding traditional Chinese culture. Recently,
+researchers have attempted to leverage the potential of Large Language Models
+(LLMs) for CCU by capitalizing on their remarkable comprehension and semantic
+capabilities. However, no comprehensive benchmark is available to assess the
+CCU capabilities of LLMs. To fill this gap, this paper introduces C$^{3}$bench,
+a Comprehensive Classical Chinese understanding benchmark, which comprises
+50,000 text pairs for five primary CCU tasks, including classification,
+retrieval, named entity recognition, punctuation, and translation. Furthermore,
+the data in C$^{3}$bench originates from ten different domains, covering most
+of the categories in classical Chinese. Leveraging the proposed C$^{3}$bench,
+we extensively evaluate the quantitative performance of 15 representative LLMs
+on all five CCU tasks. Our results not only establish a public leaderboard of
+LLMs' CCU capabilities but also gain some findings. Specifically, existing LLMs
+are struggle with CCU tasks and still inferior to supervised models.
+Additionally, the results indicate that CCU is a task that requires special
+attention. We believe this study could provide a standard benchmark,
+comprehensive baselines, and valuable insights for the future advancement of
+LLM-based CCU research. The evaluation pipeline and dataset are available at
+\url{https://github.com/SCUT-DLVCLab/C3bench}.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ BIDER: Bridging Knowledge Inconsistency for Efficient
+  Retrieval-Augmented LLMs via Key Supporting Evidence <span class="chip">ACL 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2402.12174v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2402.12174v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Jiajie Jin, Yutao Zhu, Yujia Zhou, Zhicheng Dou
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Retrieval-augmented large language models (LLMs) have demonstrated efficacy
+in knowledge-intensive tasks such as open-domain QA, addressing inherent
+challenges in knowledge update and factual inadequacy. However, inconsistencies
+between retrieval knowledge and the necessary knowledge for LLMs, leading to a
+decline in LLM's answer quality. This paper introduces BIDER, an approach that
+refines retrieval documents into Key Supporting Evidence (KSE) through
+knowledge synthesis, supervised fine-tuning (SFT), and preference alignment. We
+train BIDER by learning from crafting KSE, while maximizing its output to align
+with LLM's information acquisition preferences through reinforcement learning.
+Evaluations across five datasets show BIDER boosts LLMs' answer quality by 7%
+while reducing input content length in retrieval documents by 80%,
+outperforming existing methods. The proposed KSE simulation effectively equips
+LLMs with essential information for accurate question answering.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted by ACL 2024 Findings</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Tool Learning with Large Language Models: A <span class="highlight-title">Survey</span> 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.17935v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.17935v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Changle Qu, Sunhao Dai, Xiaochi Wei, Hengyi Cai, Shuaiqiang Wang, Dawei Yin, Jun Xu, Ji-Rong Wen
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Recently, tool learning with large language models (LLMs) has emerged as a
+promising paradigm for augmenting the capabilities of LLMs to tackle highly
+complex problems. Despite growing attention and rapid advancements in this
+field, the existing literature remains fragmented and lacks systematic
+organization, posing barriers to entry for newcomers. This gap motivates us to
+conduct a comprehensive survey of existing works on tool learning with LLMs. In
+this survey, we focus on reviewing existing literature from the two primary
+aspects (1) why tool learning is beneficial and (2) how tool learning is
+implemented, enabling a comprehensive understanding of tool learning with LLMs.
+We first explore the "why" by reviewing both the benefits of tool integration
+and the inherent benefits of the tool learning paradigm from six specific
+aspects. In terms of "how", we systematically review the literature according
+to a taxonomy of four key stages in the tool learning workflow: task planning,
+tool selection, tool calling, and response generation. Additionally, we provide
+a detailed summary of existing benchmarks and evaluation methods, categorizing
+them according to their relevance to different stages. Finally, we discuss
+current challenges and outline potential future directions, aiming to inspire
+both researchers and industrial developers to further explore this emerging and
+promising area. We also maintain a GitHub repository to continually keep track
+of the relevant papers and resources in this rising area at
+\url{https://github.com/quchangle1/LLM-Tool-Survey}.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Distilling Robustness into Natural Language Inference Models with
+  Domain-Targeted Augmentation <span class="chip">ACL</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2305.13067v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2305.13067v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Joe Stacey, Marek Rei
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Knowledge distillation optimises a smaller student model to behave similarly
+to a larger teacher model, retaining some of the performance benefits. While
+this method can improve results on in-distribution examples, it does not
+necessarily generalise to out-of-distribution (OOD) settings. We investigate
+two complementary methods for improving the robustness of the resulting student
+models on OOD domains. The first approach augments the distillation with
+generated unlabelled examples that match the target distribution. The second
+method upsamples data points among the training set that are similar to the
+target distribution. When applied on the task of natural language inference
+(NLI), our experiments on MNLI show that distillation with these modifications
+outperforms previous robustness solutions. We also find that these methods
+improve performance on OOD domains even beyond the target domain.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted at ACL Findings 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ LQER: Low-Rank Quantization Error Reconstruction for LLMs <span class="chip">ICML2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2402.02446v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2402.02446v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Cheng Zhang, Jianyi Cheng, George A. Constantinides, Yiren Zhao
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Post-training quantization of Large Language Models (LLMs) is challenging. In
+this work, we introduce Low-rank Quantization Error Reduction (LQER), which
+combines quantization and low-rank approximation to recover the model
+capability. LQER leverages an activation-induced scale matrix to drive the
+singular value distribution of quantization error towards a desirable
+distribution, which enables nearly-lossless W4A8 quantization on various LLMs
+and downstream tasks without the need for knowledge distillation, grid search,
+or gradient-base iterative optimization. Unlike existing methods, the
+computation pattern of LQER eliminates the need for specialized Scatter and
+Gather processes to collect high-precision weights from irregular memory
+locations. Our W4A8 LLMs achieve near-lossless performance on six popular
+downstream tasks, while using 1.36$\times$ fewer hardware resources than the
+leading state-of-the-art method. We open-source our framework at
+https://github.com/ChengZhang-98/lqer
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted at ICML2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Advancing Large Language Models to Capture Varied Speaking Styles and
+  Respond Properly in Spoken Conversations <span class="chip">ACL 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2402.12786v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2402.12786v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Guan-Ting Lin, Cheng-Han Chiang, Hung-yi Lee
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  In spoken dialogue, even if two current turns are the same sentence, their
+responses might still differ when they are spoken in different styles. The
+spoken styles, containing paralinguistic and prosodic information, mark the
+most significant difference between text and speech modality. When using
+text-only LLMs to model spoken dialogue, text-only LLMs cannot give different
+responses based on the speaking style of the current turn. In this paper, we
+focus on enabling LLMs to listen to the speaking styles and respond properly.
+Our goal is to teach the LLM that "even if the sentences are identical if they
+are spoken in different styles, their corresponding responses might be
+different". Since there is no suitable dataset for achieving this goal, we
+collect a speech-to-speech dataset, StyleTalk, with the following desired
+characteristics: when two current speeches have the same content but are spoken
+in different styles, their responses will be different. To teach LLMs to
+understand and respond properly to the speaking styles, we propose the
+Spoken-LLM framework that can model the linguistic content and the speaking
+styles. We train Spoken-LLM using the StyleTalk dataset and devise a two-stage
+training pipeline to help the Spoken-LLM better learn the speaking styles.
+Based on extensive experiments, we show that Spoken-LLM outperforms text-only
+baselines and prior speech LLMs methods.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted by ACL 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ OSWorld: Benchmarking Multimodal Agents for Open-Ended Tasks in Real
+  Computer Environments 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2404.07972v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2404.07972v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Tianbao Xie, Danyang Zhang, Jixuan Chen, Xiaochuan Li, Siheng Zhao, Ruisheng Cao, Toh Jing Hua, Zhoujun Cheng, Dongchan Shin, Fangyu Lei, Yitao Liu, Yiheng Xu, Shuyan Zhou, Silvio Savarese, Caiming Xiong, Victor Zhong, Tao Yu
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Autonomous agents that accomplish complex computer tasks with minimal human
+interventions have the potential to transform human-computer interaction,
+significantly enhancing accessibility and productivity. However, existing
+benchmarks either lack an interactive environment or are limited to
+environments specific to certain applications or domains, failing to reflect
+the diverse and complex nature of real-world computer use, thereby limiting the
+scope of tasks and agent scalability. To address this issue, we introduce
+OSWorld, the first-of-its-kind scalable, real computer environment for
+multimodal agents, supporting task setup, execution-based evaluation, and
+interactive learning across various operating systems such as Ubuntu, Windows,
+and macOS. OSWorld can serve as a unified, integrated computer environment for
+assessing open-ended computer tasks that involve arbitrary applications.
+Building upon OSWorld, we create a benchmark of 369 computer tasks involving
+real web and desktop apps in open domains, OS file I/O, and workflows spanning
+multiple applications. Each task example is derived from real-world computer
+use cases and includes a detailed initial state setup configuration and a
+custom execution-based evaluation script for reliable, reproducible evaluation.
+Extensive evaluation of state-of-the-art LLM/VLM-based agents on OSWorld
+reveals significant deficiencies in their ability to serve as computer
+assistants. While humans can accomplish over 72.36% of the tasks, the best
+model achieves only 12.24% success, primarily struggling with GUI grounding and
+operational knowledge. Comprehensive analysis using OSWorld provides valuable
+insights for developing multimodal generalist agents that were not possible
+with previous benchmarks. Our code, environment, baseline models, and data are
+publicly available at https://os-world.github.io.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>51 pages, 21 figures</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ CICLe: Conformal In-Context Learning for Largescale Multi-Class Food
+  Risk Classification 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2403.11904v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2403.11904v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Korbinian Randl, John Pavlopoulos, Aron Henriksson, Tony Lindgren
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Contaminated or adulterated food poses a substantial risk to human health.
+Given sets of labeled web texts for training, Machine Learning and Natural
+Language Processing can be applied to automatically detect such risks. We
+publish a dataset of 7,546 short texts describing public food recall
+announcements. Each text is manually labeled, on two granularity levels (coarse
+and fine), for food products and hazards that the recall corresponds to. We
+describe the dataset and benchmark naive, traditional, and Transformer models.
+Based on our analysis, Logistic Regression based on a tf-idf representation
+outperforms RoBERTa and XLM-R on classes with low support. Finally, we discuss
+different prompting strategies and present an LLM-in-the-loop framework, based
+on Conformal Prediction, which boosts the performance of the base classifier
+while reducing energy consumption compared to normal prompting.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ PrivLM-Bench: A Multi-level Privacy Evaluation Benchmark for Language
+  Models <span class="chip">ACL 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2311.04044v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2311.04044v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Haoran Li, Dadi Guo, Donghao Li, Wei Fan, Qi Hu, Xin Liu, Chunkit Chan, Duanyi Yao, Yuan Yao, Yangqiu Song
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  The rapid development of language models (LMs) brings unprecedented
+accessibility and usage for both models and users. On the one hand, powerful
+LMs achieve state-of-the-art performance over numerous downstream NLP tasks. On
+the other hand, more and more attention is paid to unrestricted model accesses
+that may bring malicious privacy risks of data leakage. To address these
+issues, many recent works propose privacy-preserving language models (PPLMs)
+with differential privacy (DP). Unfortunately, different DP implementations
+make it challenging for a fair comparison among existing PPLMs. In this paper,
+we present PrivLM-Bench, a multi-perspective privacy evaluation benchmark to
+empirically and intuitively quantify the privacy leakage of LMs. Instead of
+only reporting DP parameters, PrivLM-Bench sheds light on the neglected
+inference data privacy during actual usage. PrivLM-Bench first clearly defines
+multi-faceted privacy objectives. Then, PrivLM-Bench constructs a unified
+pipeline to perform private fine-tuning. Lastly, PrivLM-Bench performs existing
+privacy attacks on LMs with pre-defined privacy objectives as the empirical
+evaluation results. The empirical attack results are used to fairly and
+intuitively evaluate the privacy leakage of various PPLMs. We conduct extensive
+experiments on three datasets of GLUE for mainstream LMs.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>To appear at ACL 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Intelligent Go-Explore: Standing on the Shoulders of Giant Foundation
+  Models 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.15143v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.15143v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Cong Lu, Shengran Hu, Jeff Clune
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Go-Explore is a powerful family of algorithms designed to solve
+hard-exploration problems, built on the principle of archiving discovered
+states, and iteratively returning to and exploring from the most promising
+states. This approach has led to superhuman performance across a wide variety
+of challenging problems including Atari games and robotic control, but requires
+manually designing heuristics to guide exploration, which is time-consuming and
+infeasible in general. To resolve this, we propose Intelligent Go-Explore (IGE)
+which greatly extends the scope of the original Go-Explore by replacing these
+heuristics with the intelligence and internalized human notions of
+interestingness captured by giant foundation models (FMs). This provides IGE
+with a human-like ability to instinctively identify how interesting or
+promising any new state is (e.g. discovering new objects, locations, or
+behaviors), even in complex environments where heuristics are hard to define.
+Moreover, IGE offers the exciting and previously impossible opportunity to
+recognize and capitalize on serendipitous discoveries that cannot be predicted
+ahead of time. We evaluate IGE on a range of language-based tasks that require
+search and exploration. In Game of 24, a multistep mathematical reasoning
+problem, IGE reaches 100% success rate 70.8% faster than the best classic graph
+search baseline. Next, in BabyAI-Text, a challenging partially observable
+gridworld, IGE exceeds the previous SOTA with orders of magnitude fewer online
+samples. Finally, in TextWorld, we show the unique ability of IGE to succeed in
+settings requiring long-horizon exploration where prior SOTA FM agents like
+Reflexion completely fail. Overall, IGE combines the tremendous strengths of
+FMs and the powerful Go-Explore algorithm, opening up a new frontier of
+research into creating more generally capable agents with impressive
+exploration capabilities.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Sibyl: Sensible Empathetic Dialogue Generation with Visionary
+  Commonsense Knowledge 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2311.15316v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2311.15316v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Lanrui Wang, Jiangnan Li, Chenxu Yang, Zheng Lin, Hongyin Tang, Huan Liu, Xiaolei Huang, Yanan Cao, Jingang Wang, Weiping Wang
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Recently, there has been a heightened interest in building chatbots based on
+Large Language Models (LLMs) to emulate human-like qualities in dialogues,
+including expressing empathy and offering emotional support. Despite having
+access to commonsense knowledge to better understand the psychological aspects
+and causality of dialogue context, even these powerful LLMs struggle to achieve
+the goals of empathy and emotional support. As current approaches do not
+adequately anticipate dialogue future, they may mislead language models to
+ignore complex dialogue goals of empathy and emotional support, resulting in
+unsupportive responses lacking empathy. To address this issue, we present an
+innovative framework named Sensible Empathetic Dialogue Generation with
+Visionary Commonsense Knowledge (Sibyl). Designed to concentrate on the
+imminent dialogue future, this paradigm directs LLMs toward the implicit
+requirements of the conversation, aiming to provide more sensible responses.
+Experimental results demonstrate that incorporating our paradigm for acquiring
+commonsense knowledge into LLMs comprehensively enhances the quality of their
+responses.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Work in progress</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Do LVLMs Understand Charts? Analyzing and Correcting Factual Errors in
+  Chart Captioning <span class="chip">ACL 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2312.10160v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2312.10160v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Kung-Hsiang Huang, Mingyang Zhou, Hou Pong Chan, Yi R. Fung, Zhenhailong Wang, Lingyu Zhang, Shih-Fu Chang, Heng Ji
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Recent advancements in large vision-language models (LVLMs) have led to
+significant progress in generating natural language descriptions for visual
+content and thus enhancing various applications. One issue with these powerful
+models is that they sometimes produce texts that are factually inconsistent
+with the visual input. While there has been some effort to mitigate such
+inconsistencies in natural image captioning, the factuality of generated
+captions for structured document images, such as charts, has not received as
+much scrutiny, posing a potential threat to information reliability in critical
+applications. This work delves into the factuality aspect by introducing a
+comprehensive typology of factual errors in generated chart captions. A
+large-scale human annotation effort provides insight into the error patterns
+and frequencies in captions crafted by various chart captioning models,
+ultimately forming the foundation of a novel dataset, CHOCOLATE. Our analysis
+reveals that even state-of-the-art models, including GPT-4V, frequently produce
+captions laced with factual inaccuracies. In response to this challenge, we
+establish the new task of Chart Caption Factual Error Correction and introduce
+CHARTVE, a model for visual entailment that outperforms proprietary and
+open-source LVLMs in evaluating factual consistency. Furthermore, we propose
+C2TFEC, an interpretable two-stage framework that excels at correcting factual
+errors. This work inaugurates a new domain in factual error correction for
+chart captions, presenting a novel evaluation mechanism, and demonstrating an
+effective approach to ensuring the factuality of generated chart captions. The
+code and data as well as the continuously updated benchmark can be found at:
+https://khuangaf.github.io/CHOCOLATE/.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>ACL 2024 Findings</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ SPA: Towards A Computational Friendly Cloud-Base and On-Devices
+  Collaboration Seq2seq Personalized Generation <span class="chip">SP</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2403.07088v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2403.07088v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Yanming Liu, Xinyue Peng, Jiannan Cao, Le Dai, Xingzu Liu, Weihao Liu, Mingbang Wang
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Large language models(LLMs) have shown its outperforming ability on various
+tasks and question answering. However, LLMs require substantial memory storage
+on low-resource devices. More critically, the computational speed on these
+devices is also severely limited. In this paper, we propose SPA(Side Plugin
+Adaption), a lightweight architecture for fast on-devices inference on the
+constraints of strict on-devices computation and memory constraints. Compared
+with other on-devices seq2seq generation, SPA could make a fast and stable
+inference on low-resource constraints, allowing it to obtain cost effiency. Our
+method establish an interaction between a pretrained LLMs on-cloud and additive
+parameters on-devices, which could provide the knowledge on both pretrained
+LLMs and featured personal feature. Further more, SPA provides a framework to
+keep feature-base parameters on low computational devices while leave the
+parameters containing general information on the high computational devices.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>15 pages, second version of SPA(Side Plugin Adaption)</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ A Multi-Perspective Analysis of Memorization in Large Language Models 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.11577v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.11577v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Bowen Chen, Namgi Han, Yusuke Miyao
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Large Language Models (LLMs), trained on massive corpora with billions of
+parameters, show unprecedented performance in various fields. Though surprised
+by their excellent performances, researchers also noticed some special
+behaviors of those LLMs. One of those behaviors is memorization, in which LLMs
+can generate the same content used to train them. Though previous research has
+discussed memorization, the memorization of LLMs still lacks explanation,
+especially the cause of memorization and the dynamics of generating them. In
+this research, we comprehensively discussed memorization from various
+perspectives and extended the discussion scope to not only just the memorized
+content but also less and unmemorized content. Through various studies, we
+found that: (1) Through experiments, we revealed the relation of memorization
+between model size, continuation size, and context size. Further, we showed how
+unmemorized sentences transition to memorized sentences. (2) Through embedding
+analysis, we showed the distribution and decoding dynamics across model size in
+embedding space for sentences with different memorization scores. The n-gram
+statistics analysis presents d (3) An analysis over n-gram and entropy decoding
+dynamics discovered a boundary effect when the model starts to generate
+memorized sentences or unmemorized sentences. (4)We trained a Transformer model
+to predict the memorization of different models, showing that it is possible to
+predict memorizations by context.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ RelayAttention for Efficient Large Language Model Serving with Long
+  System <span class="highlight-title">Prompt</span>s <span class="chip">ACL 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2402.14808v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2402.14808v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Lei Zhu, Xinjiang Wang, Wayne Zhang, Rynson W. H. Lau
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  A practical large language model (LLM) service may involve a long system
+prompt, which specifies the instructions, examples, and knowledge documents of
+the task and is reused across requests. However, the long system prompt causes
+throughput/latency bottlenecks as the cost of generating the next token grows
+w.r.t. the sequence length. This paper aims to improve the efficiency of LLM
+services that involve long system prompts. Our key observation is that handling
+these system prompts requires heavily redundant memory accesses in existing
+causal attention computation algorithms. Specifically, for batched requests,
+the cached hidden states (\ie, key-value pairs) of system prompts are
+transferred from off-chip DRAM to on-chip SRAM multiple times, each
+corresponding to an individual request. To eliminate such a redundancy, we
+propose RelayAttention, an attention algorithm that allows reading these hidden
+states from DRAM exactly once for a batch of input tokens. RelayAttention is a
+free lunch: it maintains the generation quality while requiring no model
+retraining, as it is based on a mathematical reformulation of causal attention.
+We have observed significant performance improvements to a production-level
+system, vLLM, through integration with RelayAttention. The improvements are
+even more profound with longer system prompts.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>accepted by the ACL 2024 main conference</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Instruct Once, Chat Consistently in Multiple Rounds: An Efficient Tuning
+  Framework for Dialogue <span class="chip">ACL 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2402.06967v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2402.06967v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Jian Wang, Chak Tou Leong, Jiashuo Wang, Dongding Lin, Wenjie Li, Xiao-Yong Wei
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Tuning language models for dialogue generation has been a prevalent paradigm
+for building capable dialogue agents. Yet, traditional tuning narrowly views
+dialogue generation as resembling other language generation tasks, ignoring the
+role disparities between two speakers and the multi-round interactive process
+that dialogues ought to be. Such a manner often leads to unsatisfactory chat
+consistency for the built agent. In this work, we emphasize the interactive,
+communicative nature of dialogue and argue that it is more feasible to model
+the speaker roles of agent and user separately, enabling the agent to adhere to
+its role consistently. With this in mind, we propose an efficient Multi-round
+Interactive Dialogue Tuning (Midi-Tuning) framework. It models the agent and
+user individually with two adapters built upon large language models. The
+adapters make use of respective utterances round by round in alternating order
+and they are tuned via a round-level memory caching mechanism. Extensive
+experiments demonstrate that, our framework performs superior to traditional
+fine-tuning and harbors the tremendous potential for improving dialogue
+consistency.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted by ACL 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ LLaMA Pro: Progressive LLaMA with Block Expansion <span class="chip">ACL 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2401.02415v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2401.02415v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Chengyue Wu, Yukang Gan, Yixiao Ge, Zeyu Lu, Jiahao Wang, Ye Feng, Ying Shan, Ping Luo
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Humans generally acquire new skills without compromising the old; however,
+the opposite holds for Large Language Models (LLMs), e.g., from LLaMA to
+CodeLLaMA. To this end, we propose a new post-pretraining method for LLMs with
+an expansion of Transformer blocks. We tune the expanded blocks using only new
+corpus, efficiently and effectively improving the model's knowledge without
+catastrophic forgetting. In this paper, we experiment on the corpus of code and
+math, yielding LLaMA Pro-8.3B, a versatile foundation model initialized from
+LLaMA2-7B, excelling in general tasks, programming, and mathematics. LLaMA Pro
+and its instruction-following counterpart (LLaMA Pro-Instruct) achieve advanced
+performance among various benchmarks, demonstrating superiority over existing
+open models in the LLaMA family and the immense potential of reasoning and
+addressing diverse tasks as an intelligent agent. Our findings provide valuable
+insights into integrating natural and programming languages, laying a solid
+foundation for developing advanced language agents that operate effectively in
+various environments.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted by ACL 2024, Main Conference</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Large Language Models as Zero-shot Dialogue State Tracker through
+  Function Calling <span class="chip">ACL 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2402.10466v4">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2402.10466v4.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Zekun Li, Zhiyu Zoey Chen, Mike Ross, Patrick Huber, Seungwhan Moon, Zhaojiang Lin, Xin Luna Dong, Adithya Sagar, Xifeng Yan, Paul A. Crook
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Large language models (LLMs) are increasingly prevalent in conversational
+systems due to their advanced understanding and generative capabilities in
+general contexts. However, their effectiveness in task-oriented dialogues
+(TOD), which requires not only response generation but also effective dialogue
+state tracking (DST) within specific tasks and domains, remains less
+satisfying. In this work, we propose a novel approach FnCTOD for solving DST
+with LLMs through function calling. This method improves zero-shot DST,
+allowing adaptation to diverse domains without extensive data collection or
+model tuning. Our experimental results demonstrate that our approach achieves
+exceptional performance with both modestly sized open-source and also
+proprietary LLMs: with in-context prompting it enables various 7B or 13B
+parameter models to surpass the previous state-of-the-art (SOTA) achieved by
+ChatGPT, and improves ChatGPT's performance beating the SOTA by 5.6% average
+joint goal accuracy (JGA). Individual model results for GPT-3.5 and GPT-4 are
+boosted by 4.8% and 14%, respectively. We also show that by fine-tuning on a
+small collection of diverse task-oriented dialogues, we can equip modestly
+sized models, specifically a 13B parameter LLaMA2-Chat model, with
+function-calling capabilities and DST performance comparable to ChatGPT while
+maintaining their chat capabilities. We have made the code publicly available
+at https://github.com/facebookresearch/FnCTOD
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>ACL 2024 Main. Code available at:
+  https://github.com/facebookresearch/FnCTOD</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ InferCept: Efficient Intercept Support for Augmented Large Language
+  Model Inference 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2402.01869v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2402.01869v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Reyna Abhyankar, Zijian He, Vikranth Srivatsa, Hao Zhang, Yiying Zhang
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Large language models are increasingly integrated with external environments,
+tools, and agents like ChatGPT plugins to extend their capability beyond
+language-centric tasks. However, today's LLM inference systems are designed for
+standalone LLMs. They treat each external interaction as the end of LLM
+generation and form a new request when the interaction finishes, causing
+unnecessary recomputation of already computed contexts, which accounts for
+37-40% of total model forwarding time. This paper presents InferCept, the first
+LLM inference framework targeting augmented LLMs and supporting the efficient
+interception of LLM generation. InferCept minimizes the GPU resource waste
+caused by LLM interceptions and dedicates saved memory for serving more
+requests. InferCept improves the overall serving throughput by 1.6x-2x and
+completes 2x more requests per second compared to the state-of-the-art LLM
+inference systems.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Patchscopes: A Unifying Framework for Inspecting Hidden Representations
+  of Language Models <span class="chip">ICML 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2401.06102v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2401.06102v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Asma Ghandeharioun, Avi Caciularu, Adam Pearce, Lucas Dixon, Mor Geva
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Understanding the internal representations of large language models (LLMs)
+can help explain models' behavior and verify their alignment with human values.
+Given the capabilities of LLMs in generating human-understandable text, we
+propose leveraging the model itself to explain its internal representations in
+natural language. We introduce a framework called Patchscopes and show how it
+can be used to answer a wide range of questions about an LLM's computation. We
+show that many prior interpretability methods based on projecting
+representations into the vocabulary space and intervening on the LLM
+computation can be viewed as instances of this framework. Moreover, several of
+their shortcomings such as failure in inspecting early layers or lack of
+expressivity can be mitigated by Patchscopes. Beyond unifying prior inspection
+techniques, Patchscopes also opens up new possibilities such as using a more
+capable model to explain the representations of a smaller model, and multihop
+reasoning error correction.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>ICML 2024 (to appear)</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Imp: Highly Capable Large Multimodal Models for Mobile Devices 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.12107v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.12107v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Zhenwei Shao, Zhou Yu, Jun Yu, Xuecheng Ouyang, Lihao Zheng, Zhenbiao Gai, Mingyang Wang, Jiajun Ding
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  By harnessing the capabilities of large language models (LLMs), recent large
+multimodal models (LMMs) have shown remarkable versatility in open-world
+multimodal understanding. Nevertheless, they are usually parameter-heavy and
+computation-intensive, thus hindering their applicability in
+resource-constrained scenarios. To this end, several lightweight LMMs have been
+proposed successively to maximize the capabilities under constrained scale
+(e.g., 3B). Despite the encouraging results achieved by these methods, most of
+them only focus on one or two aspects of the design space, and the key design
+choices that influence model capability have not yet been thoroughly
+investigated. In this paper, we conduct a systematic study for lightweight LMMs
+from the aspects of model architecture, training strategy, and training data.
+Based on our findings, we obtain Imp -- a family of highly capable LMMs at the
+2B-4B scales. Notably, our Imp-3B model steadily outperforms all the existing
+lightweight LMMs of similar size, and even surpasses the state-of-the-art LMMs
+at the 13B scale. With low-bit quantization and resolution reduction
+techniques, our Imp model can be deployed on a Qualcomm Snapdragon 8Gen3 mobile
+chip with a high inference speed of about 13 tokens/s.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>fix some typos and correct a few number in the tables</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ SLIDE: A Framework Integrating Small and Large Language Models for
+  Open-Domain Dialogues Evaluation <span class="chip">ACL2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.15924v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.15924v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Kun Zhao, Bohao Yang, Chen Tang, Chenghua Lin, Liang Zhan
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  The long-standing one-to-many problem of gold standard responses in
+open-domain dialogue systems presents challenges for automatic evaluation
+metrics. Though prior works have demonstrated some success by applying powerful
+Large Language Models (LLMs), existing approaches still struggle with the
+one-to-many problem, and exhibit subpar performance in domain-specific
+scenarios. We assume the commonsense reasoning biases within LLMs may hinder
+their performance in domainspecific evaluations. To address both issues, we
+propose a novel framework SLIDE (Small and Large Integrated for Dialogue
+Evaluation), that leverages both a small, specialised model (SLM), and LLMs for
+the evaluation of open domain dialogues. Our approach introduces several
+techniques: (1) Contrastive learning to differentiate between robust and
+non-robust response embeddings; (2) A novel metric for semantic sensitivity
+that combines embedding cosine distances with similarity learned through neural
+networks, and (3) a strategy for incorporating the evaluation results from both
+the SLM and LLMs. Our empirical results demonstrate that our approach achieves
+state-of-the-art performance in both the classification and evaluation tasks,
+and additionally the SLIDE evaluator exhibits better correlation with human
+judgements. Our code is available at https://
+github.com/hegehongcha/SLIDE-ACL2024.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted by ACL2024 Findings</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ ORLM: Training Large Language Models for Optimization Modeling 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.17743v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.17743v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Zhengyang Tang, Chenyu Huang, Xin Zheng, Shixi Hu, Zizhuo Wang, Dongdong Ge, Benyou Wang
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Large Language Models (LLMs) have emerged as powerful tools for tackling
+complex Operations Research (OR) problem by providing the capacity in
+automating optimization modeling. However, current methodologies heavily rely
+on prompt engineering (e.g., multi-agent cooperation) with proprietary LLMs,
+raising data privacy concerns that could be prohibitive in industry
+applications. To tackle this issue, we propose training open-source LLMs for
+optimization modeling. We identify four critical requirements for the training
+dataset of OR LLMs, design and implement OR-Instruct, a semi-automated process
+for creating synthetic data tailored to specific requirements. We also
+introduce the IndustryOR benchmark, the first industrial benchmark for testing
+LLMs on solving real-world OR problems. We apply the data from OR-Instruct to
+various open-source LLMs of 7b size (termed as ORLMs), resulting in a
+significantly improved capability for optimization modeling. Our
+best-performing ORLM achieves state-of-the-art performance on the NL4OPT, MAMO,
+and IndustryOR benchmarks. Our code and data are available at
+\url{https://github.com/Cardinal-Operations/ORLM}.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Work in progress</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Beyond the Answers: <span class="highlight-title">Review</span>ing the Rationality of Multiple Choice
+  Question Answering for the Evaluation of Large Language Models 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2402.01349v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2402.01349v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Haochun Wang, Sendong Zhao, Zewen Qiang, Nuwa Xi, Bing Qin, Ting Liu
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  In the field of natural language processing (NLP), Large Language Models
+(LLMs) have precipitated a paradigm shift, markedly enhancing performance in
+natural language generation tasks. Despite these advancements, the
+comprehensive evaluation of LLMs remains an inevitable challenge for the
+community. Recently, the utilization of Multiple Choice Question Answering
+(MCQA) as a benchmark for LLMs has gained considerable traction. This study
+first investigates the limitations of MCQA as an evaluation method for LLMs and
+then analyzes the fundamental reason for the limitations of MCQA, that while
+LLMs may select the correct answers, it is possible that they also recognize
+other wrong options as correct. Finally, we propose a dataset augmenting method
+for Multiple-Choice Questions (MCQs), MCQA+, that can more accurately reflect
+the performance of the model, which underscores the need for more robust
+evaluation mechanisms in assessing the performance of LLMs.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>17 pages, 8 figures</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Enhancing Low-Resource Relation Representations through Multi-View
+  Decoupling <span class="chip">AAAI 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2312.17267v4">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2312.17267v4.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Chenghao Fan, Wei Wei, Xiaoye Qu, Zhenyi Lu, Wenfeng Xie, Yu Cheng, Dangyang Chen
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Recently, prompt-tuning with pre-trained language models (PLMs) has
+demonstrated the significantly enhancing ability of relation extraction (RE)
+tasks. However, in low-resource scenarios, where the available training data is
+scarce, previous prompt-based methods may still perform poorly for prompt-based
+representation learning due to a superficial understanding of the relation. To
+this end, we highlight the importance of learning high-quality relation
+representation in low-resource scenarios for RE, and propose a novel
+prompt-based relation representation method, named MVRE
+(\underline{M}ulti-\underline{V}iew \underline{R}elation
+\underline{E}xtraction), to better leverage the capacity of PLMs to improve the
+performance of RE within the low-resource prompt-tuning paradigm. Specifically,
+MVRE decouples each relation into different perspectives to encompass
+multi-view relation representations for maximizing the likelihood during
+relation inference. Furthermore, we also design a Global-Local loss and a
+Dynamic-Initialization method for better alignment of the multi-view
+relation-representing virtual words, containing the semantics of relation
+labels during the optimization learning process and initialization. Extensive
+experiments on three benchmark datasets show that our method can achieve
+state-of-the-art in low-resource settings.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted to AAAI 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Online antisemitism across platforms 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2112.07783v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2112.07783v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Tom De Smedt
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  We created a fine-grained AI system for the detection of antisemitism. This
+Explainable AI will identify English and German anti-Semitic expressions of
+dehumanization, verbal aggression and conspiracies in online social media
+messages across platforms, to support high-level decision making.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>6 pages</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Sparse Matrix in Large Language Model Fine-tuning 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.15525v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.15525v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Haoze He, Juncheng Billy Li, Xuan Jiang, Heather Miller
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  LoRA and its variants have become popular parameter-efficient fine-tuning
+(PEFT) methods due to their ability to avoid excessive computational costs.
+However, an accuracy gap often exists between PEFT methods and full fine-tuning
+(FT), and this gap has yet to be systematically studied. In this work, we
+introduce a method for selecting sparse sub-matrices that aim to minimize the
+performance gap between PEFT vs. full fine-tuning (FT) while also reducing both
+fine-tuning computational cost and memory cost. Our Sparse Matrix Tuning (SMT)
+method begins by identifying the most significant sub-matrices in the gradient
+update, updating only these blocks during the fine-tuning process. In our
+experiments, we demonstrate that SMT consistently surpasses other PEFT baseline
+(e.g. LoRA and DoRA) in fine-tuning popular large language models such as LLaMA
+across a broad spectrum of tasks, while reducing the GPU memory footprint by
+67% compared to FT. We also examine how the performance of LoRA and DoRA tends
+to plateau and decline as the number of trainable parameters increases, in
+contrast, our SMT method does not suffer from such issue.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>14 pages</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Hypothesis Search: Inductive Reasoning with Language Models <span class="chip">ICLR 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2309.05660v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2309.05660v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Ruocheng Wang, Eric Zelikman, Gabriel Poesia, Yewen Pu, Nick Haber, Noah D. Goodman
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Inductive reasoning is a core problem-solving capacity: humans can identify
+underlying principles from a few examples, which robustly generalize to novel
+scenarios. Recent work evaluates large language models (LLMs) on inductive
+reasoning tasks by directly prompting them yielding "in context learning." This
+works well for straightforward inductive tasks but performs poorly on complex
+tasks such as the Abstraction and Reasoning Corpus (ARC). In this work, we
+propose to improve the inductive reasoning ability of LLMs by generating
+explicit hypotheses at multiple levels of abstraction: we prompt the LLM to
+propose multiple abstract hypotheses about the problem, in natural language,
+then implement the natural language hypotheses as concrete Python programs.
+These programs can be verified by running on observed examples and generalized
+to novel inputs. To reduce the hypothesis search space, we explore steps to
+filter the set of hypotheses to implement: we either ask the LLM to summarize
+them into a smaller set of hypotheses or ask human annotators to select a
+subset. We verify our pipeline's effectiveness on the ARC visual inductive
+reasoning benchmark, its variant 1D-ARC, string transformation dataset SyGuS,
+and list transformation dataset List Functions. On a random 100-problem subset
+of ARC, our automated pipeline using LLM summaries achieves 30% accuracy,
+outperforming the direct prompting baseline (accuracy of 17%). With the minimal
+human input of selecting from LLM-generated candidates, performance is boosted
+to 33%. Our ablations show that both abstract hypothesis generation and
+concrete program representations benefit LLMs on inductive reasoning tasks.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>ICLR 2024. The first two authors contributed equally. Code:
+  https://github.com/Relento/hypothesis_search</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ AgentClinic: a multimodal agent benchmark to evaluate AI in simulated
+  clinical environments 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.07960v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.07960v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Samuel Schmidgall, Rojin Ziaei, Carl Harris, Eduardo Reis, Jeffrey Jopling, Michael Moor
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Diagnosing and managing a patient is a complex, sequential decision making
+process that requires physicians to obtain information -- such as which tests
+to perform -- and to act upon it. Recent advances in artificial intelligence
+(AI) and large language models (LLMs) promise to profoundly impact clinical
+care. However, current evaluation schemes overrely on static medical
+question-answering benchmarks, falling short on interactive decision-making
+that is required in real-life clinical work. Here, we present AgentClinic: a
+multimodal benchmark to evaluate LLMs in their ability to operate as agents in
+simulated clinical environments. In our benchmark, the doctor agent must
+uncover the patient's diagnosis through dialogue and active data collection. We
+present two open medical agent benchmarks: a multimodal image and dialogue
+environment, AgentClinic-NEJM, and a dialogue-only environment,
+AgentClinic-MedQA. We embed cognitive and implicit biases both in patient and
+doctor agents to emulate realistic interactions between biased agents. We find
+that introducing bias leads to large reductions in diagnostic accuracy of the
+doctor agents, as well as reduced compliance, confidence, and follow-up
+consultation willingness in patient agents. Evaluating a suite of
+state-of-the-art LLMs, we find that several models that excel in benchmarks
+like MedQA are performing poorly in AgentClinic-MedQA. We find that the LLM
+used in the patient agent is an important factor for performance in the
+AgentClinic benchmark. We show that both having limited interactions as well as
+too many interaction reduces diagnostic accuracy in doctor agents. The code and
+data for this work is publicly available at https://AgentClinic.github.io.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ A Joint Study of Phrase Grounding and Task Performance in Vision and
+  Language Models 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2309.02691v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2309.02691v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Noriyuki Kojima, Hadar Averbuch-Elor, Yoav Artzi
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Key to tasks that require reasoning about natural language in visual contexts
+is grounding words and phrases to image regions. However, observing this
+grounding in contemporary models is complex, even if it is generally expected
+to take place if the task is addressed in a way that is conductive to
+generalization. We propose a framework to jointly study task performance and
+phrase grounding, and propose three benchmarks to study the relation between
+the two. Our results show that contemporary models demonstrate inconsistency
+between their ability to ground phrases and solve tasks. We show how this can
+be addressed through brute-force training on ground phrasing annotations, and
+analyze the dynamics it creates. Code and at available at
+https://github.com/lil-lab/phrase_grounding.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>This was published in TMLR in 2024, on January 24th</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ GUARD: Role-playing to Generate Natural-language Jailbreakings to Test
+  Guideline Adherence of Large Language Models 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2402.03299v4">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2402.03299v4.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Haibo Jin, Ruoxi Chen, Andy Zhou, Yang Zhang, Haohan Wang
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  The discovery of "jailbreaks" to bypass safety filters of Large Language
+Models (LLMs) and harmful responses have encouraged the community to implement
+safety measures. One major safety measure is to proactively test the LLMs with
+jailbreaks prior to the release. Therefore, such testing will require a method
+that can generate jailbreaks massively and efficiently. In this paper, we
+follow a novel yet intuitive strategy to generate jailbreaks in the style of
+the human generation. We propose a role-playing system that assigns four
+different roles to the user LLMs to collaborate on new jailbreaks. Furthermore,
+we collect existing jailbreaks and split them into different independent
+characteristics using clustering frequency and semantic patterns sentence by
+sentence. We organize these characteristics into a knowledge graph, making them
+more accessible and easier to retrieve. Our system of different roles will
+leverage this knowledge graph to generate new jailbreaks, which have proved
+effective in inducing LLMs to generate unethical or guideline-violating
+responses. In addition, we also pioneer a setting in our system that will
+automatically follow the government-issued guidelines to generate jailbreaks to
+test whether LLMs follow the guidelines accordingly. We refer to our system as
+GUARD (Guideline Upholding through Adaptive Role-play Diagnostics). We have
+empirically validated the effectiveness of GUARD on three cutting-edge
+open-sourced LLMs (Vicuna-13B, LongChat-7B, and Llama-2-7B), as well as a
+widely-utilized commercial LLM (ChatGPT). Moreover, our work extends to the
+realm of vision language models (MiniGPT-v2 and Gemini Vision Pro), showcasing
+GUARD's versatility and contributing valuable insights for the development of
+safer, more reliable LLM-based applications across diverse modalities.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>28 papges</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Mining experimental data from Materials Science literature with Large
+  Language Models: an evaluation study 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2401.11052v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2401.11052v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Luca Foppiano, Guillaume Lambard, Toshiyuki Amagasa, Masashi Ishii
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  This study is dedicated to assessing the capabilities of large language
+models (LLMs) such as GPT-3.5-Turbo, GPT-4, and GPT-4-Turbo in extracting
+structured information from scientific documents in materials science. To this
+end, we primarily focus on two critical tasks of information extraction: (i) a
+named entity recognition (NER) of studied materials and physical properties and
+(ii) a relation extraction (RE) between these entities. Due to the evident lack
+of datasets within Materials Informatics (MI), we evaluated using SuperMat,
+based on superconductor research, and MeasEval, a generic measurement
+evaluation corpus. The performance of LLMs in executing these tasks is
+benchmarked against traditional models based on the BERT architecture and
+rule-based approaches (baseline). We introduce a novel methodology for the
+comparative analysis of intricate material expressions, emphasising the
+standardisation of chemical formulas to tackle the complexities inherent in
+materials science information assessment. For NER, LLMs fail to outperform the
+baseline with zero-shot prompting and exhibit only limited improvement with
+few-shot prompting. However, a GPT-3.5-Turbo fine-tuned with the appropriate
+strategy for RE outperforms all models, including the baseline. Without any
+fine-tuning, GPT-4 and GPT-4-Turbo display remarkable reasoning and
+relationship extraction capabilities after being provided with merely a couple
+of examples, surpassing the baseline. Overall, the results suggest that
+although LLMs demonstrate relevant reasoning skills in connecting concepts,
+specialised models are currently a better choice for tasks requiring extracting
+complex domain-specific entities like materials. These insights provide initial
+guidance applicable to other materials science sub-domains in future work.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>40 pages: 5 figures and 1 table in the body. 32 Tables in the
+  Appendix / Supplementary materials</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Contrastive Learning and Mixture of Experts Enables Precise Vector
+  Embeddings 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2401.15713v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2401.15713v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Logan Hallee, Rohan Kapur, Arjun Patel, Jason P. Gleghorn, Bohdan Khomtchouk
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  The advancement of transformer neural networks has significantly elevated the
+capabilities of sentence similarity models, but they struggle with highly
+discriminative tasks and produce sub-optimal representations of important
+documents like scientific literature. With the increased reliance on retrieval
+augmentation and search, representing diverse documents as concise and
+descriptive vectors is crucial. This paper improves upon the vectors embeddings
+of scientific literature by assembling niche datasets using co-citations as a
+similarity metric, focusing on biomedical domains. We apply a novel Mixture of
+Experts (MoE) extension pipeline to pretrained BERT models, where every
+multi-layer perceptron section is enlarged and copied into multiple distinct
+experts. Our MoE variants perform well over $N$ scientific domains with $N$
+dedicated experts, whereas standard BERT models excel in only one domain.
+Notably, extending just a single transformer block to MoE captures 85% of the
+benefit seen from full MoE extension at every layer. This holds promise for
+versatile and efficient One-Size-Fits-All transformer networks for numerically
+representing diverse inputs. Our methodology marks significant advancements in
+representing scientific text and holds promise for enhancing vector database
+search and compilation.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Efficient <span class="highlight-title">Prompt</span> Optimization Through the Lens of Best Arm
+  Identification 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2402.09723v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2402.09723v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Chengshuai Shi, Kun Yang, Zihan Chen, Jundong Li, Jing Yang, Cong Shen
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  The remarkable instruction-following capability of large language models
+(LLMs) has sparked a growing interest in automatically finding good prompts,
+i.e., prompt optimization. Most existing works follow the scheme of selecting
+from a pre-generated pool of candidate prompts. However, these designs mainly
+focus on the generation strategy, while limited attention has been paid to the
+selection method. Especially, the cost incurred during the selection (e.g.,
+accessing LLM and evaluating the responses) is rarely explicitly considered. To
+overcome this limitation, this work provides a principled framework, TRIPLE, to
+efficiently perform prompt selection under an explicit budget constraint.
+TRIPLE is built on a novel connection established between prompt optimization
+and fixed-budget best arm identification (BAI-FB) in multi-armed bandits (MAB);
+thus, it is capable of leveraging the rich toolbox from BAI-FB systematically
+and also incorporating unique characteristics of prompt optimization. Extensive
+experiments on multiple well-adopted tasks using various LLMs demonstrate the
+remarkable performance improvement of TRIPLE over baselines while satisfying
+the limited budget constraints. As an extension, variants of TRIPLE are
+proposed to efficiently select examples for few-shot prompts, also achieving
+superior empirical performance.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ SWE-agent: Agent-Computer Interfaces Enable Automated Software
+  Engineering 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.15793v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.15793v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        John Yang, Carlos E. Jimenez, Alexander Wettig, Kilian Lieret, Shunyu Yao, Karthik Narasimhan, Ofir Press
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Language model (LM) agents are increasingly being used to automate
+complicated tasks in digital environments. Just as humans benefit from powerful
+software applications, such as integrated development environments, for complex
+tasks like software engineering, we posit that LM agents represent a new
+category of end users with their own needs and abilities, and would benefit
+from specially-built interfaces to the software they use. We investigate how
+interface design affects the performance of language model agents. As a result
+of this exploration, we introduce SWE-agent: a system that facilitates LM
+agents to autonomously use computers to solve software engineering tasks.
+SWE-agent's custom agent-computer interface (ACI) significantly enhances an
+agent's ability to create and edit code files, navigate entire repositories,
+and execute tests and other programs. We evaluate SWE-agent on SWE-bench and
+HumanEvalFix, achieving state-of-the-art performance on both with a pass@1 rate
+of 12.5% and 87.7%, respectively, far exceeding the previous state-of-the-art
+achieved with non-interactive LMs. Finally, we provide insight on how the
+design of the ACI can impact agents' behavior and performance.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Code, data, and demo available at https://swe-agent.com</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Efficacy of ByT5 in Multilingual Translation of Biblical Texts for
+  Underrepresented Languages <span class="chip">NAACL 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.13350v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.13350v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Corinne Aars, Lauren Adams, Xiaokan Tian, Zhaoyu Wang, Colton Wismer, Jason Wu, Pablo Rivas, Korn Sooksatra, Matthew Fendt
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  This study presents the development and evaluation of a ByT5-based
+multilingual translation model tailored for translating the Bible into
+underrepresented languages. Utilizing the comprehensive Johns Hopkins
+University Bible Corpus, we trained the model to capture the intricate nuances
+of character-based and morphologically rich languages. Our results, measured by
+the BLEU score and supplemented with sample translations, suggest the model can
+improve accessibility to sacred texts. It effectively handles the distinctive
+biblical lexicon and structure, thus bridging the linguistic divide. The study
+also discusses the model's limitations and suggests pathways for future
+enhancements, focusing on expanding access to sacred literature across
+linguistic boundaries.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>LXAI Workshop at the 2024 Annual Conference of the North American
+  Chapter of the Association for Computational Linguistics (NAACL 2024)</span>
+                                        </div>
+                                </details>
+                            </article>
+                    </div>
+                </details>
+            </article>
+            <article>
+                <details>
+                    <Summary>
+                        Computer Vision and Pattern Recognition <span class="chip" style="font-size: 60%">175</span>
+                    </Summary>
+                    <div class="details-content">
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Unique3D: High-Quality and Efficient 3D Mesh Generation from a Single
+  Image 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20343v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20343v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Kailu Wu, Fangfu Liu, Zhihan Cai, Runjie Yan, Hanyang Wang, Yating Hu, Yueqi Duan, Kaisheng Ma
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  In this work, we introduce Unique3D, a novel image-to-3D framework for
+efficiently generating high-quality 3D meshes from single-view images,
+featuring state-of-the-art generation fidelity and strong generalizability.
+Previous methods based on Score Distillation Sampling (SDS) can produce
+diversified 3D results by distilling 3D knowledge from large 2D diffusion
+models, but they usually suffer from long per-case optimization time with
+inconsistent issues. Recent works address the problem and generate better 3D
+results either by finetuning a multi-view diffusion model or training a fast
+feed-forward model. However, they still lack intricate textures and complex
+geometries due to inconsistency and limited generated resolution. To
+simultaneously achieve high fidelity, consistency, and efficiency in single
+image-to-3D, we propose a novel framework Unique3D that includes a multi-view
+diffusion model with a corresponding normal diffusion model to generate
+multi-view images with their normal maps, a multi-level upscale process to
+progressively improve the resolution of generated orthographic multi-views, as
+well as an instant and consistent mesh reconstruction algorithm called ISOMER,
+which fully integrates the color and geometric priors into mesh results.
+Extensive experiments demonstrate that our Unique3D significantly outperforms
+other image-to-3D baselines in terms of geometric and textural details.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Project page: https://wukailu.github.io/Unique3D</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ MotionLLM: Understanding Human Behaviors from Human Motions and Videos 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20340v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20340v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Ling-Hao Chen, Shunlin Lu, Ailing Zeng, Hao Zhang, Benyou Wang, Ruimao Zhang, Lei Zhang
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  This study delves into the realm of multi-modality (i.e., video and motion
+modalities) human behavior understanding by leveraging the powerful
+capabilities of Large Language Models (LLMs). Diverging from recent LLMs
+designed for video-only or motion-only understanding, we argue that
+understanding human behavior necessitates joint modeling from both videos and
+motion sequences (e.g., SMPL sequences) to capture nuanced body part dynamics
+and semantics effectively. In light of this, we present MotionLLM, a
+straightforward yet effective framework for human motion understanding,
+captioning, and reasoning. Specifically, MotionLLM adopts a unified
+video-motion training strategy that leverages the complementary advantages of
+existing coarse video-text data and fine-grained motion-text data to glean rich
+spatial-temporal insights. Furthermore, we collect a substantial dataset,
+MoVid, comprising diverse videos, motions, captions, and instructions.
+Additionally, we propose the MoVid-Bench, with carefully manual annotations,
+for better evaluation of human behavior understanding on video and motion.
+Extensive experiments show the superiority of MotionLLM in the caption,
+spatial-temporal comprehension, and reasoning ability.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>MotionLLM version 1.0, project page see https://lhchen.top/MotionLLM</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Visual Perception by Large Language Model's Weights 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20339v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20339v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Feipeng Ma, Hongwei Xue, Guangting Wang, Yizhou Zhou, Fengyun Rao, Shilin Yan, Yueyi Zhang, Siying Wu, Mike Zheng Shou, Xiaoyan Sun
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Existing Multimodal Large Language Models (MLLMs) follow the paradigm that
+perceives visual information by aligning visual features with the input space
+of Large Language Models (LLMs), and concatenating visual tokens with text
+tokens to form a unified sequence input for LLMs. These methods demonstrate
+promising results on various vision-language tasks but are limited by the high
+computational effort due to the extended input sequence resulting from the
+involvement of visual tokens. In this paper, instead of input space alignment,
+we propose a novel parameter space alignment paradigm that represents visual
+information as model weights. For each input image, we use a vision encoder to
+extract visual features, convert features into perceptual weights, and merge
+the perceptual weights with LLM's weights. In this way, the input of LLM does
+not require visual tokens, which reduces the length of the input sequence and
+greatly improves efficiency. Following this paradigm, we propose VLoRA with the
+perceptual weights generator. The perceptual weights generator is designed to
+convert visual features to perceptual weights with low-rank property,
+exhibiting a form similar to LoRA. The experimental results show that our VLoRA
+achieves comparable performance on various benchmarks for MLLMs, while
+significantly reducing the computational costs for both training and inference.
+The code and models will be made open-source.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ OccSora: 4D Occupancy Generation Models as World Simulators for
+  Autonomous Driving 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20337v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20337v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Lening Wang, Wenzhao Zheng, Yilong Ren, Han Jiang, Zhiyong Cui, Haiyang Yu, Jiwen Lu
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Understanding the evolution of 3D scenes is important for effective
+autonomous driving. While conventional methods mode scene development with the
+motion of individual instances, world models emerge as a generative framework
+to describe the general scene dynamics. However, most existing methods adopt an
+autoregressive framework to perform next-token prediction, which suffer from
+inefficiency in modeling long-term temporal evolutions. To address this, we
+propose a diffusion-based 4D occupancy generation model, OccSora, to simulate
+the development of the 3D world for autonomous driving. We employ a 4D scene
+tokenizer to obtain compact discrete spatial-temporal representations for 4D
+occupancy input and achieve high-quality reconstruction for long-sequence
+occupancy videos. We then learn a diffusion transformer on the spatial-temporal
+representations and generate 4D occupancy conditioned on a trajectory prompt.
+We conduct extensive experiments on the widely used nuScenes dataset with Occ3D
+occupancy annotations. OccSora can generate 16s-videos with authentic 3D layout
+and temporal consistency, demonstrating its ability to understand the spatial
+and temporal distributions of driving scenes. With trajectory-aware 4D
+generation, OccSora has the potential to serve as a world simulator for the
+decision-making of autonomous driving. Code is available at:
+https://github.com/wzzheng/OccSora.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Code is available at: https://github.com/wzzheng/OccSora</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ RapVerse: Coherent Vocals and Whole-Body Motions Generations from Text 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20336v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20336v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Jiaben Chen, Xin Yan, Yihang Chen, Siyuan Cen, Qinwei Ma, Haoyu Zhen, Kaizhi Qian, Lie Lu, Chuang Gan
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  In this work, we introduce a challenging task for simultaneously generating
+3D holistic body motions and singing vocals directly from textual lyrics
+inputs, advancing beyond existing works that typically address these two
+modalities in isolation. To facilitate this, we first collect the RapVerse
+dataset, a large dataset containing synchronous rapping vocals, lyrics, and
+high-quality 3D holistic body meshes. With the RapVerse dataset, we investigate
+the extent to which scaling autoregressive multimodal transformers across
+language, audio, and motion can enhance the coherent and realistic generation
+of vocals and whole-body human motions. For modality unification, a
+vector-quantized variational autoencoder is employed to encode whole-body
+motion sequences into discrete motion tokens, while a vocal-to-unit model is
+leveraged to obtain quantized audio tokens preserving content, prosodic
+information, and singer identity. By jointly performing transformer modeling on
+these three modalities in a unified way, our framework ensures a seamless and
+realistic blend of vocals and human motions. Extensive experiments demonstrate
+that our unified generation framework not only produces coherent and realistic
+singing vocals alongside human motions directly from textual inputs but also
+rivals the performance of specialized single-modality generation systems,
+establishing new benchmarks for joint vocal-motion generation. The project page
+is available for research purposes at https://vis-www.cs.umass.edu/RapVerse.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Project website: https://vis-www.cs.umass.edu/RapVerse</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ VividDream: Generating 3D Scene with Ambient Dynamics 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20334v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20334v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Yao-Chih Lee, Yi-Ting Chen, Andrew Wang, Ting-Hsuan Liao, Brandon Y. Feng, Jia-Bin Huang
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  We introduce VividDream, a method for generating explorable 4D scenes with
+ambient dynamics from a single input image or text prompt. VividDream first
+expands an input image into a static 3D point cloud through iterative
+inpainting and geometry merging. An ensemble of animated videos is then
+generated using video diffusion models with quality refinement techniques and
+conditioned on renderings of the static 3D scene from the sampled camera
+trajectories. We then optimize a canonical 4D scene representation using an
+animated video ensemble, with per-video motion embeddings and visibility masks
+to mitigate inconsistencies. The resulting 4D scene enables free-view
+exploration of a 3D scene with plausible ambient scene dynamics. Experiments
+demonstrate that VividDream can provide human viewers with compelling 4D
+experiences generated based on diverse real images and text prompts.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Project page: https://vivid-dream-4d.github.io</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ SurgiTrack: Fine-Grained Multi-Class Multi-Tool Tracking in Surgical
+  Videos 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20333v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20333v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Chinedu Innocent Nwoye, Nicolas Padoy
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Accurate tool tracking is essential for the success of computer-assisted
+intervention. Previous efforts often modeled tool trajectories rigidly,
+overlooking the dynamic nature of surgical procedures, especially tracking
+scenarios like out-of-body and out-of-camera views. Addressing this limitation,
+the new CholecTrack20 dataset provides detailed labels that account for
+multiple tool trajectories in three perspectives: (1) intraoperative, (2)
+intracorporeal, and (3) visibility, representing the different types of
+temporal duration of tool tracks. These fine-grained labels enhance tracking
+flexibility but also increase the task complexity. Re-identifying tools after
+occlusion or re-insertion into the body remains challenging due to high visual
+similarity, especially among tools of the same category. This work recognizes
+the critical role of the tool operators in distinguishing tool track instances,
+especially those belonging to the same tool category. The operators'
+information are however not explicitly captured in surgical videos. We
+therefore propose SurgiTrack, a novel deep learning method that leverages
+YOLOv7 for precise tool detection and employs an attention mechanism to model
+the originating direction of the tools, as a proxy to their operators, for tool
+re-identification. To handle diverse tool trajectory perspectives, SurgiTrack
+employs a harmonizing bipartite matching graph, minimizing conflicts and
+ensuring accurate tool identity association. Experimental results on
+CholecTrack20 demonstrate SurgiTrack's effectiveness, outperforming baselines
+and state-of-the-art methods with real-time inference capability. This work
+sets a new standard in surgical tool tracking, providing dynamic trajectories
+for more adaptable and precise assistance in minimally invasive surgeries.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>15 pages, 7 figures, 9 tables, 1 video. Supplementary video available
+  at: https://vimeo.com/951853260</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ 4DHands: Reconstructing Interactive Hands in 4D with <span class="highlight-title">Transformer</span>s 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20330v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20330v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Dixuan Lin, Yuxiang Zhang, Mengcheng Li, Yebin Liu, Wei Jing, Qi Yan, Qianying Wang, Hongwen Zhang
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  In this paper, we introduce 4DHands, a robust approach to recovering
+interactive hand meshes and their relative movement from monocular inputs. Our
+approach addresses two major limitations of previous methods: lacking a unified
+solution for handling various hand image inputs and neglecting the positional
+relationship of two hands within images. To overcome these challenges, we
+develop a transformer-based architecture with novel tokenization and feature
+fusion strategies. Specifically, we propose a Relation-aware Two-Hand
+Tokenization (RAT) method to embed positional relation information into the
+hand tokens. In this way, our network can handle both single-hand and two-hand
+inputs and explicitly leverage relative hand positions, facilitating the
+reconstruction of intricate hand interactions in real-world scenarios. As such
+tokenization indicates the relative relationship of two hands, it also supports
+more effective feature fusion. To this end, we further develop a
+Spatio-temporal Interaction Reasoning (SIR) module to fuse hand tokens in 4D
+with attention and decode them into 3D hand meshes and relative temporal
+movements. The efficacy of our approach is validated on several benchmark
+datasets. The results on in-the-wild videos and real-world scenarios
+demonstrate the superior performances of our approach for interactive hand
+reconstruction. More video results can be found on the project page:
+https://4dhands.github.io.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>More demo videos can be seen at our project page:
+  https://4dhands.github.io</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ GECO: Generative Image-to-3D within a SECOnd 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20327v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20327v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Chen Wang, Jiatao Gu, Xiaoxiao Long, Yuan Liu, Lingjie Liu
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  3D generation has seen remarkable progress in recent years. Existing
+techniques, such as score distillation methods, produce notable results but
+require extensive per-scene optimization, impacting time efficiency.
+Alternatively, reconstruction-based approaches prioritize efficiency but
+compromise quality due to their limited handling of uncertainty. We introduce
+GECO, a novel method for high-quality 3D generative modeling that operates
+within a second. Our approach addresses the prevalent issues of uncertainty and
+inefficiency in current methods through a two-stage approach. In the initial
+stage, we train a single-step multi-view generative model with score
+distillation. Then, a second-stage distillation is applied to address the
+challenge of view inconsistency from the multi-view prediction. This two-stage
+process ensures a balanced approach to 3D generation, optimizing both quality
+and efficiency. Our comprehensive experiments demonstrate that GECO achieves
+high-quality image-to-3D generation with an unprecedented level of efficiency.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Project Page: https://cwchenwang.github.io/geco</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ MotionFollower: Editing Video Motion via Lightweight Score-Guided
+  Diffusion 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20325v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20325v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Shuyuan Tu, Qi Dai, Zihao Zhang, Sicheng Xie, Zhi-Qi Cheng, Chong Luo, Xintong Han, Zuxuan Wu, Yu-Gang Jiang
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Despite impressive advancements in diffusion-based video editing models in
+altering video attributes, there has been limited exploration into modifying
+motion information while preserving the original protagonist's appearance and
+background. In this paper, we propose MotionFollower, a lightweight
+score-guided diffusion model for video motion editing. To introduce conditional
+controls to the denoising process, MotionFollower leverages two of our proposed
+lightweight signal controllers, one for poses and the other for appearances,
+both of which consist of convolution blocks without involving heavy attention
+calculations. Further, we design a score guidance principle based on a
+two-branch architecture, including the reconstruction and editing branches,
+which significantly enhance the modeling capability of texture details and
+complicated backgrounds. Concretely, we enforce several consistency
+regularizers and losses during the score estimation. The resulting gradients
+thus inject appropriate guidance to the intermediate latents, forcing the model
+to preserve the original background details and protagonists' appearances
+without interfering with the motion modification. Experiments demonstrate the
+competitive motion editing ability of MotionFollower qualitatively and
+quantitatively. Compared with MotionEditor, the most advanced motion editing
+model, MotionFollower achieves an approximately 80% reduction in GPU memory
+while delivering superior motion editing performance and exclusively supporting
+large camera movements and actions.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>23 pages, 18 figures. Project page at
+  https://francis-rings.github.io/MotionFollower/</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Don't drop your samples! Coherence-aware training benefits Conditional
+  diffusion <span class="chip">CVPR 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20324v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20324v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Nicolas Dufour, Victor Besnier, Vicky Kalogeiton, David Picard
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Conditional diffusion models are powerful generative models that can leverage
+various types of conditional information, such as class labels, segmentation
+masks, or text captions. However, in many real-world scenarios, conditional
+information may be noisy or unreliable due to human annotation errors or weak
+alignment. In this paper, we propose the Coherence-Aware Diffusion (CAD), a
+novel method that integrates coherence in conditional information into
+diffusion models, allowing them to learn from noisy annotations without
+discarding data. We assume that each data point has an associated coherence
+score that reflects the quality of the conditional information. We then
+condition the diffusion model on both the conditional information and the
+coherence score. In this way, the model learns to ignore or discount the
+conditioning when the coherence is low. We show that CAD is theoretically sound
+and empirically effective on various conditional generation tasks. Moreover, we
+show that leveraging coherence generates realistic and diverse samples that
+respect conditional information better than models trained on cleaned datasets
+where samples with low coherence have been discarded.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted at CVPR 2024 as a Highlight. Project page:
+  https://nicolas-dufour.github.io/cad.html</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ $\textit{S}^3$Gaussian: <span class="highlight-title">Self-Supervised</span> Street Gaussians for Autonomous
+  Driving 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20323v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20323v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Nan Huang, Xiaobao Wei, Wenzhao Zheng, Pengju An, Ming Lu, Wei Zhan, Masayoshi Tomizuka, Kurt Keutzer, Shanghang Zhang
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Photorealistic 3D reconstruction of street scenes is a critical technique for
+developing real-world simulators for autonomous driving. Despite the efficacy
+of Neural Radiance Fields (NeRF) for driving scenes, 3D Gaussian Splatting
+(3DGS) emerges as a promising direction due to its faster speed and more
+explicit representation. However, most existing street 3DGS methods require
+tracked 3D vehicle bounding boxes to decompose the static and dynamic elements
+for effective reconstruction, limiting their applications for in-the-wild
+scenarios. To facilitate efficient 3D scene reconstruction without costly
+annotations, we propose a self-supervised street Gaussian
+($\textit{S}^3$Gaussian) method to decompose dynamic and static elements from
+4D consistency. We represent each scene with 3D Gaussians to preserve the
+explicitness and further accompany them with a spatial-temporal field network
+to compactly model the 4D dynamics. We conduct extensive experiments on the
+challenging Waymo-Open dataset to evaluate the effectiveness of our method. Our
+$\textit{S}^3$Gaussian demonstrates the ability to decompose static and dynamic
+scenes and achieves the best performance without using 3D annotations. Code is
+available at: https://github.com/nnanhuang/S3Gaussian/.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Code is available at: https://github.com/nnanhuang/S3Gaussian/</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Vision-based Manipulation from Single Human Video with Open-World Object
+  Graphs 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20321v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20321v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Yifeng Zhu, Arisrei Lim, Peter Stone, Yuke Zhu
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  We present an object-centric approach to empower robots to learn vision-based
+manipulation skills from human videos. We investigate the problem of imitating
+robot manipulation from a single human video in the open-world setting, where a
+robot must learn to manipulate novel objects from one video demonstration. We
+introduce ORION, an algorithm that tackles the problem by extracting an
+object-centric manipulation plan from a single RGB-D video and deriving a
+policy that conditions on the extracted plan. Our method enables the robot to
+learn from videos captured by daily mobile devices such as an iPad and
+generalize the policies to deployment environments with varying visual
+backgrounds, camera angles, spatial layouts, and novel object instances. We
+systematically evaluate our method on both short-horizon and long-horizon
+tasks, demonstrating the efficacy of ORION in learning from a single human
+video in the open world. Videos can be found in the project website
+https://ut-austin-rpl.github.io/ORION-release.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Improving the Training of Rectified Flows 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20320v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20320v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Sangyun Lee, Zinan Lin, Giulia Fanti
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Diffusion models have shown great promise for image and video generation, but
+sampling from state-of-the-art models requires expensive numerical integration
+of a generative ODE. One approach for tackling this problem is rectified flows,
+which iteratively learn smooth ODE paths that are less susceptible to
+truncation error. However, rectified flows still require a relatively large
+number of function evaluations (NFEs). In this work, we propose improved
+techniques for training rectified flows, allowing them to compete with
+knowledge distillation methods even in the low NFE setting. Our main insight is
+that under realistic settings, a single iteration of the Reflow algorithm for
+training rectified flows is sufficient to learn nearly straight trajectories;
+hence, the current practice of using multiple Reflow iterations is unnecessary.
+We thus propose techniques to improve one-round training of rectified flows,
+including a U-shaped timestep distribution and LPIPS-Huber premetric. With
+these techniques, we improve the FID of the previous 2-rectified flow by up to
+72% in the 1 NFE setting on CIFAR-10. On ImageNet 64$\times$64, our improved
+rectified flow outperforms the state-of-the-art distillation methods such as
+consistency distillation and progressive distillation in both one-step and
+two-step settings and rivals the performance of improved consistency training
+(iCT) in FID. Code is available at https://github.com/sangyun884/rfpp.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ ParSEL: Parameterized Shape Editing with Language 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20319v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20319v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Aditya Ganeshan, Ryan Y. Huang, Xianghao Xu, R. Kenny Jones, Daniel Ritchie
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  The ability to edit 3D assets from natural language presents a compelling
+paradigm to aid in the democratization of 3D content creation. However, while
+natural language is often effective at communicating general intent, it is
+poorly suited for specifying precise manipulation. To address this gap, we
+introduce ParSEL, a system that enables controllable editing of high-quality 3D
+assets from natural language. Given a segmented 3D mesh and an editing request,
+ParSEL produces a parameterized editing program. Adjusting the program
+parameters allows users to explore shape variations with a precise control over
+the magnitudes of edits. To infer editing programs which align with an input
+edit request, we leverage the abilities of large-language models (LLMs).
+However, while we find that LLMs excel at identifying initial edit operations,
+they often fail to infer complete editing programs, and produce outputs that
+violate shape semantics. To overcome this issue, we introduce Analytical Edit
+Propagation (AEP), an algorithm which extends a seed edit with additional
+operations until a complete editing program has been formed. Unlike prior
+methods, AEP searches for analytical editing operations compatible with a range
+of possible user edits through the integration of computer algebra systems for
+geometric analysis. Experimentally we demonstrate ParSEL's effectiveness in
+enabling controllable editing of 3D objects through natural language requests
+over alternative system designs.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ A Pixel Is Worth More Than One 3D Gaussians in Single-View 3D
+  Reconstruction 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20310v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20310v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Jianghao Shen, Tianfu Wu
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Learning 3D scene representation from a single-view image is a long-standing
+fundamental problem in computer vision, with the inherent ambiguity in
+predicting contents unseen from the input view. Built on the recently proposed
+3D Gaussian Splatting (3DGS), the Splatter Image method has made promising
+progress on fast single-image novel view synthesis via learning a single 3D
+Gaussian for each pixel based on the U-Net feature map of an input image.
+However, it has limited expressive power to represent occluded components that
+are not observable in the input view. To address this problem, this paper
+presents a Hierarchical Splatter Image method in which a pixel is worth more
+than one 3D Gaussians. Specifically,
+  each pixel is represented by a parent 3D Gaussian and a small number of child
+3D Gaussians. Parent 3D Gaussians are learned as done in the vanilla Splatter
+Image. Child 3D Gaussians are learned via a lightweight Multi-Layer Perceptron
+(MLP) which takes as input the projected image features of a parent 3D Gaussian
+and the embedding of a target camera view. Both parent and child 3D Gaussians
+are learned end-to-end in a stage-wise way. The joint condition of input image
+features from eyes of the parent Gaussians and the target camera position
+facilitates learning to allocate child Gaussians to ``see the unseen'',
+recovering the occluded details that are often missed by parent Gaussians.
+  In experiments, the proposed method is tested on the ShapeNet-SRN and CO3D
+datasets with state-of-the-art performance obtained, especially showing
+promising capabilities of reconstructing occluded contents in the input view.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>preprint, under review</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Can't make an Omelette without Breaking some Eggs: Plausible Action
+  Anticipation using Large Video-Language Models <span class="chip">CVPR 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20305v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20305v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Himangi Mittal, Nakul Agarwal, Shao-Yuan Lo, Kwonjoon Lee
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  We introduce PlausiVL, a large video-language model for anticipating action
+sequences that are plausible in the real-world. While significant efforts have
+been made towards anticipating future actions, prior approaches do not take
+into account the aspect of plausibility in an action sequence. To address this
+limitation, we explore the generative capability of a large video-language
+model in our work and further, develop the understanding of plausibility in an
+action sequence by introducing two objective functions, a counterfactual-based
+plausible action sequence learning loss and a long-horizon action repetition
+loss. We utilize temporal logical constraints as well as verb-noun action pair
+logical constraints to create implausible/counterfactual action sequences and
+use them to train the model with plausible action sequence learning loss. This
+loss helps the model to differentiate between plausible and not plausible
+action sequences and also helps the model to learn implicit temporal cues
+crucial for the task of action anticipation. The long-horizon action repetition
+loss puts a higher penalty on the actions that are more prone to repetition
+over a longer temporal window. With this penalization, the model is able to
+generate diverse, plausible action sequences. We evaluate our approach on two
+large-scale datasets, Ego4D and EPIC-Kitchens-100, and show improvements on the
+task of action anticipation.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>CVPR 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Scaling White-Box <span class="highlight-title">Transformer</span>s for Vision 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20299v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20299v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Jinrui Yang, Xianhang Li, Druv Pai, Yuyin Zhou, Yi Ma, Yaodong Yu, Cihang Xie
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  CRATE, a white-box transformer architecture designed to learn compressed and
+sparse representations, offers an intriguing alternative to standard vision
+transformers (ViTs) due to its inherent mathematical interpretability. Despite
+extensive investigations into the scaling behaviors of language and vision
+transformers, the scalability of CRATE remains an open question which this
+paper aims to address. Specifically, we propose CRATE-$\alpha$, featuring
+strategic yet minimal modifications to the sparse coding block in the CRATE
+architecture design, and a light training recipe designed to improve the
+scalability of CRATE. Through extensive experiments, we demonstrate that
+CRATE-$\alpha$ can effectively scale with larger model sizes and datasets. For
+example, our CRATE-$\alpha$-B substantially outperforms the prior best CRATE-B
+model accuracy on ImageNet classification by 3.7%, achieving an accuracy of
+83.2%. Meanwhile, when scaling further, our CRATE-$\alpha$-L obtains an
+ImageNet classification accuracy of 85.1%. More notably, these model
+performance improvements are achieved while preserving, and potentially even
+enhancing the interpretability of learned CRATE models, as we demonstrate
+through showing that the learned token representations of increasingly larger
+trained CRATE-$\alpha$ models yield increasingly higher-quality unsupervised
+object segmentation of images. The project page is
+https://rayjryang.github.io/CRATE-alpha/.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>project page: https://rayjryang.github.io/CRATE-alpha/</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Unveiling and Mitigating Backdoor Vulnerabilities based on Unlearning
+  Weight Changes and Backdoor Activeness 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20291v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20291v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Weilin Lin, Li Liu, Shaokui Wei, Jianze Li, Hui Xiong
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  The security threat of backdoor attacks is a central concern for deep neural
+networks (DNNs). Recently, without poisoned data, unlearning models with clean
+data and then learning a pruning mask have contributed to backdoor defense.
+Additionally, vanilla fine-tuning with those clean data can help recover the
+lost clean accuracy. However, the behavior of clean unlearning is still
+under-explored, and vanilla fine-tuning unintentionally induces back the
+backdoor effect. In this work, we first investigate model unlearning from the
+perspective of weight changes and gradient norms, and find two interesting
+observations in the backdoored model: 1) the weight changes between poison and
+clean unlearning are positively correlated, making it possible for us to
+identify the backdoored-related neurons without using poisoned data; 2) the
+neurons of the backdoored model are more active (i.e., larger changes in
+gradient norm) than those in the clean model, suggesting the need to suppress
+the gradient norm during fine-tuning. Then, we propose an effective two-stage
+defense method. In the first stage, an efficient Neuron Weight Change
+(NWC)-based Backdoor Reinitialization is proposed based on observation 1). In
+the second stage, based on observation 2), we design an Activeness-Aware
+Fine-Tuning to replace the vanilla fine-tuning. Extensive experiments,
+involving eight backdoor attacks on three benchmark datasets, demonstrate the
+superior performance of our proposed method compared to recent state-of-the-art
+backdoor defense approaches.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ TetSphere Splatting: Representing High-Quality Geometry with Lagrangian
+  Volumetric Meshes 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20283v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20283v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Minghao Guo, Bohan Wang, Kaiming He, Wojciech Matusik
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  We present TetSphere splatting, an explicit, Lagrangian representation for
+reconstructing 3D shapes with high-quality geometry. In contrast to
+conventional object reconstruction methods which predominantly use Eulerian
+representations, including both neural implicit (e.g., NeRF, NeuS) and explicit
+representations (e.g., DMTet), and often struggle with high computational
+demands and suboptimal mesh quality, TetSphere splatting utilizes an underused
+but highly effective geometric primitive -- tetrahedral meshes. This approach
+directly yields superior mesh quality without relying on neural networks or
+post-processing. It deforms multiple initial tetrahedral spheres to accurately
+reconstruct the 3D shape through a combination of differentiable rendering and
+geometric energy optimization, resulting in significant computational
+efficiency. Serving as a robust and versatile geometry representation,
+Tet-Sphere splatting seamlessly integrates into diverse applications, including
+single-view 3D reconstruction, image-/text-to-3D content generation.
+Experimental results demonstrate that TetSphere splatting outperforms existing
+representations, delivering faster optimization speed, enhanced mesh quality,
+and reliable preservation of thin structures.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ SemFlow: Binding Semantic Segmentation and Image Synthesis via Rectified
+  Flow 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20282v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20282v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Chaoyang Wang, Xiangtai Li, Lu Qi, Henghui Ding, Yunhai Tong, Ming-Hsuan Yang
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Semantic segmentation and semantic image synthesis are two representative
+tasks in visual perception and generation. While existing methods consider them
+as two distinct tasks, we propose a unified diffusion-based framework (SemFlow)
+and model them as a pair of reverse problems. Specifically, motivated by
+rectified flow theory, we train an ordinary differential equation (ODE) model
+to transport between the distributions of real images and semantic masks. As
+the training object is symmetric, samples belonging to the two distributions,
+images and semantic masks, can be effortlessly transferred reversibly. For
+semantic segmentation, our approach solves the contradiction between the
+randomness of diffusion outputs and the uniqueness of segmentation results. For
+image synthesis, we propose a finite perturbation approach to enhance the
+diversity of generated results without changing the semantic categories.
+Experiments show that our SemFlow achieves competitive results on semantic
+segmentation and semantic image synthesis tasks. We hope this simple framework
+will motivate people to rethink the unification of low-level and high-level
+vision. Project page: https://github.com/wang-chaoyang/SemFlow.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ CV-VAE: A Compatible Video VAE for Latent Generative Video Models 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20279v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20279v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Sijie Zhao, Yong Zhang, Xiaodong Cun, Shaoshu Yang, Muyao Niu, Xiaoyu Li, Wenbo Hu, Ying Shan
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Spatio-temporal compression of videos, utilizing networks such as Variational
+Autoencoders (VAE), plays a crucial role in OpenAI's SORA and numerous other
+video generative models. For instance, many LLM-like video models learn the
+distribution of discrete tokens derived from 3D VAEs within the VQVAE
+framework, while most diffusion-based video models capture the distribution of
+continuous latent extracted by 2D VAEs without quantization. The temporal
+compression is simply realized by uniform frame sampling which results in
+unsmooth motion between consecutive frames. Currently, there lacks of a
+commonly used continuous video (3D) VAE for latent diffusion-based video models
+in the research community. Moreover, since current diffusion-based approaches
+are often implemented using pre-trained text-to-image (T2I) models, directly
+training a video VAE without considering the compatibility with existing T2I
+models will result in a latent space gap between them, which will take huge
+computational resources for training to bridge the gap even with the T2I models
+as initialization. To address this issue, we propose a method for training a
+video VAE of latent video models, namely CV-VAE, whose latent space is
+compatible with that of a given image VAE, e.g., image VAE of Stable Diffusion
+(SD). The compatibility is achieved by the proposed novel latent space
+regularization, which involves formulating a regularization loss using the
+image VAE. Benefiting from the latent space compatibility, video models can be
+trained seamlessly from pre-trained T2I or video models in a truly
+spatio-temporally compressed latent space, rather than simply sampling video
+frames at equal intervals. With our CV-VAE, existing video models can generate
+four times more frames with minimal finetuning. Extensive experiments are
+conducted to demonstrate the effectiveness of the proposed video VAE.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Project Page: https://ailab-cvc.github.io/cvvae/index.html</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ ETHER: Efficient Finetuning of Large-Scale Models with Hyperplane
+  Reflections <span class="chip">ICML 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20271v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20271v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Massimo Bini, Karsten Roth, Zeynep Akata, Anna Khoreva
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Parameter-efficient finetuning (PEFT) has become ubiquitous to adapt
+foundation models to downstream task requirements while retaining their
+generalization ability. However, the amount of additionally introduced
+parameters and compute for successful adaptation and hyperparameter searches
+can explode quickly, especially when deployed at scale to serve numerous
+individual requests. To ensure effective, parameter-efficient, and
+hyperparameter-robust adaptation, we propose the ETHER transformation family,
+which performs Efficient fineTuning via HypErplane Reflections. By design,
+ETHER transformations require a minimal number of parameters, are less likely
+to deteriorate model performance, and exhibit robustness to hyperparameter and
+learning rate choices. In particular, we introduce ETHER and its relaxation
+ETHER+, which match or outperform existing PEFT methods with significantly
+fewer parameters ($\sim$$10$-$100$ times lower than LoRA or OFT) across
+multiple image synthesis and natural language tasks without exhaustive
+hyperparameter tuning. Finally, we investigate the recent emphasis on
+Hyperspherical Energy retention for adaptation and raise questions on its
+practical utility. The code is available at https://github.com/mwbini/ether.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted to ICML 2024. Code available at
+  https://github.com/mwbini/ether</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ FaceMixup: Enhancing Facial Expression Recognition through Mixed Face
+  Regularization 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20259v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20259v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Fabio A. Faria, Mateus M. Souza, Raoni F. da S. Teixeira, Mauricio P. Segundo
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  The proliferation of deep learning solutions and the scarcity of large
+annotated datasets pose significant challenges in real-world applications.
+Various strategies have been explored to overcome this challenge, with data
+augmentation (DA) approaches emerging as prominent solutions. DA approaches
+involve generating additional examples by transforming existing labeled data,
+thereby enriching the dataset and helping deep learning models achieve improved
+generalization without succumbing to overfitting. In real applications, where
+solutions based on deep learning are widely used, there is facial expression
+recognition (FER), which plays an essential role in human communication,
+improving a range of knowledge areas (e.g., medicine, security, and marketing).
+In this paper, we propose a simple and comprehensive face data augmentation
+approach based on mixed face component regularization that outperforms the
+classical DA approaches from the literature, including the MixAugment which is
+a specific approach for the target task in two well-known FER datasets existing
+in the literature.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>29 pages, 9 figures, paper is under review on journal</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ KerasCV and KerasNLP: Vision and Language Power-Ups 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20247v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20247v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Matthew Watson, Divyashree Shivakumar Sreepathihalli, Francois Chollet, Martin Gorner, Kiranbir Sodhia, Ramesh Sampath, Tirth Patel, Haifeng Jin, Neel Kovelamudi, Gabriel Rasskin, Samaneh Saadat, Luke Wood, Chen Qian, Jonathan Bischof, Ian Stenbit
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  We present the Keras domain packages KerasCV and KerasNLP, extensions of the
+Keras API for Computer Vision and Natural Language Processing workflows,
+capable of running on either JAX, TensorFlow, or PyTorch. These domain packages
+are designed to enable fast experimentation, with a focus on ease-of-use and
+performance. We adopt a modular, layered design: at the library's lowest level
+of abstraction, we provide building blocks for creating models and data
+preprocessing pipelines, and at the library's highest level of abstraction, we
+provide pretrained ``task" models for popular architectures such as Stable
+Diffusion, YOLOv8, GPT2, BERT, Mistral, CLIP, Gemma, T5, etc. Task models have
+built-in preprocessing, pretrained weights, and can be fine-tuned on raw
+inputs. To enable efficient training, we support XLA compilation for all
+models, and run all preprocessing via a compiled graph of TensorFlow operations
+using the tf.data API. The libraries are fully open-source (Apache 2.0 license)
+and available on GitHub.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Submitted to Journal of Machine Learning Open Source Software</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ MOFA-Video: Controllable Image Animation via Generative Motion Field
+  Adaptions in Frozen Image-to-Video Diffusion Model 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20222v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20222v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Muyao Niu, Xiaodong Cun, Xintao Wang, Yong Zhang, Ying Shan, Yinqiang Zheng
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  We present MOFA-Video, an advanced controllable image animation method that
+generates video from the given image using various additional controllable
+signals (such as human landmarks reference, manual trajectories, and another
+even provided video) or their combinations. This is different from previous
+methods which only can work on a specific motion domain or show weak control
+abilities with diffusion prior. To achieve our goal, we design several
+domain-aware motion field adapters (\ie, MOFA-Adapters) to control the
+generated motions in the video generation pipeline. For MOFA-Adapters, we
+consider the temporal motion consistency of the video and generate the dense
+motion flow from the given sparse control conditions first, and then, the
+multi-scale features of the given image are wrapped as a guided feature for
+stable video diffusion generation. We naively train two motion adapters for the
+manual trajectories and the human landmarks individually since they both
+contain sparse information about the control. After training, the MOFA-Adapters
+in different domains can also work together for more controllable video
+generation.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Boost Your Own Human Image Generation Model via Direct Preference
+  Optimization with AI Feedback 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20216v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20216v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Sanghyeon Na, Yonggyu Kim, Hyunjoon Lee
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  The generation of high-quality human images through text-to-image (T2I)
+methods is a significant yet challenging task. Distinct from general image
+generation, human image synthesis must satisfy stringent criteria related to
+human pose, anatomy, and alignment with textual prompts, making it particularly
+difficult to achieve realistic results. Recent advancements in T2I generation
+based on diffusion models have shown promise, yet challenges remain in meeting
+human-specific preferences. In this paper, we introduce a novel approach
+tailored specifically for human image generation utilizing Direct Preference
+Optimization (DPO). Specifically, we introduce an efficient method for
+constructing a specialized DPO dataset for training human image generation
+models without the need for costly human feedback. We also propose a modified
+loss function that enhances the DPO training process by minimizing artifacts
+and improving image fidelity. Our method demonstrates its versatility and
+effectiveness in generating human images, including personalized text-to-image
+generation. Through comprehensive evaluations, we show that our approach
+significantly advances the state of human image generation, achieving superior
+results in terms of natural anatomies, poses, and text-image alignment.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>28 pages, 18 figures</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Jina CLIP: Your CLIP Model Is Also Your Text Retriever <span class="chip">ICML2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20204v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20204v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Andreas Koukounas, Georgios Mastrapas, Michael Günther, Bo Wang, Scott Martens, Isabelle Mohr, Saba Sturua, Mohammad Kalim Akram, Joan Fontanals Martínez, Saahil Ognawala, Susana Guzman, Maximilian Werk, Nan Wang, Han Xiao
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Contrastive Language-Image Pretraining (CLIP) is widely used to train models
+to align images and texts in a common embedding space by mapping them to
+fixed-sized vectors. These models are key to multimodal information retrieval
+and related tasks. However, CLIP models generally underperform in text-only
+tasks compared to specialized text models. This creates inefficiencies for
+information retrieval systems that keep separate embeddings and models for
+text-only and multimodal tasks. We propose a novel, multi-task contrastive
+training method to address this issue, which we use to train the jina-clip-v1
+model to achieve the state-of-the-art performance on both text-image and
+text-text retrieval tasks.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>4 pages, ICML2024 workshop submission</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ SPARE: Symmetrized Point-to-Plane Distance for Robust Non-Rigid
+  Registration 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20188v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20188v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Yuxin Yao, Bailin Deng, Junhui Hou, Juyong Zhang
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Existing optimization-based methods for non-rigid registration typically
+minimize an alignment error metric based on the point-to-point or
+point-to-plane distance between corresponding point pairs on the source surface
+and target surface. However, these metrics can result in slow convergence or a
+loss of detail. In this paper, we propose SPARE, a novel formulation that
+utilizes a symmetrized point-to-plane distance for robust non-rigid
+registration. The symmetrized point-to-plane distance relies on both the
+positions and normals of the corresponding points, resulting in a more accurate
+approximation of the underlying geometry and can achieve higher accuracy than
+existing methods. To solve this optimization problem efficiently, we propose an
+alternating minimization solver using a majorization-minimization strategy.
+Moreover, for effective initialization of the solver, we incorporate a
+deformation graph-based coarse alignment that improves registration quality and
+efficiency. Extensive experiments show that the proposed method greatly
+improves the accuracy of non-rigid registration problems and maintains
+relatively high solution efficiency. The code is publicly available at
+https://github.com/yaoyx689/spare.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ <span class="highlight-title">Transformer</span>s and Slot Encoding for Sample Efficient Physical World
+  Modelling 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20180v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20180v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Francesco Petri, Luigi Asprino, Aldo Gangemi
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  World modelling, i.e. building a representation of the rules that govern the
+world so as to predict its evolution, is an essential ability for any agent
+interacting with the physical world. Recent applications of the Transformer
+architecture to the problem of world modelling from video input show notable
+improvements in sample efficiency. However, existing approaches tend to work
+only at the image level thus disregarding that the environment is composed of
+objects interacting with each other. In this paper, we propose an architecture
+combining Transformers for world modelling with the slot-attention paradigm, an
+approach for learning representations of objects appearing in a scene. We
+describe the resulting neural architecture and report experimental results
+showing an improvement over the existing solutions in terms of sample
+efficiency and a reduction of the variation of the performance over the
+training examples. The code for our architecture and experiments is available
+at https://github.com/torchipeppo/transformers-and-slot-encoding-for-wm
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Landslide mapping from Sentinel-2 imagery through change detection 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20161v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20161v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Tommaso Monopoli, Fabio Montello, Claudio Rossi
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Landslides are one of the most critical and destructive geohazards.
+Widespread development of human activities and settlements combined with the
+effects of climate change on weather are resulting in a high increase in the
+frequency and destructive power of landslides, making them a major threat to
+human life and the economy. In this paper, we explore methodologies to map
+newly-occurred landslides using Sentinel-2 imagery automatically. All
+approaches presented are framed as a bi-temporal change detection problem,
+requiring only a pair of Sentinel-2 images, taken respectively before and after
+a landslide-triggering event. Furthermore, we introduce a novel deep learning
+architecture for fusing Sentinel-2 bi-temporal image pairs with Digital
+Elevation Model (DEM) data, showcasing its promising performances w.r.t. other
+change detection models in the literature. As a parallel task, we address
+limitations in existing datasets by creating a novel geodatabase, which
+includes manually validated open-access landslide inventories over
+heterogeneous ecoregions of the world. We release both code and dataset with an
+open-source license.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>to be published in IEEE IGARSS 2024 conference proceedings</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ MotionDreamer: Zero-Shot 3D Mesh Animation from Video Diffusion Models 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20155v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20155v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Lukas Uzolas, Elmar Eisemann, Petr Kellnhofer
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Animation techniques bring digital 3D worlds and characters to life. However,
+manual animation is tedious and automated techniques are often specialized to
+narrow shape classes. In our work, we propose a technique for automatic
+re-animation of arbitrary 3D shapes based on a motion prior extracted from a
+video diffusion model. Unlike existing 4D generation methods, we focus solely
+on the motion, and we leverage an explicit mesh-based representation compatible
+with existing computer-graphics pipelines. Furthermore, our utilization of
+diffusion features enhances accuracy of our motion fitting. We analyze efficacy
+of these features for animation fitting and we experimentally validate our
+approach for two different diffusion models and four animation models. Finally,
+we demonstrate that our time-efficient zero-shot method achieves a superior
+performance re-animating a diverse set of 3D shapes when compared to existing
+techniques in a user study. The project website is located at
+https://lukas.uzolas.com/MotionDreamer.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Uncovering Bias in Large Vision-Language Models at Scale with
+  Counterfactuals 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20152v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20152v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Phillip Howard, Kathleen C. Fraser, Anahita Bhiwandiwalla, Svetlana Kiritchenko
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  With the advent of Large Language Models (LLMs) possessing increasingly
+impressive capabilities, a number of Large Vision-Language Models (LVLMs) have
+been proposed to augment LLMs with visual inputs. Such models condition
+generated text on both an input image and a text prompt, enabling a variety of
+use cases such as visual question answering and multimodal chat. While prior
+studies have examined the social biases contained in text generated by LLMs,
+this topic has been relatively unexplored in LVLMs. Examining social biases in
+LVLMs is particularly challenging due to the confounding contributions of bias
+induced by information contained across the text and visual modalities. To
+address this challenging problem, we conduct a large-scale study of text
+generated by different LVLMs under counterfactual changes to input images.
+Specifically, we present LVLMs with identical open-ended text prompts while
+conditioning on images from different counterfactual sets, where each set
+contains images which are largely identical in their depiction of a common
+subject (e.g., a doctor), but vary only in terms of intersectional social
+attributes (e.g., race and gender). We comprehensively evaluate the text
+produced by different models under this counterfactual generation setting at
+scale, producing over 57 million responses from popular LVLMs. Our
+multi-dimensional analysis reveals that social attributes such as race, gender,
+and physical characteristics depicted in input images can significantly
+influence the generation of toxic content, competency-associated words, harmful
+stereotypes, and numerical ratings of depicted individuals. We additionally
+explore the relationship between social bias in LVLMs and their corresponding
+LLMs, as well as inference-time strategies to mitigate bias.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ OpenDAS: Domain Adaptation for Open-Vocabulary Segmentation 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20141v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20141v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Gonca Yilmaz, Songyou Peng, Francis Engelmann, Marc Pollefeys, Hermann Blum
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  The advent of Vision Language Models (VLMs) transformed image understanding
+from closed-set classifications to dynamic image-language interactions,
+enabling open-vocabulary segmentation. Despite this flexibility, VLMs often
+fall behind closed-set classifiers in accuracy due to their reliance on
+ambiguous image captions and lack of domain-specific knowledge. We, therefore,
+introduce a new task domain adaptation for open-vocabulary segmentation,
+enhancing VLMs with domain-specific priors while preserving their
+open-vocabulary nature. Existing adaptation methods, when applied to
+segmentation tasks, improve performance on training queries but can reduce VLM
+performance on zero-shot text inputs. To address this shortcoming, we propose
+an approach that combines parameter-efficient prompt tuning with a
+triplet-loss-based training strategy. This strategy is designed to enhance
+open-vocabulary generalization while adapting to the visual domain. Our results
+outperform other parameter-efficient adaptation strategies in open-vocabulary
+segment classification tasks across indoor and outdoor datasets. Notably, our
+approach is the only one that consistently surpasses the original VLM on
+zero-shot queries. Our adapted VLMs can be plug-and-play integrated into
+existing open-vocabulary segmentation pipelines, improving OV-Seg by +6.0% mIoU
+on ADE20K, and OpenMask3D by +4.1% AP on ScanNet++ Offices without any changes
+to the methods.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ A Multimodal Dangerous State Recognition and Early Warning System for
+  Elderly with Intermittent Dementia 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20136v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20136v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Liyun Deng, Lei Jin, Guangcheng Wang, Quan Shi, Han Wang
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  In response to the social issue of the increasing number of elderly
+vulnerable groups going missing due to the aggravating aging population in
+China, our team has developed a wearable anti-loss device and intelligent early
+warning system for elderly individuals with intermittent dementia using
+artificial intelligence and IoT technology. This system comprises an anti-loss
+smart helmet, a cloud computing module, and an intelligent early warning
+application on the caregiver's mobile device. The smart helmet integrates a
+miniature camera module, a GPS module, and a 5G communication module to collect
+first-person images and location information of the elderly. Data is
+transmitted remotely via 5G, FTP, and TCP protocols. In the cloud computing
+module, our team has proposed for the first time a multimodal dangerous state
+recognition network based on scene and location information to accurately
+assess the risk of elderly individuals going missing. Finally, the application
+software interface designed for the caregiver's mobile device implements
+multi-level early warnings. The system developed by our team requires no
+operation or response from the elderly, achieving fully automatic environmental
+perception, risk assessment, and proactive alarming. This overcomes the
+limitations of traditional monitoring devices, which require active operation
+and response, thus avoiding the issue of the digital divide for the elderly. It
+effectively prevents accidental loss and potential dangers for elderly
+individuals with dementia.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>13 pages,9 figures</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Federated and Transfer Learning for Cancer Detection Based on Image
+  Analysis 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20126v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20126v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Amine Bechar, Youssef Elmir, Yassine Himeur, Rafik Medjoudj, Abbes Amira
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  This review article discusses the roles of federated learning (FL) and
+transfer learning (TL) in cancer detection based on image analysis. These two
+strategies powered by machine learning have drawn a lot of attention due to
+their potential to increase the precision and effectiveness of cancer diagnosis
+in light of the growing importance of machine learning techniques in cancer
+detection. FL enables the training of machine learning models on data
+distributed across multiple sites without the need for centralized data
+sharing, while TL allows for the transfer of knowledge from one task to
+another. A comprehensive assessment of the two methods, including their
+strengths, and weaknesses is presented. Moving on, their applications in cancer
+detection are discussed, including potential directions for the future.
+Finally, this article offers a thorough description of the functions of TL and
+FL in image-based cancer detection. The authors also make insightful
+suggestions for additional study in this rapidly developing area.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Infinite 3D Landmarks: Improving Continuous 2D Facial Landmark Detection 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20117v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20117v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Prashanth Chandran, Gaspard Zoss, Paulo Gotardo, Derek Bradley
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  In this paper, we examine 3 important issues in the practical use of
+state-of-the-art facial landmark detectors and show how a combination of
+specific architectural modifications can directly improve their accuracy and
+temporal stability. First, many facial landmark detectors require face
+normalization as a preprocessing step, which is accomplished by a
+separately-trained neural network that crops and resizes the face in the input
+image. There is no guarantee that this pre-trained network performs the optimal
+face normalization for landmark detection. We instead analyze the use of a
+spatial transformer network that is trained alongside the landmark detector in
+an unsupervised manner, and jointly learn optimal face normalization and
+landmark detection. Second, we show that modifying the output head of the
+landmark predictor to infer landmarks in a canonical 3D space can further
+improve accuracy. To convert the predicted 3D landmarks into screen-space, we
+additionally predict the camera intrinsics and head pose from the input image.
+As a side benefit, this allows to predict the 3D face shape from a given image
+only using 2D landmarks as supervision, which is useful in determining landmark
+visibility among other things. Finally, when training a landmark detector on
+multiple datasets at the same time, annotation inconsistencies across datasets
+forces the network to produce a suboptimal average. We propose to add a
+semantic correction network to address this issue. This additional lightweight
+neural network is trained alongside the landmark detector, without requiring
+any additional supervision. While the insights of this paper can be applied to
+most common landmark detectors, we specifically target a recently-proposed
+continuous 2D landmark detector to demonstrate how each of our additions leads
+to meaningful improvements over the state-of-the-art on standard benchmarks.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>12 pages, 13 figures</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ RIGID: A Training-free and Model-Agnostic Framework for Robust
+  AI-Generated Image Detection 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20112v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20112v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Zhiyuan He, Pin-Yu Chen, Tsung-Yi Ho
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  The rapid advances in generative AI models have empowered the creation of
+highly realistic images with arbitrary content, raising concerns about
+potential misuse and harm, such as Deepfakes. Current research focuses on
+training detectors using large datasets of generated images. However, these
+training-based solutions are often computationally expensive and show limited
+generalization to unseen generated images. In this paper, we propose a
+training-free method to distinguish between real and AI-generated images. We
+first observe that real images are more robust to tiny noise perturbations than
+AI-generated images in the representation space of vision foundation models.
+Based on this observation, we propose RIGID, a training-free and model-agnostic
+method for robust AI-generated image detection. RIGID is a simple yet effective
+approach that identifies whether an image is AI-generated by comparing the
+representation similarity between the original and the noise-perturbed
+counterpart. Our evaluation on a diverse set of AI-generated images and
+benchmarks shows that RIGID significantly outperforms existing trainingbased
+and training-free detectors. In particular, the average performance of RIGID
+exceeds the current best training-free method by more than 25%. Importantly,
+RIGID exhibits strong generalization across different image generation methods
+and robustness to image corruptions.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ FMARS: Annotating Remote Sensing Images for Disaster Management using
+  Foundation Models 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20109v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20109v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Edoardo Arnaudo, Jacopo Lungo Vaschetti, Lorenzo Innocenti, Luca Barco, Davide Lisi, Vanina Fissore, Claudio Rossi
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Very-High Resolution (VHR) remote sensing imagery is increasingly accessible,
+but often lacks annotations for effective machine learning applications. Recent
+foundation models like GroundingDINO and Segment Anything (SAM) provide
+opportunities to automatically generate annotations. This study introduces
+FMARS (Foundation Model Annotations in Remote Sensing), a methodology
+leveraging VHR imagery and foundation models for fast and robust annotation. We
+focus on disaster management and provide a large-scale dataset with labels
+obtained from pre-event imagery over 19 disaster events, derived from the Maxar
+Open Data initiative. We train segmentation models on the generated labels,
+using Unsupervised Domain Adaptation (UDA) techniques to increase
+transferability to real-world scenarios. Our results demonstrate the
+effectiveness of leveraging foundation models to automatically annotate remote
+sensing data at scale, enabling robust downstream models for critical
+applications. Code and dataset are available at
+\url{https://github.com/links-ads/igarss-fmars}.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted at IGARSS 2024, 5 pages</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Rapid Wildfire Hotspot Detection Using <span class="highlight-title">Self-Supervised</span> Learning on
+  Temporal Remote Sensing Data 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20093v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20093v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Luca Barco, Angelica Urbanelli, Claudio Rossi
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Rapid detection and well-timed intervention are essential to mitigate the
+impacts of wildfires. Leveraging remote sensed data from satellite networks and
+advanced AI models to automatically detect hotspots (i.e., thermal anomalies
+caused by active fires) is an effective way to build wildfire monitoring
+systems. In this work, we propose a novel dataset containing time series of
+remotely sensed data related to European fire events and a Self-Supervised
+Learning (SSL)-based model able to analyse multi-temporal data and identify
+hotspots in potentially near real time. We train and evaluate the performance
+of our model using our dataset and Thraws, a dataset of thermal anomalies
+including several fire events, obtaining an F1 score of 63.58.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Visual Attention Analysis in Online Learning 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20091v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20091v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Navarro Miriam, Becerra Álvaro, Daza Roberto, Cobos Ruth, Morales Aythami, Fierrez Julian
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  In this paper, we present an approach in the Multimodal Learning Analytics
+field. Within this approach, we have developed a tool to visualize and analyze
+eye movement data collected during learning sessions in online courses. The
+tool is named VAAD (an acronym for Visual Attention Analysis Dashboard). These
+eye movement data have been gathered using an eye-tracker and subsequently
+processed and visualized for interpretation. The purpose of the tool is to
+conduct a descriptive analysis of the data by facilitating its visualization,
+enabling the identification of differences and learning patterns among various
+learner populations. Additionally, it integrates a predictive module capable of
+anticipating learner activities during a learning session. Consequently, VAAD
+holds the potential to offer valuable insights into online learning behaviors
+from both descriptive and predictive perspectives.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted in CEDI 2024 (VII Congreso Espa\~nol de Inform\'atica), A
+  Coru\~na, Spain</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Typography Leads Semantic Diversifying: Amplifying Adversarial
+  Transferability across Multimodal Large Language Models 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20090v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20090v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Hao Cheng, Erjia Xiao, Jiahang Cao, Le Yang, Kaidi Xu, Jindong Gu, Renjing Xu
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Following the advent of the Artificial Intelligence (AI) era of large models,
+Multimodal Large Language Models (MLLMs) with the ability to understand
+cross-modal interactions between vision and text have attracted wide attention.
+Adversarial examples with human-imperceptible perturbation are shown to possess
+a characteristic known as transferability, which means that a perturbation
+generated by one model could also mislead another different model. Augmenting
+the diversity in input data is one of the most significant methods for
+enhancing adversarial transferability. This method has been certified as a way
+to significantly enlarge the threat impact under black-box conditions. Research
+works also demonstrate that MLLMs can be exploited to generate adversarial
+examples in the white-box scenario. However, the adversarial transferability of
+such perturbations is quite limited, failing to achieve effective black-box
+attacks across different models. In this paper, we propose the
+Typographic-based Semantic Transfer Attack (TSTA), which is inspired by: (1)
+MLLMs tend to process semantic-level information; (2) Typographic Attack could
+effectively distract the visual information captured by MLLMs. In the scenarios
+of Harmful Word Insertion and Important Information Protection, our TSTA
+demonstrates superior performance.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Estimating Human Poses Across <span class="highlight-title">Dataset</span>s: A Unified Skeleton and
+  Multi-Teacher Distillation Approach 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20084v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20084v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Muhammad Saif Ullah Khan, Dhavalkumar Limbachiya, Didier Stricker, Muhammad Zeshan Afzal
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Human pose estimation is a key task in computer vision with various
+applications such as activity recognition and interactive systems. However, the
+lack of consistency in the annotated skeletons across different datasets poses
+challenges in developing universally applicable models. To address this
+challenge, we propose a novel approach integrating multi-teacher knowledge
+distillation with a unified skeleton representation. Our networks are jointly
+trained on the COCO and MPII datasets, containing 17 and 16 keypoints,
+respectively. We demonstrate enhanced adaptability by predicting an extended
+set of 21 keypoints, 4 (COCO) and 5 (MPII) more than original annotations,
+improving cross-dataset generalization. Our joint models achieved an average
+accuracy of 70.89 and 76.40, compared to 53.79 and 55.78 when trained on a
+single dataset and evaluated on both. Moreover, we also evaluate all 21
+predicted points by our two models by reporting an AP of 66.84 and 72.75 on the
+Halpe dataset. This highlights the potential of our technique to address one of
+the most pressing challenges in pose estimation research and application - the
+inconsistency in skeletal annotations.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>15 pages (with references)</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ NoiseBoost: Alleviating Hallucination with Noise Perturbation for
+  Multimodal Large Language Models 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20081v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20081v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Kai Wu, Boyuan Jiang, Zhengkai Jiang, Qingdong He, Donghao Luo, Shengzhi Wang, Qingwen Liu, Chengjie Wang
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Multimodal large language models (MLLMs) contribute a powerful mechanism to
+understanding visual information building on large language models. However,
+MLLMs are notorious for suffering from hallucinations, especially when
+generating lengthy, detailed descriptions for images. Our analysis reveals that
+hallucinations stem from the inherent summarization mechanism of large language
+models, leading to excessive dependence on linguistic tokens while neglecting
+vision information. In this paper, we propose NoiseBoost, a broadly applicable
+and simple method for alleviating hallucinations for MLLMs through the
+integration of noise feature perturbations. Noise perturbation acts as a
+regularizer, facilitating a balanced distribution of attention weights among
+visual and linguistic tokens. Despite its simplicity, NoiseBoost consistently
+enhances the performance of MLLMs across common training strategies, including
+supervised fine-tuning and reinforcement learning. Further, NoiseBoost
+pioneerly enables semi-supervised learning for MLLMs, unleashing the power of
+unlabeled data. Comprehensive experiments demonstrate that NoiseBoost improves
+dense caption accuracy by 8.1% with human evaluation and achieves comparable
+results with 50% of the data by mining unlabeled data. Code and models are
+available at https://kaiwu5.github.io/noiseboost.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>updating</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Faces of the Mind: Unveiling Mental Health States Through Facial
+  Expressions in 11,427 Adolescents 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20072v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20072v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Xiao Xu, Keyin Zhou, Yan Zhang, Yang Wang, Fei Wang, Xizhe Zhang
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Mood disorders, including depression and anxiety, often manifest through
+facial expressions. While previous research has explored the connection between
+facial features and emotions, machine learning algorithms for estimating mood
+disorder severity have been hindered by small datasets and limited real-world
+application. To address this gap, we analyzed facial videos of 11,427
+participants, a dataset two orders of magnitude larger than previous studies.
+This comprehensive collection includes standardized facial expression videos
+from reading tasks, along with a detailed psychological scale that measures
+depression, anxiety, and stress. By examining the relationships among these
+emotional states and employing clustering analysis, we identified distinct
+subgroups embodying different emotional profiles. We then trained tree-based
+classifiers and deep learning models to estimate emotional states from facial
+features. Results indicate that models previously effective on small datasets
+experienced decreased performance when applied to our large dataset,
+highlighting the importance of data scale and mitigating overfitting in
+practical settings. Notably, our study identified subtle shifts in pupil
+dynamics and gaze orientation as potential markers of mood disorders, providing
+valuable information on the interaction between facial expressions and mental
+health. This research marks the first large-scale and comprehensive
+investigation of facial expressions in the context of mental health, laying the
+groundwork for future data-driven advancements in this field.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ N-Dimensional Gaussians for Fitting of High Dimensional Functions 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20067v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20067v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Stavros Diolatzis, Tobias Zirr, Alexandr Kuznetsov, Georgios Kopanas, Anton Kaplanyan
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  In the wake of many new ML-inspired approaches for reconstructing and
+representing high-quality 3D content, recent hybrid and explicitly learned
+representations exhibit promising performance and quality characteristics.
+However, their scaling to higher dimensions is challenging, e.g. when
+accounting for dynamic content with respect to additional parameters such as
+material properties, illumination, or time. In this paper, we tackle these
+challenges for an explicit representations based on Gaussian mixture models.
+With our solutions, we arrive at efficient fitting of compact N-dimensional
+Gaussian mixtures and enable efficient evaluation at render time: For fast
+fitting and evaluation, we introduce a high-dimensional culling scheme that
+efficiently bounds N-D Gaussians, inspired by Locality Sensitive Hashing. For
+adaptive refinement yet compact representation, we introduce a loss-adaptive
+density control scheme that incrementally guides the use of additional capacity
+towards missing details. With these tools we can for the first time represent
+complex appearance that depends on many input dimensions beyond position or
+viewing angle within a compact, explicit representation optimized in minutes
+and rendered in milliseconds.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Can the accuracy bias by facial hairstyle be reduced through balancing
+  the training data? 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20062v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20062v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Kagan Ozturk, Haiyu Wu, Kevin W. Bowyer
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Appearance of a face can be greatly altered by growing a beard and mustache.
+The facial hairstyles in a pair of images can cause marked changes to the
+impostor distribution and the genuine distribution. Also, different
+distributions of facial hairstyle across demographics could cause a false
+impression of relative accuracy across demographics. We first show that, even
+though larger training sets boost the recognition accuracy on all facial
+hairstyles, accuracy variations caused by facial hairstyles persist regardless
+of the size of the training set. Then, we analyze the impact of having
+different fractions of the training data represent facial hairstyles. We
+created balanced training sets using a set of identities available in
+Webface42M that both have clean-shaven and facial hair images. We find that,
+even when a face recognition model is trained with a balanced clean-shaven /
+facial hair training set, accuracy variation on the test data does not
+diminish. Next, data augmentation is employed to further investigate the effect
+of facial hair distribution in training data by manipulating facial hair pixels
+with the help of facial landmark points and a facial hair segmentation model.
+Our results show facial hair causes an accuracy gap between clean-shaven and
+facial hair images, and this impact can be significantly different between
+African-Americans and Caucasians.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Enhancing Plant Disease Detection: A Novel CNN-Based Approach with
+  Tensor Subspace Learning and HOWSVD-MD 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20058v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20058v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Abdelmalik Ouamane, Ammar Chouchane, Yassine Himeur, Abderrazak Debilou, Abbes Amira, Shadi Atalla, Wathiq Mansoor, Hussain Al Ahmad
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Machine learning has revolutionized the field of agricultural science,
+particularly in the early detection and management of plant diseases, which are
+crucial for maintaining crop health and productivity. Leveraging advanced
+algorithms and imaging technologies, researchers are now able to identify and
+classify plant diseases with unprecedented accuracy and speed. Effective
+management of tomato diseases is crucial for enhancing agricultural
+productivity. The development and application of tomato disease classification
+methods are central to this objective. This paper introduces a cutting-edge
+technique for the detection and classification of tomato leaf diseases,
+utilizing insights from the latest pre-trained Convolutional Neural Network
+(CNN) models. We propose a sophisticated approach within the domain of tensor
+subspace learning, known as Higher-Order Whitened Singular Value Decomposition
+(HOWSVD), designed to boost the discriminatory power of the system. Our
+approach to Tensor Subspace Learning is methodically executed in two phases,
+beginning with HOWSVD and culminating in Multilinear Discriminant Analysis
+(MDA). The efficacy of this innovative method was rigorously tested through
+comprehensive experiments on two distinct datasets, namely PlantVillage and the
+Taiwan dataset. The findings reveal that HOWSVD-MDA outperforms existing
+methods, underscoring its capability to markedly enhance the precision and
+dependability of diagnosing tomato leaf diseases. For instance, up to 98.36\%
+and 89.39\% accuracy scores have been achieved under PlantVillage and the
+Taiwan datasets, respectively.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>17 pages, 9 figures and 8 tables</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ A Point-Neighborhood Learning Framework for Nasal Endoscope Image
+  Segmentation 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20044v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20044v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Pengyu Jie, Wanquan Liu, Chenqiang Gao, Yihui Wen, Rui He, Pengcheng Li, Jintao Zhang, Deyu Meng
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  The lesion segmentation on endoscopic images is challenging due to its
+complex and ambiguous features. Fully-supervised deep learning segmentation
+methods can receive good performance based on entirely pixel-level labeled
+dataset but greatly increase experts' labeling burden. Semi-supervised and
+weakly supervised methods can ease labeling burden, but heavily strengthen the
+learning difficulty. To alleviate this difficulty, weakly semi-supervised
+segmentation adopts a new annotation protocol of adding a large number of point
+annotation samples into a few pixel-level annotation samples. However, existing
+methods only mine points' limited information while ignoring reliable prior
+surrounding the point annotations. In this paper, we propose a weakly
+semi-supervised method called Point-Neighborhood Learning (PNL) framework. To
+mine the prior of the pixels surrounding the annotated point, we transform a
+single-point annotation into a circular area named a point-neighborhood. We
+propose point-neighborhood supervision loss and pseudo-label scoring mechanism
+to enhance training supervision. Point-neighborhoods are also used to augment
+the data diversity. Our method greatly improves performance without changing
+the structure of segmentation network. Comprehensive experiments show the
+superiority of our method over the other existing methods, demonstrating its
+effectiveness in point-annotated medical images. The project code will be
+available on: https://github.com/ParryJay/PNL.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>10 pages, 10 figures,</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Structure Gaussian SLAM with Manhattan World Hypothesis 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20031v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20031v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Shuhong Liu, Heng Zhou, Liuzhuozheng Li, Yun Liu, Tianchen Deng, Yiming Zhou, Mingrui Li
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Gaussian SLAM systems have made significant advancements in improving the
+efficiency and fidelity of real-time reconstructions. However, these systems
+often encounter incomplete reconstructions in complex indoor environments,
+characterized by substantial holes due to unobserved geometry caused by
+obstacles or limited view angles. To address this challenge, we present
+Manhattan Gaussian SLAM (MG-SLAM), an RGB-D system that leverages the Manhattan
+World hypothesis to enhance geometric accuracy and completeness. By seamlessly
+integrating fused line segments derived from structured scenes, MG-SLAM ensures
+robust tracking in textureless indoor areas. Moreover, The extracted lines and
+planar surface assumption allow strategic interpolation of new Gaussians in
+regions of missing geometry, enabling efficient scene completion. Extensive
+experiments conducted on both synthetic and real-world scenes demonstrate that
+these advancements enable our method to achieve state-of-the-art performance,
+marking a substantial improvement in the capabilities of Gaussian SLAM systems.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ EMAG: Ego-motion Aware and Generalizable 2D Hand Forecasting from
+  Egocentric Videos 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20030v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20030v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Masashi Hatano, Ryo Hachiuma, Hideo Saito
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Predicting future human behavior from egocentric videos is a challenging but
+critical task for human intention understanding. Existing methods for
+forecasting 2D hand positions rely on visual representations and mainly focus
+on hand-object interactions. In this paper, we investigate the hand forecasting
+task and tackle two significant issues that persist in the existing methods:
+(1) 2D hand positions in future frames are severely affected by ego-motions in
+egocentric videos; (2) prediction based on visual information tends to overfit
+to background or scene textures, posing a challenge for generalization on novel
+scenes or human behaviors. To solve the aforementioned problems, we propose
+EMAG, an ego-motion-aware and generalizable 2D hand forecasting method. In
+response to the first problem, we propose a method that considers ego-motion,
+represented by a sequence of homography matrices of two consecutive frames. We
+further leverage modalities such as optical flow, trajectories of hands and
+interacting objects, and ego-motions, thereby alleviating the second issue.
+Extensive experiments on two large-scale egocentric video datasets, Ego4D and
+EPIC-Kitchens 55, verify the effectiveness of the proposed method. In
+particular, our model outperforms prior methods by $7.0$\% on cross-dataset
+evaluations. Project page: https://masashi-hatano.github.io/EMAG/
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ From Forest to Zoo: Great Ape Behavior Recognition with ChimpBehave 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20025v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20025v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Michael Fuchs, Emilie Genty, Adrian Bangerter, Klaus Zuberbühler, Paul Cotofrei
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  This paper addresses the significant challenge of recognizing behaviors in
+non-human primates, specifically focusing on chimpanzees. Automated behavior
+recognition is crucial for both conservation efforts and the advancement of
+behavioral research. However, it is significantly hindered by the
+labor-intensive process of manual video annotation. Despite the availability of
+large-scale animal behavior datasets, the effective application of machine
+learning models across varied environmental settings poses a critical
+challenge, primarily due to the variability in data collection contexts and the
+specificity of annotations.
+  In this paper, we introduce ChimpBehave, a novel dataset featuring over 2
+hours of video (approximately 193,000 video frames) of zoo-housed chimpanzees,
+meticulously annotated with bounding boxes and behavior labels for action
+recognition. ChimpBehave uniquely aligns its behavior classes with existing
+datasets, allowing for the study of domain adaptation and cross-dataset
+generalization methods between different visual settings. Furthermore, we
+benchmark our dataset using a state-of-the-art CNN-based action recognition
+model, providing the first baseline results for both within and cross-dataset
+settings. The dataset, models, and code can be accessed at:
+https://github.com/MitchFuchs/ChimpBehave
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>CV4Animals: Computer Vision for Animal Behavior Tracking and Modeling
+  In conjunction with Computer Vision and Pattern Recognition 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Sharing Key Semantics in <span class="highlight-title">Transformer</span> Makes Efficient Image Restoration 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20008v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20008v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Bin Ren, Yawei Li, Jingyun Liang, Rakesh Ranjan, Mengyuan Liu, Rita Cucchiara, Luc Van Gool, Ming-Hsuan Yang, Nicu Sebe
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Image Restoration (IR), a classic low-level vision task, has witnessed
+significant advancements through deep models that effectively model global
+information. Notably, the Vision Transformers (ViTs) emergence has further
+propelled these advancements. When computing, the self-attention mechanism, a
+cornerstone of ViTs, tends to encompass all global cues, even those from
+semantically unrelated objects or regions. This inclusivity introduces
+computational inefficiencies, particularly noticeable with high input
+resolution, as it requires processing irrelevant information, thereby impeding
+efficiency. Additionally, for IR, it is commonly noted that small segments of a
+degraded image, particularly those closely aligned semantically, provide
+particularly relevant information to aid in the restoration process, as they
+contribute essential contextual cues crucial for accurate reconstruction. To
+address these challenges, we propose boosting IR's performance by sharing the
+key semantics via Transformer for IR (i.e., SemanIR) in this paper.
+Specifically, SemanIR initially constructs a sparse yet comprehensive
+key-semantic dictionary within each transformer stage by establishing essential
+semantic connections for every degraded patch. Subsequently, this dictionary is
+shared across all subsequent transformer blocks within the same stage. This
+strategy optimizes attention calculation within each block by focusing
+exclusively on semantically related components stored in the key-semantic
+dictionary. As a result, attention calculation achieves linear computational
+complexity within each window. Extensive experiments across 6 IR tasks confirm
+the proposed SemanIR's state-of-the-art performance, quantitatively and
+qualitatively showcasing advancements.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>9 pages</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ DP-IQA: Utilizing Diffusion Prior for Blind Image Quality Assessment in
+  the Wild 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.19996v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.19996v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Honghao Fu, Yufei Wang, Wenhan Yang, Bihan Wen
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Image quality assessment (IQA) plays a critical role in selecting
+high-quality images and guiding compression and enhancement methods in a series
+of applications. The blind IQA, which assesses the quality of in-the-wild
+images containing complex authentic distortions without reference images, poses
+greater challenges. Existing methods are limited to modeling a uniform
+distribution with local patches and are bothered by the gap between low and
+high-level visions (caused by widely adopted pre-trained classification
+networks). In this paper, we propose a novel IQA method called diffusion
+priors-based IQA (DP-IQA), which leverages the prior knowledge from the
+pre-trained diffusion model with its excellent powers to bridge semantic gaps
+in the perception of the visual quality of images. Specifically, we use
+pre-trained stable diffusion as the backbone, extract multi-level features from
+the denoising U-Net during the upsampling process at a specified timestep, and
+decode them to estimate the image quality score. The text and image adapters
+are adopted to mitigate the domain gap for downstream tasks and correct the
+information loss caused by the variational autoencoder bottleneck. Finally, we
+distill the knowledge in the above model into a CNN-based student model,
+significantly reducing the parameter to enhance applicability, with the student
+model performing similarly or even better than the teacher model surprisingly.
+Experimental results demonstrate that our DP-IQA achieves state-of-the-art
+results on various in-the-wild datasets with better generalization capability,
+which shows the superiority of our method in global modeling and utilizing the
+hierarchical feature clues of diffusion for evaluating image quality.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ DiffPhysBA: Diffusion-based Physical Backdoor Attack against Person
+  Re-Identification in Real-World 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.19990v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.19990v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Wenli Sun, Xinyang Jiang, Dongsheng Li, Cairong Zhao
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Person Re-Identification (ReID) systems pose a significant security risk from
+backdoor attacks, allowing adversaries to evade tracking or impersonate others.
+Beyond recognizing this issue, we investigate how backdoor attacks can be
+deployed in real-world scenarios, where a ReID model is typically trained on
+data collected in the digital domain and then deployed in a physical
+environment. This attack scenario requires an attack flow that embeds backdoor
+triggers in the digital domain realistically enough to also activate the buried
+backdoor in person ReID models in the physical domain. This paper realizes this
+attack flow by leveraging a diffusion model to generate realistic accessories
+on pedestrian images (e.g., bags, hats, etc.) as backdoor triggers. However,
+the noticeable domain gap between the triggers generated by the off-the-shelf
+diffusion model and their physical counterparts results in a low attack success
+rate. Therefore, we introduce a novel diffusion-based physical backdoor attack
+(DiffPhysBA) method that adopts a training-free similarity-guided sampling
+process to enhance the resemblance between generated and physical triggers.
+Consequently, DiffPhysBA can generate realistic attributes as semantic-level
+triggers in the digital domain and provides higher physical ASR compared to the
+direct paste method by 25.6% on the real-world test set. Through evaluations on
+newly proposed real-world and synthetic ReID test sets, DiffPhysBA demonstrates
+an impressive success rate exceeding 90% in both the digital and physical
+domains. Notably, it excels in digital stealth metrics and can effectively
+evade state-of-the-art defense methods.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ PLA4D: Pixel-Level Alignments for Text-to-4D Gaussian Splatting 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.19957v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.19957v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Qiaowei Miao, Yawei Luo, Yi Yang
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  As text-conditioned diffusion models (DMs) achieve breakthroughs in image,
+video, and 3D generation, the research community's focus has shifted to the
+more challenging task of text-to-4D synthesis, which introduces a temporal
+dimension to generate dynamic 3D objects. In this context, we identify Score
+Distillation Sampling (SDS), a widely used technique for text-to-3D synthesis,
+as a significant hindrance to text-to-4D performance due to its Janus-faced and
+texture-unrealistic problems coupled with high computational costs. In this
+paper, we propose \textbf{P}ixel-\textbf{L}evel \textbf{A}lignments for
+Text-to-\textbf{4D} Gaussian Splatting (\textbf{PLA4D}), a novel method that
+utilizes text-to-video frames as explicit pixel alignment targets to generate
+static 3D objects and inject motion into them. Specifically, we introduce Focal
+Alignment to calibrate camera poses for rendering and GS-Mesh Contrastive
+Learning to distill geometry priors from rendered image contrasts at the pixel
+level. Additionally, we develop Motion Alignment using a deformation network to
+drive changes in Gaussians and implement Reference Refinement for smooth 4D
+object surfaces. These techniques enable 4D Gaussian Splatting to align
+geometry, texture, and motion with generated videos at the pixel level.
+Compared to previous methods, PLA4D produces synthesized outputs with better
+texture details in less time and effectively mitigates the Janus-faced problem.
+PLA4D is fully implemented using open-source models, offering an accessible,
+user-friendly, and promising direction for 4D digital content creation. Our
+project page:
+\href{https://github.com/MiaoQiaowei/PLA4D.github.io}{https://github.com/MiaoQiaowei/PLA4D.github.io}.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Hyper-<span class="highlight-title">Transformer</span> for Amodal Completion 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.19949v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.19949v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Jianxiong Gao, Xuelin Qian, Longfei Liang, Junwei Han, Yanwei Fu
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Amodal object completion is a complex task that involves predicting the
+invisible parts of an object based on visible segments and background
+information. Learning shape priors is crucial for effective amodal completion,
+but traditional methods often rely on two-stage processes or additional
+information, leading to inefficiencies and potential error accumulation. To
+address these shortcomings, we introduce a novel framework named the
+Hyper-Transformer Amodal Network (H-TAN). This framework utilizes a hyper
+transformer equipped with a dynamic convolution head to directly learn shape
+priors and accurately predict amodal masks. Specifically, H-TAN uses a
+dual-branch structure to extract multi-scale features from both images and
+masks. The multi-scale features from the image branch guide the hyper
+transformer in learning shape priors and in generating the weights for dynamic
+convolution tailored to each instance. The dynamic convolution head then uses
+the features from the mask branch to predict precise amodal masks. We
+extensively evaluate our model on three benchmark datasets: KINS, COCOA-cls,
+and D2SA, where H-TAN demonstrated superior performance compared to existing
+methods. Additional experiments validate the effectiveness and stability of the
+novel hyper transformer in our framework.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Multi-View People Detection in Large Scenes via Supervised View-Wise
+  Contribution Weighting <span class="chip">AAAI 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.19943v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.19943v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Qi Zhang, Yunfei Gong, Daijie Chen, Antoni B. Chan, Hui Huang
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Recent deep learning-based multi-view people detection (MVD) methods have
+shown promising results on existing datasets. However, current methods are
+mainly trained and evaluated on small, single scenes with a limited number of
+multi-view frames and fixed camera views. As a result, these methods may not be
+practical for detecting people in larger, more complex scenes with severe
+occlusions and camera calibration errors. This paper focuses on improving
+multi-view people detection by developing a supervised view-wise contribution
+weighting approach that better fuses multi-camera information under large
+scenes. Besides, a large synthetic dataset is adopted to enhance the model's
+generalization ability and enable more practical evaluation and comparison. The
+model's performance on new testing scenes is further improved with a simple
+domain adaptation technique. Experimental results demonstrate the effectiveness
+of our approach in achieving promising cross-scene multi-view people detection
+performance. See code here: https://vcc.tech/research/2024/MVD.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>AAAI 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Exploring Diffusion Models' Corruption Stage in Few-Shot Fine-tuning and
+  Mitigating with Bayesian Neural Networks 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.19931v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.19931v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Xiaoyu Wu, Jiaru Zhang, Yang Hua, Bohan Lyu, Hao Wang, Tao Song, Haibing Guan
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Few-shot fine-tuning of Diffusion Models (DMs) is a key advancement,
+significantly reducing training costs and enabling personalized AI
+applications. However, we explore the training dynamics of DMs and observe an
+unanticipated phenomenon: during the training process, image fidelity initially
+improves, then unexpectedly deteriorates with the emergence of noisy patterns,
+only to recover later with severe overfitting. We term the stage with generated
+noisy patterns as corruption stage. To understand this corruption stage, we
+begin by theoretically modeling the one-shot fine-tuning scenario, and then
+extend this modeling to more general cases. Through this modeling, we identify
+the primary cause of this corruption stage: a narrowed learning distribution
+inherent in the nature of few-shot fine-tuning. To tackle this, we apply
+Bayesian Neural Networks (BNNs) on DMs with variational inference to implicitly
+broaden the learned distribution, and present that the learning target of the
+BNNs can be naturally regarded as an expectation of the diffusion loss and a
+further regularization with the pretrained DMs. This approach is highly
+compatible with current few-shot fine-tuning methods in DMs and does not
+introduce any extra inference costs. Experimental results demonstrate that our
+method significantly mitigates corruption, and improves the fidelity, quality
+and diversity of the generated images in both object-driven and subject-driven
+generation tasks.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Preprint. Under review</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ MCDS-VSS: Moving Camera Dynamic Scene Video Semantic Segmentation by
+  Filtering with <span class="highlight-title">Self-Supervised</span> Geometry and Motion 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.19921v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.19921v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Angel Villar-Corrales, Moritz Austermann, Sven Behnke
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Autonomous systems, such as self-driving cars, rely on reliable semantic
+environment perception for decision making. Despite great advances in video
+semantic segmentation, existing approaches ignore important inductive biases
+and lack structured and interpretable internal representations. In this work,
+we propose MCDS-VSS, a structured filter model that learns in a self-supervised
+manner to estimate scene geometry and ego-motion of the camera, while also
+estimating the motion of external objects. Our model leverages these
+representations to improve the temporal consistency of semantic segmentation
+without sacrificing segmentation accuracy. MCDS-VSS follows a prediction-fusion
+approach in which scene geometry and camera motion are first used to compensate
+for ego-motion, then residual flow is used to compensate motion of dynamic
+objects, and finally the predicted scene features are fused with the current
+features to obtain a temporally consistent scene segmentation. Our model parses
+automotive scenes into multiple decoupled interpretable representations such as
+scene geometry, ego-motion, and object motion. Quantitative evaluation shows
+that MCDS-VSS achieves superior temporal consistency on video sequences while
+retaining competitive segmentation performance.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Multimodal Cross-Domain Few-Shot Learning for Egocentric Action
+  Recognition 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.19917v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.19917v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Masashi Hatano, Ryo Hachiuma, Ryo Fuji, Hideo Saito
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  We address a novel cross-domain few-shot learning task (CD-FSL) with
+multimodal input and unlabeled target data for egocentric action recognition.
+This paper simultaneously tackles two critical challenges associated with
+egocentric action recognition in CD-FSL settings: (1) the extreme domain gap in
+egocentric videos (\eg, daily life vs. industrial domain) and (2) the
+computational cost for real-world applications. We propose MM-CDFSL, a
+domain-adaptive and computationally efficient approach designed to enhance
+adaptability to the target domain and improve inference speed. To address the
+first challenge, we propose the incorporation of multimodal distillation into
+the student RGB model using teacher models. Each teacher model is trained
+independently on source and target data for its respective modality. Leveraging
+only unlabeled target data during multimodal distillation enhances the student
+model's adaptability to the target domain. We further introduce ensemble masked
+inference, a technique that reduces the number of input tokens through masking.
+In this approach, ensemble prediction mitigates the performance degradation
+caused by masking, effectively addressing the second issue. Our approach
+outperformed the state-of-the-art CD-FSL approaches with a substantial margin
+on multiple egocentric datasets, improving by an average of 6.12/6.10 points
+for 1-shot/5-shot settings while achieving $2.2$ times faster inference speed.
+Project page: https://masashi-hatano.github.io/MM-CDFSL/
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Towards RGB-NIR Cross-modality Image Registration and Beyond 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.19914v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.19914v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Huadong Li, Shichao Dong, Jin Wang, Rong Fu, Minhao Jing, Jiajun Liang, Haoqiang Fan, Renhe Ji
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  This paper focuses on the area of RGB(visible)-NIR(near-infrared)
+cross-modality image registration, which is crucial for many downstream vision
+tasks to fully leverage the complementary information present in visible and
+infrared images. In this field, researchers face two primary challenges - the
+absence of a correctly-annotated benchmark with viewpoint variations for
+evaluating RGB-NIR cross-modality registration methods and the problem of
+inconsistent local features caused by the appearance discrepancy between
+RGB-NIR cross-modality images. To address these challenges, we first present
+the RGB-NIR Image Registration (RGB-NIR-IRegis) benchmark, which, for the first
+time, enables fair and comprehensive evaluations for the task of RGB-NIR
+cross-modality image registration. Evaluations of previous methods highlight
+the significant challenges posed by our RGB-NIR-IRegis benchmark, especially on
+RGB-NIR image pairs with viewpoint variations. To analyze the causes of the
+unsatisfying performance, we then design several metrics to reveal the toxic
+impact of inconsistent local features between visible and infrared images on
+the model performance. This further motivates us to develop a baseline method
+named Semantic Guidance Transformer (SGFormer), which utilizes high-level
+semantic guidance to mitigate the negative impact of local inconsistent
+features. Despite the simplicity of our motivation, extensive experimental
+results show the effectiveness of our method.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>18 pages, 7 figures</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Open-Set Domain Adaptation for Semantic Segmentation <span class="chip">CVPR 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.19899v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.19899v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Seun-An Choe, Ah-Hyung Shin, Keon-Hee Park, Jinwoo Choi, Gyeong-Moon Park
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Unsupervised domain adaptation (UDA) for semantic segmentation aims to
+transfer the pixel-wise knowledge from the labeled source domain to the
+unlabeled target domain. However, current UDA methods typically assume a shared
+label space between source and target, limiting their applicability in
+real-world scenarios where novel categories may emerge in the target domain. In
+this paper, we introduce Open-Set Domain Adaptation for Semantic Segmentation
+(OSDA-SS) for the first time, where the target domain includes unknown classes.
+We identify two major problems in the OSDA-SS scenario as follows: 1) the
+existing UDA methods struggle to predict the exact boundary of the unknown
+classes, and 2) they fail to accurately predict the shape of the unknown
+classes. To address these issues, we propose Boundary and Unknown Shape-Aware
+open-set domain adaptation, coined BUS. Our BUS can accurately discern the
+boundaries between known and unknown classes in a contrastive manner using a
+novel dilation-erosion-based contrastive loss. In addition, we propose
+OpenReMix, a new domain mixing augmentation method that guides our model to
+effectively learn domain and size-invariant features for improving the shape
+detection of the known and unknown classes. Through extensive experiments, we
+demonstrate that our proposed BUS effectively detects unknown classes in the
+challenging OSDA-SS scenario compared to the previous methods by a large
+margin. The code is available at https://github.com/KHU-AGI/BUS.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>14 pages, 5 figures, 13 tables, CVPR 2024 Poster</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ PixOOD: Pixel-Level Out-of-Distribution Detection <span class="chip">ECCV 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.19882v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.19882v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Tomáš Vojíř, Jan Šochman, Jiří Matas
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  We propose a dense image prediction out-of-distribution detection algorithm,
+called PixOOD, which does not require training on samples of anomalous data and
+is not designed for a specific application which avoids traditional training
+biases. In order to model the complex intra-class variability of the
+in-distribution data at the pixel level, we propose an online data condensation
+algorithm which is more robust than standard K-means and is easily trainable
+through SGD. We evaluate PixOOD on a wide range of problems. It achieved
+state-of-the-art results on four out of seven datasets, while being competitive
+on the rest. The source code is available at https://github.com/vojirt/PixOOD.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>under review at ECCV 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ IReNe: Instant Recoloring in Neural Radiance Fields 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.19876v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.19876v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Alessio Mazzucchelli, Adrian Garcia-Garcia, Elena Garces, Fernando Rivas-Manzaneque, Francesc Moreno-Noguer, Adrian Penate-Sanchez
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Advances in NERFs have allowed for 3D scene reconstructions and novel view
+synthesis. Yet, efficiently editing these representations while retaining
+photorealism is an emerging challenge. Recent methods face three primary
+limitations: they're slow for interactive use, lack precision at object
+boundaries, and struggle to ensure multi-view consistency. We introduce IReNe
+to address these limitations, enabling swift, near real-time color editing in
+NeRF. Leveraging a pre-trained NeRF model and a single training image with
+user-applied color edits, IReNe swiftly adjusts network parameters in seconds.
+This adjustment allows the model to generate new scene views, accurately
+representing the color changes from the training image while also controlling
+object boundaries and view-specific effects. Object boundary control is
+achieved by integrating a trainable segmentation module into the model. The
+process gains efficiency by retraining only the weights of the last network
+layer. We observed that neurons in this layer can be classified into those
+responsible for view-dependent appearance and those contributing to diffuse
+appearance. We introduce an automated classification approach to identify these
+neuron types and exclusively fine-tune the weights of the diffuse neurons. This
+further accelerates training and ensures consistent color edits across
+different views. A thorough validation on a new dataset, with edited object
+colors, shows significant quantitative and qualitative advancements over
+competitors, accelerating speeds by 5x to 500x.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Hierarchical Object-Centric Learning with Capsule Networks 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.19861v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.19861v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Riccardo Renzulli
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Capsule networks (CapsNets) were introduced to address convolutional neural
+networks limitations, learning object-centric representations that are more
+robust, pose-aware, and interpretable. They organize neurons into groups called
+capsules, where each capsule encodes the instantiation parameters of an object
+or one of its parts. Moreover, a routing algorithm connects capsules in
+different layers, thereby capturing hierarchical part-whole relationships in
+the data.
+  This thesis investigates the intriguing aspects of CapsNets and focuses on
+three key questions to unlock their full potential. First, we explore the
+effectiveness of the routing algorithm, particularly in small-sized networks.
+We propose a novel method that anneals the number of routing iterations during
+training, enhancing performance in architectures with fewer parameters.
+  Secondly, we investigate methods to extract more effective first-layer
+capsules, also known as primary capsules. By exploiting pruned backbones, we
+aim to improve computational efficiency by reducing the number of capsules
+while achieving high generalization. This approach reduces CapsNets memory
+requirements and computational effort.
+  Third, we explore part-relationship learning in CapsNets. Through extensive
+research, we demonstrate that capsules with low entropy can extract more
+concise and discriminative part-whole relationships compared to traditional
+capsule networks, even with reasonable network sizes.
+  Lastly, we showcase how CapsNets can be utilized in real-world applications,
+including autonomous localization of unmanned aerial vehicles, quaternion-based
+rotations prediction in synthetic datasets, and lung nodule segmentation in
+biomedical imaging.
+  The findings presented in this thesis contribute to a deeper understanding of
+CapsNets and highlight their potential to address complex computer vision
+challenges.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Updated version of my PhD thesis (Nov 2023), with fixed typos. Will
+  keep updated as new typos are discovered!</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ RTGen: Generating Region-Text Pairs for Open-Vocabulary Object Detection 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.19854v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.19854v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Fangyi Chen, Han Zhang, Zhantao Yang, Hao Chen, Kai Hu, Marios Savvides
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Open-vocabulary object detection (OVD) requires solid modeling of the
+region-semantic relationship, which could be learned from massive region-text
+pairs. However, such data is limited in practice due to significant annotation
+costs. In this work, we propose RTGen to generate scalable open-vocabulary
+region-text pairs and demonstrate its capability to boost the performance of
+open-vocabulary object detection. RTGen includes both text-to-region and
+region-to-text generation processes on scalable image-caption data. The
+text-to-region generation is powered by image inpainting, directed by our
+proposed scene-aware inpainting guider for overall layout harmony. For
+region-to-text generation, we perform multiple region-level image captioning
+with various prompts and select the best matching text according to CLIP
+similarity. To facilitate detection training on region-text pairs, we also
+introduce a localization-aware region-text contrastive loss that learns object
+proposals tailored with different localization qualities. Extensive experiments
+demonstrate that our RTGen can serve as a scalable, semantically rich, and
+effective source for open-vocabulary object detection and continue to improve
+the model performance when more data is utilized, delivering superior
+performance compared to the existing state-of-the-art methods.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Technical report</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ KITRO: Refining Human Mesh by 2D Clues and Kinematic-tree Rotation <span class="chip">CVPR24</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.19833v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.19833v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Fengyuan Yang, Kerui Gu, Angela Yao
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  2D keypoints are commonly used as an additional cue to refine estimated 3D
+human meshes. Current methods optimize the pose and shape parameters with a
+reprojection loss on the provided 2D keypoints. Such an approach, while simple
+and intuitive, has limited effectiveness because the optimal solution is hard
+to find in ambiguous parameter space and may sacrifice depth. Additionally,
+divergent gradients from distal joints complicate and deviate the refinement of
+proximal joints in the kinematic chain. To address these, we introduce
+Kinematic-Tree Rotation (KITRO), a novel mesh refinement strategy that
+explicitly models depth and human kinematic-tree structure. KITRO treats
+refinement from a bone-wise perspective. Unlike previous methods which perform
+gradient-based optimizations, our method calculates bone directions in closed
+form. By accounting for the 2D pose, bone length, and parent joint's depth, the
+calculation results in two possible directions for each child joint. We then
+use a decision tree to trace binary choices for all bones along the human
+skeleton's kinematic-tree to select the most probable hypothesis. Our
+experiments across various datasets and baseline models demonstrate that KITRO
+significantly improves 3D joint estimation accuracy and achieves an ideal 2D
+fit simultaneously. Our code available at: https://github.com/MartaYang/KITRO.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted by CVPR24</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Improving Object Detector Training on Synthetic Data by Starting With a
+  Strong Baseline Methodology <span class="chip">SP</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.19822v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.19822v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Frank A. Ruis, Alma M. Liezenga, Friso G. Heslinga, Luca Ballan, Thijs A. Eker, Richard J. M. den Hollander, Martin C. van Leeuwen, Judith Dijk, Wyke Huizinga
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Collecting and annotating real-world data for the development of object
+detection models is a time-consuming and expensive process. In the military
+domain in particular, data collection can also be dangerous or infeasible.
+Training models on synthetic data may provide a solution for cases where access
+to real-world training data is restricted. However, bridging the reality gap
+between synthetic and real data remains a challenge. Existing methods usually
+build on top of baseline Convolutional Neural Network (CNN) models that have
+been shown to perform well when trained on real data, but have limited ability
+to perform well when trained on synthetic data. For example, some architectures
+allow for fine-tuning with the expectation of large quantities of training data
+and are prone to overfitting on synthetic data. Related work usually ignores
+various best practices from object detection on real data, e.g. by training on
+synthetic data from a single environment with relatively little variation. In
+this paper we propose a methodology for improving the performance of a
+pre-trained object detector when training on synthetic data. Our approach
+focuses on extracting the salient information from synthetic data without
+forgetting useful features learned from pre-training on real images. Based on
+the state of the art, we incorporate data augmentation methods and a
+Transformer backbone. Besides reaching relatively strong performance without
+any specialized synthetic data transfer methods, we show that our methods
+improve the state of the art on synthetic data trained object detection for the
+RarePlanes and DGTA-VisDrone datasets, and reach near-perfect performance on an
+in-house vehicle detection dataset.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Submitted to and presented at SPIE Defense + Commercial Sensing 2024,
+  13 pages, 4 figures, 3 tables</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Gated Fields: Learning Scene Reconstruction from Gated Videos 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.19819v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.19819v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Andrea Ramazzina, Stefanie Walz, Pragyan Dahal, Mario Bijelic, Felix Heide
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Reconstructing outdoor 3D scenes from temporal observations is a challenge
+that recent work on neural fields has offered a new avenue for. However,
+existing methods that recover scene properties, such as geometry, appearance,
+or radiance, solely from RGB captures often fail when handling poorly-lit or
+texture-deficient regions. Similarly, recovering scenes with scanning LiDAR
+sensors is also difficult due to their low angular sampling rate which makes
+recovering expansive real-world scenes difficult. Tackling these gaps, we
+introduce Gated Fields - a neural scene reconstruction method that utilizes
+active gated video sequences. To this end, we propose a neural rendering
+approach that seamlessly incorporates time-gated capture and illumination. Our
+method exploits the intrinsic depth cues in the gated videos, achieving precise
+and dense geometry reconstruction irrespective of ambient illumination
+conditions. We validate the method across day and night scenarios and find that
+Gated Fields compares favorably to RGB and LiDAR reconstruction methods. Our
+code and datasets are available at https://light.princeton.edu/gatedfields/.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ WebUOT-1M: Advancing Deep Underwater Object Tracking with A
+  Million-Scale Benchmark 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.19818v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.19818v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Chunhui Zhang, Li Liu, Guanjie Huang, Hao Wen, Xi Zhou, Yanfeng Wang
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Underwater object tracking (UOT) is a foundational task for identifying and
+tracing submerged entities in underwater video sequences. However, current UOT
+datasets suffer from limitations in scale, diversity of target categories and
+scenarios covered, hindering the training and evaluation of modern tracking
+algorithms. To bridge this gap, we take the first step and introduce WebUOT-1M,
+\ie, the largest public UOT benchmark to date, sourced from complex and
+realistic underwater environments. It comprises 1.1 million frames across 1,500
+video clips filtered from 408 target categories, largely surpassing previous
+UOT datasets, \eg, UVOT400. Through meticulous manual annotation and
+verification, we provide high-quality bounding boxes for underwater targets.
+Additionally, WebUOT-1M includes language prompts for video sequences,
+expanding its application areas, \eg, underwater vision-language tracking. Most
+existing trackers are tailored for open-air environments, leading to
+performance degradation when applied to UOT due to domain gaps. Retraining and
+fine-tuning these trackers are challenging due to sample imbalances and limited
+real-world underwater datasets. To tackle these challenges, we propose a novel
+omni-knowledge distillation framework based on WebUOT-1M, incorporating various
+strategies to guide the learning of the student Transformer. To the best of our
+knowledge, this framework is the first to effectively transfer open-air domain
+knowledge to the UOT model through knowledge distillation, as demonstrated by
+results on both existing UOT datasets and the newly proposed WebUOT-1M.
+Furthermore, we comprehensively evaluate WebUOT-1M using 30 deep trackers,
+showcasing its value as a benchmark for UOT research by presenting new
+challenges and opportunities for future studies. The complete dataset, codes
+and tracking results, will be made publicly available.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>GitHub project:
+  https://github.com/983632847/Awesome-Multimodal-Object-Tracking</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Performance Examination of Symbolic Aggregate Approximation in IoT
+  Applications 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.19817v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.19817v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Suzana Veljanovska, Hans Dermot Doran
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Symbolic Aggregate approXimation (SAX) is a common dimensionality reduction
+approach for time-series data which has been employed in a variety of domains,
+including classification and anomaly detection in time-series data. Domains
+also include shape recognition where the shape outline is converted into
+time-series data forinstance epoch classification of archived arrowheads. In
+this paper we propose a dimensionality reduction and shape recognition approach
+based on the SAX algorithm, an application which requires responses on cost
+efficient, IoT-like, platforms. The challenge is largely dealing with the
+computational expense of the SAX algorithm in IoT-like applications, from
+simple time-series dimension reduction through shape recognition. The approach
+is based on lowering the dimensional space while capturing and preserving the
+most representative features of the shape. We present three scenarios of
+increasing computational complexity backing up our statements with measurement
+of performance characteristics
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Embedded World Conference, Nuremberg, 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Video Question Answering for People with Visual Impairments Using an
+  Egocentric 360-Degree Camera <span class="chip">CVPR2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.19794v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.19794v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Inpyo Song, Minjun Joo, Joonhyung Kwon, Jangwon Lee
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  This paper addresses the daily challenges encountered by visually impaired
+individuals, such as limited access to information, navigation difficulties,
+and barriers to social interaction. To alleviate these challenges, we introduce
+a novel visual question answering dataset. Our dataset offers two significant
+advancements over previous datasets: Firstly, it features videos captured using
+a 360-degree egocentric wearable camera, enabling observation of the entire
+surroundings, departing from the static image-centric nature of prior datasets.
+Secondly, unlike datasets centered on singular challenges, ours addresses
+multiple real-life obstacles simultaneously through an innovative
+visual-question answering framework. We validate our dataset using various
+state-of-the-art VideoQA methods and diverse metrics. Results indicate that
+while progress has been made, satisfactory performance levels for AI-powered
+assistive services remain elusive for visually impaired individuals.
+Additionally, our evaluation highlights the distinctive features of the
+proposed dataset, featuring ego-motion in videos captured via 360-degree
+cameras across varied scenarios.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>CVPR2024 EgoVis Workshop</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Instruction-Guided Visual Masking 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.19783v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.19783v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Jinliang Zheng, Jianxiong Li, Sijie Cheng, Yinan Zheng, Jiaming Li, Jihao Liu, Yu Liu, Jingjing Liu, Xianyuan Zhan
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Instruction following is crucial in contemporary LLM. However, when extended
+to multimodal setting, it often suffers from misalignment between specific
+textual instruction and targeted local region of an image. To achieve more
+accurate and nuanced multimodal instruction following, we introduce
+Instruction-guided Visual Masking (IVM), a new versatile visual grounding model
+that is compatible with diverse multimodal models, such as LMM and robot model.
+By constructing visual masks for instruction-irrelevant regions, IVM-enhanced
+multimodal models can effectively focus on task-relevant image regions to
+better align with complex instructions. Specifically, we design a visual
+masking data generation pipeline and create an IVM-Mix-1M dataset with 1
+million image-instruction pairs. We further introduce a new learning technique,
+Discriminator Weighted Supervised Learning (DWSL) for preferential IVM training
+that prioritizes high-quality data samples. Experimental results on generic
+multimodal tasks such as VQA and embodied robotic control demonstrate the
+versatility of IVM, which as a plug-and-play tool, significantly boosts the
+performance of diverse multimodal models, yielding new state-of-the-art results
+across challenging multimodal benchmarks. Code is available at
+https://github.com/2toinf/IVM.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>preprint, 21 pages</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Puff-Net: Efficient Style Transfer with Pure Content and Style Feature
+  Fusion Network <span class="chip">CVPR 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.19775v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.19775v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Sizhe Zheng, Pan Gao, Peng Zhou, Jie Qin
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Style transfer aims to render an image with the artistic features of a style
+image, while maintaining the original structure. Various methods have been put
+forward for this task, but some challenges still exist. For instance, it is
+difficult for CNN-based methods to handle global information and long-range
+dependencies between input images, for which transformer-based methods have
+been proposed. Although transformers can better model the relationship between
+content and style images, they require high-cost hardware and time-consuming
+inference. To address these issues, we design a novel transformer model that
+includes only the encoder, thus significantly reducing the computational cost.
+In addition, we also find that existing style transfer methods may lead to
+images under-stylied or missing content. In order to achieve better
+stylization, we design a content feature extractor and a style feature
+extractor, based on which pure content and style images can be fed to the
+transformer. Finally, we propose a novel network termed Puff-Net, i.e., pure
+content and style feature fusion network. Through qualitative and quantitative
+experiments, we demonstrate the advantages of our model compared to
+state-of-the-art ones in the literature.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>11 pages, 11 figures, to be published in IEEE Conference on Computer
+  Vision and Pattern Recognition (CVPR 2024)</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ VQA Training Sets are Self-play Environments for Generating Few-shot
+  Pools 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.19773v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.19773v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Tautvydas Misiunas, Hassan Mansoor, Jasper Uijlings, Oriana Riva, Victor Carbune
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Large-language models and large-vision models are increasingly capable of
+solving compositional reasoning tasks, as measured by breakthroughs in
+visual-question answering benchmarks. However, state-of-the-art solutions often
+involve careful construction of large pre-training and fine-tuning datasets,
+which can be expensive. The use of external tools, whether other ML models,
+search engines, or APIs, can significantly improve performance by breaking down
+high-level reasoning questions into sub-questions that are answerable by
+individual tools, but this approach has similar dataset construction costs to
+teach fine-tuned models how to use the available tools. We propose a technique
+in which existing training sets can be directly used for constructing
+computational environments with task metrics as rewards. This enables a model
+to autonomously teach itself to use itself or another model as a tool. By doing
+so, we augment training sets by integrating external signals. The proposed
+method starts with zero-shot prompts and iteratively refines them by selecting
+few-shot examples that maximize the task metric on the training set. Our
+experiments showcase how Gemini learns how to use itself, or another smaller
+and specialized model such as ScreenAI, to iteratively improve performance on
+training sets. Our approach successfully generalizes and improves upon zeroshot
+performance on charts, infographics, and document visual question-answering
+datasets
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ All-In-One Medical Image Restoration via Task-Adaptive Routing <span class="chip">MICCAI 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.19769v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.19769v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Zhiwen Yang, Haowei Chen, Ziniu Qian, Yang Yi, Hui Zhang, Dan Zhao, Bingzheng Wei, Yan Xu
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Although single-task medical image restoration (MedIR) has witnessed
+remarkable success, the limited generalizability of these methods poses a
+substantial obstacle to wider application. In this paper, we focus on the task
+of all-in-one medical image restoration, aiming to address multiple distinct
+MedIR tasks with a single universal model. Nonetheless, due to significant
+differences between different MedIR tasks, training a universal model often
+encounters task interference issues, where different tasks with shared
+parameters may conflict with each other in the gradient update direction. This
+task interference leads to deviation of the model update direction from the
+optimal path, thereby affecting the model's performance. To tackle this issue,
+we propose a task-adaptive routing strategy, allowing conflicting tasks to
+select different network paths in spatial and channel dimensions, thereby
+mitigating task interference. Experimental results demonstrate that our
+proposed \textbf{A}ll-in-one \textbf{M}edical \textbf{I}mage
+\textbf{R}estoration (\textbf{AMIR}) network achieves state-of-the-art
+performance in three MedIR tasks: MRI super-resolution, CT denoising, and PET
+synthesis, both in single-task and all-in-one settings. The code and data will
+be available at
+\href{https://github.com/Yaziwel/All-In-One-Medical-Image-Restoration-via-Task-Adaptive-Routing.git}{https://github.com/Yaziwel/AMIR}.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>This article has been early accepted by MICCAI 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Towards Unified Multi-granularity Text Detection with Interactive
+  Attention <span class="chip">ICML 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.19765v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.19765v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Xingyu Wan, Chengquan Zhang, Pengyuan Lyu, Sen Fan, Zihan Ni, Kun Yao, Errui Ding, Jingdong Wang
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Existing OCR engines or document image analysis systems typically rely on
+training separate models for text detection in varying scenarios and
+granularities, leading to significant computational complexity and resource
+demands. In this paper, we introduce "Detect Any Text" (DAT), an advanced
+paradigm that seamlessly unifies scene text detection, layout analysis, and
+document page detection into a cohesive, end-to-end model. This design enables
+DAT to efficiently manage text instances at different granularities, including
+*word*, *line*, *paragraph* and *page*. A pivotal innovation in DAT is the
+across-granularity interactive attention module, which significantly enhances
+the representation learning of text instances at varying granularities by
+correlating structural information across different text queries. As a result,
+it enables the model to achieve mutually beneficial detection performances
+across multiple text granularities. Additionally, a prompt-based segmentation
+module refines detection outcomes for texts of arbitrary curvature and complex
+layouts, thereby improving DAT's accuracy and expanding its real-world
+applicability. Experimental results demonstrate that DAT achieves
+state-of-the-art performances across a variety of text-related benchmarks,
+including multi-oriented/arbitrarily-shaped scene text detection, document
+layout analysis and page detection tasks.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>ICML 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Mitigating annotation shift in cancer classification using single image
+  generative models <span class="chip">SP</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.19754v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.19754v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Marta Buetas Arcas, Richard Osuala, Karim Lekadir, Oliver Díaz
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Artificial Intelligence (AI) has emerged as a valuable tool for assisting
+radiologists in breast cancer detection and diagnosis. However, the success of
+AI applications in this domain is restricted by the quantity and quality of
+available data, posing challenges due to limited and costly data annotation
+procedures that often lead to annotation shifts. This study simulates, analyses
+and mitigates annotation shifts in cancer classification in the breast
+mammography domain. First, a high-accuracy cancer risk prediction model is
+developed, which effectively distinguishes benign from malignant lesions. Next,
+model performance is used to quantify the impact of annotation shift. We
+uncover a substantial impact of annotation shift on multiclass classification
+performance particularly for malignant lesions. We thus propose a training data
+augmentation approach based on single-image generative models for the affected
+class, requiring as few as four in-domain annotations to considerably mitigate
+annotation shift, while also addressing dataset imbalance. Lastly, we further
+increase performance by proposing and validating an ensemble architecture based
+on multiple models trained under different data augmentation regimes. Our study
+offers key insights into annotation shift in deep learning breast cancer
+classification and explores the potential of single-image generative models to
+overcome domain shift challenges.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Preprint of paper accepted at SPIE IWBI 2024 Conference</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ HQ-DiT: Efficient Diffusion <span class="highlight-title">Transformer</span> with FP4 Hybrid Quantization 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.19751v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.19751v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Wenxuan Liu, Saiqian Zhang
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Diffusion Transformers (DiTs) have recently gained substantial attention in
+both industrial and academic fields for their superior visual generation
+capabilities, outperforming traditional diffusion models that use U-Net.
+However,the enhanced performance of DiTs also comes with high parameter counts
+and implementation costs, seriously restricting their use on resource-limited
+devices such as mobile phones. To address these challenges, we introduce the
+Hybrid Floating-point Quantization for DiT(HQ-DiT), an efficient post-training
+quantization method that utilizes 4-bit floating-point (FP) precision on both
+weights and activations for DiT inference. Compared to fixed-point quantization
+(e.g., INT8), FP quantization, complemented by our proposed clipping range
+selection mechanism, naturally aligns with the data distribution within DiT,
+resulting in a minimal quantization error. Furthermore, HQ-DiT also implements
+a universal identity mathematical transform to mitigate the serious
+quantization error caused by the outliers. The experimental results demonstrate
+that DiT can achieve extremely low-precision quantization (i.e., 4 bits) with
+negligible impact on performance. Our approach marks the first instance where
+both weights and activations in DiTs are quantized to just 4 bits, with only a
+0.12 increase in sFID on ImageNet.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ DenseSeg: Joint Learning for Semantic Segmentation and Landmark
+  Detection Using Dense Image-to-Shape Representation 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.19746v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.19746v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Ron Keuth, Lasse Hansen, Maren Balks, Ronja Jäger, Anne-Nele Schröder, Ludger Tüshaus, Mattias Heinrich
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Purpose: Semantic segmentation and landmark detection are fundamental tasks
+of medical image processing, facilitating further analysis of anatomical
+objects. Although deep learning-based pixel-wise classification has set a
+new-state-of-the-art for segmentation, it falls short in landmark detection, a
+strength of shape-based approaches.
+  Methods: In this work, we propose a dense image-to-shape representation that
+enables the joint learning of landmarks and semantic segmentation by employing
+a fully convolutional architecture. Our method intuitively allows the
+extraction of arbitrary landmarks due to its representation of anatomical
+correspondences. We benchmark our method against the state-of-the-art for
+semantic segmentation (nnUNet), a shape-based approach employing geometric deep
+learning and a CNN-based method for landmark detection.
+  Results: We evaluate our method on two medical dataset: one common benchmark
+featuring the lungs, heart, and clavicle from thorax X-rays, and another with
+17 different bones in the paediatric wrist. While our method is on pair with
+the landmark detection baseline in the thorax setting (error in mm of
+$2.6\pm0.9$ vs $2.7\pm0.9$), it substantially surpassed it in the more complex
+wrist setting ($1.1\pm0.6$ vs $1.9\pm0.5$).
+  Conclusion: We demonstrate that dense geometric shape representation is
+beneficial for challenging landmark detection tasks and outperforms previous
+state-of-the-art using heatmap regression. While it does not require explicit
+training on the landmarks themselves, allowing for the addition of new
+landmarks without necessitating retraining.}
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ GaussianPrediction: Dynamic 3D Gaussian Prediction for Motion
+  Extrapolation and Free View Synthesis <span class="chip">SIGGRAPH 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.19745v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.19745v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Boming Zhao, Yuan Li, Ziyu Sun, Lin Zeng, Yujun Shen, Rui Ma, Yinda Zhang, Hujun Bao, Zhaopeng Cui
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Forecasting future scenarios in dynamic environments is essential for
+intelligent decision-making and navigation, a challenge yet to be fully
+realized in computer vision and robotics. Traditional approaches like video
+prediction and novel-view synthesis either lack the ability to forecast from
+arbitrary viewpoints or to predict temporal dynamics. In this paper, we
+introduce GaussianPrediction, a novel framework that empowers 3D Gaussian
+representations with dynamic scene modeling and future scenario synthesis in
+dynamic environments. GaussianPrediction can forecast future states from any
+viewpoint, using video observations of dynamic scenes. To this end, we first
+propose a 3D Gaussian canonical space with deformation modeling to capture the
+appearance and geometry of dynamic scenes, and integrate the lifecycle property
+into Gaussians for irreversible deformations. To make the prediction feasible
+and efficient, a concentric motion distillation approach is developed by
+distilling the scene motion with key points. Finally, a Graph Convolutional
+Network is employed to predict the motions of key points, enabling the
+rendering of photorealistic images of future scenarios. Our framework shows
+outstanding performance on both synthetic and real-world datasets,
+demonstrating its efficacy in predicting and rendering future environments.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted to SIGGRAPH 2024 Conference. Project Page:
+  https://zju3dv.github.io/gaussian-prediction/</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ May the Dance be with You: Dance Generation Framework for Non-Humanoids 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.19743v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.19743v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Hyemin Ahn
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  We hypothesize dance as a motion that forms a visual rhythm from music, where
+the visual rhythm can be perceived from an optical flow. If an agent can
+recognize the relationship between visual rhythm and music, it will be able to
+dance by generating a motion to create a visual rhythm that matches the music.
+Based on this, we propose a framework for any kind of non-humanoid agents to
+learn how to dance from human videos. Our framework works in two processes: (1)
+training a reward model which perceives the relationship between optical flow
+(visual rhythm) and music from human dance videos, (2) training the
+non-humanoid dancer based on that reward model, and reinforcement learning. Our
+reward model consists of two feature encoders for optical flow and music. They
+are trained based on contrastive learning which makes the higher similarity
+between concurrent optical flow and music features. With this reward model, the
+agent learns dancing by getting a higher reward when its action creates an
+optical flow whose feature has a higher similarity with the given music
+feature. Experiment results show that generated dance motion can align with the
+music beat properly, and user study result indicates that our framework is more
+preferred by humans compared to the baselines. To the best of our knowledge,
+our work of non-humanoid agents which learn dance from human videos is
+unprecedented. An example video can be found at https://youtu.be/dOUPvo-O3QY.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>13 pages, 6 Figures, Rejected at Neurips 2023</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Twin Deformable Point Convolutions for Point Cloud Semantic Segmentation
+  in Remote Sensing Scenes 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.19735v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.19735v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Yong-Qiang Mao, Hanbo Bi, Xuexue Li, Kaiqiang Chen, Zhirui Wang, Xian Sun, Kun Fu
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Thanks to the application of deep learning technology in point cloud
+processing of the remote sensing field, point cloud segmentation has become a
+research hotspot in recent years, which can be applied to real-world 3D, smart
+cities, and other fields. Although existing solutions have made unprecedented
+progress, they ignore the inherent characteristics of point clouds in remote
+sensing fields that are strictly arranged according to latitude, longitude, and
+altitude, which brings great convenience to the segmentation of point clouds in
+remote sensing fields. To consider this property cleverly, we propose novel
+convolution operators, termed Twin Deformable point Convolutions (TDConvs),
+which aim to achieve adaptive feature learning by learning deformable sampling
+points in the latitude-longitude plane and altitude direction, respectively.
+First, to model the characteristics of the latitude-longitude plane, we propose
+a Cylinder-wise Deformable point Convolution (CyDConv) operator, which
+generates a two-dimensional cylinder map by constructing a cylinder-like grid
+in the latitude-longitude direction. Furthermore, to better integrate the
+features of the latitude-longitude plane and the spatial geometric features, we
+perform a multi-scale fusion of the extracted latitude-longitude features and
+spatial geometric features, and realize it through the aggregation of adjacent
+point features of different scales. In addition, a Sphere-wise Deformable point
+Convolution (SpDConv) operator is introduced to adaptively offset the sampling
+points in three-dimensional space by constructing a sphere grid structure,
+aiming at modeling the characteristics in the altitude direction. Experiments
+on existing popular benchmarks conclude that our TDConvs achieve the best
+segmentation performance, surpassing the existing state-of-the-art methods.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Two Optimizers Are Better Than One: LLM Catalyst for Enhancing
+  Gradient-Based Optimization 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.19732v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.19732v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Zixian Guo, Ming Liu, Zhilong Ji, Jinfeng Bai, Yiwen Guo, Wangmeng Zuo
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Learning a skill generally relies on both practical experience by doer and
+insightful high-level guidance by instructor. Will this strategy also work well
+for solving complex non-convex optimization problems? Here, a common
+gradient-based optimizer acts like a disciplined doer, making locally optimal
+update at each step. Recent methods utilize large language models (LLMs) to
+optimize solutions for concrete problems by inferring from natural language
+instructions, akin to a high-level instructor. In this paper, we show that
+these two optimizers are complementary to each other, suggesting a
+collaborative optimization approach. The gradient-based optimizer and LLM-based
+optimizer are combined in an interleaved manner. We instruct LLMs using task
+descriptions and timely optimization trajectories recorded during
+gradient-based optimization. Inferred results from LLMs are used as restarting
+points for the next stage of gradient optimization. By leveraging both the
+locally rigorous gradient-based optimizer and the high-level deductive
+LLM-based optimizer, our combined optimization method consistently yields
+improvements over competitive baseline prompt tuning methods. Our results
+demonstrate the synergistic effect of conventional gradient-based optimization
+and the inference ability of LLMs. The code is released at
+https://github.com/guozix/LLM-catalyst.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Automatic Dance Video Segmentation for Understanding Choreography 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.19727v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.19727v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Koki Endo, Shuhei Tsuchida, Tsukasa Fukusato, Takeo Igarashi
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Segmenting dance video into short movements is a popular way to easily
+understand dance choreography. However, it is currently done manually and
+requires a significant amount of effort by experts. That is, even if many dance
+videos are available on social media (e.g., TikTok and YouTube), it remains
+difficult for people, especially novices, to casually watch short video
+segments to practice dance choreography. In this paper, we propose a method to
+automatically segment a dance video into each movement. Given a dance video as
+input, we first extract visual and audio features: the former is computed from
+the keypoints of the dancer in the video, and the latter is computed from the
+Mel spectrogram of the music in the video. Next, these features are passed to a
+Temporal Convolutional Network (TCN), and segmentation points are estimated by
+picking peaks of the network output. To build our training dataset, we annotate
+segmentation points to dance videos in the AIST Dance Video Database, which is
+a shared database containing original street dance videos with
+copyright-cleared dance music. The evaluation study shows that the proposed
+method (i.e., combining the visual and audio features) can estimate
+segmentation points with high accuracy. In addition, we developed an
+application to help dancers practice choreography using the proposed method.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>9 pages, 11 figures</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Streaming Video Diffusion: Online Video Editing with Diffusion Models 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.19726v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.19726v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Feng Chen, Zhen Yang, Bohan Zhuang, Qi Wu
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  We present a novel task called online video editing, which is designed to
+edit \textbf{streaming} frames while maintaining temporal consistency. Unlike
+existing offline video editing assuming all frames are pre-established and
+accessible, online video editing is tailored to real-life applications such as
+live streaming and online chat, requiring (1) fast continual step inference,
+(2) long-term temporal modeling, and (3) zero-shot video editing capability. To
+solve these issues, we propose Streaming Video Diffusion (SVDiff), which
+incorporates the compact spatial-aware temporal recurrence into off-the-shelf
+Stable Diffusion and is trained with the segment-level scheme on large-scale
+long videos. This simple yet effective setup allows us to obtain a single model
+that is capable of executing a broad range of videos and editing each streaming
+frame with temporal coherence. Our experiments indicate that our model can edit
+long, high-quality videos with remarkable results, achieving a real-time
+inference speed of 15.2 FPS at a resolution of 512x512.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Quantum Visual Feature Encoding Revisited 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.19725v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.19725v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Xuan-Bac Nguyen, Hoang-Quan Nguyen, Hugh Churchill, Samee U. Khan, Khoa Luu
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Although quantum machine learning has been introduced for a while, its
+applications in computer vision are still limited. This paper, therefore,
+revisits the quantum visual encoding strategies, the initial step in quantum
+machine learning. Investigating the root cause, we uncover that the existing
+quantum encoding design fails to ensure information preservation of the visual
+features after the encoding process, thus complicating the learning process of
+the quantum machine learning models. In particular, the problem, termed
+"Quantum Information Gap" (QIG), leads to a gap of information between
+classical and corresponding quantum features. We provide theoretical proof and
+practical demonstrations of that found and underscore the significance of QIG,
+as it directly impacts the performance of quantum machine learning algorithms.
+To tackle this challenge, we introduce a simple but efficient new loss function
+named Quantum Information Preserving (QIP) to minimize this gap, resulting in
+enhanced performance of quantum machine learning algorithms. Extensive
+experiments validate the effectiveness of our approach, showcasing superior
+performance compared to current methodologies and consistently achieving
+state-of-the-art results in quantum modeling.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Encoding and Controlling Global Semantics for Long-form Video Question
+  Answering 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.19723v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.19723v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Thong Thanh Nguyen, Zhiyuan Hu, Xiaobao Wu, Cong-Duy T Nguyen, See-Kiong Ng, Anh Tuan Luu
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Seeking answers effectively for long videos is essential to build video
+question answering (videoQA) systems. Previous methods adaptively select frames
+and regions from long videos to save computations. However, this fails to
+reason over the whole sequence of video, leading to sub-optimal performance. To
+address this problem, we introduce a state space layer (SSL) into multi-modal
+Transformer to efficiently integrate global semantics of the video, which
+mitigates the video information loss caused by frame and region selection
+modules. Our SSL includes a gating unit to enable controllability over the flow
+of global semantics into visual representations. To further enhance the
+controllability, we introduce a cross-modal compositional congruence (C^3)
+objective to encourage global semantics aligned with the question. To
+rigorously evaluate long-form videoQA capacity, we construct two new benchmarks
+Ego-QA and MAD-QA featuring videos of considerably long length, i.e. 17.5
+minutes and 1.9 hours, respectively. Extensive experiments demonstrate the
+superiority of our framework on these new as well as existing datasets.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Work in progress</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ QClusformer: A Quantum <span class="highlight-title">Transformer</span>-based Framework for Unsupervised
+  Visual Clustering 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.19722v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.19722v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Xuan-Bac Nguyen, Hoang-Quan Nguyen, Samuel Yen-Chi Chen, Samee U. Khan, Hugh Churchill, Khoa Luu
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Unsupervised vision clustering, a cornerstone in computer vision, has been
+studied for decades, yielding significant outcomes across numerous vision
+tasks. However, these algorithms involve substantial computational demands when
+confronted with vast amounts of unlabeled data. Conversely, Quantum computing
+holds promise in expediting unsupervised algorithms when handling large-scale
+databases. In this study, we introduce QClusformer, a pioneering
+Transformer-based framework leveraging Quantum machines to tackle unsupervised
+vision clustering challenges. Specifically, we design the Transformer
+architecture, including the self-attention module and transformer blocks, from
+a Quantum perspective to enable execution on Quantum hardware. In addition, we
+present QClusformer, a variant based on the Transformer architecture, tailored
+for unsupervised vision clustering tasks. By integrating these elements into an
+end-to-end framework, QClusformer consistently outperforms previous methods
+running on classical computers. Empirical evaluations across diverse
+benchmarks, including MS-Celeb-1M and DeepFashion, underscore the superior
+performance of QClusformer compared to state-of-the-art methods.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ LED: A Large-scale Real-world Paired <span class="highlight-title">Dataset</span> for Event Camera Denoising <span class="chip">CVPR 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.19718v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.19718v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Yuxing Duan, Shihan Peng, Lin Zhu, Wei Zhang, Yi Chang, Sheng Zhong, Luxin Yan
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Event camera has significant advantages in capturing dynamic scene
+information while being prone to noise interference, particularly in
+challenging conditions like low threshold and low illumination. However, most
+existing research focuses on gentle situations, hindering event camera
+applications in realistic complex scenarios. To tackle this limitation and
+advance the field, we construct a new paired real-world event denoising dataset
+(LED), including 3K sequences with 18K seconds of high-resolution (1200*680)
+event streams and showing three notable distinctions compared to others:
+diverse noise levels and scenes, larger-scale with high-resolution, and
+high-quality GT. Specifically, it contains stepped parameters and varying
+illumination with diverse scenarios. Moreover, based on the property of noise
+events inconsistency and signal events consistency, we propose a novel
+effective denoising framework(DED) using homogeneous dual events to generate
+the GT with better separating noise from the raw. Furthermore, we design a
+bio-inspired baseline leveraging Leaky-Integrate-and-Fire (LIF) neurons with
+dynamic thresholds to realize accurate denoising. The experimental results
+demonstrate that the remarkable performance of the proposed approach on
+different datasets.The dataset and code are at https://github.com/Yee-Sing/led.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted by CVPR 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Enhancing Large Vision Language Models with Self-Training on Image
+  Comprehension 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.19716v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.19716v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Yihe Deng, Pan Lu, Fan Yin, Ziniu Hu, Sheng Shen, James Zou, Kai-Wei Chang, Wei Wang
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Large vision language models (LVLMs) integrate large language models (LLMs)
+with pre-trained vision encoders, thereby activating the perception capability
+of the model to understand image inputs for different queries and conduct
+subsequent reasoning. Improving this capability requires high-quality
+vision-language data, which is costly and labor-intensive to acquire.
+Self-training approaches have been effective in single-modal settings to
+alleviate the need for labeled data by leveraging model's own generation.
+However, effective self-training remains a challenge regarding the unique
+visual perception and reasoning capability of LVLMs. To address this, we
+introduce Self-Training on Image Comprehension (STIC), which emphasizes a
+self-training approach specifically for image comprehension. First, the model
+self-constructs a preference dataset for image descriptions using unlabeled
+images. Preferred responses are generated through a step-by-step prompt, while
+dis-preferred responses are generated from either corrupted images or
+misleading prompts. To further self-improve reasoning on the extracted visual
+information, we let the model reuse a small portion of existing
+instruction-tuning data and append its self-generated image descriptions to the
+prompts. We validate the effectiveness of STIC across seven different
+benchmarks, demonstrating substantial performance gains of 4.0% on average
+while using 70% less supervised fine-tuning data than the current method.
+Further studies investigate various components of STIC and highlight its
+potential to leverage vast quantities of unlabeled images for self-training.
+Code and data are made publicly available.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>19 pages, 14 figures, 6 tables</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ HINT: Learning Complete Human Neural Representations from Limited
+  Viewpoints 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.19712v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.19712v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Alessandro Sanvito, Andrea Ramazzina, Stefanie Walz, Mario Bijelic, Felix Heide
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  No augmented application is possible without animated humanoid avatars. At
+the same time, generating human replicas from real-world monocular hand-held or
+robotic sensor setups is challenging due to the limited availability of views.
+Previous work showed the feasibility of virtual avatars but required the
+presence of 360 degree views of the targeted subject. To address this issue, we
+propose HINT, a NeRF-based algorithm able to learn a detailed and complete
+human model from limited viewing angles. We achieve this by introducing a
+symmetry prior, regularization constraints, and training cues from large human
+datasets. In particular, we introduce a sagittal plane symmetry prior to the
+appearance of the human, directly supervise the density function of the human
+model using explicit 3D body modeling, and leverage a co-learned human
+digitization network as additional supervision for the unseen angles. As a
+result, our method can reconstruct complete humans even from a few viewing
+angles, increasing performance by more than 15% PSNR compared to previous
+state-of-the-art algorithms.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Text Guided Image Editing with Automatic Concept Locating and Forgetting 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.19708v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.19708v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Jia Li, Lijie Hu, Zhixian He, Jingfeng Zhang, Tianhang Zheng, Di Wang
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  With the advancement of image-to-image diffusion models guided by text,
+significant progress has been made in image editing. However, a persistent
+challenge remains in seamlessly incorporating objects into images based on
+textual instructions, without relying on extra user-provided guidance. Text and
+images are inherently distinct modalities, bringing out difficulties in fully
+capturing the semantic intent conveyed through language and accurately
+translating that into the desired visual modifications. Therefore, text-guided
+image editing models often produce generations with residual object attributes
+that do not fully align with human expectations. To address this challenge, the
+models should comprehend the image content effectively away from a disconnect
+between the provided textual editing prompts and the actual modifications made
+to the image. In our paper, we propose a novel method called Locate and Forget
+(LaF), which effectively locates potential target concepts in the image for
+modification by comparing the syntactic trees of the target prompt and scene
+descriptions in the input image, intending to forget their existence clues in
+the generated image. Compared to the baselines, our method demonstrates its
+superiority in text-guided image editing tasks both qualitatively and
+quantitatively.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ DeMamba: AI-Generated Video Detection on Million-Scale GenVideo
+  Benchmark 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.19707v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.19707v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Haoxing Chen, Yan Hong, Zizheng Huang, Zhuoer Xu, Zhangxuan Gu, Yaohui Li, Jun Lan, Huijia Zhu, Jianfu Zhang, Weiqiang Wang, Huaxiong Li
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Recently, video generation techniques have advanced rapidly. Given the
+popularity of video content on social media platforms, these models intensify
+concerns about the spread of fake information. Therefore, there is a growing
+demand for detectors capable of distinguishing between fake AI-generated videos
+and mitigating the potential harm caused by fake information. However, the lack
+of large-scale datasets from the most advanced video generators poses a barrier
+to the development of such detectors. To address this gap, we introduce the
+first AI-generated video detection dataset, GenVideo. It features the following
+characteristics: (1) a large volume of videos, including over one million
+AI-generated and real videos collected; (2) a rich diversity of generated
+content and methodologies, covering a broad spectrum of video categories and
+generation techniques. We conducted extensive studies of the dataset and
+proposed two evaluation methods tailored for real-world-like scenarios to
+assess the detectors' performance: the cross-generator video classification
+task assesses the generalizability of trained detectors on generators; the
+degraded video classification task evaluates the robustness of detectors to
+handle videos that have degraded in quality during dissemination. Moreover, we
+introduced a plug-and-play module, named Detail Mamba (DeMamba), designed to
+enhance the detectors by identifying AI-generated videos through the analysis
+of inconsistencies in temporal and spatial dimensions. Our extensive
+experiments demonstrate DeMamba's superior generalizability and robustness on
+GenVideo compared to existing detectors. We believe that the GenVideo dataset
+and the DeMamba module will significantly advance the field of AI-generated
+video detection. Our code and dataset will be aviliable at
+\url{https://github.com/chenhaoxing/DeMamba}.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Towards a Better Evaluation of Out-of-Domain Generalization 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.19703v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.19703v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Duhun Hwang, Suhyun Kang, Moonjung Eo, Jimyeong Kim, Wonjong Rhee
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  The objective of Domain Generalization (DG) is to devise algorithms and
+models capable of achieving high performance on previously unseen test
+distributions. In the pursuit of this objective, average measure has been
+employed as the prevalent measure for evaluating models and comparing
+algorithms in the existing DG studies. Despite its significance, a
+comprehensive exploration of the average measure has been lacking and its
+suitability in approximating the true domain generalization performance has
+been questionable. In this study, we carefully investigate the limitations
+inherent in the average measure and propose worst+gap measure as a robust
+alternative. We establish theoretical grounds of the proposed measure by
+deriving two theorems starting from two different assumptions. We conduct
+extensive experimental investigations to compare the proposed worst+gap measure
+with the conventional average measure. Given the indispensable need to access
+the true DG performance for studying measures, we modify five existing datasets
+to come up with SR-CMNIST, C-Cats&Dogs, L-CIFAR10, PACS-corrupted, and
+VLCS-corrupted datasets. The experiment results unveil an inferior performance
+of the average measure in approximating the true DG performance and confirm the
+robustness of the theoretically supported worst+gap measure.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Distribution Aligned Semantics Adaption for Lifelong Person
+  Re-Identification 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.19695v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.19695v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Qizao Wang, Xuelin Qian, Bin Li, Xiangyang Xue
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  In real-world scenarios, person Re-IDentification (Re-ID) systems need to be
+adaptable to changes in space and time. Therefore, the adaptation of Re-ID
+models to new domains while preserving previously acquired knowledge is
+crucial, known as Lifelong person Re-IDentification (LReID). Advanced LReID
+methods rely on replaying exemplars from old domains and applying knowledge
+distillation in logits with old models. However, due to privacy concerns,
+retaining previous data is inappropriate. Additionally, the fine-grained and
+open-set characteristics of Re-ID limit the effectiveness of the distillation
+paradigm for accumulating knowledge. We argue that a Re-ID model trained on
+diverse and challenging pedestrian images at a large scale can acquire robust
+and general human semantic knowledge. These semantics can be readily utilized
+as shared knowledge for lifelong applications. In this paper, we identify the
+challenges and discrepancies associated with adapting a pre-trained model to
+each application domain, and introduce the Distribution Aligned Semantics
+Adaption (DASA) framework. It efficiently adjusts Batch Normalization (BN) to
+mitigate interference from data distribution discrepancy and freezes the
+pre-trained convolutional layers to preserve shared knowledge. Additionally, we
+propose the lightweight Semantics Adaption (SA) module, which effectively
+adapts learned semantics to enhance pedestrian representations. Extensive
+experiments demonstrate the remarkable superiority of our proposed framework
+over advanced LReID methods, and it exhibits significantly reduced storage
+consumption. DASA presents a novel and cost-effective perspective on
+effectively adapting pre-trained models for LReID.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Uncertainty-aware sign language video retrieval with probability
+  distribution modeling 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.19689v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.19689v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Xuan Wu, Hongxiang Li, Yuanjiang Luo, Xuxin Cheng, Xianwei Zhuang, Meng Cao, Keren Fu
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Sign language video retrieval plays a key role in facilitating information
+access for the deaf community. Despite significant advances in video-text
+retrieval, the complexity and inherent uncertainty of sign language preclude
+the direct application of these techniques. Previous methods achieve the
+mapping between sign language video and text through fine-grained modal
+alignment. However, due to the scarcity of fine-grained annotation, the
+uncertainty inherent in sign language video is underestimated, limiting the
+further development of sign language retrieval tasks. To address this
+challenge, we propose a novel Uncertainty-aware Probability Distribution
+Retrieval (UPRet), that conceptualizes the mapping process of sign language
+video and text in terms of probability distributions, explores their potential
+interrelationships, and enables flexible mappings. Experiments on three
+benchmarks demonstrate the effectiveness of our method, which achieves
+state-of-the-art results on How2Sign (59.1%), PHOENIX-2014T (72.0%), and
+CSL-Daily (78.4%).
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ DNPM: A Neural Parametric Model for the Synthesis of Facial Geometric
+  Details 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.19688v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.19688v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Haitao Cao, Baoping Cheng, Qiran Pu, Haocheng Zhang, Bin Luo, Yixiang Zhuang, Juncong Lin, Liyan Chen, Xuan Cheng
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Parametric 3D models have enabled a wide variety of computer vision and
+graphics tasks, such as modeling human faces, bodies and hands. In 3D face
+modeling, 3DMM is the most widely used parametric model, but can't generate
+fine geometric details solely from identity and expression inputs. To tackle
+this limitation, we propose a neural parametric model named DNPM for the facial
+geometric details, which utilizes deep neural network to extract latent codes
+from facial displacement maps encoding details and wrinkles. Built upon DNPM, a
+novel 3DMM named Detailed3DMM is proposed, which augments traditional 3DMMs by
+including the synthesis of facial details only from the identity and expression
+inputs. Moreover, we show that DNPM and Detailed3DMM can facilitate two
+downstream applications: speech-driven detailed 3D facial animation and 3D face
+reconstruction from a degraded image. Extensive experiments have shown the
+usefulness of DNPM and Detailed3DMM, and the progressiveness of two proposed
+applications.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Autonomous Driving with Spiking Neural Networks 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.19687v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.19687v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Rui-Jie Zhu, Ziqing Wang, Leilani Gilpin, Jason K. Eshraghian
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Autonomous driving demands an integrated approach that encompasses
+perception, prediction, and planning, all while operating under strict energy
+constraints to enhance scalability and environmental sustainability. We present
+Spiking Autonomous Driving (\name{}), the first unified Spiking Neural Network
+(SNN) to address the energy challenges faced by autonomous driving systems
+through its event-driven and energy-efficient nature. SAD is trained end-to-end
+and consists of three main modules: perception, which processes inputs from
+multi-view cameras to construct a spatiotemporal bird's eye view; prediction,
+which utilizes a novel dual-pathway with spiking neurons to forecast future
+states; and planning, which generates safe trajectories considering predicted
+occupancy, traffic rules, and ride comfort. Evaluated on the nuScenes dataset,
+SAD achieves competitive performance in perception, prediction, and planning
+tasks, while drawing upon the energy efficiency of SNNs. This work highlights
+the potential of neuromorphic computing to be applied to energy-efficient
+autonomous driving, a critical step toward sustainable and safety-critical
+automotive technology. Our code is available at
+\url{https://github.com/ridgerchu/SAD}.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ A Comprehensive <span class="highlight-title">Survey</span> on Underwater Image Enhancement Based on Deep
+  Learning 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.19684v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.19684v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Xiaofeng Cong, Yu Zhao, Jie Gui, Junming Hou, Dacheng Tao
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Underwater image enhancement (UIE) is a challenging research task in the
+field of computer vision. Although hundreds of UIE algorithms have been
+proposed, a comprehensive and systematic review is still lacking. To promote
+future research, we summarize the UIE task from multiple perspectives. First,
+the physical models, data construction processes, evaluation metrics, and loss
+functions are introduced. Second, according to the contributions brought by
+different literatures, recent proposed algorithms are discussed and classified
+from six perspectives, namely network architecture, learning strategy, learning
+stage, assistance task, domain perspective and disentanglement fusion,
+respectively. Third, considering the inconsistencies in experimental settings
+in different literatures, a comprehensive and fair comparison does not yet
+exist. To this end, we quantitatively and qualitatively evaluate
+state-of-the-art algorithms on multiple benchmark datasets. Finally, issues
+worthy of further research in the UIE task are raised. A collection of useful
+materials is available at https://github.com/YuZhao1999/UIE.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>A survey on the underwater image enhancement task</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Fully Test-Time Adaptation for Monocular 3D Object Detection 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.19682v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.19682v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Hongbin Lin, Yifan Zhang, Shuaicheng Niu, Shuguang Cui, Zhen Li
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Monocular 3D object detection (Mono 3Det) aims to identify 3D objects from a
+single RGB image. However, existing methods often assume training and test data
+follow the same distribution, which may not hold in real-world test scenarios.
+To address the out-of-distribution (OOD) problems, we explore a new adaptation
+paradigm for Mono 3Det, termed Fully Test-time Adaptation. It aims to adapt a
+well-trained model to unlabeled test data by handling potential data
+distribution shifts at test time without access to training data and test
+labels. However, applying this paradigm in Mono 3Det poses significant
+challenges due to OOD test data causing a remarkable decline in object
+detection scores. This decline conflicts with the pre-defined score thresholds
+of existing detection methods, leading to severe object omissions (i.e., rare
+positive detections and many false negatives). Consequently, the limited
+positive detection and plenty of noisy predictions cause test-time adaptation
+to fail in Mono 3Det. To handle this problem, we propose a novel Monocular
+Test-Time Adaptation (MonoTTA) method, based on two new strategies. 1)
+Reliability-driven adaptation: we empirically find that high-score objects are
+still reliable and the optimization of high-score objects can enhance
+confidence across all detections. Thus, we devise a self-adaptive strategy to
+identify reliable objects for model adaptation, which discovers potential
+objects and alleviates omissions. 2) Noise-guard adaptation: since high-score
+objects may be scarce, we develop a negative regularization term to exploit the
+numerous low-score objects via negative learning, preventing overfitting to
+noise and trivial solutions. Experimental results show that MonoTTA brings
+significant performance gains for Mono 3Det models in OOD test scenarios,
+approximately 190% gains by average on KITTI and 198% gains on nuScenes.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ View-Consistent Hierarchical 3D SegmentationUsing Ultrametric Feature
+  Fields 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.19678v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.19678v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Haodi He, Colton Stearns, Adam W. Harley, Leonidas J. Guibas
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Large-scale vision foundation models such as Segment Anything (SAM)
+demonstrate impressive performance in zero-shot image segmentation at multiple
+levels of granularity. However, these zero-shot predictions are rarely
+3D-consistent. As the camera viewpoint changes in a scene, so do the
+segmentation predictions, as well as the characterizations of ``coarse" or
+``fine" granularity. In this work, we address the challenging task of lifting
+multi-granular and view-inconsistent image segmentations into a hierarchical
+and 3D-consistent representation. We learn a novel feature field within a
+Neural Radiance Field (NeRF) representing a 3D scene, whose segmentation
+structure can be revealed at different scales by simply using different
+thresholds on feature distance. Our key idea is to learn an ultrametric feature
+space, which unlike a Euclidean space, exhibits transitivity in distance-based
+grouping, naturally leading to a hierarchical clustering. Put together, our
+method takes view-inconsistent multi-granularity 2D segmentations as input and
+produces a hierarchy of 3D-consistent segmentations as output. We evaluate our
+method and several baselines on synthetic datasets with multi-view images and
+multi-granular segmentation, showcasing improved accuracy and
+viewpoint-consistency. We additionally provide qualitative examples of our
+model's 3D hierarchical segmentations in real world scenes.\footnote{The code
+and dataset are available at:
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Knowledge-grounded Adaptation Strategy for Vision-language Models:
+  Building Unique Case-set for Screening Mammograms for Residents Training 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.19675v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.19675v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Aisha Urooj Khan, John Garrett, Tyler Bradshaw, Lonie Salkowski, Jiwoong Jason Jeong, Amara Tariq, Imon Banerjee
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  A visual-language model (VLM) pre-trained on natural images and text pairs
+poses a significant barrier when applied to medical contexts due to domain
+shift. Yet, adapting or fine-tuning these VLMs for medical use presents
+considerable hurdles, including domain misalignment, limited access to
+extensive datasets, and high-class imbalances. Hence, there is a pressing need
+for strategies to effectively adapt these VLMs to the medical domain, as such
+adaptations would prove immensely valuable in healthcare applications. In this
+study, we propose a framework designed to adeptly tailor VLMs to the medical
+domain, employing selective sampling and hard-negative mining techniques for
+enhanced performance in retrieval tasks. We validate the efficacy of our
+proposed approach by implementing it across two distinct VLMs: the in-domain
+VLM (MedCLIP) and out-of-domain VLMs (ALBEF). We assess the performance of
+these models both in their original off-the-shelf state and after undergoing
+our proposed training strategies, using two extensive datasets containing
+mammograms and their corresponding reports. Our evaluation spans zero-shot,
+few-shot, and supervised scenarios. Through our approach, we observe a notable
+enhancement in Recall@K performance for the image-text retrieval task.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ CRIS: Collaborative Refinement Integrated with Segmentation for Polyp
+  Segmentation 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.19672v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.19672v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Ankush Gajanan Arudkar, Bernard J. E. Evans
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Accurate detection of colorectal cancer and early prevention heavily rely on
+precise polyp identification during gastrointestinal colonoscopy. Due to
+limited data, many current state-of-the-art deep learning methods for polyp
+segmentation often rely on post-processing of masks to reduce noise and enhance
+results. In this study, we propose an approach that integrates mask refinement
+and binary semantic segmentation, leveraging a novel collaborative training
+strategy that surpasses current widely-used refinement strategies. We
+demonstrate the superiority of our approach through comprehensive evaluation on
+established benchmark datasets and its successful application across various
+medical image segmentation architectures.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ GaussianRoom: Improving 3D Gaussian Splatting with SDF Guidance and
+  Monocular Cues for Indoor Scene Reconstruction 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.19671v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.19671v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Haodong Xiang, Xinghui Li, Xiansong Lai, Wanting Zhang, Zhichao Liao, Kai Cheng, Xueping Liu
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Recently, 3D Gaussian Splatting(3DGS) has revolutionized neural rendering
+with its high-quality rendering and real-time speed. However, when it comes to
+indoor scenes with a significant number of textureless areas, 3DGS yields
+incomplete and noisy reconstruction results due to the poor initialization of
+the point cloud and under-constrained optimization. Inspired by the continuity
+of signed distance field (SDF), which naturally has advantages in modeling
+surfaces, we present a unified optimizing framework integrating neural SDF with
+3DGS. This framework incorporates a learnable neural SDF field to guide the
+densification and pruning of Gaussians, enabling Gaussians to accurately model
+scenes even with poor initialized point clouds. At the same time, the geometry
+represented by Gaussians improves the efficiency of the SDF field by piloting
+its point sampling. Additionally, we regularize the optimization with normal
+and edge priors to eliminate geometry ambiguity in textureless areas and
+improve the details. Extensive experiments in ScanNet and ScanNet++ show that
+our method achieves state-of-the-art performance in both surface reconstruction
+and novel view synthesis.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Texture-guided Coding for Deep Features 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.19669v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.19669v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Lei Xiong, Xin Luo, Zihao Wang, Chaofan He, Shuyuan Zhu, Bing Zeng
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  With the rapid development of machine vision technology in recent years, many
+researchers have begun to focus on feature compression that is better suited
+for machine vision tasks. The target of feature compression is deep features,
+which arise from convolution in the middle layer of a pre-trained convolutional
+neural network. However, due to the large volume of data and high level of
+abstraction of deep features, their application is primarily limited to
+machine-centric scenarios, which poses significant constraints in situations
+requiring human-computer interaction. This paper investigates features and
+textures and proposes a texture-guided feature compression strategy based on
+their characteristics. Specifically, the strategy comprises feature layers and
+texture layers. The feature layers serve the machine, including a feature
+selection module and a feature reconstruction network. With the assistance of
+texture images, they selectively compress and transmit channels relevant to
+visual tasks, reducing feature data while providing high-quality features for
+the machine. The texture layers primarily serve humans and consist of an image
+reconstruction network. This image reconstruction network leverages features
+and texture images to reconstruct preview images for humans. Our method fully
+exploits the characteristics of texture and features. It eliminates feature
+redundancy, reconstructs high-quality preview images for humans, and supports
+decision-making. The experimental results demonstrate excellent performance
+when employing our proposed method to compress the deep features.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ AutoBreach: Universal and Adaptive Jailbreaking with Efficient
+  Wordplay-Guided Optimization 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.19668v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.19668v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Jiawei Chen, Xiao Yang, Zhengwei Fang, Yu Tian, Yinpeng Dong, Zhaoxia Yin, Hang Su
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Despite the widespread application of large language models (LLMs) across
+various tasks, recent studies indicate that they are susceptible to jailbreak
+attacks, which can render their defense mechanisms ineffective. However,
+previous jailbreak research has frequently been constrained by limited
+universality, suboptimal efficiency, and a reliance on manual crafting. In
+response, we rethink the approach to jailbreaking LLMs and formally define
+three essential properties from the attacker' s perspective, which contributes
+to guiding the design of jailbreak methods. We further introduce AutoBreach, a
+novel method for jailbreaking LLMs that requires only black-box access.
+Inspired by the versatility of wordplay, AutoBreach employs a wordplay-guided
+mapping rule sampling strategy to generate a variety of universal mapping rules
+for creating adversarial prompts. This generation process leverages LLMs'
+automatic summarization and reasoning capabilities, thus alleviating the manual
+burden. To boost jailbreak success rates, we further suggest sentence
+compression and chain-of-thought-based mapping rules to correct errors and
+wordplay misinterpretations in target LLMs. Additionally, we propose a
+two-stage mapping rule optimization strategy that initially optimizes mapping
+rules before querying target LLMs to enhance the efficiency of AutoBreach.
+AutoBreach can efficiently identify security vulnerabilities across various
+LLMs, including three proprietary models: Claude-3, GPT-3.5, GPT-4 Turbo, and
+two LLMs' web platforms: Bingchat, GPT-4 Web, achieving an average success rate
+of over 80% with fewer than 10 queries
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Under review</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Video-Language Critic: Transferable Reward Functions for
+  Language-Conditioned Robotics 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.19988v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.19988v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Minttu Alakuijala, Reginald McLean, Isaac Woungang, Nariman Farsad, Samuel Kaski, Pekka Marttinen, Kai Yuan
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Natural language is often the easiest and most convenient modality for humans
+to specify tasks for robots. However, learning to ground language to behavior
+typically requires impractical amounts of diverse, language-annotated
+demonstrations collected on each target robot. In this work, we aim to separate
+the problem of what to accomplish from how to accomplish it, as the former can
+benefit from substantial amounts of external observation-only data, and only
+the latter depends on a specific robot embodiment. To this end, we propose
+Video-Language Critic, a reward model that can be trained on readily available
+cross-embodiment data using contrastive learning and a temporal ranking
+objective, and use it to score behavior traces from a separate reinforcement
+learning actor. When trained on Open X-Embodiment data, our reward model
+enables 2x more sample-efficient policy training on Meta-World tasks than a
+sparse reward only, despite a significant domain gap. Using in-domain data but
+in a challenging task generalization setting on Meta-World, we further
+demonstrate more sample-efficient training than is possible with prior
+language-conditioned reward models that are either trained with binary
+classification, use static images, or do not leverage the temporal information
+present in video data.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>10 pages in the main text, 16 pages including references and
+  supplementary materials. 4 figures and 3 tables in the main text, 1 table in
+  supplementary materials</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Research on Foundation Model for Spatial Data Intelligence: China's 2024
+  White Paper on Strategic Development of Spatial Data Intelligence 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.19730v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.19730v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Shaohua Wang, Xing Xie, Yong Li, Danhuai Guo, Zhi Cai, Yu Liu, Yang Yue, Xiao Pan, Feng Lu, Huayi Wu, Zhipeng Gui, Zhiming Ding, Bolong Zheng, Fuzheng Zhang, Tao Qin, Jingyuan Wang, Chuang Tao, Zhengchao Chen, Hao Lu, Jiayi Li, Hongyang Chen, Peng Yue, Wenhao Yu, Yao Yao, Leilei Sun, Yong Zhang, Longbiao Chen, Xiaoping Du, Xiang Li, Xueying Zhang, Kun Qin, Zhaoya Gong, Weihua Dong, Xiaofeng Meng
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  This report focuses on spatial data intelligent large models, delving into
+the principles, methods, and cutting-edge applications of these models. It
+provides an in-depth discussion on the definition, development history, current
+status, and trends of spatial data intelligent large models, as well as the
+challenges they face. The report systematically elucidates the key technologies
+of spatial data intelligent large models and their applications in urban
+environments, aerospace remote sensing, geography, transportation, and other
+scenarios. Additionally, it summarizes the latest application cases of spatial
+data intelligent large models in themes such as urban development, multimodal
+systems, remote sensing, smart transportation, and resource environments.
+Finally, the report concludes with an overview and outlook on the development
+prospects of spatial data intelligent large models.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>in Chinese language</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Comparing Quantum Annealing and Spiking Neuromorphic Computing for
+  Sampling Binary Sparse Coding QUBO Problems 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20525v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20525v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Kyle Henke, Elijah Pelofske, Garrett Kenyon, Georg Hahn
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  We consider the problem of computing a sparse binary representation of an
+image. To be precise, given an image and an overcomplete, non-orthonormal
+basis, we aim to find a sparse binary vector indicating the minimal set of
+basis vectors that when added together best reconstruct the given input. We
+formulate this problem with an $L_2$ loss on the reconstruction error, and an
+$L_0$ (or, equivalently, an $L_1$) loss on the binary vector enforcing
+sparsity. This yields a quadratic binary optimization problem (QUBO), whose
+optimal solution(s) in general is NP-hard to find. The method of unsupervised
+and unnormalized dictionary feature learning for a desired sparsity level to
+best match the data is presented. Next, we solve the sparse representation QUBO
+by implementing it both on a D-Wave quantum annealer with Pegasus chip
+connectivity via minor embedding, as well as on the Intel Loihi 2 spiking
+neuromorphic processor. On the quantum annealer, we sample from the sparse
+representation QUBO using parallel quantum annealing combined with quantum
+evolution Monte Carlo, also known as iterated reverse annealing. On Loihi 2, we
+use a stochastic winner take all network of neurons. The solutions are
+benchmarked against simulated annealing, a classical heuristic, and the optimal
+solutions are computed using CPLEX. Iterated reverse quantum annealing performs
+similarly to simulated annealing, although simulated annealing is always able
+to sample the optimal solution whereas quantum annealing was not always able
+to. The Loihi 2 solutions that are sampled are on average more sparse than the
+solutions from any of the other methods. Loihi 2 outperforms a D-Wave quantum
+annealer standard linear-schedule anneal, while iterated reverse quantum
+annealing performs much better than both unmodified linear-schedule quantum
+annealing and iterated warm starting on Loihi 2.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Deep Modeling of Non-Gaussian Aleatoric Uncertainty 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20513v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20513v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Aastha Acharya, Caleb Lee, Marissa D'Alonzo, Jared Shamwell, Nisar R. Ahmed, Rebecca Russell
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Deep learning offers promising new ways to accurately model aleatoric
+uncertainty in robotic estimation systems, particularly when the uncertainty
+distributions do not conform to traditional assumptions of being fixed and
+Gaussian. In this study, we formulate and evaluate three fundamental deep
+learning approaches for conditional probability density modeling to quantify
+non-Gaussian aleatoric uncertainty: parametric, discretized, and generative
+modeling. We systematically compare the respective strengths and weaknesses of
+these three methods on simulated non-Gaussian densities as well as on
+real-world terrain-relative navigation data. Our results show that these deep
+learning methods can accurately capture complex uncertainty patterns,
+highlighting their potential for improving the reliability and robustness of
+estimation systems.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>8 pages, 7 figures</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Physically Compatible 3D Object Modeling from a Single Image 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20510v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20510v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Minghao Guo, Bohan Wang, Pingchuan Ma, Tianyuan Zhang, Crystal Elaine Owens, Chuang Gan, Joshua B. Tenenbaum, Kaiming He, Wojciech Matusik
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  We present a computational framework that transforms single images into 3D
+physical objects. The visual geometry of a physical object in an image is
+determined by three orthogonal attributes: mechanical properties, external
+forces, and rest-shape geometry. Existing single-view 3D reconstruction methods
+often overlook this underlying composition, presuming rigidity or neglecting
+external forces. Consequently, the reconstructed objects fail to withstand
+real-world physical forces, resulting in instability or undesirable deformation
+-- diverging from their intended designs as depicted in the image. Our
+optimization framework addresses this by embedding physical compatibility into
+the reconstruction process. We explicitly decompose the three physical
+attributes and link them through static equilibrium, which serves as a hard
+constraint, ensuring that the optimized physical shapes exhibit desired
+physical behaviors. Evaluations on a dataset collected from Objaverse
+demonstrate that our framework consistently enhances the physical realism of 3D
+models over existing methods. The utility of our framework extends to practical
+applications in dynamic simulations and 3D printing, where adherence to
+physical compatibility is paramount.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ ShelfHelp: Empowering Humans to Perform Vision-Independent Manipulation
+  Tasks with a Socially Assistive Robotic Cane 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20501v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20501v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Shivendra Agrawal, Suresh Nayak, Ashutosh Naik, Bradley Hayes
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  The ability to shop independently, especially in grocery stores, is important
+for maintaining a high quality of life. This can be particularly challenging
+for people with visual impairments (PVI). Stores carry thousands of products,
+with approximately 30,000 new products introduced each year in the US market
+alone, presenting a challenge even for modern computer vision solutions.
+Through this work, we present a proof-of-concept socially assistive robotic
+system we call ShelfHelp, and propose novel technical solutions for enhancing
+instrumented canes traditionally meant for navigation tasks with additional
+capability within the domain of shopping. ShelfHelp includes a novel visual
+product locator algorithm designed for use in grocery stores and a novel
+planner that autonomously issues verbal manipulation guidance commands to guide
+the user during product retrieval. Through a human subjects study, we show the
+system's success in locating and providing effective manipulation guidance to
+retrieve desired products with novice users. We compare two autonomous verbal
+guidance modes achieving comparable performance to a human assistance baseline
+and present encouraging findings that validate our system's efficiency and
+effectiveness and through positive subjective metrics including competence,
+intelligence, and ease of use.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>8 pages, 14 figures and charts</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Slight Corruption in <span class="highlight-title">Pre-train</span>ing Data Makes Better Diffusion Models 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20494v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20494v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Hao Chen, Yujin Han, Diganta Misra, Xiang Li, Kai Hu, Difan Zou, Masashi Sugiyama, Jindong Wang, Bhiksha Raj
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Diffusion models (DMs) have shown remarkable capabilities in generating
+realistic high-quality images, audios, and videos. They benefit significantly
+from extensive pre-training on large-scale datasets, including web-crawled data
+with paired data and conditions, such as image-text and image-class pairs.
+Despite rigorous filtering, these pre-training datasets often inevitably
+contain corrupted pairs where conditions do not accurately describe the data.
+This paper presents the first comprehensive study on the impact of such
+corruption in pre-training data of DMs. We synthetically corrupt ImageNet-1K
+and CC3M to pre-train and evaluate over 50 conditional DMs. Our empirical
+findings reveal that various types of slight corruption in pre-training can
+significantly enhance the quality, diversity, and fidelity of the generated
+images across different DMs, both during pre-training and downstream adaptation
+stages. Theoretically, we consider a Gaussian mixture model and prove that
+slight corruption in the condition leads to higher entropy and a reduced
+2-Wasserstein distance to the ground truth of the data distribution generated
+by the corruptly trained DMs. Inspired by our analysis, we propose a simple
+method to improve the training of DMs on practical datasets by adding condition
+embedding perturbations (CEP). CEP significantly improves the performance of
+various DMs in both pre-training and downstream tasks. We hope that our study
+provides new insights into understanding the data and pre-training processes of
+DMs.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>50 pages, 33 figures, 4 tables</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ STHN: Deep Homography Estimation for UAV Thermal Geo-localization with
+  Satellite Imagery 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20470v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20470v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Jiuhong Xiao, Ning Zhang, Daniel Tortei, Giuseppe Loianno
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Accurate geo-localization of Unmanned Aerial Vehicles (UAVs) is crucial for a
+variety of outdoor applications including search and rescue operations, power
+line inspections, and environmental monitoring. The vulnerability of Global
+Navigation Satellite Systems (GNSS) signals to interference and spoofing
+necessitates the development of additional robust localization methods for
+autonomous navigation. Visual Geo-localization (VG), leveraging onboard cameras
+and reference satellite maps, offers a promising solution for absolute
+localization. Specifically, Thermal Geo-localization (TG), which relies on
+image-based matching between thermal imagery with satellite databases, stands
+out by utilizing infrared cameras for effective night-time localization.
+However, the efficiency and effectiveness of current TG approaches, are
+hindered by dense sampling on satellite maps and geometric noises in thermal
+query images. To overcome these challenges, in this paper, we introduce STHN, a
+novel UAV thermal geo-localization approach that employs a coarse-to-fine deep
+homography estimation method. This method attains reliable thermal
+geo-localization within a 512-meter radius of the UAV's last known location
+even with a challenging 11% overlap between satellite and thermal images,
+despite the presence of indistinct textures in thermal imagery and self-similar
+patterns in both spectra. Our research significantly enhances UAV thermal
+geo-localization performance and robustness against the impacts of geometric
+noises under low-visibility conditions in the wild. The code will be made
+publicly available.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>8 pages, 7 figures. This work has been submitted to the IEEE for
+  possible publication. Copyright may be transferred without notice, after
+  which this version may no longer be accessible</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Is Synthetic Data all We Need? Benchmarking the Robustness of Models
+  Trained with Synthetic Images <span class="chip">CVPR 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20469v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20469v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Krishnakant Singh, Thanush Navaratnam, Jannik Holmer, Simone Schaub-Meyer, Stefan Roth
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  A long-standing challenge in developing machine learning approaches has been
+the lack of high-quality labeled data. Recently, models trained with purely
+synthetic data, here termed synthetic clones, generated using large-scale
+pre-trained diffusion models have shown promising results in overcoming this
+annotation bottleneck. As these synthetic clone models progress, they are
+likely to be deployed in challenging real-world settings, yet their suitability
+remains understudied. Our work addresses this gap by providing the first
+benchmark for three classes of synthetic clone models, namely supervised,
+self-supervised, and multi-modal ones, across a range of robustness measures.
+We show that existing synthetic self-supervised and multi-modal clones are
+comparable to or outperform state-of-the-art real-image baselines for a range
+of robustness metrics - shape bias, background bias, calibration, etc. However,
+we also find that synthetic clones are much more susceptible to adversarial and
+real-world noise than models trained with real data. To address this, we find
+that combining both real and synthetic data further increases the robustness,
+and that the choice of prompt used for generating synthetic images plays an
+important part in the robustness of synthetic clones.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted at CVPR 2024 Workshop: SyntaGen-Harnessing Generative Models
+  for Synthetic Visual Datasets. Project page at
+  https://synbenchmark.github.io/SynCloneBenchmark</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ ENTIRe-ID: An Extensive and Diverse <span class="highlight-title">Dataset</span> for Person Re-Identification 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20465v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20465v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Serdar Yildiz, Ahmet Nezih Kasim
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  The growing importance of person reidentification in computer vision has
+highlighted the need for more extensive and diverse datasets. In response, we
+introduce the ENTIRe-ID dataset, an extensive collection comprising over 4.45
+million images from 37 different cameras in varied environments. This dataset
+is uniquely designed to tackle the challenges of domain variability and model
+generalization, areas where existing datasets for person re-identification have
+fallen short. The ENTIRe-ID dataset stands out for its coverage of a wide array
+of real-world scenarios, encompassing various lighting conditions, angles of
+view, and diverse human activities. This design ensures a realistic and robust
+training platform for ReID models. The ENTIRe-ID dataset is publicly available
+at https://serdaryildiz.github.io/ENTIRe-ID
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>5 pages, 2024 18th International Conference on Automatic Face and
+  Gesture Recognition (FG)</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Multi-Label Guided Soft Contrastive Learning for Efficient Earth
+  Observation <span class="highlight-title">Pretrain</span>ing 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20462v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20462v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Yi Wang, Conrad M Albrecht, Xiao Xiang Zhu
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Self-supervised pretraining on large-scale satellite data has raised great
+interest in building Earth observation (EO) foundation models. However, many
+important resources beyond pure satellite imagery, such as land-cover-land-use
+products that provide free global semantic information, as well as vision
+foundation models that hold strong knowledge of the natural world, tend to be
+overlooked. In this work, we show these free additional resources not only help
+resolve common contrastive learning bottlenecks, but also significantly boost
+the efficiency and effectiveness of EO pretraining.
+  Specifically, we first propose soft contrastive learning that optimizes
+cross-scene soft similarity based on land-cover-generated multi-label
+supervision, naturally solving the issue of multiple positive samples and too
+strict positive matching in complex scenes. Second, we explore cross-domain
+continual pretraining for both multispectral and SAR imagery, building
+efficient EO foundation models from strongest vision models such as DINOv2.
+Integrating simple weight-initialization and Siamese masking strategies into
+our soft contrastive learning framework, we demonstrate impressive continual
+pretraining performance even when the input channels and modalities are not
+aligned.
+  Without prohibitive training, we produce multispectral and SAR foundation
+models that achieve significantly better results in 9 out of 10 downstream
+tasks than most existing SOTA models. For example, our ResNet50/ViT-S achieve
+84.8/85.0 linear probing mAP scores on BigEarthNet-10\% which are better than
+most existing ViT-L models; under the same setting, our ViT-B sets a new record
+of 86.8 in multispectral, and 82.5 in SAR, the latter even better than many
+multispectral models. Dataset and models are available at
+https://github.com/zhu-xlab/softcon.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>16 pages, 9 figures</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ On Calibration of Object Detectors: Pitfalls, Evaluation and Baselines 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20459v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20459v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Selim Kuzucu, Kemal Oksuz, Jonathan Sadeghi, Puneet K. Dokania
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Reliable usage of object detectors require them to be calibrated -- a crucial
+problem that requires careful attention. Recent approaches towards this involve
+(1) designing new loss functions to obtain calibrated detectors by training
+them from scratch, and (2) post-hoc Temperature Scaling (TS) that learns to
+scale the likelihood of a trained detector to output calibrated predictions.
+These approaches are then evaluated based on a combination of Detection
+Expected Calibration Error (D-ECE) and Average Precision. In this work, via
+extensive analysis and insights, we highlight that these recent evaluation
+frameworks, evaluation metrics, and the use of TS have notable drawbacks
+leading to incorrect conclusions. As a step towards fixing these issues, we
+propose a principled evaluation framework to jointly measure calibration and
+accuracy of object detectors. We also tailor efficient and easy-to-use post-hoc
+calibration approaches such as Platt Scaling and Isotonic Regression
+specifically for object detection task. Contrary to the common notion, our
+experiments show that once designed and evaluated properly, post-hoc
+calibrators, which are extremely cheap to build and use, are much more powerful
+and effective than the recent train-time calibration methods. To illustrate,
+D-DETR with our post-hoc Isotonic Regression calibrator outperforms the recent
+train-time state-of-the-art calibration method Cal-DETR by more than 7 D-ECE on
+the COCO dataset. Additionally, we propose improved versions of the recently
+proposed Localization-aware ECE and show the efficacy of our method on these
+metrics as well. Code is available at:
+https://github.com/fiveai/detection_calibration.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>31 pages, 8 figures</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ P-MSDiff: Parallel Multi-Scale Diffusion for Remote Sensing Image
+  Segmentation 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20443v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20443v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Qi Zhang, Guohua Geng, Longquan Yan, Pengbo Zhou, Zhaodi Li, Kang Li, Qinglin Liu
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Diffusion models and multi-scale features are essential components in
+semantic segmentation tasks that deal with remote-sensing images. They
+contribute to improved segmentation boundaries and offer significant contextual
+information. U-net-like architectures are frequently employed in diffusion
+models for segmentation tasks. These architectural designs include dense skip
+connections that may pose challenges for interpreting intermediate features.
+Consequently, they might not efficiently convey semantic information throughout
+various layers of the encoder-decoder architecture. To address these
+challenges, we propose a new model for semantic segmentation known as the
+diffusion model with parallel multi-scale branches. This model consists of
+Parallel Multiscale Diffusion modules (P-MSDiff) and a Cross-Bridge Linear
+Attention mechanism (CBLA). P-MSDiff enhances the understanding of semantic
+information across multiple levels of granularity and detects repetitive
+distribution data through the integration of recursive denoising branches. It
+further facilitates the amalgamation of data by connecting relevant branches to
+the primary framework to enable concurrent denoising. Furthermore, within the
+interconnected transformer architecture, the LA module has been substituted
+with the CBLA module. This module integrates a semidefinite matrix linked to
+the query into the dot product computation of keys and values. This integration
+enables the adaptation of queries within the LA framework. This adjustment
+enhances the structure for multi-head attention computation, leading to
+enhanced network performance and CBLA is a plug-and-play module. Our model
+demonstrates superior performance based on the J1 metric on both the UAVid and
+Vaihingen Building datasets, showing improvements of 1.60% and 1.40% over
+strong baseline models, respectively.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Exploring the Practicality of Federated Learning: A <span class="highlight-title">Survey</span> Towards the
+  Communication Perspective 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20431v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20431v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Khiem Le, Nhan Luong-Ha, Manh Nguyen-Duc, Danh Le-Phuoc, Cuong Do, Kok-Seng Wong
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Federated Learning (FL) is a promising paradigm that offers significant
+advancements in privacy-preserving, decentralized machine learning by enabling
+collaborative training of models across distributed devices without
+centralizing data. However, the practical deployment of FL systems faces a
+significant bottleneck: the communication overhead caused by frequently
+exchanging large model updates between numerous devices and a central server.
+This communication inefficiency can hinder training speed, model performance,
+and the overall feasibility of real-world FL applications. In this survey, we
+investigate various strategies and advancements made in communication-efficient
+FL, highlighting their impact and potential to overcome the communication
+challenges inherent in FL systems. Specifically, we define measures for
+communication efficiency, analyze sources of communication inefficiency in FL
+systems, and provide a taxonomy and comprehensive review of state-of-the-art
+communication-efficient FL methods. Additionally, we discuss promising future
+research directions for enhancing the communication efficiency of FL systems.
+By addressing the communication bottleneck, FL can be effectively applied and
+enable scalable and practical deployment across diverse applications that
+require privacy-preserving, decentralized machine learning, such as IoT,
+healthcare, or finance.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Back to the Basics on Predicting Transfer Performance 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20420v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20420v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Levy Chaves, Eduardo Valle, Alceu Bissoto, Sandra Avila
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  In the evolving landscape of deep learning, selecting the best pre-trained
+models from a growing number of choices is a challenge. Transferability scorers
+propose alleviating this scenario, but their recent proliferation, ironically,
+poses the challenge of their own assessment. In this work, we propose both
+robust benchmark guidelines for transferability scorers, and a well-founded
+technique to combine multiple scorers, which we show consistently improves
+their results. We extensively evaluate 13 scorers from literature across 11
+datasets, comprising generalist, fine-grained, and medical imaging datasets. We
+show that few scorers match the predictive performance of the simple raw metric
+of models on ImageNet, and that all predictors suffer on medical datasets. Our
+results highlight the potential of combining different information sources for
+reliably predicting transferability across varied domains.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>15 pages, 3 figures, 2 tables</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Jailbreaking Large Language Models Against Moderation Guardrails via
+  Cipher Characters 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20413v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20413v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Haibo Jin, Andy Zhou, Joe D. Menke, Haohan Wang
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Large Language Models (LLMs) are typically harmless but remain vulnerable to
+carefully crafted prompts known as ``jailbreaks'', which can bypass protective
+measures and induce harmful behavior. Recent advancements in LLMs have
+incorporated moderation guardrails that can filter outputs, which trigger
+processing errors for certain malicious questions. Existing red-teaming
+benchmarks often neglect to include questions that trigger moderation
+guardrails, making it difficult to evaluate jailbreak effectiveness. To address
+this issue, we introduce JAMBench, a harmful behavior benchmark designed to
+trigger and evaluate moderation guardrails. JAMBench involves 160 manually
+crafted instructions covering four major risk categories at multiple severity
+levels. Furthermore, we propose a jailbreak method, JAM (Jailbreak Against
+Moderation), designed to attack moderation guardrails using jailbreak prefixes
+to bypass input-level filters and a fine-tuned shadow model functionally
+equivalent to the guardrail model to generate cipher characters to bypass
+output-level filters. Our extensive experiments on four LLMs demonstrate that
+JAM achieves higher jailbreak success ($\sim$ $\times$ 19.88) and lower
+filtered-out rates ($\sim$ $\times$ 1/6) than baselines.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>20 pages</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Can No-Reference Quality-Assessment Methods Serve as Perceptual Losses
+  for Super-Resolution? 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20392v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20392v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Egor Kashkarov, Egor Chistov, Ivan Molodetskikh, Dmitriy Vatolin
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Perceptual losses play an important role in constructing
+deep-neural-network-based methods by increasing the naturalness and realism of
+processed images and videos. Use of perceptual losses is often limited to
+LPIPS, a fullreference method. Even though deep no-reference
+image-qualityassessment methods are excellent at predicting human judgment,
+little research has examined their incorporation in loss functions. This paper
+investigates direct optimization of several video-superresolution models using
+no-reference image-quality-assessment methods as perceptual losses. Our
+experimental results show that straightforward optimization of these methods
+produce artifacts, but a special training procedure can mitigate them.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>4 pages, 3 figures. The first two authors contributed equally to this
+  work</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Gradient Inversion of Federated Diffusion Models 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20380v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20380v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Jiyue Huang, Chi Hong, Lydia Y. Chen, Stefanie Roos
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Diffusion models are becoming defector generative models, which generate
+exceptionally high-resolution image data. Training effective diffusion models
+require massive real data, which is privately owned by distributed parties.
+Each data party can collaboratively train diffusion models in a federated
+learning manner by sharing gradients instead of the raw data. In this paper, we
+study the privacy leakage risk of gradient inversion attacks. First, we design
+a two-phase fusion optimization, GIDM, to leverage the well-trained generative
+model itself as prior knowledge to constrain the inversion search (latent)
+space, followed by pixel-wise fine-tuning. GIDM is shown to be able to
+reconstruct images almost identical to the original ones. Considering a more
+privacy-preserving training scenario, we then argue that locally initialized
+private training noise $\epsilon$ and sampling step t may raise additional
+challenges for the inversion attack. To solve this, we propose a
+triple-optimization GIDM+ that coordinates the optimization of the unknown
+data, $\epsilon$ and $t$. Our extensive evaluation results demonstrate the
+vulnerability of sharing gradient for data protection of diffusion models, even
+high-resolution images can be reconstructed with high quality.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Learning 3D Robotics Perception using Inductive Priors 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20364v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20364v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Muhammad Zubair Irshad
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Recent advances in deep learning have led to a data-centric intelligence i.e.
+artificially intelligent models unlocking the potential to ingest a large
+amount of data and be really good at performing digital tasks such as
+text-to-image generation, machine-human conversation, and image recognition.
+This thesis covers the topic of learning with structured inductive bias and
+priors to design approaches and algorithms unlocking the potential of
+principle-centric intelligence. Prior knowledge (priors for short), often
+available in terms of past experience as well as assumptions of how the world
+works, helps the autonomous agent generalize better and adapt their behavior
+based on past experience. In this thesis, I demonstrate the use of prior
+knowledge in three different robotics perception problems. 1. object-centric 3D
+reconstruction, 2. vision and language for decision-making, and 3. 3D scene
+understanding. To solve these challenging problems, I propose various sources
+of prior knowledge including 1. geometry and appearance priors from synthetic
+data, 2. modularity and semantic map priors and 3. semantic, structural, and
+contextual priors. I study these priors for solving robotics 3D perception
+tasks and propose ways to efficiently encode them in deep learning models. Some
+priors are used to warm-start the network for transfer learning, others are
+used as hard constraints to restrict the action space of robotics agents. While
+classical techniques are brittle and fail to generalize to unseen scenarios and
+data-centric approaches require a large amount of labeled data, this thesis
+aims to build intelligent agents which require very-less real-world data or
+data acquired only from simulation to generalize to highly dynamic and
+cluttered environments in novel simulations (i.e. sim2sim) or real-world unseen
+environments (i.e. sim2real) for a holistic scene understanding of the 3D
+world.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Georgia Tech Ph.D. Thesis, December 2023. For more details:
+  https://zubairirshad.com/</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Multi-<span class="highlight-title">Prompt</span> Alignment for Multi-Source Unsupervised Domain Adaptation <span class="chip">NeurIPS 2023</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2209.15210v5">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2209.15210v5.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Haoran Chen, Xintong Han, Zuxuan Wu, Yu-Gang Jiang
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Most existing methods for unsupervised domain adaptation (UDA) rely on a
+shared network to extract domain-invariant features. However, when facing
+multiple source domains, optimizing such a network involves updating the
+parameters of the entire network, making it both computationally expensive and
+challenging, particularly when coupled with min-max objectives. Inspired by
+recent advances in prompt learning that adapts high-capacity models for
+downstream tasks in a computationally economic way, we introduce Multi-Prompt
+Alignment (MPA), a simple yet efficient framework for multi-source UDA. Given a
+source and target domain pair, MPA first trains an individual prompt to
+minimize the domain gap through a contrastive loss. Then, MPA denoises the
+learned prompts through an auto-encoding process and aligns them by maximizing
+the agreement of all the reconstructed prompts. Moreover, we show that the
+resulting subspace acquired from the auto-encoding process can easily
+generalize to a streamlined set of target domains, making our method more
+efficient for practical usage. Extensive experiments show that MPA achieves
+state-of-the-art results on three popular datasets with an impressive average
+accuracy of 54.1% on DomainNet.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>NeurIPS 2023 camera-ready version</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ You Need to Pay Better Attention: Rethinking the Mathematics of
+  Attention Mechanism 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2403.01643v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2403.01643v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Mehran Hosseini, Peyman Hosseini
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Scaled Dot Product Attention (SDPA) is the backbone of many modern
+deep-learning models. It is so versatile that it has been used in natural
+language, vision, and multi-modal domains with very little change compared to
+its original formulation. This paper discusses why the current formulation is
+inefficient by delving into the mathematical details of the attention
+mechanism. We propose three improvements to mitigate these inefficiencies,
+thereby, introducing three enhanced attention mechanisms: Optimised, Efficient,
+and Super Attention. Optimised and Efficient Attention have one and two matrix
+multiplications fewer per head, respectively, and 25% and 50% fewer parameters,
+respectively, than standard SDPA, but perform similarly to standard SDPA in
+both vision and natural language tasks. They can be used in all applications
+where SDPA is used while offering smaller model sizes and faster training and
+inference without noticeable loss in performance. Super Attention introduces a
+new linear transformation on the values, transforming them from the left. It
+outperforms standard SPDA on vision and natural language tasks by up to 17%
+while having one fewer matrix multiplication per head and 25% fewer parameters
+than standard SDPA. Consequently, it is also faster than standard SDPA. Super
+Attention is ideal in applications where the attention layer's context length
+is fixed, such as Vision Transformers. In addition to providing mathematical
+reasoning, we evaluate the presented attention mechanisms on several datasets
+including MNIST, CIFAR100, ImageNet, IMDB Movie Reviews, and Amazon Reviews
+datasets, as well as combined Europarl and Anki English-Spanish datasets for
+neural machine translation.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Image Deraining with Frequency-Enhanced State Space Model 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.16470v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.16470v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Shugo Yamashita, Masaaki Ikehara
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Removing rain artifacts in images is recognized as a significant issue. In
+this field, deep learning-based approaches, such as convolutional neural
+networks (CNNs) and Transformers, have succeeded. Recently, State Space Models
+(SSMs) have exhibited superior performance across various tasks in both natural
+language processing and image processing due to their ability to model
+long-range dependencies. This study introduces SSM to rain removal and proposes
+a Deraining Frequency-Enhanced State Space Model (DFSSM). To effectively remove
+rain streaks, which produce high-intensity frequency components in specific
+directions, we employ frequency domain processing concurrently with SSM.
+Additionally, we develop a novel mixed-scale gated-convolutional block, which
+uses convolutions with multiple kernel sizes to capture various scale
+degradations effectively and integrates a gating mechanism to manage the flow
+of information. Finally, experiments on synthetic and real-world rainy image
+datasets show that our method surpasses state-of-the-art methods.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Enhancing Multimodal Large Language Models with Vision Detection Models:
+  An Empirical Study 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2401.17981v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2401.17981v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Qirui Jiao, Daoyuan Chen, Yilun Huang, Yaliang Li, Ying Shen
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Despite the impressive capabilities of Multimodal Large Language Models
+(MLLMs) in integrating text and image modalities, challenges remain in
+accurately interpreting detailed visual elements. This paper presents an
+empirical study on enhancing MLLMs with state-of-the-art (SOTA) object
+detection and Optical Character Recognition (OCR) models to improve
+fine-grained understanding and reduce hallucination in responses. We
+investigate the embedding-based infusion of textual detection information, the
+impact of such infusion on MLLMs' original abilities, and the
+interchangeability of detection models. We conduct systematic and extensive
+experiments with representative models such as LLaVA-1.5, DINO, PaddleOCRv2,
+and Grounding DINO, revealing that our simple yet general approach not only
+refines MLLMs' performance in fine-grained visual tasks but also maintains
+their original strengths. Notably, the enhanced LLaVA-1.5 outperforms its
+original 7B/13B models on all 10 benchmarks, achieving an improvement of up to
+12.5% on the normalized average score. We release our codes to facilitate
+further exploration into the fine-grained multimodal capabilities of MLLMs.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>25 pages, 18 tables, 7 figures</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ CLIP-Guided Source-Free Object Detection in Aerial Images 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2401.05168v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2401.05168v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Nanqing Liu, Xun Xu, Yongyi Su, Chengxin Liu, Peiliang Gong, Heng-Chao Li
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Domain adaptation is crucial in aerial imagery, as the visual representation
+of these images can significantly vary based on factors such as geographic
+location, time, and weather conditions. Additionally, high-resolution aerial
+images often require substantial storage space and may not be readily
+accessible to the public. To address these challenges, we propose a novel
+Source-Free Object Detection (SFOD) method. Specifically, our approach begins
+with a self-training framework, which significantly enhances the performance of
+baseline methods. To alleviate the noisy labels in self-training, we utilize
+Contrastive Language-Image Pre-training (CLIP) to guide the generation of
+pseudo-labels, termed CLIP-guided Aggregation (CGA). By leveraging CLIP's
+zero-shot classification capability, we aggregate its scores with the original
+predicted bounding boxes, enabling us to obtain refined scores for the
+pseudo-labels. To validate the effectiveness of our method, we constructed two
+new datasets from different domains based on the DIOR dataset, named DIOR-C and
+DIOR-Cloudy. Experimental results demonstrate that our method outperforms other
+comparative algorithms. The code is available at
+https://github.com/Lans1ng/SFOD-RS.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted by IGARSS2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ An Empirical Study of Training State-of-the-Art LiDAR Segmentation
+  Models 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.14870v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.14870v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Jiahao Sun, Chunmei Qing, Xiang Xu, Lingdong Kong, Youquan Liu, Li Li, Chenming Zhu, Jingwei Zhang, Zeqi Xiao, Runnan Chen, Tai Wang, Wenwei Zhang, Kai Chen
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  In the rapidly evolving field of autonomous driving, precise segmentation of
+LiDAR data is crucial for understanding complex 3D environments. Traditional
+approaches often rely on disparate, standalone codebases, hindering unified
+advancements and fair benchmarking across models. To address these challenges,
+we introduce MMDetection3D-lidarseg, a comprehensive toolbox designed for the
+efficient training and evaluation of state-of-the-art LiDAR segmentation
+models. We support a wide range of segmentation models and integrate advanced
+data augmentation techniques to enhance robustness and generalization.
+Additionally, the toolbox provides support for multiple leading sparse
+convolution backends, optimizing computational efficiency and performance. By
+fostering a unified framework, MMDetection3D-lidarseg streamlines development
+and benchmarking, setting new standards for research and application. Our
+extensive benchmark experiments on widely-used datasets demonstrate the
+effectiveness of the toolbox. The codebase and trained models have been
+publicly available, promoting further research and innovation in the field of
+LiDAR segmentation for autonomous driving.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Preprint; 17 pages, 4 figures, 7 tables; Code at
+  https://github.com/open-mmlab/mmdetection3d</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ On the Limits of Multi-modal Meta-Learning with Auxiliary Task
+  Modulation Using Conditional Batch Normalization 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.18751v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.18751v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Jordi Armengol-Estapé, Vincent Michalski, Ramnath Kumar, Pierre-Luc St-Charles, Doina Precup, Samira Ebrahimi Kahou
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Few-shot learning aims to learn representations that can tackle novel tasks
+given a small number of examples. Recent studies show that cross-modal learning
+can improve representations for few-shot classification. More specifically,
+language is a rich modality that can be used to guide visual learning. In this
+work, we experiment with a multi-modal architecture for few-shot learning that
+consists of three components: a classifier, an auxiliary network, and a bridge
+network. While the classifier performs the main classification task, the
+auxiliary network learns to predict language representations from the same
+input, and the bridge network transforms high-level features of the auxiliary
+network into modulation parameters for layers of the few-shot classifier using
+conditional batch normalization. The bridge should encourage a form of
+lightweight semantic alignment between language and vision which could be
+useful for the classifier. However, after evaluating the proposed approach on
+two popular few-shot classification benchmarks we find that a) the improvements
+do not reproduce across benchmarks, and b) when they do, the improvements are
+due to the additional compute and parameters introduced by the bridge network.
+We contribute insights and recommendations for future work in multi-modal
+meta-learning, especially when using language representations.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ An Efficient and Multi-private Key Secure Aggregation for Federated
+  Learning 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2306.08970v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2306.08970v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Xue Yang, Zifeng Liu, Xiaohu Tang, Rongxing Lu, Bo Liu
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  With the emergence of privacy leaks in federated learning, secure aggregation
+protocols that mainly adopt either homomorphic encryption or threshold secret
+sharing have been widely developed for federated learning to protect the
+privacy of the local training data of each client. However, these existing
+protocols suffer from many shortcomings, such as the dependence on a trusted
+third party, the vulnerability to clients being corrupted, low efficiency, the
+trade-off between security and fault tolerance, etc. To solve these
+disadvantages, we propose an efficient and multi-private key secure aggregation
+scheme for federated learning. Specifically, we skillfully modify the variant
+ElGamal encryption technique to achieve homomorphic addition operation, which
+has two important advantages: 1) The server and each client can freely select
+public and private keys without introducing a trust third party and 2) Compared
+to the variant ElGamal encryption, the plaintext space is relatively large,
+which is more suitable for the deep model. Besides, for the high dimensional
+deep model parameter, we introduce a super-increasing sequence to compress
+multi-dimensional data into 1-D, which can greatly reduce encryption and
+decryption times as well as communication for ciphertext transmission. Detailed
+security analyses show that our proposed scheme achieves the semantic security
+of both individual local gradients and the aggregated result while achieving
+optimal robustness in tolerating both client collusion and dropped clients.
+Extensive simulations demonstrate that the accuracy of our scheme is almost the
+same as the non-private approach, while the efficiency of our scheme is much
+better than the state-of-the-art homomorphic encryption-based secure
+aggregation schemes. More importantly, the efficiency advantages of our scheme
+will become increasingly prominent as the number of model parameters increases.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Benchmarking Skeleton-based Motion Encoder Models for Clinical
+  Applications: Estimating Parkinson's Disease Severity in Walking Sequences 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.17817v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.17817v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Vida Adeli, Soroush Mehraban, Irene Ballester, Yasamin Zarghami, Andrea Sabo, Andrea Iaboni, Babak Taati
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  This study investigates the application of general human motion encoders
+trained on large-scale human motion datasets for analyzing gait patterns in PD
+patients. Although these models have learned a wealth of human biomechanical
+knowledge, their effectiveness in analyzing pathological movements, such as
+parkinsonian gait, has yet to be fully validated. We propose a comparative
+framework and evaluate six pre-trained state-of-the-art human motion encoder
+models on their ability to predict the Movement Disorder Society - Unified
+Parkinson's Disease Rating Scale (MDS-UPDRS-III) gait scores from motion
+capture data. We compare these against a traditional gait feature-based
+predictive model in a recently released large public PD dataset, including PD
+patients on and off medication. The feature-based model currently shows higher
+weighted average accuracy, precision, recall, and F1-score. Motion encoder
+models with closely comparable results demonstrate promise for scalability and
+efficiency in clinical settings. This potential is underscored by the enhanced
+performance of the encoder model upon fine-tuning on PD training set. Four of
+the six human motion models examined provided prediction scores that were
+significantly different between on- and off-medication states. This finding
+reveals the sensitivity of motion encoder models to nuanced clinical changes.
+It also underscores the necessity for continued customization of these models
+to better capture disease-specific features, thereby reducing the reliance on
+labor-intensive feature engineering. Lastly, we establish a benchmark for the
+analysis of skeleton-based motion encoder models in clinical settings. To the
+best of our knowledge, this is the first study to provide a benchmark that
+enables state-of-the-art models to be tested and compete in a clinical context.
+Codes and benchmark leaderboard are available at code.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Image Coding for Machines with Edge Information Learning Using Segment
+  Anything 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2403.04173v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2403.04173v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Takahiro Shindo, Kein Yamada, Taiju Watanabe, Hiroshi Watanabe
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Image Coding for Machines (ICM) is an image compression technique for image
+recognition.
+  This technique is essential due to the growing demand for image recognition
+AI.
+  In this paper, we propose a method for ICM that focuses on encoding and
+decoding only the edge information of object parts in an image, which we call
+SA-ICM.
+  This is an Learned Image Compression (LIC) model trained using edge
+information created by Segment Anything.
+  Our method can be used for image recognition models with various tasks.
+  SA-ICM is also robust to changes in input data, making it effective for a
+variety of use cases.
+  Additionally, our method provides benefits from a privacy point of view, as
+it removes human facial information on the encoder's side, thus protecting
+one's privacy.
+  Furthermore, this LIC model training method can be used to train Neural
+Representations for Videos (NeRV), which is a video compression model.
+  By training NeRV using edge information created by Segment Anything, it is
+possible to create a NeRV that is effective for image recognition (SA-NeRV).
+  Experimental results confirm the advantages of SA-ICM, presenting the best
+performance in image compression for image recognition.
+  We also show that SA-NeRV is superior to ordinary NeRV in video compression
+for machines.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ CaLa: Complementary Association Learning for Augmenting Composed Image
+  Retrieval <span class="chip">SIGIR 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.19149v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.19149v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Xintong Jiang, Yaxiong Wang, Mengjian Li, Yujiao Wu, Bingwen Hu, Xueming Qian
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Composed Image Retrieval (CIR) involves searching for target images based on
+an image-text pair query. While current methods treat this as a query-target
+matching problem, we argue that CIR triplets contain additional associations
+beyond this primary relation. In our paper, we identify two new relations
+within triplets, treating each triplet as a graph node. Firstly, we introduce
+the concept of text-bridged image alignment, where the query text serves as a
+bridge between the query image and the target image. We propose a hinge-based
+cross-attention mechanism to incorporate this relation into network learning.
+Secondly, we explore complementary text reasoning, considering CIR as a form of
+cross-modal retrieval where two images compose to reason about complementary
+text. To integrate these perspectives effectively, we design a twin
+attention-based compositor. By combining these complementary associations with
+the explicit query pair-target image relation, we establish a comprehensive set
+of constraints for CIR. Our framework, CaLa (Complementary Association Learning
+for Augmenting Composed Image Retrieval), leverages these insights. We evaluate
+CaLa on CIRR and FashionIQ benchmarks with multiple backbones, demonstrating
+its superiority in composed image retrieval.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>To appear at SIGIR 2024. arXiv admin note: text overlap with
+  arXiv:2309.02169</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ CaRiNG: Learning Temporal Causal Representation under Non-Invertible
+  Generation Process <span class="chip">ICML 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2401.14535v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2401.14535v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Guangyi Chen, Yifan Shen, Zhenhao Chen, Xiangchen Song, Yuewen Sun, Weiran Yao, Xiao Liu, Kun Zhang
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Identifying the underlying time-delayed latent causal processes in sequential
+data is vital for grasping temporal dynamics and making downstream reasoning.
+While some recent methods can robustly identify these latent causal variables,
+they rely on strict assumptions about the invertible generation process from
+latent variables to observed data. However, these assumptions are often hard to
+satisfy in real-world applications containing information loss. For instance,
+the visual perception process translates a 3D space into 2D images, or the
+phenomenon of persistence of vision incorporates historical data into current
+perceptions. To address this challenge, we establish an identifiability theory
+that allows for the recovery of independent latent components even when they
+come from a nonlinear and non-invertible mix. Using this theory as a
+foundation, we propose a principled approach, CaRiNG, to learn the CAusal
+RepresentatIon of Non-invertible Generative temporal data with identifiability
+guarantees. Specifically, we utilize temporal context to recover lost latent
+information and apply the conditions in our theory to guide the training
+process. Through experiments conducted on synthetic datasets, we validate that
+our CaRiNG method reliably identifies the causal process, even when the
+generation process is non-invertible. Moreover, we demonstrate that our
+approach considerably improves temporal understanding and reasoning in
+practical applications.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>To appear at ICML 2024, 24 pages</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Prismatic VLMs: Investigating the Design Space of Visually-Conditioned
+  Language Models <span class="chip">ICML 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2402.07865v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2402.07865v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Siddharth Karamcheti, Suraj Nair, Ashwin Balakrishna, Percy Liang, Thomas Kollar, Dorsa Sadigh
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Visually-conditioned language models (VLMs) have seen growing adoption in
+applications such as visual dialogue, scene understanding, and robotic task
+planning; adoption that has fueled a wealth of new models such as LLaVa,
+InstructBLIP, and PaLI-3. Despite the volume of new releases, key design
+decisions around image preprocessing, architecture, and optimization are
+under-explored, making it challenging to understand what factors account for
+model performance $-$ a challenge further complicated by the lack of objective,
+consistent evaluations. To address these gaps, we first compile a suite of
+standardized evaluations spanning visual question answering, object
+localization, and challenge sets that probe properties such as hallucination;
+evaluations that provide fine-grained insight VLM capabilities. Second, we
+rigorously investigate VLMs along key design axes, including pretrained visual
+representations and training from base vs. instruct-tuned language models,
+amongst others. We couple our analysis with three resource contributions: (1) a
+unified framework for evaluating VLMs, (2) optimized, flexible training code,
+and (3) checkpoints for all models, including a family of VLMs at the 7-13B
+scale that strictly outperform InstructBLIP and LLaVa v1.5, the
+state-of-the-art in open VLMs.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Published at ICML 2024. 22 pages, 11 figures. Training code and
+  models: https://github.com/TRI-ML/prismatic-vlms. Evaluation code:
+  https://github.com/TRI-ML/vlm-evaluation</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Beyond the Visible: A <span class="highlight-title">Survey</span> on Cross-spectral Face Recognition 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2201.04435v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2201.04435v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        David Anghelone, Cunjian Chen, Arun Ross, Antitza Dantcheva
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Cross-spectral face recognition (CFR) refers to recognizing individuals using
+face images stemming from different spectral bands, such as infrared vs.
+visible. While CFR is inherently more challenging than classical face
+recognition due to significant variation in facial appearance caused by the
+modality gap, it is useful in many scenarios including night-vision biometrics
+and detecting presentation attacks. Recent advances in convolutional neural
+networks (CNNs) have resulted in significant improvement in the performance of
+CFR systems. Given these developments, the contributions of this survey are
+three-fold. First, we provide an overview of CFR, by formalizing the CFR
+problem and presenting related applications. Secondly, we discuss the
+appropriate spectral bands for face recognition and discuss recent CFR methods,
+placing emphasis on deep neural networks. In particular we describe techniques
+that have been proposed to extract and compare heterogeneous features emerging
+from different spectral bands. We also discuss the datasets that have been used
+for evaluating CFR methods. Finally, we discuss the challenges and future lines
+of research on this topic.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Deform3DGS: Flexible Deformation for Fast Surgical Scene Reconstruction
+  with Gaussian Splatting <span class="chip">MICCAI 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.17835v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.17835v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Shuojue Yang, Qian Li, Daiyun Shen, Bingchen Gong, Qi Dou, Yueming Jin
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Tissue deformation poses a key challenge for accurate surgical scene
+reconstruction. Despite yielding high reconstruction quality, existing methods
+suffer from slow rendering speeds and long training times, limiting their
+intraoperative applicability. Motivated by recent progress in 3D Gaussian
+Splatting, an emerging technology in real-time 3D rendering, this work presents
+a novel fast reconstruction framework, termed Deform3DGS, for deformable
+tissues during endoscopic surgery. Specifically, we introduce 3D GS into
+surgical scenes by integrating a point cloud initialization to improve
+reconstruction. Furthermore, we propose a novel flexible deformation modeling
+scheme (FDM) to learn tissue deformation dynamics at the level of individual
+Gaussians. Our FDM can model the surface deformation with efficient
+representations, allowing for real-time rendering performance. More
+importantly, FDM significantly accelerates surgical scene reconstruction,
+demonstrating considerable clinical values, particularly in intraoperative
+settings where time efficiency is crucial. Experiments on DaVinci robotic
+surgery videos indicate the efficacy of our approach, showcasing superior
+reconstruction fidelity PSNR: (37.90) and rendering speed (338.8 FPS) while
+substantially reducing training time to only 1 minute/scene. Our code is
+available at https://github.com/jinlab-imvr/Deform3DGS.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Early accepted at MICCAI 2024, 10 pages, 2 figures</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Towards Real World Debiasing: A Fine-grained Analysis On Spurious
+  Correlation 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.15240v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.15240v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Zhibo Wang, Peng Kuang, Zhixuan Chu, Jingyi Wang, Kui Ren
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Spurious correlations in training data significantly hinder the
+generalization capability of machine learning models when faced with
+distribution shifts in real-world scenarios. To tackle the problem, numerous
+debias approaches have been proposed and benchmarked on datasets intentionally
+designed with severe biases. However, it remains to be asked: \textit{1. Do
+existing benchmarks really capture biases in the real world? 2. Can existing
+debias methods handle biases in the real world?} To answer the questions, we
+revisit biased distributions in existing benchmarks and real-world datasets,
+and propose a fine-grained framework for analyzing dataset bias by
+disentangling it into the magnitude and prevalence of bias. We observe and
+theoretically demonstrate that existing benchmarks poorly represent real-world
+biases. We further introduce two novel biased distributions to bridge this gap,
+forming a nuanced evaluation framework for real-world debiasing. Building upon
+these results, we evaluate existing debias methods with our evaluation
+framework. Results show that existing methods are incapable of handling
+real-world biases. Through in-depth analysis, we propose a simple yet effective
+approach that can be easily applied to existing debias methods, named Debias in
+Destruction (DiD). Empirical results demonstrate the superiority of DiD,
+improving the performance of existing methods on all types of biases within the
+proposed evaluation framework.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>9 pages of main paper, 10 pages of appendix</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ CoVR: Learning Composed Video Retrieval from Web Video Captions <span class="chip">AAAI 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2308.14746v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2308.14746v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Lucas Ventura, Antoine Yang, Cordelia Schmid, Gül Varol
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Composed Image Retrieval (CoIR) has recently gained popularity as a task that
+considers both text and image queries together, to search for relevant images
+in a database. Most CoIR approaches require manually annotated datasets,
+comprising image-text-image triplets, where the text describes a modification
+from the query image to the target image. However, manual curation of CoIR
+triplets is expensive and prevents scalability. In this work, we instead
+propose a scalable automatic dataset creation methodology that generates
+triplets given video-caption pairs, while also expanding the scope of the task
+to include composed video retrieval (CoVR). To this end, we mine paired videos
+with a similar caption from a large database, and leverage a large language
+model to generate the corresponding modification text. Applying this
+methodology to the extensive WebVid2M collection, we automatically construct
+our WebVid-CoVR dataset, resulting in 1.6 million triplets. Moreover, we
+introduce a new benchmark for CoVR with a manually annotated evaluation set,
+along with baseline results. Our experiments further demonstrate that training
+a CoVR model on our dataset effectively transfers to CoIR, leading to improved
+state-of-the-art performance in the zero-shot setup on both the CIRR and
+FashionIQ benchmarks. Our code, datasets, and models are publicly available at
+https://imagine.enpc.fr/~ventural/covr.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>AAAI 2024, Updated the results on CIRR with the correct evaluation.
+  Project page: Project page: https://imagine.enpc.fr/~ventural/covr/</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ 3D StreetUnveiler with Semantic-Aware 2DGS 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.18416v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.18416v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Jingwei Xu, Yikai Wang, Yiqun Zhao, Yanwei Fu, Shenghua Gao
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Unveiling an empty street from crowded observations captured by in-car
+cameras is crucial for autonomous driving. However, removing all temporarily
+static objects, such as stopped vehicles and standing pedestrians, presents a
+significant challenge. Unlike object-centric 3D inpainting, which relies on
+thorough observation in a small scene, street scene cases involve long
+trajectories that differ from previous 3D inpainting tasks. The camera-centric
+moving environment of captured videos further complicates the task due to the
+limited degree and time duration of object observation. To address these
+obstacles, we introduce StreetUnveiler to reconstruct an empty street.
+StreetUnveiler learns a 3D representation of the empty street from crowded
+observations. Our representation is based on the hard-label semantic 2D
+Gaussian Splatting (2DGS) for its scalability and ability to identify Gaussians
+to be removed. We inpaint rendered image after removing unwanted Gaussians to
+provide pseudo-labels and subsequently re-optimize the 2DGS. Given its temporal
+continuous movement, we divide the empty street scene into observed,
+partial-observed, and unobserved regions, which we propose to locate through a
+rendered alpha map. This decomposition helps us to minimize the regions that
+need to be inpainted. To enhance the temporal consistency of the inpainting, we
+introduce a novel time-reversal framework to inpaint frames in reverse order
+and use later frames as references for earlier frames to fully utilize the
+long-trajectory observations. Our experiments conducted on the street scene
+dataset successfully reconstructed a 3D representation of the empty street. The
+mesh representation of the empty street can be extracted for further
+applications. The project page and more visualizations can be found at:
+https://streetunveiler.github.io
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Project page: https://streetunveiler.github.io</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Training-Free Consistent Text-to-Image Generation <span class="chip">SIGGRAPH 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2402.03286v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2402.03286v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Yoad Tewel, Omri Kaduri, Rinon Gal, Yoni Kasten, Lior Wolf, Gal Chechik, Yuval Atzmon
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Text-to-image models offer a new level of creative flexibility by allowing
+users to guide the image generation process through natural language. However,
+using these models to consistently portray the same subject across diverse
+prompts remains challenging. Existing approaches fine-tune the model to teach
+it new words that describe specific user-provided subjects or add image
+conditioning to the model. These methods require lengthy per-subject
+optimization or large-scale pre-training. Moreover, they struggle to align
+generated images with text prompts and face difficulties in portraying multiple
+subjects. Here, we present ConsiStory, a training-free approach that enables
+consistent subject generation by sharing the internal activations of the
+pretrained model. We introduce a subject-driven shared attention block and
+correspondence-based feature injection to promote subject consistency between
+images. Additionally, we develop strategies to encourage layout diversity while
+maintaining subject consistency. We compare ConsiStory to a range of baselines,
+and demonstrate state-of-the-art performance on subject consistency and text
+alignment, without requiring a single optimization step. Finally, ConsiStory
+can naturally extend to multi-subject scenarios, and even enable training-free
+personalization for common objects.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted to journal track of SIGGRAPH 2024 (TOG). Project page is at
+  https://consistory-paper.github.io</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ SpATr: MoCap 3D Human Action Recognition based on Spiral Auto-encoder
+  and <span class="highlight-title">Transformer</span> Network 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2306.17574v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2306.17574v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Hamza Bouzid, Lahoucine Ballihi
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Recent technological advancements have significantly expanded the potential
+of human action recognition through harnessing the power of 3D data. This data
+provides a richer understanding of actions, including depth information that
+enables more accurate analysis of spatial and temporal characteristics. In this
+context, We study the challenge of 3D human action recognition.Unlike prior
+methods, that rely on sampling 2D depth images, skeleton points, or point
+clouds, often leading to substantial memory requirements and the ability to
+handle only short sequences, we introduce a novel approach for 3D human action
+recognition, denoted as SpATr (Spiral Auto-encoder and Transformer Network),
+specifically designed for fixed-topology mesh sequences. The SpATr model
+disentangles space and time in the mesh sequences. A lightweight auto-encoder,
+based on spiral convolutions, is employed to extract spatial geometrical
+features from each 3D mesh. These convolutions are lightweight and specifically
+designed for fix-topology mesh data. Subsequently, a temporal transformer,
+based on self-attention, captures the temporal context within the feature
+sequence. The self-attention mechanism enables long-range dependencies
+capturing and parallel processing, ensuring scalability for long sequences. The
+proposed method is evaluated on three prominent 3D human action datasets:
+Babel, MoVi, and BMLrub, from the Archive of Motion Capture As Surface Shapes
+(AMASS). Our results analysis demonstrates the competitive performance of our
+SpATr model in 3D human action recognition while maintaining efficient memory
+usage. The code and the training results will soon be made publicly available
+at https://github.com/h-bouzid/spatr.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted in CVIU</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Trustworthy Partial Label Learning with Out-of-distribution Detection 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2403.06681v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2403.06681v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Jintao Huang, Yiu-Ming Cheung
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Partial Label Learning (PLL) grapples with learning from ambiguously labelled
+data, and it has been successfully applied in fields such as image recognition.
+Nevertheless, traditional PLL methods rely on the closed-world assumption,
+which can be limiting in open-world scenarios and negatively impact model
+performance and generalization. To tackle these challenges, our study
+introduces a novel method called PLL-OOD, which is the first to incorporate
+Out-of-Distribution (OOD) detection into the PLL framework. PLL-OOD
+significantly enhances model adaptability and accuracy by merging
+self-supervised learning with partial label loss and pioneering the
+Partial-Energy (PE) score for OOD detection. This approach improves data
+feature representation and effectively disambiguates candidate labels, using a
+dynamic label confidence matrix to refine predictions. The PE score, adjusted
+by label confidence, precisely identifies OOD instances, optimizing model
+training towards in-distribution data. This innovative method markedly boosts
+PLL model robustness and performance in open-world settings. To validate our
+approach, we conducted a comprehensive comparative experiment combining the
+existing state-of-the-art PLL model with multiple OOD scores on the CIFAR-10
+and CIFAR-100 datasets with various OOD datasets. The results demonstrate that
+the proposed PLL-OOD framework is highly effective and effectiveness
+outperforms existing models, showcasing its superiority and effectiveness.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>There are many errors in the Abstract, Introduction, Related Work,
+  Proposed Method, Experiment and References of this paper, which need to be
+  further corrected to avoid misleading. Therefore, it needs to be withdrawn</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Inpaint Biases: A Pathway to Accurate and Unbiased Image Generation <span class="chip">CVPR</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.18762v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.18762v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Jiyoon Myung, Jihyeon Park
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  This paper examines the limitations of advanced text-to-image models in
+accurately rendering unconventional concepts which are scarcely represented or
+absent in their training datasets. We identify how these limitations not only
+confine the creative potential of these models but also pose risks of
+reinforcing stereotypes. To address these challenges, we introduce the Inpaint
+Biases framework, which employs user-defined masks and inpainting techniques to
+enhance the accuracy of image generation, particularly for novel or
+inaccurately rendered objects. Through experimental validation, we demonstrate
+how this framework significantly improves the fidelity of generated images to
+the user's intent, thereby expanding the models' creative capabilities and
+mitigating the risk of perpetuating biases. Our study contributes to the
+advancement of text-to-image models as unbiased, versatile tools for creative
+expression.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Paper accepted in CVPRW 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Physics-Based Rigid Body Object Tracking and Friction Filtering From
+  RGB-D Videos <span class="chip">3DV 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2309.15703v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2309.15703v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Rama Krishna Kandukuri, Michael Strecke, Joerg Stueckler
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Physics-based understanding of object interactions from sensory observations
+is an essential capability in augmented reality and robotics. It enables to
+capture the properties of a scene for simulation and control. In this paper, we
+propose a novel approach for real-to-sim which tracks rigid objects in 3D from
+RGB-D images and infers physical properties of the objects. We use a
+differentiable physics simulation as state-transition model in an Extended
+Kalman Filter which can model contact and friction for arbitrary mesh-based
+shapes and in this way estimate physically plausible trajectories. We
+demonstrate that our approach can filter position, orientation, velocities, and
+concurrently can estimate the coefficient of friction of the objects. We
+analyze our approach on various sliding scenarios in synthetic image sequences
+of single objects and colliding objects. We also demonstrate and evaluate our
+approach on a real-world dataset. We make our novel benchmark datasets publicly
+available to foster future research in this novel problem setting and
+comparison with our method.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>33 pages, 35 figures, accepted for publication at 3DV 2024, includes
+  supplementary material of the conference submission</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ $E^{3}$Gen: Efficient, Expressive and Editable Avatars Generation 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.19203v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.19203v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Weitian Zhang, Yichao Yan, Yunhui Liu, Xingdong Sheng, Xiaokang Yang
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  This paper aims to introduce 3D Gaussian for efficient, expressive, and
+editable digital avatar generation. This task faces two major challenges: (1)
+The unstructured nature of 3D Gaussian makes it incompatible with current
+generation pipelines; (2) the expressive animation of 3D Gaussian in a
+generative setting that involves training with multiple subjects remains
+unexplored. In this paper, we propose a novel avatar generation method named
+$E^3$Gen, to effectively address these challenges. First, we propose a novel
+generative UV features plane representation that encodes unstructured 3D
+Gaussian onto a structured 2D UV space defined by the SMPL-X parametric model.
+This novel representation not only preserves the representation ability of the
+original 3D Gaussian but also introduces a shared structure among subjects to
+enable generative learning of the diffusion model. To tackle the second
+challenge, we propose a part-aware deformation module to achieve robust and
+accurate full-body expressive pose control. Extensive experiments demonstrate
+that our method achieves superior performance in avatar generation and enables
+expressive full-body pose control and editing. Our project page is
+https://olivia23333.github.io/E3Gen.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Project Page: https://olivia23333.github.io/E3Gen</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ CL-MRI: <span class="highlight-title">Self-Supervised</span> Contrastive Learning to Improve the Accuracy of
+  Undersampled MRI Reconstruction 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2306.00530v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2306.00530v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Mevan Ekanayake, Zhifeng Chen, Mehrtash Harandi, Gary Egan, Zhaolin Chen
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  In Magnetic Resonance Imaging (MRI), image acquisitions are often
+undersampled in the measurement domain to accelerate the scanning process, at
+the expense of image quality. However, image quality is a crucial factor that
+influences the accuracy of clinical diagnosis; hence, high-quality image
+reconstruction from undersampled measurements has been a key area of research.
+Recently, deep learning (DL) methods have emerged as the state-of-the-art for
+MRI reconstruction, typically involving deep neural networks to transform
+undersampled MRI images into high-quality MRI images through data-driven
+processes. Nevertheless, there is clear and significant room for improvement in
+undersampled DL MRI reconstruction to meet the high standards required for
+clinical diagnosis, in terms of eliminating aliasing artifacts and reducing
+image noise. In this paper, we introduce a self-supervised pretraining
+procedure using contrastive learning to improve the accuracy of undersampled DL
+MRI reconstruction. We use contrastive learning to transform the MRI image
+representations into a latent space that maximizes mutual information among
+different undersampled representations and optimizes the information content at
+the input of the downstream DL reconstruction models. Our experiments
+demonstrate improved reconstruction accuracy across a range of acceleration
+factors and datasets, both quantitatively and qualitatively. Furthermore, our
+extended experiments validate the proposed framework's robustness under
+adversarial conditions, such as measurement noise, different k-space sampling
+patterns, and pathological abnormalities, and also prove the transfer learning
+capabilities on MRI datasets with completely different anatomy. Additionally,
+we conducted experiments to visualize and analyze the properties of the
+proposed MRI contrastive learning latent space.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ A Multi-Branched Radial Basis Network Approach to Predicting Complex
+  Chaotic Behaviours 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2404.00618v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2404.00618v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Aarush Sinha
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  In this study, we propose a multi branched network approach to predict the
+dynamics of a physics attractor characterized by intricate and chaotic
+behavior. We introduce a unique neural network architecture comprised of Radial
+Basis Function (RBF) layers combined with an attention mechanism designed to
+effectively capture nonlinear inter-dependencies inherent in the attractor's
+temporal evolution. Our results demonstrate successful prediction of the
+attractor's trajectory across 100 predictions made using a real-world dataset
+of 36,700 time-series observations encompassing approximately 28 minutes of
+activity. To further illustrate the performance of our proposed technique, we
+provide comprehensive visualizations depicting the attractor's original and
+predicted behaviors alongside quantitative measures comparing observed versus
+estimated outcomes. Overall, this work showcases the potential of advanced
+machine learning algorithms in elucidating hidden structures in complex
+physical systems while offering practical applications in various domains
+requiring accurate short-term forecasting capabilities.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>9 pages, 6 figures</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Defining Neural Network Architecture through Polytope Structures of
+  <span class="highlight-title">Dataset</span> 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2402.02407v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2402.02407v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Sangmin Lee, Abbas Mammadov, Jong Chul Ye
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Current theoretical and empirical research in neural networks suggests that
+complex datasets require large network architectures for thorough
+classification, yet the precise nature of this relationship remains unclear.
+This paper tackles this issue by defining upper and lower bounds for neural
+network widths, which are informed by the polytope structure of the dataset in
+question. We also delve into the application of these principles to simplicial
+complexes and specific manifold shapes, explaining how the requirement for
+network width varies in accordance with the geometric complexity of the
+dataset. Moreover, we develop an algorithm to investigate a converse situation
+where the polytope structure of a dataset can be inferred from its
+corresponding trained neural networks. Through our algorithm, it is established
+that popular datasets such as MNIST, Fashion-MNIST, and CIFAR10 can be
+efficiently encapsulated using no more than two polytopes with a small number
+of faces.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Data-Efficient 3D Visual Grounding via Order-Aware Referring 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2403.16539v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2403.16539v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Tung-Yu Wu, Sheng-Yu Huang, Yu-Chiang Frank Wang
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  3D visual grounding aims to identify the target object within a 3D point
+cloud scene referred to by a natural language description. Previous works
+usually require significant data relating to point color and their descriptions
+to exploit the corresponding complicated verbo-visual relations. In our work,
+we introduce Vigor, a novel Data-Efficient 3D Visual Grounding framework via
+Order-aware Referring. Vigor leverages LLM to produce a desirable referential
+order from the input description for 3D visual grounding. With the proposed
+stacked object-referring blocks, the predicted anchor objects in the above
+order allow one to locate the target object progressively without supervision
+on the identities of anchor objects or exact relations between anchor/target
+objects. In addition, we present an order-aware warm-up training strategy,
+which augments referential orders for pre-training the visual grounding
+framework. This allows us to better capture the complex verbo-visual relations
+and benefit the desirable data-efficient learning scheme. Experimental results
+on the NR3D and ScanRefer datasets demonstrate our superiority in low-resource
+scenarios. In particular, Vigor surpasses current state-of-the-art frameworks
+by 9.3% and 7.6% grounding accuracy under 1% data and 10% data settings on the
+NR3D dataset, respectively.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Elastic Feature Consolidation for Cold Start Exemplar-Free Incremental
+  Learning <span class="chip">ICLR 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2402.03917v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2402.03917v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Simone Magistri, Tomaso Trinci, Albin Soutif-Cormerais, Joost van de Weijer, Andrew D. Bagdanov
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Exemplar-Free Class Incremental Learning (EFCIL) aims to learn from a
+sequence of tasks without having access to previous task data. In this paper,
+we consider the challenging Cold Start scenario in which insufficient data is
+available in the first task to learn a high-quality backbone. This is
+especially challenging for EFCIL since it requires high plasticity, which
+results in feature drift which is difficult to compensate for in the
+exemplar-free setting. To address this problem, we propose a simple and
+effective approach that consolidates feature representations by regularizing
+drift in directions highly relevant to previous tasks and employs prototypes to
+reduce task-recency bias. Our method, called Elastic Feature Consolidation
+(EFC), exploits a tractable second-order approximation of feature drift based
+on an Empirical Feature Matrix (EFM). The EFM induces a pseudo-metric in
+feature space which we use to regularize feature drift in important directions
+and to update Gaussian prototypes used in a novel asymmetric cross entropy loss
+which effectively balances prototype rehearsal with data from new tasks.
+Experimental results on CIFAR-100, Tiny-ImageNet, ImageNet-Subset and
+ImageNet-1K demonstrate that Elastic Feature Consolidation is better able to
+learn new tasks by maintaining model plasticity and significantly outperform
+the state-of-the-art.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted at Twelfth International Conference on Learning
+  Representations (ICLR 2024)</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Benchmarking and Improving Detail Image Caption 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.19092v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.19092v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Hongyuan Dong, Jiawen Li, Bohong Wu, Jiacong Wang, Yuan Zhang, Haoyuan Guo
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Image captioning has long been regarded as a fundamental task in visual
+understanding. Recently, however, few large vision-language model (LVLM)
+research discusses model's image captioning performance because of the outdated
+short-caption benchmarks and unreliable evaluation metrics. In this work, we
+propose to benchmark detail image caption task by curating high-quality
+evaluation datasets annotated by human experts, GPT-4V and Gemini-1.5-Pro. We
+also design a more reliable caption evaluation metric called CAPTURE (CAPtion
+evaluation by exTracting and coUpling coRE information). CAPTURE extracts
+visual elements, e.g., objects, attributes and relations from captions, and
+then matches these elements through three stages, achieving the highest
+consistency with expert judgements over other rule-based or model-based caption
+metrics. The proposed benchmark and metric provide reliable evaluation for
+LVLM's detailed image captioning ability. Guided by this evaluation, we further
+explore to unleash LVLM's detail caption capabilities by synthesizing
+high-quality data through a five-stage data construction pipeline. Our pipeline
+only uses a given LVLM itself and other open-source tools, without any human or
+GPT-4V annotation in the loop. Experiments show that the proposed data
+construction strategy significantly improves model-generated detail caption
+data quality for LVLMs with leading performance, and the data quality can be
+further improved in a self-looping paradigm. All code and dataset will be
+publicly available at https://github.com/foundation-multimodal-models/CAPTURE.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ FairCLIP: Social Bias Elimination based on Attribute Prototype Learning
+  and Representation Neutralization 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2210.14562v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2210.14562v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Junyang Wang, Yi Zhang, Jitao Sang
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  The Vision-Language Pre-training (VLP) models like CLIP have gained
+popularity in recent years. However, many works found that the social biases
+hidden in CLIP easily manifest in downstream tasks, especially in image
+retrieval, which can have harmful effects on human society. In this work, we
+propose FairCLIP to eliminate the social bias in CLIP-based image retrieval
+without damaging the retrieval performance achieving the compatibility between
+the debiasing effect and the retrieval performance. FairCLIP is divided into
+two steps: Attribute Prototype Learning (APL) and Representation Neutralization
+(RN). In the first step, we extract the concepts needed for debiasing in CLIP.
+We use the query with learnable word vector prefixes as the extraction
+structure. In the second step, we first divide the attributes into target and
+bias attributes. By analysis, we find that both attributes have an impact on
+the bias. Therefore, we try to eliminate the bias by using Re-Representation
+Matrix (RRM) to achieve the neutralization of the representation. We compare
+the debiasing effect and retrieval performance with other methods, and
+experiments demonstrate that FairCLIP can achieve the best compatibility.
+Although FairCLIP is used to eliminate bias in image retrieval, it achieves the
+neutralization of the representation which is common to all CLIP downstream
+tasks. This means that FairCLIP can be applied as a general debiasing method
+for other fairness issues related to CLIP.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Diffusion Model Patching via Mixture-of-<span class="highlight-title">Prompt</span>s 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.17825v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.17825v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Seokil Ham, Sangmin Woo, Jin-Young Kim, Hyojun Go, Byeongjun Park, Changick Kim
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  We present Diffusion Model Patching (DMP), a simple method to boost the
+performance of pre-trained diffusion models that have already reached
+convergence, with a negligible increase in parameters. DMP inserts a small,
+learnable set of prompts into the model's input space while keeping the
+original model frozen. The effectiveness of DMP is not merely due to the
+addition of parameters but stems from its dynamic gating mechanism, which
+selects and combines a subset of learnable prompts at every step of the
+generative process (e.g., reverse denoising steps). This strategy, which we
+term "mixture-of-prompts", enables the model to draw on the distinct expertise
+of each prompt, essentially "patching" the model's functionality at every step
+with minimal yet specialized parameters. Uniquely, DMP enhances the model by
+further training on the same dataset on which it was originally trained, even
+in a scenario where significant improvements are typically not expected due to
+model convergence. Experiments show that DMP significantly enhances the
+converged FID of DiT-L/2 on FFHQ 256x256 by 10.38%, achieved with only a 1.43%
+parameter increase and 50K additional training iterations.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Project page: https://sangminwoo.github.io/DMP/</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ CurbNet: Curb Detection Framework Based on LiDAR Point Cloud
+  Segmentation 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2403.16794v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2403.16794v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Guoyang Zhao, Fulong Ma, Weiqing Qi, Yuxuan Liu, Ming Liu
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Curb detection is a crucial function in intelligent driving, essential for
+determining drivable areas on the road. However, the complexity of road
+environments makes curb detection challenging. This paper introduces CurbNet, a
+novel framework for curb detection utilizing point cloud segmentation. To
+address the lack of comprehensive curb datasets with 3D annotations, we have
+developed the 3D-Curb dataset based on SemanticKITTI, currently the largest and
+most diverse collection of curb point clouds. Recognizing that the primary
+characteristic of curbs is height variation, our approach leverages spatially
+rich 3D point clouds for training. To tackle the challenges posed by the uneven
+distribution of curb features on the xy-plane and their dependence on
+high-frequency features along the z-axis, we introduce the Multi-Scale and
+Channel Attention (MSCA) module, a customized solution designed to optimize
+detection performance. Additionally, we propose an adaptive weighted loss
+function group specifically formulated to counteract the imbalance in the
+distribution of curb point clouds relative to other categories. Extensive
+experiments conducted on 2 major datasets demonstrate that our method surpasses
+existing benchmarks set by leading curb detection and point cloud segmentation
+models. Through the post-processing refinement of the detection results, we
+have significantly reduced noise in curb detection, thereby improving precision
+by 4.5 points. Similarly, our tolerance experiments also achieved
+state-of-the-art results. Furthermore, real-world experiments and dataset
+analyses mutually validate each other, reinforcing CurbNet's superior detection
+capability and robust generalizability. The project website is available at:
+https://github.com/guoyangzhao/CurbNet/.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ SimAC: A Simple Anti-Customization Method for Protecting Face Privacy
+  against Text-to-Image Synthesis of Diffusion Models <span class="chip">CVPR2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2312.07865v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2312.07865v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Feifei Wang, Zhentao Tan, Tianyi Wei, Yue Wu, Qidong Huang
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Despite the success of diffusion-based customization methods on visual
+content creation, increasing concerns have been raised about such techniques
+from both privacy and political perspectives. To tackle this issue, several
+anti-customization methods have been proposed in very recent months,
+predominantly grounded in adversarial attacks. Unfortunately, most of these
+methods adopt straightforward designs, such as end-to-end optimization with a
+focus on adversarially maximizing the original training loss, thereby
+neglecting nuanced internal properties intrinsic to the diffusion model, and
+even leading to ineffective optimization in some diffusion time steps.In this
+paper, we strive to bridge this gap by undertaking a comprehensive exploration
+of these inherent properties, to boost the performance of current
+anti-customization approaches. Two aspects of properties are investigated: 1)
+We examine the relationship between time step selection and the model's
+perception in the frequency domain of images and find that lower time steps can
+give much more contributions to adversarial noises. This inspires us to propose
+an adaptive greedy search for optimal time steps that seamlessly integrates
+with existing anti-customization methods. 2) We scrutinize the roles of
+features at different layers during denoising and devise a sophisticated
+feature-based optimization framework for anti-customization.Experiments on
+facial benchmarks demonstrate that our approach significantly increases
+identity disruption, thereby protecting user privacy and copyright. Our code is
+available at: https://github.com/somuchtome/SimAC.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted by CVPR2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ SEP: Self-Enhanced <span class="highlight-title">Prompt</span> Tuning for Visual-Language Model 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.15549v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.15549v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Hantao Yao, Rui Zhang, Lu Yu, Changsheng Xu
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Prompt tuning based on Context Optimization (CoOp) effectively adapts
+visual-language models (VLMs) to downstream tasks by inferring additional
+learnable prompt tokens. However, these tokens are less discriminative as they
+are independent of the pre-trained tokens and fail to capture input-specific
+knowledge, such as class-aware textual or instance-aware visual knowledge.
+Leveraging the discriminative and generalization capabilities inherent in
+pre-trained tokens, we introduce a novel approach named Self-Enhanced Prompt
+Tuning (SEP). The core principle of SEP involves adapting the learnable prompt
+tokens at each encoder layer from the corresponding self-pretrained tokens,
+thereby explicitly incorporating discriminative prior knowledge to enhance both
+textual-level and visual-level embeddings. Furthermore, SEP's self-enhanced
+tokens not only boost discrimination but also mitigate domain shifts in unseen
+domains, enhancing generalization. In practice, SEP selects several
+representative tokens from all pre-trained tokens for each input data at every
+layer of the text/visual encoders. Subsequently, a Token Fusion Module (TFM) is
+introduced to generate a self-enhanced token by merging these representative
+tokens with the learnable tokens using a cross-attention mechanism. This
+self-enhanced token is then concatenated with all pre-trained tokens, serving
+as input for subsequent encoder layers to produce the relevant embeddings.
+Comprehensive evaluations across various benchmarks and tasks confirm SEP's
+efficacy in prompt tuning. Code: \href{Code}{https://github.com/htyao89/SEP}.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Xmodel-VLM: A Simple Baseline for Multimodal Vision Language Model 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.09215v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.09215v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Wanting Xu, Yang Liu, Langping He, Xucheng Huang, Ling Jiang
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  We introduce Xmodel-VLM, a cutting-edge multimodal vision language model. It
+is designed for efficient deployment on consumer GPU servers. Our work directly
+confronts a pivotal industry issue by grappling with the prohibitive service
+costs that hinder the broad adoption of large-scale multimodal systems. Through
+rigorous training, we have developed a 1B-scale language model from the ground
+up, employing the LLaVA paradigm for modal alignment. The result, which we call
+Xmodel-VLM, is a lightweight yet powerful multimodal vision language model.
+Extensive testing across numerous classic multimodal benchmarks has revealed
+that despite its smaller size and faster execution, Xmodel-VLM delivers
+performance comparable to that of larger models. Our model checkpoints and code
+are publicly available on GitHub at https://github.com/XiaoduoAILab/XmodelVLM.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ OTMatch: Improving Semi-Supervised Learning with Optimal Transport <span class="chip">ICML 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2310.17455v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2310.17455v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Zhiquan Tan, Kaipeng Zheng, Weiran Huang
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Semi-supervised learning has made remarkable strides by effectively utilizing
+a limited amount of labeled data while capitalizing on the abundant information
+present in unlabeled data. However, current algorithms often prioritize
+aligning image predictions with specific classes generated through
+self-training techniques, thereby neglecting the inherent relationships that
+exist within these classes. In this paper, we present a new approach called
+OTMatch, which leverages semantic relationships among classes by employing an
+optimal transport loss function to match distributions. We conduct experiments
+on many standard vision and language datasets. The empirical results show
+improvements in our method above baseline, this demonstrates the effectiveness
+and superiority of our approach in harnessing semantic relationships to enhance
+learning performance in a semi-supervised setting.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted at ICML 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Auto-selected Knowledge Adapters for Lifelong Person Re-identification 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.19005v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.19005v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Xuelin Qian, Ruiqi Wu, Gong Cheng, Junwei Han
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Lifelong Person Re-Identification (LReID) extends traditional ReID by
+requiring systems to continually learn from non-overlapping datasets across
+different times and locations, adapting to new identities while preserving
+knowledge of previous ones. Existing approaches, either rehearsal-free or
+rehearsal-based, still suffer from the problem of catastrophic forgetting since
+they try to cram diverse knowledge into one fixed model. To overcome this
+limitation, we introduce a novel framework AdalReID, that adopts knowledge
+adapters and a parameter-free auto-selection mechanism for lifelong learning.
+Concretely, we incrementally build distinct adapters to learn domain-specific
+knowledge at each step, which can effectively learn and preserve knowledge
+across different datasets. Meanwhile, the proposed auto-selection strategy
+adaptively calculates the knowledge similarity between the input set and the
+adapters. On the one hand, the appropriate adapters are selected for the inputs
+to process ReID, and on the other hand, the knowledge interaction and fusion
+between adapters are enhanced to improve the generalization ability of the
+model. Extensive experiments are conducted to demonstrate the superiority of
+our AdalReID, which significantly outperforms SOTAs by about 10$\sim$20\% mAP
+on both seen and unseen domains.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Multi-View Spectrogram <span class="highlight-title">Transformer</span> for Respiratory Sound Classification <span class="chip">ICASSP 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2311.09655v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2311.09655v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Wentao He, Yuchen Yan, Jianfeng Ren, Ruibin Bai, Xudong Jiang
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Deep neural networks have been applied to audio spectrograms for respiratory
+sound classification. Existing models often treat the spectrogram as a
+synthetic image while overlooking its physical characteristics. In this paper,
+a Multi-View Spectrogram Transformer (MVST) is proposed to embed different
+views of time-frequency characteristics into the vision transformer.
+Specifically, the proposed MVST splits the mel-spectrogram into different sized
+patches, representing the multi-view acoustic elements of a respiratory sound.
+These patches and positional embeddings are then fed into transformer encoders
+to extract the attentional information among patches through a self-attention
+mechanism. Finally, a gated fusion scheme is designed to automatically weigh
+the multi-view features to highlight the best one in a specific scenario.
+Experimental results on the ICBHI dataset demonstrate that the proposed MVST
+significantly outperforms state-of-the-art methods for classifying respiratory
+sounds.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>The paper was published at ICASSP 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Breaking the False Sense of Security in Backdoor Defense through
+  Re-Activation Attack 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.16134v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.16134v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Mingli Zhu, Siyuan Liang, Baoyuan Wu
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Deep neural networks face persistent challenges in defending against backdoor
+attacks, leading to an ongoing battle between attacks and defenses. While
+existing backdoor defense strategies have shown promising performance on
+reducing attack success rates, can we confidently claim that the backdoor
+threat has truly been eliminated from the model? To address it, we
+re-investigate the characteristics of the backdoored models after defense
+(denoted as defense models). Surprisingly, we find that the original backdoors
+still exist in defense models derived from existing post-training defense
+strategies, and the backdoor existence is measured by a novel metric called
+backdoor existence coefficient. It implies that the backdoors just lie dormant
+rather than being eliminated. To further verify this finding, we empirically
+show that these dormant backdoors can be easily re-activated during inference,
+by manipulating the original trigger with well-designed tiny perturbation using
+universal adversarial attack. More practically, we extend our backdoor
+reactivation to black-box scenario, where the defense model can only be queried
+by the adversary during inference, and develop two effective methods, i.e.,
+query-based and transfer-based backdoor re-activation attacks. The
+effectiveness of the proposed methods are verified on both image classification
+and multimodal contrastive learning (i.e., CLIP) tasks. In conclusion, this
+work uncovers a critical vulnerability that has never been explored in existing
+defense strategies, emphasizing the urgency of designing more robust and
+advanced backdoor defense mechanisms in the future.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ The RoboDrive Challenge: Drive Anytime Anywhere in Any Condition <span class="chip">ICRA 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.08816v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.08816v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Lingdong Kong, Shaoyuan Xie, Hanjiang Hu, Yaru Niu, Wei Tsang Ooi, Benoit R. Cottereau, Lai Xing Ng, Yuexin Ma, Wenwei Zhang, Liang Pan, Kai Chen, Ziwei Liu, Weichao Qiu, Wei Zhang, Xu Cao, Hao Lu, Ying-Cong Chen, Caixin Kang, Xinning Zhou, Chengyang Ying, Wentao Shang, Xingxing Wei, Yinpeng Dong, Bo Yang, Shengyin Jiang, Zeliang Ma, Dengyi Ji, Haiwen Li, Xingliang Huang, Yu Tian, Genghua Kou, Fan Jia, Yingfei Liu, Tiancai Wang, Ying Li, Xiaoshuai Hao, Yifan Yang, Hui Zhang, Mengchuan Wei, Yi Zhou, Haimei Zhao, Jing Zhang, Jinke Li, Xiao He, Xiaoqiang Cheng, Bingyang Zhang, Lirong Zhao, Dianlei Ding, Fangsheng Liu, Yixiang Yan, Hongming Wang, Nanfei Ye, Lun Luo, Yubo Tian, Yiwei Zuo, Zhe Cao, Yi Ren, Yunfan Li, Wenjie Liu, Xun Wu, Yifan Mao, Ming Li, Jian Liu, Jiayang Liu, Zihan Qin, Cunxi Chu, Jialei Xu, Wenbo Zhao, Junjun Jiang, Xianming Liu, Ziyan Wang, Chiwei Li, Shilong Li, Chendong Yuan, Songyue Yang, Wentao Liu, Peng Chen, Bin Zhou, Yubo Wang, Chi Zhang, Jianhang Sun, Hai Chen, Xiao Yang, Lizhong Wang, Dongyi Fu, Yongchun Lin, Huitong Yang, Haoang Li, Yadan Luo, Xianjing Cheng, Yong Xu
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  In the realm of autonomous driving, robust perception under
+out-of-distribution conditions is paramount for the safe deployment of
+vehicles. Challenges such as adverse weather, sensor malfunctions, and
+environmental unpredictability can severely impact the performance of
+autonomous systems. The 2024 RoboDrive Challenge was crafted to propel the
+development of driving perception technologies that can withstand and adapt to
+these real-world variabilities. Focusing on four pivotal tasks -- BEV
+detection, map segmentation, semantic occupancy prediction, and multi-view
+depth estimation -- the competition laid down a gauntlet to innovate and
+enhance system resilience against typical and atypical disturbances. This
+year's challenge consisted of five distinct tracks and attracted 140 registered
+teams from 93 institutes across 11 countries, resulting in nearly one thousand
+submissions evaluated through our servers. The competition culminated in 15
+top-performing solutions, which introduced a range of innovative approaches
+including advanced data augmentation, multi-sensor fusion, self-supervised
+learning for error correction, and new algorithmic strategies to enhance sensor
+robustness. These contributions significantly advanced the state of the art,
+particularly in handling sensor inconsistencies and environmental variability.
+Participants, through collaborative efforts, pushed the boundaries of current
+technologies, showcasing their potential in real-world scenarios. Extensive
+evaluations and analyses provided insights into the effectiveness of these
+solutions, highlighting key trends and successful strategies for improving the
+resilience of driving perception systems. This challenge has set a new
+benchmark in the field, providing a rich repository of techniques expected to
+guide future research in this field.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>ICRA 2024; 32 pages, 24 figures, 5 tables; Code at
+  https://robodrive-24.github.io/</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ SegICL: A Multimodal In-context Learning Framework for Enhanced
+  Segmentation in Medical Imaging 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2403.16578v4">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2403.16578v4.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Lingdong Shen, Fangxin Shang, Xiaoshuang Huang, Yehui Yang, Haifeng Huang, Shiming Xiang
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  In the field of medical image segmentation, tackling Out-of-Distribution
+(OOD) segmentation tasks in a cost-effective manner remains a significant
+challenge. Universal segmentation models is a solution, which aim to generalize
+across the diverse modality of medical images, yet their effectiveness often
+diminishes when applied to OOD data modalities and tasks, requiring intricate
+fine-tuning of model for optimal performance. Few-shot learning segmentation
+methods are typically designed for specific modalities of data and cannot be
+directly transferred for use with another modality. Therefore, we introduce
+SegICL, a novel approach leveraging In-Context Learning (ICL) for image
+segmentation. Unlike existing methods, SegICL has the capability to employ
+text-guided segmentation and conduct in-context learning with a small set of
+image-mask pairs, eliminating the need for training the model from scratch or
+fine-tuning for OOD tasks (including OOD modality and dataset). Extensive
+experimental demonstrates a positive correlation between the number of shots
+and segmentation performance on OOD tasks. The performance of segmentation when
+provided thre-shots is approximately 1.5 times better than the performance in a
+zero-shot setting. This indicates that SegICL effectively address new
+segmentation tasks based on contextual information. Additionally, SegICL also
+exhibits comparable performance to mainstream models on OOD and in-distribution
+tasks. Our code will be released after paper review.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Shadows Don't Lie and Lines Can't Bend! Generative Models don't know
+  Projective Geometry...for now 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2311.17138v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2311.17138v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Ayush Sarkar, Hanlin Mai, Amitabh Mahapatra, Svetlana Lazebnik, D. A. Forsyth, Anand Bhattad
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Generative models can produce impressively realistic images. This paper
+demonstrates that generated images have geometric features different from those
+of real images. We build a set of collections of generated images, prequalified
+to fool simple, signal-based classifiers into believing they are real. We then
+show that prequalified generated images can be identified reliably by
+classifiers that only look at geometric properties. We use three such
+classifiers. All three classifiers are denied access to image pixels, and look
+only at derived geometric features. The first classifier looks at the
+perspective field of the image, the second looks at lines detected in the
+image, and the third looks at relations between detected objects and shadows.
+Our procedure detects generated images more reliably than SOTA local signal
+based detectors, for images from a number of distinct generators. Saliency maps
+suggest that the classifiers can identify geometric problems reliably. We
+conclude that current generators cannot reliably reproduce geometric properties
+of real images.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Project Page: https://projective-geometry.github.io | First three
+  authors contributed equally</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ A Joint Study of Phrase Grounding and Task Performance in Vision and
+  Language Models 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2309.02691v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2309.02691v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Noriyuki Kojima, Hadar Averbuch-Elor, Yoav Artzi
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Key to tasks that require reasoning about natural language in visual contexts
+is grounding words and phrases to image regions. However, observing this
+grounding in contemporary models is complex, even if it is generally expected
+to take place if the task is addressed in a way that is conductive to
+generalization. We propose a framework to jointly study task performance and
+phrase grounding, and propose three benchmarks to study the relation between
+the two. Our results show that contemporary models demonstrate inconsistency
+between their ability to ground phrases and solve tasks. We show how this can
+be addressed through brute-force training on ground phrasing annotations, and
+analyze the dynamics it creates. Code and at available at
+https://github.com/lil-lab/phrase_grounding.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>This was published in TMLR in 2024, on January 24th</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ GUARD: Role-playing to Generate Natural-language Jailbreakings to Test
+  Guideline Adherence of Large Language Models 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2402.03299v4">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2402.03299v4.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Haibo Jin, Ruoxi Chen, Andy Zhou, Yang Zhang, Haohan Wang
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  The discovery of "jailbreaks" to bypass safety filters of Large Language
+Models (LLMs) and harmful responses have encouraged the community to implement
+safety measures. One major safety measure is to proactively test the LLMs with
+jailbreaks prior to the release. Therefore, such testing will require a method
+that can generate jailbreaks massively and efficiently. In this paper, we
+follow a novel yet intuitive strategy to generate jailbreaks in the style of
+the human generation. We propose a role-playing system that assigns four
+different roles to the user LLMs to collaborate on new jailbreaks. Furthermore,
+we collect existing jailbreaks and split them into different independent
+characteristics using clustering frequency and semantic patterns sentence by
+sentence. We organize these characteristics into a knowledge graph, making them
+more accessible and easier to retrieve. Our system of different roles will
+leverage this knowledge graph to generate new jailbreaks, which have proved
+effective in inducing LLMs to generate unethical or guideline-violating
+responses. In addition, we also pioneer a setting in our system that will
+automatically follow the government-issued guidelines to generate jailbreaks to
+test whether LLMs follow the guidelines accordingly. We refer to our system as
+GUARD (Guideline Upholding through Adaptive Role-play Diagnostics). We have
+empirically validated the effectiveness of GUARD on three cutting-edge
+open-sourced LLMs (Vicuna-13B, LongChat-7B, and Llama-2-7B), as well as a
+widely-utilized commercial LLM (ChatGPT). Moreover, our work extends to the
+realm of vision language models (MiniGPT-v2 and Gemini Vision Pro), showcasing
+GUARD's versatility and contributing valuable insights for the development of
+safer, more reliable LLM-based applications across diverse modalities.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>28 papges</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Simulator-Free Visual Domain Randomization via Video Games 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2402.01335v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2402.01335v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Chintan Trivedi, Nemanja Rašajski, Konstantinos Makantasis, Antonios Liapis, Georgios N. Yannakakis
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Domain randomization is an effective computer vision technique for improving
+transferability of vision models across visually distinct domains exhibiting
+similar content. Existing approaches, however, rely extensively on tweaking
+complex and specialized simulation engines that are difficult to construct,
+subsequently affecting their feasibility and scalability. This paper introduces
+BehAVE, a video understanding framework that uniquely leverages the plethora of
+existing commercial video games for domain randomization, without requiring
+access to their simulation engines. Under BehAVE (1) the inherent rich visual
+diversity of video games acts as the source of randomization and (2) player
+behavior -- represented semantically via textual descriptions of actions --
+guides the *alignment* of videos with similar content. We test BehAVE on 25
+games of the first-person shooter (FPS) genre across various video and text
+foundation models and we report its robustness for domain randomization. BehAVE
+successfully aligns player behavioral patterns and is able to zero-shot
+transfer them to multiple unseen FPS games when trained on just one FPS game.
+In a more challenging setting, BehAVE manages to improve the zero-shot
+transferability of foundation models to unseen FPS games (up to 22%) even when
+trained on a game of a different genre (Minecraft). Code and dataset can be
+found at https://github.com/nrasajski/BehAVE.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ EVEREST: Efficient Masked Video Autoencoder by Removing Redundant
+  Spatiotemporal Tokens <span class="chip">ICML 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2211.10636v5">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2211.10636v5.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Sunil Hwang, Jaehong Yoon, Youngwan Lee, Sung Ju Hwang
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Masked Video Autoencoder (MVA) approaches have demonstrated their potential
+by significantly outperforming previous video representation learning methods.
+However, they waste an excessive amount of computations and memory in
+predicting uninformative tokens/frames due to random masking strategies. (e.g.,
+over 16 nodes with 128 NVIDIA A100 GPUs). To resolve this issue, we exploit the
+unequal information density among the patches in videos and propose EVEREST, a
+surprisingly efficient MVA approach for video representation learning that
+finds tokens containing rich motion features and discards uninformative ones
+during both pre-training and fine-tuning. We further present an
+information-intensive frame selection strategy that allows the model to focus
+on informative and causal frames with minimal redundancy. Our method
+significantly reduces the computation and memory requirements of MVA, enabling
+the pre-training and fine-tuning on a single machine with 8 GPUs while
+achieving comparable performance to computation- and memory-heavy baselines on
+multiple benchmarks and the uncurated Ego4D dataset. We hope that our work
+contributes to reducing the barrier to further research on video understanding.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted by ICML 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Decoupling Dynamic Monocular Videos for Dynamic View Synthesis 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2304.01716v4">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2304.01716v4.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Meng You, Junhui Hou
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  The challenge of dynamic view synthesis from dynamic monocular videos, i.e.,
+synthesizing novel views for free viewpoints given a monocular video of a
+dynamic scene captured by a moving camera, mainly lies in accurately modeling
+the \textbf{dynamic objects} of a scene using limited 2D frames, each with a
+varying timestamp and viewpoint. Existing methods usually require pre-processed
+2D optical flow and depth maps by off-the-shelf methods to supervise the
+network, making them suffer from the inaccuracy of the pre-processed
+supervision and the ambiguity when lifting the 2D information to 3D. In this
+paper, we tackle this challenge in an unsupervised fashion. Specifically, we
+decouple the motion of the dynamic objects into object motion and camera
+motion, respectively regularized by proposed unsupervised surface consistency
+and patch-based multi-view constraints. The former enforces the 3D geometric
+surfaces of moving objects to be consistent over time, while the latter
+regularizes their appearances to be consistent across different viewpoints.
+Such a fine-grained motion formulation can alleviate the learning difficulty
+for the network, thus enabling it to produce not only novel views with higher
+quality but also more accurate scene flows and depth than existing methods
+requiring extra supervision.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                    </div>
+                </details>
+            </article>
+            <article>
+                <details>
+                    <Summary>
+                        Information Retrieval <span class="chip" style="font-size: 60%">13</span>
+                    </Summary>
+                    <div class="details-content">
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Jina CLIP: Your CLIP Model Is Also Your Text Retriever <span class="chip">ICML2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20204v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20204v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Andreas Koukounas, Georgios Mastrapas, Michael Günther, Bo Wang, Scott Martens, Isabelle Mohr, Saba Sturua, Mohammad Kalim Akram, Joan Fontanals Martínez, Saahil Ognawala, Susana Guzman, Maximilian Werk, Nan Wang, Han Xiao
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Contrastive Language-Image Pretraining (CLIP) is widely used to train models
+to align images and texts in a common embedding space by mapping them to
+fixed-sized vectors. These models are key to multimodal information retrieval
+and related tasks. However, CLIP models generally underperform in text-only
+tasks compared to specialized text models. This creates inefficiencies for
+information retrieval systems that keep separate embeddings and models for
+text-only and multimodal tasks. We propose a novel, multi-task contrastive
+training method to address this issue, which we use to train the jina-clip-v1
+model to achieve the state-of-the-art performance on both text-image and
+text-text retrieval tasks.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>4 pages, ICML2024 workshop submission</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Generating Query Recommendations via LLMs 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.19749v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.19749v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Andrea Bacciu, Enrico Palumbo, Andreas Damianou, Nicola Tonellotto, Fabrizio Silvestri
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Query recommendation systems are ubiquitous in modern search engines,
+assisting users in producing effective queries to meet their information needs.
+However, these systems require a large amount of data to produce good
+recommendations, such as a large collection of documents to index and query
+logs. In particular, query logs and user data are not available in cold start
+scenarios. Query logs are expensive to collect and maintain and require complex
+and time-consuming cascading pipelines for creating, combining, and ranking
+recommendations. To address these issues, we frame the query recommendation
+problem as a generative task, proposing a novel approach called Generative
+Query Recommendation (GQR). GQR uses an LLM as its foundation and does not
+require to be trained or fine-tuned to tackle the query recommendation problem.
+We design a prompt that enables the LLM to understand the specific
+recommendation task, even using a single example. We then improved our system
+by proposing a version that exploits query logs called Retriever-Augmented GQR
+(RA-GQR). RA-GQr dynamically composes its prompt by retrieving similar queries
+from query logs. GQR approaches reuses a pre-existing neural architecture
+resulting in a simpler and more ready-to-market approach, even in a cold start
+scenario. Our proposed GQR obtains state-of-the-art performance in terms of
+NDCG@10 and clarity score against two commercial search engines and the
+previous state-of-the-art approach on the Robust04 and ClueWeb09B collections,
+improving on average the NDCG@10 performance up to ~4% on Robust04 and
+ClueWeb09B w.r.t the previous best competitor. RA-GQR further improve the
+NDCG@10 obtaining an increase of ~11%, ~6\% on Robust04 and ClueWeb09B w.r.t
+the best competitor. Furthermore, our system obtained ~59% of user preferences
+in a blind user study, proving that our method produces the most engaging
+queries.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Generating Query Recommendations via LLMs</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Uncertainty-aware sign language video retrieval with probability
+  distribution modeling 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.19689v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.19689v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Xuan Wu, Hongxiang Li, Yuanjiang Luo, Xuxin Cheng, Xianwei Zhuang, Meng Cao, Keren Fu
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Sign language video retrieval plays a key role in facilitating information
+access for the deaf community. Despite significant advances in video-text
+retrieval, the complexity and inherent uncertainty of sign language preclude
+the direct application of these techniques. Previous methods achieve the
+mapping between sign language video and text through fine-grained modal
+alignment. However, due to the scarcity of fine-grained annotation, the
+uncertainty inherent in sign language video is underestimated, limiting the
+further development of sign language retrieval tasks. To address this
+challenge, we propose a novel Uncertainty-aware Probability Distribution
+Retrieval (UPRet), that conceptualizes the mapping process of sign language
+video and text in terms of probability distributions, explores their potential
+interrelationships, and enables flexible mappings. Experiments on three
+benchmarks demonstrate the effectiveness of our method, which achieves
+state-of-the-art results on How2Sign (59.1%), PHOENIX-2014T (72.0%), and
+CSL-Daily (78.4%).
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Keyword-driven Retrieval-Augmented Large Language Models for Cold-start
+  User Recommendations 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.19612v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.19612v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Hai-Dang Kieu, Minh Duc Nguyen, Thanh-Son Nguyen, Dung D. Le
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Recent advancements in Large Language Models (LLMs) have shown significant
+potential in enhancing recommender systems. However, addressing the cold-start
+recommendation problem, where users lack historical data, remains a
+considerable challenge. In this paper, we introduce KALM4Rec (Keyword-driven
+Retrieval-Augmented Large Language Models for Cold-start User Recommendations),
+a novel framework specifically designed to tackle this problem by requiring
+only a few input keywords from users in a practical scenario of cold-start user
+restaurant recommendations. KALM4Rec operates in two main stages: candidates
+retrieval and LLM-based candidates re-ranking. In the first stage,
+keyword-driven retrieval models are used to identify potential candidates,
+addressing LLMs' limitations in processing extensive tokens and reducing the
+risk of generating misleading information. In the second stage, we employ LLMs
+with various prompting strategies, including zero-shot and few-shot techniques,
+to re-rank these candidates by integrating multiple examples directly into the
+LLM prompts. Our evaluation, using a Yelp restaurant dataset with user reviews
+from three English-speaking cities, shows that our proposed framework
+significantly improves recommendation quality. Specifically, the integration of
+in-context instructions with LLMs for re-ranking markedly enhances the
+performance of the cold-start user recommender system.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>10 pages, 10 figures, 4 tables</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Retrieval Augmented Structured Generation: Business Document Information
+  Extraction As Tool Use 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20245v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20245v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Franz Louis Cesista, Rui Aguiar, Jason Kim, Paolo Acilo
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Business Document Information Extraction (BDIE) is the problem of
+transforming a blob of unstructured information (raw text, scanned documents,
+etc.) into a structured format that downstream systems can parse and use. It
+has two main tasks: Key-Information Extraction (KIE) and Line Items Recognition
+(LIR). In this paper, we argue that BDIE is best modeled as a Tool Use problem,
+where the tools are these downstream systems. We then present Retrieval
+Augmented Structured Generation (RASG), a novel general framework for BDIE that
+achieves state of the art (SOTA) results on both KIE and LIR tasks on BDIE
+benchmarks.
+  The contributions of this paper are threefold: (1) We show, with ablation
+benchmarks, that Large Language Models (LLMs) with RASG are already competitive
+with or surpasses current SOTA Large Multimodal Models (LMMs) without RASG on
+BDIE benchmarks. (2) We propose a new metric class for Line Items Recognition,
+General Line Items Recognition Metric (GLIRM), that is more aligned with
+practical BDIE use cases compared to existing metrics, such as ANLS*, DocILE,
+and GriTS. (3) We provide a heuristic algorithm for backcalculating bounding
+boxes of predicted line items and tables without the need for vision encoders.
+Finally, we claim that, while LMMs might sometimes offer marginal performance
+benefits, LLMs + RASG is oftentimes superior given real-world applications and
+constraints of BDIE.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted by IEEE 7th International Conference on Multimedia
+  Information Processing and Retrieval (MIPR), 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Extending the Massive Text Embedding Benchmark to French 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20468v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20468v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Mathieu Ciancone, Imene Kerboua, Marion Schaeffer, Wissam Siblini
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  In recent years, numerous embedding models have been made available and
+widely used for various NLP tasks. Choosing a model that performs well for
+several tasks in English has been largely simplified by the Massive Text
+Embedding Benchmark (MTEB), but extensions to other languages remain
+challenging. This is why we expand MTEB to propose the first massive benchmark
+of sentence embeddings for French. Not only we gather 22 existing datasets in
+an easy-to-use interface, but we also create three new French datasets for a
+global evaluation over 8 different tasks. We perform a large scale comparison
+with 46 carefully selected embedding models, conduct comprehensive statistical
+tests, and analyze the correlation between model performance and many of their
+characteristics. We find out that even if no model is the best on all tasks,
+large multilingual models pre-trained on sentence similarity perform
+particularly well. Our work comes with open-source code, new datasets and a
+public leaderboard.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Designing an Evaluation Framework for Large Language Models in Astronomy
+  Research 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20389v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20389v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        John F. Wu, Alina Hyk, Kiera McCormick, Christine Ye, Simone Astarita, Elina Baral, Jo Ciuca, Jesse Cranney, Anjalie Field, Kartheik Iyer, Philipp Koehn, Jenn Kotler, Sandor Kruk, Michelle Ntampaka, Charles O'Neill, Joshua E. G. Peek, Sanjib Sharma, Mikaeel Yunus
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Large Language Models (LLMs) are shifting how scientific research is done. It
+is imperative to understand how researchers interact with these models and how
+scientific sub-communities like astronomy might benefit from them. However,
+there is currently no standard for evaluating the use of LLMs in astronomy.
+Therefore, we present the experimental design for an evaluation study on how
+astronomy researchers interact with LLMs. We deploy a Slack chatbot that can
+answer queries from users via Retrieval-Augmented Generation (RAG); these
+responses are grounded in astronomy papers from arXiv. We record and anonymize
+user questions and chatbot answers, user upvotes and downvotes to LLM
+responses, user feedback to the LLM, and retrieved documents and similarity
+scores with the query. Our data collection method will enable future dynamic
+evaluations of LLM tools for astronomy.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>7 pages, 3 figures. Code available at
+  https://github.com/jsalt2024-evaluating-llms-for-astronomy/astro-arxiv-bot</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Croissant: A Metadata Format for ML-Ready <span class="highlight-title">Dataset</span>s <span class="chip">SIGMOD</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2403.19546v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2403.19546v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Mubashara Akhtar, Omar Benjelloun, Costanza Conforti, Pieter Gijsbers, Joan Giner-Miguelez, Nitisha Jain, Michael Kuchnik, Quentin Lhoest, Pierre Marcenac, Manil Maskey, Peter Mattson, Luis Oala, Pierre Ruyssen, Rajat Shinde, Elena Simperl, Goeffry Thomas, Slava Tykhonov, Joaquin Vanschoren, Jos van der Velde, Steffen Vogler, Carole-Jean Wu
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Data is a critical resource for Machine Learning (ML), yet working with data
+remains a key friction point. This paper introduces Croissant, a metadata
+format for datasets that simplifies how data is used by ML tools and
+frameworks. Croissant makes datasets more discoverable, portable and
+interoperable, thereby addressing significant challenges in ML data management
+and responsible AI. Croissant is already supported by several popular dataset
+repositories, spanning hundreds of thousands of datasets, ready to be loaded
+into the most popular ML frameworks.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Published in Proceedings of ACM SIGMOD/PODS'24 Data Management for
+  End-to-End Machine Learning (DEEM) Workshop
+  https://dl.acm.org/doi/10.1145/3650203.3663326</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ CaLa: Complementary Association Learning for Augmenting Composed Image
+  Retrieval <span class="chip">SIGIR 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.19149v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.19149v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Xintong Jiang, Yaxiong Wang, Mengjian Li, Yujiao Wu, Bingwen Hu, Xueming Qian
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Composed Image Retrieval (CIR) involves searching for target images based on
+an image-text pair query. While current methods treat this as a query-target
+matching problem, we argue that CIR triplets contain additional associations
+beyond this primary relation. In our paper, we identify two new relations
+within triplets, treating each triplet as a graph node. Firstly, we introduce
+the concept of text-bridged image alignment, where the query text serves as a
+bridge between the query image and the target image. We propose a hinge-based
+cross-attention mechanism to incorporate this relation into network learning.
+Secondly, we explore complementary text reasoning, considering CIR as a form of
+cross-modal retrieval where two images compose to reason about complementary
+text. To integrate these perspectives effectively, we design a twin
+attention-based compositor. By combining these complementary associations with
+the explicit query pair-target image relation, we establish a comprehensive set
+of constraints for CIR. Our framework, CaLa (Complementary Association Learning
+for Augmenting Composed Image Retrieval), leverages these insights. We evaluate
+CaLa on CIRR and FashionIQ benchmarks with multiple backbones, demonstrating
+its superiority in composed image retrieval.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>To appear at SIGIR 2024. arXiv admin note: text overlap with
+  arXiv:2309.02169</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Lookahead: An Inference Acceleration Framework for Large Language Model
+  with Lossless Generation Accuracy 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2312.12728v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2312.12728v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Yao Zhao, Zhitian Xie, Chen Liang, Chenyi Zhuang, Jinjie Gu
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  As Large Language Models (LLMs) have made significant advancements across
+various tasks, such as question answering, translation, text summarization, and
+dialogue systems, the need for accuracy in information becomes crucial,
+especially for serious financial products serving billions of users like
+Alipay. However, for a real-world product serving millions of users, the
+inference speed of LLMs becomes a critical factor compared to a mere
+experimental model.
+  Hence, this paper presents a generic framework for accelerating the inference
+process, resulting in a substantial increase in speed and cost reduction for
+our LLM-based scenarios, with lossless generation accuracy. In the traditional
+inference process, each token is generated sequentially by the LLM, leading to
+a time consumption proportional to the number of generated tokens. To enhance
+this process, our framework, named \textit{lookahead}, introduces a
+\textit{multi-branch} strategy. Instead of generating a single token at a time,
+we propose a Trie-based retrieval and verification mechanism to be able to
+accept several tokens at a forward step. Our strategy offers two distinct
+advantages: (1) it guarantees absolute correctness of the output, avoiding any
+approximation algorithms, and (2) the worst-case performance of our approach is
+equivalent to the conventional process. We conduct extensive experiments to
+demonstrate the significant improvements achieved by applying our inference
+acceleration framework. Our framework is widely deployed in Alipay since April
+2023, and obtain remarkable 2.66x to 6.26x speedup. Our code is available at
+https://github.com/alipay/PainlessInferenceAcceleration.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>10 pages, 6 figures</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ A Setwise Approach for Effective and Highly Efficient Zero-shot Ranking
+  with Large Language Models <span class="chip">SIGIR2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2310.09497v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2310.09497v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Shengyao Zhuang, Honglei Zhuang, Bevan Koopman, Guido Zuccon
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  We propose a novel zero-shot document ranking approach based on Large
+Language Models (LLMs): the Setwise prompting approach. Our approach
+complements existing prompting approaches for LLM-based zero-shot ranking:
+Pointwise, Pairwise, and Listwise. Through the first-of-its-kind comparative
+evaluation within a consistent experimental framework and considering factors
+like model size, token consumption, latency, among others, we show that
+existing approaches are inherently characterised by trade-offs between
+effectiveness and efficiency. We find that while Pointwise approaches score
+high on efficiency, they suffer from poor effectiveness. Conversely, Pairwise
+approaches demonstrate superior effectiveness but incur high computational
+overhead. Our Setwise approach, instead, reduces the number of LLM inferences
+and the amount of prompt token consumption during the ranking procedure,
+compared to previous methods. This significantly improves the efficiency of
+LLM-based zero-shot ranking, while also retaining high zero-shot ranking
+effectiveness. We make our code and results publicly available at
+\url{https://github.com/ielab/llm-rankers}.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>SIGIR2024 full paper</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ AntCritic: Argument Mining for Free-Form and Visually-Rich Financial
+  Comments 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2208.09612v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2208.09612v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Huadai Liu, Wenqiang Xu, Xuan Lin, Jingjing Huo, Hong Chen, Zhou Zhao
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Argument mining aims to detect all possible argumentative components and
+identify their relationships automatically. As a thriving task in natural
+language processing, there has been a large amount of corpus for academic study
+and application development in this field. However, the research in this area
+is still constrained by the inherent limitations of existing datasets.
+Specifically, all the publicly available datasets are relatively small in
+scale, and few of them provide information from other modalities to facilitate
+the learning process. Moreover, the statements and expressions in these corpora
+are usually in a compact form, which restricts the generalization ability of
+models. To this end, we collect a novel dataset AntCritic to serve as a helpful
+complement to this area, which consists of about 10k free-form and
+visually-rich financial comments and supports both argument component detection
+and argument relation prediction tasks. Besides, to cope with the challenges
+brought by scenario expansion, we thoroughly explore the fine-grained relation
+prediction and structure reconstruction scheme and discuss the encoding
+mechanism for visual styles and layouts. On this basis, we design two simple
+but effective model architectures and conduct various experiments on this
+dataset to provide benchmark performances as a reference and verify the
+practicability of our proposed architecture. We release our data and code in
+this link, and this dataset follows CC BY-NC-ND 4.0 license.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Better Generalization with Semantic IDs: A Case Study in Ranking for
+  Recommendations 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2306.08121v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2306.08121v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Anima Singh, Trung Vu, Nikhil Mehta, Raghunandan Keshavan, Maheswaran Sathiamoorthy, Yilin Zheng, Lichan Hong, Lukasz Heldt, Li Wei, Devansh Tandon, Ed H. Chi, Xinyang Yi
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Randomly-hashed item ids are used ubiquitously in recommendation models.
+However, the learned representations from random hashing prevents
+generalization across similar items, causing problems of learning unseen and
+long-tail items, especially when item corpus is large, power-law distributed,
+and evolving dynamically. In this paper, we propose using content-derived
+features as a replacement for random ids. We show that simply replacing ID
+features with content-based embeddings can cause a drop in quality due to
+reduced memorization capability. To strike a good balance of memorization and
+generalization, we propose to use Semantic IDs -- a compact discrete item
+representation learned from frozen content embeddings using RQ-VAE that
+captures the hierarchy of concepts in items -- as a replacement for random item
+ids. Similar to content embeddings, the compactness of Semantic IDs poses a
+problem of easy adaption in recommendation models. We propose novel methods for
+adapting Semantic IDs in industry-scale ranking models, through hashing
+sub-pieces of of the Semantic-ID sequences. In particular, we find that the
+SentencePiece model that is commonly used in LLM tokenization outperforms
+manually crafted pieces such as N-grams. To the end, we evaluate our approaches
+in a real-world ranking model for YouTube recommendations. Our experiments
+demonstrate that Semantic IDs can replace the direct use of video IDs by
+improving the generalization ability on new and long-tail item slices without
+sacrificing overall model quality.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                    </div>
+                </details>
+            </article>
+            <article>
+                <details>
+                    <Summary>
+                        Machine Learning <span class="chip" style="font-size: 60%">151</span>
+                    </Summary>
+                    <div class="details-content">
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Unique3D: High-Quality and Efficient 3D Mesh Generation from a Single
+  Image 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20343v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20343v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Kailu Wu, Fangfu Liu, Zhihan Cai, Runjie Yan, Hanyang Wang, Yating Hu, Yueqi Duan, Kaisheng Ma
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  In this work, we introduce Unique3D, a novel image-to-3D framework for
+efficiently generating high-quality 3D meshes from single-view images,
+featuring state-of-the-art generation fidelity and strong generalizability.
+Previous methods based on Score Distillation Sampling (SDS) can produce
+diversified 3D results by distilling 3D knowledge from large 2D diffusion
+models, but they usually suffer from long per-case optimization time with
+inconsistent issues. Recent works address the problem and generate better 3D
+results either by finetuning a multi-view diffusion model or training a fast
+feed-forward model. However, they still lack intricate textures and complex
+geometries due to inconsistency and limited generated resolution. To
+simultaneously achieve high fidelity, consistency, and efficiency in single
+image-to-3D, we propose a novel framework Unique3D that includes a multi-view
+diffusion model with a corresponding normal diffusion model to generate
+multi-view images with their normal maps, a multi-level upscale process to
+progressively improve the resolution of generated orthographic multi-views, as
+well as an instant and consistent mesh reconstruction algorithm called ISOMER,
+which fully integrates the color and geometric priors into mesh results.
+Extensive experiments demonstrate that our Unique3D significantly outperforms
+other image-to-3D baselines in terms of geometric and textural details.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Project page: https://wukailu.github.io/Unique3D</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ From Zero to Hero: Cold-Start Anomaly Detection <span class="chip">ACL 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20341v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20341v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Tal Reiss, George Kour, Naama Zwerdling, Ateret Anaby-Tavor, Yedid Hoshen
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  When first deploying an anomaly detection system, e.g., to detect
+out-of-scope queries in chatbots, there are no observed data, making
+data-driven approaches ineffective. Zero-shot anomaly detection methods offer a
+solution to such "cold-start" cases, but unfortunately they are often not
+accurate enough. This paper studies the realistic but underexplored cold-start
+setting where an anomaly detection model is initialized using zero-shot
+guidance, but subsequently receives a small number of contaminated observations
+(namely, that may include anomalies). The goal is to make efficient use of both
+the zero-shot guidance and the observations. We propose ColdFusion, a method
+that effectively adapts the zero-shot anomaly detector to contaminated
+observations. To support future development of this new setting, we propose an
+evaluation suite consisting of evaluation protocols and metrics.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>ACL 2024. Our code is available at
+  https://github.com/talreiss/ColdFusion</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ CoSy: Evaluating Textual Explanations of Neurons 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20331v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20331v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Laura Kopf, Philine Lou Bommer, Anna Hedström, Sebastian Lapuschkin, Marina M. -C. Höhne, Kirill Bykov
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  A crucial aspect of understanding the complex nature of Deep Neural Networks
+(DNNs) is the ability to explain learned concepts within their latent
+representations. While various methods exist to connect neurons to textual
+descriptions of human-understandable concepts, evaluating the quality of these
+explanation methods presents a major challenge in the field due to a lack of
+unified, general-purpose quantitative evaluation. In this work, we introduce
+CoSy (Concept Synthesis) -- a novel, architecture-agnostic framework to
+evaluate the quality of textual explanations for latent neurons. Given textual
+explanations, our proposed framework leverages a generative model conditioned
+on textual input to create data points representing the textual explanation.
+Then, the neuron's response to these explanation data points is compared with
+the response to control data points, providing a quality estimate of the given
+explanation. We ensure the reliability of our proposed framework in a series of
+meta-evaluation experiments and demonstrate practical value through insights
+from benchmarking various concept-based textual explanation methods for
+Computer Vision tasks, showing that tested explanation methods significantly
+differ in quality.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>10 pages, 5 figures</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Don't drop your samples! Coherence-aware training benefits Conditional
+  diffusion <span class="chip">CVPR 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20324v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20324v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Nicolas Dufour, Victor Besnier, Vicky Kalogeiton, David Picard
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Conditional diffusion models are powerful generative models that can leverage
+various types of conditional information, such as class labels, segmentation
+masks, or text captions. However, in many real-world scenarios, conditional
+information may be noisy or unreliable due to human annotation errors or weak
+alignment. In this paper, we propose the Coherence-Aware Diffusion (CAD), a
+novel method that integrates coherence in conditional information into
+diffusion models, allowing them to learn from noisy annotations without
+discarding data. We assume that each data point has an associated coherence
+score that reflects the quality of the conditional information. We then
+condition the diffusion model on both the conditional information and the
+coherence score. In this way, the model learns to ignore or discount the
+conditioning when the coherence is low. We show that CAD is theoretically sound
+and empirically effective on various conditional generation tasks. Moreover, we
+show that leveraging coherence generates realistic and diverse samples that
+respect conditional information better than models trained on cleaned datasets
+where samples with low coherence have been discarded.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted at CVPR 2024 as a Highlight. Project page:
+  https://nicolas-dufour.github.io/cad.html</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Vision-based Manipulation from Single Human Video with Open-World Object
+  Graphs 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20321v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20321v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Yifeng Zhu, Arisrei Lim, Peter Stone, Yuke Zhu
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  We present an object-centric approach to empower robots to learn vision-based
+manipulation skills from human videos. We investigate the problem of imitating
+robot manipulation from a single human video in the open-world setting, where a
+robot must learn to manipulate novel objects from one video demonstration. We
+introduce ORION, an algorithm that tackles the problem by extracting an
+object-centric manipulation plan from a single RGB-D video and deriving a
+policy that conditions on the extracted plan. Our method enables the robot to
+learn from videos captured by daily mobile devices such as an iPad and
+generalize the policies to deployment environments with varying visual
+backgrounds, camera angles, spatial layouts, and novel object instances. We
+systematically evaluate our method on both short-horizon and long-horizon
+tasks, demonstrating the efficacy of ORION in learning from a single human
+video in the open world. Videos can be found in the project website
+https://ut-austin-rpl.github.io/ORION-release.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Improving the Training of Rectified Flows 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20320v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20320v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Sangyun Lee, Zinan Lin, Giulia Fanti
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Diffusion models have shown great promise for image and video generation, but
+sampling from state-of-the-art models requires expensive numerical integration
+of a generative ODE. One approach for tackling this problem is rectified flows,
+which iteratively learn smooth ODE paths that are less susceptible to
+truncation error. However, rectified flows still require a relatively large
+number of function evaluations (NFEs). In this work, we propose improved
+techniques for training rectified flows, allowing them to compete with
+knowledge distillation methods even in the low NFE setting. Our main insight is
+that under realistic settings, a single iteration of the Reflow algorithm for
+training rectified flows is sufficient to learn nearly straight trajectories;
+hence, the current practice of using multiple Reflow iterations is unnecessary.
+We thus propose techniques to improve one-round training of rectified flows,
+including a U-shaped timestep distribution and LPIPS-Huber premetric. With
+these techniques, we improve the FID of the previous 2-rectified flow by up to
+72% in the 1 NFE setting on CIFAR-10. On ImageNet 64$\times$64, our improved
+rectified flow outperforms the state-of-the-art distillation methods such as
+consistency distillation and progressive distillation in both one-step and
+two-step settings and rivals the performance of improved consistency training
+(iCT) in FID. Code is available at https://github.com/sangyun884/rfpp.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ CausalQuest: Collecting Natural Causal Questions for AI Agents 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20318v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20318v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Roberto Ceraolo, Dmitrii Kharlapenko, Amélie Reymond, Rada Mihalcea, Mrinmaya Sachan, Bernhard Schölkopf, Zhijing Jin
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Humans have an innate drive to seek out causality. Whether fuelled by
+curiosity or specific goals, we constantly question why things happen, how they
+are interconnected, and many other related phenomena. To develop AI agents
+capable of addressing this natural human quest for causality, we urgently need
+a comprehensive dataset of natural causal questions. Unfortunately, existing
+datasets either contain only artificially-crafted questions that do not reflect
+real AI usage scenarios or have limited coverage of questions from specific
+sources. To address this gap, we present CausalQuest, a dataset of 13,500
+naturally occurring questions sourced from social networks, search engines, and
+AI assistants. We formalize the definition of causal questions and establish a
+taxonomy for finer-grained classification. Through a combined effort of human
+annotators and large language models (LLMs), we carefully label the dataset. We
+find that 42% of the questions humans ask are indeed causal, with the majority
+seeking to understand the causes behind given effects. Using this dataset, we
+train efficient classifiers (up to 2.85B parameters) for the binary task of
+identifying causal questions, achieving high performance with F1 scores of up
+to 0.877. We conclude with a rich set of future research directions that can
+build upon our data and models.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Sequence-Augmented SE(3)-Flow Matching For Conditional Protein Backbone
+  Generation 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20313v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20313v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Guillaume Huguet, James Vuckovic, Kilian Fatras, Eric Thibodeau-Laufer, Pablo Lemos, Riashat Islam, Cheng-Hao Liu, Jarrid Rector-Brooks, Tara Akhound-Sadegh, Michael Bronstein, Alexander Tong, Avishek Joey Bose
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Proteins are essential for almost all biological processes and derive their
+diverse functions from complex 3D structures, which are in turn determined by
+their amino acid sequences. In this paper, we exploit the rich biological
+inductive bias of amino acid sequences and introduce FoldFlow-2, a novel
+sequence-conditioned SE(3)-equivariant flow matching model for protein
+structure generation. FoldFlow-2 presents substantial new architectural
+features over the previous FoldFlow family of models including a protein large
+language model to encode sequence, a new multi-modal fusion trunk that combines
+structure and sequence representations, and a geometric transformer based
+decoder. To increase diversity and novelty of generated samples -- crucial for
+de-novo drug design -- we train FoldFlow-2 at scale on a new dataset that is an
+order of magnitude larger than PDB datasets of prior works, containing both
+known proteins in PDB and high-quality synthetic structures achieved through
+filtering. We further demonstrate the ability to align FoldFlow-2 to arbitrary
+rewards, e.g. increasing secondary structures diversity, by introducing a
+Reinforced Finetuning (ReFT) objective. We empirically observe that FoldFlow-2
+outperforms previous state-of-the-art protein structure-based generative
+models, improving over RFDiffusion in terms of unconditional generation across
+all metrics including designability, diversity, and novelty across all protein
+lengths, as well as exhibiting generalization on the task of equilibrium
+conformation sampling. Finally, we demonstrate that a fine-tuned FoldFlow-2
+makes progress on challenging conditional design tasks such as designing
+scaffolds for the VHH nanobody.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>preprint</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Large Language Models Can Self-Improve At Web Agent Tasks 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20309v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20309v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Ajay Patel, Markus Hofmarcher, Claudiu Leoveanu-Condrei, Marius-Constantin Dinu, Chris Callison-Burch, Sepp Hochreiter
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Training models to act as agents that can effectively navigate and perform
+actions in a complex environment, such as a web browser, has typically been
+challenging due to lack of training data. Large language models (LLMs) have
+recently demonstrated some capability to navigate novel environments as agents
+in a zero-shot or few-shot fashion, purely guided by natural language
+instructions as prompts. Recent research has also demonstrated LLMs have the
+capability to exceed their base performance through self-improvement, i.e.
+fine-tuning on data generated by the model itself. In this work, we explore the
+extent to which LLMs can self-improve their performance as agents in
+long-horizon tasks in a complex environment using the WebArena benchmark. In
+WebArena, an agent must autonomously navigate and perform actions on web pages
+to achieve a specified objective. We explore fine-tuning on three distinct
+synthetic training data mixtures and achieve a 31\% improvement in task
+completion rate over the base model on the WebArena benchmark through a
+self-improvement procedure. We additionally contribute novel evaluation metrics
+for assessing the performance, robustness, capabilities, and quality of
+trajectories of our fine-tuned agent models to a greater degree than simple,
+aggregate-level benchmark scores currently used to measure self-improvement.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Group Robust Preference Optimization in Reward-free RLHF 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20304v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20304v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Shyam Sundhar Ramesh, Yifan Hu, Iason Chaimalas, Viraj Mehta, Pier Giuseppe Sessa, Haitham Bou Ammar, Ilija Bogunovic
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Adapting large language models (LLMs) for specific tasks usually involves
+fine-tuning through reinforcement learning with human feedback (RLHF) on
+preference data. While these data often come from diverse labelers' groups
+(e.g., different demographics, ethnicities, company teams, etc.), traditional
+RLHF approaches adopt a "one-size-fits-all" approach, i.e., they
+indiscriminately assume and optimize a single preference model, thus not being
+robust to unique characteristics and needs of the various groups. To address
+this limitation, we propose a novel Group Robust Preference Optimization (GRPO)
+method to align LLMs to individual groups' preferences robustly. Our approach
+builds upon reward-free direct preference optimization methods, but unlike
+previous approaches, it seeks a robust policy which maximizes the worst-case
+group performance. To achieve this, GRPO adaptively and sequentially weights
+the importance of different groups, prioritizing groups with worse cumulative
+loss. We theoretically study the feasibility of GRPO and analyze its
+convergence for the log-linear policy class. By fine-tuning LLMs with GRPO
+using diverse group-based global opinion data, we significantly improved
+performance for the worst-performing groups, reduced loss imbalances across
+groups, and improved probability accuracies compared to non-robust baselines.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Preprint</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ DITTO-2: Distilled Diffusion Inference-Time T-Optimization for Music
+  Generation 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20289v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20289v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Zachary Novack, Julian McAuley, Taylor Berg-Kirkpatrick, Nicholas Bryan
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Controllable music generation methods are critical for human-centered
+AI-based music creation, but are currently limited by speed, quality, and
+control design trade-offs. Diffusion Inference-Time T-optimization (DITTO), in
+particular, offers state-of-the-art results, but is over 10x slower than
+real-time, limiting practical use. We propose Distilled Diffusion
+Inference-Time T -Optimization (or DITTO-2), a new method to speed up
+inference-time optimization-based control and unlock faster-than-real-time
+generation for a wide-variety of applications such as music inpainting,
+outpainting, intensity, melody, and musical structure control. Our method works
+by (1) distilling a pre-trained diffusion model for fast sampling via an
+efficient, modified consistency or consistency trajectory distillation process
+(2) performing inference-time optimization using our distilled model with
+one-step sampling as an efficient surrogate optimization task and (3) running a
+final multi-step sampling generation (decoding) using our estimated noise
+latents for best-quality, fast, controllable generation. Through thorough
+evaluation, we find our method not only speeds up generation over 10-20x, but
+simultaneously improves control adherence and generation quality all at once.
+Furthermore, we apply our approach to a new application of maximizing text
+adherence (CLAP score) and show we can convert an unconditional diffusion model
+without text inputs into a model that yields state-of-the-art text control.
+Sound examples can be found at https://ditto-music.github.io/ditto2/.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Flexible SE(2) graph neural networks with applications to PDE surrogates 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20287v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20287v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Maria Bånkestad, Olof Mogren, Aleksis Pirinen
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  This paper presents a novel approach for constructing graph neural networks
+equivariant to 2D rotations and translations and leveraging them as PDE
+surrogates on non-gridded domains. We show that aligning the representations
+with the principal axis allows us to sidestep many constraints while preserving
+SE(2) equivariance. By applying our model as a surrogate for fluid flow
+simulations and conducting thorough benchmarks against non-equivariant models,
+we demonstrate significant gains in terms of both data efficiency and accuracy.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>9 pages</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Length independent generalization bounds for deep SSM architectures with
+  stability constraints 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20278v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20278v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Dániel Rácz, Mihály Petreczky, Bálint Daróczy
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Many state-of-the-art models trained on long-range sequences, for example S4,
+S5 or LRU, are made of sequential blocks combining State-Space Models (SSMs)
+with neural networks. In this paper we provide a PAC bound that holds for these
+kind of architectures with stable SSM blocks and does not depend on the length
+of the input sequence. Imposing stability of the SSM blocks is a standard
+practice in the literature, and it is known to help performance. Our results
+provide a theoretical justification for the use of stable SSM blocks as the
+proposed PAC bound decreases as the degree of stability of the SSM blocks
+increases.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>25 pages, no figures, under submission</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ ROAST: <span class="highlight-title">Review</span>-level Opinion Aspect Sentiment Target Joint Detection 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20274v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20274v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Siva Uday Sampreeth Chebolu, Franck Dernoncourt, Nedim Lipka, Thamar Solorio
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Aspect-Based Sentiment Analysis (ABSA) has experienced tremendous expansion
+and diversity due to various shared tasks spanning several languages and fields
+and organized via SemEval workshops and Germeval. Nonetheless, a few
+shortcomings still need to be addressed, such as the lack of low-resource
+language evaluations and the emphasis on sentence-level analysis. To thoroughly
+assess ABSA techniques in the context of complete reviews, this research
+presents a novel task, Review-Level Opinion Aspect Sentiment Target (ROAST).
+ROAST seeks to close the gap between sentence-level and text-level ABSA by
+identifying every ABSA constituent at the review level. We extend the available
+datasets to enable ROAST, addressing the drawbacks noted in previous research
+by incorporating low-resource languages, numerous languages, and a variety of
+topics. Through this effort, ABSA research will be able to cover more ground
+and get a deeper comprehension of the task and its practical application in a
+variety of languages and domains (https://github.com/RiTUAL-UH/ROAST-ABSA).
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>arXiv admin note: text overlap with arXiv:2309.13297</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Reconstruction Attacks on Machine Unlearning: Simple Models are
+  Vulnerable 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20272v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20272v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Martin Bertran, Shuai Tang, Michael Kearns, Jamie Morgenstern, Aaron Roth, Zhiwei Steven Wu
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Machine unlearning is motivated by desire for data autonomy: a person can
+request to have their data's influence removed from deployed models, and those
+models should be updated as if they were retrained without the person's data.
+We show that, counter-intuitively, these updates expose individuals to
+high-accuracy reconstruction attacks which allow the attacker to recover their
+data in its entirety, even when the original models are so simple that privacy
+risk might not otherwise have been a concern. We show how to mount a
+near-perfect attack on the deleted data point from linear regression models. We
+then generalize our attack to other loss functions and architectures, and
+empirically demonstrate the effectiveness of our attacks across a wide range of
+datasets (capturing both tabular and image data). Our work highlights that
+privacy risk is significant even for extremely simple model classes when
+individuals can request deletion of their data from the model.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ ETHER: Efficient Finetuning of Large-Scale Models with Hyperplane
+  Reflections <span class="chip">ICML 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20271v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20271v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Massimo Bini, Karsten Roth, Zeynep Akata, Anna Khoreva
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Parameter-efficient finetuning (PEFT) has become ubiquitous to adapt
+foundation models to downstream task requirements while retaining their
+generalization ability. However, the amount of additionally introduced
+parameters and compute for successful adaptation and hyperparameter searches
+can explode quickly, especially when deployed at scale to serve numerous
+individual requests. To ensure effective, parameter-efficient, and
+hyperparameter-robust adaptation, we propose the ETHER transformation family,
+which performs Efficient fineTuning via HypErplane Reflections. By design,
+ETHER transformations require a minimal number of parameters, are less likely
+to deteriorate model performance, and exhibit robustness to hyperparameter and
+learning rate choices. In particular, we introduce ETHER and its relaxation
+ETHER+, which match or outperform existing PEFT methods with significantly
+fewer parameters ($\sim$$10$-$100$ times lower than LoRA or OFT) across
+multiple image synthesis and natural language tasks without exhaustive
+hyperparameter tuning. Finally, we investigate the recent emphasis on
+Hyperspherical Energy retention for adaptation and raise questions on its
+practical utility. The code is available at https://github.com/mwbini/ether.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted to ICML 2024. Code available at
+  https://github.com/mwbini/ether</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Entropy annealing for policy mirror descent in continuous time and space 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20250v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20250v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Deven Sethi, David Šiška, Yufei Zhang
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Entropy regularization has been extensively used in policy optimization
+algorithms to regularize the optimization landscape and accelerate convergence;
+however, it comes at the cost of introducing an additional regularization bias.
+This work quantifies the impact of entropy regularization on the convergence of
+policy gradient methods for stochastic exit time control problems. We analyze a
+continuous-time policy mirror descent dynamics, which updates the policy based
+on the gradient of an entropy-regularized value function and adjusts the
+strength of entropy regularization as the algorithm progresses. We prove that
+with a fixed entropy level, the dynamics converges exponentially to the optimal
+solution of the regularized problem. We further show that when the entropy
+level decays at suitable polynomial rates, the annealed flow converges to the
+solution of the unregularized problem at a rate of $\mathcal O(1/S)$ for
+discrete action spaces and, under suitable conditions, at a rate of $\mathcal
+O(1/\sqrt{S})$ for general action spaces, with $S$ being the gradient flow
+time. This paper explains how entropy regularization improves policy
+optimization, even with the true gradient, from the perspective of convergence
+rate.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ KerasCV and KerasNLP: Vision and Language Power-Ups 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20247v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20247v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Matthew Watson, Divyashree Shivakumar Sreepathihalli, Francois Chollet, Martin Gorner, Kiranbir Sodhia, Ramesh Sampath, Tirth Patel, Haifeng Jin, Neel Kovelamudi, Gabriel Rasskin, Samaneh Saadat, Luke Wood, Chen Qian, Jonathan Bischof, Ian Stenbit
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  We present the Keras domain packages KerasCV and KerasNLP, extensions of the
+Keras API for Computer Vision and Natural Language Processing workflows,
+capable of running on either JAX, TensorFlow, or PyTorch. These domain packages
+are designed to enable fast experimentation, with a focus on ease-of-use and
+performance. We adopt a modular, layered design: at the library's lowest level
+of abstraction, we provide building blocks for creating models and data
+preprocessing pipelines, and at the library's highest level of abstraction, we
+provide pretrained ``task" models for popular architectures such as Stable
+Diffusion, YOLOv8, GPT2, BERT, Mistral, CLIP, Gemma, T5, etc. Task models have
+built-in preprocessing, pretrained weights, and can be fine-tuned on raw
+inputs. To enable efficient training, we support XLA compilation for all
+models, and run all preprocessing via a compiled graph of TensorFlow operations
+using the tf.data API. The libraries are fully open-source (Apache 2.0 license)
+and available on GitHub.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Submitted to Journal of Machine Learning Open Source Software</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Retrieval Augmented Structured Generation: Business Document Information
+  Extraction As Tool Use 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20245v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20245v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Franz Louis Cesista, Rui Aguiar, Jason Kim, Paolo Acilo
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Business Document Information Extraction (BDIE) is the problem of
+transforming a blob of unstructured information (raw text, scanned documents,
+etc.) into a structured format that downstream systems can parse and use. It
+has two main tasks: Key-Information Extraction (KIE) and Line Items Recognition
+(LIR). In this paper, we argue that BDIE is best modeled as a Tool Use problem,
+where the tools are these downstream systems. We then present Retrieval
+Augmented Structured Generation (RASG), a novel general framework for BDIE that
+achieves state of the art (SOTA) results on both KIE and LIR tasks on BDIE
+benchmarks.
+  The contributions of this paper are threefold: (1) We show, with ablation
+benchmarks, that Large Language Models (LLMs) with RASG are already competitive
+with or surpasses current SOTA Large Multimodal Models (LMMs) without RASG on
+BDIE benchmarks. (2) We propose a new metric class for Line Items Recognition,
+General Line Items Recognition Metric (GLIRM), that is more aligned with
+practical BDIE use cases compared to existing metrics, such as ANLS*, DocILE,
+and GriTS. (3) We provide a heuristic algorithm for backcalculating bounding
+boxes of predicted line items and tables without the need for vision encoders.
+Finally, we claim that, while LMMs might sometimes offer marginal performance
+benefits, LLMs + RASG is oftentimes superior given real-world applications and
+constraints of BDIE.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted by IEEE 7th International Conference on Multimedia
+  Information Processing and Retrieval (MIPR), 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Training-efficient density quantum machine learning 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20237v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20237v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Brian Coyle, El Amine Cherrat, Nishant Jain, Natansh Mathur, Snehal Raj, Skander Kazdaghli, Iordanis Kerenidis
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Quantum machine learning requires powerful, flexible and efficiently
+trainable models to be successful in solving challenging problems. In this
+work, we present density quantum neural networks, a learning model
+incorporating randomisation over a set of trainable unitaries. These models
+generalise quantum neural networks using parameterised quantum circuits, and
+allow a trade-off between expressibility and efficient trainability,
+particularly on quantum hardware. We demonstrate the flexibility of the
+formalism by applying it to two recently proposed model families. The first are
+commuting-block quantum neural networks (QNNs) which are efficiently trainable
+but may be limited in expressibility. The second are orthogonal (Hamming-weight
+preserving) quantum neural networks which provide well-defined and
+interpretable transformations on data but are challenging to train at scale on
+quantum devices. Density commuting QNNs improve capacity with minimal gradient
+complexity overhead, and density orthogonal neural networks admit a
+quadratic-to-constant gradient query advantage with minimal to no performance
+loss. We conduct numerical experiments on synthetic translationally invariant
+data and MNIST image data with hyperparameter optimisation to support our
+findings. Finally, we discuss the connection to post-variational quantum neural
+networks, measurement-based quantum machine learning and the dropout mechanism.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>17 pages main text, 9 pages appendices. 9 figures</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Disentangling and Mitigating the Impact of Task Similarity for Continual
+  Learning 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20236v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20236v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Naoki Hiratani
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Continual learning of partially similar tasks poses a challenge for
+artificial neural networks, as task similarity presents both an opportunity for
+knowledge transfer and a risk of interference and catastrophic forgetting.
+However, it remains unclear how task similarity in input features and readout
+patterns influences knowledge transfer and forgetting, as well as how they
+interact with common algorithms for continual learning. Here, we develop a
+linear teacher-student model with latent structure and show analytically that
+high input feature similarity coupled with low readout similarity is
+catastrophic for both knowledge transfer and retention. Conversely, the
+opposite scenario is relatively benign. Our analysis further reveals that
+task-dependent activity gating improves knowledge retention at the expense of
+transfer, while task-dependent plasticity gating does not affect either
+retention or transfer performance at the over-parameterized limit. In contrast,
+weight regularization based on the Fisher information metric significantly
+improves retention, regardless of task similarity, without compromising
+transfer performance. Nevertheless, its diagonal approximation and
+regularization in the Euclidean space are much less robust against task
+similarity. We demonstrate consistent results in a permuted MNIST task with
+latent variables. Overall, this work provides insights into when continual
+learning is difficult and how to mitigate it.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Grokfast: Accelerated Grokking by Amplifying Slow Gradients 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20233v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20233v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Jaerin Lee, Bong Gyun Kang, Kihoon Kim, Kyoung Mu Lee
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  One puzzling artifact in machine learning dubbed grokking is where delayed
+generalization is achieved tenfolds of iterations after near perfect
+overfitting to the training data. Focusing on the long delay itself on behalf
+of machine learning practitioners, our goal is to accelerate generalization of
+a model under grokking phenomenon. By regarding a series of gradients of a
+parameter over training iterations as a random signal over time, we can
+spectrally decompose the parameter trajectories under gradient descent into two
+components: the fast-varying, overfitting-yielding component and the
+slow-varying, generalization-inducing component. This analysis allows us to
+accelerate the grokking phenomenon more than $\times 50$ with only a few lines
+of code that amplifies the slow-varying components of gradients. The
+experiments show that our algorithm applies to diverse tasks involving images,
+languages, and graphs, enabling practical availability of this peculiar
+artifact of sudden generalization. Our code is available at
+\url{https://github.com/ironjr/grokfast}.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>15 pages, 12 figures. Project page:
+  https://jaerinlee.com/research/grokfast</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ The Empirical Impact of Neural Parameter Symmetries, or Lack Thereof 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20231v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20231v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Derek Lim, Moe Putterman, Robin Walters, Haggai Maron, Stefanie Jegelka
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Many algorithms and observed phenomena in deep learning appear to be affected
+by parameter symmetries -- transformations of neural network parameters that do
+not change the underlying neural network function. These include linear mode
+connectivity, model merging, Bayesian neural network inference, metanetworks,
+and several other characteristics of optimization or loss-landscapes. However,
+theoretical analysis of the relationship between parameter space symmetries and
+these phenomena is difficult. In this work, we empirically investigate the
+impact of neural parameter symmetries by introducing new neural network
+architectures that have reduced parameter space symmetries. We develop two
+methods, with some provable guarantees, of modifying standard neural networks
+to reduce parameter space symmetries. With these new methods, we conduct a
+comprehensive experimental study consisting of multiple tasks aimed at
+assessing the effect of removing parameter symmetries. Our experiments reveal
+several interesting observations on the empirical impact of parameter
+symmetries; for instance, we observe linear mode connectivity between our
+networks without alignment of weight spaces, and we find that our networks
+allow for faster and more effective Bayesian neural network training.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>27 pages. Preparing code for release</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Boost Your Own Human Image Generation Model via Direct Preference
+  Optimization with AI Feedback 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20216v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20216v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Sanghyeon Na, Yonggyu Kim, Hyunjoon Lee
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  The generation of high-quality human images through text-to-image (T2I)
+methods is a significant yet challenging task. Distinct from general image
+generation, human image synthesis must satisfy stringent criteria related to
+human pose, anatomy, and alignment with textual prompts, making it particularly
+difficult to achieve realistic results. Recent advancements in T2I generation
+based on diffusion models have shown promise, yet challenges remain in meeting
+human-specific preferences. In this paper, we introduce a novel approach
+tailored specifically for human image generation utilizing Direct Preference
+Optimization (DPO). Specifically, we introduce an efficient method for
+constructing a specialized DPO dataset for training human image generation
+models without the need for costly human feedback. We also propose a modified
+loss function that enhances the DPO training process by minimizing artifacts
+and improving image fidelity. Our method demonstrates its versatility and
+effectiveness in generating human images, including personalized text-to-image
+generation. Through comprehensive evaluations, we show that our approach
+significantly advances the state of human image generation, achieving superior
+results in terms of natural anatomies, poses, and text-image alignment.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>28 pages, 18 figures</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ PostDoc: Generating Poster from a Long Multimodal Document Using Deep
+  Submodular Optimization 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20213v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20213v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Vijay Jaisankar, Sambaran Bandyopadhyay, Kalp Vyas, Varre Chaitanya, Shwetha Somasundaram
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  A poster from a long input document can be considered as a one-page
+easy-to-read multimodal (text and images) summary presented on a nice template
+with good design elements. Automatic transformation of a long document into a
+poster is a very less studied but challenging task. It involves content
+summarization of the input document followed by template generation and
+harmonization. In this work, we propose a novel deep submodular function which
+can be trained on ground truth summaries to extract multimodal content from the
+document and explicitly ensures good coverage, diversity and alignment of text
+and images. Then, we use an LLM based paraphraser and propose to generate a
+template with various design aspects conditioned on the input content. We show
+the merits of our approach through extensive automated and human evaluations.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Unified Explanations in Machine Learning Models: A Perturbation Approach 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20200v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20200v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Jacob Dineen, Don Kridel, Daniel Dolk, David Castillo
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  A high-velocity paradigm shift towards Explainable Artificial Intelligence
+(XAI) has emerged in recent years. Highly complex Machine Learning (ML) models
+have flourished in many tasks of intelligence, and the questions have started
+to shift away from traditional metrics of validity towards something deeper:
+What is this model telling me about my data, and how is it arriving at these
+conclusions? Inconsistencies between XAI and modeling techniques can have the
+undesirable effect of casting doubt upon the efficacy of these explainability
+approaches. To address these problems, we propose a systematic,
+perturbation-based analysis against a popular, model-agnostic method in XAI,
+SHapley Additive exPlanations (Shap). We devise algorithms to generate relative
+feature importance in settings of dynamic inference amongst a suite of popular
+machine learning and deep learning methods, and metrics that allow us to
+quantify how well explanations generated under the static case hold. We propose
+a taxonomy for feature importance methodology, measure alignment, and observe
+quantifiable similarity amongst explanation models across several datasets.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Occam Gradient Descent 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20194v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20194v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        B. N. Kausik
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Deep learning neural network models must be large enough to adapt to their
+problem domain, while small enough to avoid overfitting training data during
+gradient descent. To balance these competing demands, overprovisioned deep
+learning models such as transformers are trained for a single epoch on large
+data sets, and hence inefficient with both computing resources and training
+data. In response to these inefficiencies, we exploit learning theory to derive
+Occam Gradient Descent, an algorithm that interleaves adaptive reduction of
+model size to minimize generalization error, with gradient descent on model
+weights to minimize fitting error. In contrast, traditional gradient descent
+greedily minimizes fitting error without regard to generalization error. Our
+algorithm simultaneously descends the space of weights and topological size of
+any neural network without modification, and is effective in our experiments in
+outperforming traditional gradient descent with or without post-train pruning
+in accuracy, compute and model compression.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ <span class="highlight-title">Transformer</span>s and Slot Encoding for Sample Efficient Physical World
+  Modelling 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20180v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20180v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Francesco Petri, Luigi Asprino, Aldo Gangemi
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  World modelling, i.e. building a representation of the rules that govern the
+world so as to predict its evolution, is an essential ability for any agent
+interacting with the physical world. Recent applications of the Transformer
+architecture to the problem of world modelling from video input show notable
+improvements in sample efficiency. However, existing approaches tend to work
+only at the image level thus disregarding that the environment is composed of
+objects interacting with each other. In this paper, we propose an architecture
+combining Transformers for world modelling with the slot-attention paradigm, an
+approach for learning representations of objects appearing in a scene. We
+describe the resulting neural architecture and report experimental results
+showing an improvement over the existing solutions in terms of sample
+efficiency and a reduction of the variation of the performance over the
+training examples. The code for our architecture and experiments is available
+at https://github.com/torchipeppo/transformers-and-slot-encoding-for-wm
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Non-intrusive data-driven model order reduction for circuits based on
+  Hammerstein architectures 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20178v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20178v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Joshua Hanson, Biliana Paskaleva, Pavel Bochev
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  We demonstrate that data-driven system identification techniques can provide
+a basis for effective, non-intrusive model order reduction (MOR) for common
+circuits that are key building blocks in microelectronics. Our approach is
+motivated by the practical operation of these circuits and utilizes a canonical
+Hammerstein architecture. To demonstrate the approach we develop a parsimonious
+Hammerstein model for a non-linear CMOS differential amplifier. We train this
+model on a combination of direct current (DC) and transient Spice (Xyce)
+circuit simulation data using a novel sequential strategy to identify the
+static nonlinear and linear dynamical parts of the model. Simulation results
+show that the Hammerstein model is an effective surrogate for the differential
+amplifier circuit that accurately and efficiently reproduces its behavior over
+a wide range of operating points and input frequencies.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>13 pages, 13 figures; submitted to IEEE Transactions on
+  Computer-Aided Design of Integrated Circuits and Systems</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Tropical Expressivity of Neural Networks 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20174v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20174v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Shiv Bhatia, Yueqi Cao, Paul Lezeau, Anthea Monod
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  We propose an algebraic geometric framework to study the expressivity of
+linear activation neural networks. A particular quantity that has been actively
+studied in the field of deep learning is the number of linear regions, which
+gives an estimate of the information capacity of the architecture. To study and
+evaluate information capacity and expressivity, we work in the setting of
+tropical geometry -- a combinatorial and polyhedral variant of algebraic
+geometry -- where there are known connections between tropical rational maps
+and feedforward neural networks. Our work builds on and expands this connection
+to capitalize on the rich theory of tropical geometry to characterize and study
+various architectural aspects of neural networks. Our contributions are
+threefold: we provide a novel tropical geometric approach to selecting sampling
+domains among linear regions; an algebraic result allowing for a guided
+restriction of the sampling domain for network architectures with symmetries;
+and an open source library to analyze neural networks as tropical Puiseux
+rational maps. We provide a comprehensive set of proof-of-concept numerical
+experiments demonstrating the breadth of neural network architectures to which
+tropical geometric theory can be applied to reveal insights on expressivity
+characteristics of a network. Our work provides the foundations for the
+adaptation of both theory and existing software from computational tropical
+geometry and symbolic computation to deep learning.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Iterative Feature Boosting for Explainable Speech Emotion Recognition <span class="chip">ICML</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20172v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20172v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Alaa Nfissi, Wassim Bouachir, Nizar Bouguila, Brian Mishara
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  In speech emotion recognition (SER), using predefined features without
+considering their practical importance may lead to high dimensional datasets,
+including redundant and irrelevant information. Consequently, high-dimensional
+learning often results in decreasing model accuracy while increasing
+computational complexity. Our work underlines the importance of carefully
+considering and analyzing features in order to build efficient SER systems. We
+present a new supervised SER method based on an efficient feature engineering
+approach. We pay particular attention to the explainability of results to
+evaluate feature relevance and refine feature sets. This is performed
+iteratively through feature evaluation loop, using Shapley values to boost
+feature selection and improve overall framework performance. Our approach
+allows thus to balance the benefits between model performance and transparency.
+The proposed method outperforms human-level performance (HLP) and
+state-of-the-art machine learning methods in emotion recognition on the TESS
+dataset.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Published in: 2023 International Conference on Machine Learning and
+  Applications (ICMLA)</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Randomized Exploration for Reinforcement Learning with Multinomial
+  Logistic Function Approximation 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20165v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20165v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Wooseong Cho, Taehyun Hwang, Joongkyu Lee, Min-hwan Oh
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  We study reinforcement learning with multinomial logistic (MNL) function
+approximation where the underlying transition probability kernel of the Markov
+decision processes (MDPs) is parametrized by an unknown transition core with
+features of state and action. For the finite horizon episodic setting with
+inhomogeneous state transitions, we propose provably efficient algorithms with
+randomized exploration having frequentist regret guarantees. For our first
+algorithm, $\texttt{RRL-MNL}$, we adapt optimistic sampling to ensure the
+optimism of the estimated value function with sufficient frequency and
+establish that $\texttt{RRL-MNL}$ is both statistically and computationally
+efficient, achieving a $\tilde{O}(\kappa^{-1} d^{\frac{3}{2}} H^{\frac{3}{2}}
+\sqrt{T})$ frequentist regret bound with constant-time computational cost per
+episode. Here, $d$ is the dimension of the transition core, $H$ is the horizon
+length, $T$ is the total number of steps, and $\kappa$ is a problem-dependent
+constant. Despite the simplicity and practicality of $\texttt{RRL-MNL}$, its
+regret bound scales with $\kappa^{-1}$, which is potentially large in the worst
+case. To improve the dependence on $\kappa^{-1}$, we propose
+$\texttt{ORRL-MNL}$, which estimates the value function using local gradient
+information of the MNL transition model. We show that its frequentist regret
+bound is $\tilde{O}(d^{\frac{3}{2}} H^{\frac{3}{2}} \sqrt{T} + \kappa^{-1} d^2
+H^2)$. To the best of our knowledge, these are the first randomized RL
+algorithms for the MNL transition model that achieve both computational and
+statistical efficiency. Numerical experiments demonstrate the superior
+performance of the proposed algorithms.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ GNN-RAG: Graph Neural Retrieval for Large Language Model Reasoning 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20139v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20139v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Costas Mavromatis, George Karypis
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Knowledge Graphs (KGs) represent human-crafted factual knowledge in the form
+of triplets (head, relation, tail), which collectively form a graph. Question
+Answering over KGs (KGQA) is the task of answering natural questions grounding
+the reasoning to the information provided by the KG. Large Language Models
+(LLMs) are the state-of-the-art models for QA tasks due to their remarkable
+ability to understand natural language. On the other hand, Graph Neural
+Networks (GNNs) have been widely used for KGQA as they can handle the complex
+graph information stored in the KG. In this work, we introduce GNN-RAG, a novel
+method for combining language understanding abilities of LLMs with the
+reasoning abilities of GNNs in a retrieval-augmented generation (RAG) style.
+First, a GNN reasons over a dense KG subgraph to retrieve answer candidates for
+a given question. Second, the shortest paths in the KG that connect question
+entities and answer candidates are extracted to represent KG reasoning paths.
+The extracted paths are verbalized and given as input for LLM reasoning with
+RAG. In our GNN-RAG framework, the GNN acts as a dense subgraph reasoner to
+extract useful graph information, while the LLM leverages its natural language
+processing ability for ultimate KGQA. Furthermore, we develop a retrieval
+augmentation (RA) technique to further boost KGQA performance with GNN-RAG.
+Experimental results show that GNN-RAG achieves state-of-the-art performance in
+two widely used KGQA benchmarks (WebQSP and CWQ), outperforming or matching
+GPT-4 performance with a 7B tuned LLM. In addition, GNN-RAG excels on multi-hop
+and multi-entity questions outperforming competing approaches by 8.9--15.5%
+points at answer F1.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ SPAM: Stochastic Proximal Point Method with Momentum Variance Reduction
+  for Non-convex Cross-Device Federated Learning 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20127v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20127v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Avetik Karagulyan, Egor Shulgin, Abdurakhmon Sadiev, Peter Richtárik
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Cross-device training is a crucial subfield of federated learning, where the
+number of clients can reach into the billions. Standard approaches and local
+methods are prone to issues such as client drift and insensitivity to data
+similarities. We propose a novel algorithm (SPAM) for cross-device federated
+learning with non-convex losses, which solves both issues. We provide sharp
+analysis under second-order (Hessian) similarity, a condition satisfied by a
+variety of machine learning problems in practice. Additionally, we extend our
+results to the partial participation setting, where a cohort of selected
+clients communicate with the server at each communication round. Our method is
+the first in its kind, that does not require the smoothness of the objective
+and provably benefits from clients having similar data.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>The main part of the paper is around 9 pages. It contains the
+  proposed algorithms, the main theoretical results and the experimental
+  setting. The proofs of the main results and other technicalities are deferred
+  to the Appendix</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ A Geometric Unification of Distributionally Robust Covariance
+  Estimators: Shrinking the Spectrum by Inflating the Ambiguity Set 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20124v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20124v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Man-Chung Yue, Yves Rychener, Daniel Kuhn, Viet Anh Nguyen
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  The state-of-the-art methods for estimating high-dimensional covariance
+matrices all shrink the eigenvalues of the sample covariance matrix towards a
+data-insensitive shrinkage target. The underlying shrinkage transformation is
+either chosen heuristically - without compelling theoretical justification - or
+optimally in view of restrictive distributional assumptions. In this paper, we
+propose a principled approach to construct covariance estimators without
+imposing restrictive assumptions. That is, we study distributionally robust
+covariance estimation problems that minimize the worst-case Frobenius error
+with respect to all data distributions close to a nominal distribution, where
+the proximity of distributions is measured via a divergence on the space of
+covariance matrices. We identify mild conditions on this divergence under which
+the resulting minimizers represent shrinkage estimators. We show that the
+corresponding shrinkage transformations are intimately related to the
+geometrical properties of the underlying divergence. We also prove that our
+robust estimators are efficiently computable and asymptotically consistent and
+that they enjoy finite-sample performance guarantees. We exemplify our general
+methodology by synthesizing explicit estimators induced by the
+Kullback-Leibler, Fisher-Rao, and Wasserstein divergences. Numerical
+experiments based on synthetic and real data show that our robust estimators
+are competitive with state-of-the-art estimators.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Near Optimal Decentralized Optimization with Compression and Momentum
+  Tracking 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20114v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20114v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Rustem Islamov, Yuan Gao, Sebastian U. Stich
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Communication efficiency has garnered significant attention as it is
+considered the main bottleneck for large-scale decentralized Machine Learning
+applications in distributed and federated settings. In this regime, clients are
+restricted to transmitting small amounts of quantized information to their
+neighbors over a communication graph. Numerous endeavors have been made to
+address this challenging problem by developing algorithms with compressed
+communication for decentralized non-convex optimization problems. Despite
+considerable efforts, the current results suffer from various issues such as
+non-scalability with the number of clients, requirements for large batches, or
+bounded gradient assumption. In this paper, we introduce MoTEF, a novel
+approach that integrates communication compression with Momentum Tracking and
+Error Feedback. Our analysis demonstrates that MoTEF achieves most of the
+desired properties, and significantly outperforms existing methods under
+arbitrary data heterogeneity. We provide numerical experiments to validate our
+theoretical findings and confirm the practical superiority of MoTEF.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Low-dimensional approximations of the conditional law of Volterra
+  processes: a non-positive curvature approach 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20094v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20094v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Reza Arabpour, John Armstrong, Luca Galimberti, Anastasis Kratsios, Giulia Livieri
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Predicting the conditional evolution of Volterra processes with stochastic
+volatility is a crucial challenge in mathematical finance. While deep neural
+network models offer promise in approximating the conditional law of such
+processes, their effectiveness is hindered by the curse of dimensionality
+caused by the infinite dimensionality and non-smooth nature of these problems.
+To address this, we propose a two-step solution. Firstly, we develop a stable
+dimension reduction technique, projecting the law of a reasonably broad class
+of Volterra process onto a low-dimensional statistical manifold of non-positive
+sectional curvature. Next, we introduce a sequentially deep learning model
+tailored to the manifold's geometry, which we show can approximate the
+projected conditional law of the Volterra process. Our model leverages an
+auxiliary hypernetwork to dynamically update its internal parameters, allowing
+it to encode non-stationary dynamics of the Volterra process, and it can be
+interpreted as a gating mechanism in a mixture of expert models where each
+expert is specialized at a specific point in time. Our hypernetwork further
+allows us to achieve approximation rates that would seemingly only be possible
+with very large networks.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Main body: 25 Pages, Appendices 29 Pages, 14 Tables, 6 Figures</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Visual Attention Analysis in Online Learning 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20091v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20091v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Navarro Miriam, Becerra Álvaro, Daza Roberto, Cobos Ruth, Morales Aythami, Fierrez Julian
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  In this paper, we present an approach in the Multimodal Learning Analytics
+field. Within this approach, we have developed a tool to visualize and analyze
+eye movement data collected during learning sessions in online courses. The
+tool is named VAAD (an acronym for Visual Attention Analysis Dashboard). These
+eye movement data have been gathered using an eye-tracker and subsequently
+processed and visualized for interpretation. The purpose of the tool is to
+conduct a descriptive analysis of the data by facilitating its visualization,
+enabling the identification of differences and learning patterns among various
+learner populations. Additionally, it integrates a predictive module capable of
+anticipating learner activities during a learning session. Consequently, VAAD
+holds the potential to offer valuable insights into online learning behaviors
+from both descriptive and predictive perspectives.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted in CEDI 2024 (VII Congreso Espa\~nol de Inform\'atica), A
+  Coru\~na, Spain</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Analysis of a multi-target linear shrinkage covariance estimator 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20086v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20086v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Benoit Oriol
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Multi-target linear shrinkage is an extension of the standard single-target
+linear shrinkage for covariance estimation. We combine several constant
+matrices - the targets - with the sample covariance matrix. We derive the
+oracle and a \textit{bona fide} multi-target linear shrinkage estimator with
+exact and empirical mean. In both settings, we proved its convergence towards
+the oracle under Kolmogorov asymptotics. Finally, we show empirically that it
+outperforms other standard estimators in various situations.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Soft Partitioning of Latent Space for Semantic Channel Equalization 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20085v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20085v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Tomás Huttebraucker, Mohamed Sana, Emilio Calvanese Strinati
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Semantic channel equalization has emerged as a solution to address language
+mismatch in multi-user semantic communications. This approach aims to align the
+latent spaces of an encoder and a decoder which were not jointly trained and it
+relies on a partition of the semantic (latent) space into atoms based on the
+the semantic meaning. In this work we explore the role of the semantic space
+partition in scenarios where the task structure involves a one-to-many mapping
+between the semantic space and the action space. In such scenarios,
+partitioning based on hard inference results results in loss of information
+which degrades the equalization performance. We propose a soft criterion to
+derive the atoms of the partition which leverages the soft decoder's output and
+offers a more comprehensive understanding of the semantic space's structure.
+Through empirical validation, we demonstrate that soft partitioning yields a
+more descriptive and regular partition of the space, consequently enhancing the
+performance of the equalization algorithm.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Segment, Shuffle, and Stitch: A Simple Mechanism for Improving
+  Time-Series Representations 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20082v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20082v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Shivam Grover, Amin Jalali, Ali Etemad
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Existing approaches for learning representations of time-series keep the
+temporal arrangement of the time-steps intact with the presumption that the
+original order is the most optimal for learning. However, non-adjacent sections
+of real-world time-series may have strong dependencies. Accordingly we raise
+the question: Is there an alternative arrangement for time-series which could
+enable more effective representation learning? To address this, we propose a
+simple plug-and-play mechanism called Segment, Shuffle, and Stitch (S3)
+designed to improve time-series representation learning of existing models. S3
+works by creating non-overlapping segments from the original sequence and
+shuffling them in a learned manner that is the most optimal for the task at
+hand. It then re-attaches the shuffled segments back together and performs a
+learned weighted sum with the original input to capture both the newly shuffled
+sequence along with the original sequence. S3 is modular and can be stacked to
+create various degrees of granularity, and can be added to many forms of neural
+architectures including CNNs or Transformers with negligible computation
+overhead. Through extensive experiments on several datasets and
+state-of-the-art baselines, we show that incorporating S3 results in
+significant improvements for the tasks of time-series classification and
+forecasting, improving performance on certain datasets by up to 68\%. We also
+show that S3 makes the learning more stable with a smoother training loss curve
+and loss landscape compared to the original baseline. The code is available at
+https://github.com/shivam-grover/S3-TimeSeries .
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Student Answer Forecasting: <span class="highlight-title">Transformer</span>-Driven Answer Choice Prediction
+  for Language Learning 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20079v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20079v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Elena Grazia Gado, Tommaso Martorella, Luca Zunino, Paola Mejia-Domenzain, Vinitra Swamy, Jibril Frej, Tanja Käser
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Intelligent Tutoring Systems (ITS) enhance personalized learning by
+predicting student answers to provide immediate and customized instruction.
+However, recent research has primarily focused on the correctness of the answer
+rather than the student's performance on specific answer choices, limiting
+insights into students' thought processes and potential misconceptions. To
+address this gap, we present MCQStudentBert, an answer forecasting model that
+leverages the capabilities of Large Language Models (LLMs) to integrate
+contextual understanding of students' answering history along with the text of
+the questions and answers. By predicting the specific answer choices students
+are likely to make, practitioners can easily extend the model to new answer
+choices or remove answer choices for the same multiple-choice question (MCQ)
+without retraining the model. In particular, we compare MLP, LSTM, BERT, and
+Mistral 7B architectures to generate embeddings from students' past
+interactions, which are then incorporated into a finetuned BERT's
+answer-forecasting mechanism. We apply our pipeline to a dataset of language
+learning MCQ, gathered from an ITS with over 10,000 students to explore the
+predictive accuracy of MCQStudentBert, which incorporates student interaction
+patterns, in comparison to correct answer prediction and traditional
+mastery-learning feature-based approaches. This work opens the door to more
+personalized content, modularization, and granular support.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted as a poster paper at EDM 2024: 17th International Conference
+  on Educational Data Mining in Atlanta, USA</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ A Staged Approach using Machine Learning and Uncertainty Quantification
+  to Predict the Risk of Hip Fracture 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20071v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20071v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Anjum Shaik, Kristoffer Larsen, Nancy E. Lane, Chen Zhao, Kuan-Jui Su, Joyce H. Keyak, Qing Tian, Qiuying Sha, Hui Shen, Hong-Wen Deng, Weihua Zhou
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Despite advancements in medical care, hip fractures impose a significant
+burden on individuals and healthcare systems. This paper focuses on the
+prediction of hip fracture risk in older and middle-aged adults, where falls
+and compromised bone quality are predominant factors. We propose a novel staged
+model that combines advanced imaging and clinical data to improve predictive
+performance. By using CNNs to extract features from hip DXA images, along with
+clinical variables, shape measurements, and texture features, our method
+provides a comprehensive framework for assessing fracture risk. A staged
+machine learning-based model was developed using two ensemble models: Ensemble
+1 (clinical variables only) and Ensemble 2 (clinical variables and DXA imaging
+features). This staged approach used uncertainty quantification from Ensemble 1
+to decide if DXA features are necessary for further prediction. Ensemble 2
+exhibited the highest performance, achieving an AUC of 0.9541, an accuracy of
+0.9195, a sensitivity of 0.8078, and a specificity of 0.9427. The staged model
+also performed well, with an AUC of 0.8486, an accuracy of 0.8611, a
+sensitivity of 0.5578, and a specificity of 0.9249, outperforming Ensemble 1,
+which had an AUC of 0.5549, an accuracy of 0.7239, a sensitivity of 0.1956, and
+a specificity of 0.8343. Furthermore, the staged model suggested that 54.49% of
+patients did not require DXA scanning. It effectively balanced accuracy and
+specificity, offering a robust solution when DXA data acquisition is not always
+feasible. Statistical tests confirmed significant differences between the
+models, highlighting the advantages of the advanced modeling strategies. Our
+staged approach could identify individuals at risk with a high accuracy but
+reduce the unnecessary DXA scanning. It has great promise to guide
+interventions to prevent hip fractures with reduced cost and radiation.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>29 pages, 5 figures, 6 tables</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Would I Lie To You? Inference Time Alignment of Language Models using
+  Direct Preference Heads 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20053v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20053v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Avelina Asada Hadji-Kyriacou, Ognjen Arandjelovic
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Pre-trained Language Models (LMs) exhibit strong zero-shot and in-context
+learning capabilities; however, their behaviors are often difficult to control.
+By utilizing Reinforcement Learning from Human Feedback (RLHF), it is possible
+to fine-tune unsupervised LMs to follow instructions and produce outputs that
+reflect human preferences. Despite its benefits, RLHF has been shown to
+potentially harm a language model's reasoning capabilities and introduce
+artifacts such as hallucinations where the model may fabricate facts. To
+address this issue we introduce Direct Preference Heads (DPH), a fine-tuning
+framework that enables LMs to learn human preference signals through an
+auxiliary reward head without directly affecting the output distribution of the
+language modeling head. We perform a theoretical analysis of our objective
+function and find strong ties to Conservative Direct Preference Optimization
+(cDPO). Finally we evaluate our models on GLUE, RACE, and the GPT4All
+evaluation suite and demonstrate that our method produces models which achieve
+higher scores than those fine-tuned with Supervised Fine-Tuning (SFT) or Direct
+Preference Optimization (DPO) alone.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ A Hardware-Efficient EMG Decoder with an Attractor-based Neural Network
+  for Next-Generation Hand Prostheses 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20052v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20052v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Mohammad Kalbasi, MohammadAli Shaeri, Vincent Alexandre Mendez, Solaiman Shokur, Silvestro Micera, Mahsa Shoaran
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Advancements in neural engineering have enabled the development of Robotic
+Prosthetic Hands (RPHs) aimed at restoring hand functionality. Current
+commercial RPHs offer limited control through basic on/off commands. Recent
+progresses in machine learning enable finger movement decoding with higher
+degrees of freedom, yet the high computational complexity of such models limits
+their application in portable devices. Future RPH designs must balance
+portability, low power consumption, and high decoding accuracy to be practical
+for individuals with disabilities. To this end, we introduce a novel
+attractor-based neural network to realize on-chip movement decoding for
+next-generation portable RPHs. The proposed architecture comprises an encoder,
+an attention layer, an attractor network, and a refinement regressor. We tested
+our model on four healthy subjects and achieved a decoding accuracy of
+80.6\pm3.3\%. Our proposed model is over 120 and 50 times more compact compared
+to state-of-the-art LSTM and CNN models, respectively, with comparable (or
+superior) decoding accuracy. Therefore, it exhibits minimal hardware complexity
+and can be effectively integrated as a System-on-Chip.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>\c{opyright} 2024 IEEE. Personal use of this material is permitted.
+  Permission from IEEE must be obtained for all other uses, in any current or
+  future media, including reprinting/republishing this material for advertising
+  or promotional purposes, creating new collective works, for resale or
+  redistribution to servers or lists, or reuse of any copyrighted component of
+  this work in other works</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Threshold-Independent Fair Matching through Score Calibration 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20051v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20051v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Mohammad Hossein Moslemi, Mostafa Milani
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Entity Matching (EM) is a critical task in numerous fields, such as
+healthcare, finance, and public administration, as it identifies records that
+refer to the same entity within or across different databases. EM faces
+considerable challenges, particularly with false positives and negatives. These
+are typically addressed by generating matching scores and apply thresholds to
+balance false positives and negatives in various contexts. However, adjusting
+these thresholds can affect the fairness of the outcomes, a critical factor
+that remains largely overlooked in current fair EM research. The existing body
+of research on fair EM tends to concentrate on static thresholds, neglecting
+their critical impact on fairness. To address this, we introduce a new approach
+in EM using recent metrics for evaluating biases in score based binary
+classification, particularly through the lens of distributional parity. This
+approach enables the application of various bias metrics like equalized odds,
+equal opportunity, and demographic parity without depending on threshold
+settings. Our experiments with leading matching methods reveal potential
+biases, and by applying a calibration technique for EM scores using Wasserstein
+barycenters, we not only mitigate these biases but also preserve accuracy
+across real world datasets. This paper contributes to the field of fairness in
+data cleaning, especially within EM, which is a central task in data cleaning,
+by promoting a method for generating matching scores that reduce biases across
+different thresholds.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Iterative Learning Control of Fast, Nonlinear, Oscillatory Dynamics
+  (Preprint) 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20045v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20045v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        John W. Brooks, Christine M. Greve
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  The sudden onset of deleterious and oscillatory dynamics (often called
+instabilities) is a known challenge in many fluid, plasma, and aerospace
+systems. These dynamics are difficult to address because they are nonlinear,
+chaotic, and are often too fast for active control schemes. In this work, we
+develop an alternative active controls system using an iterative,
+trajectory-optimization and parameter-tuning approach based on Iterative
+Learning Control (ILC), Time-Lagged Phase Portraits (TLPP) and Gaussian Process
+Regression (GPR). The novelty of this approach is that it can control a
+system's dynamics despite the controller being much slower than the dynamics.
+We demonstrate this controller on the Lorenz system of equations where it
+iteratively adjusts (tunes) the system's input parameters to successfully
+reproduce a desired oscillatory trajectory or state. Additionally, we
+investigate the system's dynamical sensitivity to its control parameters,
+identify continuous and bounded regions of desired dynamical trajectories, and
+demonstrate that the controller is robust to missing information and
+uncontrollable parameters as long as certain requirements are met. The
+controller presented in this work provides a framework for low-speed control
+for a variety of fast, nonlinear systems that may aid in instability
+suppression and mitigation.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ CycleFormer : TSP Solver Based on Language Modeling 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20042v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20042v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Jieun Yook, Junpyo Seo, Joon Huh, Han Joon Byun, Byung-ro Mooon
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  We propose a new transformer model for the Traveling Salesman Problem (TSP)
+called CycleFormer. We identified distinctive characteristics that need to be
+considered when applying a conventional transformer model to TSP and aimed to
+fully incorporate these elements into the TSP-specific transformer. Unlike the
+token sets in typical language models, which are limited and static, the token
+(node) set in TSP is unlimited and dynamic. To exploit this fact to the
+fullest, we equated the encoder output with the decoder linear layer and
+directly connected the context vector of the encoder to the decoder encoding.
+Additionally, we added a positional encoding to the encoder tokens that
+reflects the two-dimensional nature of TSP, and devised a circular positional
+encoding for the decoder tokens that considers the cyclic properties of a tour.
+By incorporating these ideas, CycleFormer outperforms state-of-the-art (SOTA)
+transformer models for TSP from TSP-50 to TSP-500. Notably, on TSP-500, the
+optimality gap was reduced by approximately 2.8 times, from 3.09% to 1.10%,
+compared to the existing SOTA. The code will be made available at
+https://github.com/Giventicket/CycleFormer.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Task-Agnostic Machine Learning-Assisted Inference 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20039v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20039v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Jiacheng Miao, Qiongshi Lu
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Machine learning (ML) is playing an increasingly important role in scientific
+research. In conjunction with classical statistical approaches, ML-assisted
+analytical strategies have shown great promise in accelerating research
+findings. This has also opened up a whole new field of methodological research
+focusing on integrative approaches that leverage both ML and statistics to
+tackle data science challenges. One type of study that has quickly gained
+popularity employs ML to predict unobserved outcomes in massive samples and
+then uses the predicted outcomes in downstream statistical inference. However,
+existing methods designed to ensure the validity of this type of
+post-prediction inference are limited to very basic tasks such as linear
+regression analysis. This is because any extension of these approaches to new,
+more sophisticated statistical tasks requires task-specific algebraic
+derivations and software implementations, which ignores the massive library of
+existing software tools already developed for complex inference tasks and
+severely constrains the scope of post-prediction inference in real
+applications. To address this challenge, we propose a novel statistical
+framework for task-agnostic ML-assisted inference. It provides a
+post-prediction inference solution that can be easily plugged into almost any
+established data analysis routine. It delivers valid and efficient inference
+that is robust to arbitrary choices of ML models, while allowing nearly all
+existing analytical frameworks to be incorporated into the analysis of
+ML-predicted outcomes. Through extensive experiments, we showcase the validity,
+versatility, and superiority of our method compared to existing approaches.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ A Random Forest-based Prediction Model for Turning Points in
+  Antagonistic event-group Competitions 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20029v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20029v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Zishuo Zhu
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  At present, most of the prediction studies related to antagonistic
+event-group competitions focus on the prediction of competition results, and
+less on the prediction of the competition process, which can not provide
+real-time feedback of the athletes' state information in the actual
+competition, and thus can not analyze the changes of the competition situation.
+In order to solve this problem, this paper proposes a prediction model based on
+Random Forest for the turning point of the antagonistic event-group. Firstly,
+the quantitative equation of competitive potential energy is proposed;
+Secondly, the quantitative value of competitive potential energy is obtained by
+using the dynamic combination of weights method, and the turning point of the
+competition situation of the antagonistic event-group is marked according to
+the quantitative time series graph; Finally, the random forest prediction model
+based on the optimisation of the KM-SMOTE algorithm and the grid search method
+is established. The experimental analysis shows that: the quantitative equation
+of competitive potential energy can effectively reflect the dynamic situation
+of the competition; The model can effectively predict the turning point of the
+competition situation of the antagonistic event-group, and the recall rate of
+the model in the test set is 86.13%; the model has certain significance for the
+future study of the competition situation of the antagonistic event-group.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ A Simple and Adaptive Learning Rate for FTRL in Online Learning with
+  Minimax Regret of $Θ(T^{2/3})$ and its Application to
+  Best-of-Both-Worlds 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20028v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20028v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Taira Tsuchiya, Shinji Ito
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Follow-the-Regularized-Leader (FTRL) is a powerful framework for various
+online learning problems. By designing its regularizer and learning rate to be
+adaptive to past observations, FTRL is known to work adaptively to various
+properties of an underlying environment. However, most existing adaptive
+learning rates are for online learning problems with a minimax regret of
+$\Theta(\sqrt{T})$ for the number of rounds $T$, and there are only a few
+studies on adaptive learning rates for problems with a minimax regret of
+$\Theta(T^{2/3})$, which include several important problems dealing with
+indirect feedback. To address this limitation, we establish a new adaptive
+learning rate framework for problems with a minimax regret of
+$\Theta(T^{2/3})$. Our learning rate is designed by matching the stability,
+penalty, and bias terms that naturally appear in regret upper bounds for
+problems with a minimax regret of $\Theta(T^{2/3})$. As applications of this
+framework, we consider two major problems dealing with indirect feedback:
+partial monitoring and graph bandits. We show that FTRL with our learning rate
+and the Tsallis entropy regularizer improves existing Best-of-Both-Worlds
+(BOBW) regret upper bounds, which achieve simultaneous optimality in the
+stochastic and adversarial regimes. The resulting learning rate is surprisingly
+simple compared to the existing learning rates for BOBW algorithms for problems
+with a minimax regret of $\Theta(T^{2/3})$.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>31 pages</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Safe Multi-agent Reinforcement Learning with Natural Language
+  Constraints 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20018v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20018v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Ziyan Wang, Meng Fang, Tristan Tomilin, Fei Fang, Yali Du
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  The role of natural language constraints in Safe Multi-agent Reinforcement
+Learning (MARL) is crucial, yet often overlooked. While Safe MARL has vast
+potential, especially in fields like robotics and autonomous vehicles, its full
+potential is limited by the need to define constraints in pre-designed
+mathematical terms, which requires extensive domain expertise and reinforcement
+learning knowledge, hindering its broader adoption. To address this limitation
+and make Safe MARL more accessible and adaptable, we propose a novel approach
+named Safe Multi-agent Reinforcement Learning with Natural Language constraints
+(SMALL). Our method leverages fine-tuned language models to interpret and
+process free-form textual constraints, converting them into semantic embeddings
+that capture the essence of prohibited states and behaviours. These embeddings
+are then integrated into the multi-agent policy learning process, enabling
+agents to learn policies that minimize constraint violations while optimizing
+rewards. To evaluate the effectiveness of SMALL, we introduce the LaMaSafe, a
+multi-task benchmark designed to assess the performance of multiple agents in
+adhering to natural language constraints. Empirical evaluations across various
+environments demonstrate that SMALL achieves comparable rewards and
+significantly fewer constraint violations, highlighting its effectiveness in
+understanding and enforcing natural language constraints.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>23 pages, 6 figures</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ subMFL: Compatiple subModel Generation for Federated Learning in Device
+  Heterogenous Environment 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20014v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20014v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Zeyneddin Oz, Ceylan Soygul Oz, Abdollah Malekjafarian, Nima Afraz, Fatemeh Golpayegani
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Federated Learning (FL) is commonly used in systems with distributed and
+heterogeneous devices with access to varying amounts of data and diverse
+computing and storage capacities. FL training process enables such devices to
+update the weights of a shared model locally using their local data and then a
+trusted central server combines all of those models to generate a global model.
+In this way, a global model is generated while the data remains local to
+devices to preserve privacy. However, training large models such as Deep Neural
+Networks (DNNs) on resource-constrained devices can take a prohibitively long
+time and consume a large amount of energy. In the current process, the
+low-capacity devices are excluded from the training process, although they
+might have access to unseen data. To overcome this challenge, we propose a
+model compression approach that enables heterogeneous devices with varying
+computing capacities to participate in the FL process. In our approach, the
+server shares a dense model with all devices to train it: Afterwards, the
+trained model is gradually compressed to obtain submodels with varying levels
+of sparsity to be used as suitable initial global models for
+resource-constrained devices that were not capable of train the first dense
+model. This results in an increased participation rate of resource-constrained
+devices while the transferred weights from the previous round of training are
+preserved. Our validation experiments show that despite reaching about 50 per
+cent global sparsity, generated submodels maintain their accuracy while can be
+shared to increase participation by around 50 per cent.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>12 pages, 7 figures, European Conference on Parallel Processing, pp.
+  between 52 and 64, Springer, 2023</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ FlexiDrop: Theoretical Insights and Practical Advances in Random Dropout
+  Method on GNNs 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20012v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20012v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Zhiheng Zhou, Sihao Liu, Weichen Zhao
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Graph Neural Networks (GNNs) are powerful tools for handling graph-type data.
+Recently, GNNs have been widely applied in various domains, but they also face
+some issues, such as overfitting, over-smoothing and non-robustness. The
+existing research indicates that random dropout methods are an effective way to
+address these issues. However, random dropout methods in GNNs still face
+unresolved problems. Currently, the choice of dropout rate, often determined by
+heuristic or grid search methods, can increase the generalization error,
+contradicting the principal aims of dropout. In this paper, we propose a novel
+random dropout method for GNNs called FlexiDrop. First, we conduct a
+theoretical analysis of dropout in GNNs using rademacher complexity and
+demonstrate that the generalization error of traditional random dropout methods
+is constrained by a function related to the dropout rate. Subsequently, we use
+this function as a regularizer to unify the dropout rate and empirical loss
+within a single loss function, optimizing them simultaneously. Therefore, our
+method enables adaptive adjustment of the dropout rate and theoretically
+balances the trade-off between model complexity and generalization ability.
+Furthermore, extensive experimental results on benchmark datasets show that
+FlexiDrop outperforms traditional random dropout methods in GNNs.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Kernel Language Entropy: Fine-grained Uncertainty Quantification for
+  LLMs from Semantic Similarities 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20003v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20003v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Alexander Nikitin, Jannik Kossen, Yarin Gal, Pekka Marttinen
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Uncertainty quantification in Large Language Models (LLMs) is crucial for
+applications where safety and reliability are important. In particular,
+uncertainty can be used to improve the trustworthiness of LLMs by detecting
+factually incorrect model responses, commonly called hallucinations.
+Critically, one should seek to capture the model's semantic uncertainty, i.e.,
+the uncertainty over the meanings of LLM outputs, rather than uncertainty over
+lexical or syntactic variations that do not affect answer correctness. To
+address this problem, we propose Kernel Language Entropy (KLE), a novel method
+for uncertainty estimation in white- and black-box LLMs. KLE defines positive
+semidefinite unit trace kernels to encode the semantic similarities of LLM
+outputs and quantifies uncertainty using the von Neumann entropy. It considers
+pairwise semantic dependencies between answers (or semantic clusters),
+providing more fine-grained uncertainty estimates than previous methods based
+on hard clustering of answers. We theoretically prove that KLE generalizes the
+previous state-of-the-art method called semantic entropy and empirically
+demonstrate that it improves uncertainty quantification performance across
+multiple natural language generation datasets and LLM architectures.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Symmetries in Overparametrized Neural Networks: A Mean-Field View 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.19995v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.19995v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Javier Maass Martínez, Joaquin Fontbona
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  We develop a Mean-Field (MF) view of the learning dynamics of
+overparametrized Artificial Neural Networks (NN) under data symmetric in law
+wrt the action of a general compact group $G$. We consider for this a class of
+generalized shallow NNs given by an ensemble of $N$ multi-layer units, jointly
+trained using stochastic gradient descent (SGD) and possibly
+symmetry-leveraging (SL) techniques, such as Data Augmentation (DA), Feature
+Averaging (FA) or Equivariant Architectures (EA). We introduce the notions of
+weakly and strongly invariant laws (WI and SI) on the parameter space of each
+single unit, corresponding, respectively, to $G$-invariant distributions, and
+to distributions supported on parameters fixed by the group action (which
+encode EA). This allows us to define symmetric models compatible with taking
+$N\to\infty$ and give an interpretation of the asymptotic dynamics of DA, FA
+and EA in terms of Wasserstein Gradient Flows describing their MF limits. When
+activations respect the group action, we show that, for symmetric data, DA, FA
+and freely-trained models obey the exact same MF dynamic, which stays in the
+space of WI laws and minimizes therein the population risk. We also give a
+counterexample to the general attainability of an optimum over SI laws. Despite
+this, quite remarkably, we show that the set of SI laws is also preserved by
+the MF dynamics even when freely trained. This sharply contrasts the finite-$N$
+setting, in which EAs are generally not preserved by unconstrained SGD. We
+illustrate the validity of our findings as $N$ gets larger in a teacher-student
+experimental setting, training a student NN to learn from a WI, SI or arbitrary
+teacher model through various SL schemes. We last deduce a data-driven
+heuristic to discover the largest subspace of parameters supporting SI
+distributions for a problem, that could be used for designing EA with minimal
+generalization error.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Video-Language Critic: Transferable Reward Functions for
+  Language-Conditioned Robotics 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.19988v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.19988v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Minttu Alakuijala, Reginald McLean, Isaac Woungang, Nariman Farsad, Samuel Kaski, Pekka Marttinen, Kai Yuan
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Natural language is often the easiest and most convenient modality for humans
+to specify tasks for robots. However, learning to ground language to behavior
+typically requires impractical amounts of diverse, language-annotated
+demonstrations collected on each target robot. In this work, we aim to separate
+the problem of what to accomplish from how to accomplish it, as the former can
+benefit from substantial amounts of external observation-only data, and only
+the latter depends on a specific robot embodiment. To this end, we propose
+Video-Language Critic, a reward model that can be trained on readily available
+cross-embodiment data using contrastive learning and a temporal ranking
+objective, and use it to score behavior traces from a separate reinforcement
+learning actor. When trained on Open X-Embodiment data, our reward model
+enables 2x more sample-efficient policy training on Meta-World tasks than a
+sparse reward only, despite a significant domain gap. Using in-domain data but
+in a challenging task generalization setting on Meta-World, we further
+demonstrate more sample-efficient training than is possible with prior
+language-conditioned reward models that are either trained with binary
+classification, use static images, or do not leverage the temporal information
+present in video data.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>10 pages in the main text, 16 pages including references and
+  supplementary materials. 4 figures and 3 tables in the main text, 1 table in
+  supplementary materials</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Targeted Sequential Indirect Experiment Design 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.19985v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.19985v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Elisabeth Ailer, Niclas Dern, Jason Hartford, Niki Kilbertus
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Scientific hypotheses typically concern specific aspects of complex,
+imperfectly understood or entirely unknown mechanisms, such as the effect of
+gene expression levels on phenotypes or how microbial communities influence
+environmental health. Such queries are inherently causal (rather than purely
+associational), but in many settings, experiments can not be conducted directly
+on the target variables of interest, but are indirect. Therefore, they perturb
+the target variable, but do not remove potential confounding factors. If,
+additionally, the resulting experimental measurements are multi-dimensional and
+the studied mechanisms nonlinear, the query of interest is generally not
+identified. We develop an adaptive strategy to design indirect experiments that
+optimally inform a targeted query about the ground truth mechanism in terms of
+sequentially narrowing the gap between an upper and lower bound on the query.
+While the general formulation consists of a bi-level optimization procedure, we
+derive an efficiently estimable analytical kernel-based estimator of the bounds
+for the causal effect, a query of key interest, and demonstrate the efficacy of
+our approach in confounded, multivariate, nonlinear synthetic settings.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Domain Adaptation with Cauchy-Schwarz Divergence <span class="chip">UAI-24</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.19978v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.19978v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Wenzhe Yin, Shujian Yu, Yicong Lin, Jie Liu, Jan-Jakob Sonke, Efstratios Gavves
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Domain adaptation aims to use training data from one or multiple source
+domains to learn a hypothesis that can be generalized to a different, but
+related, target domain. As such, having a reliable measure for evaluating the
+discrepancy of both marginal and conditional distributions is crucial. We
+introduce Cauchy-Schwarz (CS) divergence to the problem of unsupervised domain
+adaptation (UDA). The CS divergence offers a theoretically tighter
+generalization error bound than the popular Kullback-Leibler divergence. This
+holds for the general case of supervised learning, including multi-class
+classification and regression. Furthermore, we illustrate that the CS
+divergence enables a simple estimator on the discrepancy of both marginal and
+conditional distributions between source and target domains in the
+representation space, without requiring any distributional assumptions. We
+provide multiple examples to illustrate how the CS divergence can be
+conveniently used in both distance metric- or adversarial training-based UDA
+frameworks, resulting in compelling performance.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted by UAI-24</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Consistent Submodular Maximization <span class="chip">ICML 24</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.19977v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.19977v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Paul Dütting, Federico Fusco, Silvio Lattanzi, Ashkan Norouzi-Fard, Morteza Zadimoghaddam
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Maximizing monotone submodular functions under cardinality constraints is a
+classic optimization task with several applications in data mining and machine
+learning. In this paper we study this problem in a dynamic environment with
+consistency constraints: elements arrive in a streaming fashion and the goal is
+maintaining a constant approximation to the optimal solution while having a
+stable solution (i.e., the number of changes between two consecutive solutions
+is bounded). We provide algorithms in this setting with different trade-offs
+between consistency and approximation quality. We also complement our
+theoretical results with an experimental analysis showing the effectiveness of
+our algorithms in real-world instances.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>To appear at ICML 24</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ GasTrace: Detecting Sandwich Attack Malicious Accounts in Ethereum 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.19971v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.19971v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Zekai Liu, Xiaoqi Li, Hongli Peng, Wenkai Li
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  The openness and transparency of Ethereum transaction data make it easy to be
+exploited by any entities, executing malicious attacks. The sandwich attack
+manipulates the Automated Market Maker (AMM) mechanism, profiting from
+manipulating the market price through front or after-running transactions. To
+identify and prevent sandwich attacks, we propose a cascade classification
+framework GasTrace. GasTrace analyzes various transaction features to detect
+malicious accounts, notably through the analysis and modeling of Gas features.
+In the initial classification, we utilize the Support Vector Machine (SVM) with
+the Radial Basis Function (RBF) kernel to generate the predicted probabilities
+of accounts, further constructing a detailed transaction network. Subsequently,
+the behavior features are captured by the Graph Attention Network (GAT)
+technique in the second classification. Through cascade classification,
+GasTrace can analyze and classify the sandwich attacks. Our experimental
+results demonstrate that GasTrace achieves a remarkable detection and
+generation capability, performing an accuracy of 96.73\% and an F1 score of
+95.71\% for identifying sandwich attack accounts.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Improved Out-of-Scope Intent Classification with Dual Encoding and
+  Threshold-based Re-Classification 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.19967v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.19967v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Hossam M. Zawbaa, Wael Rashwan, Sourav Dutta, Haytham Assem
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Detecting out-of-scope user utterances is essential for task-oriented
+dialogues and intent classification. Current methodologies face difficulties
+with the unpredictable distribution of outliers and often rely on assumptions
+about data distributions. We present the Dual Encoder for Threshold-Based
+Re-Classification (DETER) to address these challenges. This end-to-end
+framework efficiently detects out-of-scope intents without requiring
+assumptions on data distributions or additional post-processing steps. The core
+of DETER utilizes dual text encoders, the Universal Sentence Encoder (USE) and
+the Transformer-based Denoising AutoEncoder (TSDAE), to generate user utterance
+embeddings, which are classified through a branched neural architecture.
+Further, DETER generates synthetic outliers using self-supervision and
+incorporates out-of-scope phrases from open-domain datasets. This approach
+ensures a comprehensive training set for out-of-scope detection. Additionally,
+a threshold-based re-classification mechanism refines the model's initial
+predictions. Evaluations on the CLINC-150, Stackoverflow, and Banking77
+datasets demonstrate DETER's efficacy. Our model outperforms previous
+benchmarks, increasing up to 13% and 5% in F1 score for known and unknown
+intents on CLINC-150 and Stackoverflow, and 16% for known and 24% % for unknown
+intents on Banking77. The source code has been released at
+https://github.com/Hossam-Mohammed-tech/Intent\_Classification\_OOS.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Collective Variable Free Transition Path Sampling with Generative Flow
+  Network 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.19961v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.19961v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Kiyoung Seong, Seonghyun Park, Seonghwan Kim, Woo Youn Kim, Sungsoo Ahn
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Understanding transition paths between meta-stable states in molecular
+systems is fundamental for material design and drug discovery. However,
+sampling these paths via molecular dynamics simulations is computationally
+prohibitive due to the high-energy barriers between the meta-stable states.
+Recent machine learning approaches are often restricted to simple systems or
+rely on collective variables (CVs) extracted from expensive domain knowledge.
+In this work, we propose to leverage generative flow networks (GFlowNets) to
+sample transition paths without relying on CVs. We reformulate the problem as
+amortized energy-based sampling over molecular trajectories and train a bias
+potential by minimizing the squared log-ratio between the target distribution
+and the generator, derived from the flow matching objective of GFlowNets. Our
+evaluation on three proteins (Alanine Dipeptide, Polyproline, and Chignolin)
+demonstrates that our approach, called TPS-GFN, generates more realistic and
+diverse transition paths than the previous CV-free machine learning approach.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>9 pages, 5 figures, 2 tables</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ GenKubeSec: LLM-Based Kubernetes Misconfiguration Detection,
+  Localization, Reasoning, and Remediation 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.19954v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.19954v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Ehud Malul, Yair Meidan, Dudu Mimran, Yuval Elovici, Asaf Shabtai
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  A key challenge associated with Kubernetes configuration files (KCFs) is that
+they are often highly complex and error-prone, leading to security
+vulnerabilities and operational setbacks. Rule-based (RB) tools for KCF
+misconfiguration detection rely on static rule sets, making them inherently
+limited and unable to detect newly-discovered misconfigurations. RB tools also
+suffer from misdetection, since mistakes are likely when coding the detection
+rules. Recent methods for detecting and remediating KCF misconfigurations are
+limited in terms of their scalability and detection coverage, or due to the
+fact that they have high expertise requirements and do not offer automated
+remediation along with misconfiguration detection. Novel approaches that employ
+LLMs in their pipeline rely on API-based, general-purpose, and mainly
+commercial models. Thus, they pose security challenges, have inconsistent
+classification performance, and can be costly. In this paper, we propose
+GenKubeSec, a comprehensive and adaptive, LLM-based method, which, in addition
+to detecting a wide variety of KCF misconfigurations, also identifies the exact
+location of the misconfigurations and provides detailed reasoning about them,
+along with suggested remediation. When empirically compared with three
+industry-standard RB tools, GenKubeSec achieved equivalent precision (0.990)
+and superior recall (0.999). When a random sample of KCFs was examined by a
+Kubernetes security expert, GenKubeSec's explanations as to misconfiguration
+localization, reasoning and remediation were 100% correct, informative and
+useful. To facilitate further advancements in this domain, we share the unique
+dataset we collected, a unified misconfiguration index we developed for label
+standardization, our experimentation code, and GenKubeSec itself as an
+open-source tool.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ MM-Lego: Modular Biomedical Multimodal Models with Minimal Fine-Tuning 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.19950v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.19950v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Konstantin Hemker, Nikola Simidjievski, Mateja Jamnik
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Learning holistic computational representations in physical, chemical or
+biological systems requires the ability to process information from different
+distributions and modalities within the same model. Thus, the demand for
+multimodal machine learning models has sharply risen for modalities that go
+beyond vision and language, such as sequences, graphs, time series, or tabular
+data. While there are many available multimodal fusion and alignment
+approaches, most of them require end-to-end training, scale quadratically with
+the number of modalities, cannot handle cases of high modality imbalance in the
+training set, or are highly topology-specific, making them too restrictive for
+many biomedical learning tasks. This paper presents Multimodal Lego (MM-Lego),
+a modular and general-purpose fusion and model merging framework to turn any
+set of encoders into a competitive multimodal model with no or minimal
+fine-tuning. We achieve this by introducing a wrapper for unimodal encoders
+that enforces lightweight dimensionality assumptions between modalities and
+harmonises their representations by learning features in the frequency domain
+to enable model merging with little signal interference. We show that MM-Lego
+1) can be used as a model merging method which achieves competitive performance
+with end-to-end fusion models without any fine-tuning, 2) can operate on any
+unimodal encoder, and 3) is a model fusion method that, with minimal
+fine-tuning, achieves state-of-the-art results on six benchmarked multimodal
+biomedical tasks.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Learning Latent Graph Structures and their Uncertainty 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.19933v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.19933v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Alessandro Manenti, Daniele Zambon, Cesare Alippi
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Within a prediction task, Graph Neural Networks (GNNs) use relational
+information as an inductive bias to enhance the model's accuracy. As
+task-relevant relations might be unknown, graph structure learning approaches
+have been proposed to learn them while solving the downstream prediction task.
+In this paper, we demonstrate that minimization of a point-prediction loss
+function, e.g., the mean absolute error, does not guarantee proper learning of
+the latent relational information and its associated uncertainty. Conversely,
+we prove that a suitable loss function on the stochastic model outputs
+simultaneously grants (i) the unknown adjacency matrix latent distribution and
+(ii) optimal performance on the prediction task. Finally, we propose a
+sampling-based method that solves this joint learning task. Empirical results
+validate our theoretical claims and demonstrate the effectiveness of the
+proposed approach.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Exploring Diffusion Models' Corruption Stage in Few-Shot Fine-tuning and
+  Mitigating with Bayesian Neural Networks 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.19931v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.19931v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Xiaoyu Wu, Jiaru Zhang, Yang Hua, Bohan Lyu, Hao Wang, Tao Song, Haibing Guan
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Few-shot fine-tuning of Diffusion Models (DMs) is a key advancement,
+significantly reducing training costs and enabling personalized AI
+applications. However, we explore the training dynamics of DMs and observe an
+unanticipated phenomenon: during the training process, image fidelity initially
+improves, then unexpectedly deteriorates with the emergence of noisy patterns,
+only to recover later with severe overfitting. We term the stage with generated
+noisy patterns as corruption stage. To understand this corruption stage, we
+begin by theoretically modeling the one-shot fine-tuning scenario, and then
+extend this modeling to more general cases. Through this modeling, we identify
+the primary cause of this corruption stage: a narrowed learning distribution
+inherent in the nature of few-shot fine-tuning. To tackle this, we apply
+Bayesian Neural Networks (BNNs) on DMs with variational inference to implicitly
+broaden the learned distribution, and present that the learning target of the
+BNNs can be naturally regarded as an expectation of the diffusion loss and a
+further regularization with the pretrained DMs. This approach is highly
+compatible with current few-shot fine-tuning methods in DMs and does not
+introduce any extra inference costs. Experimental results demonstrate that our
+method significantly mitigates corruption, and improves the fidelity, quality
+and diversity of the generated images in both object-driven and subject-driven
+generation tasks.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Preprint. Under review</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ BAN: Detecting Backdoors Activated by Adversarial Neuron Noise 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.19928v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.19928v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Xiaoyun Xu, Zhuoran Liu, Stefanos Koffas, Shujian Yu, Stjepan Picek
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Backdoor attacks on deep learning represent a recent threat that has gained
+significant attention in the research community. Backdoor defenses are mainly
+based on backdoor inversion, which has been shown to be generic,
+model-agnostic, and applicable to practical threat scenarios. State-of-the-art
+backdoor inversion recovers a mask in the feature space to locate prominent
+backdoor features, where benign and backdoor features can be disentangled.
+However, it suffers from high computational overhead, and we also find that it
+overly relies on prominent backdoor features that are highly distinguishable
+from benign features. To tackle these shortcomings, this paper improves
+backdoor feature inversion for backdoor detection by incorporating extra neuron
+activation information. In particular, we adversarially increase the loss of
+backdoored models with respect to weights to activate the backdoor effect,
+based on which we can easily differentiate backdoored and clean models.
+Experimental results demonstrate our defense, BAN, is 1.37$\times$ (on
+CIFAR-10) and 5.11$\times$ (on ImageNet200) more efficient with 9.99% higher
+detect success rate than the state-of-the-art defense BTI-DBF. Our code and
+trained models are publicly
+available.\url{https://anonymous.4open.science/r/ban-4B32}
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Unraveling the Impact of Heterophilic Structures on Graph
+  Positive-Unlabeled Learning <span class="chip">ICML 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.19919v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.19919v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Yuhao Wu, Jiangchao Yao, Bo Han, Lina Yao, Tongliang Liu
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  While Positive-Unlabeled (PU) learning is vital in many real-world scenarios,
+its application to graph data still remains under-explored. We unveil that a
+critical challenge for PU learning on graph lies on the edge heterophily, which
+directly violates the irreducibility assumption for Class-Prior Estimation
+(class prior is essential for building PU learning algorithms) and degenerates
+the latent label inference on unlabeled nodes during classifier training. In
+response to this challenge, we introduce a new method, named Graph PU Learning
+with Label Propagation Loss (GPL). Specifically, GPL considers learning from PU
+nodes along with an intermediate heterophily reduction, which helps mitigate
+the negative impact of the heterophilic structure. We formulate this procedure
+as a bilevel optimization that reduces heterophily in the inner loop and
+efficiently learns a classifier in the outer loop. Extensive experiments across
+a variety of datasets have shown that GPL significantly outperforms baseline
+methods, confirming its effectiveness and superiority.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>ICML 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Robust Kernel Hypothesis Testing under Data Corruption 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.19912v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.19912v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Antonin Schrab, Ilmun Kim
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  We propose two general methods for constructing robust permutation tests
+under data corruption. The proposed tests effectively control the
+non-asymptotic type I error under data corruption, and we prove their
+consistency in power under minimal conditions. This contributes to the
+practical deployment of hypothesis tests for real-world applications with
+potential adversarial attacks. One of our methods inherently ensures
+differential privacy, further broadening its applicability to private data
+analysis. For the two-sample and independence settings, we show that our kernel
+robust tests are minimax optimal, in the sense that they are guaranteed to be
+non-asymptotically powerful against alternatives uniformly separated from the
+null in the kernel MMD and HSIC metrics at some optimal rate (tight with
+matching lower bound). Finally, we provide publicly available implementations
+and empirically illustrate the practicality of our proposed tests.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>26 pages, 2 figures, 2 algorithms</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Adaptive Advantage-Guided Policy Regularization for Offline
+  Reinforcement Learning <span class="chip">ICML 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.19909v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.19909v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Tenglong Liu, Yang Li, Yixing Lan, Hao Gao, Wei Pan, Xin Xu
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  In offline reinforcement learning, the challenge of out-of-distribution (OOD)
+is pronounced. To address this, existing methods often constrain the learned
+policy through policy regularization. However, these methods often suffer from
+the issue of unnecessary conservativeness, hampering policy improvement. This
+occurs due to the indiscriminate use of all actions from the behavior policy
+that generates the offline dataset as constraints. The problem becomes
+particularly noticeable when the quality of the dataset is suboptimal. Thus, we
+propose Adaptive Advantage-guided Policy Regularization (A2PR), obtaining
+high-advantage actions from an augmented behavior policy combined with VAE to
+guide the learned policy. A2PR can select high-advantage actions that differ
+from those present in the dataset, while still effectively maintaining
+conservatism from OOD actions. This is achieved by harnessing the VAE capacity
+to generate samples matching the distribution of the data points. We
+theoretically prove that the improvement of the behavior policy is guaranteed.
+Besides, it effectively mitigates value overestimation with a bounded
+performance gap. Empirically, we conduct a series of experiments on the D4RL
+benchmark, where A2PR demonstrates state-of-the-art performance. Furthermore,
+experimental results on additional suboptimal mixed datasets reveal that A2PR
+exhibits superior performance. Code is available at
+https://github.com/ltlhuuu/A2PR.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>ICML 2024, 19 pages</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Learning Discriminative Dynamics with Label Corruption for Noisy Label
+  Detection <span class="chip">CVPR 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.19902v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.19902v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Suyeon Kim, Dongha Lee, SeongKu Kang, Sukang Chae, Sanghwan Jang, Hwanjo Yu
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Label noise, commonly found in real-world datasets, has a detrimental impact
+on a model's generalization. To effectively detect incorrectly labeled
+instances, previous works have mostly relied on distinguishable training
+signals, such as training loss, as indicators to differentiate between clean
+and noisy labels. However, they have limitations in that the training signals
+incompletely reveal the model's behavior and are not effectively generalized to
+various noise types, resulting in limited detection accuracy. In this paper, we
+propose DynaCor framework that distinguishes incorrectly labeled instances from
+correctly labeled ones based on the dynamics of the training signals. To cope
+with the absence of supervision for clean and noisy labels, DynaCor first
+introduces a label corruption strategy that augments the original dataset with
+intentionally corrupted labels, enabling indirect simulation of the model's
+behavior on noisy labels. Then, DynaCor learns to identify clean and noisy
+instances by inducing two clearly distinguishable clusters from the latent
+representations of training dynamics. Our comprehensive experiments show that
+DynaCor outperforms the state-of-the-art competitors and shows strong
+robustness to various noise types and noise rates.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted to CVPR 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Urban Air Pollution Forecasting: a Machine Learning Approach leveraging
+  Satellite Observations and Meteorological Forecasts 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.19901v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.19901v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Giacomo Blanco, Luca Barco, Lorenzo Innocenti, Claudio Rossi
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Air pollution poses a significant threat to public health and well-being,
+particularly in urban areas. This study introduces a series of machine-learning
+models that integrate data from the Sentinel-5P satellite, meteorological
+conditions, and topological characteristics to forecast future levels of five
+major pollutants. The investigation delineates the process of data collection,
+detailing the combination of diverse data sources utilized in the study.
+Through experiments conducted in the Milan metropolitan area, the models
+demonstrate their efficacy in predicting pollutant levels for the forthcoming
+day, achieving a percentage error of around 30%. The proposed models are
+advantageous as they are independent of monitoring stations, facilitating their
+use in areas without existing infrastructure. Additionally, we have released
+the collected dataset to the public, aiming to stimulate further research in
+this field. This research contributes to advancing our understanding of urban
+air quality dynamics and emphasizes the importance of amalgamating satellite,
+meteorological, and topographical data to develop robust pollution forecasting
+models.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>5 pages, 2 figures, submitted to IEEE MetroLivEnv 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Similarity is Not All You Need: Endowing Retrieval Augmented Generation
+  with Multi Layered Thoughts 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.19893v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.19893v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Chunjing Gan, Dan Yang, Binbin Hu, Hanxiao Zhang, Siyuan Li, Ziqi Liu, Yue Shen, Lin Ju, Zhiqiang Zhang, Jinjie Gu, Lei Liang, Jun Zhou
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  In recent years, large language models (LLMs) have made remarkable
+achievements in various domains. However, the untimeliness and cost of
+knowledge updates coupled with hallucination issues of LLMs have curtailed
+their applications in knowledge intensive tasks, where retrieval augmented
+generation (RAG) can be of help. Nevertheless, existing retrieval augmented
+models typically use similarity as a bridge between queries and documents and
+follow a retrieve then read procedure. In this work, we argue that similarity
+is not always the panacea and totally relying on similarity would sometimes
+degrade the performance of retrieval augmented generation. To this end, we
+propose MetRag, a Multi layEred Thoughts enhanced Retrieval Augmented
+Generation framework. To begin with, beyond existing similarity oriented
+thought, we embrace a small scale utility model that draws supervision from an
+LLM for utility oriented thought and further come up with a smarter model by
+comprehensively combining the similarity and utility oriented thoughts.
+Furthermore, given the fact that the retrieved document set tends to be huge
+and using them in isolation makes it difficult to capture the commonalities and
+characteristics among them, we propose to make an LLM as a task adaptive
+summarizer to endow retrieval augmented generation with compactness-oriented
+thought. Finally, with multi layered thoughts from the precedent stages, an LLM
+is called for knowledge augmented generation. Extensive experiments on
+knowledge-intensive tasks have demonstrated the superiority of MetRag.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>12 pages</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Deep Joint Semantic Coding and Beamforming for Near-Space Airship-Borne
+  Massive MIMO Network 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.19889v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.19889v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Minghui Wu, Zhen Gao, Zhaocheng Wang, Dusit Niyato, George K. Karagiannidis, Sheng Chen
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Near-space airship-borne communication network is recognized to be an
+indispensable component of the future integrated ground-air-space network
+thanks to airships' advantage of long-term residency at stratospheric
+altitudes, but it urgently needs reliable and efficient Airship-to-X link. To
+improve the transmission efficiency and capacity, this paper proposes to
+integrate semantic communication with massive multiple-input multiple-output
+(MIMO) technology. Specifically, we propose a deep joint semantic coding and
+beamforming (JSCBF) scheme for airship-based massive MIMO image transmission
+network in space, in which semantics from both source and channel are fused to
+jointly design the semantic coding and physical layer beamforming. First, we
+design two semantic extraction networks to extract semantics from image source
+and channel state information, respectively. Then, we propose a semantic fusion
+network that can fuse these semantics into complex-valued semantic features for
+subsequent physical-layer transmission. To efficiently transmit the fused
+semantic features at the physical layer, we then propose the hybrid data and
+model-driven semantic-aware beamforming networks. At the receiver, a semantic
+decoding network is designed to reconstruct the transmitted images. Finally, we
+perform end-to-end deep learning to jointly train all the modules, using the
+image reconstruction quality at the receivers as a metric. The proposed deep
+JSCBF scheme fully combines the efficient source compressibility and robust
+error correction capability of semantic communication with the high spectral
+efficiency of massive MIMO, achieving a significant performance improvement
+over existing approaches.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Major Revision by IEEE JSAC</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Parrot: Efficient Serving of LLM-based Applications with Semantic
+  Variable <span class="chip">OSDI 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.19888v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.19888v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Chaofan Lin, Zhenhua Han, Chengruidong Zhang, Yuqing Yang, Fan Yang, Chen Chen, Lili Qiu
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  The rise of large language models (LLMs) has enabled LLM-based applications
+(a.k.a. AI agents or co-pilots), a new software paradigm that combines the
+strength of LLM and conventional software. Diverse LLM applications from
+different tenants could design complex workflows using multiple LLM requests to
+accomplish one task. However, they have to use the over-simplified
+request-level API provided by today's public LLM services, losing essential
+application-level information. Public LLM services have to blindly optimize
+individual LLM requests, leading to sub-optimal end-to-end performance of LLM
+applications.
+  This paper introduces Parrot, an LLM service system that focuses on the
+end-to-end experience of LLM-based applications. Parrot proposes Semantic
+Variable, a unified abstraction to expose application-level knowledge to public
+LLM services. A Semantic Variable annotates an input/output variable in the
+prompt of a request, and creates the data pipeline when connecting multiple LLM
+requests, providing a natural way to program LLM applications. Exposing
+Semantic Variables to the public LLM service allows it to perform conventional
+data flow analysis to uncover the correlation across multiple LLM requests.
+This correlation opens a brand-new optimization space for the end-to-end
+performance of LLM-based applications. Extensive evaluations demonstrate that
+Parrot can achieve up to an order-of-magnitude improvement for popular and
+practical use cases of LLM applications.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>To appear on USENIX OSDI 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Federated Learning with Multi-resolution Model Broadcast 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.19886v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.19886v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Henrik Rydén, Reza Moosavi, Erik G. Larsson
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  In federated learning, a server must periodically broadcast a model to the
+agents. We propose to use multi-resolution coding and modulation (also known as
+non-uniform modulation) for this purpose. In the simplest instance, broadcast
+transmission is used, whereby all agents are targeted with one and the same
+transmission (typically without any particular favored beam direction), which
+is coded using multi-resolution coding/modulation. This enables high-SNR
+agents, with high path gains to the server, to receive a more accurate model
+than the low-SNR agents do, without consuming more downlink resources. As one
+implementation, we use transmission with a non-uniform 8-PSK constellation,
+where a high-SNR receiver (agent) can separate all 8 constellation points
+(hence receive 3 bits) whereas a low-SNR receiver can only separate 4 points
+(hence receive 2 bits). By encoding the least significant information in the
+third bit, the high-SNR receivers can obtain the model with higher accuracy,
+while the low-SNR receiver can still obtain the model although with reduced
+accuracy, thereby facilitating at least some basic participation of the low-SNR
+receiver. We show the effectiveness of our proposed scheme via experimentation
+using federated learning with the MNIST data-set.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Fourier Controller Networks for Real-Time Decision-Making in Embodied
+  Learning 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.19885v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.19885v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Hengkai Tan, Songming Liu, Kai Ma, Chengyang Ying, Xingxing Zhang, Hang Su, Jun Zhu
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Reinforcement learning is able to obtain generalized low-level robot policies
+on diverse robotics datasets in embodied learning scenarios, and Transformer
+has been widely used to model time-varying features. However, it still suffers
+from the issues of low data efficiency and high inference latency. In this
+paper, we propose to investigate the task from a new perspective of the
+frequency domain. We first observe that the energy density in the frequency
+domain of a robot's trajectory is mainly concentrated in the low-frequency
+part. Then, we present the Fourier Controller Network (FCNet), a new network
+that utilizes the Short-Time Fourier Transform (STFT) to extract and encode
+time-varying features through frequency domain interpolation. We further
+achieve parallel training and efficient recurrent inference by using FFT and
+Sliding DFT methods in the model architecture for real-time decision-making.
+Comprehensive analyses in both simulated (e.g., D4RL) and real-world
+environments (e.g., robot locomotion) demonstrate FCNet's substantial
+efficiency and effectiveness over existing methods such as Transformer, e.g.,
+FCNet outperforms Transformer on multi-environmental robotics datasets of all
+types of sizes (from 1.9M to 120M). The project page and code can be found
+https://thkkk.github.io/fcnet.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ From Words to Actions: Unveiling the Theoretical Underpinnings of
+  LLM-Driven Autonomous Systems <span class="chip">ICML 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.19883v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.19883v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Jianliang He, Siyu Chen, Fengzhuo Zhang, Zhuoran Yang
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  In this work, from a theoretical lens, we aim to understand why large
+language model (LLM) empowered agents are able to solve decision-making
+problems in the physical world. To this end, consider a hierarchical
+reinforcement learning (RL) model where the LLM Planner and the Actor perform
+high-level task planning and low-level execution, respectively. Under this
+model, the LLM Planner navigates a partially observable Markov decision process
+(POMDP) by iteratively generating language-based subgoals via prompting. Under
+proper assumptions on the pretraining data, we prove that the pretrained LLM
+Planner effectively performs Bayesian aggregated imitation learning (BAIL)
+through in-context learning. Additionally, we highlight the necessity for
+exploration beyond the subgoals derived from BAIL by proving that naively
+executing the subgoals returned by LLM leads to a linear regret. As a remedy,
+we introduce an $\epsilon$-greedy exploration strategy to BAIL, which is proven
+to incur sublinear regret when the pretraining error is small. Finally, we
+extend our theoretical framework to include scenarios where the LLM Planner
+serves as a world model for inferring the transition model of the environment
+and to multi-agent settings, enabling coordination among multiple Actors.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted by ICML 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Learning from Random Demonstrations: Offline Reinforcement Learning with
+  Importance-Sampled Diffusion Models 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.19878v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.19878v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Zeyu Fang, Tian Lan
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Generative models such as diffusion have been employed as world models in
+offline reinforcement learning to generate synthetic data for more effective
+learning. Existing work either generates diffusion models one-time prior to
+training or requires additional interaction data to update it. In this paper,
+we propose a novel approach for offline reinforcement learning with closed-loop
+policy evaluation and world-model adaptation. It iteratively leverages a guided
+diffusion world model to directly evaluate the offline target policy with
+actions drawn from it, and then performs an importance-sampled world model
+update to adaptively align the world model with the updated policy. We analyzed
+the performance of the proposed method and provided an upper bound on the
+return gap between our method and the real environment under an optimal policy.
+The result sheds light on various factors affecting learning performance.
+Evaluations in the D4RL environment show significant improvement over
+state-of-the-art baselines, especially when only random or medium-expertise
+demonstrations are available -- thus requiring improved alignment between the
+world model and offline policy evaluation.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Is In-Context Learning Sufficient for Instruction Following in LLMs? 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.19874v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.19874v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Hao Zhao, Maksym Andriushchenko, Francesco Croce, Nicolas Flammarion
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  In-context learning (ICL) allows LLMs to learn from examples without changing
+their weights, which is a particularly promising capability for long-context
+LLMs that can potentially learn from many examples. Recently, Lin et al. (2024)
+proposed URIAL, a method using only three in-context examples to align base
+LLMs, achieving non-trivial instruction following performance. In this work, we
+show that, while effective, ICL alignment with URIAL still underperforms
+compared to instruction fine-tuning on established benchmarks such as MT-Bench
+and AlpacaEval 2.0 (LC), especially with more capable base LMs. Unlike for
+tasks such as classification, translation, or summarization, adding more ICL
+demonstrations for long-context LLMs does not systematically improve
+instruction following performance. To address this limitation, we derive a
+greedy selection approach for ICL examples that noticeably improves
+performance, yet without bridging the gap to instruction fine-tuning. Finally,
+we provide a series of ablation studies to better understand the reasons behind
+the remaining gap, and we show how some aspects of ICL depart from the existing
+knowledge and are specific to the instruction tuning setting. Overall, our work
+advances the understanding of ICL as an alignment technique. We provide our
+code at https://github.com/tml-epfl/icl-alignment.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Preprint. Code at https://github.com/tml-epfl/icl-alignment</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ On Vessel Location Forecasting and the Effect of Federated Learning 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.19870v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.19870v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Andreas Tritsarolis, Nikos Pelekis, Konstantina Bereta, Dimitris Zissis, Yannis Theodoridis
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  The wide spread of Automatic Identification System (AIS) has motivated
+several maritime analytics operations. Vessel Location Forecasting (VLF) is one
+of the most critical operations for maritime awareness. However, accurate VLF
+is a challenging problem due to the complexity and dynamic nature of maritime
+traffic conditions. Furthermore, as privacy concerns and restrictions have
+grown, training data has become increasingly fragmented, resulting in dispersed
+databases of several isolated data silos among different organizations, which
+in turn decreases the quality of learning models. In this paper, we propose an
+efficient VLF solution based on LSTM neural networks, in two variants, namely
+Nautilus and FedNautilus for the centralized and the federated learning
+approach, respectively. We also demonstrate the superiority of the centralized
+approach with respect to current state of the art and discuss the advantages
+and disadvantages of the federated against the centralized approach.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Out-of-distribution Reject Option Method for <span class="highlight-title">Dataset</span> Shift Problem in
+  Early Disease Onset Prediction 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.19864v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.19864v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Taisei Tosaki, Eiichiro Uchino, Ryosuke Kojima, Yohei Mineharu, Mikio Arita, Nobuyuki Miyai, Yoshinori Tamada, Tatsuya Mikami, Koichi Murashita, Shigeyuki Nakaji, Yasushi Okuno
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Machine learning is increasingly used to predict lifestyle-related disease
+onset using health and medical data. However, the prediction effectiveness is
+hindered by dataset shift, which involves discrepancies in data distribution
+between the training and testing datasets, misclassifying out-of-distribution
+(OOD) data. To diminish dataset shift effects, this paper proposes the
+out-of-distribution reject option for prediction (ODROP), which integrates OOD
+detection models to preclude OOD data from the prediction phase. We
+investigated the efficacy of five OOD detection methods (variational
+autoencoder, neural network ensemble std, neural network ensemble epistemic,
+neural network energy, and neural network gaussian mixture based energy
+measurement) across two datasets, the Hirosaki and Wakayama health checkup
+data, in the context of three disease onset prediction tasks: diabetes,
+dyslipidemia, and hypertension. To evaluate the ODROP method, we trained
+disease onset prediction models and OOD detection models on Hirosaki data and
+used AUROC-rejection curve plots from Wakayama data. The variational
+autoencoder method showed superior stability and magnitude of improvement in
+Area Under the Receiver Operating Curve (AUROC) in five cases: AUROC in the
+Wakayama data was improved from 0.80 to 0.90 at a 31.1% rejection rate for
+diabetes onset and from 0.70 to 0.76 at a 34% rejection rate for dyslipidemia.
+We categorized dataset shifts into two types using SHAP clustering - those that
+considerably affect predictions and those that do not. We expect that this
+classification will help standardize measuring instruments. This study is the
+first to apply OOD detection to actual health and medical data, demonstrating
+its potential to substantially improve the accuracy and reliability of disease
+prediction models amidst dataset shift.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ The Merit of River Network Topology for Neural Flood Forecasting 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.19836v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.19836v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Nikolas Kirschstein, Yixuan Sun
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Climate change exacerbates riverine floods, which occur with higher frequency
+and intensity than ever. The much-needed forecasting systems typically rely on
+accurate river discharge predictions. To this end, the SOTA data-driven
+approaches treat forecasting at spatially distributed gauge stations as
+isolated problems, even within the same river network. However, incorporating
+the known topology of the river network into the prediction model has the
+potential to leverage the adjacency relationship between gauges. Thus, we model
+river discharge for a network of gauging stations with GNNs and compare the
+forecasting performance achieved by different adjacency definitions. Our
+results show that the model fails to benefit from the river network topology
+information, both on the entire network and small subgraphs. The learned edge
+weights correlate with neither of the static definitions and exhibit no regular
+pattern. Furthermore, the GNNs struggle to predict sudden, narrow discharge
+spikes. Our work hints at a more general underlying phenomenon of neural
+prediction not always benefitting from graphical structure and may inspire a
+systematic study of the conditions under which this happens.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>https://openreview.net/forum?id=QE6iC9s6vU</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Joint Selective State Space Model and Detrending for Robust Time Series
+  Anomaly Detection 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.19823v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.19823v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Junqi Chen, Xu Tan, Sylwan Rahardja, Jiawei Yang, Susanto Rahardja
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Deep learning-based sequence models are extensively employed in Time Series
+Anomaly Detection (TSAD) tasks due to their effective sequential modeling
+capabilities. However, the ability of TSAD is limited by two key challenges:
+(i) the ability to model long-range dependency and (ii) the generalization
+issue in the presence of non-stationary data. To tackle these challenges, an
+anomaly detector that leverages the selective state space model known for its
+proficiency in capturing long-term dependencies across various domains is
+proposed. Additionally, a multi-stage detrending mechanism is introduced to
+mitigate the prominent trend component in non-stationary data to address the
+generalization issue. Extensive experiments conducted on realworld public
+datasets demonstrate that the proposed methods surpass all 12 compared baseline
+methods.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Submitted to IEEE Signal Processing Letters</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Approximate Global Convergence of Independent Learning in Multi-Agent
+  Systems 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.19811v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.19811v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Ruiyang Jin, Zaiwei Chen, Yiheng Lin, Jie Song, Adam Wierman
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Independent learning (IL), despite being a popular approach in practice to
+achieve scalability in large-scale multi-agent systems, usually lacks global
+convergence guarantees. In this paper, we study two representative algorithms,
+independent $Q$-learning and independent natural actor-critic, within
+value-based and policy-based frameworks, and provide the first finite-sample
+analysis for approximate global convergence. The results imply a sample
+complexity of $\tilde{\mathcal{O}}(\epsilon^{-2})$ up to an error term that
+captures the dependence among agents and characterizes the fundamental limit of
+IL in achieving global convergence. To establish the result, we develop a novel
+approach for analyzing IL by constructing a separable Markov decision process
+(MDP) for convergence analysis and then bounding the gap due to model
+difference between the separable MDP and the original one. Moreover, we conduct
+numerical experiments using a synthetic MDP and an electric vehicle charging
+example to verify our theoretical findings and to demonstrate the practical
+applicability of IL.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ MetaCURL: Non-stationary Concave Utility Reinforcement Learning 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.19807v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.19807v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Bianca Marin Moreno, Margaux Brégère, Pierre Gaillard, Nadia Oudjane
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  We explore online learning in episodic loop-free Markov decision processes on
+non-stationary environments (changing losses and probability transitions). Our
+focus is on the Concave Utility Reinforcement Learning problem (CURL), an
+extension of classical RL for handling convex performance criteria in
+state-action distributions induced by agent policies. While various machine
+learning problems can be written as CURL, its non-linearity invalidates
+traditional Bellman equations. Despite recent solutions to classical CURL, none
+address non-stationary MDPs. This paper introduces MetaCURL, the first CURL
+algorithm for non-stationary MDPs. It employs a meta-algorithm running multiple
+black-box algorithms instances over different intervals, aggregating outputs
+via a sleeping expert framework. The key hurdle is partial information due to
+MDP uncertainty. Under partial information on the probability transitions
+(uncertainty and non-stationarity coming only from external noise, independent
+of agent state-action pairs), we achieve optimal dynamic regret without prior
+knowledge of MDP changes. Unlike approaches for RL, MetaCURL handles full
+adversarial losses, not just stochastic ones. We believe our approach for
+managing non-stationarity with experts can be of interest to the RL community.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Unveiling and Mitigating Backdoor Vulnerabilities based on Unlearning
+  Weight Changes and Backdoor Activeness 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20291v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20291v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Weilin Lin, Li Liu, Shaokui Wei, Jianze Li, Hui Xiong
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  The security threat of backdoor attacks is a central concern for deep neural
+networks (DNNs). Recently, without poisoned data, unlearning models with clean
+data and then learning a pruning mask have contributed to backdoor defense.
+Additionally, vanilla fine-tuning with those clean data can help recover the
+lost clean accuracy. However, the behavior of clean unlearning is still
+under-explored, and vanilla fine-tuning unintentionally induces back the
+backdoor effect. In this work, we first investigate model unlearning from the
+perspective of weight changes and gradient norms, and find two interesting
+observations in the backdoored model: 1) the weight changes between poison and
+clean unlearning are positively correlated, making it possible for us to
+identify the backdoored-related neurons without using poisoned data; 2) the
+neurons of the backdoored model are more active (i.e., larger changes in
+gradient norm) than those in the clean model, suggesting the need to suppress
+the gradient norm during fine-tuning. Then, we propose an effective two-stage
+defense method. In the first stage, an efficient Neuron Weight Change
+(NWC)-based Backdoor Reinitialization is proposed based on observation 1). In
+the second stage, based on observation 2), we design an Activeness-Aware
+Fine-Tuning to replace the vanilla fine-tuning. Extensive experiments,
+involving eight backdoor attacks on three benchmark datasets, demonstrate the
+superior performance of our proposed method compared to recent state-of-the-art
+backdoor defense approaches.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Recurrent Drafter for Fast Speculative Decoding in Large Language Models 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2403.09919v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2403.09919v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Aonan Zhang, Chong Wang, Yi Wang, Xuanyu Zhang, Yunfei Cheng
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  In this paper, we introduce an improved approach of speculative decoding
+aimed at enhancing the efficiency of serving large language models. Our method
+capitalizes on the strengths of two established techniques: the classic
+two-model speculative decoding approach, and the more recent single-model
+approach, Medusa. Drawing inspiration from Medusa, our approach adopts a
+single-model strategy for speculative decoding. However, our method
+distinguishes itself by employing a single, lightweight draft head with a
+recurrent dependency design, akin in essence to the small, draft model uses in
+classic speculative decoding, but without the complexities of the full
+transformer architecture. And because of the recurrent dependency, we can use
+beam search to swiftly filter out undesired candidates with the draft head. The
+outcome is a method that combines the simplicity of single-model design and
+avoids the need to create a data-dependent tree attention structure only for
+inference in Medusa. We empirically demonstrate the effectiveness of the
+proposed method on several popular open source language models, along with a
+comprehensive analysis of the trade-offs involved in adopting this approach.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>11 pages, 6 figures</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Sharp Rates in Dependent Learning Theory: Avoiding Sample Size Deflation
+  for the Square Loss 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2402.05928v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2402.05928v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Ingvar Ziemann, Stephen Tu, George J. Pappas, Nikolai Matni
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  In this work, we study statistical learning with dependent ($\beta$-mixing)
+data and square loss in a hypothesis class $\mathscr{F}\subset L_{\Psi_p}$
+where $\Psi_p$ is the norm $\|f\|_{\Psi_p} \triangleq \sup_{m\geq 1} m^{-1/p}
+\|f\|_{L^m} $ for some $p\in [2,\infty]$. Our inquiry is motivated by the
+search for a sharp noise interaction term, or variance proxy, in learning with
+dependent data. Absent any realizability assumption, typical non-asymptotic
+results exhibit variance proxies that are deflated multiplicatively by the
+mixing time of the underlying covariates process. We show that whenever the
+topologies of $L^2$ and $\Psi_p$ are comparable on our hypothesis class
+$\mathscr{F}$ -- that is, $\mathscr{F}$ is a weakly sub-Gaussian class:
+$\|f\|_{\Psi_p} \lesssim \|f\|_{L^2}^\eta$ for some $\eta\in (0,1]$ -- the
+empirical risk minimizer achieves a rate that only depends on the complexity of
+the class and second order statistics in its leading term. Our result holds
+whether the problem is realizable or not and we refer to this as a \emph{near
+mixing-free rate}, since direct dependence on mixing is relegated to an
+additive higher order term. We arrive at our result by combining the above
+notion of a weakly sub-Gaussian class with mixed tail generic chaining. This
+combination allows us to compute sharp, instance-optimal rates for a wide range
+of problems. Examples that satisfy our framework include sub-Gaussian linear
+regression, more general smoothly parameterized function classes, finite
+hypothesis classes, and bounded smoothness classes.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ You Need to Pay Better Attention: Rethinking the Mathematics of
+  Attention Mechanism 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2403.01643v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2403.01643v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Mehran Hosseini, Peyman Hosseini
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Scaled Dot Product Attention (SDPA) is the backbone of many modern
+deep-learning models. It is so versatile that it has been used in natural
+language, vision, and multi-modal domains with very little change compared to
+its original formulation. This paper discusses why the current formulation is
+inefficient by delving into the mathematical details of the attention
+mechanism. We propose three improvements to mitigate these inefficiencies,
+thereby, introducing three enhanced attention mechanisms: Optimised, Efficient,
+and Super Attention. Optimised and Efficient Attention have one and two matrix
+multiplications fewer per head, respectively, and 25% and 50% fewer parameters,
+respectively, than standard SDPA, but perform similarly to standard SDPA in
+both vision and natural language tasks. They can be used in all applications
+where SDPA is used while offering smaller model sizes and faster training and
+inference without noticeable loss in performance. Super Attention introduces a
+new linear transformation on the values, transforming them from the left. It
+outperforms standard SPDA on vision and natural language tasks by up to 17%
+while having one fewer matrix multiplication per head and 25% fewer parameters
+than standard SDPA. Consequently, it is also faster than standard SDPA. Super
+Attention is ideal in applications where the attention layer's context length
+is fixed, such as Vision Transformers. In addition to providing mathematical
+reasoning, we evaluate the presented attention mechanisms on several datasets
+including MNIST, CIFAR100, ImageNet, IMDB Movie Reviews, and Amazon Reviews
+datasets, as well as combined Europarl and Anki English-Spanish datasets for
+neural machine translation.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Tag-LLM: Repurposing General-Purpose LLMs for Specialized Domains 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2402.05140v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2402.05140v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Junhong Shen, Neil Tenenholtz, James Brian Hall, David Alvarez-Melis, Nicolo Fusi
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Large Language Models (LLMs) have demonstrated remarkable proficiency in
+understanding and generating natural language. However, their capabilities wane
+in highly specialized domains underrepresented in the pretraining corpus, such
+as physical and biomedical sciences. This work explores how to repurpose
+general LLMs into effective task solvers for specialized domains. We introduce
+a novel, model-agnostic framework for learning custom input tags, which are
+parameterized as continuous vectors appended to the LLM's embedding layer, to
+condition the LLM. We design two types of input tags: domain tags are used to
+delimit specialized representations (e.g., chemical formulas) and provide
+domain-relevant context; function tags are used to represent specific functions
+(e.g., predicting molecular properties) and compress function-solving
+instructions. We develop a three-stage protocol to learn these tags using
+auxiliary data and domain knowledge. By explicitly disentangling task domains
+from task functions, our method enables zero-shot generalization to unseen
+problems through diverse combinations of the input tags. It also boosts LLM's
+performance in various specialized domains, such as predicting protein or
+chemical properties and modeling drug-target interactions, outperforming expert
+models tailored to these tasks.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Nonuniqueness and Convergence to Equivalent Solutions in Observer-based
+  Inverse Reinforcement Learning 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2210.16299v4">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2210.16299v4.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Jared Town, Zachary Morrison, Rushikesh Kamalapurkar
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  A key challenge in solving the deterministic inverse reinforcement learning
+(IRL) problem online and in real-time is the existence of multiple solutions.
+Nonuniqueness necessitates the study of the notion of equivalent solutions,
+i.e., solutions that result in a different cost functional but same feedback
+matrix, and convergence to such solutions. While offline algorithms that result
+in convergence to equivalent solutions have been developed in the literature,
+online, real-time techniques that address nonuniqueness are not available. In
+this paper, a regularized history stack observer that converges to
+approximately equivalent solutions of the IRL problem is developed. Novel
+data-richness conditions are developed to facilitate the analysis and
+simulation results are provided to demonstrate the effectiveness of the
+developed technique.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ A Provably Effective Method for Pruning Experts in Fine-tuned Sparse
+  Mixture-of-Experts 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.16646v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.16646v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Mohammed Nowaz Rabbani Chowdhury, Meng Wang, Kaoutar El Maghraoui, Naigang Wang, Pin-Yu Chen, Christopher Carothers
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  The sparsely gated mixture of experts (MoE) architecture sends different
+inputs to different subnetworks, i.e., experts, through trainable routers. MoE
+reduces the training computation significantly for large models, but its
+deployment can be still memory or computation expensive for some downstream
+tasks. Model pruning is a popular approach to reduce inference computation, but
+its application in MoE architecture is largely unexplored. To the best of our
+knowledge, this paper provides the first provably efficient technique for
+pruning experts in finetuned MoE models. We theoretically prove that
+prioritizing the pruning of the experts with a smaller change of the routers l2
+norm from the pretrained model guarantees the preservation of test accuracy,
+while significantly reducing the model size and the computational requirements.
+Although our theoretical analysis is centered on binary classification tasks on
+simplified MoE architecture, our expert pruning method is verified on large
+vision MoE models such as VMoE and E3MoE finetuned on benchmark datasets such
+as CIFAR10, CIFAR100, and ImageNet.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Oja's Algorithm for Sparse PCA 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2402.07240v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2402.07240v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Syamantak Kumar, Purnamrita Sarkar
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Oja's algorithm for streaming Principal Component Analysis (PCA) for $n$
+datapoints in a $d$ dimensional space achieves the same sin-squared error
+$O(r_\mathsf{eff}/n)$ as the offline algorithm in $O(d)$ space and $O(nd)$ time
+and a single pass through the datapoints. Here $r_\mathsf{eff}$ is the
+effective rank (ratio of the trace and the principal eigenvalue of the
+population covariance matrix $\Sigma$). Under this computational budget, we
+consider the problem of sparse PCA, where the principal eigenvector of $\Sigma$
+is $s$-sparse, and $r_\mathsf{eff}$ can be large. In this setting, to our
+knowledge, \textit{there are no known single-pass algorithms} that achieve the
+minimax error bound in $O(d)$ space and $O(nd)$ time without either requiring
+strong initialization conditions or assuming further structure (e.g., spiked)
+of the covariance matrix. We show that a simple single-pass procedure that
+thresholds the output of Oja's algorithm (the Oja vector) can achieve the
+minimax error bound under some regularity conditions in $O(d)$ space and
+$O(nd)$ time as long as $r_\mathsf{eff}=O(n/\log n)$. We present a nontrivial
+and novel analysis of the entries of the unnormalized Oja vector, which
+involves the projection of a product of independent random matrices on a random
+initial vector. This is completely different from previous analyses of Oja's
+algorithm and matrix products, which have been done when the $r_\mathsf{eff}$
+is bounded.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ MAP-Neo: Highly Capable and Transparent Bilingual Large Language Model
+  Series 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.19327v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.19327v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Ge Zhang, Scott Qu, Jiaheng Liu, Chenchen Zhang, Chenghua Lin, Chou Leuang Yu, Danny Pan, Esther Cheng, Jie Liu, Qunshu Lin, Raven Yuan, Tuney Zheng, Wei Pang, Xinrun Du, Yiming Liang, Yinghao Ma, Yizhi Li, Ziyang Ma, Bill Lin, Emmanouil Benetos, Huan Yang, Junting Zhou, Kaijing Ma, Minghao Liu, Morry Niu, Noah Wang, Quehry Que, Ruibo Liu, Sine Liu, Shawn Guo, Soren Gao, Wangchunshu Zhou, Xinyue Zhang, Yizhi Zhou, Yubo Wang, Yuelin Bai, Yuhan Zhang, Yuxiang Zhang, Zenith Wang, Zhenzhu Yang, Zijian Zhao, Jiajun Zhang, Wanli Ouyang, Wenhao Huang, Wenhu Chen
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Large Language Models (LLMs) have made great strides in recent years to
+achieve unprecedented performance across different tasks. However, due to
+commercial interest, the most competitive models like GPT, Gemini, and Claude
+have been gated behind proprietary interfaces without disclosing the training
+details. Recently, many institutions have open-sourced several strong LLMs like
+LLaMA-3, comparable to existing closed-source LLMs. However, only the model's
+weights are provided with most details (e.g., intermediate checkpoints,
+pre-training corpus, and training code, etc.) being undisclosed. To improve the
+transparency of LLMs, the research community has formed to open-source truly
+open LLMs (e.g., Pythia, Amber, OLMo), where more details (e.g., pre-training
+corpus and training code) are being provided. These models have greatly
+advanced the scientific study of these large models including their strengths,
+weaknesses, biases and risks. However, we observe that the existing truly open
+LLMs on reasoning, knowledge, and coding tasks are still inferior to existing
+state-of-the-art LLMs with similar model sizes. To this end, we open-source
+MAP-Neo, a highly capable and transparent bilingual language model with 7B
+parameters trained from scratch on 4.5T high-quality tokens. Our MAP-Neo is the
+first fully open-sourced bilingual LLM with comparable performance compared to
+existing state-of-the-art LLMs. Moreover, we open-source all details to
+reproduce our MAP-Neo, where the cleaned pre-training corpus, data cleaning
+pipeline, checkpoints, and well-optimized training/evaluation framework are
+provided. Finally, we hope our MAP-Neo will enhance and strengthen the open
+research community and inspire more innovations and creativities to facilitate
+the further improvements of LLMs.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>https://map-neo.github.io/</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ How to Leverage Diverse Demonstrations in Offline Imitation Learning <span class="chip">ICML</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.17476v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.17476v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Sheng Yue, Jiani Liu, Xingyuan Hua, Ju Ren, Sen Lin, Junshan Zhang, Yaoxue Zhang
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Offline Imitation Learning (IL) with imperfect demonstrations has garnered
+increasing attention owing to the scarcity of expert data in many real-world
+domains. A fundamental problem in this scenario is how to extract positive
+behaviors from noisy data. In general, current approaches to the problem select
+data building on state-action similarity to given expert demonstrations,
+neglecting precious information in (potentially abundant) $\textit{diverse}$
+state-actions that deviate from expert ones. In this paper, we introduce a
+simple yet effective data selection method that identifies positive behaviors
+based on their resultant states -- a more informative criterion enabling
+explicit utilization of dynamics information and effective extraction of both
+expert and beneficial diverse behaviors. Further, we devise a lightweight
+behavior cloning algorithm capable of leveraging the expert and selected data
+correctly. In the experiments, we evaluate our method on a suite of complex and
+high-dimensional offline IL benchmarks, including continuous-control and
+vision-based tasks. The results demonstrate that our method achieves
+state-of-the-art performance, outperforming existing methods on
+$\textbf{20/21}$ benchmarks, typically by $\textbf{2-5x}$, while maintaining a
+comparable runtime to Behavior Cloning ($\texttt{BC}$).
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>International Conference on Machine Learning (ICML)</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Expert Proximity as Surrogate Rewards for Single Demonstration Imitation
+  Learning <span class="chip">ICML 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2402.01057v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2402.01057v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Chia-Cheng Chiang, Li-Cheng Lan, Wei-Fang Sun, Chien Feng, Cho-Jui Hsieh, Chun-Yi Lee
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  In this paper, we focus on single-demonstration imitation learning (IL), a
+practical approach for real-world applications where acquiring multiple expert
+demonstrations is costly or infeasible and the ground truth reward function is
+not available. In contrast to typical IL settings with multiple demonstrations,
+single-demonstration IL involves an agent having access to only one expert
+trajectory. We highlight the issue of sparse reward signals in this setting and
+propose to mitigate this issue through our proposed Transition
+Discriminator-based IL (TDIL) method. TDIL is an IRL method designed to address
+reward sparsity by introducing a denser surrogate reward function that
+considers environmental dynamics. This surrogate reward function encourages the
+agent to navigate towards states that are proximal to expert states. In
+practice, TDIL trains a transition discriminator to differentiate between valid
+and non-valid transitions in a given environment to compute the surrogate
+rewards. The experiments demonstrate that TDIL outperforms existing IL
+approaches and achieves expert-level performance in the single-demonstration IL
+setting across five widely adopted MuJoCo benchmarks as well as the "Adroit
+Door" robotic environment.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Published at ICML 2024. Code: https://github.com/stanl1y/tdil</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Absolute Policy Optimization <span class="chip">ICML 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2310.13230v5">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2310.13230v5.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Weiye Zhao, Feihan Li, Yifan Sun, Rui Chen, Tianhao Wei, Changliu Liu
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  In recent years, trust region on-policy reinforcement learning has achieved
+impressive results in addressing complex control tasks and gaming scenarios.
+However, contemporary state-of-the-art algorithms within this category
+primarily emphasize improvement in expected performance, lacking the ability to
+control over the worst-case performance outcomes. To address this limitation,
+we introduce a novel objective function, optimizing which leads to guaranteed
+monotonic improvement in the lower probability bound of performance with high
+confidence. Building upon this groundbreaking theoretical advancement, we
+further introduce a practical solution called Absolute Policy Optimization
+(APO). Our experiments demonstrate the effectiveness of our approach across
+challenging continuous control benchmark tasks and extend its applicability to
+mastering Atari games. Our findings reveal that APO as well as its efficient
+variation Proximal Absolute Policy Optimization (PAPO) significantly
+outperforms state-of-the-art policy gradient algorithms, resulting in
+substantial improvements in worst-case performance, as well as expected
+performance.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>published in ICML 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ OLLIE: Imitation Learning from Offline <span class="highlight-title">Pretrain</span>ing to Online Finetuning <span class="chip">ICML</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.17477v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.17477v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Sheng Yue, Xingyuan Hua, Ju Ren, Sen Lin, Junshan Zhang, Yaoxue Zhang
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  In this paper, we study offline-to-online Imitation Learning (IL) that
+pretrains an imitation policy from static demonstration data, followed by fast
+finetuning with minimal environmental interaction. We find the na\"ive
+combination of existing offline IL and online IL methods tends to behave poorly
+in this context, because the initial discriminator (often used in online IL)
+operates randomly and discordantly against the policy initialization, leading
+to misguided policy optimization and $\textit{unlearning}$ of pretraining
+knowledge. To overcome this challenge, we propose a principled
+offline-to-online IL method, named $\texttt{OLLIE}$, that simultaneously learns
+a near-expert policy initialization along with an $\textit{aligned
+discriminator initialization}$, which can be seamlessly integrated into online
+IL, achieving smooth and fast finetuning. Empirically, $\texttt{OLLIE}$
+consistently and significantly outperforms the baseline methods in
+$\textbf{20}$ challenging tasks, from continuous control to vision-based
+domains, in terms of performance, demonstration efficiency, and convergence
+speed. This work may serve as a foundation for further exploration of
+pretraining and finetuning in the context of IL.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>International Conference on Machine Learning (ICML)</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Formalizing and Benchmarking <span class="highlight-title">Prompt</span> Injection Attacks and Defenses <span class="chip">USENIX Security</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2310.12815v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2310.12815v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Yupei Liu, Yuqi Jia, Runpeng Geng, Jinyuan Jia, Neil Zhenqiang Gong
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  A prompt injection attack aims to inject malicious instruction/data into the
+input of an LLM-Integrated Application such that it produces results as an
+attacker desires. Existing works are limited to case studies. As a result, the
+literature lacks a systematic understanding of prompt injection attacks and
+their defenses. We aim to bridge the gap in this work. In particular, we
+propose a framework to formalize prompt injection attacks. Existing attacks are
+special cases in our framework. Moreover, based on our framework, we design a
+new attack by combining existing ones. Using our framework, we conduct a
+systematic evaluation on 5 prompt injection attacks and 10 defenses with 10
+LLMs and 7 tasks. Our work provides a common benchmark for quantitatively
+evaluating future prompt injection attacks and defenses. To facilitate research
+on this topic, we make our platform public at
+https://github.com/liu00222/Open-Prompt-Injection.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>To appear in USENIX Security Symposium 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ WW-FL: Secure and Private Large-Scale Federated Learning 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2302.09904v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2302.09904v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Felix Marx, Thomas Schneider, Ajith Suresh, Tobias Wehrle, Christian Weinert, Hossein Yalame
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Federated learning (FL) is an efficient approach for large-scale distributed
+machine learning that promises data privacy by keeping training data on client
+devices. However, recent research has uncovered vulnerabilities in FL,
+impacting both security and privacy through poisoning attacks and the potential
+disclosure of sensitive information in individual model updates as well as the
+aggregated global model. This paper explores the inadequacies of existing FL
+protection measures when applied independently, and the challenges of creating
+effective compositions.
+  Addressing these issues, we propose WW-FL, an innovative framework that
+combines secure multi-party computation (MPC) with hierarchical FL to guarantee
+data and global model privacy. One notable feature of WW-FL is its capability
+to prevent malicious clients from directly poisoning model parameters,
+confining them to less destructive data poisoning attacks. We furthermore
+provide a PyTorch-based FL implementation integrated with Meta's CrypTen MPC
+framework to systematically measure the performance and robustness of WW-FL.
+Our extensive evaluation demonstrates that WW-FL is a promising solution for
+secure and private large-scale federated learning.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>WWFL combines private training and inference with secure aggregation
+  and hierarchical FL to provide end-to-end protection and to facilitate
+  large-scale global deployment</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ On the Last-Iterate Convergence of Shuffling Gradient Methods <span class="chip">ICML 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2403.07723v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2403.07723v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Zijian Liu, Zhengyuan Zhou
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Shuffling gradient methods are widely implemented in practice, particularly
+including three popular algorithms: Random Reshuffle (RR), Shuffle Once (SO),
+and Incremental Gradient (IG). Compared to the empirical success, the
+theoretical guarantee of shuffling gradient methods was not well-understood for
+a long time. Until recently, the convergence rates had just been established
+for the average iterate for convex functions and the last iterate for strongly
+convex problems (using squared distance as the metric). However, when using the
+function value gap as the convergence criterion, existing theories cannot
+interpret the good performance of the last iterate in different settings (e.g.,
+constrained optimization). To bridge this gap between practice and theory, we
+prove the first last-iterate convergence rates for shuffling gradient methods
+with respect to the objective value even without strong convexity. Our new
+results either (nearly) match the existing last-iterate lower bounds or are as
+fast as the previous best upper bounds for the average iterate.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>ICML 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Not All Experts are Equal: Efficient Expert Pruning and Skipping for
+  Mixture-of-Experts Large Language Models <span class="chip">ACL2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2402.14800v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2402.14800v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Xudong Lu, Qi Liu, Yuhui Xu, Aojun Zhou, Siyuan Huang, Bo Zhang, Junchi Yan, Hongsheng Li
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  A pivotal advancement in the progress of large language models (LLMs) is the
+emergence of the Mixture-of-Experts (MoE) LLMs. Compared to traditional LLMs,
+MoE LLMs can achieve higher performance with fewer parameters, but it is still
+hard to deploy them due to their immense parameter sizes. Different from
+previous weight pruning methods that rely on specifically designed hardware,
+this paper mainly aims to enhance the deployment efficiency of MoE LLMs by
+introducing plug-and-play expert-level sparsification techniques. Specifically,
+we propose, for the first time to our best knowledge, post-training approaches
+for task-agnostic and task-specific expert pruning and skipping of MoE LLMs,
+tailored to improve deployment efficiency while maintaining model performance
+across a wide range of tasks. Extensive experiments show that our proposed
+methods can simultaneously reduce model sizes and increase the inference speed,
+while maintaining satisfactory performance. Data and code will be available at
+https://github.com/Lucky-Lance/Expert_Sparsity.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Mixture-of-Experts Large Language Models, ACL2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Croissant: A Metadata Format for ML-Ready <span class="highlight-title">Dataset</span>s <span class="chip">SIGMOD</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2403.19546v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2403.19546v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Mubashara Akhtar, Omar Benjelloun, Costanza Conforti, Pieter Gijsbers, Joan Giner-Miguelez, Nitisha Jain, Michael Kuchnik, Quentin Lhoest, Pierre Marcenac, Manil Maskey, Peter Mattson, Luis Oala, Pierre Ruyssen, Rajat Shinde, Elena Simperl, Goeffry Thomas, Slava Tykhonov, Joaquin Vanschoren, Jos van der Velde, Steffen Vogler, Carole-Jean Wu
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Data is a critical resource for Machine Learning (ML), yet working with data
+remains a key friction point. This paper introduces Croissant, a metadata
+format for datasets that simplifies how data is used by ML tools and
+frameworks. Croissant makes datasets more discoverable, portable and
+interoperable, thereby addressing significant challenges in ML data management
+and responsible AI. Croissant is already supported by several popular dataset
+repositories, spanning hundreds of thousands of datasets, ready to be loaded
+into the most popular ML frameworks.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Published in Proceedings of ACM SIGMOD/PODS'24 Data Management for
+  End-to-End Machine Learning (DEEM) Workshop
+  https://dl.acm.org/doi/10.1145/3650203.3663326</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ AnalogCoder: Analog Circuit Design via Training-Free Code Generation 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.14918v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.14918v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Yao Lai, Sungyoung Lee, Guojin Chen, Souradip Poddar, Mengkang Hu, David Z. Pan, Ping Luo
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Analog circuit design is a significant task in modern chip technology,
+focusing on the selection of component types, connectivity, and parameters to
+ensure proper circuit functionality. Despite advances made by Large Language
+Models (LLMs) in digital circuit design, the complexity and scarcity of data in
+analog circuitry pose significant challenges. To mitigate these issues, we
+introduce AnalogCoder, the first training-free LLM agent for designing analog
+circuits through Python code generation. Firstly, AnalogCoder incorporates a
+feedback-enhanced flow with tailored domain-specific prompts, enabling the
+automated and self-correcting design of analog circuits with a high success
+rate. Secondly, it proposes a circuit tool library to archive successful
+designs as reusable modular sub-circuits, simplifying composite circuit
+creation. Thirdly, extensive experiments on a benchmark designed to cover a
+wide range of analog circuit tasks show that AnalogCoder outperforms other
+LLM-based methods. It has successfully designed 20 circuits, 5 more than
+standard GPT-4o. We believe AnalogCoder can significantly improve the
+labor-intensive chip design process, enabling non-experts to design analog
+circuits efficiently.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Machine Unlearning of <span class="highlight-title">Pre-train</span>ed Large Language Models <span class="chip">ACL 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2402.15159v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2402.15159v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Jin Yao, Eli Chien, Minxin Du, Xinyao Niu, Tianhao Wang, Zezhou Cheng, Xiang Yue
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  This study investigates the concept of the `right to be forgotten' within the
+context of large language models (LLMs). We explore machine unlearning as a
+pivotal solution, with a focus on pre-trained models--a notably
+under-researched area. Our research delineates a comprehensive framework for
+machine unlearning in pre-trained LLMs, encompassing a critical analysis of
+seven diverse unlearning methods. Through rigorous evaluation using curated
+datasets from arXiv, books, and GitHub, we establish a robust benchmark for
+unlearning performance, demonstrating that these methods are over $10^5$ times
+more computationally efficient than retraining. Our results show that
+integrating gradient ascent with gradient descent on in-distribution data
+improves hyperparameter robustness. We also provide detailed guidelines for
+efficient hyperparameter tuning in the unlearning process. Our findings advance
+the discourse on ethical AI practices, offering substantive insights into the
+mechanics of machine unlearning for pre-trained LLMs and underscoring the
+potential for responsible AI development.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>ACL 2024 main. Code and data at
+  https://github.com/yaojin17/Unlearning_LLM</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Zero-Shot Hierarchical Classification on the Common Procurement
+  Vocabulary Taxonomy 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.09983v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.09983v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Federico Moiraghi, Matteo Palmonari, Davide Allavena, Federico Morando
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Classifying public tenders is a useful task for both companies that are
+invited to participate and for inspecting fraudulent activities. To facilitate
+the task for both participants and public administrations, the European Union
+presented a common taxonomy (Common Procurement Vocabulary, CPV) which is
+mandatory for tenders of certain importance; however, the contracts in which a
+CPV label is mandatory are the minority compared to all the Public
+Administrations activities. Classifying over a real-world taxonomy introduces
+some difficulties that can not be ignored. First of all, some fine-grained
+classes have an insufficient (if any) number of observations in the training
+set, while other classes are far more frequent (even thousands of times) than
+the average. To overcome those difficulties, we present a zero-shot approach,
+based on a pre-trained language model that relies only on label description and
+respects the label taxonomy. To train our proposed model, we used industrial
+data, which comes from contrattipubblici.org, a service by SpazioDati s.r.l.
+that collects public contracts stipulated in Italy in the last 25 years.
+Results show that the proposed model achieves better performance in classifying
+low-frequent classes compared to three different baselines, and is also able to
+predict never-seen classes.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Full-length version of the short paper accepted at COMPSAC 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Characterizing Data Point Vulnerability via Average-Case Robustness <span class="chip">UAI 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2307.13885v5">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2307.13885v5.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Tessa Han, Suraj Srinivas, Himabindu Lakkaraju
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Studying the robustness of machine learning models is important to ensure
+consistent model behaviour across real-world settings. To this end, adversarial
+robustness is a standard framework, which views robustness of predictions
+through a binary lens: either a worst-case adversarial misclassification exists
+in the local region around an input, or it does not. However, this binary
+perspective does not account for the degrees of vulnerability, as data points
+with a larger number of misclassified examples in their neighborhoods are more
+vulnerable. In this work, we consider a complementary framework for robustness,
+called average-case robustness, which measures the fraction of points in a
+local region that provides consistent predictions. However, computing this
+quantity is hard, as standard Monte Carlo approaches are inefficient especially
+for high-dimensional inputs. In this work, we propose the first analytical
+estimators for average-case robustness for multi-class classifiers. We show
+empirically that our estimators are accurate and efficient for standard deep
+learning models and demonstrate their usefulness for identifying vulnerable
+data points, as well as quantifying robustness bias of models. Overall, our
+tools provide a complementary view to robustness, improving our ability to
+characterize model behaviour.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>UAI 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Learning Optimal Deterministic Policies with Stochastic Policy Gradients <span class="chip">ICML 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.02235v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.02235v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Alessandro Montenegro, Marco Mussi, Alberto Maria Metelli, Matteo Papini
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Policy gradient (PG) methods are successful approaches to deal with
+continuous reinforcement learning (RL) problems. They learn stochastic
+parametric (hyper)policies by either exploring in the space of actions or in
+the space of parameters. Stochastic controllers, however, are often undesirable
+from a practical perspective because of their lack of robustness, safety, and
+traceability. In common practice, stochastic (hyper)policies are learned only
+to deploy their deterministic version. In this paper, we make a step towards
+the theoretical understanding of this practice. After introducing a novel
+framework for modeling this scenario, we study the global convergence to the
+best deterministic policy, under (weak) gradient domination assumptions. Then,
+we illustrate how to tune the exploration level used for learning to optimize
+the trade-off between the sample complexity and the performance of the deployed
+deterministic policy. Finally, we quantitatively compare action-based and
+parameter-based exploration, giving a formal guise to intuitive results.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted to ICML 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Text clustering with LLM embeddings 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2403.15112v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2403.15112v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Alina Petukhova, João P. Matos-Carvalho, Nuno Fachada
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Text clustering is an important approach for organising the growing amount of
+digital content, helping to structure and find hidden patterns in uncategorised
+data. However, the effectiveness of text clustering heavily relies on the
+choice of textual embeddings and clustering algorithms. We argue that recent
+advances in large language models (LLMs) can potentially improve this task. In
+this research, we investigated how different textual embeddings -- particularly
+those used in LLMs -- and clustering algorithms affect how text datasets are
+clustered. A series of experiments were conducted to assess how embeddings
+influence clustering results, the role played by dimensionality reduction
+through summarisation, and model size adjustment. Findings reveal that LLM
+embeddings excel at capturing subtleties in structured language, while BERT
+leads the lightweight options in performance. In addition, we observe that
+increasing model dimensionality and employing summarization techniques do not
+consistently lead to improvements in clustering efficiency, suggesting that
+these strategies require careful analysis to use in real-life models. These
+results highlight a complex balance between the need for refined text
+representation and computational feasibility in text clustering applications.
+This study extends traditional text clustering frameworks by incorporating
+embeddings from LLMs, providing a path for improved methodologies, while
+informing new avenues for future research in various types of textual analysis.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Neural Optimal Transport with General Cost Functionals 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2205.15403v4">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2205.15403v4.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Arip Asadulaev, Alexander Korotin, Vage Egiazarian, Petr Mokrov, Evgeny Burnaev
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  We introduce a novel neural network-based algorithm to compute optimal
+transport (OT) plans for general cost functionals. In contrast to common
+Euclidean costs, i.e., $\ell^1$ or $\ell^2$, such functionals provide more
+flexibility and allow using auxiliary information, such as class labels, to
+construct the required transport map. Existing methods for general costs are
+discrete and have limitations in practice, i.e. they do not provide an
+out-of-sample estimation. We address the challenge of designing a continuous OT
+approach for general costs that generalizes to new data points in
+high-dimensional spaces, such as images. Additionally, we provide the
+theoretical error analysis for our recovered transport plans. As an
+application, we construct a cost functional to map data distributions while
+preserving the class-wise structure.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ A2PO: Towards Effective Offline Reinforcement Learning from an
+  Advantage-aware Perspective 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2403.07262v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2403.07262v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Yunpeng Qing, Shunyu liu, Jingyuan Cong, Kaixuan Chen, Yihe Zhou, Mingli Song
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Offline reinforcement learning endeavors to leverage offline datasets to
+craft effective agent policy without online interaction, which imposes proper
+conservative constraints with the support of behavior policies to tackle the
+out-of-distribution problem. However, existing works often suffer from the
+constraint conflict issue when offline datasets are collected from multiple
+behavior policies, i.e., different behavior policies may exhibit inconsistent
+actions with distinct returns across the state space. To remedy this issue,
+recent advantage-weighted methods prioritize samples with high advantage values
+for agent training while inevitably ignoring the diversity of behavior policy.
+In this paper, we introduce a novel Advantage-Aware Policy Optimization (A2PO)
+method to explicitly construct advantage-aware policy constraints for offline
+learning under mixed-quality datasets. Specifically, A2PO employs a conditional
+variational auto-encoder to disentangle the action distributions of intertwined
+behavior policies by modeling the advantage values of all training data as
+conditional variables. Then the agent can follow such disentangled action
+distribution constraints to optimize the advantage-aware policy towards high
+advantage values. Extensive experiments conducted on both the single-quality
+and mixed-quality datasets of the D4RL benchmark demonstrate that A2PO yields
+results superior to the counterparts. Our code will be made publicly available.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Trust-based Consensus in Multi-Agent Reinforcement Learning Systems 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2205.12880v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2205.12880v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Ho Long Fung, Victor-Alexandru Darvariu, Stephen Hailes, Mirco Musolesi
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  An often neglected issue in multi-agent reinforcement learning (MARL) is the
+potential presence of unreliable agents in the environment whose deviations
+from expected behavior can prevent a system from accomplishing its intended
+tasks. In particular, consensus is a fundamental underpinning problem of
+cooperative distributed multi-agent systems. Consensus requires different
+agents, situated in a decentralized communication network, to reach an
+agreement out of a set of initial proposals that they put forward.
+Learning-based agents should adopt a protocol that allows them to reach
+consensus despite having one or more unreliable agents in the system. This
+paper investigates the problem of unreliable agents in MARL, considering
+consensus as a case study. Echoing established results in the distributed
+systems literature, our experiments show that even a moderate fraction of such
+agents can greatly impact the ability of reaching consensus in a networked
+environment. We propose Reinforcement Learning-based Trusted Consensus (RLTC),
+a decentralized trust mechanism, in which agents can independently decide which
+neighbors to communicate with. We empirically demonstrate that our trust
+mechanism is able to handle unreliable agents effectively, as evidenced by
+higher consensus success rates.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted for publication in proceedings of the first Reinforcement
+  Learning Conference (RLC 2024)</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ PV-Tuning: Beyond Straight-Through Estimation for Extreme LLM
+  Compression 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.14852v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.14852v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Vladimir Malinovskii, Denis Mazur, Ivan Ilin, Denis Kuznedelev, Konstantin Burlachenko, Kai Yi, Dan Alistarh, Peter Richtarik
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  There has been significant interest in "extreme" compression of large
+language models (LLMs), i.e., to 1-2 bits per parameter, which allows such
+models to be executed efficiently on resource-constrained devices. Existing
+work focused on improved one-shot quantization techniques and weight
+representations; yet, purely post-training approaches are reaching diminishing
+returns in terms of the accuracy-vs-bit-width trade-off. State-of-the-art
+quantization methods such as QuIP# and AQLM include fine-tuning (part of) the
+compressed parameters over a limited amount of calibration data; however, such
+fine-tuning techniques over compressed weights often make exclusive use of
+straight-through estimators (STE), whose performance is not well-understood in
+this setting. In this work, we question the use of STE for extreme LLM
+compression, showing that it can be sub-optimal, and perform a systematic study
+of quantization-aware fine-tuning strategies for LLMs. We propose PV-Tuning - a
+representation-agnostic framework that generalizes and improves upon existing
+fine-tuning strategies, and provides convergence guarantees in restricted
+cases. On the practical side, when used for 1-2 bit vector quantization,
+PV-Tuning outperforms prior techniques for highly-performant models such as
+Llama and Mistral. Using PV-Tuning, we achieve the first Pareto-optimal
+quantization for Llama 2 family models at 2 bits per parameter.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Preprint</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Understanding Adam Optimizer via Online Learning of Updates: Adam is
+  FTRL in Disguise <span class="chip">ICML 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2402.01567v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2402.01567v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Kwangjun Ahn, Zhiyu Zhang, Yunbum Kook, Yan Dai
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Despite the success of the Adam optimizer in practice, the theoretical
+understanding of its algorithmic components still remains limited. In
+particular, most existing analyses of Adam show the convergence rate that can
+be simply achieved by non-adative algorithms like SGD. In this work, we provide
+a different perspective based on online learning that underscores the
+importance of Adam's algorithmic components. Inspired by Cutkosky et al.
+(2023), we consider the framework called online learning of updates/increments,
+where we choose the updates/increments of an optimizer based on an online
+learner. With this framework, the design of a good optimizer is reduced to the
+design of a good online learner. Our main observation is that Adam corresponds
+to a principled online learning framework called Follow-the-Regularized-Leader
+(FTRL). Building on this observation, we study the benefits of its algorithmic
+components from the online learning perspective.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted at ICML 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ On the Representational Capacity of Recurrent Neural Language Models 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2310.12942v5">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2310.12942v5.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Franz Nowak, Anej Svete, Li Du, Ryan Cotterell
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  This work investigates the computational expressivity of language models
+(LMs) based on recurrent neural networks (RNNs). Siegelmann and Sontag (1992)
+famously showed that RNNs with rational weights and hidden states and unbounded
+computation time are Turing complete. However, LMs define weightings over
+strings in addition to just (unweighted) language membership and the analysis
+of the computational power of RNN LMs (RLMs) should reflect this. We extend the
+Turing completeness result to the probabilistic case, showing how a rationally
+weighted RLM with unbounded computation time can simulate any deterministic
+probabilistic Turing machine (PTM) with rationally weighted transitions. Since,
+in practice, RLMs work in real-time, processing a symbol at every time step, we
+treat the above result as an upper bound on the expressivity of RLMs. We also
+provide a lower bound by showing that under the restriction to real-time
+computation, such models can simulate deterministic real-time rational PTMs.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Added requirement for non-negative probabilities to definitions 2.3
+  and 3.1, fixed typos</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Identifying Drivers of Predictive Aleatoric Uncertainty 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2312.07252v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2312.07252v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Pascal Iversen, Simon Witzke, Katharina Baum, Bernhard Y. Renard
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Explainability and uncertainty quantification are two pillars of trustable
+artificial intelligence. However, the reasoning behind uncertainty estimates is
+generally left unexplained. Identifying the drivers of uncertainty complements
+explanations of point predictions in recognizing model limitations and enhances
+trust in decisions and their communication. So far, explanations of
+uncertainties have been rarely studied. The few exceptions rely on Bayesian
+neural networks or technically intricate approaches, such as auxiliary
+generative models, thereby hindering their broad adoption. We present a simple
+approach to explain predictive aleatoric uncertainties. We estimate uncertainty
+as predictive variance by adapting a neural network with a Gaussian output
+distribution. Subsequently, we apply out-of-the-box explainers to the model's
+variance output. This approach can explain uncertainty influences more reliably
+than literature baselines, which we evaluate in a synthetic setting with a
+known data-generating process. We further adapt multiple metrics from
+conventional XAI research to uncertainty explanations. We quantify our findings
+with a nuanced benchmark analysis that includes real-world datasets. Finally,
+we apply our approach to an age regression model and discover reasonable
+sources of uncertainty. Overall, we explain uncertainty estimates with little
+modifications to the model architecture and demonstrate that our approach
+competes effectively with more intricate methods.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Simon Witzke and Pascal Iversen contributed equally</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Simultaneous identification of models and parameters of scientific
+  simulators 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2305.15174v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2305.15174v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Cornelius Schröder, Jakob H. Macke
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Many scientific models are composed of multiple discrete components, and
+scientists often make heuristic decisions about which components to include.
+Bayesian inference provides a mathematical framework for systematically
+selecting model components, but defining prior distributions over model
+components and developing associated inference schemes has been challenging. We
+approach this problem in a simulation-based inference framework: We define
+model priors over candidate components and, from model simulations, train
+neural networks to infer joint probability distributions over both model
+components and associated parameters. Our method, simulation-based model
+inference (SBMI), represents distributions over model components as a
+conditional mixture of multivariate binary distributions in the Grassmann
+formalism. SBMI can be applied to any compositional stochastic simulator
+without requiring likelihood evaluations. We evaluate SBMI on a simple time
+series model and on two scientific models from neuroscience, and show that it
+can discover multiple data-consistent model configurations, and that it reveals
+non-identifiable model components and parameters. SBMI provides a powerful tool
+for data-driven scientific inquiry which will allow scientists to identify
+essential model components and make uncertainty-informed modelling decisions.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Interpretable classifiers for tabular data via discretization and
+  feature selection 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2402.05680v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2402.05680v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Reijo Jaakkola, Tomi Janhunen, Antti Kuusisto, Masood Feyzbakhsh Rankooh, Miikka Vilander
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  We introduce a method for computing immediately human interpretable yet
+accurate classifiers from tabular data. The classifiers obtained are short
+Boolean formulas, computed via first discretizing the original data and then
+using feature selection coupled with a very fast algorithm for producing the
+best possible Boolean classifier for the setting. We demonstrate the approach
+via 13 experiments, obtaining results with accuracies comparable to ones
+obtained via random forests, XGBoost, and existing results for the same
+datasets in the literature. In most cases, the accuracy of our method is in
+fact similar to that of the reference methods, even though the main objective
+of our study is the immediate interpretability of our classifiers. We also
+prove a new result on the probability that the classifier we obtain from
+real-life data corresponds to the ideally best classifier with respect to the
+background distribution the data comes from.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Changes in relation to version 1: more thorough and detailed
+  experiments, general corrections and refinements</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Uncertainty of Thoughts: Uncertainty-Aware Planning Enhances Information
+  Seeking in Large Language Models 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2402.03271v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2402.03271v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Zhiyuan Hu, Chumin Liu, Xidong Feng, Yilun Zhao, See-Kiong Ng, Anh Tuan Luu, Junxian He, Pang Wei Koh, Bryan Hooi
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  In the face of uncertainty, the ability to *seek information* is of
+fundamental importance. In many practical applications, such as medical
+diagnosis and troubleshooting, the information needed to solve the task is not
+initially given and has to be actively sought by asking follow-up questions
+(for example, a doctor asking a patient for more details about their symptoms).
+In this work, we introduce Uncertainty of Thoughts (UoT), an algorithm to
+augment large language models with the ability to actively seek information by
+asking effective questions. UoT combines 1) an *uncertainty-aware simulation
+approach* which enables the model to simulate possible future scenarios and how
+likely they are to occur, 2) *uncertainty-based rewards* motivated by
+information gain which incentivizes the model to seek information, and 3) a
+*reward propagation scheme* to select the optimal question to ask in a way that
+maximizes the expected reward. In experiments on medical diagnosis,
+troubleshooting, and the `20 Questions` game, UoT achieves an average
+performance improvement of 38.1% in the rate of successful task completion
+across multiple LLMs compared with direct prompting and also improves
+efficiency (i.e., the number of questions needed to complete the task). Our
+code has been released [here](https://github.com/zhiyuanhubj/UoT)
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Update Results</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ An Efficient and Multi-private Key Secure Aggregation for Federated
+  Learning 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2306.08970v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2306.08970v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Xue Yang, Zifeng Liu, Xiaohu Tang, Rongxing Lu, Bo Liu
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  With the emergence of privacy leaks in federated learning, secure aggregation
+protocols that mainly adopt either homomorphic encryption or threshold secret
+sharing have been widely developed for federated learning to protect the
+privacy of the local training data of each client. However, these existing
+protocols suffer from many shortcomings, such as the dependence on a trusted
+third party, the vulnerability to clients being corrupted, low efficiency, the
+trade-off between security and fault tolerance, etc. To solve these
+disadvantages, we propose an efficient and multi-private key secure aggregation
+scheme for federated learning. Specifically, we skillfully modify the variant
+ElGamal encryption technique to achieve homomorphic addition operation, which
+has two important advantages: 1) The server and each client can freely select
+public and private keys without introducing a trust third party and 2) Compared
+to the variant ElGamal encryption, the plaintext space is relatively large,
+which is more suitable for the deep model. Besides, for the high dimensional
+deep model parameter, we introduce a super-increasing sequence to compress
+multi-dimensional data into 1-D, which can greatly reduce encryption and
+decryption times as well as communication for ciphertext transmission. Detailed
+security analyses show that our proposed scheme achieves the semantic security
+of both individual local gradients and the aggregated result while achieving
+optimal robustness in tolerating both client collusion and dropped clients.
+Extensive simulations demonstrate that the accuracy of our scheme is almost the
+same as the non-private approach, while the efficiency of our scheme is much
+better than the state-of-the-art homomorphic encryption-based secure
+aggregation schemes. More importantly, the efficiency advantages of our scheme
+will become increasingly prominent as the number of model parameters increases.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Algebraic and Statistical Properties of the Ordinary Least Squares
+  Interpolator 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2309.15769v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2309.15769v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Dennis Shen, Dogyoon Song, Peng Ding, Jasjeet S. Sekhon
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Deep learning research has uncovered the phenomenon of benign overfitting for
+overparameterized statistical models, which has drawn significant theoretical
+interest in recent years. Given its simplicity and practicality, the ordinary
+least squares (OLS) interpolator has become essential to gain foundational
+insights into this phenomenon. While properties of OLS are well established in
+classical, underparameterized settings, its behavior in high-dimensional,
+overparameterized regimes is less explored (unlike for ridge or lasso
+regression) though significant progress has been made of late. We contribute to
+this growing literature by providing fundamental algebraic and statistical
+results for the minimum $\ell_2$-norm OLS interpolator. In particular, we
+provide algebraic equivalents of (i) the leave-$k$-out residual formula, (ii)
+Cochran's formula, and (iii) the Frisch-Waugh-Lovell theorem in the
+overparameterized regime. These results aid in understanding the OLS
+interpolator's ability to generalize and have substantive implications for
+causal inference. Under the Gauss-Markov model, we present statistical results
+such as an extension of the Gauss-Markov theorem and an analysis of variance
+estimation under homoskedastic errors for the overparameterized regime. To
+substantiate our theoretical contributions, we conduct simulations that further
+explore the stochastic properties of the OLS interpolator.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ CaRiNG: Learning Temporal Causal Representation under Non-Invertible
+  Generation Process <span class="chip">ICML 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2401.14535v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2401.14535v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Guangyi Chen, Yifan Shen, Zhenhao Chen, Xiangchen Song, Yuewen Sun, Weiran Yao, Xiao Liu, Kun Zhang
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Identifying the underlying time-delayed latent causal processes in sequential
+data is vital for grasping temporal dynamics and making downstream reasoning.
+While some recent methods can robustly identify these latent causal variables,
+they rely on strict assumptions about the invertible generation process from
+latent variables to observed data. However, these assumptions are often hard to
+satisfy in real-world applications containing information loss. For instance,
+the visual perception process translates a 3D space into 2D images, or the
+phenomenon of persistence of vision incorporates historical data into current
+perceptions. To address this challenge, we establish an identifiability theory
+that allows for the recovery of independent latent components even when they
+come from a nonlinear and non-invertible mix. Using this theory as a
+foundation, we propose a principled approach, CaRiNG, to learn the CAusal
+RepresentatIon of Non-invertible Generative temporal data with identifiability
+guarantees. Specifically, we utilize temporal context to recover lost latent
+information and apply the conditions in our theory to guide the training
+process. Through experiments conducted on synthetic datasets, we validate that
+our CaRiNG method reliably identifies the causal process, even when the
+generation process is non-invertible. Moreover, we demonstrate that our
+approach considerably improves temporal understanding and reasoning in
+practical applications.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>To appear at ICML 2024, 24 pages</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Prismatic VLMs: Investigating the Design Space of Visually-Conditioned
+  Language Models <span class="chip">ICML 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2402.07865v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2402.07865v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Siddharth Karamcheti, Suraj Nair, Ashwin Balakrishna, Percy Liang, Thomas Kollar, Dorsa Sadigh
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Visually-conditioned language models (VLMs) have seen growing adoption in
+applications such as visual dialogue, scene understanding, and robotic task
+planning; adoption that has fueled a wealth of new models such as LLaVa,
+InstructBLIP, and PaLI-3. Despite the volume of new releases, key design
+decisions around image preprocessing, architecture, and optimization are
+under-explored, making it challenging to understand what factors account for
+model performance $-$ a challenge further complicated by the lack of objective,
+consistent evaluations. To address these gaps, we first compile a suite of
+standardized evaluations spanning visual question answering, object
+localization, and challenge sets that probe properties such as hallucination;
+evaluations that provide fine-grained insight VLM capabilities. Second, we
+rigorously investigate VLMs along key design axes, including pretrained visual
+representations and training from base vs. instruct-tuned language models,
+amongst others. We couple our analysis with three resource contributions: (1) a
+unified framework for evaluating VLMs, (2) optimized, flexible training code,
+and (3) checkpoints for all models, including a family of VLMs at the 7-13B
+scale that strictly outperform InstructBLIP and LLaVa v1.5, the
+state-of-the-art in open VLMs.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Published at ICML 2024. 22 pages, 11 figures. Training code and
+  models: https://github.com/TRI-ML/prismatic-vlms. Evaluation code:
+  https://github.com/TRI-ML/vlm-evaluation</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Drug Discovery with Dynamic Goal-aware Fragments <span class="chip">ICML 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2310.00841v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2310.00841v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Seul Lee, Seanie Lee, Kenji Kawaguchi, Sung Ju Hwang
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Fragment-based drug discovery is an effective strategy for discovering drug
+candidates in the vast chemical space, and has been widely employed in
+molecular generative models. However, many existing fragment extraction methods
+in such models do not take the target chemical properties into account or rely
+on heuristic rules. Additionally, the existing fragment-based generative models
+cannot update the fragment vocabulary with goal-aware fragments newly
+discovered during the generation. To this end, we propose a molecular
+generative framework for drug discovery, named Goal-aware fragment Extraction,
+Assembly, and Modification (GEAM). GEAM consists of three modules, each
+responsible for goal-aware fragment extraction, fragment assembly, and fragment
+modification. The fragment extraction module identifies important fragments
+contributing to the desired target properties with the information bottleneck
+principle, thereby constructing an effective goal-aware fragment vocabulary.
+Moreover, GEAM can explore beyond the initial vocabulary with the fragment
+modification module, and the exploration is further enhanced through the
+dynamic goal-aware vocabulary update. We experimentally demonstrate that GEAM
+effectively discovers drug candidates through the generative cycle of the three
+modules in various drug discovery tasks. Our code is available at
+https://github.com/SeulLee05/GEAM.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>ICML 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Meta-Task Planning for Language Agents 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.16510v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.16510v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Cong Zhang, Derrick Goh Xin Deik, Dexun Li, Hao Zhang, Yong Liu
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  The rapid advancement of neural language models has sparked a new surge of
+intelligent agent research. Unlike traditional agents, large language
+model-based agents (LLM agents) have emerged as a promising paradigm for
+achieving artificial general intelligence (AGI) due to their superior reasoning
+and generalization capabilities. Effective planning is crucial for the success
+of LLM agents in real-world tasks, making it a highly pursued topic in the
+community. Current planning methods typically translate tasks into executable
+action sequences. However, determining a feasible or optimal sequence for
+complex tasks at fine granularity, which often requires compositing long chains
+of heterogeneous actions, remains challenging. This paper introduces Meta-Task
+Planning (MTP), a zero-shot methodology for collaborative LLM-based multi-agent
+systems that simplifies complex task planning by decomposing it into a
+hierarchy of subordinate tasks, or meta-tasks. Each meta-task is then mapped
+into executable actions. MTP was assessed on two rigorous benchmarks,
+TravelPlanner and API-Bank. Notably, MTP achieved an average $\sim40\%$ success
+rate on TravelPlanner, significantly higher than the state-of-the-art (SOTA)
+baseline ($2.92\%$), and outperforming $LLM_{api}$-4 with ReAct on API-Bank by
+$\sim14\%$, showing the immense potential of integrating LLM with multi-agent
+systems.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Towards Real World Debiasing: A Fine-grained Analysis On Spurious
+  Correlation 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.15240v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.15240v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Zhibo Wang, Peng Kuang, Zhixuan Chu, Jingyi Wang, Kui Ren
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Spurious correlations in training data significantly hinder the
+generalization capability of machine learning models when faced with
+distribution shifts in real-world scenarios. To tackle the problem, numerous
+debias approaches have been proposed and benchmarked on datasets intentionally
+designed with severe biases. However, it remains to be asked: \textit{1. Do
+existing benchmarks really capture biases in the real world? 2. Can existing
+debias methods handle biases in the real world?} To answer the questions, we
+revisit biased distributions in existing benchmarks and real-world datasets,
+and propose a fine-grained framework for analyzing dataset bias by
+disentangling it into the magnitude and prevalence of bias. We observe and
+theoretically demonstrate that existing benchmarks poorly represent real-world
+biases. We further introduce two novel biased distributions to bridge this gap,
+forming a nuanced evaluation framework for real-world debiasing. Building upon
+these results, we evaluate existing debias methods with our evaluation
+framework. Results show that existing methods are incapable of handling
+real-world biases. Through in-depth analysis, we propose a simple yet effective
+approach that can be easily applied to existing debias methods, named Debias in
+Destruction (DiD). Empirical results demonstrate the superiority of DiD,
+improving the performance of existing methods on all types of biases within the
+proposed evaluation framework.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>9 pages of main paper, 10 pages of appendix</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ DataSP: A Differential All-to-All Shortest Path Algorithm for Learning
+  Costs and Predicting Paths with Context 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.04923v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.04923v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Alan A. Lahoud, Erik Schaffernicht, Johannes A. Stork
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Learning latent costs of transitions on graphs from trajectories
+demonstrations under various contextual features is challenging but useful for
+path planning. Yet, existing methods either oversimplify cost assumptions or
+scale poorly with the number of observed trajectories. This paper introduces
+DataSP, a differentiable all-to-all shortest path algorithm to facilitate
+learning latent costs from trajectories. It allows to learn from a large number
+of trajectories in each learning step without additional computation. Complex
+latent cost functions from contextual features can be represented in the
+algorithm through a neural network approximation. We further propose a method
+to sample paths from DataSP in order to reconstruct/mimic observed paths'
+distributions. We prove that the inferred distribution follows the maximum
+entropy principle. We show that DataSP outperforms state-of-the-art
+differentiable combinatorial solver and classical machine learning approaches
+in predicting paths on graphs.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Unity by Diversity: Improved Representation Learning in Multimodal VAEs 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2403.05300v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2403.05300v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Thomas M. Sutter, Yang Meng, Andrea Agostini, Daphné Chopard, Norbert Fortin, Julia E. Vogt, Bahbak Shahbaba, Stephan Mandt
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Variational Autoencoders for multimodal data hold promise for many tasks in
+data analysis, such as representation learning, conditional generation, and
+imputation. Current architectures either share the encoder output, decoder
+input, or both across modalities to learn a shared representation. Such
+architectures impose hard constraints on the model. In this work, we show that
+a better latent representation can be obtained by replacing these hard
+constraints with a soft constraint. We propose a new mixture-of-experts prior,
+softly guiding each modality's latent representation towards a shared aggregate
+posterior. This approach results in a superior latent representation and allows
+each encoding to preserve information better from its uncompressed original
+features. In extensive experiments on multiple benchmark datasets and two
+challenging real-world datasets, we show improved learned latent
+representations and imputation of missing data modalities compared to existing
+methods.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Achievable Fairness on Your Data With Utility Guarantees 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2402.17106v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2402.17106v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Muhammad Faaiz Taufiq, Jean-Francois Ton, Yang Liu
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  In machine learning fairness, training models that minimize disparity across
+different sensitive groups often leads to diminished accuracy, a phenomenon
+known as the fairness-accuracy trade-off. The severity of this trade-off
+inherently depends on dataset characteristics such as dataset imbalances or
+biases and therefore, using a uniform fairness requirement across diverse
+datasets remains questionable. To address this, we present a computationally
+efficient approach to approximate the fairness-accuracy trade-off curve
+tailored to individual datasets, backed by rigorous statistical guarantees. By
+utilizing the You-Only-Train-Once (YOTO) framework, our approach mitigates the
+computational burden of having to train multiple models when approximating the
+trade-off curve. Crucially, we introduce a novel methodology for quantifying
+uncertainty in our estimates, thereby providing practitioners with a robust
+framework for auditing model fairness while avoiding false conclusions due to
+estimation errors. Our experiments spanning tabular (e.g., Adult), image
+(CelebA), and language (Jigsaw) datasets underscore that our approach not only
+reliably quantifies the optimum achievable trade-offs across various data
+modalities but also helps detect suboptimality in SOTA fairness methods.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Training-Free Consistent Text-to-Image Generation <span class="chip">SIGGRAPH 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2402.03286v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2402.03286v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Yoad Tewel, Omri Kaduri, Rinon Gal, Yoni Kasten, Lior Wolf, Gal Chechik, Yuval Atzmon
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Text-to-image models offer a new level of creative flexibility by allowing
+users to guide the image generation process through natural language. However,
+using these models to consistently portray the same subject across diverse
+prompts remains challenging. Existing approaches fine-tune the model to teach
+it new words that describe specific user-provided subjects or add image
+conditioning to the model. These methods require lengthy per-subject
+optimization or large-scale pre-training. Moreover, they struggle to align
+generated images with text prompts and face difficulties in portraying multiple
+subjects. Here, we present ConsiStory, a training-free approach that enables
+consistent subject generation by sharing the internal activations of the
+pretrained model. We introduce a subject-driven shared attention block and
+correspondence-based feature injection to promote subject consistency between
+images. Additionally, we develop strategies to encourage layout diversity while
+maintaining subject consistency. We compare ConsiStory to a range of baselines,
+and demonstrate state-of-the-art performance on subject consistency and text
+alignment, without requiring a single optimization step. Finally, ConsiStory
+can naturally extend to multi-subject scenarios, and even enable training-free
+personalization for common objects.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted to journal track of SIGGRAPH 2024 (TOG). Project page is at
+  https://consistory-paper.github.io</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Lookahead: An Inference Acceleration Framework for Large Language Model
+  with Lossless Generation Accuracy 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2312.12728v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2312.12728v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Yao Zhao, Zhitian Xie, Chen Liang, Chenyi Zhuang, Jinjie Gu
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  As Large Language Models (LLMs) have made significant advancements across
+various tasks, such as question answering, translation, text summarization, and
+dialogue systems, the need for accuracy in information becomes crucial,
+especially for serious financial products serving billions of users like
+Alipay. However, for a real-world product serving millions of users, the
+inference speed of LLMs becomes a critical factor compared to a mere
+experimental model.
+  Hence, this paper presents a generic framework for accelerating the inference
+process, resulting in a substantial increase in speed and cost reduction for
+our LLM-based scenarios, with lossless generation accuracy. In the traditional
+inference process, each token is generated sequentially by the LLM, leading to
+a time consumption proportional to the number of generated tokens. To enhance
+this process, our framework, named \textit{lookahead}, introduces a
+\textit{multi-branch} strategy. Instead of generating a single token at a time,
+we propose a Trie-based retrieval and verification mechanism to be able to
+accept several tokens at a forward step. Our strategy offers two distinct
+advantages: (1) it guarantees absolute correctness of the output, avoiding any
+approximation algorithms, and (2) the worst-case performance of our approach is
+equivalent to the conventional process. We conduct extensive experiments to
+demonstrate the significant improvements achieved by applying our inference
+acceleration framework. Our framework is widely deployed in Alipay since April
+2023, and obtain remarkable 2.66x to 6.26x speedup. Our code is available at
+https://github.com/alipay/PainlessInferenceAcceleration.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>10 pages, 6 figures</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ FedSheafHN: Personalized Federated Learning on Graph-structured Data <span class="chip">ICML 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.16056v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.16056v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Wenfei Liang, Yanan Zhao, Rui She, Yiming Li, Wee Peng Tay
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Personalized subgraph Federated Learning (FL) is a task that customizes Graph
+Neural Networks (GNNs) to individual client needs, accommodating diverse data
+distributions. However, applying hypernetworks in FL, while aiming to
+facilitate model personalization, often encounters challenges due to inadequate
+representation of client-specific characteristics. To overcome these
+limitations, we propose a model called FedSheafHN, using enhanced collaboration
+graph embedding and efficient personalized model parameter generation.
+Specifically, our model embeds each client's local subgraph into a
+server-constructed collaboration graph. We utilize sheaf diffusion in the
+collaboration graph to learn client representations. Our model improves the
+integration and interpretation of complex client characteristics. Furthermore,
+our model ensures the generation of personalized models through advanced
+hypernetworks optimized for parallel operations across clients. Empirical
+evaluations demonstrate that FedSheafHN outperforms existing methods in most
+scenarios, in terms of client model performance on various graph-structured
+datasets. It also has fast model convergence and effective new clients
+generalization.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>This paper was submitted to ICML 2024 in Feb 2024. You can find a
+  record
+  here:https://github.com/CarrieWFF/ICML-2024-submission-recording/blob/main/Screenshot%20of%20FedSheafHN%20submission%20to%20ICML%202024.png</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Spectral Truncation Kernels: Noncommutativity in $C^*$-algebraic Kernel
+  Machines 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.17823v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.17823v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Yuka Hashimoto, Ayoub Hafid, Masahiro Ikeda, Hachem Kadri
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  In this paper, we propose a new class of positive definite kernels based on
+the spectral truncation, which has been discussed in the fields of
+noncommutative geometry and $C^*$-algebra. We focus on kernels whose inputs and
+outputs are functions and generalize existing kernels, such as polynomial,
+product, and separable kernels, by introducing a truncation parameter $n$ that
+describes the noncommutativity of the products appearing in the kernels. When
+$n$ goes to infinity, the proposed kernels tend to the existing commutative
+kernels. If $n$ is finite, they exhibit different behavior, and the
+noncommutativity induces interactions along the data function domain. We show
+that the truncation parameter $n$ is a governing factor leading to performance
+enhancement: by setting an appropriate $n$, we can balance the representation
+power and the complexity of the representation space. The flexibility of the
+proposed class of kernels allows us to go beyond previous commutative kernels.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ TensorKrowch: Smooth integration of tensor networks in machine learning 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2306.08595v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2306.08595v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        José Ramón Pareja Monturiol, David Pérez-García, Alejandro Pozas-Kerstjens
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Tensor networks are factorizations of high-dimensional tensors into networks
+of smaller tensors. They have applications in physics and mathematics, and
+recently have been proposed as promising machine learning architectures. To
+ease the integration of tensor networks in machine learning pipelines, we
+introduce TensorKrowch, an open source Python library built on top of PyTorch.
+Providing a user-friendly interface, TensorKrowch allows users to construct any
+tensor network, train it, and integrate it as a layer in more intricate deep
+learning models. In this paper, we describe the main functionality and basic
+usage of TensorKrowch, and provide technical details on its building blocks and
+the optimizations performed to achieve efficient operation.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>20 pages, 2 figures. The TensorKrowch GitHub repository is in
+  https://github.com/joserapa98/tensorkrowch and the TensorKrowch documentation
+  is in https://joserapa98.github.io/tensorkrowch. V2: Accepted version</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ DeepLag: Discovering Deep Lagrangian Dynamics for Intuitive Fluid
+  Prediction 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2402.02425v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2402.02425v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Qilong Ma, Haixu Wu, Lanxiang Xing, Shangchen Miao, Mingsheng Long
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Accurately predicting the future fluid is vital to extensive areas such as
+meteorology, oceanology, and aerodynamics. However, since the fluid is usually
+observed from an Eulerian perspective, its moving and intricate dynamics are
+seriously obscured and confounded in static grids, bringing thorny challenges
+to the prediction. This paper introduces a new Lagrangian-Eulerian combined
+paradigm to tackle the tanglesome fluid dynamics. Instead of solely predicting
+the future based on Eulerian observations, we propose DeepLag to discover
+hidden Lagrangian dynamics within the fluid by tracking the movements of
+adaptively sampled key particles. DeepLag utilizes the proposed where the
+Lagrangian movement of the tracked particles is inferred from Eulerian
+observations, and their accumulated Lagrangian dynamics information is
+incorporated into global Eulerian evolving features to guide future prediction
+respectively. Tracking key particles not only provides a transparent and
+interpretable clue for fluid dynamics but also makes our model free from
+modeling complex correlations among massive grids for better efficiency.
+Experimentally, DeepLag excels in three challenging fluid prediction tasks
+covering 2D and 3D, simulated and real-world fluids.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ A Multi-Branched Radial Basis Network Approach to Predicting Complex
+  Chaotic Behaviours 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2404.00618v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2404.00618v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Aarush Sinha
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  In this study, we propose a multi branched network approach to predict the
+dynamics of a physics attractor characterized by intricate and chaotic
+behavior. We introduce a unique neural network architecture comprised of Radial
+Basis Function (RBF) layers combined with an attention mechanism designed to
+effectively capture nonlinear inter-dependencies inherent in the attractor's
+temporal evolution. Our results demonstrate successful prediction of the
+attractor's trajectory across 100 predictions made using a real-world dataset
+of 36,700 time-series observations encompassing approximately 28 minutes of
+activity. To further illustrate the performance of our proposed technique, we
+provide comprehensive visualizations depicting the attractor's original and
+predicted behaviors alongside quantitative measures comparing observed versus
+estimated outcomes. Overall, this work showcases the potential of advanced
+machine learning algorithms in elucidating hidden structures in complex
+physical systems while offering practical applications in various domains
+requiring accurate short-term forecasting capabilities.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>9 pages, 6 figures</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Defining Neural Network Architecture through Polytope Structures of
+  <span class="highlight-title">Dataset</span> 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2402.02407v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2402.02407v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Sangmin Lee, Abbas Mammadov, Jong Chul Ye
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Current theoretical and empirical research in neural networks suggests that
+complex datasets require large network architectures for thorough
+classification, yet the precise nature of this relationship remains unclear.
+This paper tackles this issue by defining upper and lower bounds for neural
+network widths, which are informed by the polytope structure of the dataset in
+question. We also delve into the application of these principles to simplicial
+complexes and specific manifold shapes, explaining how the requirement for
+network width varies in accordance with the geometric complexity of the
+dataset. Moreover, we develop an algorithm to investigate a converse situation
+where the polytope structure of a dataset can be inferred from its
+corresponding trained neural networks. Through our algorithm, it is established
+that popular datasets such as MNIST, Fashion-MNIST, and CIFAR10 can be
+efficiently encapsulated using no more than two polytopes with a small number
+of faces.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Distilling Robustness into Natural Language Inference Models with
+  Domain-Targeted Augmentation <span class="chip">ACL</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2305.13067v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2305.13067v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Joe Stacey, Marek Rei
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Knowledge distillation optimises a smaller student model to behave similarly
+to a larger teacher model, retaining some of the performance benefits. While
+this method can improve results on in-distribution examples, it does not
+necessarily generalise to out-of-distribution (OOD) settings. We investigate
+two complementary methods for improving the robustness of the resulting student
+models on OOD domains. The first approach augments the distillation with
+generated unlabelled examples that match the target distribution. The second
+method upsamples data points among the training set that are similar to the
+target distribution. When applied on the task of natural language inference
+(NLI), our experiments on MNLI show that distillation with these modifications
+outperforms previous robustness solutions. We also find that these methods
+improve performance on OOD domains even beyond the target domain.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted at ACL Findings 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Position: Tensor Networks are a Valuable Asset for Green AI <span class="chip">ICML</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2205.12961v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2205.12961v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Eva Memmel, Clara Menzen, Jetze Schuurmans, Frederiek Wesel, Kim Batselier
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  For the first time, this position paper introduces a fundamental link between
+tensor networks (TNs) and Green AI, highlighting their synergistic potential to
+enhance both the inclusivity and sustainability of AI research. We argue that
+TNs are valuable for Green AI due to their strong mathematical backbone and
+inherent logarithmic compression potential. We undertake a comprehensive review
+of the ongoing discussions on Green AI, emphasizing the importance of
+sustainability and inclusivity in AI research to demonstrate the significance
+of establishing the link between Green AI and TNs. To support our position, we
+first provide a comprehensive overview of efficiency metrics proposed in Green
+AI literature and then evaluate examples of TNs in the fields of kernel
+machines and deep learning using the proposed efficiency metrics. This position
+paper aims to incentivize meaningful, constructive discussions by bridging
+fundamental principles of Green AI and TNs. We advocate for researchers to
+seriously evaluate the integration of TNs into their research projects, and in
+alignment with the link established in this paper, we support prior calls
+encouraging researchers to treat Green AI principles as a research priority.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>This paper has been accepted for presentation at the International
+  Conference on Machine Learning (ICML) 2024 and will appear in the conference
+  proceedings</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ LQER: Low-Rank Quantization Error Reconstruction for LLMs <span class="chip">ICML2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2402.02446v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2402.02446v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Cheng Zhang, Jianyi Cheng, George A. Constantinides, Yiren Zhao
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Post-training quantization of Large Language Models (LLMs) is challenging. In
+this work, we introduce Low-rank Quantization Error Reduction (LQER), which
+combines quantization and low-rank approximation to recover the model
+capability. LQER leverages an activation-induced scale matrix to drive the
+singular value distribution of quantization error towards a desirable
+distribution, which enables nearly-lossless W4A8 quantization on various LLMs
+and downstream tasks without the need for knowledge distillation, grid search,
+or gradient-base iterative optimization. Unlike existing methods, the
+computation pattern of LQER eliminates the need for specialized Scatter and
+Gather processes to collect high-precision weights from irregular memory
+locations. Our W4A8 LLMs achieve near-lossless performance on six popular
+downstream tasks, while using 1.36$\times$ fewer hardware resources than the
+leading state-of-the-art method. We open-source our framework at
+https://github.com/ChengZhang-98/lqer
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted at ICML2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Learning Latent Dynamic Robust Representations for World Models 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.06263v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.06263v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Ruixiang Sun, Hongyu Zang, Xin Li, Riashat Islam
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Visual Model-Based Reinforcement Learning (MBRL) promises to encapsulate
+agent's knowledge about the underlying dynamics of the environment, enabling
+learning a world model as a useful planner. However, top MBRL agents such as
+Dreamer often struggle with visual pixel-based inputs in the presence of
+exogenous or irrelevant noise in the observation space, due to failure to
+capture task-specific features while filtering out irrelevant spatio-temporal
+details. To tackle this problem, we apply a spatio-temporal masking strategy, a
+bisimulation principle, combined with latent reconstruction, to capture
+endogenous task-specific aspects of the environment for world models,
+effectively eliminating non-essential information. Joint training of
+representations, dynamics, and policy often leads to instabilities. To further
+address this issue, we develop a Hybrid Recurrent State-Space Model (HRSSM)
+structure, enhancing state representation robustness for effective policy
+learning. Our empirical evaluation demonstrates significant performance
+improvements over existing methods in a range of visually complex control tasks
+such as Maniskill \cite{gu2023maniskill2} with exogenous distractors from the
+Matterport environment. Our code is avaliable at
+https://github.com/bit1029public/HRSSM.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Provably Robust Cost-Sensitive Learning via Randomized Smoothing 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2310.08732v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2310.08732v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Yuan Xin, Michael Backes, Xiao Zhang
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  We study the problem of robust learning against adversarial perturbations
+under cost-sensitive scenarios, where the potential harm of different types of
+misclassifications is encoded in a cost matrix. Existing approaches are either
+empirical and cannot certify robustness or suffer from inherent scalability
+issues. In this work, we investigate whether randomized smoothing, a scalable
+framework for robustness certification, can be leveraged to certify and train
+for cost-sensitive robustness. Built upon the notion of cost-sensitive
+certified radius, we first illustrate how to adapt the standard certification
+algorithm of randomized smoothing to produce tight robustness certificates for
+any binary cost matrix, and then develop a robust training method to promote
+certified cost-sensitive robustness while maintaining the model's overall
+accuracy. Through extensive experiments on image benchmarks, we demonstrate the
+superiority of our proposed certification algorithm and training method under
+various cost-sensitive scenarios. Our implementation is available as open
+source code at: https://github.com/TrustMLRG/CS-RS.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>19 pages, 9 tables, 5 figures</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Elastic Feature Consolidation for Cold Start Exemplar-Free Incremental
+  Learning <span class="chip">ICLR 2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2402.03917v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2402.03917v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Simone Magistri, Tomaso Trinci, Albin Soutif-Cormerais, Joost van de Weijer, Andrew D. Bagdanov
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Exemplar-Free Class Incremental Learning (EFCIL) aims to learn from a
+sequence of tasks without having access to previous task data. In this paper,
+we consider the challenging Cold Start scenario in which insufficient data is
+available in the first task to learn a high-quality backbone. This is
+especially challenging for EFCIL since it requires high plasticity, which
+results in feature drift which is difficult to compensate for in the
+exemplar-free setting. To address this problem, we propose a simple and
+effective approach that consolidates feature representations by regularizing
+drift in directions highly relevant to previous tasks and employs prototypes to
+reduce task-recency bias. Our method, called Elastic Feature Consolidation
+(EFC), exploits a tractable second-order approximation of feature drift based
+on an Empirical Feature Matrix (EFM). The EFM induces a pseudo-metric in
+feature space which we use to regularize feature drift in important directions
+and to update Gaussian prototypes used in a novel asymmetric cross entropy loss
+which effectively balances prototype rehearsal with data from new tasks.
+Experimental results on CIFAR-100, Tiny-ImageNet, ImageNet-Subset and
+ImageNet-1K demonstrate that Elastic Feature Consolidation is better able to
+learn new tasks by maintaining model plasticity and significantly outperform
+the state-of-the-art.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted at Twelfth International Conference on Learning
+  Representations (ICLR 2024)</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Stochastic Gradient Descent-like relaxation is equivalent to Metropolis
+  dynamics in discrete optimization and inference problems 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2309.05337v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2309.05337v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Maria Chiara Angelini, Angelo Giorgio Cavaliere, Raffaele Marino, Federico Ricci-Tersenghi
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Is Stochastic Gradient Descent (SGD) substantially different from Metropolis
+Monte Carlo dynamics? This is a fundamental question at the time of
+understanding the most used training algorithm in the field of Machine
+Learning, but it received no answer until now. Here we show that in discrete
+optimization and inference problems, the dynamics of an SGD-like algorithm
+resemble very closely that of Metropolis Monte Carlo with a properly chosen
+temperature, which depends on the mini-batch size. This quantitative matching
+holds both at equilibrium and in the out-of-equilibrium regime, despite the two
+algorithms having fundamental differences (e.g.\ SGD does not satisfy detailed
+balance). Such equivalence allows us to use results about performances and
+limits of Monte Carlo algorithms to optimize the mini-batch size in the
+SGD-like algorithm and make it efficient at recovering the signal in hard
+inference problems.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>19 pages, 9 figures</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Resilience of Deep Learning applications: a systematic literature <span class="highlight-title">review</span>
+  of analysis and hardening techniques 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2309.16733v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2309.16733v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Cristiana Bolchini, Luca Cassano, Antonio Miele
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Machine Learning (ML) is currently being exploited in numerous applications
+being one of the most effective Artificial Intelligence (AI) technologies, used
+in diverse fields, such as vision, autonomous systems, and alike. The trend
+motivated a significant amount of contributions to the analysis and design of
+ML applications against faults affecting the underlying hardware. The authors
+investigate the existing body of knowledge on Deep Learning (among ML
+techniques) resilience against hardware faults systematically through a
+thoughtful review in which the strengths and weaknesses of this literature
+stream are presented clearly and then future avenues of research are set out.
+The review is based on 220 scientific articles published between January 2019
+and March 2024. The authors adopt a classifying framework to interpret and
+highlight research similarities and peculiarities, based on several parameters,
+starting from the main scope of the work, the adopted fault and error models,
+to their reproducibility. This framework allows for a comparison of the
+different solutions and the identification of possible synergies. Furthermore,
+suggestions concerning the future direction of research are proposed in the
+form of open challenges to be addressed.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Submitted to Elsevier Computer Science Review on May 9, 2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Position: Topological Deep Learning is the New Frontier for Relational
+  Learning 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2402.08871v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2402.08871v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Theodore Papamarkou, Tolga Birdal, Michael Bronstein, Gunnar Carlsson, Justin Curry, Yue Gao, Mustafa Hajij, Roland Kwitt, Pietro Liò, Paolo Di Lorenzo, Vasileios Maroulas, Nina Miolane, Farzana Nasrin, Karthikeyan Natesan Ramamurthy, Bastian Rieck, Simone Scardapane, Michael T. Schaub, Petar Veličković, Bei Wang, Yusu Wang, Guo-Wei Wei, Ghada Zamzmi
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Topological deep learning (TDL) is a rapidly evolving field that uses
+topological features to understand and design deep learning models. This paper
+posits that TDL is the new frontier for relational learning. TDL may complement
+graph representation learning and geometric deep learning by incorporating
+topological concepts, and can thus provide a natural choice for various machine
+learning settings. To this end, this paper discusses open problems in TDL,
+ranging from practical benefits to theoretical foundations. For each problem,
+it outlines potential solutions and future research opportunities. At the same
+time, this paper serves as an invitation to the scientific community to
+actively participate in TDL research to unlock the potential of this emerging
+field.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ CICLe: Conformal In-Context Learning for Largescale Multi-Class Food
+  Risk Classification 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2403.11904v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2403.11904v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Korbinian Randl, John Pavlopoulos, Aron Henriksson, Tony Lindgren
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Contaminated or adulterated food poses a substantial risk to human health.
+Given sets of labeled web texts for training, Machine Learning and Natural
+Language Processing can be applied to automatically detect such risks. We
+publish a dataset of 7,546 short texts describing public food recall
+announcements. Each text is manually labeled, on two granularity levels (coarse
+and fine), for food products and hazards that the recall corresponds to. We
+describe the dataset and benchmark naive, traditional, and Transformer models.
+Based on our analysis, Logistic Regression based on a tf-idf representation
+outperforms RoBERTa and XLM-R on classes with low support. Finally, we discuss
+different prompting strategies and present an LLM-in-the-loop framework, based
+on Conformal Prediction, which boosts the performance of the base classifier
+while reducing energy consumption compared to normal prompting.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ RIME: Robust Preference-based Reinforcement Learning with Noisy
+  Preferences <span class="chip">ICML2024</span>
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2402.17257v3">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2402.17257v3.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Jie Cheng, Gang Xiong, Xingyuan Dai, Qinghai Miao, Yisheng Lv, Fei-Yue Wang
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Preference-based Reinforcement Learning (PbRL) circumvents the need for
+reward engineering by harnessing human preferences as the reward signal.
+However, current PbRL methods excessively depend on high-quality feedback from
+domain experts, which results in a lack of robustness. In this paper, we
+present RIME, a robust PbRL algorithm for effective reward learning from noisy
+preferences. Our method utilizes a sample selection-based discriminator to
+dynamically filter out noise and ensure robust training. To counteract the
+cumulative error stemming from incorrect selection, we suggest a warm start for
+the reward model, which additionally bridges the performance gap during the
+transition from pre-training to online training in PbRL. Our experiments on
+robotic manipulation and locomotion tasks demonstrate that RIME significantly
+enhances the robustness of the state-of-the-art PbRL method. Code is available
+at https://github.com/CJReinforce/RIME_ICML2024.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Accepted by ICML2024</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                        ♻ ☆ Federated Causal Inference from Observational Data 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2308.13047v2">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2308.13047v2.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Thanh Vinh Vo, Young lee, Tze-Yun Leong
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Decentralized data sources are prevalent in real-world applications, posing a
+formidable challenge for causal inference. These sources cannot be consolidated
+into a single entity owing to privacy constraints. The presence of dissimilar
+data distributions and missing values within them can potentially introduce
+bias to the causal estimands. In this article, we propose a framework to
+estimate causal effects from decentralized data sources. The proposed framework
+avoid exchanging raw data among the sources, thus contributing towards
+privacy-preserving causal learning. Three instances of the proposed framework
+are introduced to estimate causal effects across a wide range of diverse
+scenarios within a federated setting. (1) FedCI: a Bayesian framework based on
+Gaussian processes for estimating causal effects from federated observational
+data sources. It estimates the posterior distributions of the causal effects to
+compute the higher-order statistics that capture the uncertainty. (2)
+CausalRFF: an adaptive transfer algorithm that learns the similarities among
+the data sources by utilizing Random Fourier Features to disentangle the loss
+function into multiple components, each of which is associated with a data
+source. It estimates the similarities among the sources through transfer
+coefficients, and hence requiring no prior information about the similarity
+measures. (3) CausalFI: a new approach for federated causal inference from
+incomplete data, enabling the estimation of causal effects from multiple
+decentralized and incomplete data sources. It accounts for the missing data
+under the missing at random assumption, while also estimating higher-order
+statistics of the causal estimands. The proposed federated framework and its
+instances are an important step towards a privacy-preserving causal learning
+model.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Preprint. arXiv admin note: substantial text overlap with
+  arXiv:2301.00346</span>
+                                        </div>
+                                </details>
+                            </article>
+                    </div>
+                </details>
+            </article>
+            <article>
+                <details>
+                    <Summary>
+                        Multimedia <span class="chip" style="font-size: 60%">4</span>
+                    </Summary>
+                    <div class="details-content">
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ NeRF View Synthesis: Subjective Quality Assessment and Objective Metrics
+  Evaluation 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20078v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20078v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Pedro Martin, Antonio Rodrigues, Joao Ascenso, Maria Paula Queluz
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Neural radiance fields (NeRF) are a groundbreaking computer vision technology
+that enables the generation of high-quality, immersive visual content from
+multiple viewpoints. This capability holds significant advantages for
+applications such as virtual/augmented reality, 3D modelling and content
+creation for the film and entertainment industry. However, the evaluation of
+NeRF methods poses several challenges, including a lack of comprehensive
+datasets, reliable assessment methodologies, and objective quality metrics.
+This paper addresses the problem of NeRF quality assessment thoroughly, by
+conducting a rigorous subjective quality assessment test that considers several
+scene classes and recently proposed NeRF view synthesis methods. Additionally,
+the performance of a wide range of state-of-the-art conventional and
+learning-based full-reference 2D image and video quality assessment metrics is
+evaluated against the subjective scores of the subjective study. The
+experimental results are analyzed in depth, providing a comparative evaluation
+of several NeRF methods and objective quality metrics, across different classes
+of visual scenes, including real and synthetic content for front-face and
+360-degree camera trajectories.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ <span class="highlight-title">Prompt</span>us: Can <span class="highlight-title">Prompt</span>s Streaming Replace Video Streaming with Stable
+  Diffusion 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.20032v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.20032v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Jiangkai Wu, Liming Liu, Yunpeng Tan, Junlin Hao, Xinggong Zhang
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  With the exponential growth of video traffic, traditional video streaming
+systems are approaching their limits in compression efficiency and
+communication capacity. To further reduce bitrate while maintaining quality, we
+propose Promptus, a disruptive novel system that streaming prompts instead of
+video content with Stable Diffusion, which converts video frames into a series
+of "prompts" for delivery. To ensure pixel alignment, a gradient descent-based
+prompt fitting framework is proposed. To achieve adaptive bitrate for prompts,
+a low-rank decomposition-based bitrate control algorithm is introduced. For
+inter-frame compression of prompts, a temporal smoothing-based prompt
+interpolation algorithm is proposed. Evaluations across various video domains
+and real network traces demonstrate Promptus can enhance the perceptual quality
+by 0.111 and 0.092 (in LPIPS) compared to VAE and H.265, respectively, and
+decreases the ratio of severely distorted frames by 89.3% and 91.7%. Moreover,
+Promptus achieves real-time video generation from prompts at over 150 FPS. To
+the best of our knowledge, Promptus is the first attempt to replace video
+codecs with prompt inversion and the first to use prompt streaming instead of
+video streaming. Our work opens up a new paradigm for efficient video
+communication beyond the Shannon limit.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Deep Joint Semantic Coding and Beamforming for Near-Space Airship-Borne
+  Massive MIMO Network 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.19889v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.19889v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Minghui Wu, Zhen Gao, Zhaocheng Wang, Dusit Niyato, George K. Karagiannidis, Sheng Chen
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Near-space airship-borne communication network is recognized to be an
+indispensable component of the future integrated ground-air-space network
+thanks to airships' advantage of long-term residency at stratospheric
+altitudes, but it urgently needs reliable and efficient Airship-to-X link. To
+improve the transmission efficiency and capacity, this paper proposes to
+integrate semantic communication with massive multiple-input multiple-output
+(MIMO) technology. Specifically, we propose a deep joint semantic coding and
+beamforming (JSCBF) scheme for airship-based massive MIMO image transmission
+network in space, in which semantics from both source and channel are fused to
+jointly design the semantic coding and physical layer beamforming. First, we
+design two semantic extraction networks to extract semantics from image source
+and channel state information, respectively. Then, we propose a semantic fusion
+network that can fuse these semantics into complex-valued semantic features for
+subsequent physical-layer transmission. To efficiently transmit the fused
+semantic features at the physical layer, we then propose the hybrid data and
+model-driven semantic-aware beamforming networks. At the receiver, a semantic
+decoding network is designed to reconstruct the transmitted images. Finally, we
+perform end-to-end deep learning to jointly train all the modules, using the
+image reconstruction quality at the receivers as a metric. The proposed deep
+JSCBF scheme fully combines the efficient source compressibility and robust
+error correction capability of semantic communication with the high spectral
+efficiency of massive MIMO, achieving a significant performance improvement
+over existing approaches.
+</span>
+                                    </div>
+                                        <div class="article-summary-box-inner">
+                                            <span class="chip">comment</span>: <span>Major Revision by IEEE JSAC</span>
+                                        </div>
+                                </details>
+                            </article>
+                            <article>
+                                <details class="article-expander">
+                                    <summary class="article-expander-title">
+                                         ☆ Exploring the Robustness of Decision-Level Through Adversarial Attacks
+  on LLM-Based Embodied Models 
+                                    </summary>
+                                    <div class="article-authors">
+                                        <a href="http://arxiv.org/abs/2405.19802v1">
+                                            <i class="ri-links-line"></i>
+                                        </a>
+                                        <a href="https://arxiv.org/pdf/2405.19802v1.pdf">
+                                            <i class="ri-file-paper-2-line"></i>
+                                        </a>
+                                        Shuyuan Liu, Jiawei Chen, Shouwei Ruan, Hang Su, Zhaoxia Yin
+                                    </div>
+                                    <div class="article-summary-box-inner">
+                                        <span>  Embodied intelligence empowers agents with a profound sense of perception,
+enabling them to respond in a manner closely aligned with real-world
+situations. Large Language Models (LLMs) delve into language instructions with
+depth, serving a crucial role in generating plans for intricate tasks. Thus,
+LLM-based embodied models further enhance the agent's capacity to comprehend
+and process information. However, this amalgamation also ushers in new
+challenges in the pursuit of heightened intelligence. Specifically, attackers
+can manipulate LLMs to produce irrelevant or even malicious outputs by altering
+their prompts. Confronted with this challenge, we observe a notable absence of
+multi-modal datasets essential for comprehensively evaluating the robustness of
+LLM-based embodied models. Consequently, we construct the Embodied Intelligent
+Robot Attack Dataset (EIRAD), tailored specifically for robustness evaluation.
+Additionally, two attack strategies are devised, including untargeted attacks
+and targeted attacks, to effectively simulate a range of diverse attack
+scenarios. At the same time, during the attack process, to more accurately
+ascertain whether our method is successful in attacking the LLM-based embodied
+model, we devise a new attack success evaluation method utilizing the BLIP2
+model. Recognizing the time and cost-intensive nature of the GCG algorithm in
+attacks, we devise a scheme for prompt suffix initialization based on various
+target tasks, thus expediting the convergence process. Experimental results
+demonstrate that our method exhibits a superior attack success rate when
+targeting LLM-based embodied models, indicating a lower level of decision-level
+robustness in these models.
+</span>
+                                    </div>
+                                </details>
+                            </article>
+                    </div>
+                </details>
+            </article>
+    </section>
+
+</body>
+
+<footer>
+    <div>
+        <time id="build-timestamp" datetime="2024-06-07T05:24:35.917989845Z">
+            2024-06-07 05:24:35 UTC
+        </time>
+    </div>
+</footer>
+<script src="index.js"></script>
+</html>
diff --git a/index.js b/index.js
new file mode 100644
index 00000000..69f5da7b
--- /dev/null
+++ b/index.js
@@ -0,0 +1,39 @@
+/* Exapand/Collapse with TAB key */
+var expanded = false;
+document.onkeydown = function (e) {
+    if (e.keyCode === 9) {
+        expanded = !expanded;
+        document.querySelectorAll("details").forEach(detail => detail.open = expanded);
+        return false;
+    }
+};
+
+/* Switch Theme */
+const toggleSwitch = document.querySelector('.theme-switch input[type="checkbox"]');
+
+function switchTheme(e) {
+    if (e.target.checked) {
+        document.documentElement.setAttribute('data-theme', 'light');
+        document.getElementById("theme-icon").className = "ri-sun-line";
+        localStorage.setItem('theme', 'light'); //add this
+    } else {
+        document.documentElement.setAttribute('data-theme', 'dark');
+        document.getElementById("theme-icon").className = "ri-moon-line";
+        localStorage.setItem('theme', 'dark'); //add this
+    }
+}
+
+toggleSwitch.addEventListener('change', switchTheme, false);
+const currentTheme = localStorage.getItem('theme') ? localStorage.getItem('theme') : null;
+if (currentTheme) {
+    document.documentElement.setAttribute('data-theme', currentTheme);
+    if (currentTheme === 'light') {
+        toggleSwitch.checked = true;
+    }
+}
+
+const timestamp = document.getElementById("build-timestamp");
+const timestamp_local = new Date(timestamp.getAttribute("datetime")).toLocaleString();
+
+const badge = document.getElementById("build-timestamp-badge");
+// badge.src = `https://img.shields.io/github/workflow/status/mlnlp-world/myarxiv/Update?=${timestamp_local}&style=for-the-badge`